/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1265 - (show annotations)
Sun Mar 3 10:42:46 2013 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 215933 byte(s)
Error occurred while calculating annotation data.
Fix missing callout in alternative of conditional group when auto callout is 
set.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2013 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1
84 #define MATCH_NOMATCH 0
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_COMMIT (-998)
91 #define MATCH_KETRPOS (-997)
92 #define MATCH_ONCE (-996)
93 #define MATCH_PRUNE (-995)
94 #define MATCH_SKIP (-994)
95 #define MATCH_SKIP_ARG (-993)
96 #define MATCH_THEN (-992)
97
98 /* Maximum number of ints of offset to save on the stack for recursive calls.
99 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
100 because the offset vector is always a multiple of 3 long. */
101
102 #define REC_STACK_SAVE_MAX 30
103
104 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
105
/* Entries at the same index in the two tables form one (min, max) pair:
(0, unlimited) at indexes 0-1, (1, unlimited) at 2-3 and (0, 1) at 4-5.
NOTE(review): the tables are presumably indexed by a repeat opcode offset with
adjacent entries being the greedy/minimizing forms - confirm against the code
that reads them, which is outside this section. */
106 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
107 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
108
109 #ifdef PCRE_DEBUG
110 /*************************************************
111 * Debugging function to print chars *
112 *************************************************/
113
114 /* Print a sequence of chars in printable format, stopping at the end of the
115 subject if the requested.
116
117 Arguments:
118 p points to characters
119 length number to print
120 is_subject TRUE if printing from within md->start_subject
121 md pointer to matching data block, if is_subject is TRUE
122
123 Returns: nothing
124 */
125
126 static void
127 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
128 {
129 pcre_uint32 c;
130 BOOL utf = md->utf;
131 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
132 while (length-- > 0)
133 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
134 }
135 #endif
136
137
138
139 /*************************************************
140 * Match a back-reference *
141 *************************************************/
142
143 /* Normally, if a back reference hasn't been set, the length that is passed is
144 negative, so the match always fails. However, in JavaScript compatibility mode,
145 the length passed is zero. Note that in caseless UTF-8 mode, the number of
146 subject bytes matched may be different to the number of reference bytes.
147
148 Arguments:
149 offset index into the offset vector
150 eptr pointer into the subject
151 length length of reference to be matched (number of bytes)
152 md points to match data block
153 caseless TRUE if caseless
154
155 Returns: >= 0 the number of subject bytes matched
156 -1 no match
157 -2 partial match; always given if at end subject
158 */
159
160 static int
161 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
162 BOOL caseless)
163 {
164 PCRE_PUCHAR eptr_start = eptr;
165 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
166 #ifdef SUPPORT_UTF
167 BOOL utf = md->utf;
168 #endif
169
170 #ifdef PCRE_DEBUG
171 if (eptr >= md->end_subject)
172 printf("matching subject <null>");
173 else
174 {
175 printf("matching subject ");
176 pchars(eptr, length, TRUE, md);
177 }
178 printf(" against backref ");
179 pchars(p, length, FALSE, md);
180 printf("\n");
181 #endif
182
183 /* Always fail if reference not set (and not JavaScript compatible - in that
184 case the length is passed as zero). */
185
186 if (length < 0) return -1;
187
188 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
189 properly if Unicode properties are supported. Otherwise, we can check only
190 ASCII characters. */
191
192 if (caseless)
193 {
194 #ifdef SUPPORT_UTF
195 #ifdef SUPPORT_UCP
196 if (utf)
197 {
198 /* Match characters up to the end of the reference. NOTE: the number of
199 data units matched may differ, because in UTF-8 there are some characters
200 whose upper and lower case versions code have different numbers of bytes.
201 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
202 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
203 sequence of two of the latter. It is important, therefore, to check the
204 length along the reference, not along the subject (earlier code did this
205 wrong). */
206
207 PCRE_PUCHAR endptr = p + length;
208 while (p < endptr)
209 {
210 pcre_uint32 c, d;
211 const ucd_record *ur;
212 if (eptr >= md->end_subject) return -2; /* Partial match */
213 GETCHARINC(c, eptr);
214 GETCHARINC(d, p);
215 ur = GET_UCD(d);
216 if (c != d && c != d + ur->other_case)
217 {
218 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
219 for (;;)
220 {
221 if (c < *pp) return -1;
222 if (c == *pp++) break;
223 }
224 }
225 }
226 }
227 else
228 #endif
229 #endif
230
231 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
232 is no UCP support. */
233 {
234 while (length-- > 0)
235 {
236 pcre_uint32 cc, cp;
237 if (eptr >= md->end_subject) return -2; /* Partial match */
238 cc = RAWUCHARTEST(eptr);
239 cp = RAWUCHARTEST(p);
240 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
241 p++;
242 eptr++;
243 }
244 }
245 }
246
247 /* In the caseful case, we can just compare the bytes, whether or not we
248 are in UTF-8 mode. */
249
250 else
251 {
252 while (length-- > 0)
253 {
254 if (eptr >= md->end_subject) return -2; /* Partial match */
255 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
256 }
257 }
258
259 return (int)(eptr - eptr_start);
260 }
261
262
263
264 /***************************************************************************
265 ****************************************************************************
266 RECURSION IN THE match() FUNCTION
267
268 The match() function is highly recursive, though not every recursive call
269 increases the recursive depth. Nevertheless, some regular expressions can cause
270 it to recurse to a great depth. I was writing for Unix, so I just let it call
271 itself recursively. This uses the stack for saving everything that has to be
272 saved for a recursive call. On Unix, the stack can be large, and this works
273 fine.
274
275 It turns out that on some non-Unix-like systems there are problems with
276 programs that use a lot of stack. (This despite the fact that every last chip
277 has oodles of memory these days, and techniques for extending the stack have
278 been known for decades.) So....
279
280 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
281 calls by keeping local variables that need to be preserved in blocks of memory
282 obtained from malloc() instead of on the stack. Macros are used to
283 achieve this so that the actual code doesn't look very different to what it
284 always used to.
285
286 The original heap-recursive code used longjmp(). However, it seems that this
287 can be very slow on some operating systems. Following a suggestion from Stan
288 Switzer, the use of longjmp() has been abolished, at the cost of having to
289 provide a unique number for each call to RMATCH. There is no way of generating
290 a sequence of numbers at compile time in C. I have given them names, to make
291 them stand out more clearly.
292
293 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
294 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
295 tests. Furthermore, not using longjmp() means that local dynamic variables
296 don't have indeterminate values; this has meant that the frame size can be
297 reduced because the result can be "passed back" by straight setting of the
298 variable instead of being passed in the frame.
299 ****************************************************************************
300 ***************************************************************************/
301
302 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
303 below must be updated in sync. */
304
/* RM1 is explicitly 1 and the remaining names follow consecutively, so the
last one, RM68, has the value 68. Each name is passed as the final (rw)
argument of an RMATCH() call; the heap version of RMATCH stores it in
frame->Xwhere and pastes it into the resume label L_<name>. */
305 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
306 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
307 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
308 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
309 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
310 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
311 RM61, RM62, RM63, RM64, RM65, RM66, RM67, RM68 };
312
313 /* These versions of the macros use the stack, as normal. There are debugging
314 versions and production versions. Note that the "rw" argument of RMATCH isn't
315 actually used in this definition. */
316
317 #ifndef NO_RECURSE
318 #define REGISTER register
319
320 #ifdef PCRE_DEBUG
/* Debugging version: a real recursive call to match(), bracketed by trace
output on stdout. The caller's current mstart is always passed on, the depth
is rdepth+1, and the result is left in rrc. */
321 #define RMATCH(ra,rb,rc,rd,re,rw) \
322 { \
323 printf("match() called in line %d\n", __LINE__); \
324 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
325 printf("to line %d\n", __LINE__); \
326 }
/* Debugging version: trace the value, then return it. Note that "ra" is
expanded twice (once for the printf, once for the return). */
327 #define RRETURN(ra) \
328 { \
329 printf("match() returned %d from line %d\n", ra, __LINE__); \
330 return ra; \
331 }
332 #else
/* Production version: a plain recursive call whose result is left in rrc.
This expansion is a bare expression with no enclosing braces and no trailing
semicolon. */
333 #define RMATCH(ra,rb,rc,rd,re,rw) \
334 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
/* Production version: an ordinary return statement without its semicolon. */
335 #define RRETURN(ra) return ra
336 #endif
337
338 #else
339
340
341 /* These versions of the macros manage a private stack on the heap. Note that
342 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
343 argument of match(), which never changes. */
344
345 #define REGISTER
346
/* Heap version of RMATCH: simulate a recursive call to match().

The frame after the current one (frame->Xnextframe) is reused if it already
exists; otherwise a new one is obtained via PUBL(stack_malloc) and linked in,
with PCRE_ERROR_NOMEMORY being returned through RRETURN if that fails. The
resume point "rw" is recorded in the current frame, the new frame is loaded
with the "arguments" (ra, rb, the current mstart, rc, re, and a depth one
greater than the current frame's), and control jumps to HEAP_RECURSE with
"frame" pointing at the new frame. The label L_<rw> marks where execution
continues after the simulated call; the jump back to it is made by code at
HEAP_RETURN, which is outside this macro. Nothing is freed here. */
347 #define RMATCH(ra,rb,rc,rd,re,rw)\
348 {\
349 heapframe *newframe = frame->Xnextframe;\
350 if (newframe == NULL)\
351 {\
352 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
353 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
354 newframe->Xnextframe = NULL;\
355 frame->Xnextframe = newframe;\
356 }\
357 frame->Xwhere = rw;\
358 newframe->Xeptr = ra;\
359 newframe->Xecode = rb;\
360 newframe->Xmstart = mstart;\
361 newframe->Xoffset_top = rc;\
362 newframe->Xeptrb = re;\
363 newframe->Xrdepth = frame->Xrdepth + 1;\
364 newframe->Xprevframe = frame;\
365 frame = newframe;\
366 DPRINTF(("restarting from line %d\n", __LINE__));\
367 goto HEAP_RECURSE;\
368 L_##rw:\
369 DPRINTF(("jumped back to line %d\n", __LINE__));\
370 }
371
/* Heap version of RRETURN: simulate a return from match().

"frame" is moved back to the previous frame. If there is one, the value is
placed in rrc and control jumps to HEAP_RETURN; if there is none, this was the
outermost level and a real return is executed. Note that "ra" is expanded
after "frame" has been moved back, so an argument that names a frame-backed
variable would be read from the previous frame, not the one being left. */
372 #define RRETURN(ra)\
373 {\
374 heapframe *oldframe = frame;\
375 frame = oldframe->Xprevframe;\
376 if (frame != NULL)\
377 {\
378 rrc = ra;\
379 goto HEAP_RETURN;\
380 }\
381 return ra;\
382 }
383
384
385 /* Structure for remembering the local variables in a private frame */
386
387 typedef struct heapframe {
388 struct heapframe *Xprevframe;
389 struct heapframe *Xnextframe;
390
391 /* Function arguments that may change */
392
393 PCRE_PUCHAR Xeptr;
394 const pcre_uchar *Xecode;
395 PCRE_PUCHAR Xmstart;
396 int Xoffset_top;
397 eptrblock *Xeptrb;
398 unsigned int Xrdepth;
399
400 /* Function local variables */
401
402 PCRE_PUCHAR Xcallpat;
403 #ifdef SUPPORT_UTF
404 PCRE_PUCHAR Xcharptr;
405 #endif
406 PCRE_PUCHAR Xdata;
407 PCRE_PUCHAR Xnext;
408 PCRE_PUCHAR Xpp;
409 PCRE_PUCHAR Xprev;
410 PCRE_PUCHAR Xsaved_eptr;
411
412 recursion_info Xnew_recursive;
413
414 BOOL Xcur_is_word;
415 BOOL Xcondition;
416 BOOL Xprev_is_word;
417
418 #ifdef SUPPORT_UCP
419 int Xprop_type;
420 unsigned int Xprop_value;
421 int Xprop_fail_result;
422 int Xoclength;
423 pcre_uchar Xocchars[6];
424 #endif
425
426 int Xcodelink;
427 int Xctype;
428 unsigned int Xfc;
429 int Xfi;
430 int Xlength;
431 int Xmax;
432 int Xmin;
433 unsigned int Xnumber;
434 int Xoffset;
435 unsigned int Xop;
436 pcre_int32 Xsave_capture_last;
437 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
438 int Xstacksave[REC_STACK_SAVE_MAX];
439
440 eptrblock Xnewptrb;
441
442 /* Where to jump back to */
443
444 int Xwhere;
445
446 } heapframe;
447
448 #endif
449
450
451 /***************************************************************************
452 ***************************************************************************/
453
454
455
456 /*************************************************
457 * Match from current position *
458 *************************************************/
459
460 /* This function is called recursively in many circumstances. Whenever it
461 returns a negative (error) response, the outer incarnation must also return the
462 same response. */
463
464 /* These macros pack up tests that are used for partial matching, and which
465 appear several times in the code. We set the "hit end" flag if the pointer is
466 at the end of the subject and also past the start of the subject (i.e.
467 something has been matched). For hard partial matching, we then return
468 immediately. The second one is used when we already know we are past the end of
469 the subject. */
470
/* CHECK_PARTIAL: does nothing unless partial matching is enabled (md->partial
is non-zero), eptr is at or beyond md->end_subject, and eptr is beyond
md->start_used_ptr. In that case md->hitend is set, and if md->partial is
greater than 1 (hard partial matching) PCRE_ERROR_PARTIAL is returned through
RRETURN. The expansion is a bare "if" statement, not wrapped in braces. */
471 #define CHECK_PARTIAL()\
472 if (md->partial != 0 && eptr >= md->end_subject && \
473 eptr > md->start_used_ptr) \
474 { \
475 md->hitend = TRUE; \
476 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
477 }
478
/* SCHECK_PARTIAL: the same as CHECK_PARTIAL but without the test of eptr
against md->end_subject, for use where the end of the subject is already
known to have been reached. */
479 #define SCHECK_PARTIAL()\
480 if (md->partial != 0 && eptr > md->start_used_ptr) \
481 { \
482 md->hitend = TRUE; \
483 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
484 }
485
486
487 /* Performance note: It might be tempting to extract commonly used fields from
488 the md structure (e.g. utf, end_subject) into individual variables to improve
489 performance. Tests using gcc on a SPARC disproved this; in the first case, it
490 made performance worse.
491
492 Arguments:
493 eptr pointer to current character in subject
494 ecode pointer to current position in compiled code
495 mstart pointer to the current match start position (can be modified
496 by encountering \K)
497 offset_top current top pointer
498 md pointer to "static" info for the match
499 eptrb pointer to chain of blocks containing eptr at start of
500 brackets - for testing for empty matches
501 rdepth the recursion depth
502
503 Returns: MATCH_MATCH if matched ) these values are >= 0
504 MATCH_NOMATCH if failed to match )
505 a negative MATCH_xxx value for PRUNE, SKIP, etc
506 a negative PCRE_ERROR_xxx value if aborted by an error condition
507 (e.g. stopped by repeated call or recursion limit)
508 */
509
510 static int
511 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
512 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
513 unsigned int rdepth)
514 {
515 /* These variables do not need to be preserved over recursion in this function,
516 so they can be ordinary variables in all cases. Mark some of them with
517 "register" because they are used a lot in loops. */
518
519 register int rrc; /* Returns from recursive calls */
520 register int i; /* Used for loops not involving calls to RMATCH() */
521 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
522 register BOOL utf; /* Local copy of UTF flag for speed */
523
524 BOOL minimize, possessive; /* Quantifier options */
525 BOOL caseless;
526 int condcode;
527
528 /* When recursion is not being used, all "local" variables that have to be
529 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
530 frame on the stack here; subsequent instantiations are obtained from the heap
531 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
532 the top-level on the stack rather than malloc-ing them all gives a performance
533 boost in many cases where there is not much "recursion". */
534
535 #ifdef NO_RECURSE
536 heapframe *frame = (heapframe *)md->match_frames_base;
537
538 /* Copy in the original argument variables */
539
540 frame->Xeptr = eptr;
541 frame->Xecode = ecode;
542 frame->Xmstart = mstart;
543 frame->Xoffset_top = offset_top;
544 frame->Xeptrb = eptrb;
545 frame->Xrdepth = rdepth;
546
547 /* This is where control jumps back to in order to effect "recursion" */
548
549 HEAP_RECURSE:
550
551 /* Macros make the argument variables come from the current frame */
552
553 #define eptr frame->Xeptr
554 #define ecode frame->Xecode
555 #define mstart frame->Xmstart
556 #define offset_top frame->Xoffset_top
557 #define eptrb frame->Xeptrb
558 #define rdepth frame->Xrdepth
559
560 /* Ditto for the local variables */
561
562 #ifdef SUPPORT_UTF
563 #define charptr frame->Xcharptr
564 #endif
565 #define callpat frame->Xcallpat
566 #define codelink frame->Xcodelink
567 #define data frame->Xdata
568 #define next frame->Xnext
569 #define pp frame->Xpp
570 #define prev frame->Xprev
571 #define saved_eptr frame->Xsaved_eptr
572
573 #define new_recursive frame->Xnew_recursive
574
575 #define cur_is_word frame->Xcur_is_word
576 #define condition frame->Xcondition
577 #define prev_is_word frame->Xprev_is_word
578
579 #ifdef SUPPORT_UCP
580 #define prop_type frame->Xprop_type
581 #define prop_value frame->Xprop_value
582 #define prop_fail_result frame->Xprop_fail_result
583 #define oclength frame->Xoclength
584 #define occhars frame->Xocchars
585 #endif
586
587 #define ctype frame->Xctype
588 #define fc frame->Xfc
589 #define fi frame->Xfi
590 #define length frame->Xlength
591 #define max frame->Xmax
592 #define min frame->Xmin
593 #define number frame->Xnumber
594 #define offset frame->Xoffset
595 #define op frame->Xop
596 #define save_capture_last frame->Xsave_capture_last
597 #define save_offset1 frame->Xsave_offset1
598 #define save_offset2 frame->Xsave_offset2
599 #define save_offset3 frame->Xsave_offset3
600 #define stacksave frame->Xstacksave
601
602 #define newptrb frame->Xnewptrb
603
604 /* When recursion is being used, local variables are allocated on the stack and
605 get preserved during recursion in the normal way. In this environment, fi and
606 i, and fc and c, can be the same variables. */
607
608 #else /* NO_RECURSE not defined */
609 #define fi i
610 #define fc c
611
612 /* Many of the following variables are used only in small blocks of the code.
613 My normal style of coding would have declared them within each of those blocks.
614 However, in order to accommodate the version of this code that uses an external
615 "stack" implemented on the heap, it is easier to declare them all here, so the
616 declarations can be cut out in a block. The only declarations within blocks
617 below are for variables that do not have to be preserved over a recursive call
618 to RMATCH(). */
619
620 #ifdef SUPPORT_UTF
621 const pcre_uchar *charptr;
622 #endif
623 const pcre_uchar *callpat;
624 const pcre_uchar *data;
625 const pcre_uchar *next;
626 PCRE_PUCHAR pp;
627 const pcre_uchar *prev;
628 PCRE_PUCHAR saved_eptr;
629
630 recursion_info new_recursive;
631
632 BOOL cur_is_word;
633 BOOL condition;
634 BOOL prev_is_word;
635
636 #ifdef SUPPORT_UCP
637 int prop_type;
638 unsigned int prop_value;
639 int prop_fail_result;
640 int oclength;
641 pcre_uchar occhars[6];
642 #endif
643
644 int codelink;
645 int ctype;
646 int length;
647 int max;
648 int min;
649 unsigned int number;
650 int offset;
651 unsigned int op;
652 pcre_int32 save_capture_last;
653 int save_offset1, save_offset2, save_offset3;
654 int stacksave[REC_STACK_SAVE_MAX];
655
656 eptrblock newptrb;
657
658 /* There is a special fudge for calling match() in a way that causes it to
659 measure the size of its basic stack frame when the stack is being used for
660 recursion. The second argument (ecode) being NULL triggers this behaviour. It
661 cannot normally ever be NULL. The return is the negated value of the frame
662 size. */
663
664 if (ecode == NULL)
665 {
666 if (rdepth == 0)
667 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
668 else
669 {
670 int len = (char *)&rdepth - (char *)eptr;
671 return (len > 0)? -len : len;
672 }
673 }
674 #endif /* NO_RECURSE */
675
676 /* To save space on the stack and in the heap frame, I have doubled up on some
677 of the local variables that are used only in localised parts of the code, but
678 still need to be preserved over recursive calls of match(). These macros define
679 the alternative names that are used. */
680
681 #define allow_zero cur_is_word
682 #define cbegroup condition
683 #define code_offset codelink
684 #define condassert condition
685 #define matched_once prev_is_word
686 #define foc number
687 #define save_mark data
688
689 /* These statements are here to stop the compiler complaining about uninitialized
690 variables. */
691
692 #ifdef SUPPORT_UCP
693 prop_value = 0;
694 prop_fail_result = 0;
695 #endif
696
697
698 /* This label is used for tail recursion, which is used in a few cases even
699 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
700 used. Thanks to Ian Taylor for noticing this possibility and sending the
701 original patch. */
702
703 TAIL_RECURSE:
704
705 /* OK, now we can get on with the real code of the function. Recursive calls
706 are specified by the macro RMATCH and RRETURN is used to return. When
707 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
708 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
709 defined). However, RMATCH isn't like a function call because it's quite a
710 complicated macro. It has to be used in one particular way. This shouldn't,
711 however, impact performance when true recursion is being used. */
712
713 #ifdef SUPPORT_UTF
714 utf = md->utf; /* Local copy of the flag */
715 #else
716 utf = FALSE;
717 #endif
718
719 /* First check that we haven't called match() too many times, or that we
720 haven't exceeded the recursive call limit. */
721
722 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
723 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
724
725 /* At the start of a group with an unlimited repeat that may match an empty
726 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
727 done this way to save having to use another function argument, which would take
728 up space on the stack. See also MATCH_CONDASSERT below.
729
730 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
731 such remembered pointers, to be checked when we hit the closing ket, in order
732 to break infinite loops that match no characters. When match() is called in
733 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
734 NOT be used with tail recursion, because the memory block that is used is on
735 the stack, so a new one may be required for each match(). */
736
737 if (md->match_function_type == MATCH_CBEGROUP)
738 {
739 newptrb.epb_saved_eptr = eptr;
740 newptrb.epb_prev = eptrb;
741 eptrb = &newptrb;
742 md->match_function_type = 0;
743 }
744
745 /* Now start processing the opcodes. */
746
747 for (;;)
748 {
749 minimize = possessive = FALSE;
750 op = *ecode;
751
752 switch(op)
753 {
754 case OP_MARK:
755 md->nomatch_mark = ecode + 2;
756 md->mark = NULL; /* In case previously set by assertion */
757 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
758 eptrb, RM55);
759 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
760 md->mark == NULL) md->mark = ecode + 2;
761
762 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
763 argument, and we must check whether that argument matches this MARK's
764 argument. It is passed back in md->start_match_ptr (an overloading of that
765 variable). If it does match, we reset that variable to the current subject
766 position and return MATCH_SKIP. Otherwise, pass back the return code
767 unaltered. */
768
769 else if (rrc == MATCH_SKIP_ARG &&
770 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
771 {
772 md->start_match_ptr = eptr;
773 RRETURN(MATCH_SKIP);
774 }
775 RRETURN(rrc);
776
777 case OP_FAIL:
778 RRETURN(MATCH_NOMATCH);
779
780 /* COMMIT overrides PRUNE, SKIP, and THEN */
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
786 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
787 rrc != MATCH_THEN)
788 RRETURN(rrc);
789 RRETURN(MATCH_COMMIT);
790
791 /* PRUNE overrides THEN */
792
793 case OP_PRUNE:
794 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
795 eptrb, RM51);
796 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
797 RRETURN(MATCH_PRUNE);
798
799 case OP_PRUNE_ARG:
800 md->nomatch_mark = ecode + 2;
801 md->mark = NULL; /* In case previously set by assertion */
802 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
803 eptrb, RM56);
804 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
805 md->mark == NULL) md->mark = ecode + 2;
806 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
807 RRETURN(MATCH_PRUNE);
808
809 /* SKIP overrides PRUNE and THEN */
810
811 case OP_SKIP:
812 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
813 eptrb, RM53);
814 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
815 RRETURN(rrc);
816 md->start_match_ptr = eptr; /* Pass back current position */
817 RRETURN(MATCH_SKIP);
818
819 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
820 nomatch_mark. There is a flag that disables this opcode when re-matching a
821 pattern that ended with a SKIP for which there was not a matching MARK. */
822
823 case OP_SKIP_ARG:
824 if (md->ignore_skip_arg)
825 {
826 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
827 break;
828 }
829 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
830 eptrb, RM57);
831 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
832 RRETURN(rrc);
833
834 /* Pass back the current skip name by overloading md->start_match_ptr and
835 returning the special MATCH_SKIP_ARG return code. This will either be
836 caught by a matching MARK, or get to the top, where it causes a rematch
837 with the md->ignore_skip_arg flag set. */
838
839 md->start_match_ptr = ecode + 2;
840 RRETURN(MATCH_SKIP_ARG);
841
842 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
843 the branch in which it occurs can be determined. Overload the start of
844 match pointer to do this. */
845
846 case OP_THEN:
847 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
848 eptrb, RM54);
849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
850 md->start_match_ptr = ecode;
851 RRETURN(MATCH_THEN);
852
853 case OP_THEN_ARG:
854 md->nomatch_mark = ecode + 2;
855 md->mark = NULL; /* In case previously set by assertion */
856 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
857 md, eptrb, RM58);
858 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
859 md->mark == NULL) md->mark = ecode + 2;
860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
861 md->start_match_ptr = ecode;
862 RRETURN(MATCH_THEN);
863
864 /* Handle an atomic group that does not contain any capturing parentheses.
865 This can be handled like an assertion. Prior to 8.13, all atomic groups
866 were handled this way. In 8.13, the code was changed as below for ONCE, so
867 that backups pass through the group and thereby reset captured values.
868 However, this uses a lot more stack, so in 8.20, atomic groups that do not
869 contain any captures generate OP_ONCE_NC, which can be handled in the old,
870 less stack intensive way.
871
872 Check the alternative branches in turn - the matching won't pass the KET
873 for this kind of subpattern. If any one branch matches, we carry on as at
874 the end of a normal bracket, leaving the subject pointer, but resetting
875 the start-of-match value in case it was changed by \K. */
876
877 case OP_ONCE_NC:
878 prev = ecode;
879 saved_eptr = eptr;
880 save_mark = md->mark;
881 do
882 {
883 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
884 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
885 {
886 mstart = md->start_match_ptr;
887 break;
888 }
889 if (rrc == MATCH_THEN)
890 {
891 next = ecode + GET(ecode,1);
892 if (md->start_match_ptr < next &&
893 (*ecode == OP_ALT || *next == OP_ALT))
894 rrc = MATCH_NOMATCH;
895 }
896
897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
898 ecode += GET(ecode,1);
899 md->mark = save_mark;
900 }
901 while (*ecode == OP_ALT);
902
903 /* If we hit the end of the group (which could be repeated), fail */
904
905 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
906
907 /* Continue as from after the group, updating the offsets high water
908 mark, since extracts may have been taken. */
909
910 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
911
912 offset_top = md->end_offset_top;
913 eptr = md->end_match_ptr;
914
915 /* For a non-repeating ket, just continue at this level. This also
916 happens for a repeating ket if no characters were matched in the group.
917 This is the forcible breaking of infinite loops as implemented in Perl
918 5.005. */
919
920 if (*ecode == OP_KET || eptr == saved_eptr)
921 {
922 ecode += 1+LINK_SIZE;
923 break;
924 }
925
926 /* The repeating kets try the rest of the pattern or restart from the
927 preceding bracket, in the appropriate order. The second "call" of match()
928 uses tail recursion, to avoid using another stack frame. */
929
930 if (*ecode == OP_KETRMIN)
931 {
932 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
934 ecode = prev;
935 goto TAIL_RECURSE;
936 }
937 else /* OP_KETRMAX */
938 {
939 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
941 ecode += 1 + LINK_SIZE;
942 goto TAIL_RECURSE;
943 }
944 /* Control never gets here */
945
946 /* Handle a capturing bracket, other than those that are possessive with an
947 unlimited repeat. If there is space in the offset vector, save the current
948 subject position in the working slot at the top of the vector. We mustn't
949 change the current values of the data slot, because they may be set from a
950 previous iteration of this group, and be referred to by a reference inside
951 the group. A failure to match might occur after the group has succeeded,
952 if something later on doesn't match. For this reason, we need to restore
953 the working value and also the values of the final offsets, in case they
954 were set by a previous iteration of the same bracket.
955
956 If there isn't enough space in the offset vector, treat this as if it were
957 a non-capturing bracket. Don't worry about setting the flag for the error
958 case here; that is handled in the code for KET. */
959
960 case OP_CBRA:
961 case OP_SCBRA:
962 number = GET2(ecode, 1+LINK_SIZE);
963 offset = number << 1;
964
965 #ifdef PCRE_DEBUG
966 printf("start bracket %d\n", number);
967 printf("subject=");
968 pchars(eptr, 16, TRUE, md);
969 printf("\n");
970 #endif
971
972 if (offset < md->offset_max)
973 {
974 save_offset1 = md->offset_vector[offset];
975 save_offset2 = md->offset_vector[offset+1];
976 save_offset3 = md->offset_vector[md->offset_end - number];
977 save_capture_last = md->capture_last;
978 save_mark = md->mark;
979
980 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
981 md->offset_vector[md->offset_end - number] =
982 (int)(eptr - md->start_subject);
983
984 for (;;)
985 {
986 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
987 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
988 eptrb, RM1);
989 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
990
991 /* If we backed up to a THEN, check whether it is within the current
992 branch by comparing the address of the THEN that is passed back with
993 the end of the branch. If it is within the current branch, and the
994 branch is one of two or more alternatives (it either starts or ends
995 with OP_ALT), we have reached the limit of THEN's action, so convert
996 the return code to NOMATCH, which will cause normal backtracking to
997 happen from now on. Otherwise, THEN is passed back to an outer
998 alternative. This implements Perl's treatment of parenthesized groups,
999 where a group not containing | does not affect the current alternative,
1000 that is, (X) is NOT the same as (X|(*F)). */
1001
1002 if (rrc == MATCH_THEN)
1003 {
1004 next = ecode + GET(ecode,1);
1005 if (md->start_match_ptr < next &&
1006 (*ecode == OP_ALT || *next == OP_ALT))
1007 rrc = MATCH_NOMATCH;
1008 }
1009
1010 /* Anything other than NOMATCH is passed back. */
1011
1012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1013 md->capture_last = save_capture_last;
1014 ecode += GET(ecode, 1);
1015 md->mark = save_mark;
1016 if (*ecode != OP_ALT) break;
1017 }
1018
1019 DPRINTF(("bracket %d failed\n", number));
1020 md->offset_vector[offset] = save_offset1;
1021 md->offset_vector[offset+1] = save_offset2;
1022 md->offset_vector[md->offset_end - number] = save_offset3;
1023
1024 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1025
1026 RRETURN(rrc);
1027 }
1028
1029 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1030 as a non-capturing bracket. */
1031
1032 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034
1035 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1036
1037 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1038 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1039
1040 /* Non-capturing or atomic group, except for possessive with unlimited
1041 repeat and ONCE group with no captures. Loop for all the alternatives.
1042
1043 When we get to the final alternative within the brackets, we used to return
1044 the result of a recursive call to match() whatever happened so it was
1045 possible to reduce stack usage by turning this into a tail recursion,
1046 except in the case of a possibly empty group. However, now that there is
1047 the possibility of (*THEN) occurring in the final alternative, this
1048 optimization is no longer always possible.
1049
1050 We can optimize if we know there are no (*THEN)s in the pattern; at present
1051 this is the best that can be done.
1052
1053 MATCH_ONCE is returned when the end of an atomic group is successfully
1054 reached, but subsequent matching fails. It passes back up the tree (causing
1055 captured values to be reset) until the original atomic group level is
1056 reached. This is tested by comparing md->once_target with the start of the
1057 group. At this point, the return is converted into MATCH_NOMATCH so that
1058 previous backup points can be taken. */
1059
1060 case OP_ONCE:
1061 case OP_BRA:
1062 case OP_SBRA:
1063 DPRINTF(("start non-capturing bracket\n"));
1064
1065 for (;;)
1066 {
1067 if (op >= OP_SBRA || op == OP_ONCE)
1068 md->match_function_type = MATCH_CBEGROUP;
1069
1070 /* If this is not a possibly empty group, and there are no (*THEN)s in
1071 the pattern, and this is the final alternative, optimize as described
1072 above. */
1073
1074 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1075 {
1076 ecode += PRIV(OP_lengths)[*ecode];
1077 goto TAIL_RECURSE;
1078 }
1079
1080 /* In all other cases, we have to make another call to match(). */
1081
1082 save_mark = md->mark;
1083 save_capture_last = md->capture_last;
1084 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1085 RM2);
1086
1087 /* See comment in the code for capturing groups above about handling
1088 THEN. */
1089
1090 if (rrc == MATCH_THEN)
1091 {
1092 next = ecode + GET(ecode,1);
1093 if (md->start_match_ptr < next &&
1094 (*ecode == OP_ALT || *next == OP_ALT))
1095 rrc = MATCH_NOMATCH;
1096 }
1097
1098 if (rrc != MATCH_NOMATCH)
1099 {
1100 if (rrc == MATCH_ONCE)
1101 {
1102 const pcre_uchar *scode = ecode;
1103 if (*scode != OP_ONCE) /* If not at start, find it */
1104 {
1105 while (*scode == OP_ALT) scode += GET(scode, 1);
1106 scode -= GET(scode, 1);
1107 }
1108 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1109 }
1110 RRETURN(rrc);
1111 }
1112 ecode += GET(ecode, 1);
1113 md->mark = save_mark;
1114 if (*ecode != OP_ALT) break;
1115 md->capture_last = save_capture_last;
1116 }
1117
1118 RRETURN(MATCH_NOMATCH);
1119
1120 /* Handle possessive capturing brackets with an unlimited repeat. We come
1121 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1122 handled similarly to the normal case above. However, the matching is
1123 different. The end of these brackets will always be OP_KETRPOS, which
1124 returns MATCH_KETRPOS without going further in the pattern. By this means
1125 we can handle the group by iteration rather than recursion, thereby
1126 reducing the amount of stack needed. */
1127
1128 case OP_CBRAPOS:
1129 case OP_SCBRAPOS:
1130 allow_zero = FALSE;
1131
1132 POSSESSIVE_CAPTURE:
1133 number = GET2(ecode, 1+LINK_SIZE);
1134 offset = number << 1;
1135
1136 #ifdef PCRE_DEBUG
1137 printf("start possessive bracket %d\n", number);
1138 printf("subject=");
1139 pchars(eptr, 16, TRUE, md);
1140 printf("\n");
1141 #endif
1142
1143 if (offset < md->offset_max)
1144 {
1145 matched_once = FALSE;
1146 code_offset = (int)(ecode - md->start_code);
1147
1148 save_offset1 = md->offset_vector[offset];
1149 save_offset2 = md->offset_vector[offset+1];
1150 save_offset3 = md->offset_vector[md->offset_end - number];
1151 save_capture_last = md->capture_last;
1152
1153 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1154
1155 /* Each time round the loop, save the current subject position for use
1156 when the group matches. For MATCH_MATCH, the group has matched, so we
1157 restart it with a new subject starting position, remembering that we had
1158 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1159 usual. If we haven't matched any alternatives in any iteration, check to
1160 see if a previous iteration matched. If so, the group has matched;
1161 continue from afterwards. Otherwise it has failed; restore the previous
1162 capture values before returning NOMATCH. */
1163
1164 for (;;)
1165 {
1166 md->offset_vector[md->offset_end - number] =
1167 (int)(eptr - md->start_subject);
1168 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1169 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1170 eptrb, RM63);
1171 if (rrc == MATCH_KETRPOS)
1172 {
1173 offset_top = md->end_offset_top;
1174 eptr = md->end_match_ptr;
1175 ecode = md->start_code + code_offset;
1176 save_capture_last = md->capture_last;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 md->capture_last = save_capture_last;
1194 ecode += GET(ecode, 1);
1195 if (*ecode != OP_ALT) break;
1196 }
1197
1198 if (!matched_once)
1199 {
1200 md->offset_vector[offset] = save_offset1;
1201 md->offset_vector[offset+1] = save_offset2;
1202 md->offset_vector[md->offset_end - number] = save_offset3;
1203 }
1204
1205 if (allow_zero || matched_once)
1206 {
1207 ecode += 1 + LINK_SIZE;
1208 break;
1209 }
1210
1211 RRETURN(MATCH_NOMATCH);
1212 }
1213
1214 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1215 as a non-capturing bracket. */
1216
1217 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1218 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1219
1220 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1221
1222 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1223 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1224
1225 /* Non-capturing possessive bracket with unlimited repeat. We come here
1226 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1227 without the capturing complication. It is written out separately for speed
1228 and cleanliness. */
1229
1230 case OP_BRAPOS:
1231 case OP_SBRAPOS:
1232 allow_zero = FALSE;
1233
1234 POSSESSIVE_NON_CAPTURE:
1235 matched_once = FALSE;
1236 code_offset = (int)(ecode - md->start_code);
1237 save_capture_last = md->capture_last;
1238
1239 for (;;)
1240 {
1241 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1242 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1243 eptrb, RM48);
1244 if (rrc == MATCH_KETRPOS)
1245 {
1246 offset_top = md->end_offset_top;
1247 eptr = md->end_match_ptr;
1248 ecode = md->start_code + code_offset;
1249 matched_once = TRUE;
1250 continue;
1251 }
1252
1253 /* See comment in the code for capturing groups above about handling
1254 THEN. */
1255
1256 if (rrc == MATCH_THEN)
1257 {
1258 next = ecode + GET(ecode,1);
1259 if (md->start_match_ptr < next &&
1260 (*ecode == OP_ALT || *next == OP_ALT))
1261 rrc = MATCH_NOMATCH;
1262 }
1263
1264 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1265 ecode += GET(ecode, 1);
1266 if (*ecode != OP_ALT) break;
1267 md->capture_last = save_capture_last;
1268 }
1269
1270 if (matched_once || allow_zero)
1271 {
1272 ecode += 1 + LINK_SIZE;
1273 break;
1274 }
1275 RRETURN(MATCH_NOMATCH);
1276
1277 /* Control never reaches here. */
1278
1279 /* Conditional group: compilation checked that there are no more than
1280 two branches. If the condition is false, skipping the first branch takes us
1281 past the end if there is only one branch, but that's OK because that is
1282 exactly what going to the ket would do. */
1283
1284 case OP_COND:
1285 case OP_SCOND:
1286 codelink = GET(ecode, 1);
1287
1288 /* Because of the way auto-callout works during compile, a callout item is
1289 inserted between OP_COND and an assertion condition. */
1290
1291 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1292 {
1293 if (PUBL(callout) != NULL)
1294 {
1295 PUBL(callout_block) cb;
1296 cb.version = 2; /* Version 2 of the callout block */
1297 cb.callout_number = ecode[LINK_SIZE+2];
1298 cb.offset_vector = md->offset_vector;
1299 #if defined COMPILE_PCRE8
1300 cb.subject = (PCRE_SPTR)md->start_subject;
1301 #elif defined COMPILE_PCRE16
1302 cb.subject = (PCRE_SPTR16)md->start_subject;
1303 #elif defined COMPILE_PCRE32
1304 cb.subject = (PCRE_SPTR32)md->start_subject;
1305 #endif
1306 cb.subject_length = (int)(md->end_subject - md->start_subject);
1307 cb.start_match = (int)(mstart - md->start_subject);
1308 cb.current_position = (int)(eptr - md->start_subject);
1309 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1310 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1311 cb.capture_top = offset_top/2;
1312 cb.capture_last = md->capture_last & CAPLMASK;
1313 /* Internal change requires this for API compatibility. */
1314 if (cb.capture_last == 0) cb.capture_last = -1;
1315 cb.callout_data = md->callout_data;
1316 cb.mark = md->nomatch_mark;
1317 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1318 if (rrc < 0) RRETURN(rrc);
1319 }
1320 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1321 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1322 }
1323
1324 condcode = ecode[LINK_SIZE+1];
1325
1326 /* Now see what the actual condition is */
1327
1328 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1329 {
1330 if (md->recursive == NULL) /* Not recursing => FALSE */
1331 {
1332 condition = FALSE;
1333 ecode += GET(ecode, 1);
1334 }
1335 else
1336 {
1337 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1338 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1339
1340 /* If the test is for recursion into a specific subpattern, and it is
1341 false, but the test was set up by name, scan the table to see if the
1342 name refers to any other numbers, and test them. The condition is true
1343 if any one is set. */
1344
1345 if (!condition && condcode == OP_NRREF)
1346 {
1347 pcre_uchar *slotA = md->name_table;
1348 for (i = 0; i < md->name_count; i++)
1349 {
1350 if (GET2(slotA, 0) == recno) break;
1351 slotA += md->name_entry_size;
1352 }
1353
1354 /* Found a name for the number - there can be only one; duplicate
1355 names for different numbers are allowed, but not vice versa. First
1356 scan down for duplicates. */
1357
1358 if (i < md->name_count)
1359 {
1360 pcre_uchar *slotB = slotA;
1361 while (slotB > md->name_table)
1362 {
1363 slotB -= md->name_entry_size;
1364 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1365 {
1366 condition = GET2(slotB, 0) == md->recursive->group_num;
1367 if (condition) break;
1368 }
1369 else break;
1370 }
1371
1372 /* Scan up for duplicates */
1373
1374 if (!condition)
1375 {
1376 slotB = slotA;
1377 for (i++; i < md->name_count; i++)
1378 {
1379 slotB += md->name_entry_size;
1380 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1381 {
1382 condition = GET2(slotB, 0) == md->recursive->group_num;
1383 if (condition) break;
1384 }
1385 else break;
1386 }
1387 }
1388 }
1389 }
1390
1391 /* Choose branch according to the condition */
1392
1393 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1394 }
1395 }
1396
1397 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1398 {
1399 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1400 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1401
1402 /* If the numbered capture is unset, but the reference was by name,
1403 scan the table to see if the name refers to any other numbers, and test
1404 them. The condition is true if any one is set. This is tediously similar
1405 to the code above, but not close enough to try to amalgamate. */
1406
1407 if (!condition && condcode == OP_NCREF)
1408 {
1409 unsigned int refno = offset >> 1;
1410 pcre_uchar *slotA = md->name_table;
1411
1412 for (i = 0; i < md->name_count; i++)
1413 {
1414 if (GET2(slotA, 0) == refno) break;
1415 slotA += md->name_entry_size;
1416 }
1417
1418 /* Found a name for the number - there can be only one; duplicate names
1419 for different numbers are allowed, but not vice versa. First scan down
1420 for duplicates. */
1421
1422 if (i < md->name_count)
1423 {
1424 pcre_uchar *slotB = slotA;
1425 while (slotB > md->name_table)
1426 {
1427 slotB -= md->name_entry_size;
1428 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1429 {
1430 offset = GET2(slotB, 0) << 1;
1431 condition = offset < offset_top &&
1432 md->offset_vector[offset] >= 0;
1433 if (condition) break;
1434 }
1435 else break;
1436 }
1437
1438 /* Scan up for duplicates */
1439
1440 if (!condition)
1441 {
1442 slotB = slotA;
1443 for (i++; i < md->name_count; i++)
1444 {
1445 slotB += md->name_entry_size;
1446 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1447 {
1448 offset = GET2(slotB, 0) << 1;
1449 condition = offset < offset_top &&
1450 md->offset_vector[offset] >= 0;
1451 if (condition) break;
1452 }
1453 else break;
1454 }
1455 }
1456 }
1457 }
1458
1459 /* Choose branch according to the condition */
1460
1461 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1462 }
1463
1464 else if (condcode == OP_DEF) /* DEFINE - always false */
1465 {
1466 condition = FALSE;
1467 ecode += GET(ecode, 1);
1468 }
1469
1470 /* The condition is an assertion. Call match() to evaluate it - setting
1471 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1472 an assertion. */
1473
1474 else
1475 {
1476 md->match_function_type = MATCH_CONDASSERT;
1477 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1478 if (rrc == MATCH_MATCH)
1479 {
1480 if (md->end_offset_top > offset_top)
1481 offset_top = md->end_offset_top; /* Captures may have happened */
1482 condition = TRUE;
1483 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1484 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1485 }
1486
1487 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1488 assertion; it is therefore treated as NOMATCH. */
1489
1490 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1491 {
1492 RRETURN(rrc); /* Need braces because of following else */
1493 }
1494 else
1495 {
1496 condition = FALSE;
1497 ecode += codelink;
1498 }
1499 }
1500
1501 /* We are now at the branch that is to be obeyed. As there is only one, can
1502 use tail recursion to avoid using another stack frame, except when there is
1503 unlimited repeat of a possibly empty group. In the latter case, a recursive
1504 call to match() is always required, unless the second alternative doesn't
1505 exist, in which case we can just plough on. Note that, for compatibility
1506 with Perl, the | in a conditional group is NOT treated as creating two
1507 alternatives. If a THEN is encountered in the branch, it propagates out to
1508 the enclosing alternative (unless nested in a deeper set of alternatives,
1509 of course). */
1510
1511 if (condition || *ecode == OP_ALT)
1512 {
1513 if (op != OP_SCOND)
1514 {
1515 ecode += 1 + LINK_SIZE;
1516 goto TAIL_RECURSE;
1517 }
1518
1519 md->match_function_type = MATCH_CBEGROUP;
1520 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1521 RRETURN(rrc);
1522 }
1523
1524 /* Condition false & no alternative; continue after the group. */
1525
1526 else
1527 {
1528 ecode += 1 + LINK_SIZE;
1529 }
1530 break;
1531
1532
1533 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1534 to close any currently open capturing brackets. */
1535
1536 case OP_CLOSE:
1537 number = GET2(ecode, 1); /* Must be less than 65536 */
1538 offset = number << 1;
1539
1540 #ifdef PCRE_DEBUG
1541 printf("end bracket %d at *ACCEPT", number);
1542 printf("\n");
1543 #endif
1544
1545 md->capture_last = (md->capture_last & OVFLMASK) | number;
1546 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1547 {
1548 md->offset_vector[offset] =
1549 md->offset_vector[md->offset_end - number];
1550 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1551 if (offset_top <= offset) offset_top = offset + 2;
1552 }
1553 ecode += 1 + IMM2_SIZE;
1554 break;
1555
1556
1557 /* End of the pattern, either real or forced. */
1558
1559 case OP_END:
1560 case OP_ACCEPT:
1561 case OP_ASSERT_ACCEPT:
1562
1563 /* If we have matched an empty string, fail if not in an assertion and not
1564 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1565 is set and we have matched at the start of the subject. In both cases,
1566 backtracking will then try other alternatives, if any. */
1567
1568 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1569 md->recursive == NULL &&
1570 (md->notempty ||
1571 (md->notempty_atstart &&
1572 mstart == md->start_subject + md->start_offset)))
1573 RRETURN(MATCH_NOMATCH);
1574
1575 /* Otherwise, we have a match. */
1576
1577 md->end_match_ptr = eptr; /* Record where we ended */
1578 md->end_offset_top = offset_top; /* and how many extracts were taken */
1579 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1580
1581 /* For some reason, the macros don't work properly if an expression is
1582 given as the argument to RRETURN when the heap is in use. */
1583
1584 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1585 RRETURN(rrc);
1586
1587 /* Assertion brackets. Check the alternative branches in turn - the
1588 matching won't pass the KET for an assertion. If any one branch matches,
1589 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1590 start of each branch to move the current point backwards, so the code at
1591 this level is identical to the lookahead case. When the assertion is part
1592 of a condition, we want to return immediately afterwards. The caller of
1593 this incarnation of the match() function will have set MATCH_CONDASSERT in
1594 md->match_function_type, and one of these opcodes will be the first opcode
1595 that is processed. We use a local variable that is preserved over calls to
1596 match() to remember this case. */
1597
1598 case OP_ASSERT:
1599 case OP_ASSERTBACK:
1600 save_mark = md->mark;
1601 if (md->match_function_type == MATCH_CONDASSERT)
1602 {
1603 condassert = TRUE;
1604 md->match_function_type = 0;
1605 }
1606 else condassert = FALSE;
1607
1608 do
1609 {
1610 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1611 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1612 {
1613 mstart = md->start_match_ptr; /* In case \K reset it */
1614 break;
1615 }
1616 md->mark = save_mark;
1617
1618 /* A COMMIT failure must fail the entire assertion, without trying any
1619 subsequent branches. */
1620
1621 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1622
1623 /* PCRE does not allow THEN to escape beyond an assertion; it
1624 is treated as NOMATCH. */
1625
1626 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1627 ecode += GET(ecode, 1);
1628 }
1629 while (*ecode == OP_ALT);
1630
1631 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1632
1633 /* If checking an assertion for a condition, return MATCH_MATCH. */
1634
1635 if (condassert) RRETURN(MATCH_MATCH);
1636
1637 /* Continue from after the assertion, updating the offsets high water
1638 mark, since extracts may have been taken during the assertion. */
1639
1640 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1641 ecode += 1 + LINK_SIZE;
1642 offset_top = md->end_offset_top;
1643 continue;
1644
1645 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1646 PRUNE, or COMMIT means we must assume failure without checking subsequent
1647 branches. */
1648
1649 case OP_ASSERT_NOT:
1650 case OP_ASSERTBACK_NOT:
1651 save_mark = md->mark;
1652 if (md->match_function_type == MATCH_CONDASSERT)
1653 {
1654 condassert = TRUE;
1655 md->match_function_type = 0;
1656 }
1657 else condassert = FALSE;
1658
1659 do
1660 {
1661 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1662 md->mark = save_mark;
1663 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1664 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1665 {
1666 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1667 break;
1668 }
1669
1670 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1671 as NOMATCH. */
1672
1673 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1674 ecode += GET(ecode,1);
1675 }
1676 while (*ecode == OP_ALT);
1677
1678 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1679
1680 ecode += 1 + LINK_SIZE;
1681 continue;
1682
1683 /* Move the subject pointer back. This occurs only at the start of
1684 each branch of a lookbehind assertion. If we are too close to the start to
1685 move back, this match function fails. When working with UTF-8 we move
1686 back a number of characters, not bytes. */
1687
1688 case OP_REVERSE:
1689 #ifdef SUPPORT_UTF
1690 if (utf)
1691 {
1692 i = GET(ecode, 1);
1693 while (i-- > 0)
1694 {
1695 eptr--;
1696 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1697 BACKCHAR(eptr);
1698 }
1699 }
1700 else
1701 #endif
1702
1703 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1704
1705 {
1706 eptr -= GET(ecode, 1);
1707 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1708 }
1709
1710 /* Save the earliest consulted character, then skip to next op code */
1711
1712 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1713 ecode += 1 + LINK_SIZE;
1714 break;
1715
1716 /* The callout item calls an external function, if one is provided, passing
1717 details of the match so far. This is mainly for debugging, though the
1718 function is able to force a failure. */
1719
1720 case OP_CALLOUT:
1721 if (PUBL(callout) != NULL)
1722 {
1723 PUBL(callout_block) cb;
1724 cb.version = 2; /* Version 2 of the callout block */
1725 cb.callout_number = ecode[1];
1726 cb.offset_vector = md->offset_vector;
1727 #if defined COMPILE_PCRE8
1728 cb.subject = (PCRE_SPTR)md->start_subject;
1729 #elif defined COMPILE_PCRE16
1730 cb.subject = (PCRE_SPTR16)md->start_subject;
1731 #elif defined COMPILE_PCRE32
1732 cb.subject = (PCRE_SPTR32)md->start_subject;
1733 #endif
1734 cb.subject_length = (int)(md->end_subject - md->start_subject);
1735 cb.start_match = (int)(mstart - md->start_subject);
1736 cb.current_position = (int)(eptr - md->start_subject);
1737 cb.pattern_position = GET(ecode, 2);
1738 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1739 cb.capture_top = offset_top/2;
1740 cb.capture_last = md->capture_last & CAPLMASK;
1741 /* Internal change requires this for API compatibility. */
1742 if (cb.capture_last == 0) cb.capture_last = -1;
1743 cb.callout_data = md->callout_data;
1744 cb.mark = md->nomatch_mark;
1745 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1746 if (rrc < 0) RRETURN(rrc);
1747 }
1748 ecode += 2 + 2*LINK_SIZE;
1749 break;
1750
1751 /* Recursion either matches the current regex, or some subexpression. The
1752 offset data is the offset to the starting bracket from the start of the
1753 whole pattern. (This is so that it works from duplicated subpatterns.)
1754
1755 The state of the capturing groups is preserved over recursion, and
1756 re-instated afterwards. We don't know how many are started and not yet
1757 finished (offset_top records the completed total) so we just have to save
1758 all the potential data. There may be up to 65535 such values, which is too
1759 large to put on the stack, but using malloc for small numbers seems
1760 expensive. As a compromise, the stack is used when there are no more than
1761 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1762
1763 There are also other values that have to be saved. We use a chained
1764 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1765 for the original version of this logic. It has, however, been hacked around
1766 a lot, so he is not to blame for the current way it works. */
1767
1768 case OP_RECURSE:
1769 {
1770 recursion_info *ri;
1771 unsigned int recno;
1772
1773 callpat = md->start_code + GET(ecode, 1);
1774 recno = (callpat == md->start_code)? 0 :
1775 GET2(callpat, 1 + LINK_SIZE);
1776
1777 /* Check for repeating a recursion without advancing the subject pointer.
1778 This should catch convoluted mutual recursions. (Some simple cases are
1779 caught at compile time.) */
1780
1781 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1782 if (recno == ri->group_num && eptr == ri->subject_position)
1783 RRETURN(PCRE_ERROR_RECURSELOOP);
1784
1785 /* Add to "recursing stack" */
1786
1787 new_recursive.group_num = recno;
1788 new_recursive.saved_capture_last = md->capture_last;
1789 new_recursive.subject_position = eptr;
1790 new_recursive.prevrec = md->recursive;
1791 md->recursive = &new_recursive;
1792
1793 /* Where to continue from afterwards */
1794
1795 ecode += 1 + LINK_SIZE;
1796
1797 /* Now save the offset data */
1798
1799 new_recursive.saved_max = md->offset_end;
1800 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1801 new_recursive.offset_save = stacksave;
1802 else
1803 {
1804 new_recursive.offset_save =
1805 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1806 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1807 }
1808 memcpy(new_recursive.offset_save, md->offset_vector,
1809 new_recursive.saved_max * sizeof(int));
1810
1811 /* OK, now we can do the recursion. After processing each alternative,
1812 restore the offset data and the last captured value. If there were nested
1813 recursions, md->recursive might be changed, so reset it before looping.
1814 */
1815
1816 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1817 cbegroup = (*callpat >= OP_SBRA);
1818 do
1819 {
1820 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1821 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1822 md, eptrb, RM6);
1823 memcpy(md->offset_vector, new_recursive.offset_save,
1824 new_recursive.saved_max * sizeof(int));
1825 md->capture_last = new_recursive.saved_capture_last;
1826 md->recursive = new_recursive.prevrec;
1827 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1828 {
1829 DPRINTF(("Recursion matched\n"));
1830 if (new_recursive.offset_save != stacksave)
1831 (PUBL(free))(new_recursive.offset_save);
1832
1833 /* Set where we got to in the subject, and reset the start in case
1834 it was changed by \K. This *is* propagated back out of a recursion,
1835 for Perl compatibility. */
1836
1837 eptr = md->end_match_ptr;
1838 mstart = md->start_match_ptr;
1839 goto RECURSION_MATCHED; /* Exit loop; end processing */
1840 }
1841
1842 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1843 is treated as NOMATCH. */
1844
1845 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1846 rrc != MATCH_COMMIT)
1847 {
1848 DPRINTF(("Recursion gave error %d\n", rrc));
1849 if (new_recursive.offset_save != stacksave)
1850 (PUBL(free))(new_recursive.offset_save);
1851 RRETURN(rrc);
1852 }
1853
1854 md->recursive = &new_recursive;
1855 callpat += GET(callpat, 1);
1856 }
1857 while (*callpat == OP_ALT);
1858
1859 DPRINTF(("Recursion didn't match\n"));
1860 md->recursive = new_recursive.prevrec;
1861 if (new_recursive.offset_save != stacksave)
1862 (PUBL(free))(new_recursive.offset_save);
1863 RRETURN(MATCH_NOMATCH);
1864 }
1865
1866 RECURSION_MATCHED:
1867 break;
1868
1869 /* An alternation is the end of a branch; scan along to find the end of the
1870 bracketed group and go to there. */
1871
1872 case OP_ALT:
1873 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1874 break;
1875
1876 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1877 indicating that it may occur zero times. It may repeat infinitely, or not
1878 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1879 with fixed upper repeat limits are compiled as a number of copies, with the
1880 optional ones preceded by BRAZERO or BRAMINZERO. */
1881
1882 case OP_BRAZERO:
1883 next = ecode + 1;
1884 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886 do next += GET(next, 1); while (*next == OP_ALT);
1887 ecode = next + 1 + LINK_SIZE;
1888 break;
1889
1890 case OP_BRAMINZERO:
1891 next = ecode + 1;
1892 do next += GET(next, 1); while (*next == OP_ALT);
1893 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1895 ecode++;
1896 break;
1897
1898 case OP_SKIPZERO:
1899 next = ecode+1;
1900 do next += GET(next,1); while (*next == OP_ALT);
1901 ecode = next + 1 + LINK_SIZE;
1902 break;
1903
1904 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1905 here; just jump to the group, with allow_zero set TRUE. */
1906
1907 case OP_BRAPOSZERO:
1908 op = *(++ecode);
1909 allow_zero = TRUE;
1910 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1911 goto POSSESSIVE_NON_CAPTURE;
1912
1913 /* End of a group, repeated or non-repeating. */
1914
1915 case OP_KET:
1916 case OP_KETRMIN:
1917 case OP_KETRMAX:
1918 case OP_KETRPOS:
1919 prev = ecode - GET(ecode, 1);
1920
1921 /* If this was a group that remembered the subject start, in order to break
1922 infinite repeats of empty string matches, retrieve the subject start from
1923 the chain. Otherwise, set it NULL. */
1924
1925 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1926 {
1927 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1928 eptrb = eptrb->epb_prev; /* Backup to previous group */
1929 }
1930 else saved_eptr = NULL;
1931
1932 /* If we are at the end of an assertion group or a non-capturing atomic
1933 group, stop matching and return MATCH_MATCH, but record the current high
1934 water mark for use by positive assertions. We also need to record the match
1935 start in case it was changed by \K. */
1936
1937 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1938 *prev == OP_ONCE_NC)
1939 {
1940 md->end_match_ptr = eptr; /* For ONCE_NC */
1941 md->end_offset_top = offset_top;
1942 md->start_match_ptr = mstart;
1943 RRETURN(MATCH_MATCH); /* Sets md->mark */
1944 }
1945
1946 /* For capturing groups we have to check the group number back at the start
1947 and if necessary complete handling an extraction by setting the offsets and
1948 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1949 into group 0, so it won't be picked up here. Instead, we catch it when the
1950 OP_END is reached. Other recursion is handled here. We just have to record
1951 the current subject position and start match pointer and give a MATCH
1952 return. */
1953
1954 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1955 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1956 {
1957 number = GET2(prev, 1+LINK_SIZE);
1958 offset = number << 1;
1959
1960 #ifdef PCRE_DEBUG
1961 printf("end bracket %d", number);
1962 printf("\n");
1963 #endif
1964
1965 /* Handle a recursively called group. */
1966
1967 if (md->recursive != NULL && md->recursive->group_num == number)
1968 {
1969 md->end_match_ptr = eptr;
1970 md->start_match_ptr = mstart;
1971 RRETURN(MATCH_MATCH);
1972 }
1973
1974 /* Deal with capturing */
1975
1976 md->capture_last = (md->capture_last & OVFLMASK) | number;
1977 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1978 {
1979 /* If offset is greater than offset_top, it means that we are
1980 "skipping" a capturing group, and that group's offsets must be marked
1981 unset. In earlier versions of PCRE, all the offsets were unset at the
1982 start of matching, but this doesn't work because atomic groups and
1983 assertions can cause a value to be set that should later be unset.
1984 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1985 part of the atomic group, but this is not on the final matching path,
1986 so must be unset when 2 is set. (If there is no group 2, there is no
1987 problem, because offset_top will then be 2, indicating no capture.) */
1988
1989 if (offset > offset_top)
1990 {
1991 register int *iptr = md->offset_vector + offset_top;
1992 register int *iend = md->offset_vector + offset;
1993 while (iptr < iend) *iptr++ = -1;
1994 }
1995
1996 /* Now make the extraction */
1997
1998 md->offset_vector[offset] =
1999 md->offset_vector[md->offset_end - number];
2000 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2001 if (offset_top <= offset) offset_top = offset + 2;
2002 }
2003 }
2004
2005 /* For an ordinary non-repeating ket, just continue at this level. This
2006 also happens for a repeating ket if no characters were matched in the
2007 group. This is the forcible breaking of infinite loops as implemented in
2008 Perl 5.005. For a non-repeating atomic group that includes captures,
2009 establish a backup point by processing the rest of the pattern at a lower
2010 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2011 original OP_ONCE level, thereby bypassing intermediate backup points, but
2012 resetting any captures that happened along the way. */
2013
2014 if (*ecode == OP_KET || eptr == saved_eptr)
2015 {
2016 if (*prev == OP_ONCE)
2017 {
2018 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2019 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2020 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2021 RRETURN(MATCH_ONCE);
2022 }
2023 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2024 break;
2025 }
2026
2027 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2028 and return MATCH_KETRPOS. This makes it possible to do the repeats one
2029 at a time from the outer level, thus saving stack. */
2030
2031 if (*ecode == OP_KETRPOS)
2032 {
2033 md->end_match_ptr = eptr;
2034 md->end_offset_top = offset_top;
2035 RRETURN(MATCH_KETRPOS);
2036 }
2037
2038 /* The normal repeating kets try the rest of the pattern or restart from
2039 the preceding bracket, in the appropriate order. In the second case, we can
2040 use tail recursion to avoid using another stack frame, unless we have an
2041 atomic group or an unlimited repeat of a group that can match an empty
2042 string. */
2043
2044 if (*ecode == OP_KETRMIN)
2045 {
2046 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048 if (*prev == OP_ONCE)
2049 {
2050 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2052 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2053 RRETURN(MATCH_ONCE);
2054 }
2055 if (*prev >= OP_SBRA) /* Could match an empty string */
2056 {
2057 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2058 RRETURN(rrc);
2059 }
2060 ecode = prev;
2061 goto TAIL_RECURSE;
2062 }
2063 else /* OP_KETRMAX */
2064 {
2065 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2066 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2067 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2068 if (*prev == OP_ONCE)
2069 {
2070 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2071 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2072 md->once_target = prev;
2073 RRETURN(MATCH_ONCE);
2074 }
2075 ecode += 1 + LINK_SIZE;
2076 goto TAIL_RECURSE;
2077 }
2078 /* Control never gets here */
2079
2080 /* Not multiline mode: start of subject assertion, unless notbol. */
2081
2082 case OP_CIRC:
2083 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2084
2085 /* Start of subject assertion */
2086
2087 case OP_SOD:
2088 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2089 ecode++;
2090 break;
2091
2092 /* Multiline mode: start of subject unless notbol, or after any newline. */
2093
2094 case OP_CIRCM:
2095 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2096 if (eptr != md->start_subject &&
2097 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2098 RRETURN(MATCH_NOMATCH);
2099 ecode++;
2100 break;
2101
2102 /* Start of match assertion */
2103
2104 case OP_SOM:
2105 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2106 ecode++;
2107 break;
2108
2109 /* Reset the start of match point */
2110
2111 case OP_SET_SOM:
2112 mstart = eptr;
2113 ecode++;
2114 break;
2115
2116 /* Multiline mode: assert before any newline, or before end of subject
2117 unless noteol is set. */
2118
2119 case OP_DOLLM:
2120 if (eptr < md->end_subject)
2121 {
2122 if (!IS_NEWLINE(eptr))
2123 {
2124 if (md->partial != 0 &&
2125 eptr + 1 >= md->end_subject &&
2126 NLBLOCK->nltype == NLTYPE_FIXED &&
2127 NLBLOCK->nllen == 2 &&
2128 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2129 {
2130 md->hitend = TRUE;
2131 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2132 }
2133 RRETURN(MATCH_NOMATCH);
2134 }
2135 }
2136 else
2137 {
2138 if (md->noteol) RRETURN(MATCH_NOMATCH);
2139 SCHECK_PARTIAL();
2140 }
2141 ecode++;
2142 break;
2143
2144 /* Not multiline mode: assert before a terminating newline or before end of
2145 subject unless noteol is set. */
2146
2147 case OP_DOLL:
2148 if (md->noteol) RRETURN(MATCH_NOMATCH);
2149 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2150
2151 /* ... else fall through for endonly */
2152
2153 /* End of subject assertion (\z) */
2154
2155 case OP_EOD:
2156 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2157 SCHECK_PARTIAL();
2158 ecode++;
2159 break;
2160
2161 /* End of subject or ending \n assertion (\Z) */
2162
2163 case OP_EODN:
2164 ASSERT_NL_OR_EOS:
2165 if (eptr < md->end_subject &&
2166 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2167 {
2168 if (md->partial != 0 &&
2169 eptr + 1 >= md->end_subject &&
2170 NLBLOCK->nltype == NLTYPE_FIXED &&
2171 NLBLOCK->nllen == 2 &&
2172 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2173 {
2174 md->hitend = TRUE;
2175 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2176 }
2177 RRETURN(MATCH_NOMATCH);
2178 }
2179
2180 /* Either at end of string or \n before end. */
2181
2182 SCHECK_PARTIAL();
2183 ecode++;
2184 break;
2185
2186 /* Word boundary assertions */
2187
2188 case OP_NOT_WORD_BOUNDARY:
2189 case OP_WORD_BOUNDARY:
2190 {
2191
2192 /* Find out if the previous and current characters are "word" characters.
2193 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2194 be "non-word" characters. Remember the earliest consulted character for
2195 partial matching. */
2196
2197 #ifdef SUPPORT_UTF
2198 if (utf)
2199 {
2200 /* Get status of previous character */
2201
2202 if (eptr == md->start_subject) prev_is_word = FALSE; else
2203 {
2204 PCRE_PUCHAR lastptr = eptr - 1;
2205 BACKCHAR(lastptr);
2206 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2207 GETCHAR(c, lastptr);
2208 #ifdef SUPPORT_UCP
2209 if (md->use_ucp)
2210 {
2211 if (c == '_') prev_is_word = TRUE; else
2212 {
2213 int cat = UCD_CATEGORY(c);
2214 prev_is_word = (cat == ucp_L || cat == ucp_N);
2215 }
2216 }
2217 else
2218 #endif
2219 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2220 }
2221
2222 /* Get status of next character */
2223
2224 if (eptr >= md->end_subject)
2225 {
2226 SCHECK_PARTIAL();
2227 cur_is_word = FALSE;
2228 }
2229 else
2230 {
2231 GETCHAR(c, eptr);
2232 #ifdef SUPPORT_UCP
2233 if (md->use_ucp)
2234 {
2235 if (c == '_') cur_is_word = TRUE; else
2236 {
2237 int cat = UCD_CATEGORY(c);
2238 cur_is_word = (cat == ucp_L || cat == ucp_N);
2239 }
2240 }
2241 else
2242 #endif
2243 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2244 }
2245 }
2246 else
2247 #endif
2248
2249 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2250 consistency with the behaviour of \w we do use it in this case. */
2251
2252 {
2253 /* Get status of previous character */
2254
2255 if (eptr == md->start_subject) prev_is_word = FALSE; else
2256 {
2257 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2258 #ifdef SUPPORT_UCP
2259 if (md->use_ucp)
2260 {
2261 c = eptr[-1];
2262 if (c == '_') prev_is_word = TRUE; else
2263 {
2264 int cat = UCD_CATEGORY(c);
2265 prev_is_word = (cat == ucp_L || cat == ucp_N);
2266 }
2267 }
2268 else
2269 #endif
2270 prev_is_word = MAX_255(eptr[-1])
2271 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2272 }
2273
2274 /* Get status of next character */
2275
2276 if (eptr >= md->end_subject)
2277 {
2278 SCHECK_PARTIAL();
2279 cur_is_word = FALSE;
2280 }
2281 else
2282 #ifdef SUPPORT_UCP
2283 if (md->use_ucp)
2284 {
2285 c = *eptr;
2286 if (c == '_') cur_is_word = TRUE; else
2287 {
2288 int cat = UCD_CATEGORY(c);
2289 cur_is_word = (cat == ucp_L || cat == ucp_N);
2290 }
2291 }
2292 else
2293 #endif
2294 cur_is_word = MAX_255(*eptr)
2295 && ((md->ctypes[*eptr] & ctype_word) != 0);
2296 }
2297
2298 /* Now see if the situation is what we want */
2299
2300 if ((*ecode++ == OP_WORD_BOUNDARY)?
2301 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2302 RRETURN(MATCH_NOMATCH);
2303 }
2304 break;
2305
2306 /* Match any single character type except newline; have to take care with
2307 CRLF newlines and partial matching. */
2308
2309 case OP_ANY:
2310 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2311 if (md->partial != 0 &&
2312 eptr + 1 >= md->end_subject &&
2313 NLBLOCK->nltype == NLTYPE_FIXED &&
2314 NLBLOCK->nllen == 2 &&
2315 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2316 {
2317 md->hitend = TRUE;
2318 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2319 }
2320
2321 /* Fall through */
2322
2323 /* Match any single character whatsoever. */
2324
2325 case OP_ALLANY:
2326 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2327 { /* not be updated before SCHECK_PARTIAL. */
2328 SCHECK_PARTIAL();
2329 RRETURN(MATCH_NOMATCH);
2330 }
2331 eptr++;
2332 #ifdef SUPPORT_UTF
2333 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2334 #endif
2335 ecode++;
2336 break;
2337
2338 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2339 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2340
2341 case OP_ANYBYTE:
2342 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2343 { /* not be updated before SCHECK_PARTIAL. */
2344 SCHECK_PARTIAL();
2345 RRETURN(MATCH_NOMATCH);
2346 }
2347 eptr++;
2348 ecode++;
2349 break;
2350
2351 case OP_NOT_DIGIT:
2352 if (eptr >= md->end_subject)
2353 {
2354 SCHECK_PARTIAL();
2355 RRETURN(MATCH_NOMATCH);
2356 }
2357 GETCHARINCTEST(c, eptr);
2358 if (
2359 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2360 c < 256 &&
2361 #endif
2362 (md->ctypes[c] & ctype_digit) != 0
2363 )
2364 RRETURN(MATCH_NOMATCH);
2365 ecode++;
2366 break;
2367
2368 case OP_DIGIT:
2369 if (eptr >= md->end_subject)
2370 {
2371 SCHECK_PARTIAL();
2372 RRETURN(MATCH_NOMATCH);
2373 }
2374 GETCHARINCTEST(c, eptr);
2375 if (
2376 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2377 c > 255 ||
2378 #endif
2379 (md->ctypes[c] & ctype_digit) == 0
2380 )
2381 RRETURN(MATCH_NOMATCH);
2382 ecode++;
2383 break;
2384
2385 case OP_NOT_WHITESPACE:
2386 if (eptr >= md->end_subject)
2387 {
2388 SCHECK_PARTIAL();
2389 RRETURN(MATCH_NOMATCH);
2390 }
2391 GETCHARINCTEST(c, eptr);
2392 if (
2393 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2394 c < 256 &&
2395 #endif
2396 (md->ctypes[c] & ctype_space) != 0
2397 )
2398 RRETURN(MATCH_NOMATCH);
2399 ecode++;
2400 break;
2401
2402 case OP_WHITESPACE:
2403 if (eptr >= md->end_subject)
2404 {
2405 SCHECK_PARTIAL();
2406 RRETURN(MATCH_NOMATCH);
2407 }
2408 GETCHARINCTEST(c, eptr);
2409 if (
2410 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2411 c > 255 ||
2412 #endif
2413 (md->ctypes[c] & ctype_space) == 0
2414 )
2415 RRETURN(MATCH_NOMATCH);
2416 ecode++;
2417 break;
2418
2419 case OP_NOT_WORDCHAR:
2420 if (eptr >= md->end_subject)
2421 {
2422 SCHECK_PARTIAL();
2423 RRETURN(MATCH_NOMATCH);
2424 }
2425 GETCHARINCTEST(c, eptr);
2426 if (
2427 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2428 c < 256 &&
2429 #endif
2430 (md->ctypes[c] & ctype_word) != 0
2431 )
2432 RRETURN(MATCH_NOMATCH);
2433 ecode++;
2434 break;
2435
2436 case OP_WORDCHAR:
2437 if (eptr >= md->end_subject)
2438 {
2439 SCHECK_PARTIAL();
2440 RRETURN(MATCH_NOMATCH);
2441 }
2442 GETCHARINCTEST(c, eptr);
2443 if (
2444 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2445 c > 255 ||
2446 #endif
2447 (md->ctypes[c] & ctype_word) == 0
2448 )
2449 RRETURN(MATCH_NOMATCH);
2450 ecode++;
2451 break;
2452
2453 case OP_ANYNL:
2454 if (eptr >= md->end_subject)
2455 {
2456 SCHECK_PARTIAL();
2457 RRETURN(MATCH_NOMATCH);
2458 }
2459 GETCHARINCTEST(c, eptr);
2460 switch(c)
2461 {
2462 default: RRETURN(MATCH_NOMATCH);
2463
2464 case CHAR_CR:
2465 if (eptr >= md->end_subject)
2466 {
2467 SCHECK_PARTIAL();
2468 }
2469 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2470 break;
2471
2472 case CHAR_LF:
2473 break;
2474
2475 case CHAR_VT:
2476 case CHAR_FF:
2477 case CHAR_NEL:
2478 #ifndef EBCDIC
2479 case 0x2028:
2480 case 0x2029:
2481 #endif /* Not EBCDIC */
2482 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2483 break;
2484 }
2485 ecode++;
2486 break;
2487
2488 case OP_NOT_HSPACE:
2489 if (eptr >= md->end_subject)
2490 {
2491 SCHECK_PARTIAL();
2492 RRETURN(MATCH_NOMATCH);
2493 }
2494 GETCHARINCTEST(c, eptr);
2495 switch(c)
2496 {
2497 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2498 default: break;
2499 }
2500 ecode++;
2501 break;
2502
2503 case OP_HSPACE:
2504 if (eptr >= md->end_subject)
2505 {
2506 SCHECK_PARTIAL();
2507 RRETURN(MATCH_NOMATCH);
2508 }
2509 GETCHARINCTEST(c, eptr);
2510 switch(c)
2511 {
2512 HSPACE_CASES: break; /* Byte and multibyte cases */
2513 default: RRETURN(MATCH_NOMATCH);
2514 }
2515 ecode++;
2516 break;
2517
2518 case OP_NOT_VSPACE:
2519 if (eptr >= md->end_subject)
2520 {
2521 SCHECK_PARTIAL();
2522 RRETURN(MATCH_NOMATCH);
2523 }
2524 GETCHARINCTEST(c, eptr);
2525 switch(c)
2526 {
2527 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2528 default: break;
2529 }
2530 ecode++;
2531 break;
2532
2533 case OP_VSPACE:
2534 if (eptr >= md->end_subject)
2535 {
2536 SCHECK_PARTIAL();
2537 RRETURN(MATCH_NOMATCH);
2538 }
2539 GETCHARINCTEST(c, eptr);
2540 switch(c)
2541 {
2542 VSPACE_CASES: break;
2543 default: RRETURN(MATCH_NOMATCH);
2544 }
2545 ecode++;
2546 break;
2547
2548 #ifdef SUPPORT_UCP
2549 /* Check the next character by Unicode property. We will get here only
2550 if the support is in the binary; otherwise a compile-time error occurs. */
2551
2552 case OP_PROP:
2553 case OP_NOTPROP:
2554 if (eptr >= md->end_subject)
2555 {
2556 SCHECK_PARTIAL();
2557 RRETURN(MATCH_NOMATCH);
2558 }
2559 GETCHARINCTEST(c, eptr);
2560 {
2561 const pcre_uint32 *cp;
2562 const ucd_record *prop = GET_UCD(c);
2563
2564 switch(ecode[1])
2565 {
2566 case PT_ANY:
2567 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2568 break;
2569
2570 case PT_LAMP:
2571 if ((prop->chartype == ucp_Lu ||
2572 prop->chartype == ucp_Ll ||
2573 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2574 RRETURN(MATCH_NOMATCH);
2575 break;
2576
2577 case PT_GC:
2578 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2579 RRETURN(MATCH_NOMATCH);
2580 break;
2581
2582 case PT_PC:
2583 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2584 RRETURN(MATCH_NOMATCH);
2585 break;
2586
2587 case PT_SC:
2588 if ((ecode[2] != prop->script) == (op == OP_PROP))
2589 RRETURN(MATCH_NOMATCH);
2590 break;
2591
2592 /* These are specials */
2593
2594 case PT_ALNUM:
2595 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2596 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2597 RRETURN(MATCH_NOMATCH);
2598 break;
2599
2600 case PT_SPACE: /* Perl space */
2601 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2602 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2603 == (op == OP_NOTPROP))
2604 RRETURN(MATCH_NOMATCH);
2605 break;
2606
2607 case PT_PXSPACE: /* POSIX space */
2608 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2609 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2610 c == CHAR_FF || c == CHAR_CR)
2611 == (op == OP_NOTPROP))
2612 RRETURN(MATCH_NOMATCH);
2613 break;
2614
2615 case PT_WORD:
2616 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2617 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2618 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2619 RRETURN(MATCH_NOMATCH);
2620 break;
2621
2622 case PT_CLIST:
2623 cp = PRIV(ucd_caseless_sets) + ecode[2];
2624 for (;;)
2625 {
2626 if (c < *cp)
2627 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2628 if (c == *cp++)
2629 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2630 }
2631 break;
2632
2633 case PT_UCNC:
2634 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2635 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2636 c >= 0xe000) == (op == OP_NOTPROP))
2637 RRETURN(MATCH_NOMATCH);
2638 break;
2639
2640 /* This should never occur */
2641
2642 default:
2643 RRETURN(PCRE_ERROR_INTERNAL);
2644 }
2645
2646 ecode += 3;
2647 }
2648 break;
2649
2650 /* Match an extended Unicode sequence. We will get here only if the support
2651 is in the binary; otherwise a compile-time error occurs. */
2652
2653 case OP_EXTUNI:
2654 if (eptr >= md->end_subject)
2655 {
2656 SCHECK_PARTIAL();
2657 RRETURN(MATCH_NOMATCH);
2658 }
2659 else
2660 {
2661 int lgb, rgb;
2662 GETCHARINCTEST(c, eptr);
2663 lgb = UCD_GRAPHBREAK(c);
2664 while (eptr < md->end_subject)
2665 {
2666 int len = 1;
2667 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2668 rgb = UCD_GRAPHBREAK(c);
2669 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2670 lgb = rgb;
2671 eptr += len;
2672 }
2673 }
2674 CHECK_PARTIAL();
2675 ecode++;
2676 break;
2677 #endif /* SUPPORT_UCP */
2678
2679
2680 /* Match a back reference, possibly repeatedly. Look past the end of the
2681 item to see if there is repeat information following. The code is similar
2682 to that for character classes, but repeated for efficiency. Then obey
2683 similar code to character type repeats - written out again for speed.
2684 However, if the referenced string is the empty string, always treat
2685 it as matched, any number of times (otherwise there could be infinite
2686 loops). */
2687
2688 case OP_REF:
2689 case OP_REFI:
2690 caseless = op == OP_REFI;
2691 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2692 ecode += 1 + IMM2_SIZE;
2693
2694 /* If the reference is unset, there are two possibilities:
2695
2696 (a) In the default, Perl-compatible state, set the length negative;
2697 this ensures that every attempt at a match fails. We can't just fail
2698 here, because of the possibility of quantifiers with zero minima.
2699
2700 (b) If the JavaScript compatibility flag is set, set the length to zero
2701 so that the back reference matches an empty string.
2702
2703 Otherwise, set the length to the length of what was matched by the
2704 referenced subpattern. */
2705
2706 if (offset >= offset_top || md->offset_vector[offset] < 0)
2707 length = (md->jscript_compat)? 0 : -1;
2708 else
2709 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2710
2711 /* Set up for repetition, or handle the non-repeated case */
2712
2713 switch (*ecode)
2714 {
2715 case OP_CRSTAR:
2716 case OP_CRMINSTAR:
2717 case OP_CRPLUS:
2718 case OP_CRMINPLUS:
2719 case OP_CRQUERY:
2720 case OP_CRMINQUERY:
2721 c = *ecode++ - OP_CRSTAR;
2722 minimize = (c & 1) != 0;
2723 min = rep_min[c]; /* Pick up values from tables; */
2724 max = rep_max[c]; /* zero for max => infinity */
2725 if (max == 0) max = INT_MAX;
2726 break;
2727
2728 case OP_CRRANGE:
2729 case OP_CRMINRANGE:
2730 minimize = (*ecode == OP_CRMINRANGE);
2731 min = GET2(ecode, 1);
2732 max = GET2(ecode, 1 + IMM2_SIZE);
2733 if (max == 0) max = INT_MAX;
2734 ecode += 1 + 2 * IMM2_SIZE;
2735 break;
2736
2737 default: /* No repeat follows */
2738 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2739 {
2740 if (length == -2) eptr = md->end_subject; /* Partial match */
2741 CHECK_PARTIAL();
2742 RRETURN(MATCH_NOMATCH);
2743 }
2744 eptr += length;
2745 continue; /* With the main loop */
2746 }
2747
2748 /* Handle repeated back references. If the length of the reference is
2749 zero, just continue with the main loop. If the length is negative, it
2750 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2751 zero, we can continue at the same level without recursion. For any other
2752 minimum, carrying on will result in NOMATCH. */
2753
2754 if (length == 0) continue;
2755 if (length < 0 && min == 0) continue;
2756
2757 /* First, ensure the minimum number of matches are present. We get back
2758 the length of the reference string explicitly rather than passing the
2759 address of eptr, so that eptr can be a register variable. */
2760
2761 for (i = 1; i <= min; i++)
2762 {
2763 int slength;
2764 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2765 {
2766 if (slength == -2) eptr = md->end_subject; /* Partial match */
2767 CHECK_PARTIAL();
2768 RRETURN(MATCH_NOMATCH);
2769 }
2770 eptr += slength;
2771 }
2772
2773 /* If min = max, continue at the same level without recursion.
2774 They are not both allowed to be zero. */
2775
2776 if (min == max) continue;
2777
2778 /* If minimizing, keep trying and advancing the pointer */
2779
2780 if (minimize)
2781 {
2782 for (fi = min;; fi++)
2783 {
2784 int slength;
2785 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2787 if (fi >= max) RRETURN(MATCH_NOMATCH);
2788 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2789 {
2790 if (slength == -2) eptr = md->end_subject; /* Partial match */
2791 CHECK_PARTIAL();
2792 RRETURN(MATCH_NOMATCH);
2793 }
2794 eptr += slength;
2795 }
2796 /* Control never gets here */
2797 }
2798
2799 /* If maximizing, find the longest string and work backwards */
2800
2801 else
2802 {
2803 pp = eptr;
2804 for (i = min; i < max; i++)
2805 {
2806 int slength;
2807 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2808 {
2809 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2810 the soft partial matching case. */
2811
2812 if (slength == -2 && md->partial != 0 &&
2813 md->end_subject > md->start_used_ptr)
2814 {
2815 md->hitend = TRUE;
2816 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2817 }
2818 break;
2819 }
2820 eptr += slength;
2821 }
2822
2823 while (eptr >= pp)
2824 {
2825 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2827 eptr -= length;
2828 }
2829 RRETURN(MATCH_NOMATCH);
2830 }
2831 /* Control never gets here */
2832
2833 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2834 used when all the characters in the class have values in the range 0-255,
2835 and either the matching is caseful, or the characters are in the range
2836 0-127 when UTF-8 processing is enabled. The only difference between
2837 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2838 encountered.
2839
2840 First, look past the end of the item to see if there is repeat information
2841 following. Then obey similar code to character type repeats - written out
2842 again for speed. */
2843
2844 case OP_NCLASS:
2845 case OP_CLASS:
2846 {
2847 /* The data variable is saved across frames, so the byte map needs to
2848 be stored there. */
2849 #define BYTE_MAP ((pcre_uint8 *)data)
2850 data = ecode + 1; /* Save for matching */
2851 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2852
2853 switch (*ecode)
2854 {
2855 case OP_CRSTAR:
2856 case OP_CRMINSTAR:
2857 case OP_CRPLUS:
2858 case OP_CRMINPLUS:
2859 case OP_CRQUERY:
2860 case OP_CRMINQUERY:
2861 c = *ecode++ - OP_CRSTAR;
2862 minimize = (c & 1) != 0;
2863 min = rep_min[c]; /* Pick up values from tables; */
2864 max = rep_max[c]; /* zero for max => infinity */
2865 if (max == 0) max = INT_MAX;
2866 break;
2867
2868 case OP_CRRANGE:
2869 case OP_CRMINRANGE:
2870 minimize = (*ecode == OP_CRMINRANGE);
2871 min = GET2(ecode, 1);
2872 max = GET2(ecode, 1 + IMM2_SIZE);
2873 if (max == 0) max = INT_MAX;
2874 ecode += 1 + 2 * IMM2_SIZE;
2875 break;
2876
2877 default: /* No repeat follows */
2878 min = max = 1;
2879 break;
2880 }
2881
2882 /* First, ensure the minimum number of matches are present. */
2883
2884 #ifdef SUPPORT_UTF
2885 if (utf)
2886 {
2887 for (i = 1; i <= min; i++)
2888 {
2889 if (eptr >= md->end_subject)
2890 {
2891 SCHECK_PARTIAL();
2892 RRETURN(MATCH_NOMATCH);
2893 }
2894 GETCHARINC(c, eptr);
2895 if (c > 255)
2896 {
2897 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2898 }
2899 else
2900 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2901 }
2902 }
2903 else
2904 #endif
2905 /* Not UTF mode */
2906 {
2907 for (i = 1; i <= min; i++)
2908 {
2909 if (eptr >= md->end_subject)
2910 {
2911 SCHECK_PARTIAL();
2912 RRETURN(MATCH_NOMATCH);
2913 }
2914 c = *eptr++;
2915 #ifndef COMPILE_PCRE8
2916 if (c > 255)
2917 {
2918 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2919 }
2920 else
2921 #endif
2922 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2923 }
2924 }
2925
2926 /* If max == min we can continue with the main loop without the
2927 need to recurse. */
2928
2929 if (min == max) continue;
2930
2931 /* If minimizing, keep testing the rest of the expression and advancing
2932 the pointer while it matches the class. */
2933
2934 if (minimize)
2935 {
2936 #ifdef SUPPORT_UTF
2937 if (utf)
2938 {
2939 for (fi = min;; fi++)
2940 {
2941 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2942 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2943 if (fi >= max) RRETURN(MATCH_NOMATCH);
2944 if (eptr >= md->end_subject)
2945 {
2946 SCHECK_PARTIAL();
2947 RRETURN(MATCH_NOMATCH);
2948 }
2949 GETCHARINC(c, eptr);
2950 if (c > 255)
2951 {
2952 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2953 }
2954 else
2955 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2956 }
2957 }
2958 else
2959 #endif
2960 /* Not UTF mode */
2961 {
2962 for (fi = min;; fi++)
2963 {
2964 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2965 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2966 if (fi >= max) RRETURN(MATCH_NOMATCH);
2967 if (eptr >= md->end_subject)
2968 {
2969 SCHECK_PARTIAL();
2970 RRETURN(MATCH_NOMATCH);
2971 }
2972 c = *eptr++;
2973 #ifndef COMPILE_PCRE8
2974 if (c > 255)
2975 {
2976 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2977 }
2978 else
2979 #endif
2980 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2981 }
2982 }
2983 /* Control never gets here */
2984 }
2985
2986 /* If maximizing, find the longest possible run, then work backwards. */
2987
2988 else
2989 {
2990 pp = eptr;
2991
2992 #ifdef SUPPORT_UTF
2993 if (utf)
2994 {
2995 for (i = min; i < max; i++)
2996 {
2997 int len = 1;
2998 if (eptr >= md->end_subject)
2999 {
3000 SCHECK_PARTIAL();
3001 break;
3002 }
3003 GETCHARLEN(c, eptr, len);
3004 if (c > 255)
3005 {
3006 if (op == OP_CLASS) break;
3007 }
3008 else
3009 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3010 eptr += len;
3011 }
3012 for (;;)
3013 {
3014 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3015 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3016 if (eptr-- == pp) break; /* Stop if tried at original pos */
3017 BACKCHAR(eptr);
3018 }
3019 }
3020 else
3021 #endif
3022 /* Not UTF mode */
3023 {
3024 for (i = min; i < max; i++)
3025 {
3026 if (eptr >= md->end_subject)
3027 {
3028 SCHECK_PARTIAL();
3029 break;
3030 }
3031 c = *eptr;
3032 #ifndef COMPILE_PCRE8
3033 if (c > 255)
3034 {
3035 if (op == OP_CLASS) break;
3036 }
3037 else
3038 #endif
3039 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3040 eptr++;
3041 }
3042 while (eptr >= pp)
3043 {
3044 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3046 eptr--;
3047 }
3048 }
3049
3050 RRETURN(MATCH_NOMATCH);
3051 }
3052 #undef BYTE_MAP
3053 }
3054 /* Control never gets here */
3055
3056
3057 /* Match an extended character class. This opcode is encountered only
3058 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3059 mode, because Unicode properties are supported in non-UTF-8 mode. */
3060
3061 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3062 case OP_XCLASS:
3063 {
3064 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3065 ecode += GET(ecode, 1); /* Advance past the item */
3066
3067 switch (*ecode)
3068 {
3069 case OP_CRSTAR:
3070 case OP_CRMINSTAR:
3071 case OP_CRPLUS:
3072 case OP_CRMINPLUS:
3073 case OP_CRQUERY:
3074 case OP_CRMINQUERY:
3075 c = *ecode++ - OP_CRSTAR;
3076 minimize = (c & 1) != 0;
3077 min = rep_min[c]; /* Pick up values from tables; */
3078 max = rep_max[c]; /* zero for max => infinity */
3079 if (max == 0) max = INT_MAX;
3080 break;
3081
3082 case OP_CRRANGE:
3083 case OP_CRMINRANGE:
3084 minimize = (*ecode == OP_CRMINRANGE);
3085 min = GET2(ecode, 1);
3086 max = GET2(ecode, 1 + IMM2_SIZE);
3087 if (max == 0) max = INT_MAX;
3088 ecode += 1 + 2 * IMM2_SIZE;
3089 break;
3090
3091 default: /* No repeat follows */
3092 min = max = 1;
3093 break;
3094 }
3095
3096 /* First, ensure the minimum number of matches are present. */
3097
3098 for (i = 1; i <= min; i++)
3099 {
3100 if (eptr >= md->end_subject)
3101 {
3102 SCHECK_PARTIAL();
3103 RRETURN(MATCH_NOMATCH);
3104 }
3105 GETCHARINCTEST(c, eptr);
3106 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3107 }
3108
3109 /* If max == min we can continue with the main loop without the
3110 need to recurse. */
3111
3112 if (min == max) continue;
3113
3114 /* If minimizing, keep testing the rest of the expression and advancing
3115 the pointer while it matches the class. */
3116
3117 if (minimize)
3118 {
3119 for (fi = min;; fi++)
3120 {
3121 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3122 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3123 if (fi >= max) RRETURN(MATCH_NOMATCH);
3124 if (eptr >= md->end_subject)
3125 {
3126 SCHECK_PARTIAL();
3127 RRETURN(MATCH_NOMATCH);
3128 }
3129 GETCHARINCTEST(c, eptr);
3130 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3131 }
3132 /* Control never gets here */
3133 }
3134
3135 /* If maximizing, find the longest possible run, then work backwards. */
3136
3137 else
3138 {
3139 pp = eptr;
3140 for (i = min; i < max; i++)
3141 {
3142 int len = 1;
3143 if (eptr >= md->end_subject)
3144 {
3145 SCHECK_PARTIAL();
3146 break;
3147 }
3148 #ifdef SUPPORT_UTF
3149 GETCHARLENTEST(c, eptr, len);
3150 #else
3151 c = *eptr;
3152 #endif
3153 if (!PRIV(xclass)(c, data, utf)) break;
3154 eptr += len;
3155 }
3156 for(;;)
3157 {
3158 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3159 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3160 if (eptr-- == pp) break; /* Stop if tried at original pos */
3161 #ifdef SUPPORT_UTF
3162 if (utf) BACKCHAR(eptr);
3163 #endif
3164 }
3165 RRETURN(MATCH_NOMATCH);
3166 }
3167
3168 /* Control never gets here */
3169 }
3170 #endif /* End of XCLASS */
3171
3172 /* Match a single character, casefully */
3173
3174 case OP_CHAR:
3175 #ifdef SUPPORT_UTF
3176 if (utf)
3177 {
3178 length = 1;
3179 ecode++;
3180 GETCHARLEN(fc, ecode, length);
3181 if (length > md->end_subject - eptr)
3182 {
3183 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3184 RRETURN(MATCH_NOMATCH);
3185 }
3186 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3187 }
3188 else
3189 #endif
3190 /* Not UTF mode */
3191 {
3192 if (md->end_subject - eptr < 1)
3193 {
3194 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3195 RRETURN(MATCH_NOMATCH);
3196 }
3197 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3198 ecode += 2;
3199 }
3200 break;
3201
3202 /* Match a single character, caselessly. If we are at the end of the
3203 subject, give up immediately. */
3204
3205 case OP_CHARI:
3206 if (eptr >= md->end_subject)
3207 {
3208 SCHECK_PARTIAL();
3209 RRETURN(MATCH_NOMATCH);
3210 }
3211
3212 #ifdef SUPPORT_UTF
3213 if (utf)
3214 {
3215 length = 1;
3216 ecode++;
3217 GETCHARLEN(fc, ecode, length);
3218
3219 /* If the pattern character's value is < 128, we have only one byte, and
3220 we know that its other case must also be one byte long, so we can use the
3221 fast lookup table. We know that there is at least one byte left in the
3222 subject. */
3223
3224 if (fc < 128)
3225 {
3226 pcre_uint32 cc = RAWUCHAR(eptr);
3227 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3228 ecode++;
3229 eptr++;
3230 }
3231
3232 /* Otherwise we must pick up the subject character. Note that we cannot
3233 use the value of "length" to check for sufficient bytes left, because the
3234 other case of the character may have more or fewer bytes. */
3235
3236 else
3237 {
3238 pcre_uint32 dc;
3239 GETCHARINC(dc, eptr);
3240 ecode += length;
3241
3242 /* If we have Unicode property support, we can use it to test the other
3243 case of the character, if there is one. */
3244
3245 if (fc != dc)
3246 {
3247 #ifdef SUPPORT_UCP
3248 if (dc != UCD_OTHERCASE(fc))
3249 #endif
3250 RRETURN(MATCH_NOMATCH);
3251 }
3252 }
3253 }
3254 else
3255 #endif /* SUPPORT_UTF */
3256
3257 /* Not UTF mode */
3258 {
3259 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3260 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3261 eptr++;
3262 ecode += 2;
3263 }
3264 break;
3265
3266 /* Match a single character repeatedly. */
3267
3268 case OP_EXACT:
3269 case OP_EXACTI:
3270 min = max = GET2(ecode, 1);
3271 ecode += 1 + IMM2_SIZE;
3272 goto REPEATCHAR;
3273
3274 case OP_POSUPTO:
3275 case OP_POSUPTOI:
3276 possessive = TRUE;
3277 /* Fall through */
3278
3279 case OP_UPTO:
3280 case OP_UPTOI:
3281 case OP_MINUPTO:
3282 case OP_MINUPTOI:
3283 min = 0;
3284 max = GET2(ecode, 1);
3285 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3286 ecode += 1 + IMM2_SIZE;
3287 goto REPEATCHAR;
3288
3289 case OP_POSSTAR:
3290 case OP_POSSTARI:
3291 possessive = TRUE;
3292 min = 0;
3293 max = INT_MAX;
3294 ecode++;
3295 goto REPEATCHAR;
3296
3297 case OP_POSPLUS:
3298 case OP_POSPLUSI:
3299 possessive = TRUE;
3300 min = 1;
3301 max = INT_MAX;
3302 ecode++;
3303 goto REPEATCHAR;
3304
3305 case OP_POSQUERY:
3306 case OP_POSQUERYI:
3307 possessive = TRUE;
3308 min = 0;
3309 max = 1;
3310 ecode++;
3311 goto REPEATCHAR;
3312
3313 case OP_STAR:
3314 case OP_STARI:
3315 case OP_MINSTAR:
3316 case OP_MINSTARI:
3317 case OP_PLUS:
3318 case OP_PLUSI:
3319 case OP_MINPLUS:
3320 case OP_MINPLUSI:
3321 case OP_QUERY:
3322 case OP_QUERYI:
3323 case OP_MINQUERY:
3324 case OP_MINQUERYI:
3325 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3326 minimize = (c & 1) != 0;
3327 min = rep_min[c]; /* Pick up values from tables; */
3328 max = rep_max[c]; /* zero for max => infinity */
3329 if (max == 0) max = INT_MAX;
3330
3331 /* Common code for all repeated single-character matches. */
3332
3333 REPEATCHAR:
3334 #ifdef SUPPORT_UTF
3335 if (utf)
3336 {
3337 length = 1;
3338 charptr = ecode;
3339 GETCHARLEN(fc, ecode, length);
3340 ecode += length;
3341
3342 /* Handle multibyte character matching specially here. There is
3343 support for caseless matching if UCP support is present. */
3344
3345 if (length > 1)
3346 {
3347 #ifdef SUPPORT_UCP
3348 pcre_uint32 othercase;
3349 if (op >= OP_STARI && /* Caseless */
3350 (othercase = UCD_OTHERCASE(fc)) != fc)
3351 oclength = PRIV(ord2utf)(othercase, occhars);
3352 else oclength = 0;
3353 #endif /* SUPPORT_UCP */
3354
3355 for (i = 1; i <= min; i++)
3356 {
3357 if (eptr <= md->end_subject - length &&
3358 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3359 #ifdef SUPPORT_UCP
3360 else if (oclength > 0 &&
3361 eptr <= md->end_subject - oclength &&
3362 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3363 #endif /* SUPPORT_UCP */
3364 else
3365 {
3366 CHECK_PARTIAL();
3367 RRETURN(MATCH_NOMATCH);
3368 }
3369 }
3370
3371 if (min == max) continue;
3372
3373 if (minimize)
3374 {
3375 for (fi = min;; fi++)
3376 {
3377 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3378 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3379 if (fi >= max) RRETURN(MATCH_NOMATCH);
3380 if (eptr <= md->end_subject - length &&
3381 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3382 #ifdef SUPPORT_UCP
3383 else if (oclength > 0 &&
3384 eptr <= md->end_subject - oclength &&
3385 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3386 #endif /* SUPPORT_UCP */
3387 else
3388 {
3389 CHECK_PARTIAL();
3390 RRETURN(MATCH_NOMATCH);
3391 }
3392 }
3393 /* Control never gets here */
3394 }
3395
3396 else /* Maximize */
3397 {
3398 pp = eptr;
3399 for (i = min; i < max; i++)
3400 {
3401 if (eptr <= md->end_subject - length &&
3402 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3403 #ifdef SUPPORT_UCP
3404 else if (oclength > 0 &&
3405 eptr <= md->end_subject - oclength &&
3406 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3407 #endif /* SUPPORT_UCP */
3408 else
3409 {
3410 CHECK_PARTIAL();
3411 break;
3412 }
3413 }
3414
3415 if (possessive) continue;
3416
3417 for(;;)
3418 {
3419 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3420 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3421 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3422 #ifdef SUPPORT_UCP
3423 eptr--;
3424 BACKCHAR(eptr);
3425 #else /* without SUPPORT_UCP */
3426 eptr -= length;
3427 #endif /* SUPPORT_UCP */
3428 }
3429 }
3430 /* Control never gets here */
3431 }
3432
3433 /* If the length of a UTF-8 character is 1, we fall through here, and
3434 obey the code as for non-UTF-8 characters below, though in this case the
3435 value of fc will always be < 128. */
3436 }
3437 else
3438 #endif /* SUPPORT_UTF */
3439 /* When not in UTF-8 mode, load a single-byte character. */
3440 fc = *ecode++;
3441
3442 /* The value of fc at this point is always one character, though we may
3443 or may not be in UTF mode. The code is duplicated for the caseless and
3444 caseful cases, for speed, since matching characters is likely to be quite
3445 common. First, ensure the minimum number of matches are present. If min =
3446 max, continue at the same level without recursing. Otherwise, if
3447 minimizing, keep trying the rest of the expression and advancing one
3448 matching character if failing, up to the maximum. Alternatively, if
3449 maximizing, find the maximum number of characters and work backwards. */
3450
3451 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3452 max, (char *)eptr));
3453
3454 if (op >= OP_STARI) /* Caseless */
3455 {
3456 #ifdef COMPILE_PCRE8
3457 /* fc must be < 128 if UTF is enabled. */
3458 foc = md->fcc[fc];
3459 #else
3460 #ifdef SUPPORT_UTF
3461 #ifdef SUPPORT_UCP
3462 if (utf && fc > 127)
3463 foc = UCD_OTHERCASE(fc);
3464 #else
3465 if (utf && fc > 127)
3466 foc = fc;
3467 #endif /* SUPPORT_UCP */
3468 else
3469 #endif /* SUPPORT_UTF */
3470 foc = TABLE_GET(fc, md->fcc, fc);
3471 #endif /* COMPILE_PCRE8 */
3472
3473 for (i = 1; i <= min; i++)
3474 {
3475 pcre_uint32 cc; /* Faster than pcre_uchar */
3476 if (eptr >= md->end_subject)
3477 {
3478 SCHECK_PARTIAL();
3479 RRETURN(MATCH_NOMATCH);
3480 }
3481 cc = RAWUCHARTEST(eptr);
3482 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3483 eptr++;
3484 }
3485 if (min == max) continue;
3486 if (minimize)
3487 {
3488 for (fi = min;; fi++)
3489 {
3490 pcre_uint32 cc; /* Faster than pcre_uchar */
3491 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3492 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3493 if (fi >= max) RRETURN(MATCH_NOMATCH);
3494 if (eptr >= md->end_subject)
3495 {
3496 SCHECK_PARTIAL();
3497 RRETURN(MATCH_NOMATCH);
3498 }
3499 cc = RAWUCHARTEST(eptr);
3500 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3501 eptr++;
3502 }
3503 /* Control never gets here */
3504 }
3505 else /* Maximize */
3506 {
3507 pp = eptr;
3508 for (i = min; i < max; i++)
3509 {
3510 pcre_uint32 cc; /* Faster than pcre_uchar */
3511 if (eptr >= md->end_subject)
3512 {
3513 SCHECK_PARTIAL();
3514 break;
3515 }
3516 cc = RAWUCHARTEST(eptr);
3517 if (fc != cc && foc != cc) break;
3518 eptr++;
3519 }
3520
3521 if (possessive) continue;
3522
3523 while (eptr >= pp)
3524 {
3525 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3526 eptr--;
3527 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3528 }
3529 RRETURN(MATCH_NOMATCH);
3530 }
3531 /* Control never gets here */
3532 }
3533
3534 /* Caseful comparisons (includes all multi-byte characters) */
3535
3536 else
3537 {
3538 for (i = 1; i <= min; i++)
3539 {
3540 if (eptr >= md->end_subject)
3541 {
3542 SCHECK_PARTIAL();
3543 RRETURN(MATCH_NOMATCH);
3544 }
3545 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3546 }
3547
3548 if (min == max) continue;
3549
3550 if (minimize)
3551 {
3552 for (fi = min;; fi++)
3553 {
3554 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3556 if (fi >= max) RRETURN(MATCH_NOMATCH);
3557 if (eptr >= md->end_subject)
3558 {
3559 SCHECK_PARTIAL();
3560 RRETURN(MATCH_NOMATCH);
3561 }
3562 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3563 }
3564 /* Control never gets here */
3565 }
3566 else /* Maximize */
3567 {
3568 pp = eptr;
3569 for (i = min; i < max; i++)
3570 {
3571 if (eptr >= md->end_subject)
3572 {
3573 SCHECK_PARTIAL();
3574 break;
3575 }
3576 if (fc != RAWUCHARTEST(eptr)) break;
3577 eptr++;
3578 }
3579 if (possessive) continue;
3580
3581 while (eptr >= pp)
3582 {
3583 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3584 eptr--;
3585 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3586 }
3587 RRETURN(MATCH_NOMATCH);
3588 }
3589 }
3590 /* Control never gets here */
3591
3592 /* Match a negated single character. In UTF mode, both the pattern
3593 character and the subject character being checked can be multibyte. */
3594
3595 case OP_NOT:
3596 case OP_NOTI:
3597 if (eptr >= md->end_subject)
3598 {
3599 SCHECK_PARTIAL();
3600 RRETURN(MATCH_NOMATCH);
3601 }
3602 #ifdef SUPPORT_UTF
3603 if (utf)
3604 {
3605 register pcre_uint32 ch, och;
3606
3607 ecode++;
3608 GETCHARINC(ch, ecode);
3609 GETCHARINC(c, eptr);
3610
3611 if (op == OP_NOT)
3612 {
3613 if (ch == c) RRETURN(MATCH_NOMATCH);
3614 }
3615 else
3616 {
3617 #ifdef SUPPORT_UCP
3618 if (ch > 127)
3619 och = UCD_OTHERCASE(ch);
3620 #else
3621 if (ch > 127)
3622 och = ch;
3623 #endif /* SUPPORT_UCP */
3624 else
3625 och = TABLE_GET(ch, md->fcc, ch);
3626 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3627 }
3628 }
3629 else
3630 #endif
3631 {
3632 register pcre_uint32 ch = ecode[1];
3633 c = *eptr++;
3634 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3635 RRETURN(MATCH_NOMATCH);
3636 ecode += 2;
3637 }
3638 break;
3639
3640 /* Match a negated single one-byte character repeatedly. This is almost a
3641 repeat of the code for a repeated single character, but I haven't found a
3642 nice way of commoning these up that doesn't require a test of the
3643 positive/negative option for each character match. Maybe that wouldn't add
3644 very much to the time taken, but character matching *is* what this is all
3645 about... */
3646
3647 case OP_NOTEXACT:
3648 case OP_NOTEXACTI:
3649 min = max = GET2(ecode, 1);
3650 ecode += 1 + IMM2_SIZE;
3651 goto REPEATNOTCHAR;
3652
3653 case OP_NOTUPTO:
3654 case OP_NOTUPTOI:
3655 case OP_NOTMINUPTO:
3656 case OP_NOTMINUPTOI:
3657 min = 0;
3658 max = GET2(ecode, 1);
3659 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3660 ecode += 1 + IMM2_SIZE;
3661 goto REPEATNOTCHAR;
3662
3663 case OP_NOTPOSSTAR:
3664 case OP_NOTPOSSTARI:
3665 possessive = TRUE;
3666 min = 0;
3667 max = INT_MAX;
3668 ecode++;
3669 goto REPEATNOTCHAR;
3670
3671 case OP_NOTPOSPLUS:
3672 case OP_NOTPOSPLUSI:
3673 possessive = TRUE;
3674 min = 1;
3675 max = INT_MAX;
3676 ecode++;
3677 goto REPEATNOTCHAR;
3678
3679 case OP_NOTPOSQUERY:
3680 case OP_NOTPOSQUERYI:
3681 possessive = TRUE;
3682 min = 0;
3683 max = 1;
3684 ecode++;
3685 goto REPEATNOTCHAR;
3686
3687 case OP_NOTPOSUPTO:
3688 case OP_NOTPOSUPTOI:
3689 possessive = TRUE;
3690 min = 0;
3691 max = GET2(ecode, 1);
3692 ecode += 1 + IMM2_SIZE;
3693 goto REPEATNOTCHAR;
3694
3695 case OP_NOTSTAR:
3696 case OP_NOTSTARI:
3697 case OP_NOTMINSTAR:
3698 case OP_NOTMINSTARI:
3699 case OP_NOTPLUS:
3700 case OP_NOTPLUSI:
3701 case OP_NOTMINPLUS:
3702 case OP_NOTMINPLUSI:
3703 case OP_NOTQUERY:
3704 case OP_NOTQUERYI:
3705 case OP_NOTMINQUERY:
3706 case OP_NOTMINQUERYI:
3707 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3708 minimize = (c & 1) != 0;
3709 min = rep_min[c]; /* Pick up values from tables; */
3710 max = rep_max[c]; /* zero for max => infinity */
3711 if (max == 0) max = INT_MAX;
3712
3713 /* Common code for all repeated single-byte matches. */
3714
3715 REPEATNOTCHAR:
3716 GETCHARINCTEST(fc, ecode);
3717
3718 /* The code is duplicated for the caseless and caseful cases, for speed,
3719 since matching characters is likely to be quite common. First, ensure the
3720 minimum number of matches are present. If min = max, continue at the same
3721 level without recursing. Otherwise, if minimizing, keep trying the rest of
3722 the expression and advancing one matching character if failing, up to the
3723 maximum. Alternatively, if maximizing, find the maximum number of
3724 characters and work backwards. */
3725
3726 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3727 max, (char *)eptr));
3728
3729 if (op >= OP_NOTSTARI) /* Caseless */
3730 {
3731 #ifdef SUPPORT_UTF
3732 #ifdef SUPPORT_UCP
3733 if (utf && fc > 127)
3734 foc = UCD_OTHERCASE(fc);
3735 #else
3736 if (utf && fc > 127)
3737 foc = fc;
3738 #endif /* SUPPORT_UCP */
3739 else
3740 #endif /* SUPPORT_UTF */
3741 foc = TABLE_GET(fc, md->fcc, fc);
3742
3743 #ifdef SUPPORT_UTF
3744 if (utf)
3745 {
3746 register pcre_uint32 d;
3747 for (i = 1; i <= min; i++)
3748 {
3749 if (eptr >= md->end_subject)
3750 {
3751 SCHECK_PARTIAL();
3752 RRETURN(MATCH_NOMATCH);
3753 }
3754 GETCHARINC(d, eptr);
3755 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3756 }
3757 }
3758 else
3759 #endif
3760 /* Not UTF mode */
3761 {
3762 for (i = 1; i <= min; i++)
3763 {
3764 if (eptr >= md->end_subject)
3765 {
3766 SCHECK_PARTIAL();
3767 RRETURN(MATCH_NOMATCH);
3768 }
3769 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3770 eptr++;
3771 }
3772 }
3773
3774 if (min == max) continue;
3775
3776 if (minimize)
3777 {
3778 #ifdef SUPPORT_UTF
3779 if (utf)
3780 {
3781 register pcre_uint32 d;
3782 for (fi = min;; fi++)
3783 {
3784 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3786 if (fi >= max) RRETURN(MATCH_NOMATCH);
3787 if (eptr >= md->end_subject)
3788 {
3789 SCHECK_PARTIAL();
3790 RRETURN(MATCH_NOMATCH);
3791 }
3792 GETCHARINC(d, eptr);
3793 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3794 }
3795 }
3796 else
3797 #endif
3798 /* Not UTF mode */
3799 {
3800 for (fi = min;; fi++)
3801 {
3802 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3804 if (fi >= max) RRETURN(MATCH_NOMATCH);
3805 if (eptr >= md->end_subject)
3806 {
3807 SCHECK_PARTIAL();
3808 RRETURN(MATCH_NOMATCH);
3809 }
3810 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3811 eptr++;
3812 }
3813 }
3814 /* Control never gets here */
3815 }
3816
3817 /* Maximize case */
3818
3819 else
3820 {
3821 pp = eptr;
3822
3823 #ifdef SUPPORT_UTF
3824 if (utf)
3825 {
3826 register pcre_uint32 d;
3827 for (i = min; i < max; i++)
3828 {
3829 int len = 1;
3830 if (eptr >= md->end_subject)
3831 {
3832 SCHECK_PARTIAL();
3833 break;
3834 }
3835 GETCHARLEN(d, eptr, len);
3836 if (fc == d || (unsigned int)foc == d) break;
3837 eptr += len;
3838 }
3839 if (possessive) continue;
3840 for(;;)
3841 {
3842 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3843 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3844 if (eptr-- == pp) break; /* Stop if tried at original pos */
3845 BACKCHAR(eptr);
3846 }
3847 }
3848 else
3849 #endif
3850 /* Not UTF mode */
3851 {
3852 for (i = min; i < max; i++)
3853 {
3854 if (eptr >= md->end_subject)
3855 {
3856 SCHECK_PARTIAL();
3857 break;
3858 }
3859 if (fc == *eptr || foc == *eptr) break;
3860 eptr++;
3861 }
3862 if (possessive) continue;
3863 while (eptr >= pp)
3864 {
3865 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3867 eptr--;
3868 }
3869 }
3870
3871 RRETURN(MATCH_NOMATCH);
3872 }
3873 /* Control never gets here */
3874 }
3875
3876 /* Caseful comparisons */
3877
3878 else
3879 {
3880 #ifdef SUPPORT_UTF
3881 if (utf)
3882 {
3883 register pcre_uint32 d;
3884 for (i = 1; i <= min; i++)
3885 {
3886 if (eptr >= md->end_subject)
3887 {
3888 SCHECK_PARTIAL();
3889 RRETURN(MATCH_NOMATCH);
3890 }
3891 GETCHARINC(d, eptr);
3892 if (fc == d) RRETURN(MATCH_NOMATCH);
3893 }
3894 }
3895 else
3896 #endif
3897 /* Not UTF mode */
3898 {
3899 for (i = 1; i <= min; i++)
3900 {
3901 if (eptr >= md->end_subject)
3902 {
3903 SCHECK_PARTIAL();
3904 RRETURN(MATCH_NOMATCH);
3905 }
3906 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3907 }
3908 }
3909
3910 if (min == max) continue;
3911
3912 if (minimize)
3913 {
3914 #ifdef SUPPORT_UTF
3915 if (utf)
3916 {
3917 register pcre_uint32 d;
3918 for (fi = min;; fi++)
3919 {
3920 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3922 if (fi >= max) RRETURN(MATCH_NOMATCH);
3923 if (eptr >= md->end_subject)
3924 {
3925 SCHECK_PARTIAL();
3926 RRETURN(MATCH_NOMATCH);
3927 }
3928 GETCHARINC(d, eptr);
3929 if (fc == d) RRETURN(MATCH_NOMATCH);
3930 }
3931 }
3932 else
3933 #endif
3934 /* Not UTF mode */
3935 {
3936 for (fi = min;; fi++)
3937 {
3938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3940 if (fi >= max) RRETURN(MATCH_NOMATCH);
3941 if (eptr >= md->end_subject)
3942 {
3943 SCHECK_PARTIAL();
3944 RRETURN(MATCH_NOMATCH);
3945 }
3946 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3947 }
3948 }
3949 /* Control never gets here */
3950 }
3951
3952 /* Maximize case */
3953
3954 else
3955 {
3956 pp = eptr;
3957
3958 #ifdef SUPPORT_UTF
3959 if (utf)
3960 {
3961 register pcre_uint32 d;
3962 for (i = min; i < max; i++)
3963 {
3964 int len = 1;
3965 if (eptr >= md->end_subject)
3966 {
3967 SCHECK_PARTIAL();
3968 break;
3969 }
3970 GETCHARLEN(d, eptr, len);
3971 if (fc == d) break;
3972 eptr += len;
3973 }
3974 if (possessive) continue;
3975 for(;;)
3976 {
3977 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3979 if (eptr-- == pp) break; /* Stop if tried at original pos */
3980 BACKCHAR(eptr);
3981 }
3982 }
3983 else
3984 #endif
3985 /* Not UTF mode */
3986 {
3987 for (i = min; i < max; i++)
3988 {
3989 if (eptr >= md->end_subject)
3990 {
3991 SCHECK_PARTIAL();
3992 break;
3993 }
3994 if (fc == *eptr) break;
3995 eptr++;
3996 }
3997 if (possessive) continue;
3998 while (eptr >= pp)
3999 {
4000 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4002 eptr--;
4003 }
4004 }
4005
4006 RRETURN(MATCH_NOMATCH);
4007 }
4008 }
4009 /* Control never gets here */
4010
4011 /* Match a single character type repeatedly; several different opcodes
4012 share code. This is very similar to the code for single characters, but we
4013 repeat it in the interests of efficiency. */
4014
4015 case OP_TYPEEXACT:
4016 min = max = GET2(ecode, 1);
4017 minimize = TRUE;
4018 ecode += 1 + IMM2_SIZE;
4019 goto REPEATTYPE;
4020
4021 case OP_TYPEUPTO:
4022 case OP_TYPEMINUPTO:
4023 min = 0;
4024 max = GET2(ecode, 1);
4025 minimize = *ecode == OP_TYPEMINUPTO;
4026 ecode += 1 + IMM2_SIZE;
4027 goto REPEATTYPE;
4028
4029 case OP_TYPEPOSSTAR:
4030 possessive = TRUE;
4031 min = 0;
4032 max = INT_MAX;
4033 ecode++;
4034 goto REPEATTYPE;
4035
4036 case OP_TYPEPOSPLUS:
4037 possessive = TRUE;
4038 min = 1;
4039 max = INT_MAX;
4040 ecode++;
4041 goto REPEATTYPE;
4042
4043 case OP_TYPEPOSQUERY:
4044 possessive = TRUE;
4045 min = 0;
4046 max = 1;
4047 ecode++;
4048 goto REPEATTYPE;
4049
4050 case OP_TYPEPOSUPTO:
4051 possessive = TRUE;
4052 min = 0;
4053 max = GET2(ecode, 1);
4054 ecode += 1 + IMM2_SIZE;
4055 goto REPEATTYPE;
4056
4057 case OP_TYPESTAR:
4058 case OP_TYPEMINSTAR:
4059 case OP_TYPEPLUS:
4060 case OP_TYPEMINPLUS:
4061 case OP_TYPEQUERY:
4062 case OP_TYPEMINQUERY:
4063 c = *ecode++ - OP_TYPESTAR;
4064 minimize = (c & 1) != 0;
4065 min = rep_min[c]; /* Pick up values from tables; */
4066 max = rep_max[c]; /* zero for max => infinity */
4067 if (max == 0) max = INT_MAX;
4068
4069 /* Common code for all repeated single character type matches. Note that
4070 in UTF-8 mode, '.' matches a character of any length, but for the other
4071 character types, the valid characters are all one-byte long. */
4072
4073 REPEATTYPE:
4074 ctype = *ecode++; /* Code for the character type */
4075
4076 #ifdef SUPPORT_UCP
4077 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4078 {
4079 prop_fail_result = ctype == OP_NOTPROP;
4080 prop_type = *ecode++;
4081 prop_value = *ecode++;
4082 }
4083 else prop_type = -1;
4084 #endif
4085
4086 /* First, ensure the minimum number of matches are present. Use inline
4087 code for maximizing the speed, and do the type test once at the start
4088 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4089 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4090 and single-bytes. */
4091
4092 if (min > 0)
4093 {
4094 #ifdef SUPPORT_UCP
4095 if (prop_type >= 0)
4096 {
4097 switch(prop_type)
4098 {
4099 case PT_ANY:
4100 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4101 for (i = 1; i <= min; i++)
4102 {
4103 if (eptr >= md->end_subject)
4104 {
4105 SCHECK_PARTIAL();
4106 RRETURN(MATCH_NOMATCH);
4107 }
4108 GETCHARINCTEST(c, eptr);
4109 }
4110 break;
4111
4112 case PT_LAMP:
4113 for (i = 1; i <= min; i++)
4114 {
4115 int chartype;
4116 if (eptr >= md->end_subject)
4117 {
4118 SCHECK_PARTIAL();
4119 RRETURN(MATCH_NOMATCH);
4120 }
4121 GETCHARINCTEST(c, eptr);
4122 chartype = UCD_CHARTYPE(c);
4123 if ((chartype == ucp_Lu ||
4124 chartype == ucp_Ll ||
4125 chartype == ucp_Lt) == prop_fail_result)
4126 RRETURN(MATCH_NOMATCH);
4127 }
4128 break;
4129
4130 case PT_GC:
4131 for (i = 1; i <= min; i++)
4132 {
4133 if (eptr >= md->end_subject)
4134 {
4135 SCHECK_PARTIAL();
4136 RRETURN(MATCH_NOMATCH);
4137 }
4138 GETCHARINCTEST(c, eptr);
4139 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4140 RRETURN(MATCH_NOMATCH);
4141 }
4142 break;
4143
4144 case PT_PC:
4145 for (i = 1; i <= min; i++)
4146 {
4147 if (eptr >= md->end_subject)
4148 {
4149 SCHECK_PARTIAL();
4150 RRETURN(MATCH_NOMATCH);
4151 }
4152 GETCHARINCTEST(c, eptr);
4153 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4154 RRETURN(MATCH_NOMATCH);
4155 }
4156 break;
4157
4158 case PT_SC:
4159 for (i = 1; i <= min; i++)
4160 {
4161 if (eptr >= md->end_subject)
4162 {
4163 SCHECK_PARTIAL();
4164 RRETURN(MATCH_NOMATCH);
4165 }
4166 GETCHARINCTEST(c, eptr);
4167 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 break;
4171
4172 case PT_ALNUM:
4173 for (i = 1; i <= min; i++)
4174 {
4175 int category;
4176 if (eptr >= md->end_subject)
4177 {
4178 SCHECK_PARTIAL();
4179 RRETURN(MATCH_NOMATCH);
4180 }
4181 GETCHARINCTEST(c, eptr);
4182 category = UCD_CATEGORY(c);
4183 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4184 RRETURN(MATCH_NOMATCH);
4185 }
4186 break;
4187
4188 case PT_SPACE: /* Perl space */
4189 for (i = 1; i <= min; i++)
4190 {
4191 if (eptr >= md->end_subject)
4192 {
4193 SCHECK_PARTIAL();
4194 RRETURN(MATCH_NOMATCH);
4195 }
4196 GETCHARINCTEST(c, eptr);
4197 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4198 c == CHAR_FF || c == CHAR_CR)
4199 == prop_fail_result)
4200 RRETURN(MATCH_NOMATCH);
4201 }
4202 break;
4203
4204 case PT_PXSPACE: /* POSIX space */
4205 for (i = 1; i <= min; i++)
4206 {
4207 if (eptr >= md->end_subject)
4208 {
4209 SCHECK_PARTIAL();
4210 RRETURN(MATCH_NOMATCH);
4211 }
4212 GETCHARINCTEST(c, eptr);
4213 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4214 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4215 == prop_fail_result)
4216 RRETURN(MATCH_NOMATCH);
4217 }
4218 break;
4219
4220 case PT_WORD:
4221 for (i = 1; i <= min; i++)
4222 {
4223 int category;
4224 if (eptr >= md->end_subject)
4225 {
4226 SCHECK_PARTIAL();
4227 RRETURN(MATCH_NOMATCH);
4228 }
4229 GETCHARINCTEST(c, eptr);
4230 category = UCD_CATEGORY(c);
4231 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4232 == prop_fail_result)
4233 RRETURN(MATCH_NOMATCH);
4234 }
4235 break;
4236
4237 case PT_CLIST:
4238 for (i = 1; i <= min; i++)
4239 {
4240 const pcre_uint32 *cp;
4241 if (eptr >= md->end_subject)
4242 {
4243 SCHECK_PARTIAL();
4244 RRETURN(MATCH_NOMATCH);
4245 }
4246 GETCHARINCTEST(c, eptr);
4247 cp = PRIV(ucd_caseless_sets) + prop_value;
4248 for (;;)
4249 {
4250 if (c < *cp)
4251 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4252 if (c == *cp++)
4253 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4254 }
4255 }
4256 break;
4257
4258 case PT_UCNC:
4259 for (i = 1; i <= min; i++)
4260 {
4261 if (eptr >= md->end_subject)
4262 {
4263 SCHECK_PARTIAL();
4264 RRETURN(MATCH_NOMATCH);
4265 }
4266 GETCHARINCTEST(c, eptr);
4267 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4268 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4269 c >= 0xe000) == prop_fail_result)
4270 RRETURN(MATCH_NOMATCH);
4271 }
4272 break;
4273
4274 /* This should not occur */
4275
4276 default:
4277 RRETURN(PCRE_ERROR_INTERNAL);
4278 }
4279 }
4280
4281 /* Match extended Unicode sequences. We will get here only if the
4282 support is in the binary; otherwise a compile-time error occurs. */
4283
4284 else if (ctype == OP_EXTUNI)
4285 {
4286 for (i = 1; i <= min; i++)
4287 {
4288 if (eptr >= md->end_subject)
4289 {
4290 SCHECK_PARTIAL();
4291 RRETURN(MATCH_NOMATCH);
4292 }
4293 else
4294 {
4295 int lgb, rgb;
4296 GETCHARINCTEST(c, eptr);
4297 lgb = UCD_GRAPHBREAK(c);
4298 while (eptr < md->end_subject)
4299 {
4300 int len = 1;
4301 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4302 rgb = UCD_GRAPHBREAK(c);
4303 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4304 lgb = rgb;
4305 eptr += len;
4306 }
4307 }
4308 CHECK_PARTIAL();
4309 }
4310 }
4311
4312 else
4313 #endif /* SUPPORT_UCP */
4314
4315 /* Handle all other cases when the coding is UTF-8 */
4316
4317 #ifdef SUPPORT_UTF
4318 if (utf) switch(ctype)
4319 {
4320 case OP_ANY:
4321 for (i = 1; i <= min; i++)
4322 {
4323 if (eptr >= md->end_subject)
4324 {
4325 SCHECK_PARTIAL();
4326 RRETURN(MATCH_NOMATCH);
4327 }
4328 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4329 if (md->partial != 0 &&
4330 eptr + 1 >= md->end_subject &&
4331 NLBLOCK->nltype == NLTYPE_FIXED &&
4332 NLBLOCK->nllen == 2 &&
4333 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4334 {
4335 md->hitend = TRUE;
4336 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4337 }
4338 eptr++;
4339 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4340 }
4341 break;
4342
4343 case OP_ALLANY:
4344 for (i = 1; i <= min; i++)
4345 {
4346 if (eptr >= md->end_subject)
4347 {
4348 SCHECK_PARTIAL();
4349 RRETURN(MATCH_NOMATCH);
4350 }
4351 eptr++;
4352 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4353 }
4354 break;
4355
4356 case OP_ANYBYTE:
4357 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4358 eptr += min;
4359 break;
4360
4361 case OP_ANYNL:
4362 for (i = 1; i <= min; i++)
4363 {
4364 if (eptr >= md->end_subject)
4365 {
4366 SCHECK_PARTIAL();
4367 RRETURN(MATCH_NOMATCH);
4368 }
4369 GETCHARINC(c, eptr);
4370 switch(c)
4371 {
4372 default: RRETURN(MATCH_NOMATCH);
4373
4374 case CHAR_CR:
4375 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4376 break;
4377
4378 case CHAR_LF:
4379 break;
4380
4381 case CHAR_VT:
4382 case CHAR_FF:
4383 case CHAR_NEL:
4384 #ifndef EBCDIC
4385 case 0x2028:
4386 case 0x2029:
4387 #endif /* Not EBCDIC */
4388 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4389 break;
4390 }
4391 }
4392 break;
4393
4394 case OP_NOT_HSPACE:
4395 for (i = 1; i <= min; i++)
4396 {
4397 if (eptr >= md->end_subject)
4398 {
4399 SCHECK_PARTIAL();
4400 RRETURN(MATCH_NOMATCH);
4401 }
4402 GETCHARINC(c, eptr);
4403 switch(c)
4404 {
4405 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4406 default: break;
4407 }
4408 }
4409 break;
4410
4411 case OP_HSPACE:
4412 for (i = 1; i <= min; i++)
4413 {
4414 if (eptr >= md->end_subject)
4415 {
4416 SCHECK_PARTIAL();
4417 RRETURN(MATCH_NOMATCH);
4418 }
4419 GETCHARINC(c, eptr);
4420 switch(c)
4421 {
4422 HSPACE_CASES: break; /* Byte and multibyte cases */
4423 default: RRETURN(MATCH_NOMATCH);
4424 }
4425 }
4426 break;
4427
4428 case OP_NOT_VSPACE:
4429 for (i = 1; i <= min; i++)
4430 {
4431 if (eptr >= md->end_subject)
4432 {
4433 SCHECK_PARTIAL();
4434 RRETURN(MATCH_NOMATCH);
4435 }
4436 GETCHARINC(c, eptr);
4437 switch(c)
4438 {
4439 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4440 default: break;
4441 }
4442 }
4443 break;
4444
4445 case OP_VSPACE:
4446 for (i = 1; i <= min; i++)
4447 {
4448 if (eptr >= md->end_subject)
4449 {
4450 SCHECK_PARTIAL();
4451 RRETURN(MATCH_NOMATCH);
4452 }
4453 GETCHARINC(c, eptr);
4454 switch(c)
4455 {
4456 VSPACE_CASES: break;
4457 default: RRETURN(MATCH_NOMATCH);
4458 }
4459 }
4460 break;
4461
4462 case OP_NOT_DIGIT:
4463 for (i = 1; i <= min; i++)
4464 {
4465 if (eptr >= md->end_subject)
4466 {
4467 SCHECK_PARTIAL();
4468 RRETURN(MATCH_NOMATCH);
4469 }
4470 GETCHARINC(c, eptr);
4471 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4472 RRETURN(MATCH_NOMATCH);
4473 }
4474 break;
4475
4476 case OP_DIGIT:
4477 for (i = 1; i <= min; i++)
4478 {
4479 pcre_uint32 cc;
4480 if (eptr >= md->end_subject)
4481 {
4482 SCHECK_PARTIAL();
4483 RRETURN(MATCH_NOMATCH);
4484 }
4485 cc = RAWUCHAR(eptr);
4486 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4487 RRETURN(MATCH_NOMATCH);
4488 eptr++;
4489 /* No need to skip more bytes - we know it's a 1-byte character */
4490 }
4491 break;
4492
4493 case OP_NOT_WHITESPACE:
4494 for (i = 1; i <= min; i++)
4495 {
4496 pcre_uint32 cc;
4497 if (eptr >= md->end_subject)
4498 {
4499 SCHECK_PARTIAL();
4500 RRETURN(MATCH_NOMATCH);
4501 }
4502 cc = RAWUCHAR(eptr);
4503 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4504 RRETURN(MATCH_NOMATCH);
4505 eptr++;
4506 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4507 }
4508 break;
4509
4510 case OP_WHITESPACE:
4511 for (i = 1; i <= min; i++)
4512 {
4513 pcre_uint32 cc;
4514 if (eptr >= md->end_subject)
4515 {
4516 SCHECK_PARTIAL();
4517 RRETURN(MATCH_NOMATCH);
4518 }
4519 cc = RAWUCHAR(eptr);
4520 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4521 RRETURN(MATCH_NOMATCH);
4522 eptr++;
4523 /* No need to skip more bytes - we know it's a 1-byte character */
4524 }
4525 break;
4526
4527 case OP_NOT_WORDCHAR:
4528 for (i = 1; i <= min; i++)
4529 {
4530 pcre_uint32 cc;
4531 if (eptr >= md->end_subject)
4532 {
4533 SCHECK_PARTIAL();
4534 RRETURN(MATCH_NOMATCH);
4535 }
4536 cc = RAWUCHAR(eptr);
4537 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4538 RRETURN(MATCH_NOMATCH);
4539 eptr++;
4540 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4541 }
4542 break;
4543
4544 case OP_WORDCHAR:
4545 for (i = 1; i <= min; i++)
4546 {
4547 pcre_uint32 cc;
4548 if (eptr >= md->end_subject)
4549 {
4550 SCHECK_PARTIAL();
4551 RRETURN(MATCH_NOMATCH);
4552 }
4553 cc = RAWUCHAR(eptr);
4554 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4555 RRETURN(MATCH_NOMATCH);
4556 eptr++;
4557 /* No need to skip more bytes - we know it's a 1-byte character */
4558 }
4559 break;
4560
4561 default:
4562 RRETURN(PCRE_ERROR_INTERNAL);
4563 } /* End switch(ctype) */
4564
4565 else
4566 #endif /* SUPPORT_UTF */
4567
4568 /* Code for the non-UTF-8 case for minimum matching of operators other
4569 than OP_PROP and OP_NOTPROP. */
4570
4571 switch(ctype)
4572 {
4573 case OP_ANY:
4574 for (i = 1; i <= min; i++)
4575 {
4576 if (eptr >= md->end_subject)
4577 {
4578 SCHECK_PARTIAL();
4579 RRETURN(MATCH_NOMATCH);
4580 }
4581 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4582 if (md->partial != 0 &&
4583 eptr + 1 >= md->end_subject &&
4584 NLBLOCK->nltype == NLTYPE_FIXED &&
4585 NLBLOCK->nllen == 2 &&
4586 *eptr == NLBLOCK->nl[0])
4587 {
4588 md->hitend = TRUE;
4589 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4590 }
4591 eptr++;
4592 }
4593 break;
4594
4595 case OP_ALLANY:
4596 if (eptr > md->end_subject - min)
4597 {
4598 SCHECK_PARTIAL();
4599 RRETURN(MATCH_NOMATCH);
4600 }
4601 eptr += min;
4602 break;
4603
4604 case OP_ANYBYTE:
4605 if (eptr > md->end_subject - min)
4606 {
4607 SCHECK_PARTIAL();
4608 RRETURN(MATCH_NOMATCH);
4609 }
4610 eptr += min;
4611 break;
4612
4613 case OP_ANYNL:
4614 for (i = 1; i <= min; i++)
4615 {
4616 if (eptr >= md->end_subject)
4617 {
4618 SCHECK_PARTIAL();
4619 RRETURN(MATCH_NOMATCH);
4620 }
4621 switch(*eptr++)
4622 {
4623 default: RRETURN(MATCH_NOMATCH);
4624
4625 case CHAR_CR:
4626 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4627 break;
4628
4629 case CHAR_LF:
4630 break;
4631
4632 case CHAR_VT:
4633 case CHAR_FF:
4634 case CHAR_NEL:
4635 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4636 case 0x2028:
4637 case 0x2029:
4638 #endif
4639 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4640 break;
4641 }
4642 }
4643 break;
4644
4645 case OP_NOT_HSPACE:
4646 for (i = 1; i <= min; i++)
4647 {
4648 if (eptr >= md->end_subject)
4649 {
4650 SCHECK_PARTIAL();
4651 RRETURN(MATCH_NOMATCH);
4652 }
4653 switch(*eptr++)
4654 {
4655 default: break;
4656 HSPACE_BYTE_CASES:
4657 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4658 HSPACE_MULTIBYTE_CASES:
4659 #endif
4660 RRETURN(MATCH_NOMATCH);
4661 }
4662 }
4663 break;
4664
4665 case OP_HSPACE:
4666 for (i = 1; i <= min; i++)
4667 {
4668 if (eptr >= md->end_subject)
4669 {
4670 SCHECK_PARTIAL();
4671 RRETURN(MATCH_NOMATCH);
4672 }
4673 switch(*eptr++)
4674 {
4675 default: RRETURN(MATCH_NOMATCH);
4676 HSPACE_BYTE_CASES:
4677 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4678 HSPACE_MULTIBYTE_CASES:
4679 #endif
4680 break;
4681 }
4682 }
4683 break;
4684
4685 case OP_NOT_VSPACE:
4686 for (i = 1; i <= min; i++)
4687 {
4688 if (eptr >= md->end_subject)
4689 {
4690 SCHECK_PARTIAL();
4691 RRETURN(MATCH_NOMATCH);
4692 }
4693 switch(*eptr++)
4694 {
4695 VSPACE_BYTE_CASES:
4696 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4697 VSPACE_MULTIBYTE_CASES:
4698 #endif
4699 RRETURN(MATCH_NOMATCH);
4700 default: break;
4701 }
4702 }
4703 break;
4704
4705 case OP_VSPACE:
4706 for (i = 1; i <= min; i++)
4707 {
4708 if (eptr >= md->end_subject)
4709 {
4710 SCHECK_PARTIAL();
4711 RRETURN(MATCH_NOMATCH);
4712 }
4713 switch(*eptr++)
4714 {
4715 default: RRETURN(MATCH_NOMATCH);
4716 VSPACE_BYTE_CASES:
4717 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4718 VSPACE_MULTIBYTE_CASES:
4719 #endif
4720 break;
4721 }
4722 }
4723 break;
4724
4725 case OP_NOT_DIGIT:
4726 for (i = 1; i <= min; i++)
4727 {
4728 if (eptr >= md->end_subject)
4729 {
4730 SCHECK_PARTIAL();
4731 RRETURN(MATCH_NOMATCH);
4732 }
4733 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4734 RRETURN(MATCH_NOMATCH);
4735 eptr++;
4736 }
4737 break;
4738
4739 case OP_DIGIT:
4740 for (i = 1; i <= min; i++)
4741 {
4742 if (eptr >= md->end_subject)
4743 {
4744 SCHECK_PARTIAL();
4745 RRETURN(MATCH_NOMATCH);
4746 }
4747 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4748 RRETURN(MATCH_NOMATCH);
4749 eptr++;
4750 }
4751 break;
4752
4753 case OP_NOT_WHITESPACE:
4754 for (i = 1; i <= min; i++)
4755 {
4756 if (eptr >= md->end_subject)
4757 {
4758 SCHECK_PARTIAL();
4759 RRETURN(MATCH_NOMATCH);
4760 }
4761 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4762 RRETURN(MATCH_NOMATCH);
4763 eptr++;
4764 }
4765 break;
4766
4767 case OP_WHITESPACE:
4768 for (i = 1; i <= min; i++)
4769 {
4770 if (eptr >= md->end_subject)
4771 {
4772 SCHECK_PARTIAL();
4773 RRETURN(MATCH_NOMATCH);
4774 }
4775 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4776 RRETURN(MATCH_NOMATCH);
4777 eptr++;
4778 }
4779 break;
4780
4781 case OP_NOT_WORDCHAR:
4782 for (i = 1; i <= min; i++)
4783 {
4784 if (eptr >= md->end_subject)
4785 {
4786 SCHECK_PARTIAL();
4787 RRETURN(MATCH_NOMATCH);
4788 }
4789 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4790 RRETURN(MATCH_NOMATCH);
4791 eptr++;
4792 }
4793 break;
4794
4795 case OP_WORDCHAR:
4796 for (i = 1; i <= min; i++)
4797 {
4798 if (eptr >= md->end_subject)
4799 {
4800 SCHECK_PARTIAL();
4801 RRETURN(MATCH_NOMATCH);
4802 }
4803 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4804 RRETURN(MATCH_NOMATCH);
4805 eptr++;
4806 }
4807 break;
4808
4809 default:
4810 RRETURN(PCRE_ERROR_INTERNAL);
4811 }
4812 }
4813
4814 /* If min = max, continue at the same level without recursing */
4815
4816 if (min == max) continue;
4817
4818 /* If minimizing, we have to test the rest of the pattern before each
4819 subsequent match. Again, separate the UTF-8 case for speed, and also
4820 separate the UCP cases. */
4821
4822 if (minimize)
4823 {
4824 #ifdef SUPPORT_UCP
4825 if (prop_type >= 0)
4826 {
4827 switch(prop_type)
4828 {
4829 case PT_ANY:
4830 for (fi = min;; fi++)
4831 {
4832 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4834 if (fi >= max) RRETURN(MATCH_NOMATCH);
4835 if (eptr >= md->end_subject)
4836 {
4837 SCHECK_PARTIAL();
4838 RRETURN(MATCH_NOMATCH);
4839 }
4840 GETCHARINCTEST(c, eptr);
4841 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4842 }
4843 /* Control never gets here */
4844
4845 case PT_LAMP:
4846 for (fi = min;; fi++)
4847 {
4848 int chartype;
4849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4851 if (fi >= max) RRETURN(MATCH_NOMATCH);
4852 if (eptr >= md->end_subject)
4853 {
4854 SCHECK_PARTIAL();
4855 RRETURN(MATCH_NOMATCH);
4856 }
4857 GETCHARINCTEST(c, eptr);
4858 chartype = UCD_CHARTYPE(c);
4859 if ((chartype == ucp_Lu ||
4860 chartype == ucp_Ll ||
4861 chartype == ucp_Lt) == prop_fail_result)
4862 RRETURN(MATCH_NOMATCH);
4863 }
4864 /* Control never gets here */
4865
4866 case PT_GC:
4867 for (fi = min;; fi++)
4868 {
4869 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4870 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4871 if (fi >= max) RRETURN(MATCH_NOMATCH);
4872 if (eptr >= md->end_subject)
4873 {
4874 SCHECK_PARTIAL();
4875 RRETURN(MATCH_NOMATCH);
4876 }
4877 GETCHARINCTEST(c, eptr);
4878 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4879 RRETURN(MATCH_NOMATCH);
4880 }
4881 /* Control never gets here */
4882
4883 case PT_PC:
4884 for (fi = min;; fi++)
4885 {
4886 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4887 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4888 if (fi >= max) RRETURN(MATCH_NOMATCH);
4889 if (eptr >= md->end_subject)
4890 {
4891 SCHECK_PARTIAL();
4892 RRETURN(MATCH_NOMATCH);
4893 }
4894 GETCHARINCTEST(c, eptr);
4895 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4896 RRETURN(MATCH_NOMATCH);
4897 }
4898 /* Control never gets here */
4899
4900 case PT_SC:
4901 for (fi = min;; fi++)
4902 {
4903 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4905 if (fi >= max) RRETURN(MATCH_NOMATCH);
4906 if (eptr >= md->end_subject)
4907 {
4908 SCHECK_PARTIAL();
4909 RRETURN(MATCH_NOMATCH);
4910 }
4911 GETCHARINCTEST(c, eptr);
4912 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4913 RRETURN(MATCH_NOMATCH);
4914 }
4915 /* Control never gets here */
4916
4917 case PT_ALNUM:
4918 for (fi = min;; fi++)
4919 {
4920 int category;
4921 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4922 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4923 if (fi >= max) RRETURN(MATCH_NOMATCH);
4924 if (eptr >= md->end_subject)
4925 {
4926 SCHECK_PARTIAL();
4927 RRETURN(MATCH_NOMATCH);
4928 }
4929 GETCHARINCTEST(c, eptr);
4930 category = UCD_CATEGORY(c);
4931 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4932 RRETURN(MATCH_NOMATCH);
4933 }
4934 /* Control never gets here */
4935
4936 case PT_SPACE: /* Perl space */
4937 for (fi = min;; fi++)
4938 {
4939 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4941 if (fi >= max) RRETURN(MATCH_NOMATCH);
4942 if (eptr >= md->end_subject)
4943 {
4944 SCHECK_PARTIAL();
4945 RRETURN(MATCH_NOMATCH);
4946 }
4947 GETCHARINCTEST(c, eptr);
4948 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4949 c == CHAR_FF || c == CHAR_CR)
4950 == prop_fail_result)
4951 RRETURN(MATCH_NOMATCH);
4952 }
4953 /* Control never gets here */
4954
4955 case PT_PXSPACE: /* POSIX space */
4956 for (fi = min;; fi++)
4957 {
4958 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4959 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4960 if (fi >= max) RRETURN(MATCH_NOMATCH);
4961 if (eptr >= md->end_subject)
4962 {
4963 SCHECK_PARTIAL();
4964 RRETURN(MATCH_NOMATCH);
4965 }
4966 GETCHARINCTEST(c, eptr);
4967 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4968 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4969 == prop_fail_result)
4970 RRETURN(MATCH_NOMATCH);
4971 }
4972 /* Control never gets here */
4973
4974 case PT_WORD:
4975 for (fi = min;; fi++)
4976 {
4977 int category;
4978 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4979 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4980 if (fi >= max) RRETURN(MATCH_NOMATCH);
4981 if (eptr >= md->end_subject)
4982 {
4983 SCHECK_PARTIAL();
4984 RRETURN(MATCH_NOMATCH);
4985 }
4986 GETCHARINCTEST(c, eptr);
4987 category = UCD_CATEGORY(c);
4988 if ((category == ucp_L ||
4989 category == ucp_N ||
4990 c == CHAR_UNDERSCORE)
4991 == prop_fail_result)
4992 RRETURN(MATCH_NOMATCH);
4993 }
4994 /* Control never gets here */
4995
4996 case PT_CLIST:
4997 for (fi = min;; fi++)
4998 {
4999 const pcre_uint32 *cp;
5000 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5002 if (fi >= max) RRETURN(MATCH_NOMATCH);
5003 if (eptr >= md->end_subject)
5004 {
5005 SCHECK_PARTIAL();
5006 RRETURN(MATCH_NOMATCH);
5007 }
5008 GETCHARINCTEST(c, eptr);
5009 cp = PRIV(ucd_caseless_sets) + prop_value;
5010 for (;;)
5011 {
5012 if (c < *cp)
5013 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5014 if (c == *cp++)
5015 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5016 }
5017 }
5018 /* Control never gets here */
5019
5020 case PT_UCNC:
5021 for (fi = min;; fi++)
5022 {
5023 RMATCH(eptr, ecode, offset_top, md, eptrb, RM68);
5024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5025 if (fi >= max) RRETURN(MATCH_NOMATCH);
5026 if (eptr >= md->end_subject)
5027 {
5028 SCHECK_PARTIAL();
5029 RRETURN(MATCH_NOMATCH);
5030 }
5031 GETCHARINCTEST(c, eptr);
5032 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5033 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5034 c >= 0xe000) == prop_fail_result)
5035 RRETURN(MATCH_NOMATCH);
5036 }
5037 /* Control never gets here */
5038
5039 /* This should never occur */
5040 default:
5041 RRETURN(PCRE_ERROR_INTERNAL);
5042 }
5043 }
5044
5045 /* Match extended Unicode sequences. We will get here only if the
5046 support is in the binary; otherwise a compile-time error occurs. */
5047
5048 else if (ctype == OP_EXTUNI)
5049 {
5050 for (fi = min;; fi++)
5051 {
5052 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5054 if (fi >= max) RRETURN(MATCH_NOMATCH);
5055 if (eptr >= md->end_subject)
5056 {
5057 SCHECK_PARTIAL();
5058 RRETURN(MATCH_NOMATCH);
5059 }
5060 else
5061 {
5062 int lgb, rgb;
5063 GETCHARINCTEST(c, eptr);
5064 lgb = UCD_GRAPHBREAK(c);
5065 while (eptr < md->end_subject)
5066 {
5067 int len = 1;
5068 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5069 rgb = UCD_GRAPHBREAK(c);
5070 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5071 lgb = rgb;
5072 eptr += len;
5073 }
5074 }
5075 CHECK_PARTIAL();
5076 }
5077 }
5078 else
5079 #endif /* SUPPORT_UCP */
5080
5081 #ifdef SUPPORT_UTF
5082 if (utf)
5083 {
5084 for (fi = min;; fi++)
5085 {
5086 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5088 if (fi >= max) RRETURN(MATCH_NOMATCH);
5089 if (eptr >= md->end_subject)
5090 {
5091 SCHECK_PARTIAL();
5092 RRETURN(MATCH_NOMATCH);
5093 }
5094 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5095 RRETURN(MATCH_NOMATCH);
5096 GETCHARINC(c, eptr);
5097 switch(ctype)
5098 {
5099 case OP_ANY: /* This is the non-NL case */
5100 if (md->partial != 0 && /* Take care with CRLF partial */
5101 eptr >= md->end_subject &&
5102 NLBLOCK->nltype == NLTYPE_FIXED &&
5103 NLBLOCK->nllen == 2 &&
5104 c == NLBLOCK->nl[0])
5105 {
5106 md->hitend = TRUE;
5107 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5108 }
5109 break;
5110
5111 case OP_ALLANY:
5112 case OP_ANYBYTE:
5113 break;
5114
5115 case OP_ANYNL:
5116 switch(c)
5117 {
5118 default: RRETURN(MATCH_NOMATCH);
5119 case CHAR_CR:
5120 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5121 break;
5122
5123 case CHAR_LF:
5124 break;
5125
5126 case CHAR_VT:
5127 case CHAR_FF:
5128 case CHAR_NEL:
5129 #ifndef EBCDIC
5130 case 0x2028:
5131 case 0x2029:
5132 #endif /* Not EBCDIC */
5133 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5134 break;
5135 }
5136 break;
5137
5138 case OP_NOT_HSPACE:
5139 switch(c)
5140 {
5141 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5142 default: break;
5143 }
5144 break;
5145
5146 case OP_HSPACE:
5147 switch(c)
5148 {
5149 HSPACE_CASES: break;
5150 default: RRETURN(MATCH_NOMATCH);
5151 }
5152 break;
5153
5154 case OP_NOT_VSPACE:
5155 switch(c)
5156 {
5157 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5158 default: break;
5159 }
5160 break;
5161
5162 case OP_VSPACE:
5163 switch(c)
5164 {
5165 VSPACE_CASES: break;
5166 default: RRETURN(MATCH_NOMATCH);
5167 }
5168 break;
5169
5170 case OP_NOT_DIGIT:
5171 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5172 RRETURN(MATCH_NOMATCH);
5173 break;
5174
5175 case OP_DIGIT:
5176 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5177 RRETURN(MATCH_NOMATCH);
5178 break;
5179
5180 case OP_NOT_WHITESPACE:
5181 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5182 RRETURN(MATCH_NOMATCH);
5183 break;
5184
5185 case OP_WHITESPACE:
5186 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5187 RRETURN(MATCH_NOMATCH);
5188 break;
5189
5190 case OP_NOT_WORDCHAR:
5191 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5192 RRETURN(MATCH_NOMATCH);
5193 break;
5194
5195 case OP_WORDCHAR:
5196 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5197 RRETURN(MATCH_NOMATCH);
5198 break;
5199
5200 default:
5201 RRETURN(PCRE_ERROR_INTERNAL);
5202 }
5203 }
5204 }
5205 else
5206 #endif
5207 /* Not UTF mode */
5208 {
5209 for (fi = min;; fi++)
5210 {
5211 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5213 if (fi >= max) RRETURN(MATCH_NOMATCH);
5214 if (eptr >= md->end_subject)
5215 {
5216 SCHECK_PARTIAL();
5217 RRETURN(MATCH_NOMATCH);
5218 }
5219 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5220 RRETURN(MATCH_NOMATCH);
5221 c = *eptr++;
5222 switch(ctype)
5223 {
5224 case OP_ANY: /* This is the non-NL case */
5225 if (md->partial != 0 && /* Take care with CRLF partial */
5226 eptr >= md->end_subject &&
5227 NLBLOCK->nltype == NLTYPE_FIXED &&
5228 NLBLOCK->nllen == 2 &&
5229 c == NLBLOCK->nl[0])
5230 {
5231 md->hitend = TRUE;
5232 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5233 }
5234 break;
5235
5236 case OP_ALLANY:
5237 case OP_ANYBYTE:
5238 break;
5239
5240 case OP_ANYNL:
5241 switch(c)
5242 {
5243 default: RRETURN(MATCH_NOMATCH);
5244 case CHAR_CR:
5245 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5246 break;
5247
5248 case CHAR_LF:
5249 break;
5250
5251 case CHAR_VT:
5252 case CHAR_FF:
5253 case CHAR_NEL:
5254 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5255 case 0x2028:
5256 case 0x2029:
5257 #endif
5258 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5259 break;
5260 }
5261 break;
5262
5263 case OP_NOT_HSPACE:
5264 switch(c)
5265 {
5266 default: break;
5267 HSPACE_BYTE_CASES:
5268 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5269 HSPACE_MULTIBYTE_CASES:
5270 #endif
5271 RRETURN(MATCH_NOMATCH);
5272 }
5273 break;
5274
5275 case OP_HSPACE:
5276 switch(c)
5277 {
5278 default: RRETURN(MATCH_NOMATCH);
5279 HSPACE_BYTE_CASES:
5280 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5281 HSPACE_MULTIBYTE_CASES:
5282 #endif
5283 break;
5284 }
5285 break;
5286
5287 case OP_NOT_VSPACE:
5288 switch(c)
5289 {
5290 default: break;
5291 VSPACE_BYTE_CASES:
5292 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5293 VSPACE_MULTIBYTE_CASES:
5294 #endif
5295 RRETURN(MATCH_NOMATCH);
5296 }
5297 break;
5298
5299 case OP_VSPACE:
5300 switch(c)
5301 {
5302 default: RRETURN(MATCH_NOMATCH);
5303 VSPACE_BYTE_CASES:
5304 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5305 VSPACE_MULTIBYTE_CASES:
5306 #endif
5307 break;
5308 }
5309 break;
5310
5311 case OP_NOT_DIGIT:
5312 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5313 break;
5314
5315 case OP_DIGIT:
5316 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5317 break;
5318
5319 case OP_NOT_WHITESPACE:
5320 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5321 break;
5322
5323 case OP_WHITESPACE:
5324 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5325 break;
5326
5327 case OP_NOT_WORDCHAR:
5328 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5329 break;
5330
5331 case OP_WORDCHAR:
5332 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5333 break;
5334
5335 default:
5336 RRETURN(PCRE_ERROR_INTERNAL);
5337 }
5338 }
5339 }
5340 /* Control never gets here */
5341 }
5342
5343 /* If maximizing, it is worth using inline code for speed, doing the type
5344 test once at the start (i.e. keep it out of the loop). Again, keep the
5345 UTF-8 and UCP stuff separate. */
5346
5347 else
5348 {
5349 pp = eptr; /* Remember where we started */
5350
5351 #ifdef SUPPORT_UCP
5352 if (prop_type >= 0)
5353 {
5354 switch(prop_type)
5355 {
5356 case PT_ANY:
5357 for (i = min; i < max; i++)
5358 {
5359 int len = 1;
5360 if (eptr >= md->end_subject)
5361 {
5362 SCHECK_PARTIAL();
5363 break;
5364 }
5365 GETCHARLENTEST(c, eptr, len);
5366 if (prop_fail_result) break;
5367 eptr+= len;
5368 }
5369 break;
5370
5371 case PT_LAMP:
5372 for (i = min; i < max; i++)
5373 {
5374 int chartype;
5375 int len = 1;
5376 if (eptr >= md->end_subject)
5377 {
5378 SCHECK_PARTIAL();
5379 break;
5380 }
5381 GETCHARLENTEST(c, eptr, len);
5382 chartype = UCD_CHARTYPE(c);
5383 if ((chartype == ucp_Lu ||
5384 chartype == ucp_Ll ||
5385 chartype == ucp_Lt) == prop_fail_result)
5386 break;
5387 eptr+= len;
5388 }
5389 break;
5390
5391 case PT_GC:
5392 for (i = min; i < max; i++)
5393 {
5394 int len = 1;
5395 if (eptr >= md->end_subject)
5396 {
5397 SCHECK_PARTIAL();
5398 break;
5399 }
5400 GETCHARLENTEST(c, eptr, len);
5401 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5402 eptr+= len;
5403 }
5404 break;
5405
5406 case PT_PC:
5407 for (i = min; i < max; i++)
5408 {
5409 int len = 1;
5410 if (eptr >= md->end_subject)
5411 {
5412 SCHECK_PARTIAL();
5413 break;
5414 }
5415 GETCHARLENTEST(c, eptr, len);
5416 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5417 eptr+= len;
5418 }
5419 break;
5420
5421 case PT_SC:
5422 for (i = min; i < max; i++)
5423 {
5424 int len = 1;
5425 if (eptr >= md->end_subject)
5426 {
5427 SCHECK_PARTIAL();
5428 break;
5429 }
5430 GETCHARLENTEST(c, eptr, len);
5431 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5432 eptr+= len;
5433 }
5434 break;
5435
5436 case PT_ALNUM:
5437 for (i = min; i < max; i++)
5438 {
5439 int category;
5440 int len = 1;
5441 if (eptr >= md->end_subject)
5442 {
5443 SCHECK_PARTIAL();
5444 break;
5445 }
5446 GETCHARLENTEST(c, eptr, len);
5447 category = UCD_CATEGORY(c);
5448 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5449 break;
5450 eptr+= len;
5451 }
5452 break;
5453
5454 case PT_SPACE: /* Perl space */
5455 for (i = min; i < max; i++)
5456 {
5457 int len = 1;
5458 if (eptr >= md->end_subject)
5459 {
5460 SCHECK_PARTIAL();
5461 break;
5462 }
5463 GETCHARLENTEST(c, eptr, len);
5464 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5465 c == CHAR_FF || c == CHAR_CR)
5466 == prop_fail_result)
5467 break;
5468 eptr+= len;
5469 }
5470 break;
5471
5472 case PT_PXSPACE: /* POSIX space */
5473 for (i = min; i < max; i++)
5474 {
5475 int len = 1;
5476 if (eptr >= md->end_subject)
5477 {
5478 SCHECK_PARTIAL();
5479 break;
5480 }
5481 GETCHARLENTEST(c, eptr, len);
5482 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5483 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5484 == prop_fail_result)
5485 break;
5486 eptr+= len;
5487 }
5488 break;
5489
5490 case PT_WORD:
5491 for (i = min; i < max; i++)
5492 {
5493 int category;
5494 int len = 1;
5495 if (eptr >= md->end_subject)
5496 {
5497 SCHECK_PARTIAL();
5498 break;
5499 }
5500 GETCHARLENTEST(c, eptr, len);
5501 category = UCD_CATEGORY(c);
5502 if ((category == ucp_L || category == ucp_N ||
5503 c == CHAR_UNDERSCORE) == prop_fail_result)
5504 break;
5505 eptr+= len;
5506 }
5507 break;
5508
5509 case PT_CLIST:
5510 for (i = min; i < max; i++)
5511 {
5512 const pcre_uint32 *cp;
5513 int len = 1;
5514 if (eptr >= md->end_subject)
5515 {
5516 SCHECK_PARTIAL();
5517 break;
5518 }
5519 GETCHARLENTEST(c, eptr, len);
5520 cp = PRIV(ucd_caseless_sets) + prop_value;
5521 for (;;)
5522 {
5523 if (c < *cp)
5524 { if (prop_fail_result) break; else goto GOT_MAX; }
5525 if (c == *cp++)
5526 { if (prop_fail_result) goto GOT_MAX; else break; }
5527 }
5528 eptr += len;
5529 }
5530 GOT_MAX:
5531 break;
5532
5533 case PT_UCNC:
5534 for (i = min; i < max; i++)
5535 {
5536 int len = 1;
5537 if (eptr >= md->end_subject)
5538 {
5539 SCHECK_PARTIAL();
5540 break;
5541 }
5542 GETCHARLENTEST(c, eptr, len);
5543 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5544 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5545 c >= 0xe000) == prop_fail_result)
5546 break;
5547 eptr += len;
5548 }
5549 break;
5550
5551 default:
5552 RRETURN(PCRE_ERROR_INTERNAL);
5553 }
5554
5555 /* eptr is now past the end of the maximum run */
5556
5557 if (possessive) continue;
5558 for(;;)
5559 {
5560 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5561 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5562 if (eptr-- == pp) break; /* Stop if tried at original pos */
5563 if (utf) BACKCHAR(eptr);
5564 }
5565 }
5566
5567 /* Match extended Unicode sequences. We will get here only if the
5568 support is in the binary; otherwise a compile-time error occurs. */
5569
5570 else if (ctype == OP_EXTUNI)
5571 {
5572 for (i = min; i < max; i++)
5573 {
5574 if (eptr >= md->end_subject)
5575 {
5576 SCHECK_PARTIAL();
5577 break;
5578 }
5579 else
5580 {
5581 int lgb, rgb;
5582 GETCHARINCTEST(c, eptr);
5583 lgb = UCD_GRAPHBREAK(c);
5584 while (eptr < md->end_subject)
5585 {
5586 int len = 1;
5587 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5588 rgb = UCD_GRAPHBREAK(c);
5589 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5590 lgb = rgb;
5591 eptr += len;
5592 }
5593 }
5594 CHECK_PARTIAL();
5595 }
5596
5597 /* eptr is now past the end of the maximum run */
5598
5599 if (possessive) continue;
5600
5601 for(;;)
5602 {
5603 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5604 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5605 if (eptr-- == pp) break; /* Stop if tried at original pos */
5606 for (;;) /* Move back over one extended */
5607 {
5608 if (!utf) c = *eptr; else
5609 {
5610 BACKCHAR(eptr);
5611 GETCHAR(c, eptr);
5612 }
5613 if (UCD_CATEGORY(c) != ucp_M) break;
5614 eptr--;
5615 }
5616 }
5617 }
5618
5619 else
5620 #endif /* SUPPORT_UCP */
5621
5622 #ifdef SUPPORT_UTF
5623 if (utf)
5624 {
5625 switch(ctype)
5626 {
5627 case OP_ANY:
5628 if (max < INT_MAX)
5629 {
5630 for (i = min; i < max; i++)
5631 {
5632 if (eptr >= md->end_subject)
5633 {
5634 SCHECK_PARTIAL();
5635 break;
5636 }
5637 if (IS_NEWLINE(eptr)) break;
5638 if (md->partial != 0 && /* Take care with CRLF partial */
5639 eptr + 1 >= md->end_subject &&
5640 NLBLOCK->nltype == NLTYPE_FIXED &&
5641 NLBLOCK->nllen == 2 &&
5642 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5643 {
5644 md->hitend = TRUE;
5645 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5646 }
5647 eptr++;
5648 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5649 }
5650 }
5651
5652 /* Handle unlimited UTF-8 repeat */
5653
5654 else
5655 {
5656 for (i = min; i < max; i++)
5657 {
5658 if (eptr >= md->end_subject)
5659 {
5660 SCHECK_PARTIAL();
5661 break;
5662 }
5663 if (IS_NEWLINE(eptr)) break;
5664 if (md->partial != 0 && /* Take care with CRLF partial */
5665 eptr + 1 >= md->end_subject &&
5666 NLBLOCK->nltype == NLTYPE_FIXED &&
5667 NLBLOCK->nllen == 2 &&
5668 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5669 {
5670 md->hitend = TRUE;
5671 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5672 }
5673 eptr++;
5674 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5675 }
5676 }
5677 break;
5678
5679 case OP_ALLANY:
5680 if (max < INT_MAX)
5681 {
5682 for (i = min; i < max; i++)
5683 {
5684 if (eptr >= md->end_subject)
5685 {
5686 SCHECK_PARTIAL();
5687 break;
5688 }
5689 eptr++;
5690 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5691 }
5692 }
5693 else
5694 {
5695 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5696 SCHECK_PARTIAL();
5697 }
5698 break;
5699
5700 /* The byte case is the same as non-UTF8 */
5701
5702 case OP_ANYBYTE:
5703 c = max - min;
5704 if (c > (unsigned int)(md->end_subject - eptr))
5705 {
5706 eptr = md->end_subject;
5707 SCHECK_PARTIAL();
5708 }
5709 else eptr += c;
5710 break;
5711
5712 case OP_ANYNL:
5713 for (i = min; i < max; i++)
5714 {
5715 int len = 1;
5716 if (eptr >= md->end_subject)
5717 {
5718 SCHECK_PARTIAL();
5719 break;
5720 }
5721 GETCHARLEN(c, eptr, len);
5722 if (c == CHAR_CR)
5723 {
5724 if (++eptr >= md->end_subject) break;
5725 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5726 }
5727 else
5728 {
5729 if (c != CHAR_LF &&
5730 (md->bsr_anycrlf ||
5731 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5732 #ifndef EBCDIC
5733 && c != 0x2028 && c != 0x2029
5734 #endif /* Not EBCDIC */
5735 )))
5736 break;
5737 eptr += len;
5738 }
5739 }
5740 break;
5741
5742 case OP_NOT_HSPACE:
5743 case OP_HSPACE:
5744 for (i = min; i < max; i++)
5745 {
5746 BOOL gotspace;
5747 int len = 1;
5748 if (eptr >= md->end_subject)
5749 {
5750 SCHECK_PARTIAL();
5751 break;
5752 }
5753 GETCHARLEN(c, eptr, len);
5754 switch(c)
5755 {
5756 HSPACE_CASES: gotspace = TRUE; break;
5757 default: gotspace = FALSE; break;
5758 }
5759 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5760 eptr += len;
5761 }
5762 break;
5763
5764 case OP_NOT_VSPACE:
5765 case OP_VSPACE:
5766 for (i = min; i < max; i++)
5767 {
5768 BOOL gotspace;
5769 int len = 1;
5770 if (eptr >= md->end_subject)
5771 {
5772 SCHECK_PARTIAL();
5773 break;
5774 }
5775 GETCHARLEN(c, eptr, len);
5776 switch(c)
5777 {
5778 VSPACE_CASES: gotspace = TRUE; break;
5779 default: gotspace = FALSE; break;
5780 }
5781 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5782 eptr += len;
5783 }
5784 break;
5785
5786 case OP_NOT_DIGIT:
5787 for (i = min; i < max; i++)
5788 {
5789 int len = 1;
5790 if (eptr >= md->end_subject)
5791 {
5792 SCHECK_PARTIAL();
5793 break;
5794 }
5795 GETCHARLEN(c, eptr, len);
5796 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5797 eptr+= len;
5798 }
5799 break;
5800
5801 case OP_DIGIT:
5802 for (i = min; i < max; i++)
5803 {
5804 int len = 1;
5805 if (eptr >= md->end_subject)
5806 {
5807 SCHECK_PARTIAL();
5808 break;
5809 }
5810 GETCHARLEN(c, eptr, len);
5811 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5812 eptr+= len;
5813 }
5814 break;
5815
5816 case OP_NOT_WHITESPACE:
5817 for (i = min; i < max; i++)
5818 {
5819 int len = 1;
5820 if (eptr >= md->end_subject)
5821 {
5822 SCHECK_PARTIAL();
5823 break;
5824 }
5825 GETCHARLEN(c, eptr, len);
5826 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5827 eptr+= len;
5828 }
5829 break;
5830
5831 case OP_WHITESPACE:
5832 for (i = min; i < max; i++)
5833 {
5834 int len = 1;
5835 if (eptr >= md->end_subject)
5836 {
5837 SCHECK_PARTIAL();
5838 break;
5839 }
5840 GETCHARLEN(c, eptr, len);
5841 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5842 eptr+= len;
5843 }
5844 break;
5845
5846 case OP_NOT_WORDCHAR:
5847 for (i = min; i < max; i++)
5848 {
5849 int len = 1;
5850 if (eptr >= md->end_subject)
5851 {
5852 SCHECK_PARTIAL();
5853 break;
5854 }
5855 GETCHARLEN(c, eptr, len);
5856 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5857 eptr+= len;
5858 }
5859 break;
5860
5861 case OP_WORDCHAR:
5862 for (i = min; i < max; i++)
5863 {
5864 int len = 1;
5865 if (eptr >= md->end_subject)
5866 {
5867 SCHECK_PARTIAL();
5868 break;
5869 }
5870 GETCHARLEN(c, eptr, len);
5871 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5872 eptr+= len;
5873 }
5874 break;
5875
5876 default:
5877 RRETURN(PCRE_ERROR_INTERNAL);
5878 }
5879
5880 /* eptr is now past the end of the maximum run. If possessive, we are
5881 done (no backing up). Otherwise, match at this position; anything other
5882 than no match is immediately returned. For nomatch, back up one
5883 character, unless we are matching \R and the last thing matched was
5884 \r\n, in which case, back up two bytes. */
5885
5886 if (possessive) continue;
5887 for(;;)
5888 {
5889 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5890 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5891 if (eptr-- == pp) break; /* Stop if tried at original pos */
5892 BACKCHAR(eptr);
5893 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5894 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5895 }
5896 }
5897 else
5898 #endif /* SUPPORT_UTF */
5899 /* Not UTF mode */
5900 {
5901 switch(ctype)
5902 {
5903 case OP_ANY:
5904 for (i = min; i < max; i++)
5905 {
5906 if (eptr >= md->end_subject)
5907 {
5908 SCHECK_PARTIAL();
5909 break;
5910 }
5911 if (IS_NEWLINE(eptr)) break;
5912 if (md->partial != 0 && /* Take care with CRLF partial */
5913 eptr + 1 >= md->end_subject &&
5914 NLBLOCK->nltype == NLTYPE_FIXED &&
5915 NLBLOCK->nllen == 2 &&
5916 *eptr == NLBLOCK->nl[0])
5917 {
5918 md->hitend = TRUE;
5919 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5920 }
5921 eptr++;
5922 }
5923 break;
5924
5925 case OP_ALLANY:
5926 case OP_ANYBYTE:
5927 c = max - min;
5928 if (c > (unsigned int)(md->end_subject - eptr))
5929 {
5930 eptr = md->end_subject;
5931 SCHECK_PARTIAL();
5932 }
5933 else eptr += c;
5934 break;
5935
5936 case OP_ANYNL:
5937 for (i = min; i < max; i++)
5938 {
5939 if (eptr >= md->end_subject)
5940 {
5941 SCHECK_PARTIAL();
5942 break;
5943 }
5944 c = *eptr;
5945 if (c == CHAR_CR)
5946 {
5947 if (++eptr >= md->end_subject) break;
5948 if (*eptr == CHAR_LF) eptr++;
5949 }
5950 else
5951 {
5952 if (c != CHAR_LF && (md->bsr_anycrlf ||
5953 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5954 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5955 && c != 0x2028 && c != 0x2029
5956 #endif
5957 ))) break;
5958 eptr++;
5959 }
5960 }
5961 break;
5962
5963 case OP_NOT_HSPACE:
5964 for (i = min; i < max; i++)
5965 {
5966 if (eptr >= md->end_subject)
5967 {
5968 SCHECK_PARTIAL();
5969 break;
5970 }
5971 switch(*eptr)
5972 {
5973 default: eptr++; break;
5974 HSPACE_BYTE_CASES:
5975 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5976 HSPACE_MULTIBYTE_CASES:
5977 #endif
5978 goto ENDLOOP00;
5979 }
5980 }
5981 ENDLOOP00:
5982 break;
5983
5984 case OP_HSPACE:
5985 for (i = min; i < max; i++)
5986 {
5987 if (eptr >= md->end_subject)
5988 {
5989 SCHECK_PARTIAL();
5990 break;
5991 }
5992 switch(*eptr)
5993 {
5994 default: goto ENDLOOP01;
5995 HSPACE_BYTE_CASES:
5996 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5997 HSPACE_MULTIBYTE_CASES:
5998 #endif
5999 eptr++; break;
6000 }
6001 }
6002 ENDLOOP01:
6003 break;
6004
6005 case OP_NOT_VSPACE:
6006 for (i = min; i < max; i++)
6007 {
6008 if (eptr >= md->end_subject)
6009 {
6010 SCHECK_PARTIAL();
6011 break;
6012 }
6013 switch(*eptr)
6014 {
6015 default: eptr++; break;
6016 VSPACE_BYTE_CASES:
6017 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6018 VSPACE_MULTIBYTE_CASES:
6019 #endif
6020 goto ENDLOOP02;
6021 }
6022 }
6023 ENDLOOP02:
6024 break;
6025
6026 case OP_VSPACE:
6027 for (i = min; i < max; i++)
6028 {
6029 if (eptr >= md->end_subject)
6030 {
6031 SCHECK_PARTIAL();
6032 break;
6033 }
6034 switch(*eptr)
6035 {
6036 default: goto ENDLOOP03;
6037 VSPACE_BYTE_CASES:
6038 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6039 VSPACE_MULTIBYTE_CASES:
6040 #endif
6041 eptr++; break;
6042 }
6043 }
6044 ENDLOOP03:
6045 break;
6046
6047 case OP_NOT_DIGIT:
6048 for (i = min; i < max; i++)
6049 {
6050 if (eptr >= md->end_subject)
6051 {
6052 SCHECK_PARTIAL();
6053 break;
6054 }
6055 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6056 eptr++;
6057 }
6058 break;
6059
6060 case OP_DIGIT:
6061 for (i = min; i < max; i++)
6062 {
6063 if (eptr >= md->end_subject)
6064 {
6065 SCHECK_PARTIAL();
6066 break;
6067 }
6068 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6069 eptr++;
6070 }
6071 break;
6072
6073 case OP_NOT_WHITESPACE:
6074 for (i = min; i < max; i++)
6075 {
6076 if (eptr >= md->end_subject)
6077 {
6078 SCHECK_PARTIAL();
6079 break;
6080 }
6081 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6082 eptr++;
6083 }
6084 break;
6085
6086 case OP_WHITESPACE:
6087 for (i = min; i < max; i++)
6088 {
6089 if (eptr >= md->end_subject)
6090 {
6091 SCHECK_PARTIAL();
6092 break;
6093 }
6094 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6095 eptr++;
6096 }
6097 break;
6098
6099 case OP_NOT_WORDCHAR:
6100 for (i = min; i < max; i++)
6101 {
6102 if (eptr >= md->end_subject)
6103 {
6104 SCHECK_PARTIAL();
6105 break;
6106 }
6107 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6108 eptr++;
6109 }
6110 break;
6111
6112 case OP_WORDCHAR:
6113 for (i = min; i < max; i++)
6114 {
6115 if (eptr >= md->end_subject)
6116 {
6117 SCHECK_PARTIAL();
6118 break;
6119 }
6120 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6121 eptr++;
6122 }
6123 break;
6124
6125 default:
6126 RRETURN(PCRE_ERROR_INTERNAL);
6127 }
6128
6129 /* eptr is now past the end of the maximum run. If possessive, we are
6130 done (no backing up). Otherwise, match at this position; anything other
6131 than no match is immediately returned. For nomatch, back up one
6132 character (byte), unless we are matching \R and the last thing matched
6133 was \r\n, in which case, back up two bytes. */
6134
6135 if (possessive) continue;
6136 while (eptr >= pp)
6137 {
6138 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6139 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6140 eptr--;
6141 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6142 eptr[-1] == CHAR_CR) eptr--;
6143 }
6144 }
6145
6146 /* Get here if we can't make it match with any permitted repetitions */
6147
6148 RRETURN(MATCH_NOMATCH);
6149 }
6150 /* Control never gets here */
6151
6152 /* There's been some horrible disaster. Arrival here can only mean there is
6153 something seriously wrong in the code above or the OP_xxx definitions. */
6154
6155 default:
6156 DPRINTF(("Unknown opcode %d\n", *ecode));
6157 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6158 }
6159
6160 /* Do not stick any code in here without much thought; it is assumed
6161 that "continue" in the code above comes out to here to repeat the main
6162 loop. */
6163
6164 } /* End of main loop */
6165 /* Control never reaches here */
6166
6167
6168 /* When compiling to use the heap rather than the stack for recursive calls to
6169 match(), the RRETURN() macro jumps here. The number that is saved in
6170 frame->Xwhere indicates which label we actually want to return to. */
6171
6172 #ifdef NO_RECURSE
6173 #define LBL(val) case val: goto L_RM##val;
6174 HEAP_RETURN:
6175 switch (frame->Xwhere)
6176 {
6177 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6178 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6179 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6180 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6181 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6182 LBL(65) LBL(66)
6183 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6184 LBL(21)
6185 #endif
6186 #ifdef SUPPORT_UTF
6187 LBL(16) LBL(18) LBL(20)
6188 LBL(22) LBL(23) LBL(28) LBL(30)
6189 LBL(32) LBL(34) LBL(42) LBL(46)
6190 #ifdef SUPPORT_UCP
6191 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6192 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67) LBL(68)
6193 #endif /* SUPPORT_UCP */
6194 #endif /* SUPPORT_UTF */
6195 default:
6196 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6197 return PCRE_ERROR_INTERNAL;
6198 }
6199 #undef LBL
6200 #endif /* NO_RECURSE */
6201 }
6202
6203
6204 /***************************************************************************
6205 ****************************************************************************
6206 RECURSION IN THE match() FUNCTION
6207
6208 Undefine all the macros that were defined above to handle this. */
6209
/* Clean-up of the names that were turned into macros for match(). The first
group is removed only in a NO_RECURSE build; fc and fi are removed in every
build. Removing them matters for the code that follows: pcre_exec(), later in
this file, uses "length" as a parameter name and "flags" as a local variable
name. NOTE(review): the matching #define lines are earlier in the file and are
not visible from here - presumably each name maps onto a field of the current
heap frame; confirm against that list and keep the two lists in step. */
6210 #ifdef NO_RECURSE
6211 #undef eptr
6212 #undef ecode
6213 #undef mstart
6214 #undef offset_top
6215 #undef eptrb
6216 #undef flags
6217
6218 #undef callpat
6219 #undef charptr
6220 #undef data
6221 #undef next
6222 #undef pp
6223 #undef prev
6224 #undef saved_eptr
6225
6226 #undef new_recursive
6227
6228 #undef cur_is_word
6229 #undef condition
6230 #undef prev_is_word
6231
6232 #undef ctype
6233 #undef length
6234 #undef max
6235 #undef min
6236 #undef number
6237 #undef offset
6238 #undef op
6239 #undef save_capture_last
6240 #undef save_offset1
6241 #undef save_offset2
6242 #undef save_offset3
6243 #undef stacksave
6244
6245 #undef newptrb
6246
6247 #endif
6248
6249 /* These two are defined as macros in both cases (with and without NO_RECURSE), so they are undefined unconditionally */
6250
6251 #undef fc
6252 #undef fi
6253
6254 /***************************************************************************
6255 ***************************************************************************/
6256
6257
6258 #ifdef NO_RECURSE
6259 /*************************************************
6260 * Release allocated heap frames *
6261 *************************************************/
6262
6263 /* This function releases all the allocated frames. The base frame is on the
6264 machine stack, and so must not be freed.
6265
6266 Argument: the address of the base frame
6267 Returns: nothing
6268 */
6269
6270 static void
6271 release_match_heapframes (heapframe *frame_base)
6272 {
6273 heapframe *nextframe = frame_base->Xnextframe;
6274 while (nextframe != NULL)
6275 {
6276 heapframe *oldframe = nextframe;
6277 nextframe = nextframe->Xnextframe;
6278 (PUBL(stack_free))(oldframe);
6279 }
6280 }
6281 #endif
6282
6283
6284 /*************************************************
6285 * Execute a Regular Expression *
6286 *************************************************/
6287
6288 /* This function applies a compiled re to a subject string and picks out
6289 portions of the string if it matches. Two elements in the vector are set for
6290 each substring: the offsets to the start and end of the substring.
6291
6292 Arguments:
6293 argument_re points to the compiled expression
6294 extra_data points to extra data or is NULL
6295 subject points to the subject string
6296 length length of subject string (may contain binary zeros)
6297 start_offset where to start in the subject string
6298 options option bits
6299 offsets points to a vector of ints to be filled in with offsets
6300 offsetcount the number of elements in the vector
6301
6302 Returns: > 0 => success; value is the number of elements filled in
6303 = 0 => success, but offsets is not big enough
6304 -1 => failed to match
6305 < -1 => some kind of unexpected problem
6306 */
6307
6308 #if defined COMPILE_PCRE8
6309 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6310 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6311 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6312 int offsetcount)
6313 #elif defined COMPILE_PCRE16
6314 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6315 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6316 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6317 int offsetcount)
6318 #elif defined COMPILE_PCRE32
6319 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6320 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6321 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6322 int offsetcount)
6323 #endif
6324 {
6325 int rc, ocount, arg_offset_max;
6326 int newline;
6327 BOOL using_temporary_offsets = FALSE;
6328 BOOL anchored;
6329 BOOL startline;
6330 BOOL firstline;
6331 BOOL utf;
6332 BOOL has_first_char = FALSE;
6333 BOOL has_req_char = FALSE;
6334 pcre_uchar first_char = 0;
6335 pcre_uchar first_char2 = 0;
6336 pcre_uchar req_char = 0;
6337 pcre_uchar req_char2 = 0;
6338 match_data match_block;
6339 match_data *md = &match_block;
6340 const pcre_uint8 *tables;
6341 const pcre_uint8 *start_bits = NULL;
6342 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6343 PCRE_PUCHAR end_subject;
6344 PCRE_PUCHAR start_partial = NULL;
6345 PCRE_PUCHAR match_partial;
6346 PCRE_PUCHAR req_char_ptr = start_match - 1;
6347
6348 const pcre_study_data *study;
6349 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6350
6351 #ifdef NO_RECURSE
6352 heapframe frame_zero;
6353 frame_zero.Xprevframe = NULL; /* Marks the top level */
6354 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6355 md->match_frames_base = &frame_zero;
6356 #endif
6357
6358 /* Check for the special magic call that measures the size of the stack used
6359 per recursive call of match(). Without the funny casting for sizeof, a Windows
6360 compiler gave this error: "unary minus operator applied to unsigned type,
6361 result still unsigned". Hopefully the cast fixes that. */
6362
6363 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6364 start_offset == -999)
6365 #ifdef NO_RECURSE
6366 return -((int)sizeof(heapframe));
6367 #else
6368 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6369 #endif
6370
6371 /* Plausibility checks */
6372
6373 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6374 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6375 return PCRE_ERROR_NULL;
6376 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6377 if (length < 0) return PCRE_ERROR_BADLENGTH;
6378 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6379
6380 /* Check that the first field in the block is the magic number. If it is not,
6381 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6382 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6383 means that the pattern is likely compiled with different endianness. */
6384
6385 if (re->magic_number != MAGIC_NUMBER)
6386 return re->magic_number == REVERSED_MAGIC_NUMBER?
6387 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6388 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6389
6390 /* These two settings are used in the code for checking a UTF-8 string that
6391 follows immediately afterwards. Other values in the md block are used only
6392 during "normal" pcre_exec() processing, not when the JIT support is in use,
6393 so they are set up later. */
6394
6395 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6396 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6397 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6398 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6399
6400 /* Check a UTF-8 string if required. Pass back the character offset and error
6401 code for an invalid string if a results vector is available. */
6402
6403 #ifdef SUPPORT_UTF
6404 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6405 {
6406 int erroroffset;
6407 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6408 if (errorcode != 0)
6409 {
6410 if (offsetcount >= 2)
6411 {
6412 offsets[0] = erroroffset;
6413 offsets[1] = errorcode;
6414 }
6415 #if defined COMPILE_PCRE8
6416 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6417 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6418 #elif defined COMPILE_PCRE16
6419 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6420 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6421 #elif defined COMPILE_PCRE32
6422 return PCRE_ERROR_BADUTF32;
6423 #endif
6424 }
6425 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6426 /* Check that a start_offset points to the start of a UTF character. */
6427 if (start_offset > 0 && start_offset < length &&
6428 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6429 return PCRE_ERROR_BADUTF8_OFFSET;
6430 #endif
6431 }
6432 #endif
6433
6434 /* If the pattern was successfully studied with JIT support, run the JIT
6435 executable instead of the rest of this function. Most options must be set at
6436 compile time for the JIT code to be usable. Fall back to the normal code path if
6437 an unsupported flag is set. */
6438
6439 #ifdef SUPPORT_JIT
6440 if (extra_data != NULL
6441 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6442 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6443 && extra_data->executable_jit != NULL
6444 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6445 {
6446 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6447 start_offset, options, offsets, offsetcount);
6448
6449 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6450 mode is not compiled. In this case we simply fall back to the interpreter. */
6451
6452 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6453 }
6454 #endif
6455
6456 /* Carry on with non-JIT matching. This information is for finding all the
6457 numbers associated with a given name, for condition testing. */
6458
6459 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6460 md->name_count = re->name_count;
6461 md->name_entry_size = re->name_entry_size;
6462
6463 /* Fish out the optional data from the extra_data structure, first setting
6464 the default values. */
6465
6466 study = NULL;
6467 md->match_limit = MATCH_LIMIT;
6468 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6469 md->callout_data = NULL;
6470
6471 /* The table pointer is always in native byte order. */
6472
6473 tables = re->tables;
6474
6475 if (extra_data != NULL)
6476 {
6477 register unsigned int flags = extra_data->flags;
6478 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6479 study = (const pcre_study_data *)extra_data->study_data;
6480 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6481 md->match_limit = extra_data->match_limit;
6482 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6483 md->match_limit_recursion = extra_data->match_limit_recursion;
6484 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6485 md->callout_data = extra_data->callout_data;
6486 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6487 }
6488
6489 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6490 is a feature that makes it possible to save compiled regex and re-use them
6491 in other programs later. */
6492
6493 if (tables == NULL) tables = PRIV(default_tables);
6494
6495 /* Set up other data */
6496
6497 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6498 startline = (re->flags & PCRE_STARTLINE) != 0;
6499 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6500
6501 /* The code starts after the real_pcre block and the capture name table. */
6502
6503 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6504 re->name_count * re->name_entry_size;
6505
6506 md->start_subject = (PCRE_PUCHAR)subject;
6507 md->start_offset = start_offset;
6508 md->end_subject = md->start_subject + length;
6509 end_subject = md->end_subject;
6510
6511 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6512 md->use_ucp = (re->options & PCRE_UCP) != 0;
6513 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6514 md->ignore_skip_arg = FALSE;
6515
6516 /* Some options are unpacked into BOOL variables in the hope that testing
6517 them will be faster than individual option bits. */
6518
6519 md->notbol = (options & PCRE_NOTBOL) != 0;
6520 md->noteol = (options & PCRE_NOTEOL) != 0;
6521 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6522 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6523
6524 md->hitend = FALSE;
6525 md->mark = md->nomatch_mark = NULL; /* In case never set */
6526
6527 md->recursive = NULL; /* No recursion at top level */
6528 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6529
6530 md->lcc = tables + lcc_offset;
6531 md->fcc = tables + fcc_offset;
6532 md->ctypes = tables + ctypes_offset;
6533
6534 /* Handle different \R options. */
6535
6536 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6537 {
6538 case 0:
6539 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6540 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6541 else
6542 #ifdef BSR_ANYCRLF
6543 md->bsr_anycrlf = TRUE;
6544 #else
6545 md->bsr_anycrlf = FALSE;
6546 #endif
6547 break;
6548
6549 case PCRE_BSR_ANYCRLF:
6550 md->bsr_anycrlf = TRUE;
6551 break;
6552
6553 case PCRE_BSR_UNICODE:
6554 md->bsr_anycrlf = FALSE;
6555 break;
6556
6557 default: return PCRE_ERROR_BADNEWLINE;
6558 }
6559
6560 /* Handle different types of newline. The three bits give eight cases. If
6561 nothing is set at run time, whatever was used at compile time applies. */
6562
6563 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6564 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6565 {
6566 case 0: newline = NEWLINE; break; /* Compile-time default */
6567 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6568 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6569 case PCRE_NEWLINE_CR+
6570 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6571 case PCRE_NEWLINE_ANY: newline = -1; break;
6572 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6573 default: return PCRE_ERROR_BADNEWLINE;
6574 }
6575
6576 if (newline == -2)
6577 {
6578 md->nltype = NLTYPE_ANYCRLF;
6579 }
6580 else if (newline < 0)
6581 {
6582 md->nltype = NLTYPE_ANY;
6583 }
6584 else
6585 {
6586 md->nltype = NLTYPE_FIXED;
6587 if (newline > 255)
6588 {
6589 md->nllen = 2;
6590 md->nl[0] = (newline >> 8) & 255;
6591 md->nl[1] = newline & 255;
6592 }
6593 else
6594 {
6595 md->nllen = 1;
6596 md->nl[0] = newline;
6597 }
6598 }
6599
6600 /* Partial matching was originally supported only for a restricted set of
6601 regexes; from release 8.00 there are no restrictions, but the bits are still
6602 defined (though never set). So there's no harm in leaving this code. */
6603
6604 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6605 return PCRE_ERROR_BADPARTIAL;
6606
6607 /* If the expression has got more back references than the offsets supplied can
6608 hold, we get a temporary chunk of working store to use during the matching.
6609 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6610 of 3. */
6611
6612 ocount = offsetcount - (offsetcount % 3);
6613 arg_offset_max = (2*ocount)/3;
6614
6615 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6616 {
6617 ocount = re->top_backref * 3 + 3;
6618 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6619 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6620 using_temporary_offsets = TRUE;
6621 DPRINTF(("Got memory to hold back references\n"));
6622 }
6623 else md->offset_vector = offsets;
6624 md->offset_end = ocount;
6625 md->offset_max = (2*ocount)/3;
6626 md->capture_last = 0;
6627
6628 /* Reset the working variable associated with each extraction. These should
6629 never be used unless previously set, but they get saved and restored, and so we
6630 initialize them to avoid reading uninitialized locations. Also, unset the
6631 offsets for the matched string. This is really just for tidiness with callouts,
6632 in case they inspect these fields. */
6633
6634 if (md->offset_vector != NULL)
6635 {
6636 register int *iptr = md->offset_vector + ocount;
6637 register int *iend = iptr - re->top_bracket;
6638 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6639 while (--iptr >= iend) *iptr = -1;
6640 md->offset_vector[0] = md->offset_vector[1] = -1;
6641 }
6642
6643 /* Set up the first character to match, if available. The first_char value is
6644 never set for an anchored regular expression, but the anchoring may be forced
6645 at run time, so we have to test for anchoring. The first char may be unset for
6646 an unanchored pattern, of course. If there's no first char and the pattern was
6647 studied, there may be a bitmap of possible first characters. */
6648
6649 if (!anchored)
6650 {
6651 if ((re->flags & PCRE_FIRSTSET) != 0)
6652 {
6653 has_first_char = TRUE;
6654 first_char = first_char2 = (pcre_uchar)(re->first_char);
6655 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6656 {
6657 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6658 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6659 if (utf && first_char > 127)
6660 first_char2 = UCD_OTHERCASE(first_char);
6661 #endif
6662 }
6663 }
6664 else
6665 if (!startline && study != NULL &&
6666 (study->flags & PCRE_STUDY_MAPPED) != 0)
6667 start_bits = study->start_bits;
6668 }
6669
6670 /* For anchored or unanchored matches, there may be a "last known required
6671 character" set. */
6672
6673 if ((re->flags & PCRE_REQCHSET) != 0)
6674 {
6675 has_req_char = TRUE;
6676 req_char = req_char2 = (pcre_uchar)(re->req_char);
6677 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6678 {
6679 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6680 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6681 if (utf && req_char > 127)
6682 req_char2 = UCD_OTHERCASE(req_char);
6683 #endif
6684 }
6685 }
6686
6687
6688 /* ==========================================================================*/
6689
6690 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6691 the loop runs just once. */
6692
6693 for(;;)
6694 {
6695 PCRE_PUCHAR save_end_subject = end_subject;
6696 PCRE_PUCHAR new_start_match;
6697
6698 /* If firstline is TRUE, the start of the match is constrained to the first
6699 line of a multiline string. That is, the match must be before or at the first
6700 newline. Implement this by temporarily adjusting end_subject so that we stop
6701 scanning at a newline. If the match fails at the newline, later code breaks
6702 this loop. */
6703
6704 if (firstline)
6705 {
6706 PCRE_PUCHAR t = start_match;
6707 #ifdef SUPPORT_UTF
6708 if (utf)
6709 {
6710 while (t < md->end_subject && !IS_NEWLINE(t))
6711 {
6712 t++;
6713 ACROSSCHAR(t < end_subject, *t, t++);
6714 }
6715 }
6716 else
6717 #endif
6718 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6719 end_subject = t;
6720 }
6721
6722 /* There are some optimizations that avoid running the match if a known
6723 starting point is not found, or if a known later character is not present.
6724 However, there is an option that disables these, for testing and for ensuring
6725 that all callouts do actually occur. The option can be set in the regex by
6726 (*NO_START_OPT) or passed in match-time options. */
6727
6728 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6729 {
6730 /* Advance to a unique first char if there is one. */
6731
6732 if (has_first_char)
6733 {
6734 pcre_uchar smc;
6735
6736 if (first_char != first_char2)
6737 while (start_match < end_subject &&
6738 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6739 start_match++;
6740 else
6741 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6742 start_match++;
6743 }
6744
6745 /* Or to just after a linebreak for a multiline match */
6746
6747 else if (startline)
6748 {
6749 if (start_match > md->start_subject + start_offset)
6750 {
6751 #ifdef SUPPORT_UTF
6752 if (utf)
6753 {
6754 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6755 {
6756 start_match++;
6757 ACROSSCHAR(start_match < end_subject, *start_match,
6758 start_match++);
6759 }