/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1041 - (show annotations)
Sun Sep 16 10:16:27 2012 UTC (7 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 207183 byte(s)
Turn case lists for horizontal and vertical white space into macros so they are 
defined only once.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
62
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
65
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
68
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
71
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
74
75 #define MATCH_ACCEPT (-999)
76 #define MATCH_COMMIT (-998)
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996)
79 #define MATCH_PRUNE (-995)
80 #define MATCH_SKIP (-994)
81 #define MATCH_SKIP_ARG (-993)
82 #define MATCH_THEN (-992)
83
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
87
88 #define REC_STACK_SAVE_MAX 30
89
/* Minimum and maximum counts for the six common repeat forms. Both tables use
the same index. In rep_max a value of 0 stands for "no upper limit". Read as
pairs, the entries are: two forms with min 0 and no limit, two forms with min 1
and no limit, and two forms with min 0 and max 1. */

static const char rep_min[] = {
  0, 0,     /* zero or more */
  1, 1,     /* one or more */
  0, 0      /* zero or one */
};

static const char rep_max[] = {
  0, 0,     /* unlimited */
  0, 0,     /* unlimited */
  1, 1      /* at most one */
};
94
95
96
97 #ifdef PCRE_DEBUG
98 /*************************************************
99 * Debugging function to print chars *
100 *************************************************/
101
102 /* Print a sequence of chars in printable format, stopping at the end of the
103 subject if the requested.
104
105 Arguments:
106 p points to characters
107 length number to print
108 is_subject TRUE if printing from within md->start_subject
109 md pointer to matching data block, if is_subject is TRUE
110
111 Returns: nothing
112 */
113
114 static void
115 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
116 {
117 unsigned int c;
118 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
119 while (length-- > 0)
120 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
121 }
122 #endif
123
124
125
126 /*************************************************
127 * Match a back-reference *
128 *************************************************/
129
130 /* Normally, if a back reference hasn't been set, the length that is passed is
131 negative, so the match always fails. However, in JavaScript compatibility mode,
132 the length passed is zero. Note that in caseless UTF-8 mode, the number of
133 subject bytes matched may be different to the number of reference bytes.
134
135 Arguments:
136 offset index into the offset vector
137 eptr pointer into the subject
138 length length of reference to be matched (number of bytes)
139 md points to match data block
140 caseless TRUE if caseless
141
142 Returns: >= 0 the number of subject bytes matched
143 -1 no match
144 -2 partial match; always given if at end subject
145 */
146
147 static int
148 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
149 BOOL caseless)
150 {
151 PCRE_PUCHAR eptr_start = eptr;
152 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if reference not set (and not JavaScript compatible - in that
168 case the length is passed as zero). */
169
170 if (length < 0) return -1;
171
172 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173 properly if Unicode properties are supported. Otherwise, we can check only
174 ASCII characters. */
175
176 if (caseless)
177 {
178 #ifdef SUPPORT_UTF
179 #ifdef SUPPORT_UCP
180 if (md->utf)
181 {
182 /* Match characters up to the end of the reference. NOTE: the number of
183 bytes matched may differ, because there are some characters whose upper and
184 lower case versions code as different numbers of bytes. For example, U+023A
185 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
186 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
187 the latter. It is important, therefore, to check the length along the
188 reference, not along the subject (earlier code did this wrong). */
189
190 PCRE_PUCHAR endptr = p + length;
191 while (p < endptr)
192 {
193 int c, d;
194 if (eptr >= md->end_subject) return -2; /* Partial match */
195 GETCHARINC(c, eptr);
196 GETCHARINC(d, p);
197 if (c != d && c != UCD_OTHERCASE(d)) return -1;
198 }
199 }
200 else
201 #endif
202 #endif
203
204 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
205 is no UCP support. */
206 {
207 while (length-- > 0)
208 {
209 if (eptr >= md->end_subject) return -2; /* Partial match */
210 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
211 p++;
212 eptr++;
213 }
214 }
215 }
216
217 /* In the caseful case, we can just compare the bytes, whether or not we
218 are in UTF-8 mode. */
219
220 else
221 {
222 while (length-- > 0)
223 {
224 if (eptr >= md->end_subject) return -2; /* Partial match */
225 if (*p++ != *eptr++) return -1;
226 }
227 }
228
229 return (int)(eptr - eptr_start);
230 }
231
232
233
234 /***************************************************************************
235 ****************************************************************************
236 RECURSION IN THE match() FUNCTION
237
238 The match() function is highly recursive, though not every recursive call
239 increases the recursive depth. Nevertheless, some regular expressions can cause
240 it to recurse to a great depth. I was writing for Unix, so I just let it call
241 itself recursively. This uses the stack for saving everything that has to be
242 saved for a recursive call. On Unix, the stack can be large, and this works
243 fine.
244
245 It turns out that on some non-Unix-like systems there are problems with
246 programs that use a lot of stack. (This despite the fact that every last chip
247 has oodles of memory these days, and techniques for extending the stack have
248 been known for decades.) So....
249
250 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
251 calls by keeping local variables that need to be preserved in blocks of memory
252 obtained from malloc() instead of on the stack. Macros are used to
253 achieve this so that the actual code doesn't look very different to what it
254 always used to.
255
256 The original heap-recursive code used longjmp(). However, it seems that this
257 can be very slow on some operating systems. Following a suggestion from Stan
258 Switzer, the use of longjmp() has been abolished, at the cost of having to
259 provide a unique number for each call to RMATCH. There is no way of generating
260 a sequence of numbers at compile time in C. I have given them names, to make
261 them stand out more clearly.
262
263 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
264 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
265 tests. Furthermore, not using longjmp() means that local dynamic variables
266 don't have indeterminate values; this has meant that the frame size can be
267 reduced because the result can be "passed back" by straight setting of the
268 variable instead of being passed in the frame.
269 ****************************************************************************
270 ***************************************************************************/
271
272 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
273 below must be updated in sync. */
274
275 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
276 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
277 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
278 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
279 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
280 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
281 RM61, RM62, RM63, RM64, RM65, RM66 };
282
283 /* These versions of the macros use the stack, as normal. There are debugging
284 versions and production versions. Note that the "rw" argument of RMATCH isn't
285 actually used in this definition. */
286
287 #ifndef NO_RECURSE
288 #define REGISTER register
289
290 #ifdef PCRE_DEBUG
291 #define RMATCH(ra,rb,rc,rd,re,rw) \
292 { \
293 printf("match() called in line %d\n", __LINE__); \
294 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
295 printf("to line %d\n", __LINE__); \
296 }
297 #define RRETURN(ra) \
298 { \
299 printf("match() returned %d from line %d ", ra, __LINE__); \
300 return ra; \
301 }
302 #else
303 #define RMATCH(ra,rb,rc,rd,re,rw) \
304 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
305 #define RRETURN(ra) return ra
306 #endif
307
308 #else
309
310
311 /* These versions of the macros manage a private stack on the heap. Note that
312 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
313 argument of match(), which never changes. */
314
315 #define REGISTER
316
317 #define RMATCH(ra,rb,rc,rd,re,rw)\
318 {\
319 heapframe *newframe = frame->Xnextframe;\
320 if (newframe == NULL)\
321 {\
322 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
323 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
324 newframe->Xnextframe = NULL;\
325 frame->Xnextframe = newframe;\
326 }\
327 frame->Xwhere = rw;\
328 newframe->Xeptr = ra;\
329 newframe->Xecode = rb;\
330 newframe->Xmstart = mstart;\
331 newframe->Xoffset_top = rc;\
332 newframe->Xeptrb = re;\
333 newframe->Xrdepth = frame->Xrdepth + 1;\
334 newframe->Xprevframe = frame;\
335 frame = newframe;\
336 DPRINTF(("restarting from line %d\n", __LINE__));\
337 goto HEAP_RECURSE;\
338 L_##rw:\
339 DPRINTF(("jumped back to line %d\n", __LINE__));\
340 }
341
342 #define RRETURN(ra)\
343 {\
344 heapframe *oldframe = frame;\
345 frame = oldframe->Xprevframe;\
346 if (frame != NULL)\
347 {\
348 rrc = ra;\
349 goto HEAP_RETURN;\
350 }\
351 return ra;\
352 }
353
354
355 /* Structure for remembering the local variables in a private frame */
356
357 typedef struct heapframe {
358 struct heapframe *Xprevframe;
359 struct heapframe *Xnextframe;
360
361 /* Function arguments that may change */
362
363 PCRE_PUCHAR Xeptr;
364 const pcre_uchar *Xecode;
365 PCRE_PUCHAR Xmstart;
366 int Xoffset_top;
367 eptrblock *Xeptrb;
368 unsigned int Xrdepth;
369
370 /* Function local variables */
371
372 PCRE_PUCHAR Xcallpat;
373 #ifdef SUPPORT_UTF
374 PCRE_PUCHAR Xcharptr;
375 #endif
376 PCRE_PUCHAR Xdata;
377 PCRE_PUCHAR Xnext;
378 PCRE_PUCHAR Xpp;
379 PCRE_PUCHAR Xprev;
380 PCRE_PUCHAR Xsaved_eptr;
381
382 recursion_info Xnew_recursive;
383
384 BOOL Xcur_is_word;
385 BOOL Xcondition;
386 BOOL Xprev_is_word;
387
388 #ifdef SUPPORT_UCP
389 int Xprop_type;
390 int Xprop_value;
391 int Xprop_fail_result;
392 int Xoclength;
393 pcre_uchar Xocchars[6];
394 #endif
395
396 int Xcodelink;
397 int Xctype;
398 unsigned int Xfc;
399 int Xfi;
400 int Xlength;
401 int Xmax;
402 int Xmin;
403 int Xnumber;
404 int Xoffset;
405 int Xop;
406 int Xsave_capture_last;
407 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
408 int Xstacksave[REC_STACK_SAVE_MAX];
409
410 eptrblock Xnewptrb;
411
412 /* Where to jump back to */
413
414 int Xwhere;
415
416 } heapframe;
417
418 #endif
419
420
421 /***************************************************************************
422 ***************************************************************************/
423
424
425
426 /*************************************************
427 * Match from current position *
428 *************************************************/
429
430 /* This function is called recursively in many circumstances. Whenever it
431 returns a negative (error) response, the outer incarnation must also return the
432 same response. */
433
434 /* These macros pack up tests that are used for partial matching, and which
435 appear several times in the code. We set the "hit end" flag if the pointer is
436 at the end of the subject and also past the start of the subject (i.e.
437 something has been matched). For hard partial matching, we then return
438 immediately. The second one is used when we already know we are past the end of
439 the subject. */
440
441 #define CHECK_PARTIAL()\
442 if (md->partial != 0 && eptr >= md->end_subject && \
443 eptr > md->start_used_ptr) \
444 { \
445 md->hitend = TRUE; \
446 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
447 }
448
449 #define SCHECK_PARTIAL()\
450 if (md->partial != 0 && eptr > md->start_used_ptr) \
451 { \
452 md->hitend = TRUE; \
453 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
454 }
455
456
457 /* Performance note: It might be tempting to extract commonly used fields from
458 the md structure (e.g. utf, end_subject) into individual variables to improve
459 performance. Tests using gcc on a SPARC disproved this; in the first case, it
460 made performance worse.
461
462 Arguments:
463 eptr pointer to current character in subject
464 ecode pointer to current position in compiled code
465 mstart pointer to the current match start position (can be modified
466 by encountering \K)
467 offset_top current top pointer
468 md pointer to "static" info for the match
469 eptrb pointer to chain of blocks containing eptr at start of
470 brackets - for testing for empty matches
471 rdepth the recursion depth
472
473 Returns: MATCH_MATCH if matched ) these values are >= 0
474 MATCH_NOMATCH if failed to match )
475 a negative MATCH_xxx value for PRUNE, SKIP, etc
476 a negative PCRE_ERROR_xxx value if aborted by an error condition
477 (e.g. stopped by repeated call or recursion limit)
478 */
479
480 static int
481 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
482 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
483 unsigned int rdepth)
484 {
485 /* These variables do not need to be preserved over recursion in this function,
486 so they can be ordinary variables in all cases. Mark some of them with
487 "register" because they are used a lot in loops. */
488
489 register int rrc; /* Returns from recursive calls */
490 register int i; /* Used for loops not involving calls to RMATCH() */
491 register unsigned int c; /* Character values not kept over RMATCH() calls */
492 register BOOL utf; /* Local copy of UTF flag for speed */
493
494 BOOL minimize, possessive; /* Quantifier options */
495 BOOL caseless;
496 int condcode;
497
498 /* When recursion is not being used, all "local" variables that have to be
499 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
500 frame on the stack here; subsequent instantiations are obtained from the heap
501 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
502 the top-level on the stack rather than malloc-ing them all gives a performance
503 boost in many cases where there is not much "recursion". */
504
505 #ifdef NO_RECURSE
506 heapframe *frame = (heapframe *)md->match_frames_base;
507
508 /* Copy in the original argument variables */
509
510 frame->Xeptr = eptr;
511 frame->Xecode = ecode;
512 frame->Xmstart = mstart;
513 frame->Xoffset_top = offset_top;
514 frame->Xeptrb = eptrb;
515 frame->Xrdepth = rdepth;
516
517 /* This is where control jumps back to in order to effect "recursion" */
518
519 HEAP_RECURSE:
520
521 /* Macros make the argument variables come from the current frame */
522
523 #define eptr frame->Xeptr
524 #define ecode frame->Xecode
525 #define mstart frame->Xmstart
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define oclength frame->Xoclength
554 #define occhars frame->Xocchars
555 #endif
556
557 #define ctype frame->Xctype
558 #define fc frame->Xfc
559 #define fi frame->Xfi
560 #define length frame->Xlength
561 #define max frame->Xmax
562 #define min frame->Xmin
563 #define number frame->Xnumber
564 #define offset frame->Xoffset
565 #define op frame->Xop
566 #define save_capture_last frame->Xsave_capture_last
567 #define save_offset1 frame->Xsave_offset1
568 #define save_offset2 frame->Xsave_offset2
569 #define save_offset3 frame->Xsave_offset3
570 #define stacksave frame->Xstacksave
571
572 #define newptrb frame->Xnewptrb
573
574 /* When recursion is being used, local variables are allocated on the stack and
575 get preserved during recursion in the normal way. In this environment, fi and
576 i, and fc and c, can be the same variables. */
577
578 #else /* NO_RECURSE not defined */
579 #define fi i
580 #define fc c
581
582 /* Many of the following variables are used only in small blocks of the code.
583 My normal style of coding would have declared them within each of those blocks.
584 However, in order to accommodate the version of this code that uses an external
585 "stack" implemented on the heap, it is easier to declare them all here, so the
586 declarations can be cut out in a block. The only declarations within blocks
587 below are for variables that do not have to be preserved over a recursive call
588 to RMATCH(). */
589
590 #ifdef SUPPORT_UTF
591 const pcre_uchar *charptr;
592 #endif
593 const pcre_uchar *callpat;
594 const pcre_uchar *data;
595 const pcre_uchar *next;
596 PCRE_PUCHAR pp;
597 const pcre_uchar *prev;
598 PCRE_PUCHAR saved_eptr;
599
600 recursion_info new_recursive;
601
602 BOOL cur_is_word;
603 BOOL condition;
604 BOOL prev_is_word;
605
606 #ifdef SUPPORT_UCP
607 int prop_type;
608 int prop_value;
609 int prop_fail_result;
610 int oclength;
611 pcre_uchar occhars[6];
612 #endif
613
614 int codelink;
615 int ctype;
616 int length;
617 int max;
618 int min;
619 int number;
620 int offset;
621 int op;
622 int save_capture_last;
623 int save_offset1, save_offset2, save_offset3;
624 int stacksave[REC_STACK_SAVE_MAX];
625
626 eptrblock newptrb;
627
628 /* There is a special fudge for calling match() in a way that causes it to
629 measure the size of its basic stack frame when the stack is being used for
630 recursion. The second argument (ecode) being NULL triggers this behaviour. It
631 cannot normally ever be NULL. The return is the negated value of the frame
632 size. */
633
634 if (ecode == NULL)
635 {
636 if (rdepth == 0)
637 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
638 else
639 {
640 int len = (char *)&rdepth - (char *)eptr;
641 return (len > 0)? -len : len;
642 }
643 }
644 #endif /* NO_RECURSE */
645
646 /* To save space on the stack and in the heap frame, I have doubled up on some
647 of the local variables that are used only in localised parts of the code, but
648 still need to be preserved over recursive calls of match(). These macros define
649 the alternative names that are used. */
650
651 #define allow_zero cur_is_word
652 #define cbegroup condition
653 #define code_offset codelink
654 #define condassert condition
655 #define matched_once prev_is_word
656 #define foc number
657 #define save_mark data
658
659 /* These statements are here to stop the compiler complaining about uninitialized
660 variables. */
661
662 #ifdef SUPPORT_UCP
663 prop_value = 0;
664 prop_fail_result = 0;
665 #endif
666
667
668 /* This label is used for tail recursion, which is used in a few cases even
669 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
670 used. Thanks to Ian Taylor for noticing this possibility and sending the
671 original patch. */
672
673 TAIL_RECURSE:
674
675 /* OK, now we can get on with the real code of the function. Recursive calls
676 are specified by the macro RMATCH and RRETURN is used to return. When
677 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
678 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
679 defined). However, RMATCH isn't like a function call because it's quite a
680 complicated macro. It has to be used in one particular way. This shouldn't,
681 however, impact performance when true recursion is being used. */
682
683 #ifdef SUPPORT_UTF
684 utf = md->utf; /* Local copy of the flag */
685 #else
686 utf = FALSE;
687 #endif
688
689 /* First check that we haven't called match() too many times, or that we
690 haven't exceeded the recursive call limit. */
691
692 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
693 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
694
695 /* At the start of a group with an unlimited repeat that may match an empty
696 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
697 done this way to save having to use another function argument, which would take
698 up space on the stack. See also MATCH_CONDASSERT below.
699
700 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
701 such remembered pointers, to be checked when we hit the closing ket, in order
702 to break infinite loops that match no characters. When match() is called in
703 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
704 NOT be used with tail recursion, because the memory block that is used is on
705 the stack, so a new one may be required for each match(). */
706
707 if (md->match_function_type == MATCH_CBEGROUP)
708 {
709 newptrb.epb_saved_eptr = eptr;
710 newptrb.epb_prev = eptrb;
711 eptrb = &newptrb;
712 md->match_function_type = 0;
713 }
714
715 /* Now start processing the opcodes. */
716
717 for (;;)
718 {
719 minimize = possessive = FALSE;
720 op = *ecode;
721
722 switch(op)
723 {
724 case OP_MARK:
725 md->nomatch_mark = ecode + 2;
726 md->mark = NULL; /* In case previously set by assertion */
727 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
728 eptrb, RM55);
729 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
730 md->mark == NULL) md->mark = ecode + 2;
731
732 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
733 argument, and we must check whether that argument matches this MARK's
734 argument. It is passed back in md->start_match_ptr (an overloading of that
735 variable). If it does match, we reset that variable to the current subject
736 position and return MATCH_SKIP. Otherwise, pass back the return code
737 unaltered. */
738
739 else if (rrc == MATCH_SKIP_ARG &&
740 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
741 {
742 md->start_match_ptr = eptr;
743 RRETURN(MATCH_SKIP);
744 }
745 RRETURN(rrc);
746
747 case OP_FAIL:
748 RRETURN(MATCH_NOMATCH);
749
750 /* COMMIT overrides PRUNE, SKIP, and THEN */
751
752 case OP_COMMIT:
753 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 eptrb, RM52);
755 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
756 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
757 rrc != MATCH_THEN)
758 RRETURN(rrc);
759 RRETURN(MATCH_COMMIT);
760
761 /* PRUNE overrides THEN */
762
763 case OP_PRUNE:
764 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
765 eptrb, RM51);
766 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
767 RRETURN(MATCH_PRUNE);
768
769 case OP_PRUNE_ARG:
770 md->nomatch_mark = ecode + 2;
771 md->mark = NULL; /* In case previously set by assertion */
772 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
773 eptrb, RM56);
774 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
775 md->mark == NULL) md->mark = ecode + 2;
776 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
777 RRETURN(MATCH_PRUNE);
778
779 /* SKIP overrides PRUNE and THEN */
780
781 case OP_SKIP:
782 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
783 eptrb, RM53);
784 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
785 RRETURN(rrc);
786 md->start_match_ptr = eptr; /* Pass back current position */
787 RRETURN(MATCH_SKIP);
788
789 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
790 nomatch_mark. There is a flag that disables this opcode when re-matching a
791 pattern that ended with a SKIP for which there was not a matching MARK. */
792
793 case OP_SKIP_ARG:
794 if (md->ignore_skip_arg)
795 {
796 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
797 break;
798 }
799 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
800 eptrb, RM57);
801 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
802 RRETURN(rrc);
803
804 /* Pass back the current skip name by overloading md->start_match_ptr and
805 returning the special MATCH_SKIP_ARG return code. This will either be
806 caught by a matching MARK, or get to the top, where it causes a rematch
807 with the md->ignore_skip_arg flag set. */
808
809 md->start_match_ptr = ecode + 2;
810 RRETURN(MATCH_SKIP_ARG);
811
812 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
813 the branch in which it occurs can be determined. Overload the start of
814 match pointer to do this. */
815
816 case OP_THEN:
817 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
818 eptrb, RM54);
819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
820 md->start_match_ptr = ecode;
821 RRETURN(MATCH_THEN);
822
823 case OP_THEN_ARG:
824 md->nomatch_mark = ecode + 2;
825 md->mark = NULL; /* In case previously set by assertion */
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
827 md, eptrb, RM58);
828 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
829 md->mark == NULL) md->mark = ecode + 2;
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 md->start_match_ptr = ecode;
832 RRETURN(MATCH_THEN);
833
834 /* Handle an atomic group that does not contain any capturing parentheses.
835 This can be handled like an assertion. Prior to 8.13, all atomic groups
836 were handled this way. In 8.13, the code was changed as below for ONCE, so
837 that backups pass through the group and thereby reset captured values.
838 However, this uses a lot more stack, so in 8.20, atomic groups that do not
839 contain any captures generate OP_ONCE_NC, which can be handled in the old,
840 less stack intensive way.
841
842 Check the alternative branches in turn - the matching won't pass the KET
843 for this kind of subpattern. If any one branch matches, we carry on as at
844 the end of a normal bracket, leaving the subject pointer, but resetting
845 the start-of-match value in case it was changed by \K. */
846
847 case OP_ONCE_NC:
848 prev = ecode;
849 saved_eptr = eptr;
850 save_mark = md->mark;
851 do
852 {
853 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
854 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
855 {
856 mstart = md->start_match_ptr;
857 break;
858 }
859 if (rrc == MATCH_THEN)
860 {
861 next = ecode + GET(ecode,1);
862 if (md->start_match_ptr < next &&
863 (*ecode == OP_ALT || *next == OP_ALT))
864 rrc = MATCH_NOMATCH;
865 }
866
867 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
868 ecode += GET(ecode,1);
869 md->mark = save_mark;
870 }
871 while (*ecode == OP_ALT);
872
873 /* If hit the end of the group (which could be repeated), fail */
874
875 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
876
877 /* Continue as from after the group, updating the offsets high water
878 mark, since extracts may have been taken. */
879
880 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
881
882 offset_top = md->end_offset_top;
883 eptr = md->end_match_ptr;
884
885 /* For a non-repeating ket, just continue at this level. This also
886 happens for a repeating ket if no characters were matched in the group.
887 This is the forcible breaking of infinite loops as implemented in Perl
888 5.005. */
889
890 if (*ecode == OP_KET || eptr == saved_eptr)
891 {
892 ecode += 1+LINK_SIZE;
893 break;
894 }
895
896 /* The repeating kets try the rest of the pattern or restart from the
897 preceding bracket, in the appropriate order. The second "call" of match()
898 uses tail recursion, to avoid using another stack frame. */
899
900 if (*ecode == OP_KETRMIN)
901 {
902 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
904 ecode = prev;
905 goto TAIL_RECURSE;
906 }
907 else /* OP_KETRMAX */
908 {
909 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
911 ecode += 1 + LINK_SIZE;
912 goto TAIL_RECURSE;
913 }
914 /* Control never gets here */
915
916 /* Handle a capturing bracket, other than those that are possessive with an
917 unlimited repeat. If there is space in the offset vector, save the current
918 subject position in the working slot at the top of the vector. We mustn't
919 change the current values of the data slot, because they may be set from a
920 previous iteration of this group, and be referred to by a reference inside
921 the group. A failure to match might occur after the group has succeeded,
922 if something later on doesn't match. For this reason, we need to restore
923 the working value and also the values of the final offsets, in case they
924 were set by a previous iteration of the same bracket.
925
926 If there isn't enough space in the offset vector, treat this as if it were
927 a non-capturing bracket. Don't worry about setting the flag for the error
928 case here; that is handled in the code for KET. */
929
930 case OP_CBRA:
931 case OP_SCBRA:
932 number = GET2(ecode, 1+LINK_SIZE);
933 offset = number << 1;
934
935 #ifdef PCRE_DEBUG
936 printf("start bracket %d\n", number);
937 printf("subject=");
938 pchars(eptr, 16, TRUE, md);
939 printf("\n");
940 #endif
941
942 if (offset < md->offset_max)
943 {
944 save_offset1 = md->offset_vector[offset];
945 save_offset2 = md->offset_vector[offset+1];
946 save_offset3 = md->offset_vector[md->offset_end - number];
947 save_capture_last = md->capture_last;
948 save_mark = md->mark;
949
950 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
951 md->offset_vector[md->offset_end - number] =
952 (int)(eptr - md->start_subject);
953
954 for (;;)
955 {
956 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
957 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
958 eptrb, RM1);
959 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
960
961 /* If we backed up to a THEN, check whether it is within the current
962 branch by comparing the address of the THEN that is passed back with
963 the end of the branch. If it is within the current branch, and the
964 branch is one of two or more alternatives (it either starts or ends
965 with OP_ALT), we have reached the limit of THEN's action, so convert
966 the return code to NOMATCH, which will cause normal backtracking to
967 happen from now on. Otherwise, THEN is passed back to an outer
968 alternative. This implements Perl's treatment of parenthesized groups,
969 where a group not containing | does not affect the current alternative,
970 that is, (X) is NOT the same as (X|(*F)). */
971
972 if (rrc == MATCH_THEN)
973 {
974 next = ecode + GET(ecode,1);
975 if (md->start_match_ptr < next &&
976 (*ecode == OP_ALT || *next == OP_ALT))
977 rrc = MATCH_NOMATCH;
978 }
979
980 /* Anything other than NOMATCH is passed back. */
981
982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
983 md->capture_last = save_capture_last;
984 ecode += GET(ecode, 1);
985 md->mark = save_mark;
986 if (*ecode != OP_ALT) break;
987 }
988
989 DPRINTF(("bracket %d failed\n", number));
990 md->offset_vector[offset] = save_offset1;
991 md->offset_vector[offset+1] = save_offset2;
992 md->offset_vector[md->offset_end - number] = save_offset3;
993
994 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
995
996 RRETURN(rrc);
997 }
998
999 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1000 as a non-capturing bracket. */
1001
1002 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1003 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1004
1005 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1006
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1009
1010 /* Non-capturing or atomic group, except for possessive with unlimited
1011 repeat and ONCE group with no captures. Loop for all the alternatives.
1012
1013 When we get to the final alternative within the brackets, we used to return
1014 the result of a recursive call to match() whatever happened so it was
1015 possible to reduce stack usage by turning this into a tail recursion,
1016 except in the case of a possibly empty group. However, now that there is
1017 the possibility of (*THEN) occurring in the final alternative, this
1018 optimization is no longer always possible.
1019
1020 We can optimize if we know there are no (*THEN)s in the pattern; at present
1021 this is the best that can be done.
1022
1023 MATCH_ONCE is returned when the end of an atomic group is successfully
1024 reached, but subsequent matching fails. It passes back up the tree (causing
1025 captured values to be reset) until the original atomic group level is
1026 reached. This is tested by comparing md->once_target with the start of the
1027 group. At this point, the return is converted into MATCH_NOMATCH so that
1028 previous backup points can be taken. */
1029
1030 case OP_ONCE:
1031 case OP_BRA:
1032 case OP_SBRA:
1033 DPRINTF(("start non-capturing bracket\n"));
1034
1035 for (;;)
1036 {
1037 if (op >= OP_SBRA || op == OP_ONCE)
1038 md->match_function_type = MATCH_CBEGROUP;
1039
1040 /* If this is not a possibly empty group, and there are no (*THEN)s in
1041 the pattern, and this is the final alternative, optimize as described
1042 above. */
1043
1044 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1045 {
1046 ecode += PRIV(OP_lengths)[*ecode];
1047 goto TAIL_RECURSE;
1048 }
1049
1050 /* In all other cases, we have to make another call to match(). */
1051
1052 save_mark = md->mark;
1053 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1054 RM2);
1055
1056 /* See comment in the code for capturing groups above about handling
1057 THEN. */
1058
1059 if (rrc == MATCH_THEN)
1060 {
1061 next = ecode + GET(ecode,1);
1062 if (md->start_match_ptr < next &&
1063 (*ecode == OP_ALT || *next == OP_ALT))
1064 rrc = MATCH_NOMATCH;
1065 }
1066
1067 if (rrc != MATCH_NOMATCH)
1068 {
1069 if (rrc == MATCH_ONCE)
1070 {
1071 const pcre_uchar *scode = ecode;
1072 if (*scode != OP_ONCE) /* If not at start, find it */
1073 {
1074 while (*scode == OP_ALT) scode += GET(scode, 1);
1075 scode -= GET(scode, 1);
1076 }
1077 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1078 }
1079 RRETURN(rrc);
1080 }
1081 ecode += GET(ecode, 1);
1082 md->mark = save_mark;
1083 if (*ecode != OP_ALT) break;
1084 }
1085
1086 RRETURN(MATCH_NOMATCH);
1087
1088 /* Handle possessive capturing brackets with an unlimited repeat. We come
1089 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1090 handled similarly to the normal case above. However, the matching is
1091 different. The end of these brackets will always be OP_KETRPOS, which
1092 returns MATCH_KETRPOS without going further in the pattern. By this means
1093 we can handle the group by iteration rather than recursion, thereby
1094 reducing the amount of stack needed. */
1095
1096 case OP_CBRAPOS:
1097 case OP_SCBRAPOS:
1098 allow_zero = FALSE;
1099
1100 POSSESSIVE_CAPTURE:
1101 number = GET2(ecode, 1+LINK_SIZE);
1102 offset = number << 1;
1103
1104 #ifdef PCRE_DEBUG
1105 printf("start possessive bracket %d\n", number);
1106 printf("subject=");
1107 pchars(eptr, 16, TRUE, md);
1108 printf("\n");
1109 #endif
1110
1111 if (offset < md->offset_max)
1112 {
1113 matched_once = FALSE;
1114 code_offset = (int)(ecode - md->start_code);
1115
1116 save_offset1 = md->offset_vector[offset];
1117 save_offset2 = md->offset_vector[offset+1];
1118 save_offset3 = md->offset_vector[md->offset_end - number];
1119 save_capture_last = md->capture_last;
1120
1121 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1122
1123 /* Each time round the loop, save the current subject position for use
1124 when the group matches. For MATCH_MATCH, the group has matched, so we
1125 restart it with a new subject starting position, remembering that we had
1126 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1127 usual. If we haven't matched any alternatives in any iteration, check to
1128 see if a previous iteration matched. If so, the group has matched;
1129 continue from afterwards. Otherwise it has failed; restore the previous
1130 capture values before returning NOMATCH. */
1131
1132 for (;;)
1133 {
1134 md->offset_vector[md->offset_end - number] =
1135 (int)(eptr - md->start_subject);
1136 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1137 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1138 eptrb, RM63);
1139 if (rrc == MATCH_KETRPOS)
1140 {
1141 offset_top = md->end_offset_top;
1142 eptr = md->end_match_ptr;
1143 ecode = md->start_code + code_offset;
1144 save_capture_last = md->capture_last;
1145 matched_once = TRUE;
1146 continue;
1147 }
1148
1149 /* See comment in the code for capturing groups above about handling
1150 THEN. */
1151
1152 if (rrc == MATCH_THEN)
1153 {
1154 next = ecode + GET(ecode,1);
1155 if (md->start_match_ptr < next &&
1156 (*ecode == OP_ALT || *next == OP_ALT))
1157 rrc = MATCH_NOMATCH;
1158 }
1159
1160 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1161 md->capture_last = save_capture_last;
1162 ecode += GET(ecode, 1);
1163 if (*ecode != OP_ALT) break;
1164 }
1165
1166 if (!matched_once)
1167 {
1168 md->offset_vector[offset] = save_offset1;
1169 md->offset_vector[offset+1] = save_offset2;
1170 md->offset_vector[md->offset_end - number] = save_offset3;
1171 }
1172
1173 if (allow_zero || matched_once)
1174 {
1175 ecode += 1 + LINK_SIZE;
1176 break;
1177 }
1178
1179 RRETURN(MATCH_NOMATCH);
1180 }
1181
1182 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1183 as a non-capturing bracket. */
1184
1185 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1186 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1187
1188 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1189
1190 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1191 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1192
1193 /* Non-capturing possessive bracket with unlimited repeat. We come here
1194 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1195 without the capturing complication. It is written out separately for speed
1196 and cleanliness. */
1197
1198 case OP_BRAPOS:
1199 case OP_SBRAPOS:
1200 allow_zero = FALSE;
1201
1202 POSSESSIVE_NON_CAPTURE:
1203 matched_once = FALSE;
1204 code_offset = (int)(ecode - md->start_code);
1205
1206 for (;;)
1207 {
1208 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1209 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1210 eptrb, RM48);
1211 if (rrc == MATCH_KETRPOS)
1212 {
1213 offset_top = md->end_offset_top;
1214 eptr = md->end_match_ptr;
1215 ecode = md->start_code + code_offset;
1216 matched_once = TRUE;
1217 continue;
1218 }
1219
1220 /* See comment in the code for capturing groups above about handling
1221 THEN. */
1222
1223 if (rrc == MATCH_THEN)
1224 {
1225 next = ecode + GET(ecode,1);
1226 if (md->start_match_ptr < next &&
1227 (*ecode == OP_ALT || *next == OP_ALT))
1228 rrc = MATCH_NOMATCH;
1229 }
1230
1231 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1232 ecode += GET(ecode, 1);
1233 if (*ecode != OP_ALT) break;
1234 }
1235
1236 if (matched_once || allow_zero)
1237 {
1238 ecode += 1 + LINK_SIZE;
1239 break;
1240 }
1241 RRETURN(MATCH_NOMATCH);
1242
1243 /* Control never reaches here. */
1244
1245 /* Conditional group: compilation checked that there are no more than
1246 two branches. If the condition is false, skipping the first branch takes us
1247 past the end if there is only one branch, but that's OK because that is
1248 exactly what going to the ket would do. */
1249
1250 case OP_COND:
1251 case OP_SCOND:
1252 codelink = GET(ecode, 1);
1253
1254 /* Because of the way auto-callout works during compile, a callout item is
1255 inserted between OP_COND and an assertion condition. */
1256
1257 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1258 {
1259 if (PUBL(callout) != NULL)
1260 {
1261 PUBL(callout_block) cb;
1262 cb.version = 2; /* Version 2 of the callout block */
1263 cb.callout_number = ecode[LINK_SIZE+2];
1264 cb.offset_vector = md->offset_vector;
1265 #ifdef COMPILE_PCRE8
1266 cb.subject = (PCRE_SPTR)md->start_subject;
1267 #else
1268 cb.subject = (PCRE_SPTR16)md->start_subject;
1269 #endif
1270 cb.subject_length = (int)(md->end_subject - md->start_subject);
1271 cb.start_match = (int)(mstart - md->start_subject);
1272 cb.current_position = (int)(eptr - md->start_subject);
1273 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1274 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1275 cb.capture_top = offset_top/2;
1276 cb.capture_last = md->capture_last;
1277 cb.callout_data = md->callout_data;
1278 cb.mark = md->nomatch_mark;
1279 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1280 if (rrc < 0) RRETURN(rrc);
1281 }
1282 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1283 }
1284
1285 condcode = ecode[LINK_SIZE+1];
1286
1287 /* Now see what the actual condition is */
1288
1289 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1290 {
1291 if (md->recursive == NULL) /* Not recursing => FALSE */
1292 {
1293 condition = FALSE;
1294 ecode += GET(ecode, 1);
1295 }
1296 else
1297 {
1298 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1299 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1300
1301 /* If the test is for recursion into a specific subpattern, and it is
1302 false, but the test was set up by name, scan the table to see if the
1303 name refers to any other numbers, and test them. The condition is true
1304 if any one is set. */
1305
1306 if (!condition && condcode == OP_NRREF)
1307 {
1308 pcre_uchar *slotA = md->name_table;
1309 for (i = 0; i < md->name_count; i++)
1310 {
1311 if (GET2(slotA, 0) == recno) break;
1312 slotA += md->name_entry_size;
1313 }
1314
1315 /* Found a name for the number - there can be only one; duplicate
1316 names for different numbers are allowed, but not vice versa. First
1317 scan down for duplicates. */
1318
1319 if (i < md->name_count)
1320 {
1321 pcre_uchar *slotB = slotA;
1322 while (slotB > md->name_table)
1323 {
1324 slotB -= md->name_entry_size;
1325 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1326 {
1327 condition = GET2(slotB, 0) == md->recursive->group_num;
1328 if (condition) break;
1329 }
1330 else break;
1331 }
1332
1333 /* Scan up for duplicates */
1334
1335 if (!condition)
1336 {
1337 slotB = slotA;
1338 for (i++; i < md->name_count; i++)
1339 {
1340 slotB += md->name_entry_size;
1341 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1342 {
1343 condition = GET2(slotB, 0) == md->recursive->group_num;
1344 if (condition) break;
1345 }
1346 else break;
1347 }
1348 }
1349 }
1350 }
1351
1352 /* Choose branch according to the condition */
1353
1354 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1355 }
1356 }
1357
1358 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1359 {
1360 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1361 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1362
1363 /* If the numbered capture is unset, but the reference was by name,
1364 scan the table to see if the name refers to any other numbers, and test
1365 them. The condition is true if any one is set. This is tediously similar
1366 to the code above, but not close enough to try to amalgamate. */
1367
1368 if (!condition && condcode == OP_NCREF)
1369 {
1370 int refno = offset >> 1;
1371 pcre_uchar *slotA = md->name_table;
1372
1373 for (i = 0; i < md->name_count; i++)
1374 {
1375 if (GET2(slotA, 0) == refno) break;
1376 slotA += md->name_entry_size;
1377 }
1378
1379 /* Found a name for the number - there can be only one; duplicate names
1380 for different numbers are allowed, but not vice versa. First scan down
1381 for duplicates. */
1382
1383 if (i < md->name_count)
1384 {
1385 pcre_uchar *slotB = slotA;
1386 while (slotB > md->name_table)
1387 {
1388 slotB -= md->name_entry_size;
1389 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1390 {
1391 offset = GET2(slotB, 0) << 1;
1392 condition = offset < offset_top &&
1393 md->offset_vector[offset] >= 0;
1394 if (condition) break;
1395 }
1396 else break;
1397 }
1398
1399 /* Scan up for duplicates */
1400
1401 if (!condition)
1402 {
1403 slotB = slotA;
1404 for (i++; i < md->name_count; i++)
1405 {
1406 slotB += md->name_entry_size;
1407 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1408 {
1409 offset = GET2(slotB, 0) << 1;
1410 condition = offset < offset_top &&
1411 md->offset_vector[offset] >= 0;
1412 if (condition) break;
1413 }
1414 else break;
1415 }
1416 }
1417 }
1418 }
1419
1420 /* Choose branch according to the condition */
1421
1422 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1423 }
1424
1425 else if (condcode == OP_DEF) /* DEFINE - always false */
1426 {
1427 condition = FALSE;
1428 ecode += GET(ecode, 1);
1429 }
1430
1431 /* The condition is an assertion. Call match() to evaluate it - setting
1432 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1433 an assertion. */
1434
1435 else
1436 {
1437 md->match_function_type = MATCH_CONDASSERT;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1439 if (rrc == MATCH_MATCH)
1440 {
1441 if (md->end_offset_top > offset_top)
1442 offset_top = md->end_offset_top; /* Captures may have happened */
1443 condition = TRUE;
1444 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1445 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1446 }
1447
1448 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1449 assertion; it is therefore treated as NOMATCH. */
1450
1451 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1452 {
1453 RRETURN(rrc); /* Need braces because of following else */
1454 }
1455 else
1456 {
1457 condition = FALSE;
1458 ecode += codelink;
1459 }
1460 }
1461
1462 /* We are now at the branch that is to be obeyed. As there is only one, can
1463 use tail recursion to avoid using another stack frame, except when there is
1464 unlimited repeat of a possibly empty group. In the latter case, a recursive
1465 call to match() is always required, unless the second alternative doesn't
1466 exist, in which case we can just plough on. Note that, for compatibility
1467 with Perl, the | in a conditional group is NOT treated as creating two
1468 alternatives. If a THEN is encountered in the branch, it propagates out to
1469 the enclosing alternative (unless nested in a deeper set of alternatives,
1470 of course). */
1471
1472 if (condition || *ecode == OP_ALT)
1473 {
1474 if (op != OP_SCOND)
1475 {
1476 ecode += 1 + LINK_SIZE;
1477 goto TAIL_RECURSE;
1478 }
1479
1480 md->match_function_type = MATCH_CBEGROUP;
1481 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1482 RRETURN(rrc);
1483 }
1484
1485 /* Condition false & no alternative; continue after the group. */
1486
1487 else
1488 {
1489 ecode += 1 + LINK_SIZE;
1490 }
1491 break;
1492
1493
1494 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1495 to close any currently open capturing brackets. */
1496
1497 case OP_CLOSE:
1498 number = GET2(ecode, 1);
1499 offset = number << 1;
1500
1501 #ifdef PCRE_DEBUG
1502 printf("end bracket %d at *ACCEPT", number);
1503 printf("\n");
1504 #endif
1505
1506 md->capture_last = number;
1507 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1508 {
1509 md->offset_vector[offset] =
1510 md->offset_vector[md->offset_end - number];
1511 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1512 if (offset_top <= offset) offset_top = offset + 2;
1513 }
1514 ecode += 1 + IMM2_SIZE;
1515 break;
1516
1517
1518 /* End of the pattern, either real or forced. */
1519
1520 case OP_END:
1521 case OP_ACCEPT:
1522 case OP_ASSERT_ACCEPT:
1523
1524 /* If we have matched an empty string, fail if not in an assertion and not
1525 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1526 is set and we have matched at the start of the subject. In both cases,
1527 backtracking will then try other alternatives, if any. */
1528
1529 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1530 md->recursive == NULL &&
1531 (md->notempty ||
1532 (md->notempty_atstart &&
1533 mstart == md->start_subject + md->start_offset)))
1534 RRETURN(MATCH_NOMATCH);
1535
1536 /* Otherwise, we have a match. */
1537
1538 md->end_match_ptr = eptr; /* Record where we ended */
1539 md->end_offset_top = offset_top; /* and how many extracts were taken */
1540 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1541
1542 /* For some reason, the macros don't work properly if an expression is
1543 given as the argument to RRETURN when the heap is in use. */
1544
1545 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1546 RRETURN(rrc);
1547
1548 /* Assertion brackets. Check the alternative branches in turn - the
1549 matching won't pass the KET for an assertion. If any one branch matches,
1550 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1551 start of each branch to move the current point backwards, so the code at
1552 this level is identical to the lookahead case. When the assertion is part
1553 of a condition, we want to return immediately afterwards. The caller of
1554 this incarnation of the match() function will have set MATCH_CONDASSERT in
1555 md->match_function_type, and one of these opcodes will be the first opcode
1556 that is processed. We use a local variable that is preserved over calls to
1557 match() to remember this case. */
1558
1559 case OP_ASSERT:
1560 case OP_ASSERTBACK:
1561 save_mark = md->mark;
1562 if (md->match_function_type == MATCH_CONDASSERT)
1563 {
1564 condassert = TRUE;
1565 md->match_function_type = 0;
1566 }
1567 else condassert = FALSE;
1568
1569 do
1570 {
1571 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1572 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1573 {
1574 mstart = md->start_match_ptr; /* In case \K reset it */
1575 break;
1576 }
1577 md->mark = save_mark;
1578
1579 /* A COMMIT failure must fail the entire assertion, without trying any
1580 subsequent branches. */
1581
1582 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1583
1584 /* PCRE does not allow THEN to escape beyond an assertion; it
1585 is treated as NOMATCH. */
1586
1587 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1588 ecode += GET(ecode, 1);
1589 }
1590 while (*ecode == OP_ALT);
1591
1592 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1593
1594 /* If checking an assertion for a condition, return MATCH_MATCH. */
1595
1596 if (condassert) RRETURN(MATCH_MATCH);
1597
1598 /* Continue from after the assertion, updating the offsets high water
1599 mark, since extracts may have been taken during the assertion. */
1600
1601 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1602 ecode += 1 + LINK_SIZE;
1603 offset_top = md->end_offset_top;
1604 continue;
1605
1606 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1607 PRUNE, or COMMIT means we must assume failure without checking subsequent
1608 branches. */
1609
1610 case OP_ASSERT_NOT:
1611 case OP_ASSERTBACK_NOT:
1612 save_mark = md->mark;
1613 if (md->match_function_type == MATCH_CONDASSERT)
1614 {
1615 condassert = TRUE;
1616 md->match_function_type = 0;
1617 }
1618 else condassert = FALSE;
1619
1620 do
1621 {
1622 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1623 md->mark = save_mark;
1624 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1625 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1626 {
1627 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1628 break;
1629 }
1630
1631 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1632 as NOMATCH. */
1633
1634 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1635 ecode += GET(ecode,1);
1636 }
1637 while (*ecode == OP_ALT);
1638
1639 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1640
1641 ecode += 1 + LINK_SIZE;
1642 continue;
1643
1644 /* Move the subject pointer back. This occurs only at the start of
1645 each branch of a lookbehind assertion. If we are too close to the start to
1646 move back, this match function fails. When working with UTF-8 we move
1647 back a number of characters, not bytes. */
1648
1649 case OP_REVERSE:
1650 #ifdef SUPPORT_UTF
1651 if (utf)
1652 {
1653 i = GET(ecode, 1);
1654 while (i-- > 0)
1655 {
1656 eptr--;
1657 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1658 BACKCHAR(eptr);
1659 }
1660 }
1661 else
1662 #endif
1663
1664 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1665
1666 {
1667 eptr -= GET(ecode, 1);
1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669 }
1670
1671 /* Save the earliest consulted character, then skip to next op code */
1672
1673 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1674 ecode += 1 + LINK_SIZE;
1675 break;
1676
1677 /* The callout item calls an external function, if one is provided, passing
1678 details of the match so far. This is mainly for debugging, though the
1679 function is able to force a failure. */
1680
1681 case OP_CALLOUT:
1682 if (PUBL(callout) != NULL)
1683 {
1684 PUBL(callout_block) cb;
1685 cb.version = 2; /* Version 2 of the callout block */
1686 cb.callout_number = ecode[1];
1687 cb.offset_vector = md->offset_vector;
1688 #ifdef COMPILE_PCRE8
1689 cb.subject = (PCRE_SPTR)md->start_subject;
1690 #else
1691 cb.subject = (PCRE_SPTR16)md->start_subject;
1692 #endif
1693 cb.subject_length = (int)(md->end_subject - md->start_subject);
1694 cb.start_match = (int)(mstart - md->start_subject);
1695 cb.current_position = (int)(eptr - md->start_subject);
1696 cb.pattern_position = GET(ecode, 2);
1697 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1698 cb.capture_top = offset_top/2;
1699 cb.capture_last = md->capture_last;
1700 cb.callout_data = md->callout_data;
1701 cb.mark = md->nomatch_mark;
1702 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1703 if (rrc < 0) RRETURN(rrc);
1704 }
1705 ecode += 2 + 2*LINK_SIZE;
1706 break;
1707
1708 /* Recursion either matches the current regex, or some subexpression. The
1709 offset data is the offset to the starting bracket from the start of the
1710 whole pattern. (This is so that it works from duplicated subpatterns.)
1711
1712 The state of the capturing groups is preserved over recursion, and
1713 re-instated afterwards. We don't know how many are started and not yet
1714 finished (offset_top records the completed total) so we just have to save
1715 all the potential data. There may be up to 65535 such values, which is too
1716 large to put on the stack, but using malloc for small numbers seems
1717 expensive. As a compromise, the stack is used when there are no more than
1718 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1719
1720 There are also other values that have to be saved. We use a chained
1721 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1722 for the original version of this logic. It has, however, been hacked around
1723 a lot, so he is not to blame for the current way it works. */
1724
1725 case OP_RECURSE:
1726 {
1727 recursion_info *ri;
1728 int recno;
1729
1730 callpat = md->start_code + GET(ecode, 1);
1731 recno = (callpat == md->start_code)? 0 :
1732 GET2(callpat, 1 + LINK_SIZE);
1733
1734 /* Check for repeating a recursion without advancing the subject pointer.
1735 This should catch convoluted mutual recursions. (Some simple cases are
1736 caught at compile time.) */
1737
1738 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1739 if (recno == ri->group_num && eptr == ri->subject_position)
1740 RRETURN(PCRE_ERROR_RECURSELOOP);
1741
1742 /* Add to "recursing stack" */
1743
1744 new_recursive.group_num = recno;
1745 new_recursive.subject_position = eptr;
1746 new_recursive.prevrec = md->recursive;
1747 md->recursive = &new_recursive;
1748
1749 /* Where to continue from afterwards */
1750
1751 ecode += 1 + LINK_SIZE;
1752
1753 /* Now save the offset data */
1754
1755 new_recursive.saved_max = md->offset_end;
1756 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1757 new_recursive.offset_save = stacksave;
1758 else
1759 {
1760 new_recursive.offset_save =
1761 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1762 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1763 }
1764 memcpy(new_recursive.offset_save, md->offset_vector,
1765 new_recursive.saved_max * sizeof(int));
1766
1767 /* OK, now we can do the recursion. After processing each alternative,
1768 restore the offset data. If there were nested recursions, md->recursive
1769 might be changed, so reset it before looping. */
1770
1771 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1772 cbegroup = (*callpat >= OP_SBRA);
1773 do
1774 {
1775 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1776 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1777 md, eptrb, RM6);
1778 memcpy(md->offset_vector, new_recursive.offset_save,
1779 new_recursive.saved_max * sizeof(int));
1780 md->recursive = new_recursive.prevrec;
1781 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1782 {
1783 DPRINTF(("Recursion matched\n"));
1784 if (new_recursive.offset_save != stacksave)
1785 (PUBL(free))(new_recursive.offset_save);
1786
1787 /* Set where we got to in the subject, and reset the start in case
1788 it was changed by \K. This *is* propagated back out of a recursion,
1789 for Perl compatibility. */
1790
1791 eptr = md->end_match_ptr;
1792 mstart = md->start_match_ptr;
1793 goto RECURSION_MATCHED; /* Exit loop; end processing */
1794 }
1795
1796 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1797 is treated as NOMATCH. */
1798
1799 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1800 rrc != MATCH_COMMIT)
1801 {
1802 DPRINTF(("Recursion gave error %d\n", rrc));
1803 if (new_recursive.offset_save != stacksave)
1804 (PUBL(free))(new_recursive.offset_save);
1805 RRETURN(rrc);
1806 }
1807
1808 md->recursive = &new_recursive;
1809 callpat += GET(callpat, 1);
1810 }
1811 while (*callpat == OP_ALT);
1812
1813 DPRINTF(("Recursion didn't match\n"));
1814 md->recursive = new_recursive.prevrec;
1815 if (new_recursive.offset_save != stacksave)
1816 (PUBL(free))(new_recursive.offset_save);
1817 RRETURN(MATCH_NOMATCH);
1818 }
1819
1820 RECURSION_MATCHED:
1821 break;
1822
1823 /* An alternation is the end of a branch; scan along to find the end of the
1824 bracketed group and go to there. */
1825
1826 case OP_ALT:
1827 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1828 break;
1829
1830 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1831 indicating that it may occur zero times. It may repeat infinitely, or not
1832 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1833 with fixed upper repeat limits are compiled as a number of copies, with the
1834 optional ones preceded by BRAZERO or BRAMINZERO. */
1835
1836 case OP_BRAZERO:
1837 next = ecode + 1;
1838 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1840 do next += GET(next, 1); while (*next == OP_ALT);
1841 ecode = next + 1 + LINK_SIZE;
1842 break;
1843
1844 case OP_BRAMINZERO:
1845 next = ecode + 1;
1846 do next += GET(next, 1); while (*next == OP_ALT);
1847 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1848 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1849 ecode++;
1850 break;
1851
1852 case OP_SKIPZERO:
1853 next = ecode+1;
1854 do next += GET(next,1); while (*next == OP_ALT);
1855 ecode = next + 1 + LINK_SIZE;
1856 break;
1857
1858 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1859 here; just jump to the group, with allow_zero set TRUE. */
1860
1861 case OP_BRAPOSZERO:
1862 op = *(++ecode);
1863 allow_zero = TRUE;
1864 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1865 goto POSSESSIVE_NON_CAPTURE;
1866
1867 /* End of a group, repeated or non-repeating. */
1868
1869 case OP_KET:
1870 case OP_KETRMIN:
1871 case OP_KETRMAX:
1872 case OP_KETRPOS:
1873 prev = ecode - GET(ecode, 1);
1874
1875 /* If this was a group that remembered the subject start, in order to break
1876 infinite repeats of empty string matches, retrieve the subject start from
1877 the chain. Otherwise, set it NULL. */
1878
1879 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1880 {
1881 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1882 eptrb = eptrb->epb_prev; /* Backup to previous group */
1883 }
1884 else saved_eptr = NULL;
1885
1886 /* If we are at the end of an assertion group or a non-capturing atomic
1887 group, stop matching and return MATCH_MATCH, but record the current high
1888 water mark for use by positive assertions. We also need to record the match
1889 start in case it was changed by \K. */
1890
1891 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1892 *prev == OP_ONCE_NC)
1893 {
1894 md->end_match_ptr = eptr; /* For ONCE_NC */
1895 md->end_offset_top = offset_top;
1896 md->start_match_ptr = mstart;
1897 RRETURN(MATCH_MATCH); /* Sets md->mark */
1898 }
1899
1900 /* For capturing groups we have to check the group number back at the start
1901 and if necessary complete handling an extraction by setting the offsets and
1902 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1903 into group 0, so it won't be picked up here. Instead, we catch it when the
1904 OP_END is reached. Other recursion is handled here. We just have to record
1905 the current subject position and start match pointer and give a MATCH
1906 return. */
1907
1908 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1909 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1910 {
1911 number = GET2(prev, 1+LINK_SIZE);
1912 offset = number << 1;
1913
1914 #ifdef PCRE_DEBUG
1915 printf("end bracket %d", number);
1916 printf("\n");
1917 #endif
1918
1919 /* Handle a recursively called group. */
1920
1921 if (md->recursive != NULL && md->recursive->group_num == number)
1922 {
1923 md->end_match_ptr = eptr;
1924 md->start_match_ptr = mstart;
1925 RRETURN(MATCH_MATCH);
1926 }
1927
1928 /* Deal with capturing */
1929
1930 md->capture_last = number;
1931 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1932 {
1933 /* If offset is greater than offset_top, it means that we are
1934 "skipping" a capturing group, and that group's offsets must be marked
1935 unset. In earlier versions of PCRE, all the offsets were unset at the
1936 start of matching, but this doesn't work because atomic groups and
1937 assertions can cause a value to be set that should later be unset.
1938 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1939 part of the atomic group, but this is not on the final matching path,
1940 so must be unset when 2 is set. (If there is no group 2, there is no
1941 problem, because offset_top will then be 2, indicating no capture.) */
1942
1943 if (offset > offset_top)
1944 {
1945 register int *iptr = md->offset_vector + offset_top;
1946 register int *iend = md->offset_vector + offset;
1947 while (iptr < iend) *iptr++ = -1;
1948 }
1949
1950 /* Now make the extraction */
1951
1952 md->offset_vector[offset] =
1953 md->offset_vector[md->offset_end - number];
1954 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1955 if (offset_top <= offset) offset_top = offset + 2;
1956 }
1957 }
1958
1959 /* For an ordinary non-repeating ket, just continue at this level. This
1960 also happens for a repeating ket if no characters were matched in the
1961 group. This is the forcible breaking of infinite loops as implemented in
1962 Perl 5.005. For a non-repeating atomic group that includes captures,
1963 establish a backup point by processing the rest of the pattern at a lower
1964 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1965 original OP_ONCE level, thereby bypassing intermediate backup points, but
1966 resetting any captures that happened along the way. */
1967
1968 if (*ecode == OP_KET || eptr == saved_eptr)
1969 {
1970 if (*prev == OP_ONCE)
1971 {
1972 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1974 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1975 RRETURN(MATCH_ONCE);
1976 }
1977 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1978 break;
1979 }
1980
1981 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1982 and return MATCH_KETRPOS. This makes it possible to do the repeats one
1983 at a time from the outer level, thus saving stack. */
1984
1985 if (*ecode == OP_KETRPOS)
1986 {
1987 md->end_match_ptr = eptr;
1988 md->end_offset_top = offset_top;
1989 RRETURN(MATCH_KETRPOS);
1990 }
1991
1992 /* The normal repeating kets try the rest of the pattern or restart from
1993 the preceding bracket, in the appropriate order. In the second case, we can
1994 use tail recursion to avoid using another stack frame, unless we have an
1995 atomic group or an unlimited repeat of a group that can match an empty
1996 string. */
1997
1998 if (*ecode == OP_KETRMIN)
1999 {
2000 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2002 if (*prev == OP_ONCE)
2003 {
2004 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2005 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2006 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2007 RRETURN(MATCH_ONCE);
2008 }
2009 if (*prev >= OP_SBRA) /* Could match an empty string */
2010 {
2011 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2012 RRETURN(rrc);
2013 }
2014 ecode = prev;
2015 goto TAIL_RECURSE;
2016 }
2017 else /* OP_KETRMAX */
2018 {
2019 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2020 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022 if (*prev == OP_ONCE)
2023 {
2024 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 md->once_target = prev;
2027 RRETURN(MATCH_ONCE);
2028 }
2029 ecode += 1 + LINK_SIZE;
2030 goto TAIL_RECURSE;
2031 }
2032 /* Control never gets here */
2033
2034 /* Not multiline mode: start of subject assertion, unless notbol. */
2035
2036 case OP_CIRC:
2037 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2038
2039 /* Start of subject assertion */
2040
2041 case OP_SOD:
2042 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2043 ecode++;
2044 break;
2045
2046 /* Multiline mode: start of subject unless notbol, or after any newline. */
2047
2048 case OP_CIRCM:
2049 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2050 if (eptr != md->start_subject &&
2051 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2052 RRETURN(MATCH_NOMATCH);
2053 ecode++;
2054 break;
2055
2056 /* Start of match assertion */
2057
2058 case OP_SOM:
2059 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2060 ecode++;
2061 break;
2062
2063 /* Reset the start of match point */
2064
2065 case OP_SET_SOM:
2066 mstart = eptr;
2067 ecode++;
2068 break;
2069
2070 /* Multiline mode: assert before any newline, or before end of subject
2071 unless noteol is set. */
2072
2073 case OP_DOLLM:
2074 if (eptr < md->end_subject)
2075 {
2076 if (!IS_NEWLINE(eptr))
2077 {
2078 if (md->partial != 0 &&
2079 eptr + 1 >= md->end_subject &&
2080 NLBLOCK->nltype == NLTYPE_FIXED &&
2081 NLBLOCK->nllen == 2 &&
2082 *eptr == NLBLOCK->nl[0])
2083 {
2084 md->hitend = TRUE;
2085 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2086 }
2087 RRETURN(MATCH_NOMATCH);
2088 }
2089 }
2090 else
2091 {
2092 if (md->noteol) RRETURN(MATCH_NOMATCH);
2093 SCHECK_PARTIAL();
2094 }
2095 ecode++;
2096 break;
2097
2098 /* Not multiline mode: assert before a terminating newline or before end of
2099 subject unless noteol is set. */
2100
2101 case OP_DOLL:
2102 if (md->noteol) RRETURN(MATCH_NOMATCH);
2103 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2104
2105 /* ... else fall through for endonly */
2106
2107 /* End of subject assertion (\z) */
2108
2109 case OP_EOD:
2110 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2111 SCHECK_PARTIAL();
2112 ecode++;
2113 break;
2114
2115 /* End of subject or ending \n assertion (\Z) */
2116
2117 case OP_EODN:
2118 ASSERT_NL_OR_EOS:
2119 if (eptr < md->end_subject &&
2120 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2121 {
2122 if (md->partial != 0 &&
2123 eptr + 1 >= md->end_subject &&
2124 NLBLOCK->nltype == NLTYPE_FIXED &&
2125 NLBLOCK->nllen == 2 &&
2126 *eptr == NLBLOCK->nl[0])
2127 {
2128 md->hitend = TRUE;
2129 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2130 }
2131 RRETURN(MATCH_NOMATCH);
2132 }
2133
2134 /* Either at end of string or \n before end. */
2135
2136 SCHECK_PARTIAL();
2137 ecode++;
2138 break;
2139
2140 /* Word boundary assertions */
2141
2142 case OP_NOT_WORD_BOUNDARY:
2143 case OP_WORD_BOUNDARY:
2144 {
2145
2146 /* Find out if the previous and current characters are "word" characters.
2147 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2148 be "non-word" characters. Remember the earliest consulted character for
2149 partial matching. */
2150
2151 #ifdef SUPPORT_UTF
2152 if (utf)
2153 {
2154 /* Get status of previous character */
2155
2156 if (eptr == md->start_subject) prev_is_word = FALSE; else
2157 {
2158 PCRE_PUCHAR lastptr = eptr - 1;
2159 BACKCHAR(lastptr);
2160 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2161 GETCHAR(c, lastptr);
2162 #ifdef SUPPORT_UCP
2163 if (md->use_ucp)
2164 {
2165 if (c == '_') prev_is_word = TRUE; else
2166 {
2167 int cat = UCD_CATEGORY(c);
2168 prev_is_word = (cat == ucp_L || cat == ucp_N);
2169 }
2170 }
2171 else
2172 #endif
2173 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2174 }
2175
2176 /* Get status of next character */
2177
2178 if (eptr >= md->end_subject)
2179 {
2180 SCHECK_PARTIAL();
2181 cur_is_word = FALSE;
2182 }
2183 else
2184 {
2185 GETCHAR(c, eptr);
2186 #ifdef SUPPORT_UCP
2187 if (md->use_ucp)
2188 {
2189 if (c == '_') cur_is_word = TRUE; else
2190 {
2191 int cat = UCD_CATEGORY(c);
2192 cur_is_word = (cat == ucp_L || cat == ucp_N);
2193 }
2194 }
2195 else
2196 #endif
2197 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2198 }
2199 }
2200 else
2201 #endif
2202
2203 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2204 consistency with the behaviour of \w we do use it in this case. */
2205
2206 {
2207 /* Get status of previous character */
2208
2209 if (eptr == md->start_subject) prev_is_word = FALSE; else
2210 {
2211 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2212 #ifdef SUPPORT_UCP
2213 if (md->use_ucp)
2214 {
2215 c = eptr[-1];
2216 if (c == '_') prev_is_word = TRUE; else
2217 {
2218 int cat = UCD_CATEGORY(c);
2219 prev_is_word = (cat == ucp_L || cat == ucp_N);
2220 }
2221 }
2222 else
2223 #endif
2224 prev_is_word = MAX_255(eptr[-1])
2225 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2226 }
2227
2228 /* Get status of next character */
2229
2230 if (eptr >= md->end_subject)
2231 {
2232 SCHECK_PARTIAL();
2233 cur_is_word = FALSE;
2234 }
2235 else
2236 #ifdef SUPPORT_UCP
2237 if (md->use_ucp)
2238 {
2239 c = *eptr;
2240 if (c == '_') cur_is_word = TRUE; else
2241 {
2242 int cat = UCD_CATEGORY(c);
2243 cur_is_word = (cat == ucp_L || cat == ucp_N);
2244 }
2245 }
2246 else
2247 #endif
2248 cur_is_word = MAX_255(*eptr)
2249 && ((md->ctypes[*eptr] & ctype_word) != 0);
2250 }
2251
2252 /* Now see if the situation is what we want */
2253
2254 if ((*ecode++ == OP_WORD_BOUNDARY)?
2255 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2256 RRETURN(MATCH_NOMATCH);
2257 }
2258 break;
2259
2260 /* Match any single character type except newline; have to take care with
2261 CRLF newlines and partial matching. */
2262
2263 case OP_ANY:
2264 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2265 if (md->partial != 0 &&
2266 eptr + 1 >= md->end_subject &&
2267 NLBLOCK->nltype == NLTYPE_FIXED &&
2268 NLBLOCK->nllen == 2 &&
2269 *eptr == NLBLOCK->nl[0])
2270 {
2271 md->hitend = TRUE;
2272 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2273 }
2274
2275 /* Fall through */
2276
2277 /* Match any single character whatsoever. */
2278
2279 case OP_ALLANY:
2280 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2281 { /* not be updated before SCHECK_PARTIAL. */
2282 SCHECK_PARTIAL();
2283 RRETURN(MATCH_NOMATCH);
2284 }
2285 eptr++;
2286 #ifdef SUPPORT_UTF
2287 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2288 #endif
2289 ecode++;
2290 break;
2291
2292 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2293 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2294
2295 case OP_ANYBYTE:
2296 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2297 { /* not be updated before SCHECK_PARTIAL. */
2298 SCHECK_PARTIAL();
2299 RRETURN(MATCH_NOMATCH);
2300 }
2301 eptr++;
2302 ecode++;
2303 break;
2304
2305 case OP_NOT_DIGIT:
2306 if (eptr >= md->end_subject)
2307 {
2308 SCHECK_PARTIAL();
2309 RRETURN(MATCH_NOMATCH);
2310 }
2311 GETCHARINCTEST(c, eptr);
2312 if (
2313 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2314 c < 256 &&
2315 #endif
2316 (md->ctypes[c] & ctype_digit) != 0
2317 )
2318 RRETURN(MATCH_NOMATCH);
2319 ecode++;
2320 break;
2321
2322 case OP_DIGIT:
2323 if (eptr >= md->end_subject)
2324 {
2325 SCHECK_PARTIAL();
2326 RRETURN(MATCH_NOMATCH);
2327 }
2328 GETCHARINCTEST(c, eptr);
2329 if (
2330 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2331 c > 255 ||
2332 #endif
2333 (md->ctypes[c] & ctype_digit) == 0
2334 )
2335 RRETURN(MATCH_NOMATCH);
2336 ecode++;
2337 break;
2338
2339 case OP_NOT_WHITESPACE:
2340 if (eptr >= md->end_subject)
2341 {
2342 SCHECK_PARTIAL();
2343 RRETURN(MATCH_NOMATCH);
2344 }
2345 GETCHARINCTEST(c, eptr);
2346 if (
2347 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2348 c < 256 &&
2349 #endif
2350 (md->ctypes[c] & ctype_space) != 0
2351 )
2352 RRETURN(MATCH_NOMATCH);
2353 ecode++;
2354 break;
2355
2356 case OP_WHITESPACE:
2357 if (eptr >= md->end_subject)
2358 {
2359 SCHECK_PARTIAL();
2360 RRETURN(MATCH_NOMATCH);
2361 }
2362 GETCHARINCTEST(c, eptr);
2363 if (
2364 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2365 c > 255 ||
2366 #endif
2367 (md->ctypes[c] & ctype_space) == 0
2368 )
2369 RRETURN(MATCH_NOMATCH);
2370 ecode++;
2371 break;
2372
2373 case OP_NOT_WORDCHAR:
2374 if (eptr >= md->end_subject)
2375 {
2376 SCHECK_PARTIAL();
2377 RRETURN(MATCH_NOMATCH);
2378 }
2379 GETCHARINCTEST(c, eptr);
2380 if (
2381 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2382 c < 256 &&
2383 #endif
2384 (md->ctypes[c] & ctype_word) != 0
2385 )
2386 RRETURN(MATCH_NOMATCH);
2387 ecode++;
2388 break;
2389
2390 case OP_WORDCHAR:
2391 if (eptr >= md->end_subject)
2392 {
2393 SCHECK_PARTIAL();
2394 RRETURN(MATCH_NOMATCH);
2395 }
2396 GETCHARINCTEST(c, eptr);
2397 if (
2398 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2399 c > 255 ||
2400 #endif
2401 (md->ctypes[c] & ctype_word) == 0
2402 )
2403 RRETURN(MATCH_NOMATCH);
2404 ecode++;
2405 break;
2406
2407 case OP_ANYNL:
2408 if (eptr >= md->end_subject)
2409 {
2410 SCHECK_PARTIAL();
2411 RRETURN(MATCH_NOMATCH);
2412 }
2413 GETCHARINCTEST(c, eptr);
2414 switch(c)
2415 {
2416 default: RRETURN(MATCH_NOMATCH);
2417
2418 case CHAR_CR:
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 }
2423 else if (*eptr == CHAR_LF) eptr++;
2424 break;
2425
2426 case CHAR_LF:
2427 break;
2428
2429 case CHAR_VT:
2430 case CHAR_FF:
2431 case CHAR_NEL:
2432 #ifndef EBCDIC
2433 case 0x2028:
2434 case 0x2029:
2435 #endif /* Not EBCDIC */
2436 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2437 break;
2438 }
2439 ecode++;
2440 break;
2441
2442 case OP_NOT_HSPACE:
2443 if (eptr >= md->end_subject)
2444 {
2445 SCHECK_PARTIAL();
2446 RRETURN(MATCH_NOMATCH);
2447 }
2448 GETCHARINCTEST(c, eptr);
2449 switch(c)
2450 {
2451 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2452 default: break;
2453 }
2454 ecode++;
2455 break;
2456
2457 case OP_HSPACE:
2458 if (eptr >= md->end_subject)
2459 {
2460 SCHECK_PARTIAL();
2461 RRETURN(MATCH_NOMATCH);
2462 }
2463 GETCHARINCTEST(c, eptr);
2464 switch(c)
2465 {
2466 HSPACE_CASES: break; /* Byte and multibyte cases */
2467 default: RRETURN(MATCH_NOMATCH);
2468 }
2469 ecode++;
2470 break;
2471
2472 case OP_NOT_VSPACE:
2473 if (eptr >= md->end_subject)
2474 {
2475 SCHECK_PARTIAL();
2476 RRETURN(MATCH_NOMATCH);
2477 }
2478 GETCHARINCTEST(c, eptr);
2479 switch(c)
2480 {
2481 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2482 default: break;
2483 }
2484 ecode++;
2485 break;
2486
2487 case OP_VSPACE:
2488 if (eptr >= md->end_subject)
2489 {
2490 SCHECK_PARTIAL();
2491 RRETURN(MATCH_NOMATCH);
2492 }
2493 GETCHARINCTEST(c, eptr);
2494 switch(c)
2495 {
2496 VSPACE_CASES: break;
2497 default: RRETURN(MATCH_NOMATCH);
2498 }
2499 ecode++;
2500 break;
2501
2502 #ifdef SUPPORT_UCP
2503 /* Check the next character by Unicode property. We will get here only
2504 if the support is in the binary; otherwise a compile-time error occurs. */
2505
2506 case OP_PROP:
2507 case OP_NOTPROP:
2508 if (eptr >= md->end_subject)
2509 {
2510 SCHECK_PARTIAL();
2511 RRETURN(MATCH_NOMATCH);
2512 }
2513 GETCHARINCTEST(c, eptr);
2514 {
2515 const ucd_record *prop = GET_UCD(c);
2516
2517 switch(ecode[1])
2518 {
2519 case PT_ANY:
2520 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2521 break;
2522
2523 case PT_LAMP:
2524 if ((prop->chartype == ucp_Lu ||
2525 prop->chartype == ucp_Ll ||
2526 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2527 RRETURN(MATCH_NOMATCH);
2528 break;
2529
2530 case PT_GC:
2531 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2532 RRETURN(MATCH_NOMATCH);
2533 break;
2534
2535 case PT_PC:
2536 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2537 RRETURN(MATCH_NOMATCH);
2538 break;
2539
2540 case PT_SC:
2541 if ((ecode[2] != prop->script) == (op == OP_PROP))
2542 RRETURN(MATCH_NOMATCH);
2543 break;
2544
2545 /* These are specials */
2546
2547 case PT_ALNUM:
2548 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2549 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2550 RRETURN(MATCH_NOMATCH);
2551 break;
2552
2553 case PT_SPACE: /* Perl space */
2554 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2555 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2556 == (op == OP_NOTPROP))
2557 RRETURN(MATCH_NOMATCH);
2558 break;
2559
2560 case PT_PXSPACE: /* POSIX space */
2561 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2562 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2563 c == CHAR_FF || c == CHAR_CR)
2564 == (op == OP_NOTPROP))
2565 RRETURN(MATCH_NOMATCH);
2566 break;
2567
2568 case PT_WORD:
2569 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2570 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2571 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2572 RRETURN(MATCH_NOMATCH);
2573 break;
2574
2575 /* This should never occur */
2576
2577 default:
2578 RRETURN(PCRE_ERROR_INTERNAL);
2579 }
2580
2581 ecode += 3;
2582 }
2583 break;
2584
2585 /* Match an extended Unicode sequence. We will get here only if the support
2586 is in the binary; otherwise a compile-time error occurs. */
2587
2588 case OP_EXTUNI:
2589 if (eptr >= md->end_subject)
2590 {
2591 SCHECK_PARTIAL();
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 else
2595 {
2596 int lgb, rgb;
2597 GETCHARINCTEST(c, eptr);
2598 lgb = UCD_GRAPHBREAK(c);
2599 while (eptr < md->end_subject)
2600 {
2601 int len = 1;
2602 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2603 rgb = UCD_GRAPHBREAK(c);
2604 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2605 lgb = rgb;
2606 eptr += len;
2607 }
2608 }
2609 CHECK_PARTIAL();
2610 ecode++;
2611 break;
2612 #endif
2613
2614
2615 /* Match a back reference, possibly repeatedly. Look past the end of the
2616 item to see if there is repeat information following. The code is similar
2617 to that for character classes, but repeated for efficiency. Then obey
2618 similar code to character type repeats - written out again for speed.
2619 However, if the referenced string is the empty string, always treat
2620 it as matched, any number of times (otherwise there could be infinite
2621 loops). */
2622
2623 case OP_REF:
2624 case OP_REFI:
2625 caseless = op == OP_REFI;
2626 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2627 ecode += 1 + IMM2_SIZE;
2628
2629 /* If the reference is unset, there are two possibilities:
2630
2631 (a) In the default, Perl-compatible state, set the length negative;
2632 this ensures that every attempt at a match fails. We can't just fail
2633 here, because of the possibility of quantifiers with zero minima.
2634
2635 (b) If the JavaScript compatibility flag is set, set the length to zero
2636 so that the back reference matches an empty string.
2637
2638 Otherwise, set the length to the length of what was matched by the
2639 referenced subpattern. */
2640
2641 if (offset >= offset_top || md->offset_vector[offset] < 0)
2642 length = (md->jscript_compat)? 0 : -1;
2643 else
2644 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2645
2646 /* Set up for repetition, or handle the non-repeated case */
2647
2648 switch (*ecode)
2649 {
2650 case OP_CRSTAR:
2651 case OP_CRMINSTAR:
2652 case OP_CRPLUS:
2653 case OP_CRMINPLUS:
2654 case OP_CRQUERY:
2655 case OP_CRMINQUERY:
2656 c = *ecode++ - OP_CRSTAR;
2657 minimize = (c & 1) != 0;
2658 min = rep_min[c]; /* Pick up values from tables; */
2659 max = rep_max[c]; /* zero for max => infinity */
2660 if (max == 0) max = INT_MAX;
2661 break;
2662
2663 case OP_CRRANGE:
2664 case OP_CRMINRANGE:
2665 minimize = (*ecode == OP_CRMINRANGE);
2666 min = GET2(ecode, 1);
2667 max = GET2(ecode, 1 + IMM2_SIZE);
2668 if (max == 0) max = INT_MAX;
2669 ecode += 1 + 2 * IMM2_SIZE;
2670 break;
2671
2672 default: /* No repeat follows */
2673 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2674 {
2675 if (length == -2) eptr = md->end_subject; /* Partial match */
2676 CHECK_PARTIAL();
2677 RRETURN(MATCH_NOMATCH);
2678 }
2679 eptr += length;
2680 continue; /* With the main loop */
2681 }
2682
2683 /* Handle repeated back references. If the length of the reference is
2684 zero, just continue with the main loop. If the length is negative, it
2685 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2686 zero, we can continue at the same level without recursion. For any other
2687 minimum, carrying on will result in NOMATCH. */
2688
2689 if (length == 0) continue;
2690 if (length < 0 && min == 0) continue;
2691
2692 /* First, ensure the minimum number of matches are present. We get back
2693 the length of the reference string explicitly rather than passing the
2694 address of eptr, so that eptr can be a register variable. */
2695
2696 for (i = 1; i <= min; i++)
2697 {
2698 int slength;
2699 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2700 {
2701 if (slength == -2) eptr = md->end_subject; /* Partial match */
2702 CHECK_PARTIAL();
2703 RRETURN(MATCH_NOMATCH);
2704 }
2705 eptr += slength;
2706 }
2707
2708 /* If min = max, continue at the same level without recursion.
2709 They are not both allowed to be zero. */
2710
2711 if (min == max) continue;
2712
2713 /* If minimizing, keep trying and advancing the pointer */
2714
2715 if (minimize)
2716 {
2717 for (fi = min;; fi++)
2718 {
2719 int slength;
2720 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722 if (fi >= max) RRETURN(MATCH_NOMATCH);
2723 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2724 {
2725 if (slength == -2) eptr = md->end_subject; /* Partial match */
2726 CHECK_PARTIAL();
2727 RRETURN(MATCH_NOMATCH);
2728 }
2729 eptr += slength;
2730 }
2731 /* Control never gets here */
2732 }
2733
2734 /* If maximizing, find the longest string and work backwards */
2735
2736 else
2737 {
2738 pp = eptr;
2739 for (i = min; i < max; i++)
2740 {
2741 int slength;
2742 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2743 {
2744 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2745 the soft partial matching case. */
2746
2747 if (slength == -2 && md->partial != 0 &&
2748 md->end_subject > md->start_used_ptr)
2749 {
2750 md->hitend = TRUE;
2751 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2752 }
2753 break;
2754 }
2755 eptr += slength;
2756 }
2757
2758 while (eptr >= pp)
2759 {
2760 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2761 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2762 eptr -= length;
2763 }
2764 RRETURN(MATCH_NOMATCH);
2765 }
2766 /* Control never gets here */
2767
2768 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2769 used when all the characters in the class have values in the range 0-255,
2770 and either the matching is caseful, or the characters are in the range
2771 0-127 when UTF-8 processing is enabled. The only difference between
2772 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2773 encountered.
2774
2775 First, look past the end of the item to see if there is repeat information
2776 following. Then obey similar code to character type repeats - written out
2777 again for speed. */
2778
2779 case OP_NCLASS:
2780 case OP_CLASS:
2781 {
2782 /* The data variable is saved across frames, so the byte map needs to
2783 be stored there. */
2784 #define BYTE_MAP ((pcre_uint8 *)data)
2785 data = ecode + 1; /* Save for matching */
2786 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2787
2788 switch (*ecode)
2789 {
2790 case OP_CRSTAR:
2791 case OP_CRMINSTAR:
2792 case OP_CRPLUS:
2793 case OP_CRMINPLUS:
2794 case OP_CRQUERY:
2795 case OP_CRMINQUERY:
2796 c = *ecode++ - OP_CRSTAR;
2797 minimize = (c & 1) != 0;
2798 min = rep_min[c]; /* Pick up values from tables; */
2799 max = rep_max[c]; /* zero for max => infinity */
2800 if (max == 0) max = INT_MAX;
2801 break;
2802
2803 case OP_CRRANGE:
2804 case OP_CRMINRANGE:
2805 minimize = (*ecode == OP_CRMINRANGE);
2806 min = GET2(ecode, 1);
2807 max = GET2(ecode, 1 + IMM2_SIZE);
2808 if (max == 0) max = INT_MAX;
2809 ecode += 1 + 2 * IMM2_SIZE;
2810 break;
2811
2812 default: /* No repeat follows */
2813 min = max = 1;
2814 break;
2815 }
2816
2817 /* First, ensure the minimum number of matches are present. */
2818
2819 #ifdef SUPPORT_UTF
2820 if (utf)
2821 {
2822 for (i = 1; i <= min; i++)
2823 {
2824 if (eptr >= md->end_subject)
2825 {
2826 SCHECK_PARTIAL();
2827 RRETURN(MATCH_NOMATCH);
2828 }
2829 GETCHARINC(c, eptr);
2830 if (c > 255)
2831 {
2832 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2833 }
2834 else
2835 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2836 }
2837 }
2838 else
2839 #endif
2840 /* Not UTF mode */
2841 {
2842 for (i = 1; i <= min; i++)
2843 {
2844 if (eptr >= md->end_subject)
2845 {
2846 SCHECK_PARTIAL();
2847 RRETURN(MATCH_NOMATCH);
2848 }
2849 c = *eptr++;
2850 #ifndef COMPILE_PCRE8
2851 if (c > 255)
2852 {
2853 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2854 }
2855 else
2856 #endif
2857 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2858 }
2859 }
2860
2861 /* If max == min we can continue with the main loop without the
2862 need to recurse. */
2863
2864 if (min == max) continue;
2865
2866 /* If minimizing, keep testing the rest of the expression and advancing
2867 the pointer while it matches the class. */
2868
2869 if (minimize)
2870 {
2871 #ifdef SUPPORT_UTF
2872 if (utf)
2873 {
2874 for (fi = min;; fi++)
2875 {
2876 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2878 if (fi >= max) RRETURN(MATCH_NOMATCH);
2879 if (eptr >= md->end_subject)
2880 {
2881 SCHECK_PARTIAL();
2882 RRETURN(MATCH_NOMATCH);
2883 }
2884 GETCHARINC(c, eptr);
2885 if (c > 255)
2886 {
2887 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2888 }
2889 else
2890 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2891 }
2892 }
2893 else
2894 #endif
2895 /* Not UTF mode */
2896 {
2897 for (fi = min;; fi++)
2898 {
2899 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2901 if (fi >= max) RRETURN(MATCH_NOMATCH);
2902 if (eptr >= md->end_subject)
2903 {
2904 SCHECK_PARTIAL();
2905 RRETURN(MATCH_NOMATCH);
2906 }
2907 c = *eptr++;
2908 #ifndef COMPILE_PCRE8
2909 if (c > 255)
2910 {
2911 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2912 }
2913 else
2914 #endif
2915 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2916 }
2917 }
2918 /* Control never gets here */
2919 }
2920
2921 /* If maximizing, find the longest possible run, then work backwards. */
2922
2923 else
2924 {
2925 pp = eptr;
2926
2927 #ifdef SUPPORT_UTF
2928 if (utf)
2929 {
2930 for (i = min; i < max; i++)
2931 {
2932 int len = 1;
2933 if (eptr >= md->end_subject)
2934 {
2935 SCHECK_PARTIAL();
2936 break;
2937 }
2938 GETCHARLEN(c, eptr, len);
2939 if (c > 255)
2940 {
2941 if (op == OP_CLASS) break;
2942 }
2943 else
2944 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2945 eptr += len;
2946 }
2947 for (;;)
2948 {
2949 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2951 if (eptr-- == pp) break; /* Stop if tried at original pos */
2952 BACKCHAR(eptr);
2953 }
2954 }
2955 else
2956 #endif
2957 /* Not UTF mode */
2958 {
2959 for (i = min; i < max; i++)
2960 {
2961 if (eptr >= md->end_subject)
2962 {
2963 SCHECK_PARTIAL();
2964 break;
2965 }
2966 c = *eptr;
2967 #ifndef COMPILE_PCRE8
2968 if (c > 255)
2969 {
2970 if (op == OP_CLASS) break;
2971 }
2972 else
2973 #endif
2974 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2975 eptr++;
2976 }
2977 while (eptr >= pp)
2978 {
2979 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2980 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2981 eptr--;
2982 }
2983 }
2984
2985 RRETURN(MATCH_NOMATCH);
2986 }
2987 #undef BYTE_MAP
2988 }
2989 /* Control never gets here */
2990
2991
2992 /* Match an extended character class. This opcode is encountered only
2993 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2994 mode, because Unicode properties are supported in non-UTF-8 mode. */
2995
2996 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2997 case OP_XCLASS:
2998 {
2999 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3000 ecode += GET(ecode, 1); /* Advance past the item */
3001
3002 switch (*ecode)
3003 {
3004 case OP_CRSTAR:
3005 case OP_CRMINSTAR:
3006 case OP_CRPLUS:
3007 case OP_CRMINPLUS:
3008 case OP_CRQUERY:
3009 case OP_CRMINQUERY:
3010 c = *ecode++ - OP_CRSTAR;
3011 minimize = (c & 1) != 0;
3012 min = rep_min[c]; /* Pick up values from tables; */
3013 max = rep_max[c]; /* zero for max => infinity */
3014 if (max == 0) max = INT_MAX;
3015 break;
3016
3017 case OP_CRRANGE:
3018 case OP_CRMINRANGE:
3019 minimize = (*ecode == OP_CRMINRANGE);
3020 min = GET2(ecode, 1);
3021 max = GET2(ecode, 1 + IMM2_SIZE);
3022 if (max == 0) max = INT_MAX;
3023 ecode += 1 + 2 * IMM2_SIZE;
3024 break;
3025
3026 default: /* No repeat follows */
3027 min = max = 1;
3028 break;
3029 }
3030
3031 /* First, ensure the minimum number of matches are present. */
3032
3033 for (i = 1; i <= min; i++)
3034 {
3035 if (eptr >= md->end_subject)
3036 {
3037 SCHECK_PARTIAL();
3038 RRETURN(MATCH_NOMATCH);
3039 }
3040 GETCHARINCTEST(c, eptr);
3041 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3042 }
3043
3044 /* If max == min we can continue with the main loop without the
3045 need to recurse. */
3046
3047 if (min == max) continue;
3048
3049 /* If minimizing, keep testing the rest of the expression and advancing
3050 the pointer while it matches the class. */
3051
3052 if (minimize)
3053 {
3054 for (fi = min;; fi++)
3055 {
3056 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3058 if (fi >= max) RRETURN(MATCH_NOMATCH);
3059 if (eptr >= md->end_subject)
3060 {
3061 SCHECK_PARTIAL();
3062 RRETURN(MATCH_NOMATCH);
3063 }
3064 GETCHARINCTEST(c, eptr);
3065 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3066 }
3067 /* Control never gets here */
3068 }
3069
3070 /* If maximizing, find the longest possible run, then work backwards. */
3071
3072 else
3073 {
3074 pp = eptr;
3075 for (i = min; i < max; i++)
3076 {
3077 int len = 1;
3078 if (eptr >= md->end_subject)
3079 {
3080 SCHECK_PARTIAL();
3081 break;
3082 }
3083 #ifdef SUPPORT_UTF
3084 GETCHARLENTEST(c, eptr, len);
3085 #else
3086 c = *eptr;
3087 #endif
3088 if (!PRIV(xclass)(c, data, utf)) break;
3089 eptr += len;
3090 }
3091 for(;;)
3092 {
3093 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3095 if (eptr-- == pp) break; /* Stop if tried at original pos */
3096 #ifdef SUPPORT_UTF
3097 if (utf) BACKCHAR(eptr);
3098 #endif
3099 }
3100 RRETURN(MATCH_NOMATCH);
3101 }
3102
3103 /* Control never gets here */
3104 }
3105 #endif /* End of XCLASS */
3106
3107 /* Match a single character, casefully */
3108
3109 case OP_CHAR:
3110 #ifdef SUPPORT_UTF
3111 if (utf)
3112 {
3113 length = 1;
3114 ecode++;
3115 GETCHARLEN(fc, ecode, length);
3116 if (length > md->end_subject - eptr)
3117 {
3118 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3119 RRETURN(MATCH_NOMATCH);
3120 }
3121 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3122 }
3123 else
3124 #endif
3125 /* Not UTF mode */
3126 {
3127 if (md->end_subject - eptr < 1)
3128 {
3129 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3130 RRETURN(MATCH_NOMATCH);
3131 }
3132 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3133 ecode += 2;
3134 }
3135 break;
3136
3137 /* Match a single character, caselessly. If we are at the end of the
3138 subject, give up immediately. */
3139
3140 case OP_CHARI:
3141 if (eptr >= md->end_subject)
3142 {
3143 SCHECK_PARTIAL();
3144 RRETURN(MATCH_NOMATCH);
3145 }
3146
3147 #ifdef SUPPORT_UTF
3148 if (utf)
3149 {
3150 length = 1;
3151 ecode++;
3152 GETCHARLEN(fc, ecode, length);
3153
3154 /* If the pattern character's value is < 128, we have only one byte, and
3155 we know that its other case must also be one byte long, so we can use the
3156 fast lookup table. We know that there is at least one byte left in the
3157 subject. */
3158
3159 if (fc < 128)
3160 {
3161 if (md->lcc[fc]
3162 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3163 ecode++;
3164 eptr++;
3165 }
3166
3167 /* Otherwise we must pick up the subject character. Note that we cannot
3168 use the value of "length" to check for sufficient bytes left, because the
3169 other case of the character may have more or fewer bytes. */
3170
3171 else
3172 {
3173 unsigned int dc;
3174 GETCHARINC(dc, eptr);
3175 ecode += length;
3176
3177 /* If we have Unicode property support, we can use it to test the other
3178 case of the character, if there is one. */
3179
3180 if (fc != dc)
3181 {
3182 #ifdef SUPPORT_UCP
3183 if (dc != UCD_OTHERCASE(fc))
3184 #endif
3185 RRETURN(MATCH_NOMATCH);
3186 }
3187 }
3188 }
3189 else
3190 #endif /* SUPPORT_UTF */
3191
3192 /* Not UTF mode */
3193 {
3194 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3195 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3196 eptr++;
3197 ecode += 2;
3198 }
3199 break;
3200
3201 /* Match a single character repeatedly. */
3202
3203 case OP_EXACT:
3204 case OP_EXACTI:
3205 min = max = GET2(ecode, 1);
3206 ecode += 1 + IMM2_SIZE;
3207 goto REPEATCHAR;
3208
3209 case OP_POSUPTO:
3210 case OP_POSUPTOI:
3211 possessive = TRUE;
3212 /* Fall through */
3213
3214 case OP_UPTO:
3215 case OP_UPTOI:
3216 case OP_MINUPTO:
3217 case OP_MINUPTOI:
3218 min = 0;
3219 max = GET2(ecode, 1);
3220 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3221 ecode += 1 + IMM2_SIZE;
3222 goto REPEATCHAR;
3223
3224 case OP_POSSTAR:
3225 case OP_POSSTARI:
3226 possessive = TRUE;
3227 min = 0;
3228 max = INT_MAX;
3229 ecode++;
3230 goto REPEATCHAR;
3231
3232 case OP_POSPLUS:
3233 case OP_POSPLUSI:
3234 possessive = TRUE;
3235 min = 1;
3236 max = INT_MAX;
3237 ecode++;
3238 goto REPEATCHAR;
3239
3240 case OP_POSQUERY:
3241 case OP_POSQUERYI:
3242 possessive = TRUE;
3243 min = 0;
3244 max = 1;
3245 ecode++;
3246 goto REPEATCHAR;
3247
3248 case OP_STAR:
3249 case OP_STARI:
3250 case OP_MINSTAR:
3251 case OP_MINSTARI:
3252 case OP_PLUS:
3253 case OP_PLUSI:
3254 case OP_MINPLUS:
3255 case OP_MINPLUSI:
3256 case OP_QUERY:
3257 case OP_QUERYI:
3258 case OP_MINQUERY:
3259 case OP_MINQUERYI:
3260 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3261 minimize = (c & 1) != 0;
3262 min = rep_min[c]; /* Pick up values from tables; */
3263 max = rep_max[c]; /* zero for max => infinity */
3264 if (max == 0) max = INT_MAX;
3265
3266 /* Common code for all repeated single-character matches. */
3267
3268 REPEATCHAR:
3269 #ifdef SUPPORT_UTF
3270 if (utf)
3271 {
3272 length = 1;
3273 charptr = ecode;
3274 GETCHARLEN(fc, ecode, length);
3275 ecode += length;
3276
3277 /* Handle multibyte character matching specially here. There is
3278 support for caseless matching if UCP support is present. */
3279
3280 if (length > 1)
3281 {
3282 #ifdef SUPPORT_UCP
3283 unsigned int othercase;
3284 if (op >= OP_STARI && /* Caseless */
3285 (othercase = UCD_OTHERCASE(fc)) != fc)
3286 oclength = PRIV(ord2utf)(othercase, occhars);
3287 else oclength = 0;
3288 #endif /* SUPPORT_UCP */
3289
3290 for (i = 1; i <= min; i++)
3291 {
3292 if (eptr <= md->end_subject - length &&
3293 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3294 #ifdef SUPPORT_UCP
3295 else if (oclength > 0 &&
3296 eptr <= md->end_subject - oclength &&
3297 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3298 #endif /* SUPPORT_UCP */
3299 else
3300 {
3301 CHECK_PARTIAL();
3302 RRETURN(MATCH_NOMATCH);
3303 }
3304 }
3305
3306 if (min == max) continue;
3307
3308 if (minimize)
3309 {
3310 for (fi = min;; fi++)
3311 {
3312 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3313 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3314 if (fi >= max) RRETURN(MATCH_NOMATCH);
3315 if (eptr <= md->end_subject - length &&
3316 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3317 #ifdef SUPPORT_UCP
3318 else if (oclength > 0 &&
3319 eptr <= md->end_subject - oclength &&
3320 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3321 #endif /* SUPPORT_UCP */
3322 else
3323 {
3324 CHECK_PARTIAL();
3325 RRETURN(MATCH_NOMATCH);
3326 }
3327 }
3328 /* Control never gets here */
3329 }
3330
3331 else /* Maximize */
3332 {
3333 pp = eptr;
3334 for (i = min; i < max; i++)
3335 {
3336 if (eptr <= md->end_subject - length &&
3337 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3338 #ifdef SUPPORT_UCP
3339 else if (oclength > 0 &&
3340 eptr <= md->end_subject - oclength &&
3341 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3342 #endif /* SUPPORT_UCP */
3343 else
3344 {
3345 CHECK_PARTIAL();
3346 break;
3347 }
3348 }
3349
3350 if (possessive) continue;
3351
3352 for(;;)
3353 {
3354 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3357 #ifdef SUPPORT_UCP
3358 eptr--;
3359 BACKCHAR(eptr);
3360 #else /* without SUPPORT_UCP */
3361 eptr -= length;
3362 #endif /* SUPPORT_UCP */
3363 }
3364 }
3365 /* Control never gets here */
3366 }
3367
3368 /* If the length of a UTF-8 character is 1, we fall through here, and
3369 obey the code as for non-UTF-8 characters below, though in this case the
3370 value of fc will always be < 128. */
3371 }
3372 else
3373 #endif /* SUPPORT_UTF */
3374 /* When not in UTF-8 mode, load a single-byte character. */
3375 fc = *ecode++;
3376
3377 /* The value of fc at this point is always one character, though we may
3378 or may not be in UTF mode. The code is duplicated for the caseless and
3379 caseful cases, for speed, since matching characters is likely to be quite
3380 common. First, ensure the minimum number of matches are present. If min =
3381 max, continue at the same level without recursing. Otherwise, if
3382 minimizing, keep trying the rest of the expression and advancing one
3383 matching character if failing, up to the maximum. Alternatively, if
3384 maximizing, find the maximum number of characters and work backwards. */
3385
3386 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3387 max, (char *)eptr));
3388
3389 if (op >= OP_STARI) /* Caseless */
3390 {
3391 #ifdef COMPILE_PCRE8
3392 /* fc must be < 128 if UTF is enabled. */
3393 foc = md->fcc[fc];
3394 #else
3395 #ifdef SUPPORT_UTF
3396 #ifdef SUPPORT_UCP
3397 if (utf && fc > 127)
3398 foc = UCD_OTHERCASE(fc);
3399 #else
3400 if (utf && fc > 127)
3401 foc = fc;
3402 #endif /* SUPPORT_UCP */
3403 else
3404 #endif /* SUPPORT_UTF */
3405 foc = TABLE_GET(fc, md->fcc, fc);
3406 #endif /* COMPILE_PCRE8 */
3407
3408 for (i = 1; i <= min; i++)
3409 {
3410 if (eptr >= md->end_subject)
3411 {
3412 SCHECK_PARTIAL();
3413 RRETURN(MATCH_NOMATCH);
3414 }
3415 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3416 eptr++;
3417 }
3418 if (min == max) continue;
3419 if (minimize)
3420 {
3421 for (fi = min;; fi++)
3422 {
3423 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3424 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3425 if (fi >= max) RRETURN(MATCH_NOMATCH);
3426 if (eptr >= md->end_subject)
3427 {
3428 SCHECK_PARTIAL();
3429 RRETURN(MATCH_NOMATCH);
3430 }
3431 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3432 eptr++;
3433 }
3434 /* Control never gets here */
3435 }
3436 else /* Maximize */
3437 {
3438 pp = eptr;
3439 for (i = min; i < max; i++)
3440 {
3441 if (eptr >= md->end_subject)
3442 {
3443 SCHECK_PARTIAL();
3444 break;
3445 }
3446 if (fc != *eptr && foc != *eptr) break;
3447 eptr++;
3448 }
3449
3450 if (possessive) continue;
3451
3452 while (eptr >= pp)
3453 {
3454 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3455 eptr--;
3456 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3457 }
3458 RRETURN(MATCH_NOMATCH);
3459 }
3460 /* Control never gets here */
3461 }
3462
3463 /* Caseful comparisons (includes all multi-byte characters) */
3464
3465 else
3466 {
3467 for (i = 1; i <= min; i++)
3468 {
3469 if (eptr >= md->end_subject)
3470 {
3471 SCHECK_PARTIAL();
3472 RRETURN(MATCH_NOMATCH);
3473 }
3474 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3475 }
3476
3477 if (min == max) continue;
3478
3479 if (minimize)
3480 {
3481 for (fi = min;; fi++)
3482 {
3483 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3485 if (fi >= max) RRETURN(MATCH_NOMATCH);
3486 if (eptr >= md->end_subject)
3487 {
3488 SCHECK_PARTIAL();
3489 RRETURN(MATCH_NOMATCH);
3490 }
3491 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3492 }
3493 /* Control never gets here */
3494 }
3495 else /* Maximize */
3496 {
3497 pp = eptr;
3498 for (i = min; i < max; i++)
3499 {
3500 if (eptr >= md->end_subject)
3501 {
3502 SCHECK_PARTIAL();
3503 break;
3504 }
3505 if (fc != *eptr) break;
3506 eptr++;
3507 }
3508 if (possessive) continue;
3509
3510 while (eptr >= pp)
3511 {
3512 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3513 eptr--;
3514 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3515 }
3516 RRETURN(MATCH_NOMATCH);
3517 }
3518 }
3519 /* Control never gets here */
3520
3521 /* Match a negated single one-byte character. The character we are
3522 checking can be multibyte. */
3523
3524 case OP_NOT:
3525 case OP_NOTI:
3526 if (eptr >= md->end_subject)
3527 {
3528 SCHECK_PARTIAL();
3529 RRETURN(MATCH_NOMATCH);
3530 }
3531 #ifdef SUPPORT_UTF
3532 if (utf)
3533 {
3534 register unsigned int ch, och;
3535
3536 ecode++;
3537 GETCHARINC(ch, ecode);
3538 GETCHARINC(c, eptr);
3539
3540 if (op == OP_NOT)
3541 {
3542 if (ch == c) RRETURN(MATCH_NOMATCH);
3543 }
3544 else
3545 {
3546 #ifdef SUPPORT_UCP
3547 if (ch > 127)
3548 och = UCD_OTHERCASE(ch);
3549 #else
3550 if (ch > 127)
3551 och = ch;
3552 #endif /* SUPPORT_UCP */
3553 else
3554 och = TABLE_GET(ch, md->fcc, ch);
3555 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3556 }
3557 }
3558 else
3559 #endif
3560 {
3561 register unsigned int ch = ecode[1];
3562 c = *eptr++;
3563 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3564 RRETURN(MATCH_NOMATCH);
3565 ecode += 2;
3566 }
3567 break;
3568
3569 /* Match a negated single one-byte character repeatedly. This is almost a
3570 repeat of the code for a repeated single character, but I haven't found a
3571 nice way of commoning these up that doesn't require a test of the
3572 positive/negative option for each character match. Maybe that wouldn't add
3573 very much to the time taken, but character matching *is* what this is all
3574 about... */
3575
3576 case OP_NOTEXACT:
3577 case OP_NOTEXACTI:
3578 min = max = GET2(ecode, 1);
3579 ecode += 1 + IMM2_SIZE;
3580 goto REPEATNOTCHAR;
3581
3582 case OP_NOTUPTO:
3583 case OP_NOTUPTOI:
3584 case OP_NOTMINUPTO:
3585 case OP_NOTMINUPTOI:
3586 min = 0;
3587 max = GET2(ecode, 1);
3588 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3589 ecode += 1 + IMM2_SIZE;
3590 goto REPEATNOTCHAR;
3591
3592 case OP_NOTPOSSTAR:
3593 case OP_NOTPOSSTARI:
3594 possessive = TRUE;
3595 min = 0;
3596 max = INT_MAX;
3597 ecode++;
3598 goto REPEATNOTCHAR;
3599
3600 case OP_NOTPOSPLUS:
3601 case OP_NOTPOSPLUSI:
3602 possessive = TRUE;
3603 min = 1;
3604 max = INT_MAX;
3605 ecode++;
3606 goto REPEATNOTCHAR;
3607
3608 case OP_NOTPOSQUERY:
3609 case OP_NOTPOSQUERYI:
3610 possessive = TRUE;
3611 min = 0;
3612 max = 1;
3613 ecode++;
3614 goto REPEATNOTCHAR;
3615
3616 case OP_NOTPOSUPTO:
3617 case OP_NOTPOSUPTOI:
3618 possessive = TRUE;
3619 min = 0;
3620 max = GET2(ecode, 1);
3621 ecode += 1 + IMM2_SIZE;
3622 goto REPEATNOTCHAR;
3623
3624 case OP_NOTSTAR:
3625 case OP_NOTSTARI:
3626 case OP_NOTMINSTAR:
3627 case OP_NOTMINSTARI:
3628 case OP_NOTPLUS:
3629 case OP_NOTPLUSI:
3630 case OP_NOTMINPLUS:
3631 case OP_NOTMINPLUSI:
3632 case OP_NOTQUERY:
3633 case OP_NOTQUERYI:
3634 case OP_NOTMINQUERY:
3635 case OP_NOTMINQUERYI:
3636 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3637 minimize = (c & 1) != 0;
3638 min = rep_min[c]; /* Pick up values from tables; */
3639 max = rep_max[c]; /* zero for max => infinity */
3640 if (max == 0) max = INT_MAX;
3641
3642 /* Common code for all repeated single-byte matches. */
3643
3644 REPEATNOTCHAR:
3645 GETCHARINCTEST(fc, ecode);
3646
3647 /* The code is duplicated for the caseless and caseful cases, for speed,
3648 since matching characters is likely to be quite common. First, ensure the
3649 minimum number of matches are present. If min = max, continue at the same
3650 level without recursing. Otherwise, if minimizing, keep trying the rest of
3651 the expression and advancing one matching character if failing, up to the
3652 maximum. Alternatively, if maximizing, find the maximum number of
3653 characters and work backwards. */
3654
3655 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3656 max, (char *)eptr));
3657
3658 if (op >= OP_NOTSTARI) /* Caseless */
3659 {
3660 #ifdef SUPPORT_UTF
3661 #ifdef SUPPORT_UCP
3662 if (utf && fc > 127)
3663 foc = UCD_OTHERCASE(fc);
3664 #else
3665 if (utf && fc > 127)
3666 foc = fc;
3667 #endif /* SUPPORT_UCP */
3668 else
3669 #endif /* SUPPORT_UTF */
3670 foc = TABLE_GET(fc, md->fcc, fc);
3671
3672 #ifdef SUPPORT_UTF
3673 if (utf)
3674 {
3675 register unsigned int d;
3676 for (i = 1; i <= min; i++)
3677 {
3678 if (eptr >= md->end_subject)
3679 {
3680 SCHECK_PARTIAL();
3681 RRETURN(MATCH_NOMATCH);
3682 }
3683 GETCHARINC(d, eptr);
3684 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3685 }
3686 }
3687 else
3688 #endif
3689 /* Not UTF mode */
3690 {
3691 for (i = 1; i <= min; i++)
3692 {
3693 if (eptr >= md->end_subject)
3694 {
3695 SCHECK_PARTIAL();
3696 RRETURN(MATCH_NOMATCH);
3697 }
3698 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3699 eptr++;
3700 }
3701 }
3702
3703 if (min == max) continue;
3704
3705 if (minimize)
3706 {
3707 #ifdef SUPPORT_UTF
3708 if (utf)
3709 {
3710 register unsigned int d;
3711 for (fi = min;; fi++)
3712 {
3713 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3714 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3715 if (fi >= max) RRETURN(MATCH_NOMATCH);
3716 if (eptr >= md->end_subject)
3717 {
3718 SCHECK_PARTIAL();
3719 RRETURN(MATCH_NOMATCH);
3720 }
3721 GETCHARINC(d, eptr);
3722 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3723 }
3724 }
3725 else
3726 #endif
3727 /* Not UTF mode */
3728 {
3729 for (fi = min;; fi++)
3730 {
3731 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3733 if (fi >= max) RRETURN(MATCH_NOMATCH);
3734 if (eptr >= md->end_subject)
3735 {
3736 SCHECK_PARTIAL();
3737 RRETURN(MATCH_NOMATCH);
3738 }
3739 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3740 eptr++;
3741 }
3742 }
3743 /* Control never gets here */
3744 }
3745
3746 /* Maximize case */
3747
3748 else
3749 {
3750 pp = eptr;
3751
3752 #ifdef SUPPORT_UTF
3753 if (utf)
3754 {
3755 register unsigned int d;
3756 for (i = min; i < max; i++)
3757 {
3758 int len = 1;
3759 if (eptr >= md->end_subject)
3760 {
3761 SCHECK_PARTIAL();
3762 break;
3763 }
3764 GETCHARLEN(d, eptr, len);
3765 if (fc == d || (unsigned int)foc == d) break;
3766 eptr += len;
3767 }
3768 if (possessive) continue;
3769 for(;;)
3770 {
3771 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3772 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3773 if (eptr-- == pp) break; /* Stop if tried at original pos */
3774 BACKCHAR(eptr);
3775 }
3776 }
3777 else
3778 #endif
3779 /* Not UTF mode */
3780 {
3781 for (i = min; i < max; i++)
3782 {
3783 if (eptr >= md->end_subject)
3784 {
3785 SCHECK_PARTIAL();
3786 break;
3787 }
3788 if (fc == *eptr || foc == *eptr) break;
3789 eptr++;
3790 }
3791 if (possessive) continue;
3792 while (eptr >= pp)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 eptr--;
3797 }
3798 }
3799
3800 RRETURN(MATCH_NOMATCH);
3801 }
3802 /* Control never gets here */
3803 }
3804
3805 /* Caseful comparisons */
3806
3807 else
3808 {
3809 #ifdef SUPPORT_UTF
3810 if (utf)
3811 {
3812 register unsigned int d;
3813 for (i = 1; i <= min; i++)
3814 {
3815 if (eptr >= md->end_subject)
3816 {
3817 SCHECK_PARTIAL();
3818 RRETURN(MATCH_NOMATCH);
3819 }
3820 GETCHARINC(d, eptr);
3821 if (fc == d) RRETURN(MATCH_NOMATCH);
3822 }
3823 }
3824 else
3825 #endif
3826 /* Not UTF mode */
3827 {
3828 for (i = 1; i <= min; i++)
3829 {
3830 if (eptr >= md->end_subject)
3831 {
3832 SCHECK_PARTIAL();
3833 RRETURN(MATCH_NOMATCH);
3834 }
3835 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3836 }
3837 }
3838
3839 if (min == max) continue;
3840
3841 if (minimize)
3842 {
3843 #ifdef SUPPORT_UTF
3844 if (utf)
3845 {
3846 register unsigned int d;
3847 for (fi = min;; fi++)
3848 {
3849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3851 if (fi >= max) RRETURN(MATCH_NOMATCH);
3852 if (eptr >= md->end_subject)
3853 {
3854 SCHECK_PARTIAL();
3855 RRETURN(MATCH_NOMATCH);
3856 }
3857 GETCHARINC(d, eptr);
3858 if (fc == d) RRETURN(MATCH_NOMATCH);
3859 }
3860 }
3861 else
3862 #endif
3863 /* Not UTF mode */
3864 {
3865 for (fi = min;; fi++)
3866 {
3867 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3869 if (fi >= max) RRETURN(MATCH_NOMATCH);
3870 if (eptr >= md->end_subject)
3871 {
3872 SCHECK_PARTIAL();
3873 RRETURN(MATCH_NOMATCH);
3874 }
3875 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3876 }
3877 }
3878 /* Control never gets here */
3879 }
3880
3881 /* Maximize case */
3882
3883 else
3884 {
3885 pp = eptr;
3886
3887 #ifdef SUPPORT_UTF
3888 if (utf)
3889 {
3890 register unsigned int d;
3891 for (i = min; i < max; i++)
3892 {
3893 int len = 1;
3894 if (eptr >= md->end_subject)
3895 {
3896 SCHECK_PARTIAL();
3897 break;
3898 }
3899 GETCHARLEN(d, eptr, len);
3900 if (fc == d) break;
3901 eptr += len;
3902 }
3903 if (possessive) continue;
3904 for(;;)
3905 {
3906 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3908 if (eptr-- == pp) break; /* Stop if tried at original pos */
3909 BACKCHAR(eptr);
3910 }
3911 }
3912 else
3913 #endif
3914 /* Not UTF mode */
3915 {
3916 for (i = min; i < max; i++)
3917 {
3918 if (eptr >= md->end_subject)
3919 {
3920 SCHECK_PARTIAL();
3921 break;
3922 }
3923 if (fc == *eptr) break;
3924 eptr++;
3925 }
3926 if (possessive) continue;
3927 while (eptr >= pp)
3928 {
3929 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3930 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3931 eptr--;
3932 }
3933 }
3934
3935 RRETURN(MATCH_NOMATCH);
3936 }
3937 }
3938 /* Control never gets here */
3939
3940 /* Match a single character type repeatedly; several different opcodes
3941 share code. This is very similar to the code for single characters, but we
3942 repeat it in the interests of efficiency. */
3943
3944 case OP_TYPEEXACT:
3945 min = max = GET2(ecode, 1);
3946 minimize = TRUE;
3947 ecode += 1 + IMM2_SIZE;
3948 goto REPEATTYPE;
3949
3950 case OP_TYPEUPTO:
3951 case OP_TYPEMINUPTO:
3952 min = 0;
3953 max = GET2(ecode, 1);
3954 minimize = *ecode == OP_TYPEMINUPTO;
3955 ecode += 1 + IMM2_SIZE;
3956 goto REPEATTYPE;
3957
3958 case OP_TYPEPOSSTAR:
3959 possessive = TRUE;
3960 min = 0;
3961 max = INT_MAX;
3962 ecode++;
3963 goto REPEATTYPE;
3964
3965 case OP_TYPEPOSPLUS:
3966 possessive = TRUE;
3967 min = 1;
3968 max = INT_MAX;
3969 ecode++;
3970 goto REPEATTYPE;
3971
3972 case OP_TYPEPOSQUERY:
3973 possessive = TRUE;
3974 min = 0;
3975 max = 1;
3976 ecode++;
3977 goto REPEATTYPE;
3978
3979 case OP_TYPEPOSUPTO:
3980 possessive = TRUE;
3981 min = 0;
3982 max = GET2(ecode, 1);
3983 ecode += 1 + IMM2_SIZE;
3984 goto REPEATTYPE;
3985
3986 case OP_TYPESTAR:
3987 case OP_TYPEMINSTAR:
3988 case OP_TYPEPLUS:
3989 case OP_TYPEMINPLUS:
3990 case OP_TYPEQUERY:
3991 case OP_TYPEMINQUERY:
3992 c = *ecode++ - OP_TYPESTAR;
3993 minimize = (c & 1) != 0;
3994 min = rep_min[c]; /* Pick up values from tables; */
3995 max = rep_max[c]; /* zero for max => infinity */
3996 if (max == 0) max = INT_MAX;
3997
3998 /* Common code for all repeated single character type matches. Note that
3999 in UTF-8 mode, '.' matches a character of any length, but for the other
4000 character types, the valid characters are all one-byte long. */
4001
4002 REPEATTYPE:
4003 ctype = *ecode++; /* Code for the character type */
4004
4005 #ifdef SUPPORT_UCP
4006 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4007 {
4008 prop_fail_result = ctype == OP_NOTPROP;
4009 prop_type = *ecode++;
4010 prop_value = *ecode++;
4011 }
4012 else prop_type = -1;
4013 #endif
4014
4015 /* First, ensure the minimum number of matches are present. Use inline
4016 code for maximizing the speed, and do the type test once at the start
4017 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4018 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4019 and single-bytes. */
4020
4021 if (min > 0)
4022 {
4023 #ifdef SUPPORT_UCP
4024 if (prop_type >= 0)
4025 {
4026 switch(prop_type)
4027 {
4028 case PT_ANY:
4029 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4030 for (i = 1; i <= min; i++)
4031 {
4032 if (eptr >= md->end_subject)
4033 {
4034 SCHECK_PARTIAL();
4035 RRETURN(MATCH_NOMATCH);
4036 }
4037 GETCHARINCTEST(c, eptr);
4038 }
4039 break;
4040
4041 case PT_LAMP:
4042 for (i = 1; i <= min; i++)
4043 {
4044 int chartype;
4045 if (eptr >= md->end_subject)
4046 {
4047 SCHECK_PARTIAL();
4048 RRETURN(MATCH_NOMATCH);
4049 }
4050 GETCHARINCTEST(c, eptr);
4051 chartype = UCD_CHARTYPE(c);
4052 if ((chartype == ucp_Lu ||
4053 chartype == ucp_Ll ||
4054 chartype == ucp_Lt) == prop_fail_result)
4055 RRETURN(MATCH_NOMATCH);
4056 }
4057 break;
4058
4059 case PT_GC:
4060 for (i = 1; i <= min; i++)
4061 {
4062 if (eptr >= md->end_subject)
4063 {
4064 SCHECK_PARTIAL();
4065 RRETURN(MATCH_NOMATCH);
4066 }
4067 GETCHARINCTEST(c, eptr);
4068 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4069 RRETURN(MATCH_NOMATCH);
4070 }
4071 break;
4072
4073 case PT_PC:
4074 for (i = 1; i <= min; i++)
4075 {
4076 if (eptr >= md->end_subject)
4077 {
4078 SCHECK_PARTIAL();
4079 RRETURN(MATCH_NOMATCH);
4080 }
4081 GETCHARINCTEST(c, eptr);
4082 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4083 RRETURN(MATCH_NOMATCH);
4084 }
4085 break;
4086
4087 case PT_SC:
4088 for (i = 1; i <= min; i++)
4089 {
4090 if (eptr >= md->end_subject)
4091 {
4092 SCHECK_PARTIAL();
4093 RRETURN(MATCH_NOMATCH);
4094 }
4095 GETCHARINCTEST(c, eptr);
4096 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4097 RRETURN(MATCH_NOMATCH);
4098 }
4099 break;
4100
4101 case PT_ALNUM:
4102 for (i = 1; i <= min; i++)
4103 {
4104 int category;
4105 if (eptr >= md->end_subject)
4106 {
4107 SCHECK_PARTIAL();
4108 RRETURN(MATCH_NOMATCH);
4109 }
4110 GETCHARINCTEST(c, eptr);
4111 category = UCD_CATEGORY(c);
4112 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4113 RRETURN(MATCH_NOMATCH);
4114 }
4115 break;
4116
4117 case PT_SPACE: /* Perl space */
4118 for (i = 1; i <= min; i++)
4119 {
4120 if (eptr >= md->end_subject)
4121 {
4122 SCHECK_PARTIAL();
4123 RRETURN(MATCH_NOMATCH);
4124 }
4125 GETCHARINCTEST(c, eptr);
4126 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4127 c == CHAR_FF || c == CHAR_CR)
4128 == prop_fail_result)
4129 RRETURN(MATCH_NOMATCH);
4130 }
4131 break;
4132
4133 case PT_PXSPACE: /* POSIX space */
4134 for (i = 1; i <= min; i++)
4135 {
4136 if (eptr >= md->end_subject)
4137 {
4138 SCHECK_PARTIAL();
4139 RRETURN(MATCH_NOMATCH);
4140 }
4141 GETCHARINCTEST(c, eptr);
4142 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4143 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4144 == prop_fail_result)
4145 RRETURN(MATCH_NOMATCH);
4146 }
4147 break;
4148
4149 case PT_WORD:
4150 for (i = 1; i <= min; i++)
4151 {
4152 int category;
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 RRETURN(MATCH_NOMATCH);
4157 }
4158 GETCHARINCTEST(c, eptr);
4159 category = UCD_CATEGORY(c);
4160 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4161 == prop_fail_result)
4162 RRETURN(MATCH_NOMATCH);
4163 }
4164 break;
4165
4166 /* This should not occur */
4167
4168 default:
4169 RRETURN(PCRE_ERROR_INTERNAL);
4170 }
4171 }
4172
4173 /* Match extended Unicode sequences. We will get here only if the
4174 support is in the binary; otherwise a compile-time error occurs. */
4175
4176 else if (ctype == OP_EXTUNI)
4177 {
4178 for (i = 1; i <= min; i++)
4179 {
4180 if (eptr >= md->end_subject)
4181 {
4182 SCHECK_PARTIAL();
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 else
4186 {
4187 int lgb, rgb;
4188 GETCHARINCTEST(c, eptr);
4189 lgb = UCD_GRAPHBREAK(c);
4190 while (eptr < md->end_subject)
4191 {
4192 int len = 1;
4193 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4194 rgb = UCD_GRAPHBREAK(c);
4195 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4196 lgb = rgb;
4197 eptr += len;
4198 }
4199 }
4200 CHECK_PARTIAL();
4201 }
4202 }
4203
4204 else
4205 #endif /* SUPPORT_UCP */
4206
4207 /* Handle all other cases when the coding is UTF-8 */
4208
4209 #ifdef SUPPORT_UTF
4210 if (utf) switch(ctype)
4211 {
4212 case OP_ANY:
4213 for (i = 1; i <= min; i++)
4214 {
4215 if (eptr >= md->end_subject)
4216 {
4217 SCHECK_PARTIAL();
4218 RRETURN(MATCH_NOMATCH);
4219 }
4220 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4221 if (md->partial != 0 &&
4222 eptr + 1 >= md->end_subject &&
4223 NLBLOCK->nltype == NLTYPE_FIXED &&
4224 NLBLOCK->nllen == 2 &&
4225 *eptr == NLBLOCK->nl[0])
4226 {
4227 md->hitend = TRUE;
4228 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4229 }
4230 eptr++;
4231 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4232 }
4233 break;
4234
4235 case OP_ALLANY:
4236 for (i = 1; i <= min; i++)
4237 {
4238 if (eptr >= md->end_subject)
4239 {
4240 SCHECK_PARTIAL();
4241 RRETURN(MATCH_NOMATCH);
4242 }
4243 eptr++;
4244 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4245 }
4246 break;
4247
4248 case OP_ANYBYTE:
4249 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4250 eptr += min;
4251 break;
4252
4253 case OP_ANYNL:
4254 for (i = 1; i <= min; i++)
4255 {
4256 if (eptr >= md->end_subject)
4257 {
4258 SCHECK_PARTIAL();
4259 RRETURN(MATCH_NOMATCH);
4260 }
4261 GETCHARINC(c, eptr);
4262 switch(c)
4263 {
4264 default: RRETURN(MATCH_NOMATCH);
4265
4266 case CHAR_CR:
4267 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4268 break;
4269
4270 case CHAR_LF:
4271 break;
4272
4273 case CHAR_VT:
4274 case CHAR_FF:
4275 case CHAR_NEL:
4276 #ifndef EBCDIC
4277 case 0x2028:
4278 case 0x2029:
4279 #endif /* Not EBCDIC */
4280 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4281 break;
4282 }
4283 }
4284 break;
4285
4286 case OP_NOT_HSPACE:
4287 for (i = 1; i <= min; i++)
4288 {
4289 if (eptr >= md->end_subject)
4290 {
4291 SCHECK_PARTIAL();
4292 RRETURN(MATCH_NOMATCH);
4293 }
4294 GETCHARINC(c, eptr);
4295 switch(c)
4296 {
4297 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4298 default: break;
4299 }
4300 }
4301 break;
4302
4303 case OP_HSPACE:
4304 for (i = 1; i <= min; i++)
4305 {
4306 if (eptr >= md->end_subject)
4307 {
4308 SCHECK_PARTIAL();
4309 RRETURN(MATCH_NOMATCH);
4310 }
4311 GETCHARINC(c, eptr);
4312 switch(c)
4313 {
4314 HSPACE_CASES: break; /* Byte and multibyte cases */
4315 default: RRETURN(MATCH_NOMATCH);
4316 }
4317 }
4318 break;
4319
4320 case OP_NOT_VSPACE:
4321 for (i = 1; i <= min; i++)
4322 {
4323 if (eptr >= md->end_subject)
4324 {
4325 SCHECK_PARTIAL();
4326 RRETURN(MATCH_NOMATCH);
4327 }
4328 GETCHARINC(c, eptr);
4329 switch(c)
4330 {
4331 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4332 default: break;
4333 }
4334 }
4335 break;
4336
4337 case OP_VSPACE:
4338 for (i = 1; i <= min; i++)
4339 {
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 RRETURN(MATCH_NOMATCH);
4344 }
4345 GETCHARINC(c, eptr);
4346 switch(c)
4347 {
4348 VSPACE_CASES: break;
4349 default: RRETURN(MATCH_NOMATCH);
4350 }
4351 }
4352 break;
4353
4354 case OP_NOT_DIGIT:
4355 for (i = 1; i <= min; i++)
4356 {
4357 if (eptr >= md->end_subject)
4358 {
4359 SCHECK_PARTIAL();
4360 RRETURN(MATCH_NOMATCH);
4361 }
4362 GETCHARINC(c, eptr);
4363 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4364 RRETURN(MATCH_NOMATCH);
4365 }
4366 break;
4367
4368 case OP_DIGIT:
4369 for (i = 1; i <= min; i++)
4370 {
4371 if (eptr >= md->end_subject)
4372 {
4373 SCHECK_PARTIAL();
4374 RRETURN(MATCH_NOMATCH);
4375 }
4376 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4377 RRETURN(MATCH_NOMATCH);
4378 eptr++;
4379 /* No need to skip more bytes - we know it's a 1-byte character */
4380 }
4381 break;
4382
4383 case OP_NOT_WHITESPACE:
4384 for (i = 1; i <= min; i++)
4385 {
4386 if (eptr >= md->end_subject)
4387 {
4388 SCHECK_PARTIAL();
4389 RRETURN(MATCH_NOMATCH);
4390 }
4391 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4392 RRETURN(MATCH_NOMATCH);
4393 eptr++;
4394 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4395 }
4396 break;
4397
4398 case OP_WHITESPACE:
4399 for (i = 1; i <= min; i++)
4400 {
4401 if (eptr >= md->end_subject)
4402 {
4403 SCHECK_PARTIAL();
4404 RRETURN(MATCH_NOMATCH);
4405 }
4406 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4407 RRETURN(MATCH_NOMATCH);
4408 eptr++;
4409 /* No need to skip more bytes - we know it's a 1-byte character */
4410 }
4411 break;
4412
4413 case OP_NOT_WORDCHAR:
4414 for (i = 1; i <= min; i++)
4415 {
4416 if (eptr >= md->end_subject)
4417 {
4418 SCHECK_PARTIAL();
4419 RRETURN(MATCH_NOMATCH);
4420 }
4421 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4422 RRETURN(MATCH_NOMATCH);
4423 eptr++;
4424 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4425 }
4426 break;
4427
4428 case OP_WORDCHAR:
4429 for (i = 1; i <= min; i++)
4430 {
4431 if (eptr >= md->end_subject)
4432 {
4433 SCHECK_PARTIAL();
4434 RRETURN(MATCH_NOMATCH);
4435 }
4436 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4437 RRETURN(MATCH_NOMATCH);
4438 eptr++;
4439 /* No need to skip more bytes - we know it's a 1-byte character */
4440 }
4441 break;
4442
4443 default:
4444 RRETURN(PCRE_ERROR_INTERNAL);
4445 } /* End switch(ctype) */
4446
4447 else
4448 #endif /* SUPPORT_UTF */
4449
4450 /* Code for the non-UTF-8 case for minimum matching of operators other
4451 than OP_PROP and OP_NOTPROP. */
4452
4453 switch(ctype)
4454 {
4455 case OP_ANY:
4456 for (i = 1; i <= min; i++)
4457 {
4458 if (eptr >= md->end_subject)
4459 {
4460 SCHECK_PARTIAL();
4461 RRETURN(MATCH_NOMATCH);
4462 }
4463 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4464 if (md->partial != 0 &&
4465 eptr + 1 >= md->end_subject &&
4466 NLBLOCK->nltype == NLTYPE_FIXED &&
4467 NLBLOCK->nllen == 2 &&
4468 *eptr == NLBLOCK->nl[0])
4469 {
4470 md->hitend = TRUE;
4471 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4472 }
4473 eptr++;
4474 }
4475 break;
4476
4477 case OP_ALLANY:
4478 if (eptr > md->end_subject - min)
4479 {
4480 SCHECK_PARTIAL();
4481 RRETURN(MATCH_NOMATCH);
4482 }
4483 eptr += min;
4484 break;
4485
4486 case OP_ANYBYTE:
4487 if (eptr > md->end_subject - min)
4488 {
4489 SCHECK_PARTIAL();
4490 RRETURN(MATCH_NOMATCH);
4491 }
4492 eptr += min;
4493 break;
4494
4495 case OP_ANYNL:
4496 for (i = 1; i <= min; i++)
4497 {
4498 if (eptr >= md->end_subject)
4499 {
4500 SCHECK_PARTIAL();
4501 RRETURN(MATCH_NOMATCH);
4502 }
4503 switch(*eptr++)
4504 {
4505 default: RRETURN(MATCH_NOMATCH);
4506
4507 case CHAR_CR:
4508 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4509 break;
4510
4511 case CHAR_LF:
4512 break;
4513
4514 case CHAR_VT:
4515 case CHAR_FF:
4516 case CHAR_NEL:
4517 #ifdef COMPILE_PCRE16
4518 case 0x2028:
4519 case 0x2029:
4520 #endif
4521 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4522 break;
4523 }
4524 }
4525 break;
4526
4527 case OP_NOT_HSPACE:
4528 for (i = 1; i <= min; i++)
4529 {
4530 if (eptr >= md->end_subject)
4531 {
4532 SCHECK_PARTIAL();
4533 RRETURN(MATCH_NOMATCH);
4534 }
4535 switch(*eptr++)
4536 {
4537 default: break;
4538 HSPACE_BYTE_CASES:
4539 #ifdef COMPILE_PCRE16
4540 HSPACE_MULTIBYTE_CASES:
4541 #endif
4542 RRETURN(MATCH_NOMATCH);
4543 }
4544 }
4545 break;
4546
4547 case OP_HSPACE:
4548 for (i = 1; i <= min; i++)
4549 {
4550 if (eptr >= md->end_subject)
4551 {
4552 SCHECK_PARTIAL();
4553 RRETURN(MATCH_NOMATCH);
4554 }
4555 switch(*eptr++)
4556 {
4557 default: RRETURN(MATCH_NOMATCH);
4558 HSPACE_BYTE_CASES:
4559 #ifdef COMPILE_PCRE16
4560 HSPACE_MULTIBYTE_CASES:
4561 #endif
4562 break;
4563 }
4564 }
4565 break;
4566
4567 case OP_NOT_VSPACE:
4568 for (i = 1; i <= min; i++)
4569 {
4570 if (eptr >= md->end_subject)
4571 {
4572 SCHECK_PARTIAL();
4573 RRETURN(MATCH_NOMATCH);
4574 }
4575 switch(*eptr++)
4576 {
4577 VSPACE_BYTE_CASES:
4578 #ifdef COMPILE_PCRE16
4579 VSPACE_MULTIBYTE_CASES:
4580 #endif
4581 RRETURN(MATCH_NOMATCH);
4582 default: break;
4583 }
4584 }
4585 break;
4586
4587 case OP_VSPACE:
4588 for (i = 1; i <= min; i++)
4589 {
4590 if (eptr >= md->end_subject)
4591 {
4592 SCHECK_PARTIAL();
4593 RRETURN(MATCH_NOMATCH);
4594 }
4595 switch(*eptr++)
4596 {
4597 default: RRETURN(MATCH_NOMATCH);
4598 VSPACE_BYTE_CASES:
4599 #ifdef COMPILE_PCRE16
4600 VSPACE_MULTIBYTE_CASES:
4601 #endif
4602 break;
4603 }
4604 }
4605 break;
4606
4607 case OP_NOT_DIGIT:
4608 for (i = 1; i <= min; i++)
4609 {
4610 if (eptr >= md->end_subject)
4611 {
4612 SCHECK_PARTIAL();
4613 RRETURN(MATCH_NOMATCH);
4614 }
4615 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4616 RRETURN(MATCH_NOMATCH);
4617 eptr++;
4618 }
4619 break;
4620
4621 case OP_DIGIT:
4622 for (i = 1; i <= min; i++)
4623 {
4624 if (eptr >= md->end_subject)
4625 {
4626 SCHECK_PARTIAL();
4627 RRETURN(MATCH_NOMATCH);
4628 }
4629 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4630 RRETURN(MATCH_NOMATCH);
4631 eptr++;
4632 }
4633 break;
4634
4635 case OP_NOT_WHITESPACE:
4636 for (i = 1; i <= min; i++)
4637 {
4638 if (eptr >= md->end_subject)
4639 {
4640 SCHECK_PARTIAL();
4641 RRETURN(MATCH_NOMATCH);
4642 }
4643 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4644 RRETURN(MATCH_NOMATCH);
4645 eptr++;
4646 }
4647 break;
4648
4649 case OP_WHITESPACE:
4650 for (i = 1; i <= min; i++)
4651 {
4652 if (eptr >= md->end_subject)
4653 {
4654 SCHECK_PARTIAL();
4655 RRETURN(MATCH_NOMATCH);
4656 }
4657 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4658 RRETURN(MATCH_NOMATCH);
4659 eptr++;
4660 }
4661 break;
4662
4663 case OP_NOT_WORDCHAR:
4664 for (i = 1; i <= min; i++)
4665 {
4666 if (eptr >= md->end_subject)
4667 {
4668 SCHECK_PARTIAL();
4669 RRETURN(MATCH_NOMATCH);
4670 }
4671 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4672 RRETURN(MATCH_NOMATCH);
4673 eptr++;
4674 }
4675 break;
4676
4677 case OP_WORDCHAR:
4678 for (i = 1; i <= min; i++)
4679 {
4680 if (eptr >= md->end_subject)
4681 {
4682 SCHECK_PARTIAL();
4683 RRETURN(MATCH_NOMATCH);
4684 }
4685 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4686 RRETURN(MATCH_NOMATCH);
4687 eptr++;
4688 }
4689 break;
4690
4691 default:
4692 RRETURN(PCRE_ERROR_INTERNAL);
4693 }
4694 }
4695
4696 /* If min = max, continue at the same level without recursing */
4697
4698 if (min == max) continue;
4699
4700 /* If minimizing, we have to test the rest of the pattern before each
4701 subsequent match. Again, separate the UTF-8 case for speed, and also
4702 separate the UCP cases. */
4703
4704 if (minimize)
4705 {
4706 #ifdef SUPPORT_UCP
4707 if (prop_type >= 0)
4708 {
4709 switch(prop_type)
4710 {
4711 case PT_ANY:
4712 for (fi = min;; fi++)
4713 {
4714 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4715 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4716 if (fi >= max) RRETURN(MATCH_NOMATCH);
4717 if (eptr >= md->end_subject)
4718 {
4719 SCHECK_PARTIAL();
4720 RRETURN(MATCH_NOMATCH);
4721 }
4722 GETCHARINCTEST(c, eptr);
4723 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4724 }
4725 /* Control never gets here */
4726
4727 case PT_LAMP:
4728 for (fi = min;; fi++)
4729 {
4730 int chartype;
4731 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4733 if (fi >= max) RRETURN(MATCH_NOMATCH);
4734 if (eptr >= md->end_subject)
4735 {
4736 SCHECK_PARTIAL();
4737 RRETURN(MATCH_NOMATCH);
4738 }
4739 GETCHARINCTEST(c, eptr);
4740 chartype = UCD_CHARTYPE(c);
4741 if ((chartype == ucp_Lu ||
4742 chartype == ucp_Ll ||
4743 chartype == ucp_Lt) == prop_fail_result)
4744 RRETURN(MATCH_NOMATCH);
4745 }
4746 /* Control never gets here */
4747
4748 case PT_GC:
4749 for (fi = min;; fi++)
4750 {
4751 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4753 if (fi >= max) RRETURN(MATCH_NOMATCH);
4754 if (eptr >= md->end_subject)
4755 {
4756 SCHECK_PARTIAL();
4757 RRETURN(MATCH_NOMATCH);
4758 }
4759 GETCHARINCTEST(c, eptr);
4760 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4761 RRETURN(MATCH_NOMATCH);
4762 }
4763 /* Control never gets here */
4764
4765 case PT_PC:
4766 for (fi = min;; fi++)
4767 {
4768 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4769 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4770 if (fi >= max) RRETURN(MATCH_NOMATCH);
4771 if (eptr >= md->end_subject)
4772 {
4773 SCHECK_PARTIAL();
4774 RRETURN(MATCH_NOMATCH);
4775 }
4776 GETCHARINCTEST(c, eptr);
4777 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4778 RRETURN(MATCH_NOMATCH);
4779 }
4780 /* Control never gets here */
4781
4782 case PT_SC:
4783 for (fi = min;; fi++)
4784 {
4785 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4787 if (fi >= max) RRETURN(MATCH_NOMATCH);
4788 if (eptr >= md->end_subject)
4789 {
4790 SCHECK_PARTIAL();
4791 RRETURN(MATCH_NOMATCH);
4792 }
4793 GETCHARINCTEST(c, eptr);
4794 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4795 RRETURN(MATCH_NOMATCH);
4796 }
4797 /* Control never gets here */
4798
4799 case PT_ALNUM:
4800 for (fi = min;; fi++)
4801 {
4802 int category;
4803 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4805 if (fi >= max) RRETURN(MATCH_NOMATCH);
4806 if (eptr >= md->end_subject)
4807 {
4808 SCHECK_PARTIAL();
4809 RRETURN(MATCH_NOMATCH);
4810 }
4811 GETCHARINCTEST(c, eptr);
4812 category = UCD_CATEGORY(c);
4813 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4814 RRETURN(MATCH_NOMATCH);
4815 }
4816 /* Control never gets here */
4817
4818 case PT_SPACE: /* Perl space */
4819 for (fi = min;; fi++)
4820 {
4821 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4823 if (fi >= max) RRETURN(MATCH_NOMATCH);
4824 if (eptr >= md->end_subject)
4825 {
4826 SCHECK_PARTIAL();
4827 RRETURN(MATCH_NOMATCH);
4828 }
4829 GETCHARINCTEST(c, eptr);
4830 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4831 c == CHAR_FF || c == CHAR_CR)
4832 == prop_fail_result)
4833 RRETURN(MATCH_NOMATCH);
4834 }
4835 /* Control never gets here */
4836
4837 case PT_PXSPACE: /* POSIX space */
4838 for (fi = min;; fi++)
4839 {
4840 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4842 if (fi >= max) RRETURN(MATCH_NOMATCH);
4843 if (eptr >= md->end_subject)
4844 {
4845 SCHECK_PARTIAL();
4846 RRETURN(MATCH_NOMATCH);
4847 }
4848 GETCHARINCTEST(c, eptr);
4849 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4850 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4851 == prop_fail_result)
4852 RRETURN(MATCH_NOMATCH);
4853 }
4854 /* Control never gets here */
4855
4856 case PT_WORD:
4857 for (fi = min;; fi++)
4858 {
4859 int category;
4860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4862 if (fi >= max) RRETURN(MATCH_NOMATCH);
4863 if (eptr >= md->end_subject)
4864 {
4865 SCHECK_PARTIAL();
4866 RRETURN(MATCH_NOMATCH);
4867 }
4868 GETCHARINCTEST(c, eptr);
4869 category = UCD_CATEGORY(c);
4870 if ((category == ucp_L ||
4871 category == ucp_N ||
4872 c == CHAR_UNDERSCORE)
4873 == prop_fail_result)
4874 RRETURN(MATCH_NOMATCH);
4875 }
4876 /* Control never gets here */
4877
4878 /* This should never occur */
4879
4880 default:
4881 RRETURN(PCRE_ERROR_INTERNAL);
4882 }
4883 }
4884
4885 /* Match extended Unicode sequences. We will get here only if the
4886 support is in the binary; otherwise a compile-time error occurs. */
4887
4888 else if (ctype == OP_EXTUNI)
4889 {
4890 for (fi = min;; fi++)
4891 {
4892 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4894 if (fi >= max) RRETURN(MATCH_NOMATCH);
4895 if (eptr >= md->end_subject)
4896 {
4897 SCHECK_PARTIAL();
4898 RRETURN(MATCH_NOMATCH);
4899 }
4900 else
4901 {
4902 int lgb, rgb;
4903 GETCHARINCTEST(c, eptr);
4904 lgb = UCD_GRAPHBREAK(c);
4905 while (eptr < md->end_subject)
4906 {
4907 int len = 1;
4908 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4909 rgb = UCD_GRAPHBREAK(c);
4910 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4911 lgb = rgb;
4912 eptr += len;
4913 }
4914 }
4915 CHECK_PARTIAL();
4916 }
4917 }
4918 else
4919 #endif /* SUPPORT_UCP */
4920
4921 #ifdef SUPPORT_UTF
4922 if (utf)
4923 {
4924 for (fi = min;; fi++)
4925 {
4926 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4928 if (fi >= max) RRETURN(MATCH_NOMATCH);
4929 if (eptr >= md->end_subject)
4930 {
4931 SCHECK_PARTIAL();
4932 RRETURN(MATCH_NOMATCH);
4933 }
4934 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4935 RRETURN(MATCH_NOMATCH);
4936 GETCHARINC(c, eptr);
4937 switch(ctype)
4938 {
4939 case OP_ANY: /* This is the non-NL case */
4940 if (md->partial != 0 && /* Take care with CRLF partial */
4941 eptr >= md->end_subject &&
4942 NLBLOCK->nltype == NLTYPE_FIXED &&
4943 NLBLOCK->nllen == 2 &&
4944 c == NLBLOCK->nl[0])
4945 {
4946 md->hitend = TRUE;
4947 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4948 }
4949 break;
4950
4951 case OP_ALLANY:
4952 case OP_ANYBYTE:
4953 break;
4954
4955 case OP_ANYNL:
4956 switch(c)
4957 {
4958 default: RRETURN(MATCH_NOMATCH);
4959 case CHAR_CR:
4960 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4961 break;
4962
4963 case CHAR_LF:
4964 break;
4965
4966 case CHAR_VT:
4967 case CHAR_FF:
4968 case CHAR_NEL:
4969 #ifndef EBCDIC
4970 case 0x2028:
4971 case 0x2029:
4972 #endif /* Not EBCDIC */
4973 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4974 break;
4975 }
4976 break;
4977
4978 case OP_NOT_HSPACE:
4979 switch(c)
4980 {
4981 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
4982 default: break;
4983 }
4984 break;
4985
4986 case OP_HSPACE:
4987 switch(c)
4988 {
4989 HSPACE_CASES: break;
4990 default: RRETURN(MATCH_NOMATCH);
4991 }
4992 break;
4993
4994 case OP_NOT_VSPACE:
4995 switch(c)
4996 {
4997 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4998 default: break;
4999 }
5000 break;
5001
5002 case OP_VSPACE:
5003 switch(c)
5004 {
5005 VSPACE_CASES: break;
5006 default: RRETURN(MATCH_NOMATCH);
5007 }
5008 break;
5009
5010 case OP_NOT_DIGIT:
5011 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5012 RRETURN(MATCH_NOMATCH);
5013 break;
5014
5015 case OP_DIGIT:
5016 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5017 RRETURN(MATCH_NOMATCH);
5018 break;
5019
5020 case OP_NOT_WHITESPACE:
5021 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5022 RRETURN(MATCH_NOMATCH);
5023 break;
5024
5025 case OP_WHITESPACE:
5026 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5027 RRETURN(MATCH_NOMATCH);
5028 break;
5029
5030 case OP_NOT_WORDCHAR:
5031 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5032 RRETURN(MATCH_NOMATCH);
5033 break;
5034
5035 case OP_WORDCHAR:
5036 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5037 RRETURN(MATCH_NOMATCH);
5038 break;
5039
5040 default:
5041 RRETURN(PCRE_ERROR_INTERNAL);
5042 }
5043 }
5044 }
5045 else
5046 #endif
5047 /* Not UTF mode */
5048 {
5049 for (fi = min;; fi++)
5050 {
5051 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5052 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5053 if (fi >= max) RRETURN(MATCH_NOMATCH);
5054 if (eptr >= md->end_subject)
5055 {
5056 SCHECK_PARTIAL();
5057 RRETURN(MATCH_NOMATCH);
5058 }
5059 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5060 RRETURN(MATCH_NOMATCH);
5061 c = *eptr++;
5062 switch(ctype)
5063 {
5064 case OP_ANY: /* This is the non-NL case */
5065 if (md->partial != 0 && /* Take care with CRLF partial */
5066 eptr >= md->end_subject &&
5067 NLBLOCK->nltype == NLTYPE_FIXED &&
5068 NLBLOCK->nllen == 2 &&
5069 c == NLBLOCK->nl[0])
5070 {
5071 md->hitend = TRUE;
5072 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5073 }
5074 break;
5075
5076 case OP_ALLANY:
5077 case OP_ANYBYTE:
5078 break;
5079
5080 case OP_ANYNL:
5081 switch(c)
5082 {
5083 default: RRETURN(MATCH_NOMATCH);
5084 case CHAR_CR:
5085 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5086 break;
5087
5088 case CHAR_LF:
5089 break;
5090
5091 case CHAR_VT:
5092 case CHAR_FF:
5093 case CHAR_NEL:
5094 #ifdef COMPILE_PCRE16
5095 case 0x2028:
5096 case 0x2029:
5097 #endif
5098 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5099 break;
5100 }
5101 break;
5102
5103 case OP_NOT_HSPACE:
5104 switch(c)
5105 {
5106 default: break;
5107 HSPACE_BYTE_CASES:
5108 #ifdef COMPILE_PCRE16
5109 HSPACE_MULTIBYTE_CASES:
5110 #endif
5111 RRETURN(MATCH_NOMATCH);
5112 }
5113 break;
5114
5115 case OP_HSPACE:
5116 switch(c)
5117 {
5118 default: RRETURN(MATCH_NOMATCH);
5119 HSPACE_BYTE_CASES:
5120 #ifdef COMPILE_PCRE16
5121 HSPACE_MULTIBYTE_CASES:
5122 #endif
5123 break;
5124 }
5125 break;
5126
5127 case OP_NOT_VSPACE:
5128 switch(c)
5129 {
5130 default: break;
5131 VSPACE_BYTE_CASES:
5132 #ifdef COMPILE_PCRE16
5133 VSPACE_MULTIBYTE_CASES:
5134 #endif
5135 RRETURN(MATCH_NOMATCH);
5136 }
5137 break;
5138
5139 case OP_VSPACE:
5140 switch(c)
5141 {
5142 default: RRETURN(MATCH_NOMATCH);
5143 VSPACE_BYTE_CASES:
5144 #ifdef COMPILE_PCRE16
5145 VSPACE_MULTIBYTE_CASES:
5146 #endif
5147 break;
5148 }
5149 break;
5150
5151 case OP_NOT_DIGIT:
5152 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5153 break;
5154
5155 case OP_DIGIT:
5156 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5157 break;
5158
5159 case OP_NOT_WHITESPACE:
5160 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5161 break;
5162
5163 case OP_WHITESPACE:
5164 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5165 break;
5166
5167 case OP_NOT_WORDCHAR:
5168 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5169 break;
5170
5171 case OP_WORDCHAR:
5172 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5173 break;
5174
5175 default:
5176 RRETURN(PCRE_ERROR_INTERNAL);
5177 }
5178 }
5179 }
5180 /* Control never gets here */
5181 }
5182
5183 /* If maximizing, it is worth using inline code for speed, doing the type
5184 test once at the start (i.e. keep it out of the loop). Again, keep the
5185 UTF-8 and UCP stuff separate. */
5186
5187 else
5188 {
5189 pp = eptr; /* Remember where we started */
5190
5191 #ifdef SUPPORT_UCP
5192 if (prop_type >= 0)
5193 {
5194 switch(prop_type)
5195 {
5196 case PT_ANY:
5197 for (i = min; i < max; i++)
5198 {
5199 int len = 1;
5200 if (eptr >= md->end_subject)
5201 {
5202 SCHECK_PARTIAL();
5203 break;
5204 }
5205 GETCHARLENTEST(c, eptr, len);
5206 if (prop_fail_result) break;
5207 eptr+= len;
5208 }
5209 break;
5210
5211 case PT_LAMP:
5212 for (i = min; i < max; i++)
5213 {
5214 int chartype;
5215 int len = 1;
5216 if (eptr >= md->end_subject)
5217 {
5218 SCHECK_PARTIAL();
5219 break;
5220 }
5221 GETCHARLENTEST(c, eptr, len);
5222 chartype = UCD_CHARTYPE(c);
5223 if ((chartype == ucp_Lu ||
5224 chartype == ucp_Ll ||
5225 chartype == ucp_Lt) == prop_fail_result)
5226 break;
5227 eptr+= len;
5228 }
5229 break;
5230
5231 case PT_GC:
5232 for (i = min; i < max; i++)
5233 {
5234 int len = 1;
5235 if (eptr >= md->end_subject)
5236 {
5237 SCHECK_PARTIAL();
5238 break;
5239 }
5240 GETCHARLENTEST(c, eptr, len);
5241 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5242 eptr+= len;
5243 }
5244 break;
5245
5246 case PT_PC:
5247 for (i = min; i < max; i++)
5248 {
5249 int len = 1;
5250 if (eptr >= md->end_subject)
5251 {
5252 SCHECK_PARTIAL();
5253 break;
5254 }
5255 GETCHARLENTEST(c, eptr, len);
5256 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5257 eptr+= len;
5258 }
5259 break;
5260
5261 case PT_SC:
5262 for (i = min; i < max; i++)
5263 {
5264 int len = 1;
5265 if (eptr >= md->end_subject)
5266 {
5267 SCHECK_PARTIAL();
5268 break;
5269 }
5270 GETCHARLENTEST(c, eptr, len);
5271 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5272 eptr+= len;
5273 }
5274 break;
5275
5276 case PT_ALNUM:
5277 for (i = min; i < max; i++)
5278 {
5279 int category;
5280 int len = 1;
5281 if (eptr >= md->end_subject)
5282 {
5283 SCHECK_PARTIAL();
5284 break;
5285 }
5286 GETCHARLENTEST(c, eptr, len);
5287 category = UCD_CATEGORY(c);
5288 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5289 break;
5290 eptr+= len;
5291 }
5292 break;
5293
5294 case PT_SPACE: /* Perl space */
5295 for (i = min; i < max; i++)
5296 {
5297 int len = 1;
5298 if (eptr >= md->end_subject)
5299 {
5300 SCHECK_PARTIAL();
5301 break;
5302 }
5303 GETCHARLENTEST(c, eptr, len);
5304 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5305 c == CHAR_FF || c == CHAR_CR)
5306 == prop_fail_result)
5307 break;
5308 eptr+= len;
5309 }
5310 break;
5311
5312 case PT_PXSPACE: /* POSIX space */
5313 for (i = min; i < max; i++)
5314 {
5315 int len = 1;
5316 if (eptr >= md->end_subject)
5317 {
5318 SCHECK_PARTIAL();
5319 break;
5320 }
5321 GETCHARLENTEST(c, eptr, len);
5322 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5323 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5324 == prop_fail_result)
5325 break;
5326 eptr+= len;
5327 }
5328 break;
5329
5330 case PT_WORD:
5331 for (i = min; i < max; i++)
5332 {
5333 int category;
5334 int len = 1;
5335 if (eptr >= md->end_subject)
5336 {
5337 SCHECK_PARTIAL();
5338 break;
5339 }
5340 GETCHARLENTEST(c, eptr, len);
5341 category = UCD_CATEGORY(c);
5342 if ((category == ucp_L || category == ucp_N ||
5343 c == CHAR_UNDERSCORE) == prop_fail_result)
5344 break;
5345 eptr+= len;
5346 }
5347 break;
5348
5349 default:
5350 RRETURN(PCRE_ERROR_INTERNAL);
5351 }
5352
5353 /* eptr is now past the end of the maximum run */
5354
5355 if (possessive) continue;
5356 for(;;)
5357 {
5358 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5360 if (eptr-- == pp) break; /* Stop if tried at original pos */
5361 if (utf) BACKCHAR(eptr);
5362 }
5363 }
5364
5365 /* Match extended Unicode sequences. We will get here only if the
5366 support is in the binary; otherwise a compile-time error occurs. */
5367
5368 else if (ctype == OP_EXTUNI)
5369 {
5370 for (i = min; i < max; i++)
5371 {
5372 if (eptr >= md->end_subject)
5373 {
5374 SCHECK_PARTIAL();
5375 break;
5376 }
5377 else
5378 {
5379 int lgb, rgb;
5380 GETCHARINCTEST(c, eptr);
5381 lgb = UCD_GRAPHBREAK(c);
5382 while (eptr < md->end_subject)
5383 {
5384 int len = 1;
5385 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5386 rgb = UCD_GRAPHBREAK(c);
5387 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5388 lgb = rgb;
5389 eptr += len;
5390 }
5391 }
5392 CHECK_PARTIAL();
5393 }
5394
5395 /* eptr is now past the end of the maximum run */
5396
5397 if (possessive) continue;
5398
5399 for(;;)
5400 {
5401 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5403 if (eptr-- == pp) break; /* Stop if tried at original pos */
5404 for (;;) /* Move back over one extended */
5405 {
5406 if (!utf) c = *eptr; else
5407 {
5408 BACKCHAR(eptr);
5409 GETCHAR(c, eptr);
5410 }
5411 if (UCD_CATEGORY(c) != ucp_M) break;
5412 eptr--;
5413 }
5414 }
5415 }
5416
5417 else
5418 #endif /* SUPPORT_UCP */
5419
5420 #ifdef SUPPORT_UTF
5421 if (utf)
5422 {
5423 switch(ctype)
5424 {
5425 case OP_ANY:
5426 if (max < INT_MAX)
5427 {
5428 for (i = min; i < max; i++)
5429 {
5430 if (eptr >= md->end_subject)
5431 {
5432 SCHECK_PARTIAL();
5433 break;
5434 }
5435 if (IS_NEWLINE(eptr)) break;
5436 if (md->partial != 0 && /* Take care with CRLF partial */
5437 eptr + 1 >= md->end_subject &&
5438 NLBLOCK->nltype == NLTYPE_FIXED &&
5439 NLBLOCK->nllen == 2 &&
5440 *eptr == NLBLOCK->nl[0])
5441 {
5442 md->hitend = TRUE;
5443 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5444 }
5445 eptr++;
5446 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5447 }
5448 }
5449
5450 /* Handle unlimited UTF-8 repeat */
5451
5452 else
5453 {
5454 for (i = min; i < max; i++)
5455 {
5456 if (eptr >= md->end_subject)
5457 {
5458 SCHECK_PARTIAL();
5459 break;
5460 }
5461 if (IS_NEWLINE(eptr)) break;
5462 if (md->partial != 0 && /* Take care with CRLF partial */
5463 eptr + 1 >= md->end_subject &&
5464 NLBLOCK->nltype == NLTYPE_FIXED &&
5465 NLBLOCK->nllen == 2 &&
5466 *eptr == NLBLOCK->nl[0])
5467 {
5468 md->hitend = TRUE;
5469 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5470 }
5471 eptr++;
5472 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5473 }
5474 }
5475 break;
5476
5477 case OP_ALLANY:
5478 if (max < INT_MAX)
5479 {
5480 for (i = min; i < max; i++)
5481 {
5482 if (eptr >= md->end_subject)
5483 {
5484 SCHECK_PARTIAL();
5485 break;
5486 }
5487 eptr++;
5488 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5489 }
5490 }
5491 else
5492 {
5493 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5494 SCHECK_PARTIAL();
5495 }
5496 break;
5497
5498 /* The byte case is the same as non-UTF8 */
5499
5500 case OP_ANYBYTE:
5501 c = max - min;
5502 if (c > (unsigned int)(md->end_subject - eptr))
5503 {
5504 eptr = md->end_subject;
5505 SCHECK_PARTIAL();
5506 }
5507 else eptr += c;
5508 break;
5509
5510 case OP_ANYNL:
5511 for (i = min; i < max; i++)
5512 {
5513 int len = 1;
5514 if (eptr >= md->end_subject)
5515 {
5516 SCHECK_PARTIAL();
5517 break;
5518 }
5519 GETCHARLEN(c, eptr, len);
5520 if (c == CHAR_CR)
5521 {
5522 if (++eptr >= md->end_subject) break;
5523 if (*eptr == CHAR_LF) eptr++;
5524 }
5525 else
5526 {
5527 if (c != CHAR_LF &&
5528 (md->bsr_anycrlf ||
5529 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5530 #ifndef EBCDIC
5531 && c != 0x2028 && c != 0x2029
5532 #endif /* Not EBCDIC */
5533 )))
5534 break;
5535 eptr += len;
5536 }
5537 }
5538 break;
5539
5540 case OP_NOT_HSPACE:
5541 case OP_HSPACE:
5542 for (i = min; i < max; i++)
5543 {
5544 BOOL gotspace;
5545 int len = 1;
5546 if (eptr >= md->end_subject)
5547 {
5548 SCHECK_PARTIAL();
5549 break;
5550 }
5551 GETCHARLEN(c, eptr, len);
5552 switch(c)
5553 {
5554 HSPACE_CASES: gotspace = TRUE; break;
5555 default: gotspace = FALSE; break;
5556 }
5557 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5558 eptr += len;
5559 }
5560 break;
5561
5562 case OP_NOT_VSPACE:
5563 case OP_VSPACE:
5564 for (i = min; i < max; i++)
5565 {
5566 BOOL gotspace;
5567 int len = 1;
5568 if (eptr >= md->end_subject)
5569 {
5570 SCHECK_PARTIAL();
5571 break;
5572 }
5573 GETCHARLEN(c, eptr, len);
5574 switch(c)
5575 {
5576 VSPACE_CASES: gotspace = TRUE; break;
5577 default: gotspace = FALSE; break;
5578 }
5579 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5580 eptr += len;
5581 }
5582 break;
5583
5584 case OP_NOT_DIGIT:
5585 for (i = min; i < max; i++)
5586 {
5587 int len = 1;
5588 if (eptr >= md->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 GETCHARLEN(c, eptr, len);
5594 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5595 eptr+= len;
5596 }
5597 break;
5598
5599 case OP_DIGIT:
5600 for (i = min; i < max; i++)
5601 {
5602 int len = 1;
5603 if (eptr >= md->end_subject)
5604 {
5605 SCHECK_PARTIAL();
5606 break;
5607 }
5608 GETCHARLEN(c, eptr, len);
5609 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5610 eptr+= len;
5611 }
5612 break;
5613
5614 case OP_NOT_WHITESPACE:
5615 for (i = min; i < max; i++)
5616 {
5617 int len = 1;
5618 if (eptr >= md->end_subject)
5619 {
5620 SCHECK_PARTIAL();
5621 break;
5622 }
5623 GETCHARLEN(c, eptr, len);
5624 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5625 eptr+= len;
5626 }
5627 break;
5628
5629 case OP_WHITESPACE:
5630 for (i = min; i < max; i++)
5631 {
5632 int len = 1;
5633 if (eptr >= md->end_subject)
5634 {
5635 SCHECK_PARTIAL();
5636 break;
5637 }
5638 GETCHARLEN(c, eptr, len);
5639 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5640 eptr+= len;
5641 }
5642 break;
5643
5644 case OP_NOT_WORDCHAR:
5645 for (i = min; i < max; i++)
5646 {
5647 int len = 1;
5648 if (eptr >= md->end_subject)
5649 {
5650 SCHECK_PARTIAL();
5651 break;
5652 }
5653 GETCHARLEN(c, eptr, len);
5654 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5655 eptr+= len;
5656 }
5657 break;
5658
5659 case OP_WORDCHAR:
5660 for (i = min; i < max; i++)
5661 {
5662 int len = 1;
5663 if (eptr >= md->end_subject)
5664 {
5665 SCHECK_PARTIAL();
5666 break;
5667 }
5668 GETCHARLEN(c, eptr, len);
5669 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5670 eptr+= len;
5671 }
5672 break;
5673
5674 default:
5675 RRETURN(PCRE_ERROR_INTERNAL);
5676 }
5677
5678 /* eptr is now past the end of the maximum run. If possessive, we are
5679 done (no backing up). Otherwise, match at this position; anything other
5680 than no match is immediately returned. For nomatch, back up one
5681 character, unless we are matching \R and the last thing matched was
5682 \r\n, in which case, back up two bytes. */
5683
5684 if (possessive) continue;
5685 for(;;)
5686 {
5687 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5689 if (eptr-- == pp) break; /* Stop if tried at original pos */
5690 BACKCHAR(eptr);
5691 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_NL &&
5692 eptr[-1] == CHAR_CR) eptr--;
5693 }
5694 }
5695 else
5696 #endif /* SUPPORT_UTF */
5697 /* Not UTF mode */
5698 {
5699 switch(ctype)
5700 {
5701 case OP_ANY:
5702 for (i = min; i < max; i++)
5703 {
5704 if (eptr >= md->end_subject)
5705 {
5706 SCHECK_PARTIAL();
5707 break;
5708 }
5709 if (IS_NEWLINE(eptr)) break;
5710 if (md->partial != 0 && /* Take care with CRLF partial */
5711 eptr + 1 >= md->end_subject &&
5712 NLBLOCK->nltype == NLTYPE_FIXED &&
5713 NLBLOCK->nllen == 2 &&
5714 *eptr == NLBLOCK->nl[0])
5715 {
5716 md->hitend = TRUE;
5717 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5718 }
5719 eptr++;
5720 }
5721 break;
5722
5723 case OP_ALLANY:
5724 case OP_ANYBYTE:
5725 c = max - min;
5726 if (c > (unsigned int)(md->end_subject - eptr))
5727 {
5728 eptr = md->end_subject;
5729 SCHECK_PARTIAL();
5730 }
5731 else eptr += c;
5732 break;
5733
5734 case OP_ANYNL:
5735 for (i = min; i < max; i++)
5736 {
5737 if (eptr >= md->end_subject)
5738 {
5739 SCHECK_PARTIAL();
5740 break;
5741 }
5742 c = *eptr;
5743 if (c == CHAR_CR)
5744 {
5745 if (++eptr >= md->end_subject) break;
5746 if (*eptr == CHAR_LF) eptr++;
5747 }
5748 else
5749 {
5750 if (c != CHAR_LF && (md->bsr_anycrlf ||
5751 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5752 #ifdef COMPILE_PCRE16
5753 && c != 0x2028 && c != 0x2029
5754 #endif
5755 ))) break;
5756 eptr++;
5757 }
5758 }
5759 break;
5760
5761 case OP_NOT_HSPACE:
5762 for (i = min; i < max; i++)
5763 {
5764 if (eptr >= md->end_subject)
5765 {
5766 SCHECK_PARTIAL();
5767 break;
5768 }
5769 switch(*eptr)
5770 {
5771 default: eptr++; break;
5772 HSPACE_BYTE_CASES:
5773 #ifdef COMPILE_PCRE16
5774 HSPACE_MULTIBYTE_CASES:
5775 #endif
5776 goto ENDLOOP00;
5777 }
5778 }
5779 ENDLOOP00:
5780 break;
5781
5782 case OP_HSPACE:
5783 for (i = min; i < max; i++)
5784 {
5785 if (eptr >= md->end_subject)
5786 {
5787 SCHECK_PARTIAL();
5788 break;
5789 }
5790 switch(*eptr)
5791 {
5792 default: goto ENDLOOP01;
5793 HSPACE_BYTE_CASES:
5794 #ifdef COMPILE_PCRE16
5795 HSPACE_MULTIBYTE_CASES:
5796 #endif
5797 eptr++; break;
5798 }
5799 }
5800 ENDLOOP01:
5801 break;
5802
5803 case OP_NOT_VSPACE:
5804 for (i = min; i < max; i++)
5805 {
5806 if (eptr >= md->end_subject)
5807 {
5808 SCHECK_PARTIAL();
5809 break;
5810 }
5811 switch(*eptr)
5812 {
5813 default: eptr++; break;
5814 VSPACE_BYTE_CASES:
5815 #ifdef COMPILE_PCRE16
5816 VSPACE_MULTIBYTE_CASES:
5817 #endif
5818 goto ENDLOOP02;
5819 }
5820 }
5821 ENDLOOP02:
5822 break;
5823
5824 case OP_VSPACE:
5825 for (i = min; i < max; i++)
5826 {
5827 if (eptr >= md->end_subject)
5828 {
5829 SCHECK_PARTIAL();
5830 break;
5831 }
5832 switch(*eptr)
5833 {
5834 default: goto ENDLOOP03;
5835 VSPACE_BYTE_CASES:
5836 #ifdef COMPILE_PCRE16
5837 VSPACE_MULTIBYTE_CASES:
5838 #endif
5839 eptr++; break;
5840 }
5841 }
5842 ENDLOOP03:
5843 break;
5844
5845 case OP_NOT_DIGIT:
5846 for (i = min; i < max; i++)
5847 {
5848 if (eptr >= md->end_subject)
5849 {
5850 SCHECK_PARTIAL();
5851 break;
5852 }
5853 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5854 eptr++;
5855 }
5856 break;
5857
5858 case OP_DIGIT:
5859 for (i = min; i < max; i++)
5860 {
5861 if (eptr >= md->end_subject)
5862 {
5863 SCHECK_PARTIAL();
5864 break;
5865 }
5866 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5867 eptr++;
5868 }
5869 break;
5870
5871 case OP_NOT_WHITESPACE:
5872 for (i = min; i < max; i++)
5873 {
5874 if (eptr >= md->end_subject)
5875 {
5876 SCHECK_PARTIAL();
5877 break;
5878 }
5879 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5880 eptr++;
5881 }
5882 break;
5883
5884 case OP_WHITESPACE:
5885 for (i = min; i < max; i++)
5886 {
5887 if (eptr >= md->end_subject)
5888 {
5889 SCHECK_PARTIAL();
5890 break;
5891 }
5892 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5893 eptr++;
5894 }
5895 break;
5896
5897 case OP_NOT_WORDCHAR:
5898 for (i = min; i < max; i++)
5899 {
5900 if (eptr >= md->end_subject)
5901 {
5902 SCHECK_PARTIAL();
5903 break;
5904 }
5905 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5906 eptr++;
5907 }
5908 break;
5909
5910 case OP_WORDCHAR:
5911 for (i = min; i < max; i++)
5912 {
5913 if (eptr >= md->end_subject)
5914 {
5915 SCHECK_PARTIAL();
5916 break;
5917 }
5918 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
5919 eptr++;
5920 }
5921 break;
5922
5923 default:
5924 RRETURN(PCRE_ERROR_INTERNAL);
5925 }
5926
5927 /* eptr is now past the end of the maximum run. If possessive, we are
5928 done (no backing up). Otherwise, match at this position; anything other
5929 than no match is immediately returned. For nomatch, back up one
5930 character (byte), unless we are matching \R and the last thing matched
5931 was \r\n, in which case, back up two bytes. */
5932
5933 if (possessive) continue;
5934 while (eptr >= pp)
5935 {
5936 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5938 eptr--;
5939 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
5940 eptr[-1] == CHAR_CR) eptr--;
5941 }
5942 }
5943
5944 /* Get here if we can't make it match with any permitted repetitions */
5945
5946 RRETURN(MATCH_NOMATCH);
5947 }
5948 /* Control never gets here */
5949
5950 /* There's been some horrible disaster. Arrival here can only mean there is
5951 something seriously wrong in the code above or the OP_xxx definitions. */
5952
5953 default:
5954 DPRINTF(("Unknown opcode %d\n", *ecode));
5955 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5956 }
5957
5958 /* Do not stick any code in here without much thought; it is assumed
5959 that "continue" in the code above comes out to here to repeat the main
5960 loop. */
5961
5962 } /* End of main loop */
5963 /* Control never reaches here */
5964
5965
5966 /* When compiling to use the heap rather than the stack for recursive calls to
5967 match(), the RRETURN() macro jumps here. The number that is saved in
5968 frame->Xwhere indicates which label we actually want to return to. */
5969
5970 #ifdef NO_RECURSE
5971 #define LBL(val) case val: goto L_RM##val;
5972 HEAP_RETURN:
5973 switch (frame->Xwhere)
5974 {
5975 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5976 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5977 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5978 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5979 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5980 LBL(65) LBL(66)
5981 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5982 LBL(21)
5983 #endif
5984 #ifdef SUPPORT_UTF
5985 LBL(16) LBL(18) LBL(20)
5986 LBL(22) LBL(23) LBL(28) LBL(30)
5987 LBL(32) LBL(34) LBL(42) LBL(46)
5988 #ifdef SUPPORT_UCP
5989 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5990 LBL(59) LBL(60) LBL(61) LBL(62)
5991 #endif /* SUPPORT_UCP */
5992 #endif /* SUPPORT_UTF */
5993 default:
5994 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5995
5996 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
5997
5998 return PCRE_ERROR_INTERNAL;
5999 }
6000 #undef LBL
6001 #endif /* NO_RECURSE */
6002 }
6003
6004
6005 /***************************************************************************
6006 ****************************************************************************
6007 RECURSION IN THE match() FUNCTION
6008
6009 Undefine all the macros that were defined above to handle this. */
6010
6011 #ifdef NO_RECURSE
6012 #undef eptr
6013 #undef ecode
6014 #undef mstart
6015 #undef offset_top
6016 #undef eptrb
6017 #undef flags
6018
6019 #undef callpat
6020 #undef charptr
6021 #undef data
6022 #undef next
6023 #undef pp
6024 #undef prev
6025 #undef saved_eptr
6026
6027 #undef new_recursive
6028
6029 #undef cur_is_word
6030 #undef condition
6031 #undef prev_is_word
6032
6033 #undef ctype
6034 #undef length
6035 #undef max
6036 #undef min
6037 #undef number
6038 #undef offset
6039 #undef op
6040 #undef save_capture_last
6041 #undef save_offset1
6042 #undef save_offset2
6043 #undef save_offset3
6044 #undef stacksave
6045
6046 #undef newptrb
6047
6048 #endif
6049
6050 /* These two are defined as macros in both cases */
6051
6052 #undef fc
6053 #undef fi
6054
6055 /***************************************************************************
6056 ***************************************************************************/
6057
6058
6059 #ifdef NO_RECURSE
6060 /*************************************************
6061 * Release allocated heap frames *
6062 *************************************************/
6063
6064 /* This function releases all the allocated frames. The base frame is on the
6065 machine stack, and so must not be freed.
6066
6067 Argument: the address of the base frame
6068 Returns: nothing
6069 */
6070
6071 static void
6072 release_match_heapframes (heapframe *frame_base)
6073 {
6074 heapframe *nextframe = frame_base->Xnextframe;
6075 while (nextframe != NULL)
6076 {
6077 heapframe *oldframe = nextframe;
6078 nextframe = nextframe->Xnextframe;
6079 (PUBL(stack_free))(oldframe);
6080 }
6081 }
6082 #endif
6083
6084
6085 /*************************************************
6086 * Execute a Regular Expression *
6087 *************************************************/
6088
6089 /* This function applies a compiled re to a subject string and picks out
6090 portions of the string if it matches. Two elements in the vector are set for
6091 each substring: the offsets to the start and end of the substring.
6092
6093 Arguments:
6094 argument_re points to the compiled expression
6095 extra_data points to extra data or is NULL
6096 subject points to the subject string
6097 length length of subject string (may contain binary zeros)
6098 start_offset where to start in the subject string
6099 options option bits
6100 offsets points to a vector of ints to be filled in with offsets
6101 offsetcount the number of elements in the vector
6102
6103 Returns: > 0 => success; value is the number of elements filled in
6104 = 0 => success, but offsets is not big enough
6105 -1 => failed to match
6106 < -1 => some kind of unexpected problem
6107 */
6108
6109 #ifdef COMPILE_PCRE8
6110 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6111 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6112 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6113 int offsetcount)
6114 #else
6115 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6116 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6117 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6118 int offsetcount)
6119 #endif
6120 {
6121 int rc, ocount, arg_offset_max;
6122 int newline;
6123 BOOL using_temporary_offsets = FALSE;
6124 BOOL anchored;
6125 BOOL startline;
6126 BOOL firstline;
6127 BOOL utf;
6128 BOOL has_first_char = FALSE;
6129 BOOL has_req_char = FALSE;
6130 pcre_uchar first_char = 0;
6131 pcre_uchar first_char2 = 0;
6132 pcre_uchar req_char = 0;
6133 pcre_uchar req_char2 = 0;
6134 match_data match_block;
6135 match_data *md = &match_block;
6136 const pcre_uint8 *tables;
6137 const pcre_uint8 *start_bits = NULL;
6138 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6139 PCRE_PUCHAR end_subject;
6140 PCRE_PUCHAR start_partial = NULL;
6141 PCRE_PUCHAR req_char_ptr = start_match - 1;
6142
6143 const pcre_study_data *study;
6144 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6145
6146 #ifdef NO_RECURSE
6147 heapframe frame_zero;
6148 frame_zero.Xprevframe = NULL; /* Marks the top level */
6149 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6150 md->match_frames_base = &frame_zero;
6151 #endif
6152
6153 /* Check for the special magic call that measures the size of the stack used
6154 per recursive call of match(). Without the funny casting for sizeof, a Windows
6155 compiler gave this error: "unary minus operator applied to unsigned type,
6156 result still unsigned". Hopefully the cast fixes that. */
6157
6158 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6159 start_offset == -999)
6160 #ifdef NO_RECURSE
6161 return -((int)sizeof(heapframe));
6162 #else
6163 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6164 #endif
6165
6166 /* Plausibility checks */
6167
6168 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6169 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6170 return PCRE_ERROR_NULL;
6171 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6172 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6173
6174 /* Check that the first field in the block is the magic number. If it is not,
6175 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6176 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6177 means that the pattern is likely compiled with different endianness. */
6178
6179 if (re->magic_number != MAGIC_NUMBER)
6180 return re->magic_number == REVERSED_MAGIC_NUMBER?
6181 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6182 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6183
6184 /* These two settings are used in the code for checking a UTF-8 string that
6185 follows immediately afterwards. Other values in the md block are used only
6186 during "normal" pcre_exec() processing, not when the JIT support is in use,
6187 so they are set up later. */
6188
6189 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6190 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6191 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6192 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6193
6194 /* Check a UTF-8 string if required. Pass back the character offset and error
6195 code for an invalid string if a results vector is available. */
6196
6197 #ifdef SUPPORT_UTF
6198 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6199 {
6200 int erroroffset;
6201 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6202 if (errorcode != 0)
6203 {
6204 if (offsetcount >= 2)
6205 {
6206 offsets[0] = erroroffset;
6207 offsets[1] = errorcode;
6208 }
6209 #ifdef COMPILE_PCRE16
6210 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6211 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6212 #else
6213 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6214 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6215 #endif
6216 }
6217
6218 /* Check that a start_offset points to the start of a UTF character. */
6219 if (start_offset > 0 && start_offset < length &&
6220 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6221 return PCRE_ERROR_BADUTF8_OFFSET;
6222 }
6223 #endif
6224
6225 /* If the pattern was successfully studied with JIT support, run the JIT
6226 executable instead of the rest of this function. Most options must be set at
6227 compile time for the JIT code to be usable. Fall back to the normal code path if
6228 an unsupported flag is set. */
6229
6230 #ifdef SUPPORT_JIT
6231 if (extra_data != NULL
6232 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6233 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6234 && extra_data->executable_jit != NULL
6235 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6236 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6237 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6238 {
6239 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6240 start_offset, options, offsets, offsetcount);
6241
6242 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6243 mode is not compiled. In this case we simply fall back to the interpreter. */
6244
6245 if (rc != PCRE_ERROR_NULL) return rc;
6246 }
6247 #endif
6248
6249 /* Carry on with non-JIT matching. This information is for finding all the
6250 numbers associated with a given name, for condition testing. */
6251
6252 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6253 md->name_count = re->name_count;
6254 md->name_entry_size = re->name_entry_size;
6255
6256 /* Fish out the optional data from the extra_data structure, first setting
6257 the default values. */
6258
6259 study = NULL;
6260 md->match_limit = MATCH_LIMIT;
6261 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6262 md->callout_data = NULL;
6263
6264 /* The table pointer is always in native byte order. */
6265
6266 tables = re->tables;
6267
6268 if (extra_data != NULL)
6269 {
6270 register unsigned int flags = extra_data->flags;
6271 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6272 study = (const pcre_study_data *)extra_data->study_data;
6273 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6274 md->match_limit = extra_data->match_limit;
6275 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6276 md->match_limit_recursion = extra_data->match_limit_recursion;
6277 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6278 md->callout_data = extra_data->callout_data;
6279 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6280 }
6281
6282 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6283 is a feature that makes it possible to save compiled regex and re-use them
6284 in other programs later. */
6285
6286 if (tables == NULL) tables = PRIV(default_tables);
6287
6288 /* Set up other data */
6289
6290 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6291 startline = (re->flags & PCRE_STARTLINE) != 0;
6292 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6293
6294 /* The code starts after the real_pcre block and the capture name table. */
6295
6296 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6297 re->name_count * re->name_entry_size;
6298
6299 md->start_subject = (PCRE_PUCHAR)subject;
6300 md->start_offset = start_offset;
6301 md->end_subject = md->start_subject + length;
6302 end_subject = md->end_subject;
6303
6304 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6305 md->use_ucp = (re->options & PCRE_UCP) != 0;
6306 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6307 md->ignore_skip_arg = FALSE;
6308
6309 /* Some options are unpacked into BOOL variables in the hope that testing
6310 them will be faster than individual option bits. */
6311
6312 md->notbol = (options & PCRE_NOTBOL) != 0;
6313 md->noteol = (options & PCRE_NOTEOL) != 0;
6314 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6315 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6316
6317 md->hitend = FALSE;
6318 md->mark = md->nomatch_mark = NULL; /* In case never set */
6319
6320 md->recursive = NULL; /* No recursion at top level */
6321 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6322
6323 md->lcc = tables + lcc_offset;
6324 md->fcc = tables + fcc_offset;
6325 md->ctypes = tables + ctypes_offset;
6326
6327 /* Handle different \R options. */
6328
6329 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6330 {
6331 case 0:
6332 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6333 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6334 else
6335 #ifdef BSR_ANYCRLF
6336 md->bsr_anycrlf = TRUE;
6337 #else
6338 md->bsr_anycrlf = FALSE;
6339 #endif
6340 break;
6341
6342 case PCRE_BSR_ANYCRLF:
6343 md->bsr_anycrlf = TRUE;
6344 break;
6345
6346 case PCRE_BSR_UNICODE:
6347 md->bsr_anycrlf = FALSE;
6348 break;
6349
6350 default: return PCRE_ERROR_BADNEWLINE;
6351 }
6352
6353 /* Handle different types of newline. The three bits give eight cases. If
6354 nothing is set at run time, whatever was used at compile time applies. */
6355
6356 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6357 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6358 {
6359 case 0: newline = NEWLINE; break; /* Compile-time default */
6360 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6361 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6362 case PCRE_NEWLINE_CR+
6363 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6364 case PCRE_NEWLINE_ANY: newline = -1; break;
6365 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6366 default: return PCRE_ERROR_BADNEWLINE;
6367 }
6368
6369 if (newline == -2)
6370 {
6371 md->nltype = NLTYPE_ANYCRLF;
6372 }
6373 else if (newline < 0)
6374 {
6375 md->nltype = NLTYPE_ANY;
6376 }
6377 else
6378 {
6379 md->nltype = NLTYPE_FIXED;
6380 if (newline > 255)
6381 {
6382 md->nllen = 2;
6383 md->nl[0] = (newline >> 8) & 255;
6384 md->nl[1] = newline & 255;
6385 }
6386 else
6387 {
6388 md->nllen = 1;
6389 md->nl[0] = newline;
6390 }
6391 }
6392
6393 /* Partial matching was originally supported only for a restricted set of
6394 regexes; from release 8.00 there are no restrictions, but the bits are still
6395 defined (though never set). So there's no harm in leaving this code. */
6396
6397 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6398 return PCRE_ERROR_BADPARTIAL;
6399
6400 /* If the expression has got more back references than the offsets supplied can
6401 hold, we get a temporary chunk of working store to use during the matching.
6402 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6403 of 3. */
6404
6405 ocount = offsetcount - (offsetcount % 3);
6406 arg_offset_max = (2*ocount)/3;
6407
6408 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6409 {
6410 ocount = re->top_backref * 3 + 3;
6411 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6412 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6413 using_temporary_offsets = TRUE;
6414 DPRINTF(("Got memory to hold back references\n"));
6415 }
6416 else md->offset_vector = offsets;
6417
6418 md->offset_end = ocount;
6419 md->offset_max = (2*ocount)/3;
6420 md->offset_overflow = FALSE;
6421 md->capture_last = -1;
6422
6423 /* Reset the working variable associated with each extraction. These should
6424 never be used unless previously set, but they get saved and restored, and so we
6425 initialize them to avoid reading uninitialized locations. Also, unset the
6426 offsets for the matched string. This is really just for tidiness with callouts,
6427 in case they inspect these fields. */
6428
6429 if (md->offset_vector != NULL)
6430 {
6431 register int *iptr = md->offset_vector + ocount;
6432 register int *iend = iptr - re->top_bracket;
6433 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6434 while (--iptr >= iend) *iptr = -1;
6435 md->offset_vector[0] = md->offset_vector[1] = -1;
6436 }
6437
6438 /* Set up the first character to match, if available. The first_char value is
6439 never set for an anchored regular expression, but the anchoring may be forced
6440 at run time, so we have to test for anchoring. The first char may be unset for
6441 an unanchored pattern, of course. If there's no first char and the pattern was
6442 studied, there may be a bitmap of possible first characters. */
6443
6444 if (!anchored)
6445 {
6446 if ((re->flags & PCRE_FIRSTSET) != 0)
6447 {
6448 has_first_char = TRUE;
6449 first_char = first_char2 = (pcre_uchar)(re->first_char);
6450 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6451 {
6452 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6453 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6454 if (utf && first_char > 127)
6455 first_char2 = UCD_OTHERCASE(first_char);
6456 #endif
6457 }
6458 }
6459 else
6460 if (!startline && study != NULL &&
6461 (study->flags & PCRE_STUDY_MAPPED) != 0)
6462 start_bits = study->start_bits;
6463 }
6464
6465 /* For anchored or unanchored matches, there may be a "last known required
6466 character" set. */
6467
6468 if ((re->flags & PCRE_REQCHSET) != 0)
6469 {
6470 has_req_char = TRUE;
6471 req_char = req_char2 = (pcre_uchar)(re->req_char);
6472 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6473 {
6474 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6475 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6476 if (utf && req_char > 127)
6477 req_char2 = UCD_OTHERCASE(req_char);
6478 #endif
6479 }
6480 }
6481
6482
6483 /* ==========================================================================*/
6484
6485 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6486 the loop runs just once. */
6487
6488 for(;;)
6489 {
6490 PCRE_PUCHAR save_end_subject = end_subject;
6491 PCRE_PUCHAR new_start_match;
6492
6493 /* If firstline is TRUE, the start of the match is constrained to the first
6494 line of a multiline string. That is, the match must be before or at the first
6495 newline. Implement this by temporarily adjusting end_subject so that we stop
6496 scanning at a newline. If the match fails at the newline, later code breaks
6497 this loop. */
6498
6499 if (firstline)
6500 {
6501 PCRE_PUCHAR t = start_match;
6502 #ifdef SUPPORT_UTF
6503 if (utf)
6504 {
6505 while (t < md->end_subject && !IS_NEWLINE(t))
6506 {
6507 t++;
6508 ACROSSCHAR(t < end_subject, *t, t++);
6509 }
6510 }
6511 else
6512 #endif
6513 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6514 end_subject = t;
6515 }
6516
6517 /* There are some optimizations that avoid running the match if a known
6518 starting point is not found, or if a known later character is not present.
6519 However, there is an option that disables these, for testing and for ensuring
6520 that all callouts do actually occur. The option can be set in the regex by
6521 (*NO_START_OPT) or passed in match-time options. */
6522
6523 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6524 {
6525 /* Advance to a unique first char if there is one. */
6526
6527 if (has_first_char)
6528 {
6529 if (first_char != first_char2)
6530 while (start_match < end_subject &&
6531 *start_match != first_char && *start_match != first_char2)
6532 start_match++;
6533 else
6534 while (start_match < end_subject && *start_match != first_char)
6535 start_match++;
6536 }
6537
6538 /* Or to just after a linebreak for a multiline match */
6539
6540 else if (startline)
6541 {
6542 if (start_match > md->start_subject + start_offset)
6543 {
6544 #ifdef SUPPORT_UTF
6545 if (utf)
6546 {
6547 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6548 {
6549 start_match++;
6550 ACROSSCHAR(start_match < end_subject, *start_match,
6551 start_match++);
6552 }
6553 }
6554 else
6555 #endif
6556 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6557 start_match++;
6558
6559 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6560 and we are now at a LF, advance the match position by one more character.
6561 */
6562
6563 if (start_match[-1] == CHAR_CR &&
6564 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6565 start_match < end_subject &&
6566 *start_match == CHAR_NL)
6567 start_match++;
6568 }
6569 }
6570
6571 /* Or to a non-unique first byte after study */
6572
6573 else if (start_bits != NULL)
6574 {
6575 while (start_match < end_subject)
6576 {
6577 register unsigned int c = *start_match;
6578 #ifndef COMPILE_PCRE8
6579 if (c > 255) c = 255;
6580 #endif
6581 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6582 {
6583 start_match++;
6584 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6585 /* In non 8-bit mode, the iteration will stop for
6586 characters > 255 at the beginning or not stop at all. */
6587 if (utf)
6588 ACROSSCHAR(start_match < end_subject, *start_match,
6589 start_match++);
6590 #endif
6591 }
6592 else break;
6593 }
6594 }
6595 } /* Starting optimizations */
6596
6597 /* Restore fudged end_subject */
6598
6599 end_subject = save_end_subject;
6600
6601 /* The following two optimizations are disabled for partial matching or if
6602 disabling is explicitly requested. */
6603
6604 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6605 {
6606 /* If the pattern was studied, a minimum subject length may be set. This is
6607 a lower bound; no actual string of that length may actually match the
6608 pattern. Although the value is, strictly, in characters, we treat it as
6609 bytes to avoid spending too much time in this optimization. */
6610
6611 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6612 (pcre_uint32)(end_subject - start_match) < study->minlength)
6613 {
6614 rc = MATCH_NOMATCH;
6615 break;
6616 }
6617
6618 /* If req_char is set, we know that that character must appear in the
6619 subject for the match to succeed. If the first character is set, req_char
6620 must be later in the subject; otherwise the test starts at the match point.
6621 This optimization can save a huge amount of backtracking in patterns with
6622 nested unlimited repeats that aren't going to match. Writing separate code
6623 for cased/caseless versions makes it go faster, as does using an
6624 autoincrement and backing off on a match.
6625
6626 HOWEVER: when the subject string is very, very long, searching to its end
6627 can take a long time, and give bad performance on quite ordinary patterns.
6628 This showed up when somebody was matching something like /^\d+C/ on a
6629 32-megabyte string... so we don't do this when the string is sufficiently
6630 long. */
6631
6632 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6633 {
6634 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6635
6636 /* We don't need to repeat the search if we haven't yet reached the
6637 place we found it at last time. */
6638
6639 if (p > req_char_ptr)
6640 {
6641 if (req_char != req_char2)
6642 {
6643 while (p < end_subject)
6644 {
6645 register int pp = *p++;
6646 if (pp == req_char || pp == req_char2) { p--; break; }
6647 }
6648 }
6649 else
6650 {
6651 while (p < end_subject)
6652 {
6653 if (*p++ == req_char) { p--; break; }
6654 }
6655 }
6656
6657 /* If we can't find the required character, break the matching loop,
6658 forcing a match failure. */
6659
6660 if (p >= end_subject)
6661 {
6662 rc = MATCH_NOMATCH;
6663 break;
6664 }
6665
6666 /* If we have found the required character, save the point where we
6667 found it, so that we don't search again next time round the loop if
6668 the start hasn't passed this character yet. */
6669
6670 req_char_ptr = p;
6671 }
6672 }
6673 }
6674
6675 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6676 printf(">>>> Match against: ");
6677 pchars(start_match, end_subject - start_match, TRUE, md);
6678 printf("\n");
6679 #endif
6680
6681 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6682 first starting point for which a partial match was found. */
6683
6684 md->start_match_ptr = start_match;
6685 md->start_used_ptr = start_match;
6686 md->match_call_count = 0;
6687 md->match_function_type = 0;
6688 md->end_offset_top = 0;
6689 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6690 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6691
6692 switch(rc)
6693 {
6694 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6695 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6696 entirely. The only way we can do that is to re-do the match at the same
6697 point, with a flag to force SKIP with an argument to be ignored. Just
6698 treating this case as NOMATCH does not work because it does not check other
6699 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6700
6701 case MATCH_SKIP_ARG:
6702 new_start_match = start_match;
6703 md->ignore_skip_arg = TRUE;
6704 break;
6705
6706 /* SKIP passes back the next starting point explicitly, but if it is the
6707 same as the match we have just done, treat it as NOMATCH. */
6708
6709 case MATCH_SKIP:
6710 if (md->start_match_ptr != start_match)
6711 {
6712 new_start_match = md->start_match_ptr;
6713 break;
6714 }
6715 /* Fall through */
6716
6717 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6718 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6719
6720 case MATCH_NOMATCH:
6721 case MATCH_PRUNE:
6722 case MATCH_THEN:
6723 md->ignore_skip_arg = FALSE;
6724 new_start_match = start_match + 1;
6725 #ifdef SUPPORT_UTF
6726 if (utf)
6727 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6728 new_start_match++);
6729 #endif
6730 break;
6731
6732 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6733
6734 case MATCH_COMMIT:
6735 rc = MATCH_NOMATCH;
6736 goto ENDLOOP;
6737
6738 /* Any other return is either a match, or some kind of error. */
6739
6740 default:
6741 goto ENDLOOP;
6742 }
6743
6744 /* Control reaches here for the various types of "no match at this point"
6745 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6746
6747 rc = MATCH_NOMATCH;
6748
6749 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6750 newline in the subject (though it may continue over the newline). Therefore,
6751 if we have just failed to match, starting at a newline, do not continue. */
6752
6753 if (firstline && IS_NEWLINE(start_match)) break;
6754
6755 /* Advance to new matching position */
6756
6757 start_match = new_start_match;
6758
6759 /* Break the loop if the pattern is anchored or if we have passed the end of
6760 the subject. */
6761
6762 if (anchored || start_match > end_subject) break;
6763
6764 /* If we have just passed a CR and we are now at a LF, and the pattern does
6765 not contain any explicit matches for \r or \n, and the newline option is CRLF
6766 or ANY or ANYCRLF, advance the match position by one more character. In
6767 normal matching start_match will always be greater than the first position at
6768 this stage, but a failed *SKIP can cause a return at the same point, which is
6769