/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1046 - (show annotations)
Tue Sep 25 16:27:58 2012 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 210255 byte(s)
Error occurred while calculating annotation data.
All the remaining changes for handling characters with more than one other 
case.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
62
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
65
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
68
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
71
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
74
75 #define MATCH_ACCEPT (-999)
76 #define MATCH_COMMIT (-998)
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996)
79 #define MATCH_PRUNE (-995)
80 #define MATCH_SKIP (-994)
81 #define MATCH_SKIP_ARG (-993)
82 #define MATCH_THEN (-992)
83
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
87
88 #define REC_STACK_SAVE_MAX 30
89
90 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
91
92 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
93 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
94
95
96
97 #ifdef PCRE_DEBUG
98 /*************************************************
99 * Debugging function to print chars *
100 *************************************************/
101
102 /* Print a sequence of chars in printable format, stopping at the end of the
103 subject if requested.
104
105 Arguments:
106 p points to characters
107 length number to print
108 is_subject TRUE if printing from within md->start_subject
109 md pointer to matching data block, if is_subject is TRUE
110
111 Returns: nothing
112 */
113
114 static void
115 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
116 {
117 unsigned int c;
118 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
119 while (length-- > 0)
120 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
121 }
122 #endif
123
124
125
126 /*************************************************
127 * Match a back-reference *
128 *************************************************/
129
130 /* Normally, if a back reference hasn't been set, the length that is passed is
131 negative, so the match always fails. However, in JavaScript compatibility mode,
132 the length passed is zero. Note that in caseless UTF-8 mode, the number of
133 subject bytes matched may be different to the number of reference bytes.
134
135 Arguments:
136 offset index into the offset vector
137 eptr pointer into the subject
138 length length of reference to be matched (number of bytes)
139 md points to match data block
140 caseless TRUE if caseless
141
142 Returns: >= 0 the number of subject bytes matched
143 -1 no match
144 -2 partial match; always given if at end subject
145 */
146
147 static int
148 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
149 BOOL caseless)
150 {
151 PCRE_PUCHAR eptr_start = eptr;
152 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if reference not set (and not JavaScript compatible - in that
168 case the length is passed as zero). */
169
170 if (length < 0) return -1;
171
172 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173 properly if Unicode properties are supported. Otherwise, we can check only
174 ASCII characters. */
175
176 if (caseless)
177 {
178 #ifdef SUPPORT_UTF
179 #ifdef SUPPORT_UCP
180 if (md->utf)
181 {
182 /* Match characters up to the end of the reference. NOTE: the number of
183 data units matched may differ, because in UTF-8 there are some characters
184 whose upper and lower case versions code have different numbers of bytes.
185 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
186 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
187 sequence of two of the latter. It is important, therefore, to check the
188 length along the reference, not along the subject (earlier code did this
189 wrong). */
190
191 PCRE_PUCHAR endptr = p + length;
192 while (p < endptr)
193 {
194 unsigned int c, d;
195 const ucd_record *ur;
196 if (eptr >= md->end_subject) return -2; /* Partial match */
197 GETCHARINC(c, eptr);
198 GETCHARINC(d, p);
199 ur = GET_UCD(d);
200 if (c != d && c != d + ur->other_case)
201 {
202 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
203 for (;;)
204 {
205 if (c < *pp) return -1;
206 if (c == *pp++) break;
207 }
208 }
209 }
210 }
211 else
212 #endif
213 #endif
214
215 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
216 is no UCP support. */
217 {
218 while (length-- > 0)
219 {
220 if (eptr >= md->end_subject) return -2; /* Partial match */
221 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
222 p++;
223 eptr++;
224 }
225 }
226 }
227
228 /* In the caseful case, we can just compare the bytes, whether or not we
229 are in UTF-8 mode. */
230
231 else
232 {
233 while (length-- > 0)
234 {
235 if (eptr >= md->end_subject) return -2; /* Partial match */
236 if (*p++ != *eptr++) return -1;
237 }
238 }
239
240 return (int)(eptr - eptr_start);
241 }
242
243
244
245 /***************************************************************************
246 ****************************************************************************
247 RECURSION IN THE match() FUNCTION
248
249 The match() function is highly recursive, though not every recursive call
250 increases the recursive depth. Nevertheless, some regular expressions can cause
251 it to recurse to a great depth. I was writing for Unix, so I just let it call
252 itself recursively. This uses the stack for saving everything that has to be
253 saved for a recursive call. On Unix, the stack can be large, and this works
254 fine.
255
256 It turns out that on some non-Unix-like systems there are problems with
257 programs that use a lot of stack. (This despite the fact that every last chip
258 has oodles of memory these days, and techniques for extending the stack have
259 been known for decades.) So....
260
261 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
262 calls by keeping local variables that need to be preserved in blocks of memory
263 obtained from malloc() instead of on the stack. Macros are used to
264 achieve this so that the actual code doesn't look very different to what it
265 always used to.
266
267 The original heap-recursive code used longjmp(). However, it seems that this
268 can be very slow on some operating systems. Following a suggestion from Stan
269 Switzer, the use of longjmp() has been abolished, at the cost of having to
270 provide a unique number for each call to RMATCH. There is no way of generating
271 a sequence of numbers at compile time in C. I have given them names, to make
272 them stand out more clearly.
273
274 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
275 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
276 tests. Furthermore, not using longjmp() means that local dynamic variables
277 don't have indeterminate values; this has meant that the frame size can be
278 reduced because the result can be "passed back" by straight setting of the
279 variable instead of being passed in the frame.
280 ****************************************************************************
281 ***************************************************************************/
282
283 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
284 below must be updated in sync. */
285
286 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
287 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
288 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
289 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
290 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
291 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
292 RM61, RM62, RM63, RM64, RM65, RM66 };
293
294 /* These versions of the macros use the stack, as normal. There are debugging
295 versions and production versions. Note that the "rw" argument of RMATCH isn't
296 actually used in this definition. */
297
298 #ifndef NO_RECURSE
299 #define REGISTER register
300
/* Debugging versions: each "call" and "return" is traced on stdout together
with the source line it came from. The result of the nested match() lands in
the caller's rrc variable; mstart and rdepth are not macro arguments but are
picked up from the scope in which the macro is expanded. */

301 #ifdef PCRE_DEBUG
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 { \
304 printf("match() called in line %d\n", __LINE__); \
305 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
306 printf("to line %d\n", __LINE__); \
307 }
308 #define RRETURN(ra) \
309 { \
310 printf("match() returned %d from line %d ", ra, __LINE__); \
311 return ra; \
312 }
313 #else

/* Production versions: a plain recursive call (one level deeper) whose result
is stored in rrc, and a plain return. */

314 #define RMATCH(ra,rb,rc,rd,re,rw) \
315 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
316 #define RRETURN(ra) return ra
317 #endif
318
319 #else
320
321
322 /* These versions of the macros manage a private stack on the heap. Note that
323 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
324 argument of match(), which never changes. */
325
326 #define REGISTER
327
/* RMATCH: simulate a recursive call. The frame after the current one is
reused if it already exists on the chain; otherwise a new one is obtained via
PUBL(stack_malloc) and linked in (PCRE_ERROR_NOMEMORY is returned through
RRETURN if that fails). The resume point rw is recorded in the *calling*
frame, the "arguments" are stored in the new frame, and control jumps to
HEAP_RECURSE. The label L_<rw> pasted at the end of the expansion marks the
place at which execution of the calling frame continues afterwards. */

328 #define RMATCH(ra,rb,rc,rd,re,rw)\
329 {\
330 heapframe *newframe = frame->Xnextframe;\
331 if (newframe == NULL)\
332 {\
333 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
334 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
335 newframe->Xnextframe = NULL;\
336 frame->Xnextframe = newframe;\
337 }\
338 frame->Xwhere = rw;\
339 newframe->Xeptr = ra;\
340 newframe->Xecode = rb;\
341 newframe->Xmstart = mstart;\
342 newframe->Xoffset_top = rc;\
343 newframe->Xeptrb = re;\
344 newframe->Xrdepth = frame->Xrdepth + 1;\
345 newframe->Xprevframe = frame;\
346 frame = newframe;\
347 DPRINTF(("restarting from line %d\n", __LINE__));\
348 goto HEAP_RECURSE;\
349 L_##rw:\
350 DPRINTF(("jumped back to line %d\n", __LINE__));\
351 }
352

/* RRETURN: simulate a return. Step back to the previous frame; if there is
one, pass the result back in rrc and jump to HEAP_RETURN. If there is no
previous frame, this really is the outermost level, so do a genuine return
from match(). The frame being left is not freed here; it stays linked via
Xnextframe, where RMATCH can find and reuse it. */

353 #define RRETURN(ra)\
354 {\
355 heapframe *oldframe = frame;\
356 frame = oldframe->Xprevframe;\
357 if (frame != NULL)\
358 {\
359 rrc = ra;\
360 goto HEAP_RETURN;\
361 }\
362 return ra;\
363 }
364
365
366 /* Structure for remembering the local variables in a private frame */
367
/* One simulated activation record of match(), used when NO_RECURSE is
defined. Frames form a doubly linked chain: RMATCH links a new frame after the
current one (or reuses the one already there) and RRETURN steps back along
Xprevframe. Inside match() every X-prefixed field is reached through a macro
carrying the unprefixed name (eptr is frame->Xeptr, and so on). */

368 typedef struct heapframe {
369 struct heapframe *Xprevframe; /* Frame to resume when this one "returns" */
370 struct heapframe *Xnextframe; /* Next frame on the chain, or NULL if none yet */
371
372 /* Function arguments that may change */
373
374 PCRE_PUCHAR Xeptr;
375 const pcre_uchar *Xecode;
376 PCRE_PUCHAR Xmstart;
377 int Xoffset_top;
378 eptrblock *Xeptrb;
379 unsigned int Xrdepth;
380
381 /* Function local variables */
382
383 PCRE_PUCHAR Xcallpat;
384 #ifdef SUPPORT_UTF
385 PCRE_PUCHAR Xcharptr;
386 #endif
387 PCRE_PUCHAR Xdata;
388 PCRE_PUCHAR Xnext;
389 PCRE_PUCHAR Xpp;
390 PCRE_PUCHAR Xprev;
391 PCRE_PUCHAR Xsaved_eptr;
392
393 recursion_info Xnew_recursive;
394
395 BOOL Xcur_is_word;
396 BOOL Xcondition;
397 BOOL Xprev_is_word;
398
399 #ifdef SUPPORT_UCP
400 int Xprop_type;
401 int Xprop_value;
402 int Xprop_fail_result;
403 int Xoclength;
404 pcre_uchar Xocchars[6];
405 #endif
406
407 int Xcodelink;
408 int Xctype;
409 unsigned int Xfc;
410 int Xfi;
411 int Xlength;
412 int Xmax;
413 int Xmin;
414 int Xnumber;
415 int Xoffset;
416 int Xop;
417 int Xsave_capture_last;
418 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
419 int Xstacksave[REC_STACK_SAVE_MAX];
420
421 eptrblock Xnewptrb;
422
423 /* Where to jump back to: the RMn number that RMATCH stored in this frame
    immediately before "calling" the next one */
424
425 int Xwhere;
426
427 } heapframe;
428
429 #endif
430
431
432 /***************************************************************************
433 ***************************************************************************/
434
435
436
437 /*************************************************
438 * Match from current position *
439 *************************************************/
440
441 /* This function is called recursively in many circumstances. Whenever it
442 returns a negative (error) response, the outer incarnation must also return the
443 same response. */
444
445 /* These macros pack up tests that are used for partial matching, and which
446 appear several times in the code. We set the "hit end" flag if the pointer is
447 at the end of the subject and also past the start of the subject (i.e.
448 something has been matched). For hard partial matching, we then return
449 immediately. The second one is used when we already know we are past the end of
450 the subject. */
451
/* CHECK_PARTIAL: tests for being at (or beyond) the end of the subject as
well as for having moved past md->start_used_ptr. Any non-zero md->partial
sets md->hitend; a value greater than 1 additionally returns
PCRE_ERROR_PARTIAL at once.

NOTE(review): both macros expand to a bare "if" statement rather than a
do { } while (0) wrapper, so neither should be placed directly in front of an
"else". */

452 #define CHECK_PARTIAL()\
453 if (md->partial != 0 && eptr >= md->end_subject && \
454 eptr > md->start_used_ptr) \
455 { \
456 md->hitend = TRUE; \
457 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
458 }
459

/* SCHECK_PARTIAL: the same, minus the end-of-subject test, for places where
the caller has already established that the end has been reached. */

460 #define SCHECK_PARTIAL()\
461 if (md->partial != 0 && eptr > md->start_used_ptr) \
462 { \
463 md->hitend = TRUE; \
464 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
465 }
466
467
468 /* Performance note: It might be tempting to extract commonly used fields from
469 the md structure (e.g. utf, end_subject) into individual variables to improve
470 performance. Tests using gcc on a SPARC disproved this; in the first case, it
471 made performance worse.
472
473 Arguments:
474 eptr pointer to current character in subject
475 ecode pointer to current position in compiled code
476 mstart pointer to the current match start position (can be modified
477 by encountering \K)
478 offset_top current top pointer
479 md pointer to "static" info for the match
480 eptrb pointer to chain of blocks containing eptr at start of
481 brackets - for testing for empty matches
482 rdepth the recursion depth
483
484 Returns: MATCH_MATCH if matched ) these values are >= 0
485 MATCH_NOMATCH if failed to match )
486 a negative MATCH_xxx value for PRUNE, SKIP, etc
487 a negative PCRE_ERROR_xxx value if aborted by an error condition
488 (e.g. stopped by repeated call or recursion limit)
489 */
490
491 static int
492 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
493 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
494 unsigned int rdepth)
495 {
496 /* These variables do not need to be preserved over recursion in this function,
497 so they can be ordinary variables in all cases. Mark some of them with
498 "register" because they are used a lot in loops. */
499
500 register int rrc; /* Returns from recursive calls */
501 register int i; /* Used for loops not involving calls to RMATCH() */
502 register unsigned int c; /* Character values not kept over RMATCH() calls */
503 register BOOL utf; /* Local copy of UTF flag for speed */
504
505 BOOL minimize, possessive; /* Quantifier options */
506 BOOL caseless;
507 int condcode;
508
509 /* When recursion is not being used, all "local" variables that have to be
510 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
511 frame on the stack here; subsequent instantiations are obtained from the heap
512 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
513 the top-level on the stack rather than malloc-ing them all gives a performance
514 boost in many cases where there is not much "recursion". */
515
516 #ifdef NO_RECURSE
517 heapframe *frame = (heapframe *)md->match_frames_base;
518
519 /* Copy in the original argument variables */
520
521 frame->Xeptr = eptr;
522 frame->Xecode = ecode;
523 frame->Xmstart = mstart;
524 frame->Xoffset_top = offset_top;
525 frame->Xeptrb = eptrb;
526 frame->Xrdepth = rdepth;
527
528 /* This is where control jumps back to in order to effect "recursion" */
529
530 HEAP_RECURSE:
531
532 /* Macros make the argument variables come from the current frame */
533
534 #define eptr frame->Xeptr
535 #define ecode frame->Xecode
536 #define mstart frame->Xmstart
537 #define offset_top frame->Xoffset_top
538 #define eptrb frame->Xeptrb
539 #define rdepth frame->Xrdepth
540
541 /* Ditto for the local variables */
542
543 #ifdef SUPPORT_UTF
544 #define charptr frame->Xcharptr
545 #endif
546 #define callpat frame->Xcallpat
547 #define codelink frame->Xcodelink
548 #define data frame->Xdata
549 #define next frame->Xnext
550 #define pp frame->Xpp
551 #define prev frame->Xprev
552 #define saved_eptr frame->Xsaved_eptr
553
554 #define new_recursive frame->Xnew_recursive
555
556 #define cur_is_word frame->Xcur_is_word
557 #define condition frame->Xcondition
558 #define prev_is_word frame->Xprev_is_word
559
560 #ifdef SUPPORT_UCP
561 #define prop_type frame->Xprop_type
562 #define prop_value frame->Xprop_value
563 #define prop_fail_result frame->Xprop_fail_result
564 #define oclength frame->Xoclength
565 #define occhars frame->Xocchars
566 #endif
567
568 #define ctype frame->Xctype
569 #define fc frame->Xfc
570 #define fi frame->Xfi
571 #define length frame->Xlength
572 #define max frame->Xmax
573 #define min frame->Xmin
574 #define number frame->Xnumber
575 #define offset frame->Xoffset
576 #define op frame->Xop
577 #define save_capture_last frame->Xsave_capture_last
578 #define save_offset1 frame->Xsave_offset1
579 #define save_offset2 frame->Xsave_offset2
580 #define save_offset3 frame->Xsave_offset3
581 #define stacksave frame->Xstacksave
582
583 #define newptrb frame->Xnewptrb
584
585 /* When recursion is being used, local variables are allocated on the stack and
586 get preserved during recursion in the normal way. In this environment, fi and
587 i, and fc and c, can be the same variables. */
588
589 #else /* NO_RECURSE not defined */
590 #define fi i
591 #define fc c
592
593 /* Many of the following variables are used only in small blocks of the code.
594 My normal style of coding would have declared them within each of those blocks.
595 However, in order to accommodate the version of this code that uses an external
596 "stack" implemented on the heap, it is easier to declare them all here, so the
597 declarations can be cut out in a block. The only declarations within blocks
598 below are for variables that do not have to be preserved over a recursive call
599 to RMATCH(). */
600
601 #ifdef SUPPORT_UTF
602 const pcre_uchar *charptr;
603 #endif
604 const pcre_uchar *callpat;
605 const pcre_uchar *data;
606 const pcre_uchar *next;
607 PCRE_PUCHAR pp;
608 const pcre_uchar *prev;
609 PCRE_PUCHAR saved_eptr;
610
611 recursion_info new_recursive;
612
613 BOOL cur_is_word;
614 BOOL condition;
615 BOOL prev_is_word;
616
617 #ifdef SUPPORT_UCP
618 int prop_type;
619 int prop_value;
620 int prop_fail_result;
621 int oclength;
622 pcre_uchar occhars[6];
623 #endif
624
625 int codelink;
626 int ctype;
627 int length;
628 int max;
629 int min;
630 int number;
631 int offset;
632 int op;
633 int save_capture_last;
634 int save_offset1, save_offset2, save_offset3;
635 int stacksave[REC_STACK_SAVE_MAX];
636
637 eptrblock newptrb;
638
639 /* There is a special fudge for calling match() in a way that causes it to
640 measure the size of its basic stack frame when the stack is being used for
641 recursion. The second argument (ecode) being NULL triggers this behaviour. It
642 cannot normally ever be NULL. The return is the negated value of the frame
643 size. */
644
645 if (ecode == NULL)
646 {
647 if (rdepth == 0)
648 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
649 else
650 {
651 int len = (char *)&rdepth - (char *)eptr;
652 return (len > 0)? -len : len;
653 }
654 }
655 #endif /* NO_RECURSE */
656
657 /* To save space on the stack and in the heap frame, I have doubled up on some
658 of the local variables that are used only in localised parts of the code, but
659 still need to be preserved over recursive calls of match(). These macros define
660 the alternative names that are used. */
661
662 #define allow_zero cur_is_word
663 #define cbegroup condition
664 #define code_offset codelink
665 #define condassert condition
666 #define matched_once prev_is_word
667 #define foc number
668 #define save_mark data
669
670 /* These statements are here to stop the compiler complaining about uninitialized
671 variables. */
672
673 #ifdef SUPPORT_UCP
674 prop_value = 0;
675 prop_fail_result = 0;
676 #endif
677
678
679 /* This label is used for tail recursion, which is used in a few cases even
680 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
681 used. Thanks to Ian Taylor for noticing this possibility and sending the
682 original patch. */
683
684 TAIL_RECURSE:
685
686 /* OK, now we can get on with the real code of the function. Recursive calls
687 are specified by the macro RMATCH and RRETURN is used to return. When
688 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
689 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
690 defined). However, RMATCH isn't like a function call because it's quite a
691 complicated macro. It has to be used in one particular way. This shouldn't,
692 however, impact performance when true recursion is being used. */
693
694 #ifdef SUPPORT_UTF
695 utf = md->utf; /* Local copy of the flag */
696 #else
697 utf = FALSE;
698 #endif
699
700 /* First check that we haven't called match() too many times, or that we
701 haven't exceeded the recursive call limit. */
702
703 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
704 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
705
706 /* At the start of a group with an unlimited repeat that may match an empty
707 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
708 done this way to save having to use another function argument, which would take
709 up space on the stack. See also MATCH_CONDASSERT below.
710
711 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
712 such remembered pointers, to be checked when we hit the closing ket, in order
713 to break infinite loops that match no characters. When match() is called in
714 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
715 NOT be used with tail recursion, because the memory block that is used is on
716 the stack, so a new one may be required for each match(). */
717
718 if (md->match_function_type == MATCH_CBEGROUP)
719 {
720 newptrb.epb_saved_eptr = eptr;
721 newptrb.epb_prev = eptrb;
722 eptrb = &newptrb;
723 md->match_function_type = 0;
724 }
725
726 /* Now start processing the opcodes. */
727
728 for (;;)
729 {
730 minimize = possessive = FALSE;
731 op = *ecode;
732
733 switch(op)
734 {
735 case OP_MARK:
736 md->nomatch_mark = ecode + 2;
737 md->mark = NULL; /* In case previously set by assertion */
738 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
739 eptrb, RM55);
740 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
741 md->mark == NULL) md->mark = ecode + 2;
742
743 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
744 argument, and we must check whether that argument matches this MARK's
745 argument. It is passed back in md->start_match_ptr (an overloading of that
746 variable). If it does match, we reset that variable to the current subject
747 position and return MATCH_SKIP. Otherwise, pass back the return code
748 unaltered. */
749
750 else if (rrc == MATCH_SKIP_ARG &&
751 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
752 {
753 md->start_match_ptr = eptr;
754 RRETURN(MATCH_SKIP);
755 }
756 RRETURN(rrc);
757
758 case OP_FAIL:
759 RRETURN(MATCH_NOMATCH);
760
761 /* COMMIT overrides PRUNE, SKIP, and THEN */
762
763 case OP_COMMIT:
764 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
765 eptrb, RM52);
766 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
767 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
768 rrc != MATCH_THEN)
769 RRETURN(rrc);
770 RRETURN(MATCH_COMMIT);
771
772 /* PRUNE overrides THEN */
773
774 case OP_PRUNE:
775 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
776 eptrb, RM51);
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 RRETURN(MATCH_PRUNE);
779
780 case OP_PRUNE_ARG:
781 md->nomatch_mark = ecode + 2;
782 md->mark = NULL; /* In case previously set by assertion */
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
784 eptrb, RM56);
785 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
786 md->mark == NULL) md->mark = ecode + 2;
787 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
788 RRETURN(MATCH_PRUNE);
789
790 /* SKIP overrides PRUNE and THEN */
791
792 case OP_SKIP:
793 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
794 eptrb, RM53);
795 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
796 RRETURN(rrc);
797 md->start_match_ptr = eptr; /* Pass back current position */
798 RRETURN(MATCH_SKIP);
799
800 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
801 nomatch_mark. There is a flag that disables this opcode when re-matching a
802 pattern that ended with a SKIP for which there was not a matching MARK. */
803
804 case OP_SKIP_ARG:
805 if (md->ignore_skip_arg)
806 {
807 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
808 break;
809 }
810 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
811 eptrb, RM57);
812 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
813 RRETURN(rrc);
814
815 /* Pass back the current skip name by overloading md->start_match_ptr and
816 returning the special MATCH_SKIP_ARG return code. This will either be
817 caught by a matching MARK, or get to the top, where it causes a rematch
818 with the md->ignore_skip_arg flag set. */
819
820 md->start_match_ptr = ecode + 2;
821 RRETURN(MATCH_SKIP_ARG);
822
823 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
824 the branch in which it occurs can be determined. Overload the start of
825 match pointer to do this. */
826
827 case OP_THEN:
828 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
829 eptrb, RM54);
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 md->start_match_ptr = ecode;
832 RRETURN(MATCH_THEN);
833
834 case OP_THEN_ARG:
835 md->nomatch_mark = ecode + 2;
836 md->mark = NULL; /* In case previously set by assertion */
837 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
838 md, eptrb, RM58);
839 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
840 md->mark == NULL) md->mark = ecode + 2;
841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
842 md->start_match_ptr = ecode;
843 RRETURN(MATCH_THEN);
844
845 /* Handle an atomic group that does not contain any capturing parentheses.
846 This can be handled like an assertion. Prior to 8.13, all atomic groups
847 were handled this way. In 8.13, the code was changed as below for ONCE, so
848 that backups pass through the group and thereby reset captured values.
849 However, this uses a lot more stack, so in 8.20, atomic groups that do not
850 contain any captures generate OP_ONCE_NC, which can be handled in the old,
851 less stack intensive way.
852
853 Check the alternative branches in turn - the matching won't pass the KET
854 for this kind of subpattern. If any one branch matches, we carry on as at
855 the end of a normal bracket, leaving the subject pointer, but resetting
856 the start-of-match value in case it was changed by \K. */
857
858 case OP_ONCE_NC:
859 prev = ecode;
860 saved_eptr = eptr;
861 save_mark = md->mark;
862 do
863 {
864 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
865 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
866 {
867 mstart = md->start_match_ptr;
868 break;
869 }
870 if (rrc == MATCH_THEN)
871 {
872 next = ecode + GET(ecode,1);
873 if (md->start_match_ptr < next &&
874 (*ecode == OP_ALT || *next == OP_ALT))
875 rrc = MATCH_NOMATCH;
876 }
877
878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
879 ecode += GET(ecode,1);
880 md->mark = save_mark;
881 }
882 while (*ecode == OP_ALT);
883
884 /* If hit the end of the group (which could be repeated), fail */
885
886 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
887
888 /* Continue as from after the group, updating the offsets high water
889 mark, since extracts may have been taken. */
890
891 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
892
893 offset_top = md->end_offset_top;
894 eptr = md->end_match_ptr;
895
896 /* For a non-repeating ket, just continue at this level. This also
897 happens for a repeating ket if no characters were matched in the group.
898 This is the forcible breaking of infinite loops as implemented in Perl
899 5.005. */
900
901 if (*ecode == OP_KET || eptr == saved_eptr)
902 {
903 ecode += 1+LINK_SIZE;
904 break;
905 }
906
907 /* The repeating kets try the rest of the pattern or restart from the
908 preceding bracket, in the appropriate order. The second "call" of match()
909 uses tail recursion, to avoid using another stack frame. */
910
911 if (*ecode == OP_KETRMIN)
912 {
913 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
915 ecode = prev;
916 goto TAIL_RECURSE;
917 }
918 else /* OP_KETRMAX */
919 {
920 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
922 ecode += 1 + LINK_SIZE;
923 goto TAIL_RECURSE;
924 }
925 /* Control never gets here */
926
927 /* Handle a capturing bracket, other than those that are possessive with an
928 unlimited repeat. If there is space in the offset vector, save the current
929 subject position in the working slot at the top of the vector. We mustn't
930 change the current values of the data slot, because they may be set from a
931 previous iteration of this group, and be referred to by a reference inside
932 the group. A failure to match might occur after the group has succeeded,
933 if something later on doesn't match. For this reason, we need to restore
934 the working value and also the values of the final offsets, in case they
935 were set by a previous iteration of the same bracket.
936
937 If there isn't enough space in the offset vector, treat this as if it were
938 a non-capturing bracket. Don't worry about setting the flag for the error
939 case here; that is handled in the code for KET. */
940
941 case OP_CBRA:
942 case OP_SCBRA:
943 number = GET2(ecode, 1+LINK_SIZE);
944 offset = number << 1;
945
946 #ifdef PCRE_DEBUG
947 printf("start bracket %d\n", number);
948 printf("subject=");
949 pchars(eptr, 16, TRUE, md);
950 printf("\n");
951 #endif
952
953 if (offset < md->offset_max)
954 {
955 save_offset1 = md->offset_vector[offset];
956 save_offset2 = md->offset_vector[offset+1];
957 save_offset3 = md->offset_vector[md->offset_end - number];
958 save_capture_last = md->capture_last;
959 save_mark = md->mark;
960
961 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
962 md->offset_vector[md->offset_end - number] =
963 (int)(eptr - md->start_subject);
964
965 for (;;)
966 {
967 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
968 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
969 eptrb, RM1);
970 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
971
972 /* If we backed up to a THEN, check whether it is within the current
973 branch by comparing the address of the THEN that is passed back with
974 the end of the branch. If it is within the current branch, and the
975 branch is one of two or more alternatives (it either starts or ends
976 with OP_ALT), we have reached the limit of THEN's action, so convert
977 the return code to NOMATCH, which will cause normal backtracking to
978 happen from now on. Otherwise, THEN is passed back to an outer
979 alternative. This implements Perl's treatment of parenthesized groups,
980 where a group not containing | does not affect the current alternative,
981 that is, (X) is NOT the same as (X|(*F)). */
982
983 if (rrc == MATCH_THEN)
984 {
985 next = ecode + GET(ecode,1);
986 if (md->start_match_ptr < next &&
987 (*ecode == OP_ALT || *next == OP_ALT))
988 rrc = MATCH_NOMATCH;
989 }
990
991 /* Anything other than NOMATCH is passed back. */
992
993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
994 md->capture_last = save_capture_last;
995 ecode += GET(ecode, 1);
996 md->mark = save_mark;
997 if (*ecode != OP_ALT) break;
998 }
999
1000 DPRINTF(("bracket %d failed\n", number));
1001 md->offset_vector[offset] = save_offset1;
1002 md->offset_vector[offset+1] = save_offset2;
1003 md->offset_vector[md->offset_end - number] = save_offset3;
1004
1005 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1006
1007 RRETURN(rrc);
1008 }
1009
1010 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1011 as a non-capturing bracket. */
1012
1013 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1014 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1015
1016 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1017
1018 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1019 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1020
1021 /* Non-capturing or atomic group, except for possessive with unlimited
1022 repeat and ONCE group with no captures. Loop for all the alternatives.
1023
1024 When we get to the final alternative within the brackets, we used to return
1025 the result of a recursive call to match() whatever happened so it was
1026 possible to reduce stack usage by turning this into a tail recursion,
1027 except in the case of a possibly empty group. However, now that there is
1028 the possibility of (*THEN) occurring in the final alternative, this
1029 optimization is no longer always possible.
1030
1031 We can optimize if we know there are no (*THEN)s in the pattern; at present
1032 this is the best that can be done.
1033
1034 MATCH_ONCE is returned when the end of an atomic group is successfully
1035 reached, but subsequent matching fails. It passes back up the tree (causing
1036 captured values to be reset) until the original atomic group level is
1037 reached. This is tested by comparing md->once_target with the start of the
1038 group. At this point, the return is converted into MATCH_NOMATCH so that
1039 previous backup points can be taken. */
1040
1041 case OP_ONCE:
1042 case OP_BRA:
1043 case OP_SBRA:
1044 DPRINTF(("start non-capturing bracket\n"));
1045
1046 for (;;)
1047 {
1048 if (op >= OP_SBRA || op == OP_ONCE)
1049 md->match_function_type = MATCH_CBEGROUP;
1050
1051 /* If this is not a possibly empty group, and there are no (*THEN)s in
1052 the pattern, and this is the final alternative, optimize as described
1053 above. */
1054
1055 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1056 {
1057 ecode += PRIV(OP_lengths)[*ecode];
1058 goto TAIL_RECURSE;
1059 }
1060
1061 /* In all other cases, we have to make another call to match(). */
1062
1063 save_mark = md->mark;
1064 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1065 RM2);
1066
1067 /* See comment in the code for capturing groups above about handling
1068 THEN. */
1069
1070 if (rrc == MATCH_THEN)
1071 {
1072 next = ecode + GET(ecode,1);
1073 if (md->start_match_ptr < next &&
1074 (*ecode == OP_ALT || *next == OP_ALT))
1075 rrc = MATCH_NOMATCH;
1076 }
1077
1078 if (rrc != MATCH_NOMATCH)
1079 {
1080 if (rrc == MATCH_ONCE)
1081 {
1082 const pcre_uchar *scode = ecode;
1083 if (*scode != OP_ONCE) /* If not at start, find it */
1084 {
1085 while (*scode == OP_ALT) scode += GET(scode, 1);
1086 scode -= GET(scode, 1);
1087 }
1088 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1089 }
1090 RRETURN(rrc);
1091 }
1092 ecode += GET(ecode, 1);
1093 md->mark = save_mark;
1094 if (*ecode != OP_ALT) break;
1095 }
1096
1097 RRETURN(MATCH_NOMATCH);
1098
1099 /* Handle possessive capturing brackets with an unlimited repeat. We come
1100 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1101 handled similarly to the normal case above. However, the matching is
1102 different. The end of these brackets will always be OP_KETRPOS, which
1103 returns MATCH_KETRPOS without going further in the pattern. By this means
1104 we can handle the group by iteration rather than recursion, thereby
1105 reducing the amount of stack needed. */
1106
1107 case OP_CBRAPOS:
1108 case OP_SCBRAPOS:
1109 allow_zero = FALSE;
1110
1111 POSSESSIVE_CAPTURE:
1112 number = GET2(ecode, 1+LINK_SIZE);
1113 offset = number << 1;
1114
1115 #ifdef PCRE_DEBUG
1116 printf("start possessive bracket %d\n", number);
1117 printf("subject=");
1118 pchars(eptr, 16, TRUE, md);
1119 printf("\n");
1120 #endif
1121
1122 if (offset < md->offset_max)
1123 {
1124 matched_once = FALSE;
1125 code_offset = (int)(ecode - md->start_code);
1126
1127 save_offset1 = md->offset_vector[offset];
1128 save_offset2 = md->offset_vector[offset+1];
1129 save_offset3 = md->offset_vector[md->offset_end - number];
1130 save_capture_last = md->capture_last;
1131
1132 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1133
1134 /* Each time round the loop, save the current subject position for use
1135 when the group matches. For MATCH_MATCH, the group has matched, so we
1136 restart it with a new subject starting position, remembering that we had
1137 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1138 usual. If we haven't matched any alternatives in any iteration, check to
1139 see if a previous iteration matched. If so, the group has matched;
1140 continue from afterwards. Otherwise it has failed; restore the previous
1141 capture values before returning NOMATCH. */
1142
1143 for (;;)
1144 {
1145 md->offset_vector[md->offset_end - number] =
1146 (int)(eptr - md->start_subject);
1147 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1148 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1149 eptrb, RM63);
1150 if (rrc == MATCH_KETRPOS)
1151 {
1152 offset_top = md->end_offset_top;
1153 eptr = md->end_match_ptr;
1154 ecode = md->start_code + code_offset;
1155 save_capture_last = md->capture_last;
1156 matched_once = TRUE;
1157 continue;
1158 }
1159
1160 /* See comment in the code for capturing groups above about handling
1161 THEN. */
1162
1163 if (rrc == MATCH_THEN)
1164 {
1165 next = ecode + GET(ecode,1);
1166 if (md->start_match_ptr < next &&
1167 (*ecode == OP_ALT || *next == OP_ALT))
1168 rrc = MATCH_NOMATCH;
1169 }
1170
1171 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1172 md->capture_last = save_capture_last;
1173 ecode += GET(ecode, 1);
1174 if (*ecode != OP_ALT) break;
1175 }
1176
1177 if (!matched_once)
1178 {
1179 md->offset_vector[offset] = save_offset1;
1180 md->offset_vector[offset+1] = save_offset2;
1181 md->offset_vector[md->offset_end - number] = save_offset3;
1182 }
1183
1184 if (allow_zero || matched_once)
1185 {
1186 ecode += 1 + LINK_SIZE;
1187 break;
1188 }
1189
1190 RRETURN(MATCH_NOMATCH);
1191 }
1192
1193 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1194 as a non-capturing bracket. */
1195
1196 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1198
1199 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1200
1201 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1202 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1203
1204 /* Non-capturing possessive bracket with unlimited repeat. We come here
1205 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1206 without the capturing complication. It is written out separately for speed
1207 and cleanliness. */
1208
1209 case OP_BRAPOS:
1210 case OP_SBRAPOS:
1211 allow_zero = FALSE;
1212
1213 POSSESSIVE_NON_CAPTURE:
1214 matched_once = FALSE;
1215 code_offset = (int)(ecode - md->start_code);
1216
1217 for (;;)
1218 {
1219 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1220 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1221 eptrb, RM48);
1222 if (rrc == MATCH_KETRPOS)
1223 {
1224 offset_top = md->end_offset_top;
1225 eptr = md->end_match_ptr;
1226 ecode = md->start_code + code_offset;
1227 matched_once = TRUE;
1228 continue;
1229 }
1230
1231 /* See comment in the code for capturing groups above about handling
1232 THEN. */
1233
1234 if (rrc == MATCH_THEN)
1235 {
1236 next = ecode + GET(ecode,1);
1237 if (md->start_match_ptr < next &&
1238 (*ecode == OP_ALT || *next == OP_ALT))
1239 rrc = MATCH_NOMATCH;
1240 }
1241
1242 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1243 ecode += GET(ecode, 1);
1244 if (*ecode != OP_ALT) break;
1245 }
1246
1247 if (matched_once || allow_zero)
1248 {
1249 ecode += 1 + LINK_SIZE;
1250 break;
1251 }
1252 RRETURN(MATCH_NOMATCH);
1253
1254 /* Control never reaches here. */
1255
1256 /* Conditional group: compilation checked that there are no more than
1257 two branches. If the condition is false, skipping the first branch takes us
1258 past the end if there is only one branch, but that's OK because that is
1259 exactly what going to the ket would do. */
1260
1261 case OP_COND:
1262 case OP_SCOND:
1263 codelink = GET(ecode, 1);
1264
1265 /* Because of the way auto-callout works during compile, a callout item is
1266 inserted between OP_COND and an assertion condition. */
1267
1268 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1269 {
1270 if (PUBL(callout) != NULL)
1271 {
1272 PUBL(callout_block) cb;
1273 cb.version = 2; /* Version 2 of the callout block */
1274 cb.callout_number = ecode[LINK_SIZE+2];
1275 cb.offset_vector = md->offset_vector;
1276 #ifdef COMPILE_PCRE8
1277 cb.subject = (PCRE_SPTR)md->start_subject;
1278 #else
1279 cb.subject = (PCRE_SPTR16)md->start_subject;
1280 #endif
1281 cb.subject_length = (int)(md->end_subject - md->start_subject);
1282 cb.start_match = (int)(mstart - md->start_subject);
1283 cb.current_position = (int)(eptr - md->start_subject);
1284 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1285 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1286 cb.capture_top = offset_top/2;
1287 cb.capture_last = md->capture_last;
1288 cb.callout_data = md->callout_data;
1289 cb.mark = md->nomatch_mark;
1290 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1291 if (rrc < 0) RRETURN(rrc);
1292 }
1293 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1294 }
1295
1296 condcode = ecode[LINK_SIZE+1];
1297
1298 /* Now see what the actual condition is */
1299
1300 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1301 {
1302 if (md->recursive == NULL) /* Not recursing => FALSE */
1303 {
1304 condition = FALSE;
1305 ecode += GET(ecode, 1);
1306 }
1307 else
1308 {
1309 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1310 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1311
1312 /* If the test is for recursion into a specific subpattern, and it is
1313 false, but the test was set up by name, scan the table to see if the
1314 name refers to any other numbers, and test them. The condition is true
1315 if any one is set. */
1316
1317 if (!condition && condcode == OP_NRREF)
1318 {
1319 pcre_uchar *slotA = md->name_table;
1320 for (i = 0; i < md->name_count; i++)
1321 {
1322 if (GET2(slotA, 0) == recno) break;
1323 slotA += md->name_entry_size;
1324 }
1325
1326 /* Found a name for the number - there can be only one; duplicate
1327 names for different numbers are allowed, but not vice versa. First
1328 scan down for duplicates. */
1329
1330 if (i < md->name_count)
1331 {
1332 pcre_uchar *slotB = slotA;
1333 while (slotB > md->name_table)
1334 {
1335 slotB -= md->name_entry_size;
1336 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1337 {
1338 condition = GET2(slotB, 0) == md->recursive->group_num;
1339 if (condition) break;
1340 }
1341 else break;
1342 }
1343
1344 /* Scan up for duplicates */
1345
1346 if (!condition)
1347 {
1348 slotB = slotA;
1349 for (i++; i < md->name_count; i++)
1350 {
1351 slotB += md->name_entry_size;
1352 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1353 {
1354 condition = GET2(slotB, 0) == md->recursive->group_num;
1355 if (condition) break;
1356 }
1357 else break;
1358 }
1359 }
1360 }
1361 }
1362
1363 /* Choose branch according to the condition */
1364
1365 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1366 }
1367 }
1368
1369 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1370 {
1371 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1372 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1373
1374 /* If the numbered capture is unset, but the reference was by name,
1375 scan the table to see if the name refers to any other numbers, and test
1376 them. The condition is true if any one is set. This is tediously similar
1377 to the code above, but not close enough to try to amalgamate. */
1378
1379 if (!condition && condcode == OP_NCREF)
1380 {
1381 int refno = offset >> 1;
1382 pcre_uchar *slotA = md->name_table;
1383
1384 for (i = 0; i < md->name_count; i++)
1385 {
1386 if (GET2(slotA, 0) == refno) break;
1387 slotA += md->name_entry_size;
1388 }
1389
1390 /* Found a name for the number - there can be only one; duplicate names
1391 for different numbers are allowed, but not vice versa. First scan down
1392 for duplicates. */
1393
1394 if (i < md->name_count)
1395 {
1396 pcre_uchar *slotB = slotA;
1397 while (slotB > md->name_table)
1398 {
1399 slotB -= md->name_entry_size;
1400 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1401 {
1402 offset = GET2(slotB, 0) << 1;
1403 condition = offset < offset_top &&
1404 md->offset_vector[offset] >= 0;
1405 if (condition) break;
1406 }
1407 else break;
1408 }
1409
1410 /* Scan up for duplicates */
1411
1412 if (!condition)
1413 {
1414 slotB = slotA;
1415 for (i++; i < md->name_count; i++)
1416 {
1417 slotB += md->name_entry_size;
1418 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1419 {
1420 offset = GET2(slotB, 0) << 1;
1421 condition = offset < offset_top &&
1422 md->offset_vector[offset] >= 0;
1423 if (condition) break;
1424 }
1425 else break;
1426 }
1427 }
1428 }
1429 }
1430
1431 /* Choose branch according to the condition */
1432
1433 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1434 }
1435
1436 else if (condcode == OP_DEF) /* DEFINE - always false */
1437 {
1438 condition = FALSE;
1439 ecode += GET(ecode, 1);
1440 }
1441
1442 /* The condition is an assertion. Call match() to evaluate it - setting
1443 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1444 an assertion. */
1445
1446 else
1447 {
1448 md->match_function_type = MATCH_CONDASSERT;
1449 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1450 if (rrc == MATCH_MATCH)
1451 {
1452 if (md->end_offset_top > offset_top)
1453 offset_top = md->end_offset_top; /* Captures may have happened */
1454 condition = TRUE;
1455 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1456 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1457 }
1458
1459 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1460 assertion; it is therefore treated as NOMATCH. */
1461
1462 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1463 {
1464 RRETURN(rrc); /* Need braces because of following else */
1465 }
1466 else
1467 {
1468 condition = FALSE;
1469 ecode += codelink;
1470 }
1471 }
1472
1473 /* We are now at the branch that is to be obeyed. As there is only one, can
1474 use tail recursion to avoid using another stack frame, except when there is
1475 unlimited repeat of a possibly empty group. In the latter case, a recursive
1476 call to match() is always required, unless the second alternative doesn't
1477 exist, in which case we can just plough on. Note that, for compatibility
1478 with Perl, the | in a conditional group is NOT treated as creating two
1479 alternatives. If a THEN is encountered in the branch, it propagates out to
1480 the enclosing alternative (unless nested in a deeper set of alternatives,
1481 of course). */
1482
1483 if (condition || *ecode == OP_ALT)
1484 {
1485 if (op != OP_SCOND)
1486 {
1487 ecode += 1 + LINK_SIZE;
1488 goto TAIL_RECURSE;
1489 }
1490
1491 md->match_function_type = MATCH_CBEGROUP;
1492 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1493 RRETURN(rrc);
1494 }
1495
1496 /* Condition false & no alternative; continue after the group. */
1497
1498 else
1499 {
1500 ecode += 1 + LINK_SIZE;
1501 }
1502 break;
1503
1504
1505 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1506 to close any currently open capturing brackets. */
1507
1508 case OP_CLOSE:
1509 number = GET2(ecode, 1);
1510 offset = number << 1;
1511
1512 #ifdef PCRE_DEBUG
1513 printf("end bracket %d at *ACCEPT", number);
1514 printf("\n");
1515 #endif
1516
1517 md->capture_last = number;
1518 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1519 {
1520 md->offset_vector[offset] =
1521 md->offset_vector[md->offset_end - number];
1522 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1523 if (offset_top <= offset) offset_top = offset + 2;
1524 }
1525 ecode += 1 + IMM2_SIZE;
1526 break;
1527
1528
1529 /* End of the pattern, either real or forced. */
1530
1531 case OP_END:
1532 case OP_ACCEPT:
1533 case OP_ASSERT_ACCEPT:
1534
1535 /* If we have matched an empty string, fail if not in an assertion and not
1536 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1537 is set and we have matched at the start of the subject. In both cases,
1538 backtracking will then try other alternatives, if any. */
1539
1540 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1541 md->recursive == NULL &&
1542 (md->notempty ||
1543 (md->notempty_atstart &&
1544 mstart == md->start_subject + md->start_offset)))
1545 RRETURN(MATCH_NOMATCH);
1546
1547 /* Otherwise, we have a match. */
1548
1549 md->end_match_ptr = eptr; /* Record where we ended */
1550 md->end_offset_top = offset_top; /* and how many extracts were taken */
1551 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1552
1553 /* For some reason, the macros don't work properly if an expression is
1554 given as the argument to RRETURN when the heap is in use. */
1555
1556 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1557 RRETURN(rrc);
1558
1559 /* Assertion brackets. Check the alternative branches in turn - the
1560 matching won't pass the KET for an assertion. If any one branch matches,
1561 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1562 start of each branch to move the current point backwards, so the code at
1563 this level is identical to the lookahead case. When the assertion is part
1564 of a condition, we want to return immediately afterwards. The caller of
1565 this incarnation of the match() function will have set MATCH_CONDASSERT in
1566 md->match_function_type, and one of these opcodes will be the first opcode
1567 that is processed. We use a local variable that is preserved over calls to
1568 match() to remember this case. */
1569
1570 case OP_ASSERT:
1571 case OP_ASSERTBACK:
1572 save_mark = md->mark;
1573 if (md->match_function_type == MATCH_CONDASSERT)
1574 {
1575 condassert = TRUE;
1576 md->match_function_type = 0;
1577 }
1578 else condassert = FALSE;
1579
1580 do
1581 {
1582 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1583 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1584 {
1585 mstart = md->start_match_ptr; /* In case \K reset it */
1586 break;
1587 }
1588 md->mark = save_mark;
1589
1590 /* A COMMIT failure must fail the entire assertion, without trying any
1591 subsequent branches. */
1592
1593 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1594
1595 /* PCRE does not allow THEN to escape beyond an assertion; it
1596 is treated as NOMATCH. */
1597
1598 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1599 ecode += GET(ecode, 1);
1600 }
1601 while (*ecode == OP_ALT);
1602
1603 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1604
1605 /* If checking an assertion for a condition, return MATCH_MATCH. */
1606
1607 if (condassert) RRETURN(MATCH_MATCH);
1608
1609 /* Continue from after the assertion, updating the offsets high water
1610 mark, since extracts may have been taken during the assertion. */
1611
1612 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1613 ecode += 1 + LINK_SIZE;
1614 offset_top = md->end_offset_top;
1615 continue;
1616
1617 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1618 PRUNE, or COMMIT means we must assume failure without checking subsequent
1619 branches. */
1620
1621 case OP_ASSERT_NOT:
1622 case OP_ASSERTBACK_NOT:
1623 save_mark = md->mark;
1624 if (md->match_function_type == MATCH_CONDASSERT)
1625 {
1626 condassert = TRUE;
1627 md->match_function_type = 0;
1628 }
1629 else condassert = FALSE;
1630
1631 do
1632 {
1633 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1634 md->mark = save_mark;
1635 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1636 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1637 {
1638 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1639 break;
1640 }
1641
1642 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1643 as NOMATCH. */
1644
1645 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1646 ecode += GET(ecode,1);
1647 }
1648 while (*ecode == OP_ALT);
1649
1650 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1651
1652 ecode += 1 + LINK_SIZE;
1653 continue;
1654
1655 /* Move the subject pointer back. This occurs only at the start of
1656 each branch of a lookbehind assertion. If we are too close to the start to
1657 move back, this match function fails. When working with UTF-8 we move
1658 back a number of characters, not bytes. */
1659
1660 case OP_REVERSE:
1661 #ifdef SUPPORT_UTF
1662 if (utf)
1663 {
1664 i = GET(ecode, 1);
1665 while (i-- > 0)
1666 {
1667 eptr--;
1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669 BACKCHAR(eptr);
1670 }
1671 }
1672 else
1673 #endif
1674
1675 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1676
1677 {
1678 eptr -= GET(ecode, 1);
1679 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1680 }
1681
1682 /* Save the earliest consulted character, then skip to next op code */
1683
1684 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1685 ecode += 1 + LINK_SIZE;
1686 break;
1687
1688 /* The callout item calls an external function, if one is provided, passing
1689 details of the match so far. This is mainly for debugging, though the
1690 function is able to force a failure. */
1691
1692 case OP_CALLOUT:
1693 if (PUBL(callout) != NULL)
1694 {
1695 PUBL(callout_block) cb;
1696 cb.version = 2; /* Version 2 of the callout block */
1697 cb.callout_number = ecode[1];
1698 cb.offset_vector = md->offset_vector;
1699 #ifdef COMPILE_PCRE8
1700 cb.subject = (PCRE_SPTR)md->start_subject;
1701 #else
1702 cb.subject = (PCRE_SPTR16)md->start_subject;
1703 #endif
1704 cb.subject_length = (int)(md->end_subject - md->start_subject);
1705 cb.start_match = (int)(mstart - md->start_subject);
1706 cb.current_position = (int)(eptr - md->start_subject);
1707 cb.pattern_position = GET(ecode, 2);
1708 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1709 cb.capture_top = offset_top/2;
1710 cb.capture_last = md->capture_last;
1711 cb.callout_data = md->callout_data;
1712 cb.mark = md->nomatch_mark;
1713 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1714 if (rrc < 0) RRETURN(rrc);
1715 }
1716 ecode += 2 + 2*LINK_SIZE;
1717 break;
1718
1719 /* Recursion either matches the current regex, or some subexpression. The
1720 offset data is the offset to the starting bracket from the start of the
1721 whole pattern. (This is so that it works from duplicated subpatterns.)
1722
1723 The state of the capturing groups is preserved over recursion, and
1724 re-instated afterwards. We don't know how many are started and not yet
1725 finished (offset_top records the completed total) so we just have to save
1726 all the potential data. There may be up to 65535 such values, which is too
1727 large to put on the stack, but using malloc for small numbers seems
1728 expensive. As a compromise, the stack is used when there are no more than
1729 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1730
1731 There are also other values that have to be saved. We use a chained
1732 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1733 for the original version of this logic. It has, however, been hacked around
1734 a lot, so he is not to blame for the current way it works. */
1735
1736 case OP_RECURSE:
1737 {
1738 recursion_info *ri;
1739 int recno;
1740
1741 callpat = md->start_code + GET(ecode, 1);
1742 recno = (callpat == md->start_code)? 0 :
1743 GET2(callpat, 1 + LINK_SIZE);
1744
1745 /* Check for repeating a recursion without advancing the subject pointer.
1746 This should catch convoluted mutual recursions. (Some simple cases are
1747 caught at compile time.) */
1748
1749 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1750 if (recno == ri->group_num && eptr == ri->subject_position)
1751 RRETURN(PCRE_ERROR_RECURSELOOP);
1752
1753 /* Add to "recursing stack" */
1754
1755 new_recursive.group_num = recno;
1756 new_recursive.subject_position = eptr;
1757 new_recursive.prevrec = md->recursive;
1758 md->recursive = &new_recursive;
1759
1760 /* Where to continue from afterwards */
1761
1762 ecode += 1 + LINK_SIZE;
1763
1764 /* Now save the offset data */
1765
1766 new_recursive.saved_max = md->offset_end;
1767 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1768 new_recursive.offset_save = stacksave;
1769 else
1770 {
1771 new_recursive.offset_save =
1772 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1773 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1774 }
1775 memcpy(new_recursive.offset_save, md->offset_vector,
1776 new_recursive.saved_max * sizeof(int));
1777
1778 /* OK, now we can do the recursion. After processing each alternative,
1779 restore the offset data. If there were nested recursions, md->recursive
1780 might be changed, so reset it before looping. */
1781
1782 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1783 cbegroup = (*callpat >= OP_SBRA);
1784 do
1785 {
1786 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1787 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1788 md, eptrb, RM6);
1789 memcpy(md->offset_vector, new_recursive.offset_save,
1790 new_recursive.saved_max * sizeof(int));
1791 md->recursive = new_recursive.prevrec;
1792 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1793 {
1794 DPRINTF(("Recursion matched\n"));
1795 if (new_recursive.offset_save != stacksave)
1796 (PUBL(free))(new_recursive.offset_save);
1797
1798 /* Set where we got to in the subject, and reset the start in case
1799 it was changed by \K. This *is* propagated back out of a recursion,
1800 for Perl compatibility. */
1801
1802 eptr = md->end_match_ptr;
1803 mstart = md->start_match_ptr;
1804 goto RECURSION_MATCHED; /* Exit loop; end processing */
1805 }
1806
1807 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1808 is treated as NOMATCH. */
1809
1810 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1811 rrc != MATCH_COMMIT)
1812 {
1813 DPRINTF(("Recursion gave error %d\n", rrc));
1814 if (new_recursive.offset_save != stacksave)
1815 (PUBL(free))(new_recursive.offset_save);
1816 RRETURN(rrc);
1817 }
1818
1819 md->recursive = &new_recursive;
1820 callpat += GET(callpat, 1);
1821 }
1822 while (*callpat == OP_ALT);
1823
1824 DPRINTF(("Recursion didn't match\n"));
1825 md->recursive = new_recursive.prevrec;
1826 if (new_recursive.offset_save != stacksave)
1827 (PUBL(free))(new_recursive.offset_save);
1828 RRETURN(MATCH_NOMATCH);
1829 }
1830
1831 RECURSION_MATCHED:
1832 break;
1833
1834 /* An alternation is the end of a branch; scan along to find the end of the
1835 bracketed group and go to there. */
1836
1837 case OP_ALT:
1838 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1839 break;
1840
1841 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1842 indicating that it may occur zero times. It may repeat infinitely, or not
1843 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1844 with fixed upper repeat limits are compiled as a number of copies, with the
1845 optional ones preceded by BRAZERO or BRAMINZERO. */
1846
1847 case OP_BRAZERO:
1848 next = ecode + 1;
1849 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1851 do next += GET(next, 1); while (*next == OP_ALT);
1852 ecode = next + 1 + LINK_SIZE;
1853 break;
1854
1855 case OP_BRAMINZERO:
1856 next = ecode + 1;
1857 do next += GET(next, 1); while (*next == OP_ALT);
1858 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1860 ecode++;
1861 break;
1862
1863 case OP_SKIPZERO:
1864 next = ecode+1;
1865 do next += GET(next,1); while (*next == OP_ALT);
1866 ecode = next + 1 + LINK_SIZE;
1867 break;
1868
1869 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1870 here; just jump to the group, with allow_zero set TRUE. */
1871
1872 case OP_BRAPOSZERO:
1873 op = *(++ecode);
1874 allow_zero = TRUE;
1875 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1876 goto POSSESSIVE_NON_CAPTURE;
1877
1878 /* End of a group, repeated or non-repeating. */
1879
1880 case OP_KET:
1881 case OP_KETRMIN:
1882 case OP_KETRMAX:
1883 case OP_KETRPOS:
1884 prev = ecode - GET(ecode, 1);
1885
1886 /* If this was a group that remembered the subject start, in order to break
1887 infinite repeats of empty string matches, retrieve the subject start from
1888 the chain. Otherwise, set it NULL. */
1889
1890 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1891 {
1892 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1893 eptrb = eptrb->epb_prev; /* Backup to previous group */
1894 }
1895 else saved_eptr = NULL;
1896
1897 /* If we are at the end of an assertion group or a non-capturing atomic
1898 group, stop matching and return MATCH_MATCH, but record the current high
1899 water mark for use by positive assertions. We also need to record the match
1900 start in case it was changed by \K. */
1901
1902 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1903 *prev == OP_ONCE_NC)
1904 {
1905 md->end_match_ptr = eptr; /* For ONCE_NC */
1906 md->end_offset_top = offset_top;
1907 md->start_match_ptr = mstart;
1908 RRETURN(MATCH_MATCH); /* Sets md->mark */
1909 }
1910
1911 /* For capturing groups we have to check the group number back at the start
1912 and if necessary complete handling an extraction by setting the offsets and
1913 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1914 into group 0, so it won't be picked up here. Instead, we catch it when the
1915 OP_END is reached. Other recursion is handled here. We just have to record
1916 the current subject position and start match pointer and give a MATCH
1917 return. */
1918
1919 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1920 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1921 {
1922 number = GET2(prev, 1+LINK_SIZE);
1923 offset = number << 1;
1924
1925 #ifdef PCRE_DEBUG
1926 printf("end bracket %d", number);
1927 printf("\n");
1928 #endif
1929
1930 /* Handle a recursively called group. */
1931
1932 if (md->recursive != NULL && md->recursive->group_num == number)
1933 {
1934 md->end_match_ptr = eptr;
1935 md->start_match_ptr = mstart;
1936 RRETURN(MATCH_MATCH);
1937 }
1938
1939 /* Deal with capturing */
1940
1941 md->capture_last = number;
1942 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1943 {
1944 /* If offset is greater than offset_top, it means that we are
1945 "skipping" a capturing group, and that group's offsets must be marked
1946 unset. In earlier versions of PCRE, all the offsets were unset at the
1947 start of matching, but this doesn't work because atomic groups and
1948 assertions can cause a value to be set that should later be unset.
1949 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1950 part of the atomic group, but this is not on the final matching path,
1951 so must be unset when 2 is set. (If there is no group 2, there is no
1952 problem, because offset_top will then be 2, indicating no capture.) */
1953
1954 if (offset > offset_top)
1955 {
1956 register int *iptr = md->offset_vector + offset_top;
1957 register int *iend = md->offset_vector + offset;
1958 while (iptr < iend) *iptr++ = -1;
1959 }
1960
1961 /* Now make the extraction */
1962
1963 md->offset_vector[offset] =
1964 md->offset_vector[md->offset_end - number];
1965 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1966 if (offset_top <= offset) offset_top = offset + 2;
1967 }
1968 }
1969
1970 /* For an ordinary non-repeating ket, just continue at this level. This
1971 also happens for a repeating ket if no characters were matched in the
1972 group. This is the forcible breaking of infinite loops as implemented in
1973 Perl 5.005. For a non-repeating atomic group that includes captures,
1974 establish a backup point by processing the rest of the pattern at a lower
1975 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1976 original OP_ONCE level, thereby bypassing intermediate backup points, but
1977 resetting any captures that happened along the way. */
1978
1979 if (*ecode == OP_KET || eptr == saved_eptr)
1980 {
1981 if (*prev == OP_ONCE)
1982 {
1983 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1985 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1986 RRETURN(MATCH_ONCE);
1987 }
1988 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1989 break;
1990 }
1991
1992 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1993 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1994 at a time from the outer level, thus saving stack. */
1995
1996 if (*ecode == OP_KETRPOS)
1997 {
1998 md->end_match_ptr = eptr;
1999 md->end_offset_top = offset_top;
2000 RRETURN(MATCH_KETRPOS);
2001 }
2002
2003 /* The normal repeating kets try the rest of the pattern or restart from
2004 the preceding bracket, in the appropriate order. In the second case, we can
2005 use tail recursion to avoid using another stack frame, unless we have an
2006 atomic group or an unlimited repeat of a group that can match an empty
2007 string. */
2008
2009 if (*ecode == OP_KETRMIN)
2010 {
2011 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2013 if (*prev == OP_ONCE)
2014 {
2015 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2017 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2018 RRETURN(MATCH_ONCE);
2019 }
2020 if (*prev >= OP_SBRA) /* Could match an empty string */
2021 {
2022 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2023 RRETURN(rrc);
2024 }
2025 ecode = prev;
2026 goto TAIL_RECURSE;
2027 }
2028 else /* OP_KETRMAX */
2029 {
2030 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2031 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2033 if (*prev == OP_ONCE)
2034 {
2035 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2036 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2037 md->once_target = prev;
2038 RRETURN(MATCH_ONCE);
2039 }
2040 ecode += 1 + LINK_SIZE;
2041 goto TAIL_RECURSE;
2042 }
2043 /* Control never gets here */
2044
2045 /* Not multiline mode: start of subject assertion, unless notbol. */
2046
2047 case OP_CIRC:
2048 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2049
2050 /* Start of subject assertion */
2051
2052 case OP_SOD:
2053 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2054 ecode++;
2055 break;
2056
2057 /* Multiline mode: start of subject unless notbol, or after any newline. */
2058
2059 case OP_CIRCM:
2060 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2061 if (eptr != md->start_subject &&
2062 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2063 RRETURN(MATCH_NOMATCH);
2064 ecode++;
2065 break;
2066
2067 /* Start of match assertion */
2068
2069 case OP_SOM:
2070 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2071 ecode++;
2072 break;
2073
2074 /* Reset the start of match point */
2075
2076 case OP_SET_SOM:
2077 mstart = eptr;
2078 ecode++;
2079 break;
2080
2081 /* Multiline mode: assert before any newline, or before end of subject
2082 unless noteol is set. */
2083
2084 case OP_DOLLM:
2085 if (eptr < md->end_subject)
2086 {
2087 if (!IS_NEWLINE(eptr))
2088 {
2089 if (md->partial != 0 &&
2090 eptr + 1 >= md->end_subject &&
2091 NLBLOCK->nltype == NLTYPE_FIXED &&
2092 NLBLOCK->nllen == 2 &&
2093 *eptr == NLBLOCK->nl[0])
2094 {
2095 md->hitend = TRUE;
2096 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2097 }
2098 RRETURN(MATCH_NOMATCH);
2099 }
2100 }
2101 else
2102 {
2103 if (md->noteol) RRETURN(MATCH_NOMATCH);
2104 SCHECK_PARTIAL();
2105 }
2106 ecode++;
2107 break;
2108
2109 /* Not multiline mode: assert before a terminating newline or before end of
2110 subject unless noteol is set. */
2111
2112 case OP_DOLL:
2113 if (md->noteol) RRETURN(MATCH_NOMATCH);
2114 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2115
2116 /* ... else fall through for endonly */
2117
2118 /* End of subject assertion (\z) */
2119
2120 case OP_EOD:
2121 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2122 SCHECK_PARTIAL();
2123 ecode++;
2124 break;
2125
2126 /* End of subject or ending \n assertion (\Z) */
2127
2128 case OP_EODN:
2129 ASSERT_NL_OR_EOS:
2130 if (eptr < md->end_subject &&
2131 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2132 {
2133 if (md->partial != 0 &&
2134 eptr + 1 >= md->end_subject &&
2135 NLBLOCK->nltype == NLTYPE_FIXED &&
2136 NLBLOCK->nllen == 2 &&
2137 *eptr == NLBLOCK->nl[0])
2138 {
2139 md->hitend = TRUE;
2140 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2141 }
2142 RRETURN(MATCH_NOMATCH);
2143 }
2144
2145 /* Either at end of string or \n before end. */
2146
2147 SCHECK_PARTIAL();
2148 ecode++;
2149 break;
2150
2151 /* Word boundary assertions */
2152
2153 case OP_NOT_WORD_BOUNDARY:
2154 case OP_WORD_BOUNDARY:
2155 {
2156
2157 /* Find out if the previous and current characters are "word" characters.
2158 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2159 be "non-word" characters. Remember the earliest consulted character for
2160 partial matching. */
2161
2162 #ifdef SUPPORT_UTF
2163 if (utf)
2164 {
2165 /* Get status of previous character */
2166
2167 if (eptr == md->start_subject) prev_is_word = FALSE; else
2168 {
2169 PCRE_PUCHAR lastptr = eptr - 1;
2170 BACKCHAR(lastptr);
2171 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2172 GETCHAR(c, lastptr);
2173 #ifdef SUPPORT_UCP
2174 if (md->use_ucp)
2175 {
2176 if (c == '_') prev_is_word = TRUE; else
2177 {
2178 int cat = UCD_CATEGORY(c);
2179 prev_is_word = (cat == ucp_L || cat == ucp_N);
2180 }
2181 }
2182 else
2183 #endif
2184 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2185 }
2186
2187 /* Get status of next character */
2188
2189 if (eptr >= md->end_subject)
2190 {
2191 SCHECK_PARTIAL();
2192 cur_is_word = FALSE;
2193 }
2194 else
2195 {
2196 GETCHAR(c, eptr);
2197 #ifdef SUPPORT_UCP
2198 if (md->use_ucp)
2199 {
2200 if (c == '_') cur_is_word = TRUE; else
2201 {
2202 int cat = UCD_CATEGORY(c);
2203 cur_is_word = (cat == ucp_L || cat == ucp_N);
2204 }
2205 }
2206 else
2207 #endif
2208 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2209 }
2210 }
2211 else
2212 #endif
2213
2214 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2215 consistency with the behaviour of \w we do use it in this case. */
2216
2217 {
2218 /* Get status of previous character */
2219
2220 if (eptr == md->start_subject) prev_is_word = FALSE; else
2221 {
2222 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2223 #ifdef SUPPORT_UCP
2224 if (md->use_ucp)
2225 {
2226 c = eptr[-1];
2227 if (c == '_') prev_is_word = TRUE; else
2228 {
2229 int cat = UCD_CATEGORY(c);
2230 prev_is_word = (cat == ucp_L || cat == ucp_N);
2231 }
2232 }
2233 else
2234 #endif
2235 prev_is_word = MAX_255(eptr[-1])
2236 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2237 }
2238
2239 /* Get status of next character */
2240
2241 if (eptr >= md->end_subject)
2242 {
2243 SCHECK_PARTIAL();
2244 cur_is_word = FALSE;
2245 }
2246 else
2247 #ifdef SUPPORT_UCP
2248 if (md->use_ucp)
2249 {
2250 c = *eptr;
2251 if (c == '_') cur_is_word = TRUE; else
2252 {
2253 int cat = UCD_CATEGORY(c);
2254 cur_is_word = (cat == ucp_L || cat == ucp_N);
2255 }
2256 }
2257 else
2258 #endif
2259 cur_is_word = MAX_255(*eptr)
2260 && ((md->ctypes[*eptr] & ctype_word) != 0);
2261 }
2262
2263 /* Now see if the situation is what we want */
2264
2265 if ((*ecode++ == OP_WORD_BOUNDARY)?
2266 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2267 RRETURN(MATCH_NOMATCH);
2268 }
2269 break;
2270
2271 /* Match any single character type except newline; have to take care with
2272 CRLF newlines and partial matching. */
2273
2274 case OP_ANY:
2275 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2276 if (md->partial != 0 &&
2277 eptr + 1 >= md->end_subject &&
2278 NLBLOCK->nltype == NLTYPE_FIXED &&
2279 NLBLOCK->nllen == 2 &&
2280 *eptr == NLBLOCK->nl[0])
2281 {
2282 md->hitend = TRUE;
2283 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2284 }
2285
2286 /* Fall through */
2287
2288 /* Match any single character whatsoever. */
2289
2290 case OP_ALLANY:
2291 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2292 { /* not be updated before SCHECK_PARTIAL. */
2293 SCHECK_PARTIAL();
2294 RRETURN(MATCH_NOMATCH);
2295 }
2296 eptr++;
2297 #ifdef SUPPORT_UTF
2298 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2299 #endif
2300 ecode++;
2301 break;
2302
2303 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2304 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2305
2306 case OP_ANYBYTE:
2307 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2308 { /* not be updated before SCHECK_PARTIAL. */
2309 SCHECK_PARTIAL();
2310 RRETURN(MATCH_NOMATCH);
2311 }
2312 eptr++;
2313 ecode++;
2314 break;
2315
2316 case OP_NOT_DIGIT:
2317 if (eptr >= md->end_subject)
2318 {
2319 SCHECK_PARTIAL();
2320 RRETURN(MATCH_NOMATCH);
2321 }
2322 GETCHARINCTEST(c, eptr);
2323 if (
2324 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2325 c < 256 &&
2326 #endif
2327 (md->ctypes[c] & ctype_digit) != 0
2328 )
2329 RRETURN(MATCH_NOMATCH);
2330 ecode++;
2331 break;
2332
2333 case OP_DIGIT:
2334 if (eptr >= md->end_subject)
2335 {
2336 SCHECK_PARTIAL();
2337 RRETURN(MATCH_NOMATCH);
2338 }
2339 GETCHARINCTEST(c, eptr);
2340 if (
2341 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2342 c > 255 ||
2343 #endif
2344 (md->ctypes[c] & ctype_digit) == 0
2345 )
2346 RRETURN(MATCH_NOMATCH);
2347 ecode++;
2348 break;
2349
2350 case OP_NOT_WHITESPACE:
2351 if (eptr >= md->end_subject)
2352 {
2353 SCHECK_PARTIAL();
2354 RRETURN(MATCH_NOMATCH);
2355 }
2356 GETCHARINCTEST(c, eptr);
2357 if (
2358 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2359 c < 256 &&
2360 #endif
2361 (md->ctypes[c] & ctype_space) != 0
2362 )
2363 RRETURN(MATCH_NOMATCH);
2364 ecode++;
2365 break;
2366
2367 case OP_WHITESPACE:
2368 if (eptr >= md->end_subject)
2369 {
2370 SCHECK_PARTIAL();
2371 RRETURN(MATCH_NOMATCH);
2372 }
2373 GETCHARINCTEST(c, eptr);
2374 if (
2375 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2376 c > 255 ||
2377 #endif
2378 (md->ctypes[c] & ctype_space) == 0
2379 )
2380 RRETURN(MATCH_NOMATCH);
2381 ecode++;
2382 break;
2383
2384 case OP_NOT_WORDCHAR:
2385 if (eptr >= md->end_subject)
2386 {
2387 SCHECK_PARTIAL();
2388 RRETURN(MATCH_NOMATCH);
2389 }
2390 GETCHARINCTEST(c, eptr);
2391 if (
2392 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2393 c < 256 &&
2394 #endif
2395 (md->ctypes[c] & ctype_word) != 0
2396 )
2397 RRETURN(MATCH_NOMATCH);
2398 ecode++;
2399 break;
2400
2401 case OP_WORDCHAR:
2402 if (eptr >= md->end_subject)
2403 {
2404 SCHECK_PARTIAL();
2405 RRETURN(MATCH_NOMATCH);
2406 }
2407 GETCHARINCTEST(c, eptr);
2408 if (
2409 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2410 c > 255 ||
2411 #endif
2412 (md->ctypes[c] & ctype_word) == 0
2413 )
2414 RRETURN(MATCH_NOMATCH);
2415 ecode++;
2416 break;
2417
2418 case OP_ANYNL:
2419 if (eptr >= md->end_subject)
2420 {
2421 SCHECK_PARTIAL();
2422 RRETURN(MATCH_NOMATCH);
2423 }
2424 GETCHARINCTEST(c, eptr);
2425 switch(c)
2426 {
2427 default: RRETURN(MATCH_NOMATCH);
2428
2429 case CHAR_CR:
2430 if (eptr >= md->end_subject)
2431 {
2432 SCHECK_PARTIAL();
2433 }
2434 else if (*eptr == CHAR_LF) eptr++;
2435 break;
2436
2437 case CHAR_LF:
2438 break;
2439
2440 case CHAR_VT:
2441 case CHAR_FF:
2442 case CHAR_NEL:
2443 #ifndef EBCDIC
2444 case 0x2028:
2445 case 0x2029:
2446 #endif /* Not EBCDIC */
2447 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2448 break;
2449 }
2450 ecode++;
2451 break;
2452
2453 case OP_NOT_HSPACE:
2454 if (eptr >= md->end_subject)
2455 {
2456 SCHECK_PARTIAL();
2457 RRETURN(MATCH_NOMATCH);
2458 }
2459 GETCHARINCTEST(c, eptr);
2460 switch(c)
2461 {
2462 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2463 default: break;
2464 }
2465 ecode++;
2466 break;
2467
2468 case OP_HSPACE:
2469 if (eptr >= md->end_subject)
2470 {
2471 SCHECK_PARTIAL();
2472 RRETURN(MATCH_NOMATCH);
2473 }
2474 GETCHARINCTEST(c, eptr);
2475 switch(c)
2476 {
2477 HSPACE_CASES: break; /* Byte and multibyte cases */
2478 default: RRETURN(MATCH_NOMATCH);
2479 }
2480 ecode++;
2481 break;
2482
2483 case OP_NOT_VSPACE:
2484 if (eptr >= md->end_subject)
2485 {
2486 SCHECK_PARTIAL();
2487 RRETURN(MATCH_NOMATCH);
2488 }
2489 GETCHARINCTEST(c, eptr);
2490 switch(c)
2491 {
2492 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2493 default: break;
2494 }
2495 ecode++;
2496 break;
2497
2498 case OP_VSPACE:
2499 if (eptr >= md->end_subject)
2500 {
2501 SCHECK_PARTIAL();
2502 RRETURN(MATCH_NOMATCH);
2503 }
2504 GETCHARINCTEST(c, eptr);
2505 switch(c)
2506 {
2507 VSPACE_CASES: break;
2508 default: RRETURN(MATCH_NOMATCH);
2509 }
2510 ecode++;
2511 break;
2512
2513 #ifdef SUPPORT_UCP
2514 /* Check the next character by Unicode property. We will get here only
2515 if the support is in the binary; otherwise a compile-time error occurs. */
2516
2517 case OP_PROP:
2518 case OP_NOTPROP:
2519 if (eptr >= md->end_subject)
2520 {
2521 SCHECK_PARTIAL();
2522 RRETURN(MATCH_NOMATCH);
2523 }
2524 GETCHARINCTEST(c, eptr);
2525 {
2526 const pcre_uint32 *cp;
2527 const ucd_record *prop = GET_UCD(c);
2528
2529 switch(ecode[1])
2530 {
2531 case PT_ANY:
2532 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2533 break;
2534
2535 case PT_LAMP:
2536 if ((prop->chartype == ucp_Lu ||
2537 prop->chartype == ucp_Ll ||
2538 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2539 RRETURN(MATCH_NOMATCH);
2540 break;
2541
2542 case PT_GC:
2543 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2544 RRETURN(MATCH_NOMATCH);
2545 break;
2546
2547 case PT_PC:
2548 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2549 RRETURN(MATCH_NOMATCH);
2550 break;
2551
2552 case PT_SC:
2553 if ((ecode[2] != prop->script) == (op == OP_PROP))
2554 RRETURN(MATCH_NOMATCH);
2555 break;
2556
2557 /* These are specials */
2558
2559 case PT_ALNUM:
2560 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2561 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2562 RRETURN(MATCH_NOMATCH);
2563 break;
2564
2565 case PT_SPACE: /* Perl space */
2566 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2567 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2568 == (op == OP_NOTPROP))
2569 RRETURN(MATCH_NOMATCH);
2570 break;
2571
2572 case PT_PXSPACE: /* POSIX space */
2573 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2574 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2575 c == CHAR_FF || c == CHAR_CR)
2576 == (op == OP_NOTPROP))
2577 RRETURN(MATCH_NOMATCH);
2578 break;
2579
2580 case PT_WORD:
2581 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2582 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2583 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2584 RRETURN(MATCH_NOMATCH);
2585 break;
2586
2587 case PT_CLIST:
2588 cp = PRIV(ucd_caseless_sets) + prop->caseset;
2589 for (;;)
2590 {
2591 if (c < *cp)
2592 { if (op == OP_PROP) RRETURN(MATCH_NOMATCH); else break; }
2593 if (c == *cp++)
2594 { if (op == OP_PROP) break; else RRETURN(MATCH_NOMATCH); }
2595 }
2596 break;
2597
2598 /* This should never occur */
2599
2600 default:
2601 RRETURN(PCRE_ERROR_INTERNAL);
2602 }
2603
2604 ecode += 3;
2605 }
2606 break;
2607
2608 /* Match an extended Unicode sequence. We will get here only if the support
2609 is in the binary; otherwise a compile-time error occurs. */
2610
2611 case OP_EXTUNI:
2612 if (eptr >= md->end_subject)
2613 {
2614 SCHECK_PARTIAL();
2615 RRETURN(MATCH_NOMATCH);
2616 }
2617 else
2618 {
2619 int lgb, rgb;
2620 GETCHARINCTEST(c, eptr);
2621 lgb = UCD_GRAPHBREAK(c);
2622 while (eptr < md->end_subject)
2623 {
2624 int len = 1;
2625 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2626 rgb = UCD_GRAPHBREAK(c);
2627 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2628 lgb = rgb;
2629 eptr += len;
2630 }
2631 }
2632 CHECK_PARTIAL();
2633 ecode++;
2634 break;
2635 #endif /* SUPPORT_UCP */
2636
2637
2638 /* Match a back reference, possibly repeatedly. Look past the end of the
2639 item to see if there is repeat information following. The code is similar
2640 to that for character classes, but repeated for efficiency. Then obey
2641 similar code to character type repeats - written out again for speed.
2642 However, if the referenced string is the empty string, always treat
2643 it as matched, any number of times (otherwise there could be infinite
2644 loops). */
2645
2646 case OP_REF:
2647 case OP_REFI:
2648 caseless = op == OP_REFI;
2649 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2650 ecode += 1 + IMM2_SIZE;
2651
2652 /* If the reference is unset, there are two possibilities:
2653
2654 (a) In the default, Perl-compatible state, set the length negative;
2655 this ensures that every attempt at a match fails. We can't just fail
2656 here, because of the possibility of quantifiers with zero minima.
2657
2658 (b) If the JavaScript compatibility flag is set, set the length to zero
2659 so that the back reference matches an empty string.
2660
2661 Otherwise, set the length to the length of what was matched by the
2662 referenced subpattern. */
2663
2664 if (offset >= offset_top || md->offset_vector[offset] < 0)
2665 length = (md->jscript_compat)? 0 : -1;
2666 else
2667 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2668
2669 /* Set up for repetition, or handle the non-repeated case */
2670
2671 switch (*ecode)
2672 {
2673 case OP_CRSTAR:
2674 case OP_CRMINSTAR:
2675 case OP_CRPLUS:
2676 case OP_CRMINPLUS:
2677 case OP_CRQUERY:
2678 case OP_CRMINQUERY:
2679 c = *ecode++ - OP_CRSTAR;
2680 minimize = (c & 1) != 0;
2681 min = rep_min[c]; /* Pick up values from tables; */
2682 max = rep_max[c]; /* zero for max => infinity */
2683 if (max == 0) max = INT_MAX;
2684 break;
2685
2686 case OP_CRRANGE:
2687 case OP_CRMINRANGE:
2688 minimize = (*ecode == OP_CRMINRANGE);
2689 min = GET2(ecode, 1);
2690 max = GET2(ecode, 1 + IMM2_SIZE);
2691 if (max == 0) max = INT_MAX;
2692 ecode += 1 + 2 * IMM2_SIZE;
2693 break;
2694
2695 default: /* No repeat follows */
2696 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2697 {
2698 if (length == -2) eptr = md->end_subject; /* Partial match */
2699 CHECK_PARTIAL();
2700 RRETURN(MATCH_NOMATCH);
2701 }
2702 eptr += length;
2703 continue; /* With the main loop */
2704 }
2705
2706 /* Handle repeated back references. If the length of the reference is
2707 zero, just continue with the main loop. If the length is negative, it
2708 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2709 zero, we can continue at the same level without recursion. For any other
2710 minimum, carrying on will result in NOMATCH. */
2711
2712 if (length == 0) continue;
2713 if (length < 0 && min == 0) continue;
2714
2715 /* First, ensure the minimum number of matches are present. We get back
2716 the length of the reference string explicitly rather than passing the
2717 address of eptr, so that eptr can be a register variable. */
2718
2719 for (i = 1; i <= min; i++)
2720 {
2721 int slength;
2722 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2723 {
2724 if (slength == -2) eptr = md->end_subject; /* Partial match */
2725 CHECK_PARTIAL();
2726 RRETURN(MATCH_NOMATCH);
2727 }
2728 eptr += slength;
2729 }
2730
2731 /* If min = max, continue at the same level without recursion.
2732 They are not both allowed to be zero. */
2733
2734 if (min == max) continue;
2735
2736 /* If minimizing, keep trying and advancing the pointer */
2737
2738 if (minimize)
2739 {
2740 for (fi = min;; fi++)
2741 {
2742 int slength;
2743 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2744 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2745 if (fi >= max) RRETURN(MATCH_NOMATCH);
2746 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2747 {
2748 if (slength == -2) eptr = md->end_subject; /* Partial match */
2749 CHECK_PARTIAL();
2750 RRETURN(MATCH_NOMATCH);
2751 }
2752 eptr += slength;
2753 }
2754 /* Control never gets here */
2755 }
2756
2757 /* If maximizing, find the longest string and work backwards */
2758
2759 else
2760 {
2761 pp = eptr;
2762 for (i = min; i < max; i++)
2763 {
2764 int slength;
2765 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2766 {
2767 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2768 the soft partial matching case. */
2769
2770 if (slength == -2 && md->partial != 0 &&
2771 md->end_subject > md->start_used_ptr)
2772 {
2773 md->hitend = TRUE;
2774 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2775 }
2776 break;
2777 }
2778 eptr += slength;
2779 }
2780
2781 while (eptr >= pp)
2782 {
2783 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2785 eptr -= length;
2786 }
2787 RRETURN(MATCH_NOMATCH);
2788 }
2789 /* Control never gets here */
2790
2791 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2792 used when all the characters in the class have values in the range 0-255,
2793 and either the matching is caseful, or the characters are in the range
2794 0-127 when UTF-8 processing is enabled. The only difference between
2795 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2796 encountered.
2797
2798 First, look past the end of the item to see if there is repeat information
2799 following. Then obey similar code to character type repeats - written out
2800 again for speed. */
2801
2802 case OP_NCLASS:
2803 case OP_CLASS:
2804 {
2805 /* The data variable is saved across frames, so the byte map needs to
2806 be stored there. */
2807 #define BYTE_MAP ((pcre_uint8 *)data)
2808 data = ecode + 1; /* Save for matching */
2809 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2810
2811 switch (*ecode)
2812 {
2813 case OP_CRSTAR:
2814 case OP_CRMINSTAR:
2815 case OP_CRPLUS:
2816 case OP_CRMINPLUS:
2817 case OP_CRQUERY:
2818 case OP_CRMINQUERY:
2819 c = *ecode++ - OP_CRSTAR;
2820 minimize = (c & 1) != 0;
2821 min = rep_min[c]; /* Pick up values from tables; */
2822 max = rep_max[c]; /* zero for max => infinity */
2823 if (max == 0) max = INT_MAX;
2824 break;
2825
2826 case OP_CRRANGE:
2827 case OP_CRMINRANGE:
2828 minimize = (*ecode == OP_CRMINRANGE);
2829 min = GET2(ecode, 1);
2830 max = GET2(ecode, 1 + IMM2_SIZE);
2831 if (max == 0) max = INT_MAX;
2832 ecode += 1 + 2 * IMM2_SIZE;
2833 break;
2834
2835 default: /* No repeat follows */
2836 min = max = 1;
2837 break;
2838 }
2839
2840 /* First, ensure the minimum number of matches are present. */
2841
2842 #ifdef SUPPORT_UTF
2843 if (utf)
2844 {
2845 for (i = 1; i <= min; i++)
2846 {
2847 if (eptr >= md->end_subject)
2848 {
2849 SCHECK_PARTIAL();
2850 RRETURN(MATCH_NOMATCH);
2851 }
2852 GETCHARINC(c, eptr);
2853 if (c > 255)
2854 {
2855 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2856 }
2857 else
2858 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2859 }
2860 }
2861 else
2862 #endif
2863 /* Not UTF mode */
2864 {
2865 for (i = 1; i <= min; i++)
2866 {
2867 if (eptr >= md->end_subject)
2868 {
2869 SCHECK_PARTIAL();
2870 RRETURN(MATCH_NOMATCH);
2871 }
2872 c = *eptr++;
2873 #ifndef COMPILE_PCRE8
2874 if (c > 255)
2875 {
2876 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2877 }
2878 else
2879 #endif
2880 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2881 }
2882 }
2883
2884 /* If max == min we can continue with the main loop without the
2885 need to recurse. */
2886
2887 if (min == max) continue;
2888
2889 /* If minimizing, keep testing the rest of the expression and advancing
2890 the pointer while it matches the class. */
2891
2892 if (minimize)
2893 {
2894 #ifdef SUPPORT_UTF
2895 if (utf)
2896 {
2897 for (fi = min;; fi++)
2898 {
2899 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2901 if (fi >= max) RRETURN(MATCH_NOMATCH);
2902 if (eptr >= md->end_subject)
2903 {
2904 SCHECK_PARTIAL();
2905 RRETURN(MATCH_NOMATCH);
2906 }
2907 GETCHARINC(c, eptr);
2908 if (c > 255)
2909 {
2910 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2911 }
2912 else
2913 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2914 }
2915 }
2916 else
2917 #endif
2918 /* Not UTF mode */
2919 {
2920 for (fi = min;; fi++)
2921 {
2922 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2923 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2924 if (fi >= max) RRETURN(MATCH_NOMATCH);
2925 if (eptr >= md->end_subject)
2926 {
2927 SCHECK_PARTIAL();
2928 RRETURN(MATCH_NOMATCH);
2929 }
2930 c = *eptr++;
2931 #ifndef COMPILE_PCRE8
2932 if (c > 255)
2933 {
2934 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2935 }
2936 else
2937 #endif
2938 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2939 }
2940 }
2941 /* Control never gets here */
2942 }
2943
2944 /* If maximizing, find the longest possible run, then work backwards. */
2945
2946 else
2947 {
2948 pp = eptr;
2949
2950 #ifdef SUPPORT_UTF
2951 if (utf)
2952 {
2953 for (i = min; i < max; i++)
2954 {
2955 int len = 1;
2956 if (eptr >= md->end_subject)
2957 {
2958 SCHECK_PARTIAL();
2959 break;
2960 }
2961 GETCHARLEN(c, eptr, len);
2962 if (c > 255)
2963 {
2964 if (op == OP_CLASS) break;
2965 }
2966 else
2967 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2968 eptr += len;
2969 }
2970 for (;;)
2971 {
2972 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2974 if (eptr-- == pp) break; /* Stop if tried at original pos */
2975 BACKCHAR(eptr);
2976 }
2977 }
2978 else
2979 #endif
2980 /* Not UTF mode */
2981 {
2982 for (i = min; i < max; i++)
2983 {
2984 if (eptr >= md->end_subject)
2985 {
2986 SCHECK_PARTIAL();
2987 break;
2988 }
2989 c = *eptr;
2990 #ifndef COMPILE_PCRE8
2991 if (c > 255)
2992 {
2993 if (op == OP_CLASS) break;
2994 }
2995 else
2996 #endif
2997 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2998 eptr++;
2999 }
3000 while (eptr >= pp)
3001 {
3002 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3004 eptr--;
3005 }
3006 }
3007
3008 RRETURN(MATCH_NOMATCH);
3009 }
3010 #undef BYTE_MAP
3011 }
3012 /* Control never gets here */
3013
3014
3015 /* Match an extended character class. This opcode is encountered only
3016 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3017 mode, because Unicode properties are supported in non-UTF-8 mode. */
3018
3019 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3020 case OP_XCLASS:
3021 {
3022 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3023 ecode += GET(ecode, 1); /* Advance past the item */
3024
3025 switch (*ecode)
3026 {
3027 case OP_CRSTAR:
3028 case OP_CRMINSTAR:
3029 case OP_CRPLUS:
3030 case OP_CRMINPLUS:
3031 case OP_CRQUERY:
3032 case OP_CRMINQUERY:
3033 c = *ecode++ - OP_CRSTAR;
3034 minimize = (c & 1) != 0;
3035 min = rep_min[c]; /* Pick up values from tables; */
3036 max = rep_max[c]; /* zero for max => infinity */
3037 if (max == 0) max = INT_MAX;
3038 break;
3039
3040 case OP_CRRANGE:
3041 case OP_CRMINRANGE:
3042 minimize = (*ecode == OP_CRMINRANGE);
3043 min = GET2(ecode, 1);
3044 max = GET2(ecode, 1 + IMM2_SIZE);
3045 if (max == 0) max = INT_MAX;
3046 ecode += 1 + 2 * IMM2_SIZE;
3047 break;
3048
3049 default: /* No repeat follows */
3050 min = max = 1;
3051 break;
3052 }
3053
3054 /* First, ensure the minimum number of matches are present. */
3055
3056 for (i = 1; i <= min; i++)
3057 {
3058 if (eptr >= md->end_subject)
3059 {
3060 SCHECK_PARTIAL();
3061 RRETURN(MATCH_NOMATCH);
3062 }
3063 GETCHARINCTEST(c, eptr);
3064 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3065 }
3066
3067 /* If max == min we can continue with the main loop without the
3068 need to recurse. */
3069
3070 if (min == max) continue;
3071
3072 /* If minimizing, keep testing the rest of the expression and advancing
3073 the pointer while it matches the class. */
3074
3075 if (minimize)
3076 {
3077 for (fi = min;; fi++)
3078 {
3079 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3080 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3081 if (fi >= max) RRETURN(MATCH_NOMATCH);
3082 if (eptr >= md->end_subject)
3083 {
3084 SCHECK_PARTIAL();
3085 RRETURN(MATCH_NOMATCH);
3086 }
3087 GETCHARINCTEST(c, eptr);
3088 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3089 }
3090 /* Control never gets here */
3091 }
3092
3093 /* If maximizing, find the longest possible run, then work backwards. */
3094
3095 else
3096 {
3097 pp = eptr;
3098 for (i = min; i < max; i++)
3099 {
3100 int len = 1;
3101 if (eptr >= md->end_subject)
3102 {
3103 SCHECK_PARTIAL();
3104 break;
3105 }
3106 #ifdef SUPPORT_UTF
3107 GETCHARLENTEST(c, eptr, len);
3108 #else
3109 c = *eptr;
3110 #endif
3111 if (!PRIV(xclass)(c, data, utf)) break;
3112 eptr += len;
3113 }
3114 for(;;)
3115 {
3116 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3117 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3118 if (eptr-- == pp) break; /* Stop if tried at original pos */
3119 #ifdef SUPPORT_UTF
3120 if (utf) BACKCHAR(eptr);
3121 #endif
3122 }
3123 RRETURN(MATCH_NOMATCH);
3124 }
3125
3126 /* Control never gets here */
3127 }
3128 #endif /* End of XCLASS */
3129
3130 /* Match a single character, casefully */
3131
3132 case OP_CHAR:
3133 #ifdef SUPPORT_UTF
3134 if (utf)
3135 {
3136 length = 1;
3137 ecode++;
3138 GETCHARLEN(fc, ecode, length);
3139 if (length > md->end_subject - eptr)
3140 {
3141 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3142 RRETURN(MATCH_NOMATCH);
3143 }
3144 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3145 }
3146 else
3147 #endif
3148 /* Not UTF mode */
3149 {
3150 if (md->end_subject - eptr < 1)
3151 {
3152 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3153 RRETURN(MATCH_NOMATCH);
3154 }
3155 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3156 ecode += 2;
3157 }
3158 break;
3159
3160 /* Match a single character, caselessly. If we are at the end of the
3161 subject, give up immediately. */
3162
3163 case OP_CHARI:
3164 if (eptr >= md->end_subject)
3165 {
3166 SCHECK_PARTIAL();
3167 RRETURN(MATCH_NOMATCH);
3168 }
3169
3170 #ifdef SUPPORT_UTF
3171 if (utf)
3172 {
3173 length = 1;
3174 ecode++;
3175 GETCHARLEN(fc, ecode, length);
3176
3177 /* If the pattern character's value is < 128, we have only one byte, and
3178 we know that its other case must also be one byte long, so we can use the
3179 fast lookup table. We know that there is at least one byte left in the
3180 subject. */
3181
3182 if (fc < 128)
3183 {
3184 if (md->lcc[fc]
3185 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3186 ecode++;
3187 eptr++;
3188 }
3189
3190 /* Otherwise we must pick up the subject character. Note that we cannot
3191 use the value of "length" to check for sufficient bytes left, because the
3192 other case of the character may have more or fewer bytes. */
3193
3194 else
3195 {
3196 unsigned int dc;
3197 GETCHARINC(dc, eptr);
3198 ecode += length;
3199
3200 /* If we have Unicode property support, we can use it to test the other
3201 case of the character, if there is one. */
3202
3203 if (fc != dc)
3204 {
3205 #ifdef SUPPORT_UCP
3206 if (dc != UCD_OTHERCASE(fc))
3207 #endif
3208 RRETURN(MATCH_NOMATCH);
3209 }
3210 }
3211 }
3212 else
3213 #endif /* SUPPORT_UTF */
3214
3215 /* Not UTF mode */
3216 {
3217 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3218 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3219 eptr++;
3220 ecode += 2;
3221 }
3222 break;
3223
3224 /* Match a single character repeatedly. */
3225
3226 case OP_EXACT:
3227 case OP_EXACTI:
3228 min = max = GET2(ecode, 1);
3229 ecode += 1 + IMM2_SIZE;
3230 goto REPEATCHAR;
3231
3232 case OP_POSUPTO:
3233 case OP_POSUPTOI:
3234 possessive = TRUE;
3235 /* Fall through */
3236
3237 case OP_UPTO:
3238 case OP_UPTOI:
3239 case OP_MINUPTO:
3240 case OP_MINUPTOI:
3241 min = 0;
3242 max = GET2(ecode, 1);
3243 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3244 ecode += 1 + IMM2_SIZE;
3245 goto REPEATCHAR;
3246
3247 case OP_POSSTAR:
3248 case OP_POSSTARI:
3249 possessive = TRUE;
3250 min = 0;
3251 max = INT_MAX;
3252 ecode++;
3253 goto REPEATCHAR;
3254
3255 case OP_POSPLUS:
3256 case OP_POSPLUSI:
3257 possessive = TRUE;
3258 min = 1;
3259 max = INT_MAX;
3260 ecode++;
3261 goto REPEATCHAR;
3262
3263 case OP_POSQUERY:
3264 case OP_POSQUERYI:
3265 possessive = TRUE;
3266 min = 0;
3267 max = 1;
3268 ecode++;
3269 goto REPEATCHAR;
3270
3271 case OP_STAR:
3272 case OP_STARI:
3273 case OP_MINSTAR:
3274 case OP_MINSTARI:
3275 case OP_PLUS:
3276 case OP_PLUSI:
3277 case OP_MINPLUS:
3278 case OP_MINPLUSI:
3279 case OP_QUERY:
3280 case OP_QUERYI:
3281 case OP_MINQUERY:
3282 case OP_MINQUERYI:
3283 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3284 minimize = (c & 1) != 0;
3285 min = rep_min[c]; /* Pick up values from tables; */
3286 max = rep_max[c]; /* zero for max => infinity */
3287 if (max == 0) max = INT_MAX;
3288
3289 /* Common code for all repeated single-character matches. */
3290
3291 REPEATCHAR:
3292 #ifdef SUPPORT_UTF
3293 if (utf)
3294 {
3295 length = 1;
3296 charptr = ecode;
3297 GETCHARLEN(fc, ecode, length);
3298 ecode += length;
3299
3300 /* Handle multibyte character matching specially here. There is
3301 support for caseless matching if UCP support is present. */
3302
3303 if (length > 1)
3304 {
3305 #ifdef SUPPORT_UCP
3306 unsigned int othercase;
3307 if (op >= OP_STARI && /* Caseless */
3308 (othercase = UCD_OTHERCASE(fc)) != fc)
3309 oclength = PRIV(ord2utf)(othercase, occhars);
3310 else oclength = 0;
3311 #endif /* SUPPORT_UCP */
3312
3313 for (i = 1; i <= min; i++)
3314 {
3315 if (eptr <= md->end_subject - length &&
3316 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3317 #ifdef SUPPORT_UCP
3318 else if (oclength > 0 &&
3319 eptr <= md->end_subject - oclength &&
3320 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3321 #endif /* SUPPORT_UCP */
3322 else
3323 {
3324 CHECK_PARTIAL();
3325 RRETURN(MATCH_NOMATCH);
3326 }
3327 }
3328
3329 if (min == max) continue;
3330
3331 if (minimize)
3332 {
3333 for (fi = min;; fi++)
3334 {
3335 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3336 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3337 if (fi >= max) RRETURN(MATCH_NOMATCH);
3338 if (eptr <= md->end_subject - length &&
3339 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3340 #ifdef SUPPORT_UCP
3341 else if (oclength > 0 &&
3342 eptr <= md->end_subject - oclength &&
3343 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3344 #endif /* SUPPORT_UCP */
3345 else
3346 {
3347 CHECK_PARTIAL();
3348 RRETURN(MATCH_NOMATCH);
3349 }
3350 }
3351 /* Control never gets here */
3352 }
3353
3354 else /* Maximize */
3355 {
3356 pp = eptr;
3357 for (i = min; i < max; i++)
3358 {
3359 if (eptr <= md->end_subject - length &&
3360 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3361 #ifdef SUPPORT_UCP
3362 else if (oclength > 0 &&
3363 eptr <= md->end_subject - oclength &&
3364 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3365 #endif /* SUPPORT_UCP */
3366 else
3367 {
3368 CHECK_PARTIAL();
3369 break;
3370 }
3371 }
3372
3373 if (possessive) continue;
3374
3375 for(;;)
3376 {
3377 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3378 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3379 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3380 #ifdef SUPPORT_UCP
3381 eptr--;
3382 BACKCHAR(eptr);
3383 #else /* without SUPPORT_UCP */
3384 eptr -= length;
3385 #endif /* SUPPORT_UCP */
3386 }
3387 }
3388 /* Control never gets here */
3389 }
3390
3391 /* If the length of a UTF-8 character is 1, we fall through here, and
3392 obey the code as for non-UTF-8 characters below, though in this case the
3393 value of fc will always be < 128. */
3394 }
3395 else
3396 #endif /* SUPPORT_UTF */
3397 /* When not in UTF-8 mode, load a single-byte character. */
3398 fc = *ecode++;
3399
3400 /* The value of fc at this point is always one character, though we may
3401 or may not be in UTF mode. The code is duplicated for the caseless and
3402 caseful cases, for speed, since matching characters is likely to be quite
3403 common. First, ensure the minimum number of matches are present. If min =
3404 max, continue at the same level without recursing. Otherwise, if
3405 minimizing, keep trying the rest of the expression and advancing one
3406 matching character if failing, up to the maximum. Alternatively, if
3407 maximizing, find the maximum number of characters and work backwards. */
3408
3409 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3410 max, (char *)eptr));
3411
3412 if (op >= OP_STARI) /* Caseless */
3413 {
3414 #ifdef COMPILE_PCRE8
3415 /* fc must be < 128 if UTF is enabled. */
3416 foc = md->fcc[fc];
3417 #else
3418 #ifdef SUPPORT_UTF
3419 #ifdef SUPPORT_UCP
3420 if (utf && fc > 127)
3421 foc = UCD_OTHERCASE(fc);
3422 #else
3423 if (utf && fc > 127)
3424 foc = fc;
3425 #endif /* SUPPORT_UCP */
3426 else
3427 #endif /* SUPPORT_UTF */
3428 foc = TABLE_GET(fc, md->fcc, fc);
3429 #endif /* COMPILE_PCRE8 */
3430
3431 for (i = 1; i <= min; i++)
3432 {
3433 if (eptr >= md->end_subject)
3434 {
3435 SCHECK_PARTIAL();
3436 RRETURN(MATCH_NOMATCH);
3437 }
3438 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3439 eptr++;
3440 }
3441 if (min == max) continue;
3442 if (minimize)
3443 {
3444 for (fi = min;; fi++)
3445 {
3446 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3448 if (fi >= max) RRETURN(MATCH_NOMATCH);
3449 if (eptr >= md->end_subject)
3450 {
3451 SCHECK_PARTIAL();
3452 RRETURN(MATCH_NOMATCH);
3453 }
3454 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3455 eptr++;
3456 }
3457 /* Control never gets here */
3458 }
3459 else /* Maximize */
3460 {
3461 pp = eptr;
3462 for (i = min; i < max; i++)
3463 {
3464 if (eptr >= md->end_subject)
3465 {
3466 SCHECK_PARTIAL();
3467 break;
3468 }
3469 if (fc != *eptr && foc != *eptr) break;
3470 eptr++;
3471 }
3472
3473 if (possessive) continue;
3474
3475 while (eptr >= pp)
3476 {
3477 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3478 eptr--;
3479 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3480 }
3481 RRETURN(MATCH_NOMATCH);
3482 }
3483 /* Control never gets here */
3484 }
3485
3486 /* Caseful comparisons (includes all multi-byte characters) */
3487
3488 else
3489 {
3490 for (i = 1; i <= min; i++)
3491 {
3492 if (eptr >= md->end_subject)
3493 {
3494 SCHECK_PARTIAL();
3495 RRETURN(MATCH_NOMATCH);
3496 }
3497 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3498 }
3499
3500 if (min == max) continue;
3501
3502 if (minimize)
3503 {
3504 for (fi = min;; fi++)
3505 {
3506 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3507 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508 if (fi >= max) RRETURN(MATCH_NOMATCH);
3509 if (eptr >= md->end_subject)
3510 {
3511 SCHECK_PARTIAL();
3512 RRETURN(MATCH_NOMATCH);
3513 }
3514 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3515 }
3516 /* Control never gets here */
3517 }
3518 else /* Maximize */
3519 {
3520 pp = eptr;
3521 for (i = min; i < max; i++)
3522 {
3523 if (eptr >= md->end_subject)
3524 {
3525 SCHECK_PARTIAL();
3526 break;
3527 }
3528 if (fc != *eptr) break;
3529 eptr++;
3530 }
3531 if (possessive) continue;
3532
3533 while (eptr >= pp)
3534 {
3535 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3536 eptr--;
3537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3538 }
3539 RRETURN(MATCH_NOMATCH);
3540 }
3541 }
3542 /* Control never gets here */
3543
3544 /* Match a negated single one-byte character. The character we are
3545 checking can be multibyte. */
3546
3547 case OP_NOT:
3548 case OP_NOTI:
3549 if (eptr >= md->end_subject)
3550 {
3551 SCHECK_PARTIAL();
3552 RRETURN(MATCH_NOMATCH);
3553 }
3554 #ifdef SUPPORT_UTF
3555 if (utf)
3556 {
3557 register unsigned int ch, och;
3558
3559 ecode++;
3560 GETCHARINC(ch, ecode);
3561 GETCHARINC(c, eptr);
3562
3563 if (op == OP_NOT)
3564 {
3565 if (ch == c) RRETURN(MATCH_NOMATCH);
3566 }
3567 else
3568 {
3569 #ifdef SUPPORT_UCP
3570 if (ch > 127)
3571 och = UCD_OTHERCASE(ch);
3572 #else
3573 if (ch > 127)
3574 och = ch;
3575 #endif /* SUPPORT_UCP */
3576 else
3577 och = TABLE_GET(ch, md->fcc, ch);
3578 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3579 }
3580 }
3581 else
3582 #endif
3583 {
3584 register unsigned int ch = ecode[1];
3585 c = *eptr++;
3586 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3587 RRETURN(MATCH_NOMATCH);
3588 ecode += 2;
3589 }
3590 break;
3591
3592 /* Match a negated single one-byte character repeatedly. This is almost a
3593 repeat of the code for a repeated single character, but I haven't found a
3594 nice way of commoning these up that doesn't require a test of the
3595 positive/negative option for each character match. Maybe that wouldn't add
3596 very much to the time taken, but character matching *is* what this is all
3597 about... */
3598
3599 case OP_NOTEXACT:
3600 case OP_NOTEXACTI:
3601 min = max = GET2(ecode, 1);
3602 ecode += 1 + IMM2_SIZE;
3603 goto REPEATNOTCHAR;
3604
3605 case OP_NOTUPTO:
3606 case OP_NOTUPTOI:
3607 case OP_NOTMINUPTO:
3608 case OP_NOTMINUPTOI:
3609 min = 0;
3610 max = GET2(ecode, 1);
3611 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3612 ecode += 1 + IMM2_SIZE;
3613 goto REPEATNOTCHAR;
3614
3615 case OP_NOTPOSSTAR:
3616 case OP_NOTPOSSTARI:
3617 possessive = TRUE;
3618 min = 0;
3619 max = INT_MAX;
3620 ecode++;
3621 goto REPEATNOTCHAR;
3622
3623 case OP_NOTPOSPLUS:
3624 case OP_NOTPOSPLUSI:
3625 possessive = TRUE;
3626 min = 1;
3627 max = INT_MAX;
3628 ecode++;
3629 goto REPEATNOTCHAR;
3630
3631 case OP_NOTPOSQUERY:
3632 case OP_NOTPOSQUERYI:
3633 possessive = TRUE;
3634 min = 0;
3635 max = 1;
3636 ecode++;
3637 goto REPEATNOTCHAR;
3638
3639 case OP_NOTPOSUPTO:
3640 case OP_NOTPOSUPTOI:
3641 possessive = TRUE;
3642 min = 0;
3643 max = GET2(ecode, 1);
3644 ecode += 1 + IMM2_SIZE;
3645 goto REPEATNOTCHAR;
3646
3647 case OP_NOTSTAR:
3648 case OP_NOTSTARI:
3649 case OP_NOTMINSTAR:
3650 case OP_NOTMINSTARI:
3651 case OP_NOTPLUS:
3652 case OP_NOTPLUSI:
3653 case OP_NOTMINPLUS:
3654 case OP_NOTMINPLUSI:
3655 case OP_NOTQUERY:
3656 case OP_NOTQUERYI:
3657 case OP_NOTMINQUERY:
3658 case OP_NOTMINQUERYI:
3659 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3660 minimize = (c & 1) != 0;
3661 min = rep_min[c]; /* Pick up values from tables; */
3662 max = rep_max[c]; /* zero for max => infinity */
3663 if (max == 0) max = INT_MAX;
3664
3665 /* Common code for all repeated single-byte matches. */
3666
3667 REPEATNOTCHAR:
3668 GETCHARINCTEST(fc, ecode);
3669
3670 /* The code is duplicated for the caseless and caseful cases, for speed,
3671 since matching characters is likely to be quite common. First, ensure the
3672 minimum number of matches are present. If min = max, continue at the same
3673 level without recursing. Otherwise, if minimizing, keep trying the rest of
3674 the expression and advancing one matching character if failing, up to the
3675 maximum. Alternatively, if maximizing, find the maximum number of
3676 characters and work backwards. */
3677
3678 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3679 max, (char *)eptr));
3680
3681 if (op >= OP_NOTSTARI) /* Caseless */
3682 {
3683 #ifdef SUPPORT_UTF
3684 #ifdef SUPPORT_UCP
3685 if (utf && fc > 127)
3686 foc = UCD_OTHERCASE(fc);
3687 #else
3688 if (utf && fc > 127)
3689 foc = fc;
3690 #endif /* SUPPORT_UCP */
3691 else
3692 #endif /* SUPPORT_UTF */
3693 foc = TABLE_GET(fc, md->fcc, fc);
3694
3695 #ifdef SUPPORT_UTF
3696 if (utf)
3697 {
3698 register unsigned int d;
3699 for (i = 1; i <= min; i++)
3700 {
3701 if (eptr >= md->end_subject)
3702 {
3703 SCHECK_PARTIAL();
3704 RRETURN(MATCH_NOMATCH);
3705 }
3706 GETCHARINC(d, eptr);
3707 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3708 }
3709 }
3710 else
3711 #endif
3712 /* Not UTF mode */
3713 {
3714 for (i = 1; i <= min; i++)
3715 {
3716 if (eptr >= md->end_subject)
3717 {
3718 SCHECK_PARTIAL();
3719 RRETURN(MATCH_NOMATCH);
3720 }
3721 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3722 eptr++;
3723 }
3724 }
3725
3726 if (min == max) continue;
3727
3728 if (minimize)
3729 {
3730 #ifdef SUPPORT_UTF
3731 if (utf)
3732 {
3733 register unsigned int d;
3734 for (fi = min;; fi++)
3735 {
3736 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3737 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3738 if (fi >= max) RRETURN(MATCH_NOMATCH);
3739 if (eptr >= md->end_subject)
3740 {
3741 SCHECK_PARTIAL();
3742 RRETURN(MATCH_NOMATCH);
3743 }
3744 GETCHARINC(d, eptr);
3745 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3746 }
3747 }
3748 else
3749 #endif
3750 /* Not UTF mode */
3751 {
3752 for (fi = min;; fi++)
3753 {
3754 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3755 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3756 if (fi >= max) RRETURN(MATCH_NOMATCH);
3757 if (eptr >= md->end_subject)
3758 {
3759 SCHECK_PARTIAL();
3760 RRETURN(MATCH_NOMATCH);
3761 }
3762 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3763 eptr++;
3764 }
3765 }
3766 /* Control never gets here */
3767 }
3768
3769 /* Maximize case */
3770
3771 else
3772 {
3773 pp = eptr;
3774
3775 #ifdef SUPPORT_UTF
3776 if (utf)
3777 {
3778 register unsigned int d;
3779 for (i = min; i < max; i++)
3780 {
3781 int len = 1;
3782 if (eptr >= md->end_subject)
3783 {
3784 SCHECK_PARTIAL();
3785 break;
3786 }
3787 GETCHARLEN(d, eptr, len);
3788 if (fc == d || (unsigned int)foc == d) break;
3789 eptr += len;
3790 }
3791 if (possessive) continue;
3792 for(;;)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 if (eptr-- == pp) break; /* Stop if tried at original pos */
3797 BACKCHAR(eptr);
3798 }
3799 }
3800 else
3801 #endif
3802 /* Not UTF mode */
3803 {
3804 for (i = min; i < max; i++)
3805 {
3806 if (eptr >= md->end_subject)
3807 {
3808 SCHECK_PARTIAL();
3809 break;
3810 }
3811 if (fc == *eptr || foc == *eptr) break;
3812 eptr++;
3813 }
3814 if (possessive) continue;
3815 while (eptr >= pp)
3816 {
3817 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3818 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3819 eptr--;
3820 }
3821 }
3822
3823 RRETURN(MATCH_NOMATCH);
3824 }
3825 /* Control never gets here */
3826 }
3827
3828 /* Caseful comparisons */
3829
3830 else
3831 {
3832 #ifdef SUPPORT_UTF
3833 if (utf)
3834 {
3835 register unsigned int d;
3836 for (i = 1; i <= min; i++)
3837 {
3838 if (eptr >= md->end_subject)
3839 {
3840 SCHECK_PARTIAL();
3841 RRETURN(MATCH_NOMATCH);
3842 }
3843 GETCHARINC(d, eptr);
3844 if (fc == d) RRETURN(MATCH_NOMATCH);
3845 }
3846 }
3847 else
3848 #endif
3849 /* Not UTF mode */
3850 {
3851 for (i = 1; i <= min; i++)
3852 {
3853 if (eptr >= md->end_subject)
3854 {
3855 SCHECK_PARTIAL();
3856 RRETURN(MATCH_NOMATCH);
3857 }
3858 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3859 }
3860 }
3861
3862 if (min == max) continue;
3863
3864 if (minimize)
3865 {
3866 #ifdef SUPPORT_UTF
3867 if (utf)
3868 {
3869 register unsigned int d;
3870 for (fi = min;; fi++)
3871 {
3872 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3874 if (fi >= max) RRETURN(MATCH_NOMATCH);
3875 if (eptr >= md->end_subject)
3876 {
3877 SCHECK_PARTIAL();
3878 RRETURN(MATCH_NOMATCH);
3879 }
3880 GETCHARINC(d, eptr);
3881 if (fc == d) RRETURN(MATCH_NOMATCH);
3882 }
3883 }
3884 else
3885 #endif
3886 /* Not UTF mode */
3887 {
3888 for (fi = min;; fi++)
3889 {
3890 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3892 if (fi >= max) RRETURN(MATCH_NOMATCH);
3893 if (eptr >= md->end_subject)
3894 {
3895 SCHECK_PARTIAL();
3896 RRETURN(MATCH_NOMATCH);
3897 }
3898 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3899 }
3900 }
3901 /* Control never gets here */
3902 }
3903
3904 /* Maximize case */
3905
3906 else
3907 {
3908 pp = eptr;
3909
3910 #ifdef SUPPORT_UTF
3911 if (utf)
3912 {
3913 register unsigned int d;
3914 for (i = min; i < max; i++)
3915 {
3916 int len = 1;
3917 if (eptr >= md->end_subject)
3918 {
3919 SCHECK_PARTIAL();
3920 break;
3921 }
3922 GETCHARLEN(d, eptr, len);
3923 if (fc == d) break;
3924 eptr += len;
3925 }
3926 if (possessive) continue;
3927 for(;;)
3928 {
3929 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3930 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3931 if (eptr-- == pp) break; /* Stop if tried at original pos */
3932 BACKCHAR(eptr);
3933 }
3934 }
3935 else
3936 #endif
3937 /* Not UTF mode */
3938 {
3939 for (i = min; i < max; i++)
3940 {
3941 if (eptr >= md->end_subject)
3942 {
3943 SCHECK_PARTIAL();
3944 break;
3945 }
3946 if (fc == *eptr) break;
3947 eptr++;
3948 }
3949 if (possessive) continue;
3950 while (eptr >= pp)
3951 {
3952 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3953 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3954 eptr--;
3955 }
3956 }
3957
3958 RRETURN(MATCH_NOMATCH);
3959 }
3960 }
3961 /* Control never gets here */
3962
3963 /* Match a single character type repeatedly; several different opcodes
3964 share code. This is very similar to the code for single characters, but we
3965 repeat it in the interests of efficiency. */
3966
3967 case OP_TYPEEXACT:
3968 min = max = GET2(ecode, 1);
3969 minimize = TRUE;
3970 ecode += 1 + IMM2_SIZE;
3971 goto REPEATTYPE;
3972
3973 case OP_TYPEUPTO:
3974 case OP_TYPEMINUPTO:
3975 min = 0;
3976 max = GET2(ecode, 1);
3977 minimize = *ecode == OP_TYPEMINUPTO;
3978 ecode += 1 + IMM2_SIZE;
3979 goto REPEATTYPE;
3980
3981 case OP_TYPEPOSSTAR:
3982 possessive = TRUE;
3983 min = 0;
3984 max = INT_MAX;
3985 ecode++;
3986 goto REPEATTYPE;
3987
3988 case OP_TYPEPOSPLUS:
3989 possessive = TRUE;
3990 min = 1;
3991 max = INT_MAX;
3992 ecode++;
3993 goto REPEATTYPE;
3994
3995 case OP_TYPEPOSQUERY:
3996 possessive = TRUE;
3997 min = 0;
3998 max = 1;
3999 ecode++;
4000 goto REPEATTYPE;
4001
4002 case OP_TYPEPOSUPTO:
4003 possessive = TRUE;
4004 min = 0;
4005 max = GET2(ecode, 1);
4006 ecode += 1 + IMM2_SIZE;
4007 goto REPEATTYPE;
4008
4009 case OP_TYPESTAR:
4010 case OP_TYPEMINSTAR:
4011 case OP_TYPEPLUS:
4012 case OP_TYPEMINPLUS:
4013 case OP_TYPEQUERY:
4014 case OP_TYPEMINQUERY:
4015 c = *ecode++ - OP_TYPESTAR;
4016 minimize = (c & 1) != 0;
4017 min = rep_min[c]; /* Pick up values from tables; */
4018 max = rep_max[c]; /* zero for max => infinity */
4019 if (max == 0) max = INT_MAX;
4020
4021 /* Common code for all repeated single character type matches. Note that
4022 in UTF-8 mode, '.' matches a character of any length, but for the other
4023 character types, the valid characters are all one-byte long. */
4024
4025 REPEATTYPE:
4026 ctype = *ecode++; /* Code for the character type */
4027
4028 #ifdef SUPPORT_UCP
4029 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4030 {
4031 prop_fail_result = ctype == OP_NOTPROP;
4032 prop_type = *ecode++;
4033 prop_value = *ecode++;
4034 }
4035 else prop_type = -1;
4036 #endif
4037
4038 /* First, ensure the minimum number of matches are present. Use inline
4039 code for maximizing the speed, and do the type test once at the start
4040 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4041 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4042 and single-bytes. */
4043
4044 if (min > 0)
4045 {
4046 #ifdef SUPPORT_UCP
4047 if (prop_type >= 0)
4048 {
4049 switch(prop_type)
4050 {
4051 case PT_ANY:
4052 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4053 for (i = 1; i <= min; i++)
4054 {
4055 if (eptr >= md->end_subject)
4056 {
4057 SCHECK_PARTIAL();
4058 RRETURN(MATCH_NOMATCH);
4059 }
4060 GETCHARINCTEST(c, eptr);
4061 }
4062 break;
4063
4064 case PT_LAMP:
4065 for (i = 1; i <= min; i++)
4066 {
4067 int chartype;
4068 if (eptr >= md->end_subject)
4069 {
4070 SCHECK_PARTIAL();
4071 RRETURN(MATCH_NOMATCH);
4072 }
4073 GETCHARINCTEST(c, eptr);
4074 chartype = UCD_CHARTYPE(c);
4075 if ((chartype == ucp_Lu ||
4076 chartype == ucp_Ll ||
4077 chartype == ucp_Lt) == prop_fail_result)
4078 RRETURN(MATCH_NOMATCH);
4079 }
4080 break;
4081
4082 case PT_GC:
4083 for (i = 1; i <= min; i++)
4084 {
4085 if (eptr >= md->end_subject)
4086 {
4087 SCHECK_PARTIAL();
4088 RRETURN(MATCH_NOMATCH);
4089 }
4090 GETCHARINCTEST(c, eptr);
4091 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4092 RRETURN(MATCH_NOMATCH);
4093 }
4094 break;
4095
4096 case PT_PC:
4097 for (i = 1; i <= min; i++)
4098 {
4099 if (eptr >= md->end_subject)
4100 {
4101 SCHECK_PARTIAL();
4102 RRETURN(MATCH_NOMATCH);
4103 }
4104 GETCHARINCTEST(c, eptr);
4105 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4106 RRETURN(MATCH_NOMATCH);
4107 }
4108 break;
4109
4110 case PT_SC:
4111 for (i = 1; i <= min; i++)
4112 {
4113 if (eptr >= md->end_subject)
4114 {
4115 SCHECK_PARTIAL();
4116 RRETURN(MATCH_NOMATCH);
4117 }
4118 GETCHARINCTEST(c, eptr);
4119 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4120 RRETURN(MATCH_NOMATCH);
4121 }
4122 break;
4123
4124 case PT_ALNUM:
4125 for (i = 1; i <= min; i++)
4126 {
4127 int category;
4128 if (eptr >= md->end_subject)
4129 {
4130 SCHECK_PARTIAL();
4131 RRETURN(MATCH_NOMATCH);
4132 }
4133 GETCHARINCTEST(c, eptr);
4134 category = UCD_CATEGORY(c);
4135 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4136 RRETURN(MATCH_NOMATCH);
4137 }
4138 break;
4139
4140 case PT_SPACE: /* Perl space */
4141 for (i = 1; i <= min; i++)
4142 {
4143 if (eptr >= md->end_subject)
4144 {
4145 SCHECK_PARTIAL();
4146 RRETURN(MATCH_NOMATCH);
4147 }
4148 GETCHARINCTEST(c, eptr);
4149 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4150 c == CHAR_FF || c == CHAR_CR)
4151 == prop_fail_result)
4152 RRETURN(MATCH_NOMATCH);
4153 }
4154 break;
4155
4156 case PT_PXSPACE: /* POSIX space */
4157 for (i = 1; i <= min; i++)
4158 {
4159 if (eptr >= md->end_subject)
4160 {
4161 SCHECK_PARTIAL();
4162 RRETURN(MATCH_NOMATCH);
4163 }
4164 GETCHARINCTEST(c, eptr);
4165 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4166 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4167 == prop_fail_result)
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 break;
4171
4172 case PT_WORD:
4173 for (i = 1; i <= min; i++)
4174 {
4175 int category;
4176 if (eptr >= md->end_subject)
4177 {
4178 SCHECK_PARTIAL();
4179 RRETURN(MATCH_NOMATCH);
4180 }
4181 GETCHARINCTEST(c, eptr);
4182 category = UCD_CATEGORY(c);
4183 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4184 == prop_fail_result)
4185 RRETURN(MATCH_NOMATCH);
4186 }
4187 break;
4188
4189 case PT_CLIST:
4190 for (i = 1; i <= min; i++)
4191 {
4192 const pcre_uint32 *cp;
4193 if (eptr >= md->end_subject)
4194 {
4195 SCHECK_PARTIAL();
4196 RRETURN(MATCH_NOMATCH);
4197 }
4198 GETCHARINCTEST(c, eptr);
4199 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
4200 for (;;)
4201 {
4202 if (c < *cp)
4203 { if (prop_fail_result) break; else RRETURN(MATCH_NOMATCH); }
4204 if (c == *cp++)
4205 { if (prop_fail_result) RRETURN(MATCH_NOMATCH); else break; }
4206 }
4207 }
4208 break;
4209
4210 /* This should not occur */
4211
4212 default:
4213 RRETURN(PCRE_ERROR_INTERNAL);
4214 }
4215 }
4216
4217 /* Match extended Unicode sequences. We will get here only if the
4218 support is in the binary; otherwise a compile-time error occurs. */
4219
4220 else if (ctype == OP_EXTUNI)
4221 {
4222 for (i = 1; i <= min; i++)
4223 {
4224 if (eptr >= md->end_subject)
4225 {
4226 SCHECK_PARTIAL();
4227 RRETURN(MATCH_NOMATCH);
4228 }
4229 else
4230 {
4231 int lgb, rgb;
4232 GETCHARINCTEST(c, eptr);
4233 lgb = UCD_GRAPHBREAK(c);
4234 while (eptr < md->end_subject)
4235 {
4236 int len = 1;
4237 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4238 rgb = UCD_GRAPHBREAK(c);
4239 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4240 lgb = rgb;
4241 eptr += len;
4242 }
4243 }
4244 CHECK_PARTIAL();
4245 }
4246 }
4247
4248 else
4249 #endif /* SUPPORT_UCP */
4250
4251 /* Handle all other cases when the coding is UTF-8 */
4252
4253 #ifdef SUPPORT_UTF
4254 if (utf) switch(ctype)
4255 {
4256 case OP_ANY:
4257 for (i = 1; i <= min; i++)
4258 {
4259 if (eptr >= md->end_subject)
4260 {
4261 SCHECK_PARTIAL();
4262 RRETURN(MATCH_NOMATCH);
4263 }
4264 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4265 if (md->partial != 0 &&
4266 eptr + 1 >= md->end_subject &&
4267 NLBLOCK->nltype == NLTYPE_FIXED &&
4268 NLBLOCK->nllen == 2 &&
4269 *eptr == NLBLOCK->nl[0])
4270 {
4271 md->hitend = TRUE;
4272 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4273 }
4274 eptr++;
4275 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4276 }
4277 break;
4278
4279 case OP_ALLANY:
4280 for (i = 1; i <= min; i++)
4281 {
4282 if (eptr >= md->end_subject)
4283 {
4284 SCHECK_PARTIAL();
4285 RRETURN(MATCH_NOMATCH);
4286 }
4287 eptr++;
4288 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4289 }
4290 break;
4291
4292 case OP_ANYBYTE:
4293 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4294 eptr += min;
4295 break;
4296
4297 case OP_ANYNL:
4298 for (i = 1; i <= min; i++)
4299 {
4300 if (eptr >= md->end_subject)
4301 {
4302 SCHECK_PARTIAL();
4303 RRETURN(MATCH_NOMATCH);
4304 }
4305 GETCHARINC(c, eptr);
4306 switch(c)
4307 {
4308 default: RRETURN(MATCH_NOMATCH);
4309
4310 case CHAR_CR:
4311 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4312 break;
4313
4314 case CHAR_LF:
4315 break;
4316
4317 case CHAR_VT:
4318 case CHAR_FF:
4319 case CHAR_NEL:
4320 #ifndef EBCDIC
4321 case 0x2028:
4322 case 0x2029:
4323 #endif /* Not EBCDIC */
4324 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4325 break;
4326 }
4327 }
4328 break;
4329
4330 case OP_NOT_HSPACE:
4331 for (i = 1; i <= min; i++)
4332 {
4333 if (eptr >= md->end_subject)
4334 {
4335 SCHECK_PARTIAL();
4336 RRETURN(MATCH_NOMATCH);
4337 }
4338 GETCHARINC(c, eptr);
4339 switch(c)
4340 {
4341 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4342 default: break;
4343 }
4344 }
4345 break;
4346
4347 case OP_HSPACE:
4348 for (i = 1; i <= min; i++)
4349 {
4350 if (eptr >= md->end_subject)
4351 {
4352 SCHECK_PARTIAL();
4353 RRETURN(MATCH_NOMATCH);
4354 }
4355 GETCHARINC(c, eptr);
4356 switch(c)
4357 {
4358 HSPACE_CASES: break; /* Byte and multibyte cases */
4359 default: RRETURN(MATCH_NOMATCH);
4360 }
4361 }
4362 break;
4363
4364 case OP_NOT_VSPACE:
4365 for (i = 1; i <= min; i++)
4366 {
4367 if (eptr >= md->end_subject)
4368 {
4369 SCHECK_PARTIAL();
4370 RRETURN(MATCH_NOMATCH);
4371 }
4372 GETCHARINC(c, eptr);
4373 switch(c)
4374 {
4375 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4376 default: break;
4377 }
4378 }
4379 break;
4380
4381 case OP_VSPACE:
4382 for (i = 1; i <= min; i++)
4383 {
4384 if (eptr >= md->end_subject)
4385 {
4386 SCHECK_PARTIAL();
4387 RRETURN(MATCH_NOMATCH);
4388 }
4389 GETCHARINC(c, eptr);
4390 switch(c)
4391 {
4392 VSPACE_CASES: break;
4393 default: RRETURN(MATCH_NOMATCH);
4394 }
4395 }
4396 break;
4397
4398 case OP_NOT_DIGIT:
4399 for (i = 1; i <= min; i++)
4400 {
4401 if (eptr >= md->end_subject)
4402 {
4403 SCHECK_PARTIAL();
4404 RRETURN(MATCH_NOMATCH);
4405 }
4406 GETCHARINC(c, eptr);
4407 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4408 RRETURN(MATCH_NOMATCH);
4409 }
4410 break;
4411
4412 case OP_DIGIT:
4413 for (i = 1; i <= min; i++)
4414 {
4415 if (eptr >= md->end_subject)
4416 {
4417 SCHECK_PARTIAL();
4418 RRETURN(MATCH_NOMATCH);
4419 }
4420 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4421 RRETURN(MATCH_NOMATCH);
4422 eptr++;
4423 /* No need to skip more bytes - we know it's a 1-byte character */
4424 }
4425 break;
4426
4427 case OP_NOT_WHITESPACE:
4428 for (i = 1; i <= min; i++)
4429 {
4430 if (eptr >= md->end_subject)
4431 {
4432 SCHECK_PARTIAL();
4433 RRETURN(MATCH_NOMATCH);
4434 }
4435 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4436 RRETURN(MATCH_NOMATCH);
4437 eptr++;
4438 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4439 }
4440 break;
4441
4442 case OP_WHITESPACE:
4443 for (i = 1; i <= min; i++)
4444 {
4445 if (eptr >= md->end_subject)
4446 {
4447 SCHECK_PARTIAL();
4448 RRETURN(MATCH_NOMATCH);
4449 }
4450 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4451 RRETURN(MATCH_NOMATCH);
4452 eptr++;
4453 /* No need to skip more bytes - we know it's a 1-byte character */
4454 }
4455 break;
4456
4457 case OP_NOT_WORDCHAR:
4458 for (i = 1; i <= min; i++)
4459 {
4460 if (eptr >= md->end_subject)
4461 {
4462 SCHECK_PARTIAL();
4463 RRETURN(MATCH_NOMATCH);
4464 }
4465 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4466 RRETURN(MATCH_NOMATCH);
4467 eptr++;
4468 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4469 }
4470 break;
4471
4472 case OP_WORDCHAR:
4473 for (i = 1; i <= min; i++)
4474 {
4475 if (eptr >= md->end_subject)
4476 {
4477 SCHECK_PARTIAL();
4478 RRETURN(MATCH_NOMATCH);
4479 }
4480 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4481 RRETURN(MATCH_NOMATCH);
4482 eptr++;
4483 /* No need to skip more bytes - we know it's a 1-byte character */
4484 }
4485 break;
4486
4487 default:
4488 RRETURN(PCRE_ERROR_INTERNAL);
4489 } /* End switch(ctype) */
4490
4491 else
4492 #endif /* SUPPORT_UTF */
4493
4494 /* Code for the non-UTF-8 case for minimum matching of operators other
4495 than OP_PROP and OP_NOTPROP. */
4496
4497 switch(ctype)
4498 {
4499 case OP_ANY:
4500 for (i = 1; i <= min; i++)
4501 {
4502 if (eptr >= md->end_subject)
4503 {
4504 SCHECK_PARTIAL();
4505 RRETURN(MATCH_NOMATCH);
4506 }
4507 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4508 if (md->partial != 0 &&
4509 eptr + 1 >= md->end_subject &&
4510 NLBLOCK->nltype == NLTYPE_FIXED &&
4511 NLBLOCK->nllen == 2 &&
4512 *eptr == NLBLOCK->nl[0])
4513 {
4514 md->hitend = TRUE;
4515 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4516 }
4517 eptr++;
4518 }
4519 break;
4520
4521 case OP_ALLANY:
4522 if (eptr > md->end_subject - min)
4523 {
4524 SCHECK_PARTIAL();
4525 RRETURN(MATCH_NOMATCH);
4526 }
4527 eptr += min;
4528 break;
4529
4530 case OP_ANYBYTE:
4531 if (eptr > md->end_subject - min)
4532 {
4533 SCHECK_PARTIAL();
4534 RRETURN(MATCH_NOMATCH);
4535 }
4536 eptr += min;
4537 break;
4538
4539 case OP_ANYNL:
4540 for (i = 1; i <= min; i++)
4541 {
4542 if (eptr >= md->end_subject)
4543 {
4544 SCHECK_PARTIAL();
4545 RRETURN(MATCH_NOMATCH);
4546 }
4547 switch(*eptr++)
4548 {
4549 default: RRETURN(MATCH_NOMATCH);
4550
4551 case CHAR_CR:
4552 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4553 break;
4554
4555 case CHAR_LF:
4556 break;
4557
4558 case CHAR_VT:
4559 case CHAR_FF:
4560 case CHAR_NEL:
4561 #ifdef COMPILE_PCRE16
4562 case 0x2028:
4563 case 0x2029:
4564 #endif
4565 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4566 break;
4567 }
4568 }
4569 break;
4570
4571 case OP_NOT_HSPACE:
4572 for (i = 1; i <= min; i++)
4573 {
4574 if (eptr >= md->end_subject)
4575 {
4576 SCHECK_PARTIAL();
4577 RRETURN(MATCH_NOMATCH);
4578 }
4579 switch(*eptr++)
4580 {
4581 default: break;
4582 HSPACE_BYTE_CASES:
4583 #ifdef COMPILE_PCRE16
4584 HSPACE_MULTIBYTE_CASES:
4585 #endif
4586 RRETURN(MATCH_NOMATCH);
4587 }
4588 }
4589 break;
4590
4591 case OP_HSPACE:
4592 for (i = 1; i <= min; i++)
4593 {
4594 if (eptr >= md->end_subject)
4595 {
4596 SCHECK_PARTIAL();
4597 RRETURN(MATCH_NOMATCH);
4598 }
4599 switch(*eptr++)
4600 {
4601 default: RRETURN(MATCH_NOMATCH);
4602 HSPACE_BYTE_CASES:
4603 #ifdef COMPILE_PCRE16
4604 HSPACE_MULTIBYTE_CASES:
4605 #endif
4606 break;
4607 }
4608 }
4609 break;
4610
4611 case OP_NOT_VSPACE:
4612 for (i = 1; i <= min; i++)
4613 {
4614 if (eptr >= md->end_subject)
4615 {
4616 SCHECK_PARTIAL();
4617 RRETURN(MATCH_NOMATCH);
4618 }
4619 switch(*eptr++)
4620 {
4621 VSPACE_BYTE_CASES:
4622 #ifdef COMPILE_PCRE16
4623 VSPACE_MULTIBYTE_CASES:
4624 #endif
4625 RRETURN(MATCH_NOMATCH);
4626 default: break;
4627 }
4628 }
4629 break;
4630
4631 case OP_VSPACE:
4632 for (i = 1; i <= min; i++)
4633 {
4634 if (eptr >= md->end_subject)
4635 {
4636 SCHECK_PARTIAL();
4637 RRETURN(MATCH_NOMATCH);
4638 }
4639 switch(*eptr++)
4640 {
4641 default: RRETURN(MATCH_NOMATCH);
4642 VSPACE_BYTE_CASES:
4643 #ifdef COMPILE_PCRE16
4644 VSPACE_MULTIBYTE_CASES:
4645 #endif
4646 break;
4647 }
4648 }
4649 break;
4650
4651 case OP_NOT_DIGIT:
4652 for (i = 1; i <= min; i++)
4653 {
4654 if (eptr >= md->end_subject)
4655 {
4656 SCHECK_PARTIAL();
4657 RRETURN(MATCH_NOMATCH);
4658 }
4659 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4660 RRETURN(MATCH_NOMATCH);
4661 eptr++;
4662 }
4663 break;
4664
4665 case OP_DIGIT:
4666 for (i = 1; i <= min; i++)
4667 {
4668 if (eptr >= md->end_subject)
4669 {
4670 SCHECK_PARTIAL();
4671 RRETURN(MATCH_NOMATCH);
4672 }
4673 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4674 RRETURN(MATCH_NOMATCH);
4675 eptr++;
4676 }
4677 break;
4678
4679 case OP_NOT_WHITESPACE:
4680 for (i = 1; i <= min; i++)
4681 {
4682 if (eptr >= md->end_subject)
4683 {
4684 SCHECK_PARTIAL();
4685 RRETURN(MATCH_NOMATCH);
4686 }
4687 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4688 RRETURN(MATCH_NOMATCH);
4689 eptr++;
4690 }
4691 break;
4692
4693 case OP_WHITESPACE:
4694 for (i = 1; i <= min; i++)
4695 {
4696 if (eptr >= md->end_subject)
4697 {
4698 SCHECK_PARTIAL();
4699 RRETURN(MATCH_NOMATCH);
4700 }
4701 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4702 RRETURN(MATCH_NOMATCH);
4703 eptr++;
4704 }
4705 break;
4706
4707 case OP_NOT_WORDCHAR:
4708 for (i = 1; i <= min; i++)
4709 {
4710 if (eptr >= md->end_subject)
4711 {
4712 SCHECK_PARTIAL();
4713 RRETURN(MATCH_NOMATCH);
4714 }
4715 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4716 RRETURN(MATCH_NOMATCH);
4717 eptr++;
4718 }
4719 break;
4720
4721 case OP_WORDCHAR:
4722 for (i = 1; i <= min; i++)
4723 {
4724 if (eptr >= md->end_subject)
4725 {
4726 SCHECK_PARTIAL();
4727 RRETURN(MATCH_NOMATCH);
4728 }
4729 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4730 RRETURN(MATCH_NOMATCH);
4731 eptr++;
4732 }
4733 break;
4734
4735 default:
4736 RRETURN(PCRE_ERROR_INTERNAL);
4737 }
4738 }
4739
4740 /* If min = max, continue at the same level without recursing */
4741
4742 if (min == max) continue;
4743
4744 /* If minimizing, we have to test the rest of the pattern before each
4745 subsequent match. Again, separate the UTF-8 case for speed, and also
4746 separate the UCP cases. */
4747
4748 if (minimize)
4749 {
4750 #ifdef SUPPORT_UCP
4751 if (prop_type >= 0)
4752 {
4753 switch(prop_type)
4754 {
4755 case PT_ANY:
4756 for (fi = min;; fi++)
4757 {
4758 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4760 if (fi >= max) RRETURN(MATCH_NOMATCH);
4761 if (eptr >= md->end_subject)
4762 {
4763 SCHECK_PARTIAL();
4764 RRETURN(MATCH_NOMATCH);
4765 }
4766 GETCHARINCTEST(c, eptr);
4767 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4768 }
4769 /* Control never gets here */
4770
4771 case PT_LAMP:
4772 for (fi = min;; fi++)
4773 {
4774 int chartype;
4775 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4776 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4777 if (fi >= max) RRETURN(MATCH_NOMATCH);
4778 if (eptr >= md->end_subject)
4779 {
4780 SCHECK_PARTIAL();
4781 RRETURN(MATCH_NOMATCH);
4782 }
4783 GETCHARINCTEST(c, eptr);
4784 chartype = UCD_CHARTYPE(c);
4785 if ((chartype == ucp_Lu ||
4786 chartype == ucp_Ll ||
4787 chartype == ucp_Lt) == prop_fail_result)
4788 RRETURN(MATCH_NOMATCH);
4789 }
4790 /* Control never gets here */
4791
4792 case PT_GC:
4793 for (fi = min;; fi++)
4794 {
4795 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4797 if (fi >= max) RRETURN(MATCH_NOMATCH);
4798 if (eptr >= md->end_subject)
4799 {
4800 SCHECK_PARTIAL();
4801 RRETURN(MATCH_NOMATCH);
4802 }
4803 GETCHARINCTEST(c, eptr);
4804 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4805 RRETURN(MATCH_NOMATCH);
4806 }
4807 /* Control never gets here */
4808
4809 case PT_PC:
4810 for (fi = min;; fi++)
4811 {
4812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4814 if (fi >= max) RRETURN(MATCH_NOMATCH);
4815 if (eptr >= md->end_subject)
4816 {
4817 SCHECK_PARTIAL();
4818 RRETURN(MATCH_NOMATCH);
4819 }
4820 GETCHARINCTEST(c, eptr);
4821 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4822 RRETURN(MATCH_NOMATCH);
4823 }
4824 /* Control never gets here */
4825
4826 case PT_SC:
4827 for (fi = min;; fi++)
4828 {
4829 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4831 if (fi >= max) RRETURN(MATCH_NOMATCH);
4832 if (eptr >= md->end_subject)
4833 {
4834 SCHECK_PARTIAL();
4835 RRETURN(MATCH_NOMATCH);
4836 }
4837 GETCHARINCTEST(c, eptr);
4838 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4839 RRETURN(MATCH_NOMATCH);
4840 }
4841 /* Control never gets here */
4842
4843 case PT_ALNUM:
4844 for (fi = min;; fi++)
4845 {
4846 int category;
4847 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4848 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4849 if (fi >= max) RRETURN(MATCH_NOMATCH);
4850 if (eptr >= md->end_subject)
4851 {
4852 SCHECK_PARTIAL();
4853 RRETURN(MATCH_NOMATCH);
4854 }
4855 GETCHARINCTEST(c, eptr);
4856 category = UCD_CATEGORY(c);
4857 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4858 RRETURN(MATCH_NOMATCH);
4859 }
4860 /* Control never gets here */
4861
4862 case PT_SPACE: /* Perl space */
4863 for (fi = min;; fi++)
4864 {
4865 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4867 if (fi >= max) RRETURN(MATCH_NOMATCH);
4868 if (eptr >= md->end_subject)
4869 {
4870 SCHECK_PARTIAL();
4871 RRETURN(MATCH_NOMATCH);
4872 }
4873 GETCHARINCTEST(c, eptr);
4874 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4875 c == CHAR_FF || c == CHAR_CR)
4876 == prop_fail_result)
4877 RRETURN(MATCH_NOMATCH);
4878 }
4879 /* Control never gets here */
4880
4881 case PT_PXSPACE: /* POSIX space */
4882 for (fi = min;; fi++)
4883 {
4884 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4886 if (fi >= max) RRETURN(MATCH_NOMATCH);
4887 if (eptr >= md->end_subject)
4888 {
4889 SCHECK_PARTIAL();
4890 RRETURN(MATCH_NOMATCH);
4891 }
4892 GETCHARINCTEST(c, eptr);
4893 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4894 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4895 == prop_fail_result)
4896 RRETURN(MATCH_NOMATCH);
4897 }
4898 /* Control never gets here */
4899
4900 case PT_WORD:
4901 for (fi = min;; fi++)
4902 {
4903 int category;
4904 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4905 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4906 if (fi >= max) RRETURN(MATCH_NOMATCH);
4907 if (eptr >= md->end_subject)
4908 {
4909 SCHECK_PARTIAL();
4910 RRETURN(MATCH_NOMATCH);
4911 }
4912 GETCHARINCTEST(c, eptr);
4913 category = UCD_CATEGORY(c);
4914 if ((category == ucp_L ||
4915 category == ucp_N ||
4916 c == CHAR_UNDERSCORE)
4917 == prop_fail_result)
4918 RRETURN(MATCH_NOMATCH);
4919 }
4920 /* Control never gets here */
4921
4922 case PT_CLIST:
/* Minimizing repeat of a caseless-set property item. Each time round the
loop, the rest of the pattern is tried first (RMATCH); only if that gives
MATCH_NOMATCH, and fewer than max characters have been taken, is one more
character consumed and tested against the set. prop_fail_result selects
which outcome of the membership test is the failing one.

NOTE(review): the RMATCH tag RM62 is also used by the PT_WORD case just
above. If RMATCH needs a distinct tag for every call site, this call needs
a tag of its own; that has to be allocated where the RM tags are declared,
which is outside this part of the file. */
4923 for (fi = min;; fi++)
4924 {
4925 const pcre_uint32 *cp;
4926 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4928 if (fi >= max) RRETURN(MATCH_NOMATCH);
4929 if (eptr >= md->end_subject)
4930 {
4931 SCHECK_PARTIAL();
4932 RRETURN(MATCH_NOMATCH);
4933 }
4934 GETCHARINCTEST(c, eptr);
/* Search the list named by the pattern item. prop_value is taken to be the
offset of that list within ucd_caseless_sets, in the same way that it
carries the value to compare against for PT_GC, PT_PC and PT_SC. Choosing
the list from the subject character instead would ignore what the pattern
asked for. */
4935 cp = PRIV(ucd_caseless_sets) + prop_value;
/* The scan gives up at the first entry greater than c, so it relies on the
list being in ascending order and ending with an entry larger than any
character (presumably a terminator - confirm against the table). The
RRETURNs are braced so that the if/else stays well-formed even if RRETURN
expands to a compound statement. */
4936 for (;;)
4937 {
4938 if (c < *cp)
4939 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4940 if (c == *cp++)
4941 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4942 }
4943 }
4944 /* Control never gets here */
4945
4946 /* This should never occur */
4947 default:
4948 RRETURN(PCRE_ERROR_INTERNAL);
4949 }
4950 }
4951
4952 /* Match extended Unicode sequences. We will get here only if the
4953 support is in the binary; otherwise a compile-time error occurs. */
4954
4955 else if (ctype == OP_EXTUNI)
4956 {
4957 for (fi = min;; fi++)
4958 {
4959 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4961 if (fi >= max) RRETURN(MATCH_NOMATCH);
4962 if (eptr >= md->end_subject)
4963 {
4964 SCHECK_PARTIAL();
4965 RRETURN(MATCH_NOMATCH);
4966 }
4967 else
4968 {
4969 int lgb, rgb;
4970 GETCHARINCTEST(c, eptr);
4971 lgb = UCD_GRAPHBREAK(c);
4972 while (eptr < md->end_subject)
4973 {
4974 int len = 1;
4975 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4976 rgb = UCD_GRAPHBREAK(c);
4977 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4978 lgb = rgb;
4979 eptr += len;
4980 }
4981 }
4982 CHECK_PARTIAL();
4983 }
4984 }
4985 else
4986 #endif /* SUPPORT_UCP */
4987
4988 #ifdef SUPPORT_UTF
4989 if (utf)
4990 {
4991 for (fi = min;; fi++)
4992 {
4993 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4994 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4995 if (fi >= max) RRETURN(MATCH_NOMATCH);
4996 if (eptr >= md->end_subject)
4997 {
4998 SCHECK_PARTIAL();
4999 RRETURN(MATCH_NOMATCH);
5000 }
5001 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5002 RRETURN(MATCH_NOMATCH);
5003 GETCHARINC(c, eptr);
5004 switch(ctype)
5005 {
5006 case OP_ANY: /* This is the non-NL case */
5007 if (md->partial != 0 && /* Take care with CRLF partial */
5008 eptr >= md->end_subject &&
5009 NLBLOCK->nltype == NLTYPE_FIXED &&
5010 NLBLOCK->nllen == 2 &&
5011 c == NLBLOCK->nl[0])
5012 {
5013 md->hitend = TRUE;
5014 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5015 }
5016 break;
5017
5018 case OP_ALLANY:
5019 case OP_ANYBYTE:
5020 break;
5021
5022 case OP_ANYNL:
5023 switch(c)
5024 {
5025 default: RRETURN(MATCH_NOMATCH);
5026 case CHAR_CR:
5027 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5028 break;
5029
5030 case CHAR_LF:
5031 break;
5032
5033 case CHAR_VT:
5034 case CHAR_FF:
5035 case CHAR_NEL:
5036 #ifndef EBCDIC
5037 case 0x2028:
5038 case 0x2029:
5039 #endif /* Not EBCDIC */
5040 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5041 break;
5042 }
5043 break;
5044
5045 case OP_NOT_HSPACE:
5046 switch(c)
5047 {
5048 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5049 default: break;
5050 }
5051 break;
5052
5053 case OP_HSPACE:
5054 switch(c)
5055 {
5056 HSPACE_CASES: break;
5057 default: RRETURN(MATCH_NOMATCH);
5058 }
5059 break;
5060
5061 case OP_NOT_VSPACE:
5062 switch(c)
5063 {
5064 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5065 default: break;
5066 }
5067 break;
5068
5069 case OP_VSPACE:
5070 switch(c)
5071 {
5072 VSPACE_CASES: break;
5073 default: RRETURN(MATCH_NOMATCH);
5074 }
5075 break;
5076
5077 case OP_NOT_DIGIT:
5078 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5079 RRETURN(MATCH_NOMATCH);
5080 break;
5081
5082 case OP_DIGIT:
5083 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5084 RRETURN(MATCH_NOMATCH);
5085 break;
5086
5087 case OP_NOT_WHITESPACE:
5088 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5089 RRETURN(MATCH_NOMATCH);
5090 break;
5091
5092 case OP_WHITESPACE:
5093 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5094 RRETURN(MATCH_NOMATCH);
5095 break;
5096
5097 case OP_NOT_WORDCHAR:
5098 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5099 RRETURN(MATCH_NOMATCH);
5100 break;
5101
5102 case OP_WORDCHAR:
5103 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5104 RRETURN(MATCH_NOMATCH);
5105 break;
5106
5107 default:
5108 RRETURN(PCRE_ERROR_INTERNAL);
5109 }
5110 }
5111 }
5112 else
5113 #endif
5114 /* Not UTF mode */
5115 {
5116 for (fi = min;; fi++)
5117 {
5118 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5119 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5120 if (fi >= max) RRETURN(MATCH_NOMATCH);
5121 if (eptr >= md->end_subject)
5122 {
5123 SCHECK_PARTIAL();
5124 RRETURN(MATCH_NOMATCH);
5125 }
5126 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5127 RRETURN(MATCH_NOMATCH);
5128 c = *eptr++;
5129 switch(ctype)
5130 {
5131 case OP_ANY: /* This is the non-NL case */
5132 if (md->partial != 0 && /* Take care with CRLF partial */
5133 eptr >= md->end_subject &&
5134 NLBLOCK->nltype == NLTYPE_FIXED &&
5135 NLBLOCK->nllen == 2 &&
5136 c == NLBLOCK->nl[0])
5137 {
5138 md->hitend = TRUE;
5139 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5140 }
5141 break;
5142
5143 case OP_ALLANY:
5144 case OP_ANYBYTE:
5145 break;
5146
5147 case OP_ANYNL:
5148 switch(c)
5149 {
5150 default: RRETURN(MATCH_NOMATCH);
5151 case CHAR_CR:
5152 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5153 break;
5154
5155 case CHAR_LF:
5156 break;
5157
5158 case CHAR_VT:
5159 case CHAR_FF:
5160 case CHAR_NEL:
5161 #ifdef COMPILE_PCRE16
5162 case 0x2028:
5163 case 0x2029:
5164 #endif
5165 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5166 break;
5167 }
5168 break;
5169
5170 case OP_NOT_HSPACE:
5171 switch(c)
5172 {
5173 default: break;
5174 HSPACE_BYTE_CASES:
5175 #ifdef COMPILE_PCRE16
5176 HSPACE_MULTIBYTE_CASES:
5177 #endif
5178 RRETURN(MATCH_NOMATCH);
5179 }
5180 break;
5181
5182 case OP_HSPACE:
5183 switch(c)
5184 {
5185 default: RRETURN(MATCH_NOMATCH);
5186 HSPACE_BYTE_CASES:
5187 #ifdef COMPILE_PCRE16
5188 HSPACE_MULTIBYTE_CASES:
5189 #endif
5190 break;
5191 }
5192 break;
5193
5194 case OP_NOT_VSPACE:
5195 switch(c)
5196 {
5197 default: break;
5198 VSPACE_BYTE_CASES:
5199 #ifdef COMPILE_PCRE16
5200 VSPACE_MULTIBYTE_CASES:
5201 #endif
5202 RRETURN(MATCH_NOMATCH);
5203 }
5204 break;
5205
5206 case OP_VSPACE:
5207 switch(c)
5208 {
5209 default: RRETURN(MATCH_NOMATCH);
5210 VSPACE_BYTE_CASES:
5211 #ifdef COMPILE_PCRE16
5212 VSPACE_MULTIBYTE_CASES:
5213 #endif
5214 break;
5215 }
5216 break;
5217
5218 case OP_NOT_DIGIT:
5219 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5220 break;
5221
5222 case OP_DIGIT:
5223 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5224 break;
5225
5226 case OP_NOT_WHITESPACE:
5227 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5228 break;
5229
5230 case OP_WHITESPACE:
5231 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5232 break;
5233
5234 case OP_NOT_WORDCHAR:
5235 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5236 break;
5237
5238 case OP_WORDCHAR:
5239 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5240 break;
5241
5242 default:
5243 RRETURN(PCRE_ERROR_INTERNAL);
5244 }
5245 }
5246 }
5247 /* Control never gets here */
5248 }
5249
5250 /* If maximizing, it is worth using inline code for speed, doing the type
5251 test once at the start (i.e. keep it out of the loop). Again, keep the
5252 UTF-8 and UCP stuff separate. */
5253
5254 else
5255 {
5256 pp = eptr; /* Remember where we started */
5257
5258 #ifdef SUPPORT_UCP
5259 if (prop_type >= 0)
5260 {
5261 switch(prop_type)
5262 {
5263 case PT_ANY:
5264 for (i = min; i < max; i++)
5265 {
5266 int len = 1;
5267 if (eptr >= md->end_subject)
5268 {
5269 SCHECK_PARTIAL();
5270 break;
5271 }
5272 GETCHARLENTEST(c, eptr, len);
5273 if (prop_fail_result) break;
5274 eptr+= len;
5275 }
5276 break;
5277
5278 case PT_LAMP:
5279 for (i = min; i < max; i++)
5280 {
5281 int chartype;
5282 int len = 1;
5283 if (eptr >= md->end_subject)
5284 {
5285 SCHECK_PARTIAL();
5286 break;
5287 }
5288 GETCHARLENTEST(c, eptr, len);
5289 chartype = UCD_CHARTYPE(c);
5290 if ((chartype == ucp_Lu ||
5291 chartype == ucp_Ll ||
5292 chartype == ucp_Lt) == prop_fail_result)
5293 break;
5294 eptr+= len;
5295 }
5296 break;
5297
5298 case PT_GC:
5299 for (i = min; i < max; i++)
5300 {
5301 int len = 1;
5302 if (eptr >= md->end_subject)
5303 {
5304 SCHECK_PARTIAL();
5305 break;
5306 }
5307 GETCHARLENTEST(c, eptr, len);
5308 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5309 eptr+= len;
5310 }
5311 break;
5312
5313 case PT_PC:
5314 for (i = min; i < max; i++)
5315 {
5316 int len = 1;
5317 if (eptr >= md->end_subject)
5318 {
5319 SCHECK_PARTIAL();
5320 break;
5321 }
5322 GETCHARLENTEST(c, eptr, len);
5323 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5324 eptr+= len;
5325 }
5326 break;
5327
5328 case PT_SC:
5329 for (i = min; i < max; i++)
5330 {
5331 int len = 1;
5332 if (eptr >= md->end_subject)
5333 {
5334 SCHECK_PARTIAL();
5335 break;
5336 }
5337 GETCHARLENTEST(c, eptr, len);
5338 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5339 eptr+= len;
5340 }
5341 break;
5342
5343 case PT_ALNUM:
5344 for (i = min; i < max; i++)
5345 {
5346 int category;
5347 int len = 1;
5348 if (eptr >= md->end_subject)
5349 {
5350 SCHECK_PARTIAL();
5351 break;
5352 }
5353 GETCHARLENTEST(c, eptr, len);
5354 category = UCD_CATEGORY(c);
5355 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5356 break;
5357 eptr+= len;
5358 }
5359 break;
5360
5361 case PT_SPACE: /* Perl space */
5362 for (i = min; i < max; i++)
5363 {
5364 int len = 1;
5365 if (eptr >= md->end_subject)
5366 {
5367 SCHECK_PARTIAL();
5368 break;
5369 }
5370 GETCHARLENTEST(c, eptr, len);
5371 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5372 c == CHAR_FF || c == CHAR_CR)
5373 == prop_fail_result)
5374 break;
5375 eptr+= len;
5376 }
5377 break;
5378
5379 case PT_PXSPACE: /* POSIX space */
5380 for (i = min; i < max; i++)
5381 {
5382 int len = 1;
5383 if (eptr >= md->end_subject)
5384 {
5385 SCHECK_PARTIAL();
5386 break;
5387 }
5388 GETCHARLENTEST(c, eptr, len);
5389 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5390 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5391 == prop_fail_result)
5392 break;
5393 eptr+= len;
5394 }
5395 break;
5396
5397 case PT_WORD:
5398 for (i = min; i < max; i++)
5399 {
5400 int category;
5401 int len = 1;
5402 if (eptr >= md->end_subject)
5403 {
5404 SCHECK_PARTIAL();
5405 break;
5406 }
5407 GETCHARLENTEST(c, eptr, len);
5408 category = UCD_CATEGORY(c);
5409 if ((category == ucp_L || category == ucp_N ||
5410 c == CHAR_UNDERSCORE) == prop_fail_result)
5411 break;
5412 eptr+= len;
5413 }
5414 break;
5415
5416 case PT_CLIST:
/* Maximizing repeat of a caseless-set property item: consume characters,
up to max, for as long as each one passes the membership test.
prop_fail_result selects which outcome of the test ends the run. */
5417 for (i = min; i < max; i++)
5418 {
5419 const pcre_uint32 *cp;
5420 int len = 1;
5421 if (eptr >= md->end_subject)
5422 {
5423 SCHECK_PARTIAL();
5424 break;
5425 }
5426 GETCHARLENTEST(c, eptr, len);
/* Search the list named by the pattern item. prop_value is taken to be the
offset of that list within ucd_caseless_sets, in the same way that it
carries the value to compare against for PT_GC, PT_PC and PT_SC. Choosing
the list from the subject character instead would ignore what the pattern
asked for. */
5427 cp = PRIV(ucd_caseless_sets) + prop_value;
/* The scan gives up at the first entry greater than c, so it relies on the
list being in ascending order and ending with an entry larger than any
character (presumably a terminator - confirm against the table). A failing
character leaves both loops via GOT_MAX without being consumed. */
5428 for (;;)
5429 {
5430 if (c < *cp)
5431 { if (prop_fail_result) break; else goto GOT_MAX; }
5432 if (c == *cp++)
5433 { if (prop_fail_result) goto GOT_MAX; else break; }
5434 }
5435 eptr += len;
5436 }
5437 GOT_MAX:
5438 break;
5439
5440 default:
5441 RRETURN(PCRE_ERROR_INTERNAL);
5442 }
5443
5444 /* eptr is now past the end of the maximum run */
5445
5446 if (possessive) continue;
5447 for(;;)
5448 {
5449 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5451 if (eptr-- == pp) break; /* Stop if tried at original pos */
5452 if (utf) BACKCHAR(eptr);
5453 }
5454 }
5455
5456 /* Match extended Unicode sequences. We will get here only if the
5457 support is in the binary; otherwise a compile-time error occurs. */
5458
5459 else if (ctype == OP_EXTUNI)
5460 {
/* Maximizing repeat of an extended Unicode sequence. Each iteration takes
one character and then keeps absorbing following characters for as long as
the grapheme-break table says there is no break between the previous
character's break property (lgb) and the next one's (rgb). */
5461 for (i = min; i < max; i++)
5462 {
5463 if (eptr >= md->end_subject)
5464 {
5465 SCHECK_PARTIAL();
5466 break;
5467 }
5468 else
5469 {
5470 int lgb, rgb;
5471 GETCHARINCTEST(c, eptr);
5472 lgb = UCD_GRAPHBREAK(c);
5473 while (eptr < md->end_subject)
5474 {
5475 int len = 1;
5476 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5477 rgb = UCD_GRAPHBREAK(c);
5478 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5479 lgb = rgb;
5480 eptr += len;
5481 }
5482 }
5483 CHECK_PARTIAL();
5484 }
5485
5486 /* eptr is now past the end of the maximum run */
5487
5488 if (possessive) continue;
5489
/* Try the rest of the pattern at the current position; on MATCH_NOMATCH
give back one sequence and try again, until the starting position pp has
been tried.

NOTE(review): the forward scan above finds sequence boundaries with the
grapheme-break table, but the backward step below only skips characters
whose category is ucp_M, so the positions retried here need not coincide
with the boundaries found going forward - confirm whether that matters. */
5490 for(;;)
5491 {
5492 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5493 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5494 if (eptr-- == pp) break; /* Stop if tried at original pos */
5495 for (;;) /* Move back over one extended */
5496 {
5497 if (!utf) c = *eptr; else
5498 {
5499 BACKCHAR(eptr);
5500 GETCHAR(c, eptr);
5501 }
/* Never step back beyond pp. Without this bound, a category-M character
sitting at pp would take eptr below the start of the run, after which the
"eptr-- == pp" test above could no longer end the outer loop. */
5502 if (UCD_CATEGORY(c) != ucp_M || eptr <= pp) break;
5503 eptr--;
5504 }
5505 }
5506 }
5507
5508 else
5509 #endif /* SUPPORT_UCP */
5510
5511 #ifdef SUPPORT_UTF
5512 if (utf)
5513 {
5514 switch(ctype)
5515 {
5516 case OP_ANY:
5517 if (max < INT_MAX)
5518 {
5519 for (i = min; i < max; i++)
5520 {
5521 if (eptr >= md->end_subject)
5522 {
5523 SCHECK_PARTIAL();
5524 break;
5525 }
5526 if (IS_NEWLINE(eptr)) break;
5527 if (md->partial != 0 && /* Take care with CRLF partial */
5528 eptr + 1 >= md->end_subject &&
5529 NLBLOCK->nltype == NLTYPE_FIXED &&
5530 NLBLOCK->nllen == 2 &&
5531 *eptr == NLBLOCK->nl[0])
5532 {
5533 md->hitend = TRUE;
5534 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5535 }
5536 eptr++;
5537 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5538 }
5539 }
5540
5541 /* Handle unlimited UTF-8 repeat */
5542
5543 else
5544 {
5545 for (i = min; i < max; i++)
5546 {
5547 if (eptr >= md->end_subject)
5548 {
5549 SCHECK_PARTIAL();
5550 break;
5551 }
5552 if (IS_NEWLINE(eptr)) break;
5553 if (md->partial != 0 && /* Take care with CRLF partial */
5554 eptr + 1 >= md->end_subject &&
5555 NLBLOCK->nltype == NLTYPE_FIXED &&
5556 NLBLOCK->nllen == 2 &&
5557 *eptr == NLBLOCK->nl[0])
5558 {
5559 md->hitend = TRUE;
5560 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5561 }
5562 eptr++;
5563 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5564 }
5565 }
5566 break;
5567
5568 case OP_ALLANY:
5569 if (max < INT_MAX)
5570 {
5571 for (i = min; i < max; i++)
5572 {
5573 if (eptr >= md->end_subject)
5574 {
5575 SCHECK_PARTIAL();
5576 break;
5577 }
5578 eptr++;
5579 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5580 }
5581 }
5582 else
5583 {
5584 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5585 SCHECK_PARTIAL();
5586 }
5587 break;
5588
5589 /* The byte case is the same as non-UTF8 */
5590
5591 case OP_ANYBYTE:
5592 c = max - min;
5593 if (c > (unsigned int)(md->end_subject - eptr))
5594 {
5595 eptr = md->end_subject;
5596 SCHECK_PARTIAL();
5597 }
5598 else eptr += c;
5599 break;
5600
5601 case OP_ANYNL:
5602 for (i = min; i < max; i++)
5603 {
5604 int len = 1;
5605 if (eptr >= md->end_subject)
5606 {
5607 SCHECK_PARTIAL();
5608 break;
5609 }
5610 GETCHARLEN(c, eptr, len);
5611 if (c == CHAR_CR)
5612 {
5613 if (++eptr >= md->end_subject) break;
5614 if (*eptr == CHAR_LF) eptr++;
5615 }
5616 else
5617 {
5618 if (c != CHAR_LF &&
5619 (md->bsr_anycrlf ||
5620 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5621 #ifndef EBCDIC
5622 && c != 0x2028 && c != 0x2029
5623 #endif /* Not EBCDIC */
5624 )))
5625 break;
5626 eptr += len;
5627 }
5628 }
5629 break;
5630
5631 case OP_NOT_HSPACE:
5632 case OP_HSPACE:
5633 for (i = min; i < max; i++)
5634 {
5635 BOOL gotspace;
5636 int len = 1;
5637 if (eptr >= md->end_subject)
5638 {
5639 SCHECK_PARTIAL();
5640 break;
5641 }
5642 GETCHARLEN(c, eptr, len);
5643 switch(c)
5644 {
5645 HSPACE_CASES: gotspace = TRUE; break;
5646 default: gotspace = FALSE; break;
5647 }
5648 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5649 eptr += len;
5650 }
5651 break;
5652
5653 case OP_NOT_VSPACE:
5654 case OP_VSPACE:
5655 for (i = min; i < max; i++)
5656 {
5657 BOOL gotspace;
5658 int len = 1;
5659 if (eptr >= md->end_subject)
5660 {
5661 SCHECK_PARTIAL();
5662 break;
5663 }
5664 GETCHARLEN(c, eptr, len);
5665 switch(c)
5666 {
5667 VSPACE_CASES: gotspace = TRUE; break;
5668 default: gotspace = FALSE; break;
5669 }
5670 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5671 eptr += len;
5672 }
5673 break;
5674
5675 case OP_NOT_DIGIT:
5676 for (i = min; i < max; i++)
5677 {
5678 int len = 1;
5679 if (eptr >= md->end_subject)
5680 {
5681 SCHECK_PARTIAL();
5682 break;
5683 }
5684 GETCHARLEN(c, eptr, len);
5685 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5686 eptr+= len;
5687 }
5688 break;
5689
5690 case OP_DIGIT:
5691 for (i = min; i < max; i++)
5692 {
5693 int len = 1;
5694 if (eptr >= md->end_subject)
5695 {
5696 SCHECK_PARTIAL();
5697 break;
5698 }
5699 GETCHARLEN(c, eptr, len);
5700 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5701 eptr+= len;
5702 }
5703 break;
5704
5705 case OP_NOT_WHITESPACE:
5706 for (i = min; i < max; i++)
5707 {
5708 int len = 1;
5709 if (eptr >= md->end_subject)
5710 {
5711 SCHECK_PARTIAL();
5712 break;
5713 }
5714 GETCHARLEN(c, eptr, len);
5715 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5716 eptr+= len;
5717 }
5718 break;
5719
5720 case OP_WHITESPACE:
5721 for (i = min; i < max; i++)
5722 {
5723 int len = 1;
5724 if (eptr >= md->end_subject)
5725 {
5726 SCHECK_PARTIAL();
5727 break;
5728 }
5729 GETCHARLEN(c, eptr, len);
5730 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5731 eptr+= len;
5732 }
5733 break;
5734
5735 case OP_NOT_WORDCHAR:
5736 for (i = min; i < max; i++)
5737 {
5738 int len = 1;
5739 if (eptr >= md->end_subject)
5740 {
5741 SCHECK_PARTIAL();
5742 break;
5743 }
5744 GETCHARLEN(c, eptr, len);
5745 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5746 eptr+= len;
5747 }
5748 break;
5749
5750 case OP_WORDCHAR:
5751 for (i = min; i < max; i++)
5752 {
5753 int len = 1;
5754 if (eptr >= md->end_subject)
5755 {
5756 SCHECK_PARTIAL();
5757 break;
5758 }
5759 GETCHARLEN(c, eptr, len);
5760 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5761 eptr+= len;
5762 }
5763 break;
5764
5765 default:
5766 RRETURN(PCRE_ERROR_INTERNAL);
5767 }
5768
5769 /* eptr is now past the end of the maximum run. If possessive, we are
5770 done (no backing up). Otherwise, match at this position; anything other
5771 than no match is immediately returned. For nomatch, back up one
5772 character, unless we are matching \R and the last thing matched was
5773 \r\n, in which case, back up two bytes. */
5774
5775 if (possessive) continue;
5776 for(;;)
5777 {
5778 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5779 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5780 if (eptr-- == pp) break; /* Stop if tried at original pos */
5781 BACKCHAR(eptr);
5782 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_NL &&
5783 eptr[-1] == CHAR_CR) eptr--;
5784 }
5785 }
5786 else
5787 #endif /* SUPPORT_UTF */
5788 /* Not UTF mode */
5789 {
5790 switch(ctype)
5791 {
5792 case OP_ANY:
5793 for (i = min; i < max; i++)
5794 {
5795 if (eptr >= md->end_subject)
5796 {
5797 SCHECK_PARTIAL();
5798 break;
5799 }
5800 if (IS_NEWLINE(eptr)) break;
5801 if (md->partial != 0 && /* Take care with CRLF partial */
5802 eptr + 1 >= md->end_subject &&
5803 NLBLOCK->nltype == NLTYPE_FIXED &&
5804 NLBLOCK->nllen == 2 &&
5805 *eptr == NLBLOCK->nl[0])
5806 {
5807 md->hitend = TRUE;
5808 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5809 }
5810 eptr++;
5811 }
5812 break;
5813
5814 case OP_ALLANY:
5815 case OP_ANYBYTE:
5816 c = max - min;
5817 if (c > (unsigned int)(md->end_subject - eptr))
5818 {
5819 eptr = md->end_subject;
5820 SCHECK_PARTIAL();
5821 }
5822 else eptr += c;
5823 break;
5824
5825 case OP_ANYNL:
5826 for (i = min; i < max; i++)
5827 {
5828 if (eptr >= md->end_subject)
5829 {
5830 SCHECK_PARTIAL();
5831 break;
5832 }
5833 c = *eptr;
5834 if (c == CHAR_CR)
5835 {
5836 if (++eptr >= md->end_subject) break;
5837 if (*eptr == CHAR_LF) eptr++;
5838 }
5839 else
5840 {
5841 if (c != CHAR_LF && (md->bsr_anycrlf ||
5842 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5843 #ifdef COMPILE_PCRE16
5844 && c != 0x2028 && c != 0x2029
5845 #endif
5846 ))) break;
5847 eptr++;
5848 }
5849 }
5850 break;
5851
5852 case OP_NOT_HSPACE:
5853 for (i = min; i < max; i++)
5854 {
5855 if (eptr >= md->end_subject)
5856 {
5857 SCHECK_PARTIAL();
5858 break;
5859 }
5860 switch(*eptr)
5861 {
5862 default: eptr++; break;
5863 HSPACE_BYTE_CASES:
5864 #ifdef COMPILE_PCRE16
5865 HSPACE_MULTIBYTE_CASES:
5866 #endif
5867 goto ENDLOOP00;
5868 }
5869 }
5870 ENDLOOP00:
5871 break;
5872
5873 case OP_HSPACE:
5874 for (i = min; i < max; i++)
5875 {
5876 if (eptr >= md->end_subject)
5877 {
5878 SCHECK_PARTIAL();
5879 break;
5880 }
5881 switch(*eptr)
5882 {
5883 default: goto ENDLOOP01;
5884 HSPACE_BYTE_CASES:
5885 #ifdef COMPILE_PCRE16
5886 HSPACE_MULTIBYTE_CASES:
5887 #endif
5888 eptr++; break;
5889 }
5890 }
5891 ENDLOOP01:
5892 break;
5893
5894 case OP_NOT_VSPACE:
5895 for (i = min; i < max; i++)
5896 {
5897 if (eptr >= md->end_subject)
5898 {
5899 SCHECK_PARTIAL();
5900 break;
5901 }
5902 switch(*eptr)
5903 {
5904 default: eptr++; break;
5905 VSPACE_BYTE_CASES:
5906 #ifdef COMPILE_PCRE16
5907 VSPACE_MULTIBYTE_CASES:
5908 #endif
5909 goto ENDLOOP02;
5910 }
5911 }
5912 ENDLOOP02:
5913 break;
5914
5915 case OP_VSPACE:
5916 for (i = min; i < max; i++)
5917 {
5918 if (eptr >= md->end_subject)
5919 {
5920 SCHECK_PARTIAL();
5921 break;
5922 }
5923 switch(*eptr)
5924 {
5925 default: goto ENDLOOP03;
5926 VSPACE_BYTE_CASES:
5927 #ifdef COMPILE_PCRE16
5928 VSPACE_MULTIBYTE_CASES:
5929 #endif
5930 eptr++; break;
5931 }
5932 }
5933 ENDLOOP03:
5934 break;
5935
5936 case OP_NOT_DIGIT:
5937 for (i = min; i < max; i++)
5938 {
5939 if (eptr >= md->end_subject)
5940 {
5941 SCHECK_PARTIAL();
5942 break;
5943 }
5944 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5945 eptr++;
5946 }
5947 break;
5948
5949 case OP_DIGIT:
5950 for (i = min; i < max; i++)
5951 {
5952 if (eptr >= md->end_subject)
5953 {
5954 SCHECK_PARTIAL();
5955 break;
5956 }
5957 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5958 eptr++;
5959 }
5960 break;
5961
5962 case OP_NOT_WHITESPACE:
5963 for (i = min; i < max; i++)
5964 {
5965 if (eptr >= md->end_subject)
5966 {
5967 SCHECK_PARTIAL();
5968 break;
5969 }
5970 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5971 eptr++;
5972 }
5973 break;
5974
5975 case OP_WHITESPACE:
5976 for (i = min; i < max; i++)
5977 {
5978 if (eptr >= md->end_subject)
5979 {
5980 SCHECK_PARTIAL();
5981 break;
5982 }
5983 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5984 eptr++;
5985 }
5986 break;
5987
5988 case OP_NOT_WORDCHAR:
5989 for (i = min; i < max; i++)
5990 {
5991 if (eptr >= md->end_subject)
5992 {
5993 SCHECK_PARTIAL();
5994 break;
5995 }
5996 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5997 eptr++;
5998 }
5999 break;
6000
6001 case OP_WORDCHAR:
6002 for (i = min; i < max; i++)
6003 {
6004 if (eptr >= md->end_subject)
6005 {
6006 SCHECK_PARTIAL();
6007 break;
6008 }
6009 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6010 eptr++;
6011 }
6012 break;
6013
6014 default:
6015 RRETURN(PCRE_ERROR_INTERNAL);
6016 }
6017
6018 /* eptr is now past the end of the maximum run. If possessive, we are
6019 done (no backing up). Otherwise, match at this position; anything other
6020 than no match is immediately returned. For nomatch, back up one
6021 character (byte), unless we are matching \R and the last thing matched
6022 was \r\n, in which case, back up two bytes. */
6023
6024 if (possessive) continue;
6025 while (eptr >= pp)
6026 {
6027 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6029 eptr--;
6030 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6031 eptr[-1] == CHAR_CR) eptr--;
6032 }
6033 }
6034
6035 /* Get here if we can't make it match with any permitted repetitions */
6036
6037 RRETURN(MATCH_NOMATCH);
6038 }
6039 /* Control never gets here */
6040
6041 /* There's been some horrible disaster. Arrival here can only mean there is
6042 something seriously wrong in the code above or the OP_xxx definitions. */
6043
6044 default:
6045 DPRINTF(("Unknown opcode %d\n", *ecode));
6046 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6047 }
6048
6049 /* Do not stick any code in here without much thought; it is assumed
6050 that "continue" in the code above comes out to here to repeat the main
6051 loop. */
6052
6053 } /* End of main loop */
6054 /* Control never reaches here */
6055
6056
6057 /* When compiling to use the heap rather than the stack for recursive calls to
6058 match(), the RRETURN() macro jumps here. The number that is saved in
6059 frame->Xwhere indicates which label we actually want to return to. */
6060
6061 #ifdef NO_RECURSE
6062 #define LBL(val) case val: goto L_RM##val;
6063 HEAP_RETURN:
6064 switch (frame->Xwhere)
6065 {
6066 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6067 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6068 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6069 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6070 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6071 LBL(65) LBL(66)
6072 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6073 LBL(21)
6074 #endif
6075 #ifdef SUPPORT_UTF
6076 LBL(16) LBL(18) LBL(20)
6077 LBL(22) LBL(23) LBL(28) LBL(30)
6078 LBL(32) LBL(34) LBL(42) LBL(46)
6079 #ifdef SUPPORT_UCP
6080 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6081 LBL(59) LBL(60) LBL(61) LBL(62)
6082 #endif /* SUPPORT_UCP */
6083 #endif /* SUPPORT_UTF */
6084 default:
6085 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6086
6087 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6088
6089 return PCRE_ERROR_INTERNAL;
6090 }
6091 #undef LBL
6092 #endif /* NO_RECURSE */
6093 }
6094
6095
6096 /***************************************************************************
6097 ****************************************************************************
6098 RECURSION IN THE match() FUNCTION
6099
6100 Undefine all the macros that were defined above to handle this. */
6101
6102 #ifdef NO_RECURSE
6103 #undef eptr
6104 #undef ecode
6105 #undef mstart
6106 #undef offset_top
6107 #undef eptrb
6108 #undef flags
6109
6110 #undef callpat
6111 #undef charptr
6112 #undef data
6113 #undef next
6114 #undef pp
6115 #undef prev
6116 #undef saved_eptr
6117
6118 #undef new_recursive
6119
6120 #undef cur_is_word
6121 #undef condition
6122 #undef prev_is_word
6123
6124 #undef ctype
6125 #undef length
6126 #undef max
6127 #undef min
6128 #undef number
6129 #undef offset
6130 #undef op
6131 #undef save_capture_last
6132 #undef save_offset1
6133 #undef save_offset2
6134 #undef save_offset3
6135 #undef stacksave
6136
6137 #undef newptrb
6138
6139 #endif
6140
6141 /* These two are defined as macros in both cases */
6142
6143 #undef fc
6144 #undef fi
6145
6146 /***************************************************************************
6147 ***************************************************************************/
6148
6149
6150 #ifdef NO_RECURSE
6151 /*************************************************
6152 * Release allocated heap frames *
6153 *************************************************/
6154
6155 /* This function releases all the allocated frames. The base frame is on the
6156 machine stack, and so must not be freed.
6157
6158 Argument: the address of the base frame
6159 Returns: nothing
6160 */
6161
6162 static void
6163 release_match_heapframes (heapframe *frame_base)
6164 {
6165 heapframe *nextframe = frame_base->Xnextframe;
6166 while (nextframe != NULL)
6167 {
6168 heapframe *oldframe = nextframe;
6169 nextframe = nextframe->Xnextframe;
6170 (PUBL(stack_free))(oldframe);
6171 }
6172 }
6173 #endif
6174
6175
6176 /*************************************************
6177 * Execute a Regular Expression *
6178 *************************************************/
6179
6180 /* This function applies a compiled re to a subject string and picks out
6181 portions of the string if it matches. Two elements in the vector are set for
6182 each substring: the offsets to the start and end of the substring.
6183
6184 Arguments:
6185 argument_re points to the compiled expression
6186 extra_data points to extra data or is NULL
6187 subject points to the subject string
6188 length length of subject string (may contain binary zeros)
6189 start_offset where to start in the subject string
6190 options option bits
6191 offsets points to a vector of ints to be filled in with offsets
6192 offsetcount the number of elements in the vector
6193
6194 Returns: > 0 => success; value is the number of elements filled in
6195 = 0 => success, but offsets is not big enough
6196 -1 => failed to match
6197 < -1 => some kind of unexpected problem
6198 */
6199
6200 #ifdef COMPILE_PCRE8
6201 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6202 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6203 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6204 int offsetcount)
6205 #else
6206 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6207 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6208 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6209 int offsetcount)
6210 #endif
6211 {
6212 int rc, ocount, arg_offset_max;
6213 int newline;
6214 BOOL using_temporary_offsets = FALSE;
6215 BOOL anchored;
6216 BOOL startline;
6217 BOOL firstline;
6218 BOOL utf;
6219 BOOL has_first_char = FALSE;
6220 BOOL has_req_char = FALSE;
6221 pcre_uchar first_char = 0;
6222 pcre_uchar first_char2 = 0;
6223 pcre_uchar req_char = 0;
6224 pcre_uchar req_char2 = 0;
6225 match_data match_block;
6226 match_data *md = &match_block;
6227 const pcre_uint8 *tables;
6228 const pcre_uint8 *start_bits = NULL;
6229 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6230 PCRE_PUCHAR end_subject;
6231 PCRE_PUCHAR start_partial = NULL;
6232 PCRE_PUCHAR req_char_ptr = start_match - 1;
6233
6234 const pcre_study_data *study;
6235 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6236
6237 #ifdef NO_RECURSE
6238 heapframe frame_zero;
6239 frame_zero.Xprevframe = NULL; /* Marks the top level */
6240 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6241 md->match_frames_base = &frame_zero;
6242 #endif
6243
6244 /* Check for the special magic call that measures the size of the stack used
6245 per recursive call of match(). Without the funny casting for sizeof, a Windows
6246 compiler gave this error: "unary minus operator applied to unsigned type,
6247 result still unsigned". Hopefully the cast fixes that. */
6248
6249 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6250 start_offset == -999)
6251 #ifdef NO_RECURSE
6252 return -((int)sizeof(heapframe));
6253 #else
6254 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6255 #endif
6256
6257 /* Plausibility checks */
6258
6259 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6260 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6261 return PCRE_ERROR_NULL;
6262 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6263 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6264
6265 /* Check that the first field in the block is the magic number. If it is not,
6266 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6267 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6268 means that the pattern is likely compiled with different endianness. */
6269
6270 if (re->magic_number != MAGIC_NUMBER)
6271 return re->magic_number == REVERSED_MAGIC_NUMBER?
6272 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6273 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6274
6275 /* These two settings are used in the code for checking a UTF-8 string that
6276 follows immediately afterwards. Other values in the md block are used only
6277 during "normal" pcre_exec() processing, not when the JIT support is in use,
6278 so they are set up later. */
6279
6280 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6281 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6282 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6283 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6284
6285 /* Check a UTF-8 string if required. Pass back the character offset and error
6286 code for an invalid string if a results vector is available. */
6287
6288 #ifdef SUPPORT_UTF
6289 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6290 {
6291 int erroroffset;
6292 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6293 if (errorcode != 0)
6294 {
6295 if (offsetcount >= 2)
6296 {
6297 offsets[0] = erroroffset;
6298 offsets[1] = errorcode;
6299 }
6300 #ifdef COMPILE_PCRE16
6301 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6302 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6303 #else
6304 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6305 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6306 #endif
6307 }
6308
6309 /* Check that a start_offset points to the start of a UTF character. */
6310 if (start_offset > 0 && start_offset < length &&
6311 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6312 return PCRE_ERROR_BADUTF8_OFFSET;
6313 }
6314 #endif
6315
6316 /* If the pattern was successfully studied with JIT support, run the JIT
6317 executable instead of the rest of this function. Most options must be set at
6318 compile time for the JIT code to be usable. Fall back to the normal code path if
6319 an unsupported flag is set. */
6320
6321 #ifdef SUPPORT_JIT
6322 if (extra_data != NULL
6323 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6324 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6325 && extra_data->executable_jit != NULL
6326 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6327 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6328 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6329 {
6330 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6331 start_offset, options, offsets, offsetcount);
6332
6333 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6334 mode is not compiled. In this case we simply fall back to the interpreter. */
6335
6336 if (rc != PCRE_ERROR_NULL) return rc;
6337 }
6338 #endif
6339
6340 /* Carry on with non-JIT matching. This information is for finding all the
6341 numbers associated with a given name, for condition testing. */
6342
6343 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6344 md->name_count = re->name_count;
6345 md->name_entry_size = re->name_entry_size;
6346
6347 /* Fish out the optional data from the extra_data structure, first setting
6348 the default values. */
6349
6350 study = NULL;
6351 md->match_limit = MATCH_LIMIT;
6352 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6353 md->callout_data = NULL;
6354
6355 /* The table pointer is always in native byte order. */
6356
6357 tables = re->tables;
6358
6359 if (extra_data != NULL)
6360 {
6361 register unsigned int flags = extra_data->flags;
6362 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6363 study = (const pcre_study_data *)extra_data->study_data;
6364 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6365 md->match_limit = extra_data->match_limit;
6366 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6367 md->match_limit_recursion = extra_data->match_limit_recursion;
6368 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6369 md->callout_data = extra_data->callout_data;
6370 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6371 }
6372
6373 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6374 is a feature that makes it possible to save compiled regex and re-use them
6375 in other programs later. */
6376
6377 if (tables == NULL) tables = PRIV(default_tables);
6378
6379 /* Set up other data */
6380
6381 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6382 startline = (re->flags & PCRE_STARTLINE) != 0;
6383 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6384
6385 /* The code starts after the real_pcre block and the capture name table. */
6386
6387 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6388 re->name_count * re->name_entry_size;
6389
6390 md->start_subject = (PCRE_PUCHAR)subject;
6391 md->start_offset = start_offset;
6392 md->end_subject = md->start_subject + length;
6393 end_subject = md->end_subject;
6394
6395 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6396 md->use_ucp = (re->options & PCRE_UCP) != 0;
6397 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6398 md->ignore_skip_arg = FALSE;
6399
6400 /* Some options are unpacked into BOOL variables in the hope that testing
6401 them will be faster than individual option bits. */
6402
6403 md->notbol = (options & PCRE_NOTBOL) != 0;
6404 md->noteol = (options & PCRE_NOTEOL) != 0;
6405 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6406 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6407
6408 md->hitend = FALSE;
6409 md->mark = md->nomatch_mark = NULL; /* In case never set */
6410
6411 md->recursive = NULL; /* No recursion at top level */
6412 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6413
6414 md->lcc = tables + lcc_offset;
6415 md->fcc = tables + fcc_offset;
6416 md->ctypes = tables + ctypes_offset;
6417
6418 /* Handle different \R options. */
6419
6420 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6421 {
6422 case 0:
6423 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6424 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6425 else
6426 #ifdef BSR_ANYCRLF
6427 md->bsr_anycrlf = TRUE;
6428 #else
6429 md->bsr_anycrlf = FALSE;
6430 #endif
6431 break;
6432
6433 case PCRE_BSR_ANYCRLF:
6434 md->bsr_anycrlf = TRUE;
6435 break;
6436
6437 case PCRE_BSR_UNICODE:
6438 md->bsr_anycrlf = FALSE;
6439 break;
6440
6441 default: return PCRE_ERROR_BADNEWLINE;
6442 }
6443
6444 /* Handle different types of newline. The three bits give eight cases. If
6445 nothing is set at run time, whatever was used at compile time applies. */
6446
6447 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6448 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6449 {
6450 case 0: newline = NEWLINE; break; /* Compile-time default */
6451 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6452 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6453 case PCRE_NEWLINE_CR+
6454 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6455 case PCRE_NEWLINE_ANY: newline = -1; break;
6456 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6457 default: return PCRE_ERROR_BADNEWLINE;
6458 }
6459
6460 if (newline == -2)
6461 {
6462 md->nltype = NLTYPE_ANYCRLF;
6463 }
6464 else if (newline < 0)
6465 {
6466 md->nltype = NLTYPE_ANY;
6467 }
6468 else
6469 {
6470 md->nltype = NLTYPE_FIXED;
6471 if (newline > 255)
6472 {
6473 md->nllen = 2;
6474 md->nl[0] = (newline >> 8) & 255;
6475 md->nl[1] = newline & 255;
6476 }
6477 else
6478 {
6479 md->nllen = 1;
6480 md->nl[0] = newline;
6481 }
6482 }
6483
6484 /* Partial matching was originally supported only for a restricted set of
6485 regexes; from release 8.00 there are no restrictions, but the bits are still
6486 defined (though never set). So there's no harm in leaving this code. */
6487
6488 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6489 return PCRE_ERROR_BADPARTIAL;
6490
6491 /* If the expression has got more back references than the offsets supplied can
6492 hold, we get a temporary chunk of working store to use during the matching.
6493 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6494 of 3. */
6495
6496 ocount = offsetcount - (offsetcount % 3);
6497 arg_offset_max = (2*ocount)/3;
6498
6499 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6500 {
6501 ocount = re->top_backref * 3 + 3;
6502 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6503 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6504 using_temporary_offsets = TRUE;
6505 DPRINTF(("Got memory to hold back references\n"));
6506 }
6507 else md->offset_vector = offsets;
6508
6509 md->offset_end = ocount;
6510 md->offset_max = (2*ocount)/3;
6511 md->offset_overflow = FALSE;
6512 md->capture_last = -1;
6513
6514 /* Reset the working variable associated with each extraction. These should
6515 never be used unless previously set, but they get saved and restored, and so we
6516 initialize them to avoid reading uninitialized locations. Also, unset the
6517 offsets for the matched string. This is really just for tidiness with callouts,
6518 in case they inspect these fields. */
6519
6520 if (md->offset_vector != NULL)
6521 {
6522 register int *iptr = md->offset_vector + ocount;
6523 register int *iend = iptr - re->top_bracket;
6524 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6525 while (--iptr >= iend) *iptr = -1;
6526 md->offset_vector[0] = md->offset_vector[1] = -1;
6527 }
6528
6529 /* Set up the first character to match, if available. The first_char value is
6530 never set for an anchored regular expression, but the anchoring may be forced
6531 at run time, so we have to test for anchoring. The first char may be unset for
6532 an unanchored pattern, of course. If there's no first char and the pattern was
6533 studied, there may be a bitmap of possible first characters. */
6534
6535 if (!anchored)
6536 {
6537 if ((re->flags & PCRE_FIRSTSET) != 0)
6538 {
6539 has_first_char = TRUE;
6540 first_char = first_char2 = (pcre_uchar)(re->first_char);
6541 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6542 {
6543 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6544 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6545 if (utf && first_char > 127)
6546 first_char2 = UCD_OTHERCASE(first_char);
6547 #endif
6548 }
6549 }
6550 else
6551 if (!startline && study != NULL &&
6552 (study->flags & PCRE_STUDY_MAPPED) != 0)
6553 start_bits = study->start_bits;
6554 }
6555
6556 /* For anchored or unanchored matches, there may be a "last known required
6557 character" set. */
6558
6559 if ((re->flags & PCRE_REQCHSET) != 0)
6560 {
6561 has_req_char = TRUE;
6562 req_char = req_char2 = (pcre_uchar)(re->req_char);
6563 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6564 {
6565 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6566 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6567 if (utf && req_char > 127)
6568 req_char2 = UCD_OTHERCASE(req_char);
6569 #endif
6570 }
6571 }
6572
6573
6574 /* ==========================================================================*/
6575
6576 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6577 the loop runs just once. */
6578
6579 for(;;)
6580 {
6581 PCRE_PUCHAR save_end_subject = end_subject;
6582 PCRE_PUCHAR new_start_match;
6583
6584 /* If firstline is TRUE, the start of the match is constrained to the first
6585 line of a multiline string. That is, the match must be before or at the first
6586 newline. Implement this by temporarily adjusting end_subject so that we stop
6587 scanning at a newline. If the match fails at the newline, later code breaks
6588 this loop. */
6589
6590 if (firstline)
6591 {
6592 PCRE_PUCHAR t = start_match;
6593 #ifdef SUPPORT_UTF
6594 if (utf)
6595 {
6596 while (t < md->end_subject && !IS_NEWLINE(t))
6597 {
6598 t++;
6599 ACROSSCHAR(t < end_subject, *t, t++);
6600 }
6601 }
6602 else
6603 #endif
6604 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6605 end_subject = t;
6606 }
6607
6608 /* There are some optimizations that avoid running the match if a known
6609 starting point is not found, or if a known later character is not present.
6610 However, there is an option that disables these, for testing and for ensuring
6611 that all callouts do actually occur. The option can be set in the regex by
6612 (*NO_START_OPT) or passed in match-time options. */
6613
6614 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6615 {
6616 /* Advance to a unique first char if there is one. */
6617
6618 if (has_first_char)
6619 {
6620 if (first_char != first_char2)
6621 while (start_match < end_subject &&
6622 *start_match != first_char && *start_match != first_char2)
6623 start_match++;
6624 else
6625 while (start_match < end_subject && *start_match != first_char)
6626 start_match++;
6627 }
6628
6629 /* Or to just after a linebreak for a multiline match */
6630
6631 else if (startline)
6632 {
6633 if (start_match > md->start_subject + start_offset)
6634 {
6635 #ifdef SUPPORT_UTF
6636 if (utf)
6637 {
6638 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6639 {
6640 start_match++;
6641 ACROSSCHAR(start_match < end_subject, *start_match,
6642 start_match++);
6643 }
6644 }
6645 else
6646 #endif
6647 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6648 start_match++;
6649
6650 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6651 and we are now at a LF, advance the match position by one more character.
6652 */
6653
6654 if (start_match[-1] == CHAR_CR &&
6655 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6656 start_match < end_subject &&
6657 *start_match == CHAR_NL)
6658 start_match++;
6659 }
6660 }
6661
6662 /* Or to a non-unique first byte after study */
6663
6664 else if (start_bits != NULL)
6665 {
6666 while (start_match < end_subject)
6667 {
6668 register unsigned int c = *start_match;
6669 #ifndef COMPILE_PCRE8
6670 if (c > 255) c = 255;
6671 #endif
6672 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6673 {
6674 start_match++;
6675 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6676 /* In non 8-bit mode, the iteration will stop for
6677 characters > 255 at the beginning or not stop at all. */
6678 if (utf)
6679 ACROSSCHAR(start_match < end_subject, *start_match,
6680 start_match++);
6681 #endif
6682 }
6683 else break;
6684 }
6685 }
6686 } /* Starting optimizations */
6687
6688 /* Restore fudged end_subject */
6689
6690 end_subject = save_end_subject;
6691
6692 /* The following two optimizations are disabled for partial matching or if
6693 disabling is explicitly requested. */
6694
6695 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6696 {
6697 /* If the pattern was studied, a minimum subject length may be set. This is
6698 a lower bound; no actual string of that length may actually match the
6699 pattern. Although the value is, strictly, in characters, we treat it as
6700 bytes to avoid spending too much time in this optimization. */
6701
6702 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6703 (pcre_uint32)(end_subject - start_match) < study->minlength)
6704 {
6705 rc = MATCH_NOMATCH;
6706 break;
6707 }
6708
6709 /* If req_char is set, we know that that character must appear in the
6710 subject for the match to succeed. If the first character is set, req_char
6711 must be later in the subject; otherwise the test starts at the match point.
6712 This optimization can save a huge amount of backtracking in patterns with
6713 nested unlimited repeats that aren't going to match. Writing separate code
6714 for cased/caseless versions makes it go faster, as does using an
6715 autoincrement and backing off on a match.
6716
6717 HOWEVER: when the subject string is very, very long, searching to its end
6718 can take a long time, and give bad performance on quite ordinary patterns.
6719 This showed up when somebody was matching something like /^\d+C/ on a
6720 32-megabyte string... so we don't do this when the string is sufficiently
6721 long. */
6722
6723 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6724 {
6725 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6726
6727 /* We don't need to repeat the search if we haven't yet reached the
6728 place we found it at last time. */
6729
6730 if (p > req_char_ptr)
6731 {
6732 if (req_char != req_char2)
6733 {
6734 while (p < end_subject)
6735 {
6736 register int pp = *p++;
6737 if (pp == req_char || pp == req_char2) { p--; break; }
6738 }
6739 }
6740 else
6741 {
6742 while (p < end_subject)
6743 {
6744 if (*p++ == req_char) { p--; break; }
6745 }
6746 }
6747
6748 /* If we can't find the required character, break the matching loop,
6749 forcing a match failure. */
6750
6751 if (p >= end_subject)
6752 {
6753 rc = MATCH_NOMATCH;
6754 break;
6755 }
6756
6757 /* If we have found the required character, save the point where we
6758 found it, so that we don't search again next time round the loop if
6759 the start hasn't passed this character yet. */
6760
6761 req_char_ptr = p;
6762 }
6763 }
6764 }
6765
6766 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6767 printf(">>>> Match against: ");
6768 pchars(start_match, end_subject - start_match, TRUE, md);
6769 printf("\n"