/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1055 - (show annotations)
Tue Oct 16 15:53:30 2012 UTC (6 years, 11 months ago) by chpe
File MIME type: text/plain
File size: 211166 byte(s)
Error occurred while calculating annotation data.
pcre32: Add 32-bit library

Create libpcre32 that operates on 32-bit characters (UTF-32).

This turned out to be surprisingly simple after the UTF-16 support
was introduced; mostly just extra ifdefs and adjusting and adding
some tests.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
62
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
65
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
68
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
71
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
74
75 #define MATCH_ACCEPT (-999)
76 #define MATCH_COMMIT (-998)
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996)
79 #define MATCH_PRUNE (-995)
80 #define MATCH_SKIP (-994)
81 #define MATCH_SKIP_ARG (-993)
82 #define MATCH_THEN (-992)
83
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
87
88 #define REC_STACK_SAVE_MAX 30
89
90 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
91
/* The two tables are parallel: entry n of rep_min and entry n of rep_max
describe the same repeat. NOTE(review): the six entries look like the
(greedy, minimizing) pairs for "*", "+" and "?" in that order - confirm against
the code that indexes these tables. */
92 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
93 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
94
95
96
97 #ifdef PCRE_DEBUG
98 /*************************************************
99 * Debugging function to print chars *
100 *************************************************/
101
102 /* Print a sequence of chars in printable format, stopping at the end of the
103 subject if the requested.
104
105 Arguments:
106 p points to characters
107 length number to print
108 is_subject TRUE if printing from within md->start_subject
109 md pointer to matching data block, if is_subject is TRUE
110
111 Returns: nothing
112 */
113
114 static void
115 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
116 {
117 unsigned int c;
118 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
119 while (length-- > 0)
120 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
121 }
122 #endif
123
124
125
126 /*************************************************
127 * Match a back-reference *
128 *************************************************/
129
130 /* Normally, if a back reference hasn't been set, the length that is passed is
131 negative, so the match always fails. However, in JavaScript compatibility mode,
132 the length passed is zero. Note that in caseless UTF-8 mode, the number of
133 subject bytes matched may be different to the number of reference bytes.
134
135 Arguments:
136 offset index into the offset vector
137 eptr pointer into the subject
138 length length of reference to be matched (number of bytes)
139 md points to match data block
140 caseless TRUE if caseless
141
142 Returns: >= 0 the number of subject bytes matched
143 -1 no match
144 -2 partial match; always given if at end subject
145 */
146
147 static int
148 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
149 BOOL caseless)
150 {
151 PCRE_PUCHAR eptr_start = eptr;
152 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if reference not set (and not JavaScript compatible - in that
168 case the length is passed as zero). */
169
170 if (length < 0) return -1;
171
172 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
173 properly if Unicode properties are supported. Otherwise, we can check only
174 ASCII characters. */
175
176 if (caseless)
177 {
178 #ifdef SUPPORT_UTF
179 #ifdef SUPPORT_UCP
180 if (md->utf)
181 {
182 /* Match characters up to the end of the reference. NOTE: the number of
183 data units matched may differ, because in UTF-8 there are some characters
184 whose upper and lower case versions code have different numbers of bytes.
185 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
186 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
187 sequence of two of the latter. It is important, therefore, to check the
188 length along the reference, not along the subject (earlier code did this
189 wrong). */
190
191 PCRE_PUCHAR endptr = p + length;
192 while (p < endptr)
193 {
194 unsigned int c, d;
195 const ucd_record *ur;
196 if (eptr >= md->end_subject) return -2; /* Partial match */
197 GETCHARINC(c, eptr);
198 GETCHARINC(d, p);
199 ur = GET_UCD(d);
200 if (c != d && c != d + ur->other_case)
201 {
202 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
203 for (;;)
204 {
205 if (c < *pp) return -1;
206 if (c == *pp++) break;
207 }
208 }
209 }
210 }
211 else
212 #endif
213 #endif
214
215 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
216 is no UCP support. */
217 {
218 while (length-- > 0)
219 {
220 if (eptr >= md->end_subject) return -2; /* Partial match */
221 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
222 p++;
223 eptr++;
224 }
225 }
226 }
227
228 /* In the caseful case, we can just compare the bytes, whether or not we
229 are in UTF-8 mode. */
230
231 else
232 {
233 while (length-- > 0)
234 {
235 if (eptr >= md->end_subject) return -2; /* Partial match */
236 if (*p++ != *eptr++) return -1;
237 }
238 }
239
240 return (int)(eptr - eptr_start);
241 }
242
243
244
245 /***************************************************************************
246 ****************************************************************************
247 RECURSION IN THE match() FUNCTION
248
249 The match() function is highly recursive, though not every recursive call
250 increases the recursive depth. Nevertheless, some regular expressions can cause
251 it to recurse to a great depth. I was writing for Unix, so I just let it call
252 itself recursively. This uses the stack for saving everything that has to be
253 saved for a recursive call. On Unix, the stack can be large, and this works
254 fine.
255
256 It turns out that on some non-Unix-like systems there are problems with
257 programs that use a lot of stack. (This despite the fact that every last chip
258 has oodles of memory these days, and techniques for extending the stack have
259 been known for decades.) So....
260
261 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
262 calls by keeping local variables that need to be preserved in blocks of memory
263 obtained from malloc() instead of on the stack. Macros are used to
264 achieve this so that the actual code doesn't look very different to what it
265 always used to.
266
267 The original heap-recursive code used longjmp(). However, it seems that this
268 can be very slow on some operating systems. Following a suggestion from Stan
269 Switzer, the use of longjmp() has been abolished, at the cost of having to
270 provide a unique number for each call to RMATCH. There is no way of generating
271 a sequence of numbers at compile time in C. I have given them names, to make
272 them stand out more clearly.
273
274 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
275 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
276 tests. Furthermore, not using longjmp() means that local dynamic variables
277 don't have indeterminate values; this has meant that the frame size can be
278 reduced because the result can be "passed back" by straight setting of the
279 variable instead of being passed in the frame.
280 ****************************************************************************
281 ***************************************************************************/
282
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. Whenever an entry is added or removed, the code at HEAP_RETURN must be
changed to match. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54,
  RM55,    RM56, RM57, RM58, RM59, RM60,
  RM61,    RM62, RM63, RM64, RM65, RM66
};
293
294 /* These versions of the macros use the stack, as normal. There are debugging
295 versions and production versions. Note that the "rw" argument of RMATCH isn't
296 actually used in this definition. */
297
298 #ifndef NO_RECURSE
299 #define REGISTER register
300
301 #ifdef PCRE_DEBUG
/* Debugging form of RMATCH: a genuine recursive call to match(), bracketed by
trace output giving the source line of the call. ra, rb, rc, rd and re become
match()'s eptr, ecode, offset_top, md and eptrb arguments; mstart and rdepth
are taken from the invoking scope (rdepth is passed on incremented by one). The
result is left in rrc. rw is not used. */
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 { \
304 printf("match() called in line %d\n", __LINE__); \
305 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
306 printf("to line %d\n", __LINE__); \
307 }
/* Debugging form of RRETURN: trace the value and source line, then return ra
from match(). Note that ra is expanded twice. */
308 #define RRETURN(ra) \
309 { \
310 printf("match() returned %d from line %d ", ra, __LINE__); \
311 return ra; \
312 }
313 #else
/* Production form of RMATCH: the same recursive call without the tracing. It
expands to a single assignment expression with no trailing semicolon. */
314 #define RMATCH(ra,rb,rc,rd,re,rw) \
315 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
/* Production form of RRETURN: a plain return statement, again without the
trailing semicolon. */
316 #define RRETURN(ra) return ra
317 #endif
318
319 #else
320
321
322 /* These versions of the macros manage a private stack on the heap. Note that
323 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
324 argument of match(), which never changes. */
325
326 #define REGISTER
327
/* Heap form of RMATCH: simulate a recursive call to match() without using the
system stack.

The frame that follows the current one in the chain is reused if it already
exists; otherwise one is obtained from PUBL(stack_malloc) and linked in, and if
that allocation fails the current "call" finishes with PCRE_ERROR_NOMEMORY.
The call-site number rw is remembered in the current frame's Xwhere, the new
frame receives the argument values (ra -> eptr, rb -> ecode, rc -> offset_top,
re -> eptrb, together with the current mstart and a depth one greater), and
control jumps to HEAP_RECURSE with the new frame current. rd is not stored.

The label L_<rw> generated at the end marks where execution continues once the
simulated call has returned. NOTE(review): the jump to that label is presumably
made by the HEAP_RETURN code from the saved Xwhere value - that code is not
part of this block. */
328 #define RMATCH(ra,rb,rc,rd,re,rw)\
329 {\
330 heapframe *newframe = frame->Xnextframe;\
331 if (newframe == NULL)\
332 {\
333 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
334 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
335 newframe->Xnextframe = NULL;\
336 frame->Xnextframe = newframe;\
337 }\
338 frame->Xwhere = rw;\
339 newframe->Xeptr = ra;\
340 newframe->Xecode = rb;\
341 newframe->Xmstart = mstart;\
342 newframe->Xoffset_top = rc;\
343 newframe->Xeptrb = re;\
344 newframe->Xrdepth = frame->Xrdepth + 1;\
345 newframe->Xprevframe = frame;\
346 frame = newframe;\
347 DPRINTF(("restarting from line %d\n", __LINE__));\
348 goto HEAP_RECURSE;\
349 L_##rw:\
350 DPRINTF(("jumped back to line %d\n", __LINE__));\
351 }
352
/* Heap form of RRETURN: make the previous frame current again. If there is
one, the result is handed over in rrc and control goes to HEAP_RETURN; if there
is none, this was the outermost "call" and match() really returns ra.

The frame being left is not released here; it stays linked from its
predecessor's Xnextframe, where RMATCH will find and reuse it.

NOTE(review): ra is expanded after "frame" has been moved back, so an argument
that names a frame-resident variable would be read from the previous frame.
The invocations visible in this file pass rrc or a constant, for which this
makes no difference. */
353 #define RRETURN(ra)\
354 {\
355 heapframe *oldframe = frame;\
356 frame = oldframe->Xprevframe;\
357 if (frame != NULL)\
358 {\
359 rrc = ra;\
360 goto HEAP_RETURN;\
361 }\
362 return ra;\
363 }
364
365
366 /* Structure for remembering the local variables in a private frame */
367
/* One simulated stack frame of match() for the NO_RECURSE build. Frames form a
doubly linked chain that the heap versions of RMATCH and RRETURN walk forwards
and backwards. Inside match() each member is reached through a macro of the
same name without the X prefix (for example, eptr is frame->Xeptr). */
368 typedef struct heapframe {
369 struct heapframe *Xprevframe; /* Frame that issued the RMATCH; RRETURN goes back to it */
370 struct heapframe *Xnextframe; /* Next frame in the chain; created on demand by RMATCH and reused */
371
372 /* Function arguments that may change */
373
374 PCRE_PUCHAR Xeptr; /* match() argument eptr */
375 const pcre_uchar *Xecode; /* match() argument ecode */
376 PCRE_PUCHAR Xmstart; /* match() argument mstart */
377 int Xoffset_top; /* match() argument offset_top */
378 eptrblock *Xeptrb; /* match() argument eptrb */
379 unsigned int Xrdepth; /* match() argument rdepth; RMATCH stores the caller's value plus one */
380
381 /* Function local variables */
382
383 PCRE_PUCHAR Xcallpat;
384 #ifdef SUPPORT_UTF
385 PCRE_PUCHAR Xcharptr;
386 #endif
387 PCRE_PUCHAR Xdata; /* Also used in match() under the name save_mark */
388 PCRE_PUCHAR Xnext;
389 PCRE_PUCHAR Xpp;
390 PCRE_PUCHAR Xprev;
391 PCRE_PUCHAR Xsaved_eptr;
392
393 recursion_info Xnew_recursive;
394
395 BOOL Xcur_is_word; /* Also used in match() under the name allow_zero */
396 BOOL Xcondition; /* Also used in match() under the names cbegroup and condassert */
397 BOOL Xprev_is_word; /* Also used in match() under the name matched_once */
398
399 #ifdef SUPPORT_UCP
400 int Xprop_type;
401 int Xprop_value;
402 int Xprop_fail_result;
403 int Xoclength;
404 pcre_uchar Xocchars[6];
405 #endif
406
407 int Xcodelink; /* Also used in match() under the name code_offset */
408 int Xctype;
409 unsigned int Xfc; /* Frame-resident fc; in the recursive build fc is just another name for c */
410 int Xfi; /* Frame-resident fi; in the recursive build fi is just another name for i */
411 int Xlength;
412 int Xmax;
413 int Xmin;
414 int Xnumber; /* Also used in match() under the name foc */
415 int Xoffset;
416 int Xop;
417 int Xsave_capture_last;
418 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
419 int Xstacksave[REC_STACK_SAVE_MAX];
420
421 eptrblock Xnewptrb;
422
423 /* Where to jump back to */
424
425 int Xwhere; /* Call-site number (an RMn value) stored by RMATCH before it switches frames */
426
427 } heapframe;
428
429 #endif
430
431
432 /***************************************************************************
433 ***************************************************************************/
434
435
436
437 /*************************************************
438 * Match from current position *
439 *************************************************/
440
441 /* This function is called recursively in many circumstances. Whenever it
442 returns a negative (error) response, the outer incarnation must also return the
443 same response. */
444
445 /* These macros pack up tests that are used for partial matching, and which
446 appear several times in the code. We set the "hit end" flag if the pointer is
447 at the end of the subject and also past the start of the subject (i.e.
448 something has been matched). For hard partial matching, we then return
449 immediately. The second one is used when we already know we are past the end of
450 the subject. */
451
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0) and eptr
has reached or passed md->end_subject and is also beyond md->start_used_ptr,
record the fact in md->hitend; when md->partial is greater than 1, leave
match() at once with PCRE_ERROR_PARTIAL.

The expansion is a bare "if" statement, not a braced block or a
do { } while (0), so an invocation must not be directly followed by "else". */
452 #define CHECK_PARTIAL()\
453 if (md->partial != 0 && eptr >= md->end_subject && \
454 eptr > md->start_used_ptr) \
455 { \
456 md->hitend = TRUE; \
457 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
458 }
459
/* SCHECK_PARTIAL: the same action without the end-of-subject comparison, for
places where the caller has already established that the subject is exhausted.
The same caution about a following "else" applies. */
460 #define SCHECK_PARTIAL()\
461 if (md->partial != 0 && eptr > md->start_used_ptr) \
462 { \
463 md->hitend = TRUE; \
464 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
465 }
466
467
468 /* Performance note: It might be tempting to extract commonly used fields from
469 the md structure (e.g. utf, end_subject) into individual variables to improve
470 performance. Tests using gcc on a SPARC disproved this; in the first case, it
471 made performance worse.
472
473 Arguments:
474 eptr pointer to current character in subject
475 ecode pointer to current position in compiled code
476 mstart pointer to the current match start position (can be modified
477 by encountering \K)
478 offset_top current top pointer
479 md pointer to "static" info for the match
480 eptrb pointer to chain of blocks containing eptr at start of
481 brackets - for testing for empty matches
482 rdepth the recursion depth
483
484 Returns: MATCH_MATCH if matched ) these values are >= 0
485 MATCH_NOMATCH if failed to match )
486 a negative MATCH_xxx value for PRUNE, SKIP, etc
487 a negative PCRE_ERROR_xxx value if aborted by an error condition
488 (e.g. stopped by repeated call or recursion limit)
489 */
490
491 static int
492 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
493 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
494 unsigned int rdepth)
495 {
496 /* These variables do not need to be preserved over recursion in this function,
497 so they can be ordinary variables in all cases. Mark some of them with
498 "register" because they are used a lot in loops. */
499
500 register int rrc; /* Returns from recursive calls */
501 register int i; /* Used for loops not involving calls to RMATCH() */
502 register unsigned int c; /* Character values not kept over RMATCH() calls */
503 register BOOL utf; /* Local copy of UTF flag for speed */
504
505 BOOL minimize, possessive; /* Quantifier options */
506 BOOL caseless;
507 int condcode;
508
509 /* When recursion is not being used, all "local" variables that have to be
510 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
511 frame on the stack here; subsequent instantiations are obtained from the heap
512 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
513 the top-level on the stack rather than malloc-ing them all gives a performance
514 boost in many cases where there is not much "recursion". */
515
516 #ifdef NO_RECURSE
517 heapframe *frame = (heapframe *)md->match_frames_base;
518
519 /* Copy in the original argument variables */
520
521 frame->Xeptr = eptr;
522 frame->Xecode = ecode;
523 frame->Xmstart = mstart;
524 frame->Xoffset_top = offset_top;
525 frame->Xeptrb = eptrb;
526 frame->Xrdepth = rdepth;
527
528 /* This is where control jumps back to in order to effect "recursion" */
529
530 HEAP_RECURSE:
531
532 /* Macros make the argument variables come from the current frame */
533
534 #define eptr frame->Xeptr
535 #define ecode frame->Xecode
536 #define mstart frame->Xmstart
537 #define offset_top frame->Xoffset_top
538 #define eptrb frame->Xeptrb
539 #define rdepth frame->Xrdepth
540
541 /* Ditto for the local variables */
542
543 #ifdef SUPPORT_UTF
544 #define charptr frame->Xcharptr
545 #endif
546 #define callpat frame->Xcallpat
547 #define codelink frame->Xcodelink
548 #define data frame->Xdata
549 #define next frame->Xnext
550 #define pp frame->Xpp
551 #define prev frame->Xprev
552 #define saved_eptr frame->Xsaved_eptr
553
554 #define new_recursive frame->Xnew_recursive
555
556 #define cur_is_word frame->Xcur_is_word
557 #define condition frame->Xcondition
558 #define prev_is_word frame->Xprev_is_word
559
560 #ifdef SUPPORT_UCP
561 #define prop_type frame->Xprop_type
562 #define prop_value frame->Xprop_value
563 #define prop_fail_result frame->Xprop_fail_result
564 #define oclength frame->Xoclength
565 #define occhars frame->Xocchars
566 #endif
567
568 #define ctype frame->Xctype
569 #define fc frame->Xfc
570 #define fi frame->Xfi
571 #define length frame->Xlength
572 #define max frame->Xmax
573 #define min frame->Xmin
574 #define number frame->Xnumber
575 #define offset frame->Xoffset
576 #define op frame->Xop
577 #define save_capture_last frame->Xsave_capture_last
578 #define save_offset1 frame->Xsave_offset1
579 #define save_offset2 frame->Xsave_offset2
580 #define save_offset3 frame->Xsave_offset3
581 #define stacksave frame->Xstacksave
582
583 #define newptrb frame->Xnewptrb
584
585 /* When recursion is being used, local variables are allocated on the stack and
586 get preserved during recursion in the normal way. In this environment, fi and
587 i, and fc and c, can be the same variables. */
588
589 #else /* NO_RECURSE not defined */
590 #define fi i
591 #define fc c
592
593 /* Many of the following variables are used only in small blocks of the code.
594 My normal style of coding would have declared them within each of those blocks.
595 However, in order to accommodate the version of this code that uses an external
596 "stack" implemented on the heap, it is easier to declare them all here, so the
597 declarations can be cut out in a block. The only declarations within blocks
598 below are for variables that do not have to be preserved over a recursive call
599 to RMATCH(). */
600
601 #ifdef SUPPORT_UTF
602 const pcre_uchar *charptr;
603 #endif
604 const pcre_uchar *callpat;
605 const pcre_uchar *data;
606 const pcre_uchar *next;
607 PCRE_PUCHAR pp;
608 const pcre_uchar *prev;
609 PCRE_PUCHAR saved_eptr;
610
611 recursion_info new_recursive;
612
613 BOOL cur_is_word;
614 BOOL condition;
615 BOOL prev_is_word;
616
617 #ifdef SUPPORT_UCP
618 int prop_type;
619 int prop_value;
620 int prop_fail_result;
621 int oclength;
622 pcre_uchar occhars[6];
623 #endif
624
625 int codelink;
626 int ctype;
627 int length;
628 int max;
629 int min;
630 int number;
631 int offset;
632 int op;
633 int save_capture_last;
634 int save_offset1, save_offset2, save_offset3;
635 int stacksave[REC_STACK_SAVE_MAX];
636
637 eptrblock newptrb;
638
639 /* There is a special fudge for calling match() in a way that causes it to
640 measure the size of its basic stack frame when the stack is being used for
641 recursion. The second argument (ecode) being NULL triggers this behaviour. It
642 cannot normally ever be NULL. The return is the negated value of the frame
643 size. */
644
645 if (ecode == NULL)
646 {
647 if (rdepth == 0)
648 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
649 else
650 {
651 int len = (char *)&rdepth - (char *)eptr;
652 return (len > 0)? -len : len;
653 }
654 }
655 #endif /* NO_RECURSE */
656
657 /* To save space on the stack and in the heap frame, I have doubled up on some
658 of the local variables that are used only in localised parts of the code, but
659 still need to be preserved over recursive calls of match(). These macros define
660 the alternative names that are used. */
661
662 #define allow_zero cur_is_word
663 #define cbegroup condition
664 #define code_offset codelink
665 #define condassert condition
666 #define matched_once prev_is_word
667 #define foc number
668 #define save_mark data
669
670 /* These statements are here to stop the compiler complaining about uninitialized
671 variables. */
672
673 #ifdef SUPPORT_UCP
674 prop_value = 0;
675 prop_fail_result = 0;
676 #endif
677
678
679 /* This label is used for tail recursion, which is used in a few cases even
680 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
681 used. Thanks to Ian Taylor for noticing this possibility and sending the
682 original patch. */
683
684 TAIL_RECURSE:
685
686 /* OK, now we can get on with the real code of the function. Recursive calls
687 are specified by the macro RMATCH and RRETURN is used to return. When
688 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
689 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
690 defined). However, RMATCH isn't like a function call because it's quite a
691 complicated macro. It has to be used in one particular way. This shouldn't,
692 however, impact performance when true recursion is being used. */
693
694 #ifdef SUPPORT_UTF
695 utf = md->utf; /* Local copy of the flag */
696 #else
697 utf = FALSE;
698 #endif
699
700 /* First check that we haven't called match() too many times, or that we
701 haven't exceeded the recursive call limit. */
702
703 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
704 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
705
706 /* At the start of a group with an unlimited repeat that may match an empty
707 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
708 done this way to save having to use another function argument, which would take
709 up space on the stack. See also MATCH_CONDASSERT below.
710
711 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
712 such remembered pointers, to be checked when we hit the closing ket, in order
713 to break infinite loops that match no characters. When match() is called in
714 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
715 NOT be used with tail recursion, because the memory block that is used is on
716 the stack, so a new one may be required for each match(). */
717
718 if (md->match_function_type == MATCH_CBEGROUP)
719 {
720 newptrb.epb_saved_eptr = eptr;
721 newptrb.epb_prev = eptrb;
722 eptrb = &newptrb;
723 md->match_function_type = 0;
724 }
725
726 /* Now start processing the opcodes. */
727
728 for (;;)
729 {
730 minimize = possessive = FALSE;
731 op = *ecode;
732
733 switch(op)
734 {
735 case OP_MARK:
736 md->nomatch_mark = ecode + 2;
737 md->mark = NULL; /* In case previously set by assertion */
738 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
739 eptrb, RM55);
740 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
741 md->mark == NULL) md->mark = ecode + 2;
742
743 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
744 argument, and we must check whether that argument matches this MARK's
745 argument. It is passed back in md->start_match_ptr (an overloading of that
746 variable). If it does match, we reset that variable to the current subject
747 position and return MATCH_SKIP. Otherwise, pass back the return code
748 unaltered. */
749
750 else if (rrc == MATCH_SKIP_ARG &&
751 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
752 {
753 md->start_match_ptr = eptr;
754 RRETURN(MATCH_SKIP);
755 }
756 RRETURN(rrc);
757
758 case OP_FAIL:
759 RRETURN(MATCH_NOMATCH);
760
761 /* COMMIT overrides PRUNE, SKIP, and THEN */
762
763 case OP_COMMIT:
764 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
765 eptrb, RM52);
766 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
767 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
768 rrc != MATCH_THEN)
769 RRETURN(rrc);
770 RRETURN(MATCH_COMMIT);
771
772 /* PRUNE overrides THEN */
773
774 case OP_PRUNE:
775 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
776 eptrb, RM51);
777 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 RRETURN(MATCH_PRUNE);
779
780 case OP_PRUNE_ARG:
781 md->nomatch_mark = ecode + 2;
782 md->mark = NULL; /* In case previously set by assertion */
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
784 eptrb, RM56);
785 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
786 md->mark == NULL) md->mark = ecode + 2;
787 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
788 RRETURN(MATCH_PRUNE);
789
790 /* SKIP overrides PRUNE and THEN */
791
792 case OP_SKIP:
793 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
794 eptrb, RM53);
795 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
796 RRETURN(rrc);
797 md->start_match_ptr = eptr; /* Pass back current position */
798 RRETURN(MATCH_SKIP);
799
800 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
801 nomatch_mark. There is a flag that disables this opcode when re-matching a
802 pattern that ended with a SKIP for which there was not a matching MARK. */
803
804 case OP_SKIP_ARG:
805 if (md->ignore_skip_arg)
806 {
807 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
808 break;
809 }
810 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
811 eptrb, RM57);
812 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
813 RRETURN(rrc);
814
815 /* Pass back the current skip name by overloading md->start_match_ptr and
816 returning the special MATCH_SKIP_ARG return code. This will either be
817 caught by a matching MARK, or get to the top, where it causes a rematch
818 with the md->ignore_skip_arg flag set. */
819
820 md->start_match_ptr = ecode + 2;
821 RRETURN(MATCH_SKIP_ARG);
822
823 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
824 the branch in which it occurs can be determined. Overload the start of
825 match pointer to do this. */
826
827 case OP_THEN:
828 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
829 eptrb, RM54);
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 md->start_match_ptr = ecode;
832 RRETURN(MATCH_THEN);
833
834 case OP_THEN_ARG:
835 md->nomatch_mark = ecode + 2;
836 md->mark = NULL; /* In case previously set by assertion */
837 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
838 md, eptrb, RM58);
839 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
840 md->mark == NULL) md->mark = ecode + 2;
841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
842 md->start_match_ptr = ecode;
843 RRETURN(MATCH_THEN);
844
845 /* Handle an atomic group that does not contain any capturing parentheses.
846 This can be handled like an assertion. Prior to 8.13, all atomic groups
847 were handled this way. In 8.13, the code was changed as below for ONCE, so
848 that backups pass through the group and thereby reset captured values.
849 However, this uses a lot more stack, so in 8.20, atomic groups that do not
850 contain any captures generate OP_ONCE_NC, which can be handled in the old,
851 less stack intensive way.
852
853 Check the alternative branches in turn - the matching won't pass the KET
854 for this kind of subpattern. If any one branch matches, we carry on as at
855 the end of a normal bracket, leaving the subject pointer, but resetting
856 the start-of-match value in case it was changed by \K. */
857
858 case OP_ONCE_NC:
859 prev = ecode;
860 saved_eptr = eptr;
861 save_mark = md->mark;
862 do
863 {
864 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
865 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
866 {
867 mstart = md->start_match_ptr;
868 break;
869 }
870 if (rrc == MATCH_THEN)
871 {
872 next = ecode + GET(ecode,1);
873 if (md->start_match_ptr < next &&
874 (*ecode == OP_ALT || *next == OP_ALT))
875 rrc = MATCH_NOMATCH;
876 }
877
878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
879 ecode += GET(ecode,1);
880 md->mark = save_mark;
881 }
882 while (*ecode == OP_ALT);
883
884 /* If hit the end of the group (which could be repeated), fail */
885
886 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
887
888 /* Continue as from after the group, updating the offsets high water
889 mark, since extracts may have been taken. */
890
891 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
892
893 offset_top = md->end_offset_top;
894 eptr = md->end_match_ptr;
895
896 /* For a non-repeating ket, just continue at this level. This also
897 happens for a repeating ket if no characters were matched in the group.
898 This is the forcible breaking of infinite loops as implemented in Perl
899 5.005. */
900
901 if (*ecode == OP_KET || eptr == saved_eptr)
902 {
903 ecode += 1+LINK_SIZE;
904 break;
905 }
906
907 /* The repeating kets try the rest of the pattern or restart from the
908 preceding bracket, in the appropriate order. The second "call" of match()
909 uses tail recursion, to avoid using another stack frame. */
910
911 if (*ecode == OP_KETRMIN)
912 {
913 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
915 ecode = prev;
916 goto TAIL_RECURSE;
917 }
918 else /* OP_KETRMAX */
919 {
920 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
921 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
922 ecode += 1 + LINK_SIZE;
923 goto TAIL_RECURSE;
924 }
925 /* Control never gets here */
926
927 /* Handle a capturing bracket, other than those that are possessive with an
928 unlimited repeat. If there is space in the offset vector, save the current
929 subject position in the working slot at the top of the vector. We mustn't
930 change the current values of the data slot, because they may be set from a
931 previous iteration of this group, and be referred to by a reference inside
932 the group. A failure to match might occur after the group has succeeded,
933 if something later on doesn't match. For this reason, we need to restore
934 the working value and also the values of the final offsets, in case they
935 were set by a previous iteration of the same bracket.
936
937 If there isn't enough space in the offset vector, treat this as if it were
938 a non-capturing bracket. Don't worry about setting the flag for the error
939 case here; that is handled in the code for KET. */
940
941 case OP_CBRA:
942 case OP_SCBRA:
943 number = GET2(ecode, 1+LINK_SIZE);
944 offset = number << 1;
945
946 #ifdef PCRE_DEBUG
947 printf("start bracket %d\n", number);
948 printf("subject=");
949 pchars(eptr, 16, TRUE, md);
950 printf("\n");
951 #endif
952
953 if (offset < md->offset_max)
954 {
955 save_offset1 = md->offset_vector[offset];
956 save_offset2 = md->offset_vector[offset+1];
957 save_offset3 = md->offset_vector[md->offset_end - number];
958 save_capture_last = md->capture_last;
959 save_mark = md->mark;
960
961 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
962 md->offset_vector[md->offset_end - number] =
963 (int)(eptr - md->start_subject);
964
965 for (;;)
966 {
967 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
968 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
969 eptrb, RM1);
970 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
971
972 /* If we backed up to a THEN, check whether it is within the current
973 branch by comparing the address of the THEN that is passed back with
974 the end of the branch. If it is within the current branch, and the
975 branch is one of two or more alternatives (it either starts or ends
976 with OP_ALT), we have reached the limit of THEN's action, so convert
977 the return code to NOMATCH, which will cause normal backtracking to
978 happen from now on. Otherwise, THEN is passed back to an outer
979 alternative. This implements Perl's treatment of parenthesized groups,
980 where a group not containing | does not affect the current alternative,
981 that is, (X) is NOT the same as (X|(*F)). */
982
983 if (rrc == MATCH_THEN)
984 {
985 next = ecode + GET(ecode,1);
986 if (md->start_match_ptr < next &&
987 (*ecode == OP_ALT || *next == OP_ALT))
988 rrc = MATCH_NOMATCH;
989 }
990
991 /* Anything other than NOMATCH is passed back. */
992
993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
994 md->capture_last = save_capture_last;
995 ecode += GET(ecode, 1);
996 md->mark = save_mark;
997 if (*ecode != OP_ALT) break;
998 }
999
1000 DPRINTF(("bracket %d failed\n", number));
1001 md->offset_vector[offset] = save_offset1;
1002 md->offset_vector[offset+1] = save_offset2;
1003 md->offset_vector[md->offset_end - number] = save_offset3;
1004
1005 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1006
1007 RRETURN(rrc);
1008 }
1009
1010 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1011 as a non-capturing bracket. */
1012
1013 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1014 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1015
1016 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1017
1018 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1019 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1020
1021 /* Non-capturing or atomic group, except for possessive with unlimited
1022 repeat and ONCE group with no captures. Loop for all the alternatives.
1023
1024 When we get to the final alternative within the brackets, we used to return
1025 the result of a recursive call to match() whatever happened so it was
1026 possible to reduce stack usage by turning this into a tail recursion,
1027 except in the case of a possibly empty group. However, now that there is
1028 the possibility of (*THEN) occurring in the final alternative, this
1029 optimization is no longer always possible.
1030
1031 We can optimize if we know there are no (*THEN)s in the pattern; at present
1032 this is the best that can be done.
1033
1034 MATCH_ONCE is returned when the end of an atomic group is successfully
1035 reached, but subsequent matching fails. It passes back up the tree (causing
1036 captured values to be reset) until the original atomic group level is
1037 reached. This is tested by comparing md->once_target with the start of the
1038 group. At this point, the return is converted into MATCH_NOMATCH so that
1039 previous backup points can be taken. */
1040
1041 case OP_ONCE:
1042 case OP_BRA:
1043 case OP_SBRA:
1044 DPRINTF(("start non-capturing bracket\n"));
1045
1046 for (;;)
1047 {
1048 if (op >= OP_SBRA || op == OP_ONCE)
1049 md->match_function_type = MATCH_CBEGROUP;
1050
1051 /* If this is not a possibly empty group, and there are no (*THEN)s in
1052 the pattern, and this is the final alternative, optimize as described
1053 above. */
1054
1055 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1056 {
1057 ecode += PRIV(OP_lengths)[*ecode];
1058 goto TAIL_RECURSE;
1059 }
1060
1061 /* In all other cases, we have to make another call to match(). */
1062
1063 save_mark = md->mark;
1064 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1065 RM2);
1066
1067 /* See comment in the code for capturing groups above about handling
1068 THEN. */
1069
1070 if (rrc == MATCH_THEN)
1071 {
1072 next = ecode + GET(ecode,1);
1073 if (md->start_match_ptr < next &&
1074 (*ecode == OP_ALT || *next == OP_ALT))
1075 rrc = MATCH_NOMATCH;
1076 }
1077
1078 if (rrc != MATCH_NOMATCH)
1079 {
1080 if (rrc == MATCH_ONCE)
1081 {
1082 const pcre_uchar *scode = ecode;
1083 if (*scode != OP_ONCE) /* If not at start, find it */
1084 {
1085 while (*scode == OP_ALT) scode += GET(scode, 1);
1086 scode -= GET(scode, 1);
1087 }
1088 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1089 }
1090 RRETURN(rrc);
1091 }
1092 ecode += GET(ecode, 1);
1093 md->mark = save_mark;
1094 if (*ecode != OP_ALT) break;
1095 }
1096
1097 RRETURN(MATCH_NOMATCH);
1098
1099 /* Handle possessive capturing brackets with an unlimited repeat. We come
1100 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1101 handled similarly to the normal case above. However, the matching is
1102 different. The end of these brackets will always be OP_KETRPOS, which
1103 returns MATCH_KETRPOS without going further in the pattern. By this means
1104 we can handle the group by iteration rather than recursion, thereby
1105 reducing the amount of stack needed. */
1106
1107 case OP_CBRAPOS:
1108 case OP_SCBRAPOS:
1109 allow_zero = FALSE;
1110
1111 POSSESSIVE_CAPTURE:
1112 number = GET2(ecode, 1+LINK_SIZE);
1113 offset = number << 1;
1114
1115 #ifdef PCRE_DEBUG
1116 printf("start possessive bracket %d\n", number);
1117 printf("subject=");
1118 pchars(eptr, 16, TRUE, md);
1119 printf("\n");
1120 #endif
1121
1122 if (offset < md->offset_max)
1123 {
1124 matched_once = FALSE;
1125 code_offset = (int)(ecode - md->start_code);
1126
1127 save_offset1 = md->offset_vector[offset];
1128 save_offset2 = md->offset_vector[offset+1];
1129 save_offset3 = md->offset_vector[md->offset_end - number];
1130 save_capture_last = md->capture_last;
1131
1132 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1133
1134 /* Each time round the loop, save the current subject position for use
1135 when the group matches. For MATCH_MATCH, the group has matched, so we
1136 restart it with a new subject starting position, remembering that we had
1137 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1138 usual. If we haven't matched any alternatives in any iteration, check to
1139 see if a previous iteration matched. If so, the group has matched;
1140 continue from afterwards. Otherwise it has failed; restore the previous
1141 capture values before returning NOMATCH. */
1142
1143 for (;;)
1144 {
1145 md->offset_vector[md->offset_end - number] =
1146 (int)(eptr - md->start_subject);
1147 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1148 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1149 eptrb, RM63);
1150 if (rrc == MATCH_KETRPOS)
1151 {
1152 offset_top = md->end_offset_top;
1153 eptr = md->end_match_ptr;
1154 ecode = md->start_code + code_offset;
1155 save_capture_last = md->capture_last;
1156 matched_once = TRUE;
1157 continue;
1158 }
1159
1160 /* See comment in the code for capturing groups above about handling
1161 THEN. */
1162
1163 if (rrc == MATCH_THEN)
1164 {
1165 next = ecode + GET(ecode,1);
1166 if (md->start_match_ptr < next &&
1167 (*ecode == OP_ALT || *next == OP_ALT))
1168 rrc = MATCH_NOMATCH;
1169 }
1170
1171 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1172 md->capture_last = save_capture_last;
1173 ecode += GET(ecode, 1);
1174 if (*ecode != OP_ALT) break;
1175 }
1176
1177 if (!matched_once)
1178 {
1179 md->offset_vector[offset] = save_offset1;
1180 md->offset_vector[offset+1] = save_offset2;
1181 md->offset_vector[md->offset_end - number] = save_offset3;
1182 }
1183
1184 if (allow_zero || matched_once)
1185 {
1186 ecode += 1 + LINK_SIZE;
1187 break;
1188 }
1189
1190 RRETURN(MATCH_NOMATCH);
1191 }
1192
1193 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1194 as a non-capturing bracket. */
1195
1196 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1198
1199 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1200
1201 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1202 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1203
1204 /* Non-capturing possessive bracket with unlimited repeat. We come here
1205 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1206 without the capturing complication. It is written out separately for speed
1207 and cleanliness. */
1208
1209 case OP_BRAPOS:
1210 case OP_SBRAPOS:
1211 allow_zero = FALSE;
1212
1213 POSSESSIVE_NON_CAPTURE:
1214 matched_once = FALSE;
1215 code_offset = (int)(ecode - md->start_code);
1216
1217 for (;;)
1218 {
1219 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1220 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1221 eptrb, RM48);
1222 if (rrc == MATCH_KETRPOS)
1223 {
1224 offset_top = md->end_offset_top;
1225 eptr = md->end_match_ptr;
1226 ecode = md->start_code + code_offset;
1227 matched_once = TRUE;
1228 continue;
1229 }
1230
1231 /* See comment in the code for capturing groups above about handling
1232 THEN. */
1233
1234 if (rrc == MATCH_THEN)
1235 {
1236 next = ecode + GET(ecode,1);
1237 if (md->start_match_ptr < next &&
1238 (*ecode == OP_ALT || *next == OP_ALT))
1239 rrc = MATCH_NOMATCH;
1240 }
1241
1242 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1243 ecode += GET(ecode, 1);
1244 if (*ecode != OP_ALT) break;
1245 }
1246
1247 if (matched_once || allow_zero)
1248 {
1249 ecode += 1 + LINK_SIZE;
1250 break;
1251 }
1252 RRETURN(MATCH_NOMATCH);
1253
1254 /* Control never reaches here. */
1255
1256 /* Conditional group: compilation checked that there are no more than
1257 two branches. If the condition is false, skipping the first branch takes us
1258 past the end if there is only one branch, but that's OK because that is
1259 exactly what going to the ket would do. */
1260
1261 case OP_COND:
1262 case OP_SCOND:
1263 codelink = GET(ecode, 1);
1264
1265 /* Because of the way auto-callout works during compile, a callout item is
1266 inserted between OP_COND and an assertion condition. */
1267
1268 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1269 {
1270 if (PUBL(callout) != NULL)
1271 {
1272 PUBL(callout_block) cb;
1273 cb.version = 2; /* Version 1 of the callout block */
1274 cb.callout_number = ecode[LINK_SIZE+2];
1275 cb.offset_vector = md->offset_vector;
1276 #if defined COMPILE_PCRE8
1277 cb.subject = (PCRE_SPTR)md->start_subject;
1278 #elif defined COMPILE_PCRE16
1279 cb.subject = (PCRE_SPTR16)md->start_subject;
1280 #elif defined COMPILE_PCRE32
1281 cb.subject = (PCRE_SPTR32)md->start_subject;
1282 #endif
1283 cb.subject_length = (int)(md->end_subject - md->start_subject);
1284 cb.start_match = (int)(mstart - md->start_subject);
1285 cb.current_position = (int)(eptr - md->start_subject);
1286 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1287 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1288 cb.capture_top = offset_top/2;
1289 cb.capture_last = md->capture_last;
1290 cb.callout_data = md->callout_data;
1291 cb.mark = md->nomatch_mark;
1292 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1293 if (rrc < 0) RRETURN(rrc);
1294 }
1295 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1296 }
1297
1298 condcode = ecode[LINK_SIZE+1];
1299
1300 /* Now see what the actual condition is */
1301
1302 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1303 {
1304 if (md->recursive == NULL) /* Not recursing => FALSE */
1305 {
1306 condition = FALSE;
1307 ecode += GET(ecode, 1);
1308 }
1309 else
1310 {
1311 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1312 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1313
1314 /* If the test is for recursion into a specific subpattern, and it is
1315 false, but the test was set up by name, scan the table to see if the
1316 name refers to any other numbers, and test them. The condition is true
1317 if any one is set. */
1318
1319 if (!condition && condcode == OP_NRREF)
1320 {
1321 pcre_uchar *slotA = md->name_table;
1322 for (i = 0; i < md->name_count; i++)
1323 {
1324 if (GET2(slotA, 0) == recno) break;
1325 slotA += md->name_entry_size;
1326 }
1327
1328 /* Found a name for the number - there can be only one; duplicate
1329 names for different numbers are allowed, but not vice versa. First
1330 scan down for duplicates. */
1331
1332 if (i < md->name_count)
1333 {
1334 pcre_uchar *slotB = slotA;
1335 while (slotB > md->name_table)
1336 {
1337 slotB -= md->name_entry_size;
1338 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1339 {
1340 condition = GET2(slotB, 0) == md->recursive->group_num;
1341 if (condition) break;
1342 }
1343 else break;
1344 }
1345
1346 /* Scan up for duplicates */
1347
1348 if (!condition)
1349 {
1350 slotB = slotA;
1351 for (i++; i < md->name_count; i++)
1352 {
1353 slotB += md->name_entry_size;
1354 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1355 {
1356 condition = GET2(slotB, 0) == md->recursive->group_num;
1357 if (condition) break;
1358 }
1359 else break;
1360 }
1361 }
1362 }
1363 }
1364
1365 /* Choose branch according to the condition */
1366
1367 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1368 }
1369 }
1370
1371 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1372 {
1373 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1374 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1375
1376 /* If the numbered capture is unset, but the reference was by name,
1377 scan the table to see if the name refers to any other numbers, and test
1378 them. The condition is true if any one is set. This is tediously similar
1379 to the code above, but not close enough to try to amalgamate. */
1380
1381 if (!condition && condcode == OP_NCREF)
1382 {
1383 int refno = offset >> 1;
1384 pcre_uchar *slotA = md->name_table;
1385
1386 for (i = 0; i < md->name_count; i++)
1387 {
1388 if (GET2(slotA, 0) == refno) break;
1389 slotA += md->name_entry_size;
1390 }
1391
1392 /* Found a name for the number - there can be only one; duplicate names
1393 for different numbers are allowed, but not vice versa. First scan down
1394 for duplicates. */
1395
1396 if (i < md->name_count)
1397 {
1398 pcre_uchar *slotB = slotA;
1399 while (slotB > md->name_table)
1400 {
1401 slotB -= md->name_entry_size;
1402 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1403 {
1404 offset = GET2(slotB, 0) << 1;
1405 condition = offset < offset_top &&
1406 md->offset_vector[offset] >= 0;
1407 if (condition) break;
1408 }
1409 else break;
1410 }
1411
1412 /* Scan up for duplicates */
1413
1414 if (!condition)
1415 {
1416 slotB = slotA;
1417 for (i++; i < md->name_count; i++)
1418 {
1419 slotB += md->name_entry_size;
1420 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1421 {
1422 offset = GET2(slotB, 0) << 1;
1423 condition = offset < offset_top &&
1424 md->offset_vector[offset] >= 0;
1425 if (condition) break;
1426 }
1427 else break;
1428 }
1429 }
1430 }
1431 }
1432
1433 /* Choose branch according to the condition */
1434
1435 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1436 }
1437
1438 else if (condcode == OP_DEF) /* DEFINE - always false */
1439 {
1440 condition = FALSE;
1441 ecode += GET(ecode, 1);
1442 }
1443
1444 /* The condition is an assertion. Call match() to evaluate it - setting
1445 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1446 an assertion. */
1447
1448 else
1449 {
1450 md->match_function_type = MATCH_CONDASSERT;
1451 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1452 if (rrc == MATCH_MATCH)
1453 {
1454 if (md->end_offset_top > offset_top)
1455 offset_top = md->end_offset_top; /* Captures may have happened */
1456 condition = TRUE;
1457 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1458 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1459 }
1460
1461 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1462 assertion; it is therefore treated as NOMATCH. */
1463
1464 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1465 {
1466 RRETURN(rrc); /* Need braces because of following else */
1467 }
1468 else
1469 {
1470 condition = FALSE;
1471 ecode += codelink;
1472 }
1473 }
1474
1475 /* We are now at the branch that is to be obeyed. As there is only one, can
1476 use tail recursion to avoid using another stack frame, except when there is
1477 unlimited repeat of a possibly empty group. In the latter case, a recursive
1478 call to match() is always required, unless the second alternative doesn't
1479 exist, in which case we can just plough on. Note that, for compatibility
1480 with Perl, the | in a conditional group is NOT treated as creating two
1481 alternatives. If a THEN is encountered in the branch, it propagates out to
1482 the enclosing alternative (unless nested in a deeper set of alternatives,
1483 of course). */
1484
1485 if (condition || *ecode == OP_ALT)
1486 {
1487 if (op != OP_SCOND)
1488 {
1489 ecode += 1 + LINK_SIZE;
1490 goto TAIL_RECURSE;
1491 }
1492
1493 md->match_function_type = MATCH_CBEGROUP;
1494 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1495 RRETURN(rrc);
1496 }
1497
1498 /* Condition false & no alternative; continue after the group. */
1499
1500 else
1501 {
1502 ecode += 1 + LINK_SIZE;
1503 }
1504 break;
1505
1506
1507 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1508 to close any currently open capturing brackets. */
1509
1510 case OP_CLOSE:
1511 number = GET2(ecode, 1);
1512 offset = number << 1;
1513
1514 #ifdef PCRE_DEBUG
1515 printf("end bracket %d at *ACCEPT", number);
1516 printf("\n");
1517 #endif
1518
1519 md->capture_last = number;
1520 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1521 {
1522 md->offset_vector[offset] =
1523 md->offset_vector[md->offset_end - number];
1524 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1525 if (offset_top <= offset) offset_top = offset + 2;
1526 }
1527 ecode += 1 + IMM2_SIZE;
1528 break;
1529
1530
1531 /* End of the pattern, either real or forced. */
1532
1533 case OP_END:
1534 case OP_ACCEPT:
1535 case OP_ASSERT_ACCEPT:
1536
1537 /* If we have matched an empty string, fail if not in an assertion and not
1538 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1539 is set and we have matched at the start of the subject. In both cases,
1540 backtracking will then try other alternatives, if any. */
1541
1542 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1543 md->recursive == NULL &&
1544 (md->notempty ||
1545 (md->notempty_atstart &&
1546 mstart == md->start_subject + md->start_offset)))
1547 RRETURN(MATCH_NOMATCH);
1548
1549 /* Otherwise, we have a match. */
1550
1551 md->end_match_ptr = eptr; /* Record where we ended */
1552 md->end_offset_top = offset_top; /* and how many extracts were taken */
1553 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1554
1555 /* For some reason, the macros don't work properly if an expression is
1556 given as the argument to RRETURN when the heap is in use. */
1557
1558 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1559 RRETURN(rrc);
1560
1561 /* Assertion brackets. Check the alternative branches in turn - the
1562 matching won't pass the KET for an assertion. If any one branch matches,
1563 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1564 start of each branch to move the current point backwards, so the code at
1565 this level is identical to the lookahead case. When the assertion is part
1566 of a condition, we want to return immediately afterwards. The caller of
1567 this incarnation of the match() function will have set MATCH_CONDASSERT in
1568 md->match_function_type, and one of these opcodes will be the first opcode
1569 that is processed. We use a local variable that is preserved over calls to
1570 match() to remember this case. */
1571
1572 case OP_ASSERT:
1573 case OP_ASSERTBACK:
1574 save_mark = md->mark;
1575 if (md->match_function_type == MATCH_CONDASSERT)
1576 {
1577 condassert = TRUE;
1578 md->match_function_type = 0;
1579 }
1580 else condassert = FALSE;
1581
1582 do
1583 {
1584 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1585 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1586 {
1587 mstart = md->start_match_ptr; /* In case \K reset it */
1588 break;
1589 }
1590 md->mark = save_mark;
1591
1592 /* A COMMIT failure must fail the entire assertion, without trying any
1593 subsequent branches. */
1594
1595 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1596
1597 /* PCRE does not allow THEN to escape beyond an assertion; it
1598 is treated as NOMATCH. */
1599
1600 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1601 ecode += GET(ecode, 1);
1602 }
1603 while (*ecode == OP_ALT);
1604
1605 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1606
1607 /* If checking an assertion for a condition, return MATCH_MATCH. */
1608
1609 if (condassert) RRETURN(MATCH_MATCH);
1610
1611 /* Continue from after the assertion, updating the offsets high water
1612 mark, since extracts may have been taken during the assertion. */
1613
1614 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1615 ecode += 1 + LINK_SIZE;
1616 offset_top = md->end_offset_top;
1617 continue;
1618
1619 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1620 PRUNE, or COMMIT means we must assume failure without checking subsequent
1621 branches. */
1622
1623 case OP_ASSERT_NOT:
1624 case OP_ASSERTBACK_NOT:
1625 save_mark = md->mark;
1626 if (md->match_function_type == MATCH_CONDASSERT)
1627 {
1628 condassert = TRUE;
1629 md->match_function_type = 0;
1630 }
1631 else condassert = FALSE;
1632
1633 do
1634 {
1635 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1636 md->mark = save_mark;
1637 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1638 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1639 {
1640 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1641 break;
1642 }
1643
1644 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1645 as NOMATCH. */
1646
1647 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1648 ecode += GET(ecode,1);
1649 }
1650 while (*ecode == OP_ALT);
1651
1652 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1653
1654 ecode += 1 + LINK_SIZE;
1655 continue;
1656
1657 /* Move the subject pointer back. This occurs only at the start of
1658 each branch of a lookbehind assertion. If we are too close to the start to
1659 move back, this match function fails. When working with UTF-8 we move
1660 back a number of characters, not bytes. */
1661
1662 case OP_REVERSE:
1663 #ifdef SUPPORT_UTF
1664 if (utf)
1665 {
1666 i = GET(ecode, 1);
1667 while (i-- > 0)
1668 {
1669 eptr--;
1670 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1671 BACKCHAR(eptr);
1672 }
1673 }
1674 else
1675 #endif
1676
1677 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1678
1679 {
1680 eptr -= GET(ecode, 1);
1681 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1682 }
1683
1684 /* Save the earliest consulted character, then skip to next op code */
1685
1686 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1687 ecode += 1 + LINK_SIZE;
1688 break;
1689
1690 /* The callout item calls an external function, if one is provided, passing
1691 details of the match so far. This is mainly for debugging, though the
1692 function is able to force a failure. */
1693
1694 case OP_CALLOUT:
1695 if (PUBL(callout) != NULL)
1696 {
1697 PUBL(callout_block) cb;
1698 cb.version = 2; /* Version 1 of the callout block */
1699 cb.callout_number = ecode[1];
1700 cb.offset_vector = md->offset_vector;
1701 #if defined COMPILE_PCRE8
1702 cb.subject = (PCRE_SPTR)md->start_subject;
1703 #elif defined COMPILE_PCRE16
1704 cb.subject = (PCRE_SPTR16)md->start_subject;
1705 #elif defined COMPILE_PCRE32
1706 cb.subject = (PCRE_SPTR32)md->start_subject;
1707 #endif
1708 cb.subject_length = (int)(md->end_subject - md->start_subject);
1709 cb.start_match = (int)(mstart - md->start_subject);
1710 cb.current_position = (int)(eptr - md->start_subject);
1711 cb.pattern_position = GET(ecode, 2);
1712 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1713 cb.capture_top = offset_top/2;
1714 cb.capture_last = md->capture_last;
1715 cb.callout_data = md->callout_data;
1716 cb.mark = md->nomatch_mark;
1717 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1718 if (rrc < 0) RRETURN(rrc);
1719 }
1720 ecode += 2 + 2*LINK_SIZE;
1721 break;
1722
1723 /* Recursion either matches the current regex, or some subexpression. The
1724 offset data is the offset to the starting bracket from the start of the
1725 whole pattern. (This is so that it works from duplicated subpatterns.)
1726
1727 The state of the capturing groups is preserved over recursion, and
1728 re-instated afterwards. We don't know how many are started and not yet
1729 finished (offset_top records the completed total) so we just have to save
1730 all the potential data. There may be up to 65535 such values, which is too
1731 large to put on the stack, but using malloc for small numbers seems
1732 expensive. As a compromise, the stack is used when there are no more than
1733 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1734
1735 There are also other values that have to be saved. We use a chained
1736 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1737 for the original version of this logic. It has, however, been hacked around
1738 a lot, so he is not to blame for the current way it works. */
1739
1740 case OP_RECURSE:
1741 {
1742 recursion_info *ri;
1743 int recno;
1744
1745 callpat = md->start_code + GET(ecode, 1);
1746 recno = (callpat == md->start_code)? 0 :
1747 GET2(callpat, 1 + LINK_SIZE);
1748
1749 /* Check for repeating a recursion without advancing the subject pointer.
1750 This should catch convoluted mutual recursions. (Some simple cases are
1751 caught at compile time.) */
1752
1753 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1754 if (recno == ri->group_num && eptr == ri->subject_position)
1755 RRETURN(PCRE_ERROR_RECURSELOOP);
1756
1757 /* Add to "recursing stack" */
1758
1759 new_recursive.group_num = recno;
1760 new_recursive.subject_position = eptr;
1761 new_recursive.prevrec = md->recursive;
1762 md->recursive = &new_recursive;
1763
1764 /* Where to continue from afterwards */
1765
1766 ecode += 1 + LINK_SIZE;
1767
1768 /* Now save the offset data */
1769
1770 new_recursive.saved_max = md->offset_end;
1771 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1772 new_recursive.offset_save = stacksave;
1773 else
1774 {
1775 new_recursive.offset_save =
1776 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1777 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1778 }
1779 memcpy(new_recursive.offset_save, md->offset_vector,
1780 new_recursive.saved_max * sizeof(int));
1781
1782 /* OK, now we can do the recursion. After processing each alternative,
1783 restore the offset data. If there were nested recursions, md->recursive
1784 might be changed, so reset it before looping. */
1785
1786 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1787 cbegroup = (*callpat >= OP_SBRA);
1788 do
1789 {
1790 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1791 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1792 md, eptrb, RM6);
1793 memcpy(md->offset_vector, new_recursive.offset_save,
1794 new_recursive.saved_max * sizeof(int));
1795 md->recursive = new_recursive.prevrec;
1796 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1797 {
1798 DPRINTF(("Recursion matched\n"));
1799 if (new_recursive.offset_save != stacksave)
1800 (PUBL(free))(new_recursive.offset_save);
1801
1802 /* Set where we got to in the subject, and reset the start in case
1803 it was changed by \K. This *is* propagated back out of a recursion,
1804 for Perl compatibility. */
1805
1806 eptr = md->end_match_ptr;
1807 mstart = md->start_match_ptr;
1808 goto RECURSION_MATCHED; /* Exit loop; end processing */
1809 }
1810
1811 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1812 is treated as NOMATCH. */
1813
1814 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1815 rrc != MATCH_COMMIT)
1816 {
1817 DPRINTF(("Recursion gave error %d\n", rrc));
1818 if (new_recursive.offset_save != stacksave)
1819 (PUBL(free))(new_recursive.offset_save);
1820 RRETURN(rrc);
1821 }
1822
1823 md->recursive = &new_recursive;
1824 callpat += GET(callpat, 1);
1825 }
1826 while (*callpat == OP_ALT);
1827
1828 DPRINTF(("Recursion didn't match\n"));
1829 md->recursive = new_recursive.prevrec;
1830 if (new_recursive.offset_save != stacksave)
1831 (PUBL(free))(new_recursive.offset_save);
1832 RRETURN(MATCH_NOMATCH);
1833 }
1834
1835 RECURSION_MATCHED:
1836 break;
1837
1838 /* An alternation is the end of a branch; scan along to find the end of the
1839 bracketed group and go to there. */
1840
1841 case OP_ALT:
1842 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1843 break;
1844
1845 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1846 indicating that it may occur zero times. It may repeat infinitely, or not
1847 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1848 with fixed upper repeat limits are compiled as a number of copies, with the
1849 optional ones preceded by BRAZERO or BRAMINZERO. */
1850
1851 case OP_BRAZERO:
1852 next = ecode + 1;
1853 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1854 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1855 do next += GET(next, 1); while (*next == OP_ALT);
1856 ecode = next + 1 + LINK_SIZE;
1857 break;
1858
1859 case OP_BRAMINZERO:
1860 next = ecode + 1;
1861 do next += GET(next, 1); while (*next == OP_ALT);
1862 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1864 ecode++;
1865 break;
1866
1867 case OP_SKIPZERO:
1868 next = ecode+1;
1869 do next += GET(next,1); while (*next == OP_ALT);
1870 ecode = next + 1 + LINK_SIZE;
1871 break;
1872
1873 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1874 here; just jump to the group, with allow_zero set TRUE. */
1875
1876 case OP_BRAPOSZERO:
1877 op = *(++ecode);
1878 allow_zero = TRUE;
1879 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1880 goto POSSESSIVE_NON_CAPTURE;
1881
1882 /* End of a group, repeated or non-repeating. */
1883
1884 case OP_KET:
1885 case OP_KETRMIN:
1886 case OP_KETRMAX:
1887 case OP_KETRPOS:
1888 prev = ecode - GET(ecode, 1);
1889
1890 /* If this was a group that remembered the subject start, in order to break
1891 infinite repeats of empty string matches, retrieve the subject start from
1892 the chain. Otherwise, set it NULL. */
1893
1894 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1895 {
1896 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1897 eptrb = eptrb->epb_prev; /* Backup to previous group */
1898 }
1899 else saved_eptr = NULL;
1900
1901 /* If we are at the end of an assertion group or a non-capturing atomic
1902 group, stop matching and return MATCH_MATCH, but record the current high
1903 water mark for use by positive assertions. We also need to record the match
1904 start in case it was changed by \K. */
1905
1906 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1907 *prev == OP_ONCE_NC)
1908 {
1909 md->end_match_ptr = eptr; /* For ONCE_NC */
1910 md->end_offset_top = offset_top;
1911 md->start_match_ptr = mstart;
1912 RRETURN(MATCH_MATCH); /* Sets md->mark */
1913 }
1914
1915 /* For capturing groups we have to check the group number back at the start
1916 and if necessary complete handling an extraction by setting the offsets and
1917 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1918 into group 0, so it won't be picked up here. Instead, we catch it when the
1919 OP_END is reached. Other recursion is handled here. We just have to record
1920 the current subject position and start match pointer and give a MATCH
1921 return. */
1922
1923 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1924 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1925 {
1926 number = GET2(prev, 1+LINK_SIZE);
1927 offset = number << 1;
1928
1929 #ifdef PCRE_DEBUG
1930 printf("end bracket %d", number);
1931 printf("\n");
1932 #endif
1933
1934 /* Handle a recursively called group. */
1935
1936 if (md->recursive != NULL && md->recursive->group_num == number)
1937 {
1938 md->end_match_ptr = eptr;
1939 md->start_match_ptr = mstart;
1940 RRETURN(MATCH_MATCH);
1941 }
1942
1943 /* Deal with capturing */
1944
1945 md->capture_last = number;
1946 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1947 {
1948 /* If offset is greater than offset_top, it means that we are
1949 "skipping" a capturing group, and that group's offsets must be marked
1950 unset. In earlier versions of PCRE, all the offsets were unset at the
1951 start of matching, but this doesn't work because atomic groups and
1952 assertions can cause a value to be set that should later be unset.
1953 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1954 part of the atomic group, but this is not on the final matching path,
1955 so must be unset when 2 is set. (If there is no group 2, there is no
1956 problem, because offset_top will then be 2, indicating no capture.) */
1957
1958 if (offset > offset_top)
1959 {
1960 register int *iptr = md->offset_vector + offset_top;
1961 register int *iend = md->offset_vector + offset;
1962 while (iptr < iend) *iptr++ = -1;
1963 }
1964
1965 /* Now make the extraction */
1966
1967 md->offset_vector[offset] =
1968 md->offset_vector[md->offset_end - number];
1969 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1970 if (offset_top <= offset) offset_top = offset + 2;
1971 }
1972 }
1973
1974 /* For an ordinary non-repeating ket, just continue at this level. This
1975 also happens for a repeating ket if no characters were matched in the
1976 group. This is the forcible breaking of infinite loops as implemented in
1977 Perl 5.005. For a non-repeating atomic group that includes captures,
1978 establish a backup point by processing the rest of the pattern at a lower
1979 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1980 original OP_ONCE level, thereby bypassing intermediate backup points, but
1981 resetting any captures that happened along the way. */
1982
1983 if (*ecode == OP_KET || eptr == saved_eptr)
1984 {
1985 if (*prev == OP_ONCE)
1986 {
1987 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1989 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1990 RRETURN(MATCH_ONCE);
1991 }
1992 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1993 break;
1994 }
1995
1996 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1997 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1998 at a time from the outer level, thus saving stack. */
1999
2000 if (*ecode == OP_KETRPOS)
2001 {
2002 md->end_match_ptr = eptr;
2003 md->end_offset_top = offset_top;
2004 RRETURN(MATCH_KETRPOS);
2005 }
2006
2007 /* The normal repeating kets try the rest of the pattern or restart from
2008 the preceding bracket, in the appropriate order. In the second case, we can
2009 use tail recursion to avoid using another stack frame, unless we have an
2010 atomic group or an unlimited repeat of a group that can match an empty
2011 string. */
2012
2013 if (*ecode == OP_KETRMIN)
2014 {
2015 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2017 if (*prev == OP_ONCE)
2018 {
2019 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2020 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2021 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2022 RRETURN(MATCH_ONCE);
2023 }
2024 if (*prev >= OP_SBRA) /* Could match an empty string */
2025 {
2026 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2027 RRETURN(rrc);
2028 }
2029 ecode = prev;
2030 goto TAIL_RECURSE;
2031 }
2032 else /* OP_KETRMAX */
2033 {
2034 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2035 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2036 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2037 if (*prev == OP_ONCE)
2038 {
2039 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2041 md->once_target = prev;
2042 RRETURN(MATCH_ONCE);
2043 }
2044 ecode += 1 + LINK_SIZE;
2045 goto TAIL_RECURSE;
2046 }
2047 /* Control never gets here */
2048
2049 /* Not multiline mode: start of subject assertion, unless notbol. */
2050
2051 case OP_CIRC:
2052 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2053
2054 /* Start of subject assertion */
2055
2056 case OP_SOD:
2057 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2058 ecode++;
2059 break;
2060
2061 /* Multiline mode: start of subject unless notbol, or after any newline. */
2062
2063 case OP_CIRCM:
2064 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2065 if (eptr != md->start_subject &&
2066 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2067 RRETURN(MATCH_NOMATCH);
2068 ecode++;
2069 break;
2070
2071 /* Start of match assertion */
2072
2073 case OP_SOM:
2074 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2075 ecode++;
2076 break;
2077
2078 /* Reset the start of match point */
2079
2080 case OP_SET_SOM:
2081 mstart = eptr;
2082 ecode++;
2083 break;
2084
2085 /* Multiline mode: assert before any newline, or before end of subject
2086 unless noteol is set. */
2087
2088 case OP_DOLLM:
2089 if (eptr < md->end_subject)
2090 {
2091 if (!IS_NEWLINE(eptr))
2092 {
2093 if (md->partial != 0 &&
2094 eptr + 1 >= md->end_subject &&
2095 NLBLOCK->nltype == NLTYPE_FIXED &&
2096 NLBLOCK->nllen == 2 &&
2097 *eptr == NLBLOCK->nl[0])
2098 {
2099 md->hitend = TRUE;
2100 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2101 }
2102 RRETURN(MATCH_NOMATCH);
2103 }
2104 }
2105 else
2106 {
2107 if (md->noteol) RRETURN(MATCH_NOMATCH);
2108 SCHECK_PARTIAL();
2109 }
2110 ecode++;
2111 break;
2112
2113 /* Not multiline mode: assert before a terminating newline or before end of
2114 subject unless noteol is set. */
2115
2116 case OP_DOLL:
2117 if (md->noteol) RRETURN(MATCH_NOMATCH);
2118 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2119
2120 /* ... else fall through for endonly */
2121
2122 /* End of subject assertion (\z) */
2123
2124 case OP_EOD:
2125 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2126 SCHECK_PARTIAL();
2127 ecode++;
2128 break;
2129
2130 /* End of subject or ending \n assertion (\Z) */
2131
2132 case OP_EODN:
2133 ASSERT_NL_OR_EOS:
2134 if (eptr < md->end_subject &&
2135 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2136 {
2137 if (md->partial != 0 &&
2138 eptr + 1 >= md->end_subject &&
2139 NLBLOCK->nltype == NLTYPE_FIXED &&
2140 NLBLOCK->nllen == 2 &&
2141 *eptr == NLBLOCK->nl[0])
2142 {
2143 md->hitend = TRUE;
2144 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2145 }
2146 RRETURN(MATCH_NOMATCH);
2147 }
2148
2149 /* Either at end of string or \n before end. */
2150
2151 SCHECK_PARTIAL();
2152 ecode++;
2153 break;
2154
2155 /* Word boundary assertions */
2156
2157 case OP_NOT_WORD_BOUNDARY:
2158 case OP_WORD_BOUNDARY:
2159 {
2160
2161 /* Find out if the previous and current characters are "word" characters.
2162 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2163 be "non-word" characters. Remember the earliest consulted character for
2164 partial matching. */
2165
2166 #ifdef SUPPORT_UTF
2167 if (utf)
2168 {
2169 /* Get status of previous character */
2170
2171 if (eptr == md->start_subject) prev_is_word = FALSE; else
2172 {
2173 PCRE_PUCHAR lastptr = eptr - 1;
2174 BACKCHAR(lastptr);
2175 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2176 GETCHAR(c, lastptr);
2177 #ifdef SUPPORT_UCP
2178 if (md->use_ucp)
2179 {
2180 if (c == '_') prev_is_word = TRUE; else
2181 {
2182 int cat = UCD_CATEGORY(c);
2183 prev_is_word = (cat == ucp_L || cat == ucp_N);
2184 }
2185 }
2186 else
2187 #endif
2188 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2189 }
2190
2191 /* Get status of next character */
2192
2193 if (eptr >= md->end_subject)
2194 {
2195 SCHECK_PARTIAL();
2196 cur_is_word = FALSE;
2197 }
2198 else
2199 {
2200 GETCHAR(c, eptr);
2201 #ifdef SUPPORT_UCP
2202 if (md->use_ucp)
2203 {
2204 if (c == '_') cur_is_word = TRUE; else
2205 {
2206 int cat = UCD_CATEGORY(c);
2207 cur_is_word = (cat == ucp_L || cat == ucp_N);
2208 }
2209 }
2210 else
2211 #endif
2212 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2213 }
2214 }
2215 else
2216 #endif
2217
2218 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2219 consistency with the behaviour of \w we do use it in this case. */
2220
2221 {
2222 /* Get status of previous character */
2223
2224 if (eptr == md->start_subject) prev_is_word = FALSE; else
2225 {
2226 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2227 #ifdef SUPPORT_UCP
2228 if (md->use_ucp)
2229 {
2230 c = eptr[-1];
2231 if (c == '_') prev_is_word = TRUE; else
2232 {
2233 int cat = UCD_CATEGORY(c);
2234 prev_is_word = (cat == ucp_L || cat == ucp_N);
2235 }
2236 }
2237 else
2238 #endif
2239 prev_is_word = MAX_255(eptr[-1])
2240 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2241 }
2242
2243 /* Get status of next character */
2244
2245 if (eptr >= md->end_subject)
2246 {
2247 SCHECK_PARTIAL();
2248 cur_is_word = FALSE;
2249 }
2250 else
2251 #ifdef SUPPORT_UCP
2252 if (md->use_ucp)
2253 {
2254 c = *eptr;
2255 if (c == '_') cur_is_word = TRUE; else
2256 {
2257 int cat = UCD_CATEGORY(c);
2258 cur_is_word = (cat == ucp_L || cat == ucp_N);
2259 }
2260 }
2261 else
2262 #endif
2263 cur_is_word = MAX_255(*eptr)
2264 && ((md->ctypes[*eptr] & ctype_word) != 0);
2265 }
2266
2267 /* Now see if the situation is what we want */
2268
2269 if ((*ecode++ == OP_WORD_BOUNDARY)?
2270 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2271 RRETURN(MATCH_NOMATCH);
2272 }
2273 break;
2274
2275 /* Match any single character type except newline; have to take care with
2276 CRLF newlines and partial matching. */
2277
2278 case OP_ANY:
2279 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2280 if (md->partial != 0 &&
2281 eptr + 1 >= md->end_subject &&
2282 NLBLOCK->nltype == NLTYPE_FIXED &&
2283 NLBLOCK->nllen == 2 &&
2284 *eptr == NLBLOCK->nl[0])
2285 {
2286 md->hitend = TRUE;
2287 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2288 }
2289
2290 /* Fall through */
2291
2292 /* Match any single character whatsoever. */
2293
2294 case OP_ALLANY:
2295 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2296 { /* not be updated before SCHECK_PARTIAL. */
2297 SCHECK_PARTIAL();
2298 RRETURN(MATCH_NOMATCH);
2299 }
2300 eptr++;
2301 #ifdef SUPPORT_UTF
2302 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2303 #endif
2304 ecode++;
2305 break;
2306
2307 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2308 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2309
2310 case OP_ANYBYTE:
2311 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2312 { /* not be updated before SCHECK_PARTIAL. */
2313 SCHECK_PARTIAL();
2314 RRETURN(MATCH_NOMATCH);
2315 }
2316 eptr++;
2317 ecode++;
2318 break;
2319
2320 case OP_NOT_DIGIT:
2321 if (eptr >= md->end_subject)
2322 {
2323 SCHECK_PARTIAL();
2324 RRETURN(MATCH_NOMATCH);
2325 }
2326 GETCHARINCTEST(c, eptr);
2327 if (
2328 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2329 c < 256 &&
2330 #endif
2331 (md->ctypes[c] & ctype_digit) != 0
2332 )
2333 RRETURN(MATCH_NOMATCH);
2334 ecode++;
2335 break;
2336
2337 case OP_DIGIT:
2338 if (eptr >= md->end_subject)
2339 {
2340 SCHECK_PARTIAL();
2341 RRETURN(MATCH_NOMATCH);
2342 }
2343 GETCHARINCTEST(c, eptr);
2344 if (
2345 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2346 c > 255 ||
2347 #endif
2348 (md->ctypes[c] & ctype_digit) == 0
2349 )
2350 RRETURN(MATCH_NOMATCH);
2351 ecode++;
2352 break;
2353
2354 case OP_NOT_WHITESPACE:
2355 if (eptr >= md->end_subject)
2356 {
2357 SCHECK_PARTIAL();
2358 RRETURN(MATCH_NOMATCH);
2359 }
2360 GETCHARINCTEST(c, eptr);
2361 if (
2362 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2363 c < 256 &&
2364 #endif
2365 (md->ctypes[c] & ctype_space) != 0
2366 )
2367 RRETURN(MATCH_NOMATCH);
2368 ecode++;
2369 break;
2370
2371 case OP_WHITESPACE:
2372 if (eptr >= md->end_subject)
2373 {
2374 SCHECK_PARTIAL();
2375 RRETURN(MATCH_NOMATCH);
2376 }
2377 GETCHARINCTEST(c, eptr);
2378 if (
2379 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2380 c > 255 ||
2381 #endif
2382 (md->ctypes[c] & ctype_space) == 0
2383 )
2384 RRETURN(MATCH_NOMATCH);
2385 ecode++;
2386 break;
2387
2388 case OP_NOT_WORDCHAR:
2389 if (eptr >= md->end_subject)
2390 {
2391 SCHECK_PARTIAL();
2392 RRETURN(MATCH_NOMATCH);
2393 }
2394 GETCHARINCTEST(c, eptr);
2395 if (
2396 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2397 c < 256 &&
2398 #endif
2399 (md->ctypes[c] & ctype_word) != 0
2400 )
2401 RRETURN(MATCH_NOMATCH);
2402 ecode++;
2403 break;
2404
2405 case OP_WORDCHAR:
2406 if (eptr >= md->end_subject)
2407 {
2408 SCHECK_PARTIAL();
2409 RRETURN(MATCH_NOMATCH);
2410 }
2411 GETCHARINCTEST(c, eptr);
2412 if (
2413 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2414 c > 255 ||
2415 #endif
2416 (md->ctypes[c] & ctype_word) == 0
2417 )
2418 RRETURN(MATCH_NOMATCH);
2419 ecode++;
2420 break;
2421
2422 case OP_ANYNL:
2423 if (eptr >= md->end_subject)
2424 {
2425 SCHECK_PARTIAL();
2426 RRETURN(MATCH_NOMATCH);
2427 }
2428 GETCHARINCTEST(c, eptr);
2429 switch(c)
2430 {
2431 default: RRETURN(MATCH_NOMATCH);
2432
2433 case CHAR_CR:
2434 if (eptr >= md->end_subject)
2435 {
2436 SCHECK_PARTIAL();
2437 }
2438 else if (*eptr == CHAR_LF) eptr++;
2439 break;
2440
2441 case CHAR_LF:
2442 break;
2443
2444 case CHAR_VT:
2445 case CHAR_FF:
2446 case CHAR_NEL:
2447 #ifndef EBCDIC
2448 case 0x2028:
2449 case 0x2029:
2450 #endif /* Not EBCDIC */
2451 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2452 break;
2453 }
2454 ecode++;
2455 break;
2456
2457 case OP_NOT_HSPACE:
2458 if (eptr >= md->end_subject)
2459 {
2460 SCHECK_PARTIAL();
2461 RRETURN(MATCH_NOMATCH);
2462 }
2463 GETCHARINCTEST(c, eptr);
2464 switch(c)
2465 {
2466 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2467 default: break;
2468 }
2469 ecode++;
2470 break;
2471
2472 case OP_HSPACE:
2473 if (eptr >= md->end_subject)
2474 {
2475 SCHECK_PARTIAL();
2476 RRETURN(MATCH_NOMATCH);
2477 }
2478 GETCHARINCTEST(c, eptr);
2479 switch(c)
2480 {
2481 HSPACE_CASES: break; /* Byte and multibyte cases */
2482 default: RRETURN(MATCH_NOMATCH);
2483 }
2484 ecode++;
2485 break;
2486
2487 case OP_NOT_VSPACE:
2488 if (eptr >= md->end_subject)
2489 {
2490 SCHECK_PARTIAL();
2491 RRETURN(MATCH_NOMATCH);
2492 }
2493 GETCHARINCTEST(c, eptr);
2494 switch(c)
2495 {
2496 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2497 default: break;
2498 }
2499 ecode++;
2500 break;
2501
2502 case OP_VSPACE:
2503 if (eptr >= md->end_subject)
2504 {
2505 SCHECK_PARTIAL();
2506 RRETURN(MATCH_NOMATCH);
2507 }
2508 GETCHARINCTEST(c, eptr);
2509 switch(c)
2510 {
2511 VSPACE_CASES: break;
2512 default: RRETURN(MATCH_NOMATCH);
2513 }
2514 ecode++;
2515 break;
2516
2517 #ifdef SUPPORT_UCP
2518 /* Check the next character by Unicode property. We will get here only
2519 if the support is in the binary; otherwise a compile-time error occurs. */
2520
2521 case OP_PROP:
2522 case OP_NOTPROP:
2523 if (eptr >= md->end_subject)
2524 {
2525 SCHECK_PARTIAL();
2526 RRETURN(MATCH_NOMATCH);
2527 }
2528 GETCHARINCTEST(c, eptr);
2529 {
2530 const pcre_uint32 *cp;
2531 const ucd_record *prop = GET_UCD(c);
2532
2533 switch(ecode[1])
2534 {
2535 case PT_ANY:
2536 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2537 break;
2538
2539 case PT_LAMP:
2540 if ((prop->chartype == ucp_Lu ||
2541 prop->chartype == ucp_Ll ||
2542 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2543 RRETURN(MATCH_NOMATCH);
2544 break;
2545
2546 case PT_GC:
2547 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2548 RRETURN(MATCH_NOMATCH);
2549 break;
2550
2551 case PT_PC:
2552 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2553 RRETURN(MATCH_NOMATCH);
2554 break;
2555
2556 case PT_SC:
2557 if ((ecode[2] != prop->script) == (op == OP_PROP))
2558 RRETURN(MATCH_NOMATCH);
2559 break;
2560
2561 /* These are specials */
2562
2563 case PT_ALNUM:
2564 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2565 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2566 RRETURN(MATCH_NOMATCH);
2567 break;
2568
2569 case PT_SPACE: /* Perl space */
2570 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2571 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2572 == (op == OP_NOTPROP))
2573 RRETURN(MATCH_NOMATCH);
2574 break;
2575
2576 case PT_PXSPACE: /* POSIX space */
2577 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2578 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2579 c == CHAR_FF || c == CHAR_CR)
2580 == (op == OP_NOTPROP))
2581 RRETURN(MATCH_NOMATCH);
2582 break;
2583
2584 case PT_WORD:
2585 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2586 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2587 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2588 RRETURN(MATCH_NOMATCH);
2589 break;
2590
2591 case PT_CLIST:
2592 cp = PRIV(ucd_caseless_sets) + prop->caseset;
2593 for (;;)
2594 {
2595 if (c < *cp)
2596 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2597 if (c == *cp++)
2598 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2599 }
2600 break;
2601
2602 /* This should never occur */
2603
2604 default:
2605 RRETURN(PCRE_ERROR_INTERNAL);
2606 }
2607
2608 ecode += 3;
2609 }
2610 break;
2611
2612 /* Match an extended Unicode sequence. We will get here only if the support
2613 is in the binary; otherwise a compile-time error occurs. */
2614
2615 case OP_EXTUNI:
2616 if (eptr >= md->end_subject)
2617 {
2618 SCHECK_PARTIAL();
2619 RRETURN(MATCH_NOMATCH);
2620 }
2621 else
2622 {
2623 int lgb, rgb;
2624 GETCHARINCTEST(c, eptr);
2625 lgb = UCD_GRAPHBREAK(c);
2626 while (eptr < md->end_subject)
2627 {
2628 int len = 1;
2629 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2630 rgb = UCD_GRAPHBREAK(c);
2631 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2632 lgb = rgb;
2633 eptr += len;
2634 }
2635 }
2636 CHECK_PARTIAL();
2637 ecode++;
2638 break;
2639 #endif /* SUPPORT_UCP */
2640
2641
2642 /* Match a back reference, possibly repeatedly. Look past the end of the
2643 item to see if there is repeat information following. The code is similar
2644 to that for character classes, but repeated for efficiency. Then obey
2645 similar code to character type repeats - written out again for speed.
2646 However, if the referenced string is the empty string, always treat
2647 it as matched, any number of times (otherwise there could be infinite
2648 loops). */
2649
2650 case OP_REF:
2651 case OP_REFI:
2652 caseless = op == OP_REFI;
2653 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2654 ecode += 1 + IMM2_SIZE;
2655
2656 /* If the reference is unset, there are two possibilities:
2657
2658 (a) In the default, Perl-compatible state, set the length negative;
2659 this ensures that every attempt at a match fails. We can't just fail
2660 here, because of the possibility of quantifiers with zero minima.
2661
2662 (b) If the JavaScript compatibility flag is set, set the length to zero
2663 so that the back reference matches an empty string.
2664
2665 Otherwise, set the length to the length of what was matched by the
2666 referenced subpattern. */
2667
2668 if (offset >= offset_top || md->offset_vector[offset] < 0)
2669 length = (md->jscript_compat)? 0 : -1;
2670 else
2671 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2672
2673 /* Set up for repetition, or handle the non-repeated case */
2674
2675 switch (*ecode)
2676 {
2677 case OP_CRSTAR:
2678 case OP_CRMINSTAR:
2679 case OP_CRPLUS:
2680 case OP_CRMINPLUS:
2681 case OP_CRQUERY:
2682 case OP_CRMINQUERY:
2683 c = *ecode++ - OP_CRSTAR;
2684 minimize = (c & 1) != 0;
2685 min = rep_min[c]; /* Pick up values from tables; */
2686 max = rep_max[c]; /* zero for max => infinity */
2687 if (max == 0) max = INT_MAX;
2688 break;
2689
2690 case OP_CRRANGE:
2691 case OP_CRMINRANGE:
2692 minimize = (*ecode == OP_CRMINRANGE);
2693 min = GET2(ecode, 1);
2694 max = GET2(ecode, 1 + IMM2_SIZE);
2695 if (max == 0) max = INT_MAX;
2696 ecode += 1 + 2 * IMM2_SIZE;
2697 break;
2698
2699 default: /* No repeat follows */
2700 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2701 {
2702 if (length == -2) eptr = md->end_subject; /* Partial match */
2703 CHECK_PARTIAL();
2704 RRETURN(MATCH_NOMATCH);
2705 }
2706 eptr += length;
2707 continue; /* With the main loop */
2708 }
2709
2710 /* Handle repeated back references. If the length of the reference is
2711 zero, just continue with the main loop. If the length is negative, it
2712 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2713 zero, we can continue at the same level without recursion. For any other
2714 minimum, carrying on will result in NOMATCH. */
2715
2716 if (length == 0) continue;
2717 if (length < 0 && min == 0) continue;
2718
2719 /* First, ensure the minimum number of matches are present. We get back
2720 the length of the reference string explicitly rather than passing the
2721 address of eptr, so that eptr can be a register variable. */
2722
2723 for (i = 1; i <= min; i++)
2724 {
2725 int slength;
2726 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2727 {
2728 if (slength == -2) eptr = md->end_subject; /* Partial match */
2729 CHECK_PARTIAL();
2730 RRETURN(MATCH_NOMATCH);
2731 }
2732 eptr += slength;
2733 }
2734
2735 /* If min = max, continue at the same level without recursion.
2736 They are not both allowed to be zero. */
2737
2738 if (min == max) continue;
2739
2740 /* If minimizing, keep trying and advancing the pointer */
2741
2742 if (minimize)
2743 {
2744 for (fi = min;; fi++)
2745 {
2746 int slength;
2747 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2748 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2749 if (fi >= max) RRETURN(MATCH_NOMATCH);
2750 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2751 {
2752 if (slength == -2) eptr = md->end_subject; /* Partial match */
2753 CHECK_PARTIAL();
2754 RRETURN(MATCH_NOMATCH);
2755 }
2756 eptr += slength;
2757 }
2758 /* Control never gets here */
2759 }
2760
2761 /* If maximizing, find the longest string and work backwards */
2762
2763 else
2764 {
2765 pp = eptr;
2766 for (i = min; i < max; i++)
2767 {
2768 int slength;
2769 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2770 {
2771 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2772 the soft partial matching case. */
2773
2774 if (slength == -2 && md->partial != 0 &&
2775 md->end_subject > md->start_used_ptr)
2776 {
2777 md->hitend = TRUE;
2778 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2779 }
2780 break;
2781 }
2782 eptr += slength;
2783 }
2784
2785 while (eptr >= pp)
2786 {
2787 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2788 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2789 eptr -= length;
2790 }
2791 RRETURN(MATCH_NOMATCH);
2792 }
2793 /* Control never gets here */
2794
2795 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2796 used when all the characters in the class have values in the range 0-255,
2797 and either the matching is caseful, or the characters are in the range
2798 0-127 when UTF-8 processing is enabled. The only difference between
2799 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2800 encountered.
2801
2802 First, look past the end of the item to see if there is repeat information
2803 following. Then obey similar code to character type repeats - written out
2804 again for speed. */
2805
2806 case OP_NCLASS:
2807 case OP_CLASS:
2808 {
2809 /* The data variable is saved across frames, so the byte map needs to
2810 be stored there. */
2811 #define BYTE_MAP ((pcre_uint8 *)data)
2812 data = ecode + 1; /* Save for matching */
2813 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2814
2815 switch (*ecode)
2816 {
2817 case OP_CRSTAR:
2818 case OP_CRMINSTAR:
2819 case OP_CRPLUS:
2820 case OP_CRMINPLUS:
2821 case OP_CRQUERY:
2822 case OP_CRMINQUERY:
2823 c = *ecode++ - OP_CRSTAR;
2824 minimize = (c & 1) != 0;
2825 min = rep_min[c]; /* Pick up values from tables; */
2826 max = rep_max[c]; /* zero for max => infinity */
2827 if (max == 0) max = INT_MAX;
2828 break;
2829
2830 case OP_CRRANGE:
2831 case OP_CRMINRANGE:
2832 minimize = (*ecode == OP_CRMINRANGE);
2833 min = GET2(ecode, 1);
2834 max = GET2(ecode, 1 + IMM2_SIZE);
2835 if (max == 0) max = INT_MAX;
2836 ecode += 1 + 2 * IMM2_SIZE;
2837 break;
2838
2839 default: /* No repeat follows */
2840 min = max = 1;
2841 break;
2842 }
2843
2844 /* First, ensure the minimum number of matches are present. */
2845
2846 #ifdef SUPPORT_UTF
2847 if (utf)
2848 {
2849 for (i = 1; i <= min; i++)
2850 {
2851 if (eptr >= md->end_subject)
2852 {
2853 SCHECK_PARTIAL();
2854 RRETURN(MATCH_NOMATCH);
2855 }
2856 GETCHARINC(c, eptr);
2857 if (c > 255)
2858 {
2859 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2860 }
2861 else
2862 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2863 }
2864 }
2865 else
2866 #endif
2867 /* Not UTF mode */
2868 {
2869 for (i = 1; i <= min; i++)
2870 {
2871 if (eptr >= md->end_subject)
2872 {
2873 SCHECK_PARTIAL();
2874 RRETURN(MATCH_NOMATCH);
2875 }
2876 c = *eptr++;
2877 #ifndef COMPILE_PCRE8
2878 if (c > 255)
2879 {
2880 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2881 }
2882 else
2883 #endif
2884 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2885 }
2886 }
2887
2888 /* If max == min we can continue with the main loop without the
2889 need to recurse. */
2890
2891 if (min == max) continue;
2892
2893 /* If minimizing, keep testing the rest of the expression and advancing
2894 the pointer while it matches the class. */
2895
2896 if (minimize)
2897 {
2898 #ifdef SUPPORT_UTF
2899 if (utf)
2900 {
2901 for (fi = min;; fi++)
2902 {
2903 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2905 if (fi >= max) RRETURN(MATCH_NOMATCH);
2906 if (eptr >= md->end_subject)
2907 {
2908 SCHECK_PARTIAL();
2909 RRETURN(MATCH_NOMATCH);
2910 }
2911 GETCHARINC(c, eptr);
2912 if (c > 255)
2913 {
2914 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2915 }
2916 else
2917 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2918 }
2919 }
2920 else
2921 #endif
2922 /* Not UTF mode */
2923 {
2924 for (fi = min;; fi++)
2925 {
2926 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2927 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2928 if (fi >= max) RRETURN(MATCH_NOMATCH);
2929 if (eptr >= md->end_subject)
2930 {
2931 SCHECK_PARTIAL();
2932 RRETURN(MATCH_NOMATCH);
2933 }
2934 c = *eptr++;
2935 #ifndef COMPILE_PCRE8
2936 if (c > 255)
2937 {
2938 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2939 }
2940 else
2941 #endif
2942 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2943 }
2944 }
2945 /* Control never gets here */
2946 }
2947
2948 /* If maximizing, find the longest possible run, then work backwards. */
2949
2950 else
2951 {
2952 pp = eptr;
2953
2954 #ifdef SUPPORT_UTF
2955 if (utf)
2956 {
2957 for (i = min; i < max; i++)
2958 {
2959 int len = 1;
2960 if (eptr >= md->end_subject)
2961 {
2962 SCHECK_PARTIAL();
2963 break;
2964 }
2965 GETCHARLEN(c, eptr, len);
2966 if (c > 255)
2967 {
2968 if (op == OP_CLASS) break;
2969 }
2970 else
2971 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2972 eptr += len;
2973 }
2974 for (;;)
2975 {
2976 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2978 if (eptr-- == pp) break; /* Stop if tried at original pos */
2979 BACKCHAR(eptr);
2980 }
2981 }
2982 else
2983 #endif
2984 /* Not UTF mode */
2985 {
2986 for (i = min; i < max; i++)
2987 {
2988 if (eptr >= md->end_subject)
2989 {
2990 SCHECK_PARTIAL();
2991 break;
2992 }
2993 c = *eptr;
2994 #ifndef COMPILE_PCRE8
2995 if (c > 255)
2996 {
2997 if (op == OP_CLASS) break;
2998 }
2999 else
3000 #endif
3001 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3002 eptr++;
3003 }
3004 while (eptr >= pp)
3005 {
3006 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3008 eptr--;
3009 }
3010 }
3011
3012 RRETURN(MATCH_NOMATCH);
3013 }
3014 #undef BYTE_MAP
3015 }
3016 /* Control never gets here */
3017
3018
3019 /* Match an extended character class. This opcode is encountered only
3020 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3021 mode, because Unicode properties are supported in non-UTF-8 mode. */
3022
3023 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3024 case OP_XCLASS:
3025 {
3026 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3027 ecode += GET(ecode, 1); /* Advance past the item */
3028
3029 switch (*ecode)
3030 {
3031 case OP_CRSTAR:
3032 case OP_CRMINSTAR:
3033 case OP_CRPLUS:
3034 case OP_CRMINPLUS:
3035 case OP_CRQUERY:
3036 case OP_CRMINQUERY:
3037 c = *ecode++ - OP_CRSTAR;
3038 minimize = (c & 1) != 0;
3039 min = rep_min[c]; /* Pick up values from tables; */
3040 max = rep_max[c]; /* zero for max => infinity */
3041 if (max == 0) max = INT_MAX;
3042 break;
3043
3044 case OP_CRRANGE:
3045 case OP_CRMINRANGE:
3046 minimize = (*ecode == OP_CRMINRANGE);
3047 min = GET2(ecode, 1);
3048 max = GET2(ecode, 1 + IMM2_SIZE);
3049 if (max == 0) max = INT_MAX;
3050 ecode += 1 + 2 * IMM2_SIZE;
3051 break;
3052
3053 default: /* No repeat follows */
3054 min = max = 1;
3055 break;
3056 }
3057
3058 /* First, ensure the minimum number of matches are present. */
3059
3060 for (i = 1; i <= min; i++)
3061 {
3062 if (eptr >= md->end_subject)
3063 {
3064 SCHECK_PARTIAL();
3065 RRETURN(MATCH_NOMATCH);
3066 }
3067 GETCHARINCTEST(c, eptr);
3068 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3069 }
3070
3071 /* If max == min we can continue with the main loop without the
3072 need to recurse. */
3073
3074 if (min == max) continue;
3075
3076 /* If minimizing, keep testing the rest of the expression and advancing
3077 the pointer while it matches the class. */
3078
3079 if (minimize)
3080 {
3081 for (fi = min;; fi++)
3082 {
3083 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3084 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3085 if (fi >= max) RRETURN(MATCH_NOMATCH);
3086 if (eptr >= md->end_subject)
3087 {
3088 SCHECK_PARTIAL();
3089 RRETURN(MATCH_NOMATCH);
3090 }
3091 GETCHARINCTEST(c, eptr);
3092 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3093 }
3094 /* Control never gets here */
3095 }
3096
3097 /* If maximizing, find the longest possible run, then work backwards. */
3098
3099 else
3100 {
3101 pp = eptr;
3102 for (i = min; i < max; i++)
3103 {
3104 int len = 1;
3105 if (eptr >= md->end_subject)
3106 {
3107 SCHECK_PARTIAL();
3108 break;
3109 }
3110 #ifdef SUPPORT_UTF
3111 GETCHARLENTEST(c, eptr, len);
3112 #else
3113 c = *eptr;
3114 #endif
3115 if (!PRIV(xclass)(c, data, utf)) break;
3116 eptr += len;
3117 }
3118 for(;;)
3119 {
3120 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3122 if (eptr-- == pp) break; /* Stop if tried at original pos */
3123 #ifdef SUPPORT_UTF
3124 if (utf) BACKCHAR(eptr);
3125 #endif
3126 }
3127 RRETURN(MATCH_NOMATCH);
3128 }
3129
3130 /* Control never gets here */
3131 }
3132 #endif /* End of XCLASS */
3133
3134 /* Match a single character, casefully */
3135
3136 case OP_CHAR:
3137 #ifdef SUPPORT_UTF
3138 if (utf)
3139 {
3140 length = 1;
3141 ecode++;
3142 GETCHARLEN(fc, ecode, length);
3143 if (length > md->end_subject - eptr)
3144 {
3145 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3146 RRETURN(MATCH_NOMATCH);
3147 }
3148 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3149 }
3150 else
3151 #endif
3152 /* Not UTF mode */
3153 {
3154 if (md->end_subject - eptr < 1)
3155 {
3156 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3157 RRETURN(MATCH_NOMATCH);
3158 }
3159 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3160 ecode += 2;
3161 }
3162 break;
3163
3164 /* Match a single character, caselessly. If we are at the end of the
3165 subject, give up immediately. */
3166
3167 case OP_CHARI:
3168 if (eptr >= md->end_subject)
3169 {
3170 SCHECK_PARTIAL();
3171 RRETURN(MATCH_NOMATCH);
3172 }
3173
3174 #ifdef SUPPORT_UTF
3175 if (utf)
3176 {
3177 length = 1;
3178 ecode++;
3179 GETCHARLEN(fc, ecode, length);
3180
3181 /* If the pattern character's value is < 128, we have only one byte, and
3182 we know that its other case must also be one byte long, so we can use the
3183 fast lookup table. We know that there is at least one byte left in the
3184 subject. */
3185
3186 if (fc < 128)
3187 {
3188 if (md->lcc[fc]
3189 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3190 ecode++;
3191 eptr++;
3192 }
3193
3194 /* Otherwise we must pick up the subject character. Note that we cannot
3195 use the value of "length" to check for sufficient bytes left, because the
3196 other case of the character may have more or fewer bytes. */
3197
3198 else
3199 {
3200 unsigned int dc;
3201 GETCHARINC(dc, eptr);
3202 ecode += length;
3203
3204 /* If we have Unicode property support, we can use it to test the other
3205 case of the character, if there is one. */
3206
3207 if (fc != dc)
3208 {
3209 #ifdef SUPPORT_UCP
3210 if (dc != UCD_OTHERCASE(fc))
3211 #endif
3212 RRETURN(MATCH_NOMATCH);
3213 }
3214 }
3215 }
3216 else
3217 #endif /* SUPPORT_UTF */
3218
3219 /* Not UTF mode */
3220 {
3221 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3222 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3223 eptr++;
3224 ecode += 2;
3225 }
3226 break;
3227
3228 /* Match a single character repeatedly. */
3229
3230 case OP_EXACT:
3231 case OP_EXACTI:
3232 min = max = GET2(ecode, 1);
3233 ecode += 1 + IMM2_SIZE;
3234 goto REPEATCHAR;
3235
3236 case OP_POSUPTO:
3237 case OP_POSUPTOI:
3238 possessive = TRUE;
3239 /* Fall through */
3240
3241 case OP_UPTO:
3242 case OP_UPTOI:
3243 case OP_MINUPTO:
3244 case OP_MINUPTOI:
3245 min = 0;
3246 max = GET2(ecode, 1);
3247 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3248 ecode += 1 + IMM2_SIZE;
3249 goto REPEATCHAR;
3250
3251 case OP_POSSTAR:
3252 case OP_POSSTARI:
3253 possessive = TRUE;
3254 min = 0;
3255 max = INT_MAX;
3256 ecode++;
3257 goto REPEATCHAR;
3258
3259 case OP_POSPLUS:
3260 case OP_POSPLUSI:
3261 possessive = TRUE;
3262 min = 1;
3263 max = INT_MAX;
3264 ecode++;
3265 goto REPEATCHAR;
3266
3267 case OP_POSQUERY:
3268 case OP_POSQUERYI:
3269 possessive = TRUE;
3270 min = 0;
3271 max = 1;
3272 ecode++;
3273 goto REPEATCHAR;
3274
3275 case OP_STAR:
3276 case OP_STARI:
3277 case OP_MINSTAR:
3278 case OP_MINSTARI:
3279 case OP_PLUS:
3280 case OP_PLUSI:
3281 case OP_MINPLUS:
3282 case OP_MINPLUSI:
3283 case OP_QUERY:
3284 case OP_QUERYI:
3285 case OP_MINQUERY:
3286 case OP_MINQUERYI:
3287 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3288 minimize = (c & 1) != 0;
3289 min = rep_min[c]; /* Pick up values from tables; */
3290 max = rep_max[c]; /* zero for max => infinity */
3291 if (max == 0) max = INT_MAX;
3292
3293 /* Common code for all repeated single-character matches. */
3294
3295 REPEATCHAR:
3296 #ifdef SUPPORT_UTF
3297 if (utf)
3298 {
3299 length = 1;
3300 charptr = ecode;
3301 GETCHARLEN(fc, ecode, length);
3302 ecode += length;
3303
3304 /* Handle multibyte character matching specially here. There is
3305 support for caseless matching if UCP support is present. */
3306
3307 if (length > 1)
3308 {
3309 #ifdef SUPPORT_UCP
3310 unsigned int othercase;
3311 if (op >= OP_STARI && /* Caseless */
3312 (othercase = UCD_OTHERCASE(fc)) != fc)
3313 oclength = PRIV(ord2utf)(othercase, occhars);
3314 else oclength = 0;
3315 #endif /* SUPPORT_UCP */
3316
3317 for (i = 1; i <= min; i++)
3318 {
3319 if (eptr <= md->end_subject - length &&
3320 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3321 #ifdef SUPPORT_UCP
3322 else if (oclength > 0 &&
3323 eptr <= md->end_subject - oclength &&
3324 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3325 #endif /* SUPPORT_UCP */
3326 else
3327 {
3328 CHECK_PARTIAL();
3329 RRETURN(MATCH_NOMATCH);
3330 }
3331 }
3332
3333 if (min == max) continue;
3334
3335 if (minimize)
3336 {
3337 for (fi = min;; fi++)
3338 {
3339 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3340 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3341 if (fi >= max) RRETURN(MATCH_NOMATCH);
3342 if (eptr <= md->end_subject - length &&
3343 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3344 #ifdef SUPPORT_UCP
3345 else if (oclength > 0 &&
3346 eptr <= md->end_subject - oclength &&
3347 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3348 #endif /* SUPPORT_UCP */
3349 else
3350 {
3351 CHECK_PARTIAL();
3352 RRETURN(MATCH_NOMATCH);
3353 }
3354 }
3355 /* Control never gets here */
3356 }
3357
3358 else /* Maximize */
3359 {
3360 pp = eptr;
3361 for (i = min; i < max; i++)
3362 {
3363 if (eptr <= md->end_subject - length &&
3364 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3365 #ifdef SUPPORT_UCP
3366 else if (oclength > 0 &&
3367 eptr <= md->end_subject - oclength &&
3368 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3369 #endif /* SUPPORT_UCP */
3370 else
3371 {
3372 CHECK_PARTIAL();
3373 break;
3374 }
3375 }
3376
3377 if (possessive) continue;
3378
3379 for(;;)
3380 {
3381 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3382 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3383 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3384 #ifdef SUPPORT_UCP
3385 eptr--;
3386 BACKCHAR(eptr);
3387 #else /* without SUPPORT_UCP */
3388 eptr -= length;
3389 #endif /* SUPPORT_UCP */
3390 }
3391 }
3392 /* Control never gets here */
3393 }
3394
3395 /* If the length of a UTF-8 character is 1, we fall through here, and
3396 obey the code as for non-UTF-8 characters below, though in this case the
3397 value of fc will always be < 128. */
3398 }
3399 else
3400 #endif /* SUPPORT_UTF */
3401 /* When not in UTF-8 mode, load a single-byte character. */
3402 fc = *ecode++;
3403
3404 /* The value of fc at this point is always one character, though we may
3405 or may not be in UTF mode. The code is duplicated for the caseless and
3406 caseful cases, for speed, since matching characters is likely to be quite
3407 common. First, ensure the minimum number of matches are present. If min =
3408 max, continue at the same level without recursing. Otherwise, if
3409 minimizing, keep trying the rest of the expression and advancing one
3410 matching character if failing, up to the maximum. Alternatively, if
3411 maximizing, find the maximum number of characters and work backwards. */
3412
3413 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3414 max, (char *)eptr));
3415
3416 if (op >= OP_STARI) /* Caseless */
3417 {
3418 #ifdef COMPILE_PCRE8
3419 /* fc must be < 128 if UTF is enabled. */
3420 foc = md->fcc[fc];
3421 #else
3422 #ifdef SUPPORT_UTF
3423 #ifdef SUPPORT_UCP
3424 if (utf && fc > 127)
3425 foc = UCD_OTHERCASE(fc);
3426 #else
3427 if (utf && fc > 127)
3428 foc = fc;
3429 #endif /* SUPPORT_UCP */
3430 else
3431 #endif /* SUPPORT_UTF */
3432 foc = TABLE_GET(fc, md->fcc, fc);
3433 #endif /* COMPILE_PCRE8 */
3434
3435 for (i = 1; i <= min; i++)
3436 {
3437 if (eptr >= md->end_subject)
3438 {
3439 SCHECK_PARTIAL();
3440 RRETURN(MATCH_NOMATCH);
3441 }
3442 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3443 eptr++;
3444 }
3445 if (min == max) continue;
3446 if (minimize)
3447 {
3448 for (fi = min;; fi++)
3449 {
3450 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3451 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3452 if (fi >= max) RRETURN(MATCH_NOMATCH);
3453 if (eptr >= md->end_subject)
3454 {
3455 SCHECK_PARTIAL();
3456 RRETURN(MATCH_NOMATCH);
3457 }
3458 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3459 eptr++;
3460 }
3461 /* Control never gets here */
3462 }
3463 else /* Maximize */
3464 {
3465 pp = eptr;
3466 for (i = min; i < max; i++)
3467 {
3468 if (eptr >= md->end_subject)
3469 {
3470 SCHECK_PARTIAL();
3471 break;
3472 }
3473 if (fc != *eptr && foc != *eptr) break;
3474 eptr++;
3475 }
3476
3477 if (possessive) continue;
3478
3479 while (eptr >= pp)
3480 {
3481 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3482 eptr--;
3483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3484 }
3485 RRETURN(MATCH_NOMATCH);
3486 }
3487 /* Control never gets here */
3488 }
3489
3490 /* Caseful comparisons (includes all multi-byte characters) */
3491
3492 else
3493 {
3494 for (i = 1; i <= min; i++)
3495 {
3496 if (eptr >= md->end_subject)
3497 {
3498 SCHECK_PARTIAL();
3499 RRETURN(MATCH_NOMATCH);
3500 }
3501 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3502 }
3503
3504 if (min == max) continue;
3505
3506 if (minimize)
3507 {
3508 for (fi = min;; fi++)
3509 {
3510 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3511 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3512 if (fi >= max) RRETURN(MATCH_NOMATCH);
3513 if (eptr >= md->end_subject)
3514 {
3515 SCHECK_PARTIAL();
3516 RRETURN(MATCH_NOMATCH);
3517 }
3518 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3519 }
3520 /* Control never gets here */
3521 }
3522 else /* Maximize */
3523 {
3524 pp = eptr;
3525 for (i = min; i < max; i++)
3526 {
3527 if (eptr >= md->end_subject)
3528 {
3529 SCHECK_PARTIAL();
3530 break;
3531 }
3532 if (fc != *eptr) break;
3533 eptr++;
3534 }
3535 if (possessive) continue;
3536
3537 while (eptr >= pp)
3538 {
3539 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3540 eptr--;
3541 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3542 }
3543 RRETURN(MATCH_NOMATCH);
3544 }
3545 }
3546 /* Control never gets here */
3547
3548 /* Match a negated single one-byte character. The character we are
3549 checking can be multibyte. */
3550
3551 case OP_NOT:
3552 case OP_NOTI:
3553 if (eptr >= md->end_subject)
3554 {
3555 SCHECK_PARTIAL();
3556 RRETURN(MATCH_NOMATCH);
3557 }
3558 #ifdef SUPPORT_UTF
3559 if (utf)
3560 {
3561 register unsigned int ch, och;
3562
3563 ecode++;
3564 GETCHARINC(ch, ecode);
3565 GETCHARINC(c, eptr);
3566
3567 if (op == OP_NOT)
3568 {
3569 if (ch == c) RRETURN(MATCH_NOMATCH);
3570 }
3571 else
3572 {
3573 #ifdef SUPPORT_UCP
3574 if (ch > 127)
3575 och = UCD_OTHERCASE(ch);
3576 #else
3577 if (ch > 127)
3578 och = ch;
3579 #endif /* SUPPORT_UCP */
3580 else
3581 och = TABLE_GET(ch, md->fcc, ch);
3582 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3583 }
3584 }
3585 else
3586 #endif
3587 {
3588 register unsigned int ch = ecode[1];
3589 c = *eptr++;
3590 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3591 RRETURN(MATCH_NOMATCH);
3592 ecode += 2;
3593 }
3594 break;
3595
3596 /* Match a negated single one-byte character repeatedly. This is almost a
3597 repeat of the code for a repeated single character, but I haven't found a
3598 nice way of commoning these up that doesn't require a test of the
3599 positive/negative option for each character match. Maybe that wouldn't add
3600 very much to the time taken, but character matching *is* what this is all
3601 about... */
3602
3603 case OP_NOTEXACT:
3604 case OP_NOTEXACTI:
3605 min = max = GET2(ecode, 1);
3606 ecode += 1 + IMM2_SIZE;
3607 goto REPEATNOTCHAR;
3608
3609 case OP_NOTUPTO:
3610 case OP_NOTUPTOI:
3611 case OP_NOTMINUPTO:
3612 case OP_NOTMINUPTOI:
3613 min = 0;
3614 max = GET2(ecode, 1);
3615 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3616 ecode += 1 + IMM2_SIZE;
3617 goto REPEATNOTCHAR;
3618
3619 case OP_NOTPOSSTAR:
3620 case OP_NOTPOSSTARI:
3621 possessive = TRUE;
3622 min = 0;
3623 max = INT_MAX;
3624 ecode++;
3625 goto REPEATNOTCHAR;
3626
3627 case OP_NOTPOSPLUS:
3628 case OP_NOTPOSPLUSI:
3629 possessive = TRUE;
3630 min = 1;
3631 max = INT_MAX;
3632 ecode++;
3633 goto REPEATNOTCHAR;
3634
3635 case OP_NOTPOSQUERY:
3636 case OP_NOTPOSQUERYI:
3637 possessive = TRUE;
3638 min = 0;
3639 max = 1;
3640 ecode++;
3641 goto REPEATNOTCHAR;
3642
3643 case OP_NOTPOSUPTO:
3644 case OP_NOTPOSUPTOI:
3645 possessive = TRUE;
3646 min = 0;
3647 max = GET2(ecode, 1);
3648 ecode += 1 + IMM2_SIZE;
3649 goto REPEATNOTCHAR;
3650
3651 case OP_NOTSTAR:
3652 case OP_NOTSTARI:
3653 case OP_NOTMINSTAR:
3654 case OP_NOTMINSTARI:
3655 case OP_NOTPLUS:
3656 case OP_NOTPLUSI:
3657 case OP_NOTMINPLUS:
3658 case OP_NOTMINPLUSI:
3659 case OP_NOTQUERY:
3660 case OP_NOTQUERYI:
3661 case OP_NOTMINQUERY:
3662 case OP_NOTMINQUERYI:
3663 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3664 minimize = (c & 1) != 0;
3665 min = rep_min[c]; /* Pick up values from tables; */
3666 max = rep_max[c]; /* zero for max => infinity */
3667 if (max == 0) max = INT_MAX;
3668
3669 /* Common code for all repeated single-byte matches. */
3670
3671 REPEATNOTCHAR:
3672 GETCHARINCTEST(fc, ecode);
3673
3674 /* The code is duplicated for the caseless and caseful cases, for speed,
3675 since matching characters is likely to be quite common. First, ensure the
3676 minimum number of matches are present. If min = max, continue at the same
3677 level without recursing. Otherwise, if minimizing, keep trying the rest of
3678 the expression and advancing one matching character if failing, up to the
3679 maximum. Alternatively, if maximizing, find the maximum number of
3680 characters and work backwards. */
3681
3682 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3683 max, (char *)eptr));
3684
3685 if (op >= OP_NOTSTARI) /* Caseless */
3686 {
3687 #ifdef SUPPORT_UTF
3688 #ifdef SUPPORT_UCP
3689 if (utf && fc > 127)
3690 foc = UCD_OTHERCASE(fc);
3691 #else
3692 if (utf && fc > 127)
3693 foc = fc;
3694 #endif /* SUPPORT_UCP */
3695 else
3696 #endif /* SUPPORT_UTF */
3697 foc = TABLE_GET(fc, md->fcc, fc);
3698
3699 #ifdef SUPPORT_UTF
3700 if (utf)
3701 {
3702 register unsigned int d;
3703 for (i = 1; i <= min; i++)
3704 {
3705 if (eptr >= md->end_subject)
3706 {
3707 SCHECK_PARTIAL();
3708 RRETURN(MATCH_NOMATCH);
3709 }
3710 GETCHARINC(d, eptr);
3711 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3712 }
3713 }
3714 else
3715 #endif
3716 /* Not UTF mode */
3717 {
3718 for (i = 1; i <= min; i++)
3719 {
3720 if (eptr >= md->end_subject)
3721 {
3722 SCHECK_PARTIAL();
3723 RRETURN(MATCH_NOMATCH);
3724 }
3725 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3726 eptr++;
3727 }
3728 }
3729
3730 if (min == max) continue;
3731
3732 if (minimize)
3733 {
3734 #ifdef SUPPORT_UTF
3735 if (utf)
3736 {
3737 register unsigned int d;
3738 for (fi = min;; fi++)
3739 {
3740 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3741 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3742 if (fi >= max) RRETURN(MATCH_NOMATCH);
3743 if (eptr >= md->end_subject)
3744 {
3745 SCHECK_PARTIAL();
3746 RRETURN(MATCH_NOMATCH);
3747 }
3748 GETCHARINC(d, eptr);
3749 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3750 }
3751 }
3752 else
3753 #endif
3754 /* Not UTF mode */
3755 {
3756 for (fi = min;; fi++)
3757 {
3758 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3760 if (fi >= max) RRETURN(MATCH_NOMATCH);
3761 if (eptr >= md->end_subject)
3762 {
3763 SCHECK_PARTIAL();
3764 RRETURN(MATCH_NOMATCH);
3765 }
3766 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3767 eptr++;
3768 }
3769 }
3770 /* Control never gets here */
3771 }
3772
3773 /* Maximize case */
3774
3775 else
3776 {
3777 pp = eptr;
3778
3779 #ifdef SUPPORT_UTF
3780 if (utf)
3781 {
3782 register unsigned int d;
3783 for (i = min; i < max; i++)
3784 {
3785 int len = 1;
3786 if (eptr >= md->end_subject)
3787 {
3788 SCHECK_PARTIAL();
3789 break;
3790 }
3791 GETCHARLEN(d, eptr, len);
3792 if (fc == d || (unsigned int)foc == d) break;
3793 eptr += len;
3794 }
3795 if (possessive) continue;
3796 for(;;)
3797 {
3798 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3800 if (eptr-- == pp) break; /* Stop if tried at original pos */
3801 BACKCHAR(eptr);
3802 }
3803 }
3804 else
3805 #endif
3806 /* Not UTF mode */
3807 {
3808 for (i = min; i < max; i++)
3809 {
3810 if (eptr >= md->end_subject)
3811 {
3812 SCHECK_PARTIAL();
3813 break;
3814 }
3815 if (fc == *eptr || foc == *eptr) break;
3816 eptr++;
3817 }
3818 if (possessive) continue;
3819 while (eptr >= pp)
3820 {
3821 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3823 eptr--;
3824 }
3825 }
3826
3827 RRETURN(MATCH_NOMATCH);
3828 }
3829 /* Control never gets here */
3830 }
3831
3832 /* Caseful comparisons */
3833
3834 else
3835 {
3836 #ifdef SUPPORT_UTF
3837 if (utf)
3838 {
3839 register unsigned int d;
3840 for (i = 1; i <= min; i++)
3841 {
3842 if (eptr >= md->end_subject)
3843 {
3844 SCHECK_PARTIAL();
3845 RRETURN(MATCH_NOMATCH);
3846 }
3847 GETCHARINC(d, eptr);
3848 if (fc == d) RRETURN(MATCH_NOMATCH);
3849 }
3850 }
3851 else
3852 #endif
3853 /* Not UTF mode */
3854 {
3855 for (i = 1; i <= min; i++)
3856 {
3857 if (eptr >= md->end_subject)
3858 {
3859 SCHECK_PARTIAL();
3860 RRETURN(MATCH_NOMATCH);
3861 }
3862 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3863 }
3864 }
3865
3866 if (min == max) continue;
3867
3868 if (minimize)
3869 {
3870 #ifdef SUPPORT_UTF
3871 if (utf)
3872 {
3873 register unsigned int d;
3874 for (fi = min;; fi++)
3875 {
3876 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3878 if (fi >= max) RRETURN(MATCH_NOMATCH);
3879 if (eptr >= md->end_subject)
3880 {
3881 SCHECK_PARTIAL();
3882 RRETURN(MATCH_NOMATCH);
3883 }
3884 GETCHARINC(d, eptr);
3885 if (fc == d) RRETURN(MATCH_NOMATCH);
3886 }
3887 }
3888 else
3889 #endif
3890 /* Not UTF mode */
3891 {
3892 for (fi = min;; fi++)
3893 {
3894 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3896 if (fi >= max) RRETURN(MATCH_NOMATCH);
3897 if (eptr >= md->end_subject)
3898 {
3899 SCHECK_PARTIAL();
3900 RRETURN(MATCH_NOMATCH);
3901 }
3902 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3903 }
3904 }
3905 /* Control never gets here */
3906 }
3907
3908 /* Maximize case */
3909
3910 else
3911 {
3912 pp = eptr;
3913
3914 #ifdef SUPPORT_UTF
3915 if (utf)
3916 {
3917 register unsigned int d;
3918 for (i = min; i < max; i++)
3919 {
3920 int len = 1;
3921 if (eptr >= md->end_subject)
3922 {
3923 SCHECK_PARTIAL();
3924 break;
3925 }
3926 GETCHARLEN(d, eptr, len);
3927 if (fc == d) break;
3928 eptr += len;
3929 }
3930 if (possessive) continue;
3931 for(;;)
3932 {
3933 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3935 if (eptr-- == pp) break; /* Stop if tried at original pos */
3936 BACKCHAR(eptr);
3937 }
3938 }
3939 else
3940 #endif
3941 /* Not UTF mode */
3942 {
3943 for (i = min; i < max; i++)
3944 {
3945 if (eptr >= md->end_subject)
3946 {
3947 SCHECK_PARTIAL();
3948 break;
3949 }
3950 if (fc == *eptr) break;
3951 eptr++;
3952 }
3953 if (possessive) continue;
3954 while (eptr >= pp)
3955 {
3956 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3958 eptr--;
3959 }
3960 }
3961
3962 RRETURN(MATCH_NOMATCH);
3963 }
3964 }
3965 /* Control never gets here */
3966
3967 /* Match a single character type repeatedly; several different opcodes
3968 share code. This is very similar to the code for single characters, but we
3969 repeat it in the interests of efficiency. */
3970
3971 case OP_TYPEEXACT:
3972 min = max = GET2(ecode, 1);
3973 minimize = TRUE;
3974 ecode += 1 + IMM2_SIZE;
3975 goto REPEATTYPE;
3976
3977 case OP_TYPEUPTO:
3978 case OP_TYPEMINUPTO:
3979 min = 0;
3980 max = GET2(ecode, 1);
3981 minimize = *ecode == OP_TYPEMINUPTO;
3982 ecode += 1 + IMM2_SIZE;
3983 goto REPEATTYPE;
3984
3985 case OP_TYPEPOSSTAR:
3986 possessive = TRUE;
3987 min = 0;
3988 max = INT_MAX;
3989 ecode++;
3990 goto REPEATTYPE;
3991
3992 case OP_TYPEPOSPLUS:
3993 possessive = TRUE;
3994 min = 1;
3995 max = INT_MAX;
3996 ecode++;
3997 goto REPEATTYPE;
3998
3999 case OP_TYPEPOSQUERY:
4000 possessive = TRUE;
4001 min = 0;
4002 max = 1;
4003 ecode++;
4004 goto REPEATTYPE;
4005
4006 case OP_TYPEPOSUPTO:
4007 possessive = TRUE;
4008 min = 0;
4009 max = GET2(ecode, 1);
4010 ecode += 1 + IMM2_SIZE;
4011 goto REPEATTYPE;
4012
4013 case OP_TYPESTAR:
4014 case OP_TYPEMINSTAR:
4015 case OP_TYPEPLUS:
4016 case OP_TYPEMINPLUS:
4017 case OP_TYPEQUERY:
4018 case OP_TYPEMINQUERY:
4019 c = *ecode++ - OP_TYPESTAR;
4020 minimize = (c & 1) != 0;
4021 min = rep_min[c]; /* Pick up values from tables; */
4022 max = rep_max[c]; /* zero for max => infinity */
4023 if (max == 0) max = INT_MAX;
4024
4025 /* Common code for all repeated single character type matches. Note that
4026 in UTF-8 mode, '.' matches a character of any length, but for the other
4027 character types, the valid characters are all one-byte long. */
4028
4029 REPEATTYPE:
4030 ctype = *ecode++; /* Code for the character type */
4031
4032 #ifdef SUPPORT_UCP
4033 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4034 {
4035 prop_fail_result = ctype == OP_NOTPROP;
4036 prop_type = *ecode++;
4037 prop_value = *ecode++;
4038 }
4039 else prop_type = -1;
4040 #endif
4041
4042 /* First, ensure the minimum number of matches are present. Use inline
4043 code for maximizing the speed, and do the type test once at the start
4044 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4045 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4046 and single-bytes. */
4047
4048 if (min > 0)
4049 {
4050 #ifdef SUPPORT_UCP
4051 if (prop_type >= 0)
4052 {
4053 switch(prop_type)
4054 {
4055 case PT_ANY:
4056 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4057 for (i = 1; i <= min; i++)
4058 {
4059 if (eptr >= md->end_subject)
4060 {
4061 SCHECK_PARTIAL();
4062 RRETURN(MATCH_NOMATCH);
4063 }
4064 GETCHARINCTEST(c, eptr);
4065 }
4066 break;
4067
4068 case PT_LAMP:
4069 for (i = 1; i <= min; i++)
4070 {
4071 int chartype;
4072 if (eptr >= md->end_subject)
4073 {
4074 SCHECK_PARTIAL();
4075 RRETURN(MATCH_NOMATCH);
4076 }
4077 GETCHARINCTEST(c, eptr);
4078 chartype = UCD_CHARTYPE(c);
4079 if ((chartype == ucp_Lu ||
4080 chartype == ucp_Ll ||
4081 chartype == ucp_Lt) == prop_fail_result)
4082 RRETURN(MATCH_NOMATCH);
4083 }
4084 break;
4085
4086 case PT_GC:
4087 for (i = 1; i <= min; i++)
4088 {
4089 if (eptr >= md->end_subject)
4090 {
4091 SCHECK_PARTIAL();
4092 RRETURN(MATCH_NOMATCH);
4093 }
4094 GETCHARINCTEST(c, eptr);
4095 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4096 RRETURN(MATCH_NOMATCH);
4097 }
4098 break;
4099
4100 case PT_PC:
4101 for (i = 1; i <= min; i++)
4102 {
4103 if (eptr >= md->end_subject)
4104 {
4105 SCHECK_PARTIAL();
4106 RRETURN(MATCH_NOMATCH);
4107 }
4108 GETCHARINCTEST(c, eptr);
4109 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4110 RRETURN(MATCH_NOMATCH);
4111 }
4112 break;
4113
4114 case PT_SC:
4115 for (i = 1; i <= min; i++)
4116 {
4117 if (eptr >= md->end_subject)
4118 {
4119 SCHECK_PARTIAL();
4120 RRETURN(MATCH_NOMATCH);
4121 }
4122 GETCHARINCTEST(c, eptr);
4123 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4124 RRETURN(MATCH_NOMATCH);
4125 }
4126 break;
4127
4128 case PT_ALNUM:
4129 for (i = 1; i <= min; i++)
4130 {
4131 int category;
4132 if (eptr >= md->end_subject)
4133 {
4134 SCHECK_PARTIAL();
4135 RRETURN(MATCH_NOMATCH);
4136 }
4137 GETCHARINCTEST(c, eptr);
4138 category = UCD_CATEGORY(c);
4139 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4140 RRETURN(MATCH_NOMATCH);
4141 }
4142 break;
4143
4144 case PT_SPACE: /* Perl space */
4145 for (i = 1; i <= min; i++)
4146 {
4147 if (eptr >= md->end_subject)
4148 {
4149 SCHECK_PARTIAL();
4150 RRETURN(MATCH_NOMATCH);
4151 }
4152 GETCHARINCTEST(c, eptr);
4153 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4154 c == CHAR_FF || c == CHAR_CR)
4155 == prop_fail_result)
4156 RRETURN(MATCH_NOMATCH);
4157 }
4158 break;
4159
4160 case PT_PXSPACE: /* POSIX space */
4161 for (i = 1; i <= min; i++)
4162 {
4163 if (eptr >= md->end_subject)
4164 {
4165 SCHECK_PARTIAL();
4166 RRETURN(MATCH_NOMATCH);
4167 }
4168 GETCHARINCTEST(c, eptr);
4169 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4170 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4171 == prop_fail_result)
4172 RRETURN(MATCH_NOMATCH);
4173 }
4174 break;
4175
4176 case PT_WORD:
4177 for (i = 1; i <= min; i++)
4178 {
4179 int category;
4180 if (eptr >= md->end_subject)
4181 {
4182 SCHECK_PARTIAL();
4183 RRETURN(MATCH_NOMATCH);
4184 }
4185 GETCHARINCTEST(c, eptr);
4186 category = UCD_CATEGORY(c);
4187 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4188 == prop_fail_result)
4189 RRETURN(MATCH_NOMATCH);
4190 }
4191 break;
4192
4193 case PT_CLIST:
4194 for (i = 1; i <= min; i++)
4195 {
4196 const pcre_uint32 *cp;
4197 if (eptr >= md->end_subject)
4198 {
4199 SCHECK_PARTIAL();
4200 RRETURN(MATCH_NOMATCH);
4201 }
4202 GETCHARINCTEST(c, eptr);
4203 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
4204 for (;;)
4205 {
4206 if (c < *cp)
4207 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4208 if (c == *cp++)
4209 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4210 }
4211 }
4212 break;
4213
4214 /* This should not occur */
4215
4216 default:
4217 RRETURN(PCRE_ERROR_INTERNAL);
4218 }
4219 }
4220
4221 /* Match extended Unicode sequences. We will get here only if the
4222 support is in the binary; otherwise a compile-time error occurs. */
4223
4224 else if (ctype == OP_EXTUNI)
4225 {
4226 for (i = 1; i <= min; i++)
4227 {
4228 if (eptr >= md->end_subject)
4229 {
4230 SCHECK_PARTIAL();
4231 RRETURN(MATCH_NOMATCH);
4232 }
4233 else
4234 {
4235 int lgb, rgb;
4236 GETCHARINCTEST(c, eptr);
4237 lgb = UCD_GRAPHBREAK(c);
4238 while (eptr < md->end_subject)
4239 {
4240 int len = 1;
4241 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4242 rgb = UCD_GRAPHBREAK(c);
4243 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4244 lgb = rgb;
4245 eptr += len;
4246 }
4247 }
4248 CHECK_PARTIAL();
4249 }
4250 }
4251
4252 else
4253 #endif /* SUPPORT_UCP */
4254
4255 /* Handle all other cases when the coding is UTF-8 */
4256
4257 #ifdef SUPPORT_UTF
4258 if (utf) switch(ctype)
4259 {
4260 case OP_ANY:
4261 for (i = 1; i <= min; i++)
4262 {
4263 if (eptr >= md->end_subject)
4264 {
4265 SCHECK_PARTIAL();
4266 RRETURN(MATCH_NOMATCH);
4267 }
4268 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4269 if (md->partial != 0 &&
4270 eptr + 1 >= md->end_subject &&
4271 NLBLOCK->nltype == NLTYPE_FIXED &&
4272 NLBLOCK->nllen == 2 &&
4273 *eptr == NLBLOCK->nl[0])
4274 {
4275 md->hitend = TRUE;
4276 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4277 }
4278 eptr++;
4279 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4280 }
4281 break;
4282
4283 case OP_ALLANY:
4284 for (i = 1; i <= min; i++)
4285 {
4286 if (eptr >= md->end_subject)
4287 {
4288 SCHECK_PARTIAL();
4289 RRETURN(MATCH_NOMATCH);
4290 }
4291 eptr++;
4292 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4293 }
4294 break;
4295
4296 case OP_ANYBYTE:
4297 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4298 eptr += min;
4299 break;
4300
4301 case OP_ANYNL:
4302 for (i = 1; i <= min; i++)
4303 {
4304 if (eptr >= md->end_subject)
4305 {
4306 SCHECK_PARTIAL();
4307 RRETURN(MATCH_NOMATCH);
4308 }
4309 GETCHARINC(c, eptr);
4310 switch(c)
4311 {
4312 default: RRETURN(MATCH_NOMATCH);
4313
4314 case CHAR_CR:
4315 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4316 break;
4317
4318 case CHAR_LF:
4319 break;
4320
4321 case CHAR_VT:
4322 case CHAR_FF:
4323 case CHAR_NEL:
4324 #ifndef EBCDIC
4325 case 0x2028:
4326 case 0x2029:
4327 #endif /* Not EBCDIC */
4328 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4329 break;
4330 }
4331 }
4332 break;
4333
4334 case OP_NOT_HSPACE:
4335 for (i = 1; i <= min; i++)
4336 {
4337 if (eptr >= md->end_subject)
4338 {
4339 SCHECK_PARTIAL();
4340 RRETURN(MATCH_NOMATCH);
4341 }
4342 GETCHARINC(c, eptr);
4343 switch(c)
4344 {
4345 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4346 default: break;
4347 }
4348 }
4349 break;
4350
4351 case OP_HSPACE:
4352 for (i = 1; i <= min; i++)
4353 {
4354 if (eptr >= md->end_subject)
4355 {
4356 SCHECK_PARTIAL();
4357 RRETURN(MATCH_NOMATCH);
4358 }
4359 GETCHARINC(c, eptr);
4360 switch(c)
4361 {
4362 HSPACE_CASES: break; /* Byte and multibyte cases */
4363 default: RRETURN(MATCH_NOMATCH);
4364 }
4365 }
4366 break;
4367
4368 case OP_NOT_VSPACE:
4369 for (i = 1; i <= min; i++)
4370 {
4371 if (eptr >= md->end_subject)
4372 {
4373 SCHECK_PARTIAL();
4374 RRETURN(MATCH_NOMATCH);
4375 }
4376 GETCHARINC(c, eptr);
4377 switch(c)
4378 {
4379 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4380 default: break;
4381 }
4382 }
4383 break;
4384
4385 case OP_VSPACE:
4386 for (i = 1; i <= min; i++)
4387 {
4388 if (eptr >= md->end_subject)
4389 {
4390 SCHECK_PARTIAL();
4391 RRETURN(MATCH_NOMATCH);
4392 }
4393 GETCHARINC(c, eptr);
4394 switch(c)
4395 {
4396 VSPACE_CASES: break;
4397 default: RRETURN(MATCH_NOMATCH);
4398 }
4399 }
4400 break;
4401
4402 case OP_NOT_DIGIT:
4403 for (i = 1; i <= min; i++)
4404 {
4405 if (eptr >= md->end_subject)
4406 {
4407 SCHECK_PARTIAL();
4408 RRETURN(MATCH_NOMATCH);
4409 }
4410 GETCHARINC(c, eptr);
4411 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4412 RRETURN(MATCH_NOMATCH);
4413 }
4414 break;
4415
4416 case OP_DIGIT:
4417 for (i = 1; i <= min; i++)
4418 {
4419 if (eptr >= md->end_subject)
4420 {
4421 SCHECK_PARTIAL();
4422 RRETURN(MATCH_NOMATCH);
4423 }
4424 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4425 RRETURN(MATCH_NOMATCH);
4426 eptr++;
4427 /* No need to skip more bytes - we know it's a 1-byte character */
4428 }
4429 break;
4430
4431 case OP_NOT_WHITESPACE:
4432 for (i = 1; i <= min; i++)
4433 {
4434 if (eptr >= md->end_subject)
4435 {
4436 SCHECK_PARTIAL();
4437 RRETURN(MATCH_NOMATCH);
4438 }
4439 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4440 RRETURN(MATCH_NOMATCH);
4441 eptr++;
4442 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4443 }
4444 break;
4445
4446 case OP_WHITESPACE:
4447 for (i = 1; i <= min; i++)
4448 {
4449 if (eptr >= md->end_subject)
4450 {
4451 SCHECK_PARTIAL();
4452 RRETURN(MATCH_NOMATCH);
4453 }
4454 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4455 RRETURN(MATCH_NOMATCH);
4456 eptr++;
4457 /* No need to skip more bytes - we know it's a 1-byte character */
4458 }
4459 break;
4460
4461 case OP_NOT_WORDCHAR:
4462 for (i = 1; i <= min; i++)
4463 {
4464 if (eptr >= md->end_subject)
4465 {
4466 SCHECK_PARTIAL();
4467 RRETURN(MATCH_NOMATCH);
4468 }
4469 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4470 RRETURN(MATCH_NOMATCH);
4471 eptr++;
4472 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4473 }
4474 break;
4475
4476 case OP_WORDCHAR:
4477 for (i = 1; i <= min; i++)
4478 {
4479 if (eptr >= md->end_subject)
4480 {
4481 SCHECK_PARTIAL();
4482 RRETURN(MATCH_NOMATCH);
4483 }
4484 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4485 RRETURN(MATCH_NOMATCH);
4486 eptr++;
4487 /* No need to skip more bytes - we know it's a 1-byte character */
4488 }
4489 break;
4490
4491 default:
4492 RRETURN(PCRE_ERROR_INTERNAL);
4493 } /* End switch(ctype) */
4494
4495 else
4496 #endif /* SUPPORT_UTF */
4497
4498 /* Code for the non-UTF-8 case for minimum matching of operators other
4499 than OP_PROP and OP_NOTPROP. */
4500
4501 switch(ctype)
4502 {
4503 case OP_ANY:
4504 for (i = 1; i <= min; i++)
4505 {
4506 if (eptr >= md->end_subject)
4507 {
4508 SCHECK_PARTIAL();
4509 RRETURN(MATCH_NOMATCH);
4510 }
4511 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4512 if (md->partial != 0 &&
4513 eptr + 1 >= md->end_subject &&
4514 NLBLOCK->nltype == NLTYPE_FIXED &&
4515 NLBLOCK->nllen == 2 &&
4516 *eptr == NLBLOCK->nl[0])
4517 {
4518 md->hitend = TRUE;
4519 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4520 }
4521 eptr++;
4522 }
4523 break;
4524
4525 case OP_ALLANY:
4526 if (eptr > md->end_subject - min)
4527 {
4528 SCHECK_PARTIAL();
4529 RRETURN(MATCH_NOMATCH);
4530 }
4531 eptr += min;
4532 break;
4533
4534 case OP_ANYBYTE:
4535 if (eptr > md->end_subject - min)
4536 {
4537 SCHECK_PARTIAL();
4538 RRETURN(MATCH_NOMATCH);
4539 }
4540 eptr += min;
4541 break;
4542
4543 case OP_ANYNL:
4544 for (i = 1; i <= min; i++)
4545 {
4546 if (eptr >= md->end_subject)
4547 {
4548 SCHECK_PARTIAL();
4549 RRETURN(MATCH_NOMATCH);
4550 }
4551 switch(*eptr++)
4552 {
4553 default: RRETURN(MATCH_NOMATCH);
4554
4555 case CHAR_CR:
4556 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4557 break;
4558
4559 case CHAR_LF:
4560 break;
4561
4562 case CHAR_VT:
4563 case CHAR_FF:
4564 case CHAR_NEL:
4565 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4566 case 0x2028:
4567 case 0x2029:
4568 #endif
4569 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4570 break;
4571 }
4572 }
4573 break;
4574
4575 case OP_NOT_HSPACE:
4576 for (i = 1; i <= min; i++)
4577 {
4578 if (eptr >= md->end_subject)
4579 {
4580 SCHECK_PARTIAL();
4581 RRETURN(MATCH_NOMATCH);
4582 }
4583 switch(*eptr++)
4584 {
4585 default: break;
4586 HSPACE_BYTE_CASES:
4587 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4588 HSPACE_MULTIBYTE_CASES:
4589 #endif
4590 RRETURN(MATCH_NOMATCH);
4591 }
4592 }
4593 break;
4594
4595 case OP_HSPACE:
4596 for (i = 1; i <= min; i++)
4597 {
4598 if (eptr >= md->end_subject)
4599 {
4600 SCHECK_PARTIAL();
4601 RRETURN(MATCH_NOMATCH);
4602 }
4603 switch(*eptr++)
4604 {
4605 default: RRETURN(MATCH_NOMATCH);
4606 HSPACE_BYTE_CASES:
4607 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4608 HSPACE_MULTIBYTE_CASES:
4609 #endif
4610 break;
4611 }
4612 }
4613 break;
4614
4615 case OP_NOT_VSPACE:
4616 for (i = 1; i <= min; i++)
4617 {
4618 if (eptr >= md->end_subject)
4619 {
4620 SCHECK_PARTIAL();
4621 RRETURN(MATCH_NOMATCH);
4622 }
4623 switch(*eptr++)
4624 {
4625 VSPACE_BYTE_CASES:
4626 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4627 VSPACE_MULTIBYTE_CASES:
4628 #endif
4629 RRETURN(MATCH_NOMATCH);
4630 default: break;
4631 }
4632 }
4633 break;
4634
4635 case OP_VSPACE:
4636 for (i = 1; i <= min; i++)
4637 {
4638 if (eptr >= md->end_subject)
4639 {
4640 SCHECK_PARTIAL();
4641 RRETURN(MATCH_NOMATCH);
4642 }
4643 switch(*eptr++)
4644 {
4645 default: RRETURN(MATCH_NOMATCH);
4646 VSPACE_BYTE_CASES:
4647 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4648 VSPACE_MULTIBYTE_CASES:
4649 #endif
4650 break;
4651 }
4652 }
4653 break;
4654
4655 case OP_NOT_DIGIT:
4656 for (i = 1; i <= min; i++)
4657 {
4658 if (eptr >= md->end_subject)
4659 {
4660 SCHECK_PARTIAL();
4661 RRETURN(MATCH_NOMATCH);
4662 }
4663 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4664 RRETURN(MATCH_NOMATCH);
4665 eptr++;
4666 }
4667 break;
4668
4669 case OP_DIGIT:
4670 for (i = 1; i <= min; i++)
4671 {
4672 if (eptr >= md->end_subject)
4673 {
4674 SCHECK_PARTIAL();
4675 RRETURN(MATCH_NOMATCH);
4676 }
4677 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4678 RRETURN(MATCH_NOMATCH);
4679 eptr++;
4680 }
4681 break;
4682
4683 case OP_NOT_WHITESPACE:
4684 for (i = 1; i <= min; i++)
4685 {
4686 if (eptr >= md->end_subject)
4687 {
4688 SCHECK_PARTIAL();
4689 RRETURN(MATCH_NOMATCH);
4690 }
4691 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4692 RRETURN(MATCH_NOMATCH);
4693 eptr++;
4694 }
4695 break;
4696
4697 case OP_WHITESPACE:
4698 for (i = 1; i <= min; i++)
4699 {
4700 if (eptr >= md->end_subject)
4701 {
4702 SCHECK_PARTIAL();
4703 RRETURN(MATCH_NOMATCH);
4704 }
4705 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4706 RRETURN(MATCH_NOMATCH);
4707 eptr++;
4708 }
4709 break;
4710
4711 case OP_NOT_WORDCHAR:
4712 for (i = 1; i <= min; i++)
4713 {
4714 if (eptr >= md->end_subject)
4715 {
4716 SCHECK_PARTIAL();
4717 RRETURN(MATCH_NOMATCH);
4718 }
4719 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4720 RRETURN(MATCH_NOMATCH);
4721 eptr++;
4722 }
4723 break;
4724
4725 case OP_WORDCHAR:
4726 for (i = 1; i <= min; i++)
4727 {
4728 if (eptr >= md->end_subject)
4729 {
4730 SCHECK_PARTIAL();
4731 RRETURN(MATCH_NOMATCH);
4732 }
4733 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4734 RRETURN(MATCH_NOMATCH);
4735 eptr++;
4736 }
4737 break;
4738
4739 default:
4740 RRETURN(PCRE_ERROR_INTERNAL);
4741 }
4742 }
4743
4744 /* If min = max, continue at the same level without recursing */
4745
4746 if (min == max) continue;
4747
4748 /* If minimizing, we have to test the rest of the pattern before each
4749 subsequent match. Again, separate the UTF-8 case for speed, and also
4750 separate the UCP cases. */
4751
4752 if (minimize)
4753 {
4754 #ifdef SUPPORT_UCP
4755 if (prop_type >= 0)
4756 {
4757 switch(prop_type)
4758 {
4759 case PT_ANY:
4760 for (fi = min;; fi++)
4761 {
4762 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4763 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4764 if (fi >= max) RRETURN(MATCH_NOMATCH);
4765 if (eptr >= md->end_subject)
4766 {
4767 SCHECK_PARTIAL();
4768 RRETURN(MATCH_NOMATCH);
4769 }
4770 GETCHARINCTEST(c, eptr);
4771 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4772 }
4773 /* Control never gets here */
4774
4775 case PT_LAMP:
4776 for (fi = min;; fi++)
4777 {
4778 int chartype;
4779 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4780 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4781 if (fi >= max) RRETURN(MATCH_NOMATCH);
4782 if (eptr >= md->end_subject)
4783 {
4784 SCHECK_PARTIAL();
4785 RRETURN(MATCH_NOMATCH);
4786 }
4787 GETCHARINCTEST(c, eptr);
4788 chartype = UCD_CHARTYPE(c);
4789 if ((chartype == ucp_Lu ||
4790 chartype == ucp_Ll ||
4791 chartype == ucp_Lt) == prop_fail_result)
4792 RRETURN(MATCH_NOMATCH);
4793 }
4794 /* Control never gets here */
4795
4796 case PT_GC:
4797 for (fi = min;; fi++)
4798 {
4799 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4800 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4801 if (fi >= max) RRETURN(MATCH_NOMATCH);
4802 if (eptr >= md->end_subject)
4803 {
4804 SCHECK_PARTIAL();
4805 RRETURN(MATCH_NOMATCH);
4806 }
4807 GETCHARINCTEST(c, eptr);
4808 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4809 RRETURN(MATCH_NOMATCH);
4810 }
4811 /* Control never gets here */
4812
4813 case PT_PC:
4814 for (fi = min;; fi++)
4815 {
4816 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4817 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4818 if (fi >= max) RRETURN(MATCH_NOMATCH);
4819 if (eptr >= md->end_subject)
4820 {
4821 SCHECK_PARTIAL();
4822 RRETURN(MATCH_NOMATCH);
4823 }
4824 GETCHARINCTEST(c, eptr);
4825 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4826 RRETURN(MATCH_NOMATCH);
4827 }
4828 /* Control never gets here */
4829
4830 case PT_SC:
4831 for (fi = min;; fi++)
4832 {
4833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4835 if (fi >= max) RRETURN(MATCH_NOMATCH);
4836 if (eptr >= md->end_subject)
4837 {
4838 SCHECK_PARTIAL();
4839 RRETURN(MATCH_NOMATCH);
4840 }
4841 GETCHARINCTEST(c, eptr);
4842 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4843 RRETURN(MATCH_NOMATCH);
4844 }
4845 /* Control never gets here */
4846
4847 case PT_ALNUM:
4848 for (fi = min;; fi++)
4849 {
4850 int category;
4851 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4852 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4853 if (fi >= max) RRETURN(MATCH_NOMATCH);
4854 if (eptr >= md->end_subject)
4855 {
4856 SCHECK_PARTIAL();
4857 RRETURN(MATCH_NOMATCH);
4858 }
4859 GETCHARINCTEST(c, eptr);
4860 category = UCD_CATEGORY(c);
4861 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4862 RRETURN(MATCH_NOMATCH);
4863 }
4864 /* Control never gets here */
4865
4866 case PT_SPACE: /* Perl space */
4867 for (fi = min;; fi++)
4868 {
4869 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4870 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4871 if (fi >= max) RRETURN(MATCH_NOMATCH);
4872 if (eptr >= md->end_subject)
4873 {
4874 SCHECK_PARTIAL();
4875 RRETURN(MATCH_NOMATCH);
4876 }
4877 GETCHARINCTEST(c, eptr);
4878 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4879 c == CHAR_FF || c == CHAR_CR)
4880 == prop_fail_result)
4881 RRETURN(MATCH_NOMATCH);
4882 }
4883 /* Control never gets here */
4884
4885 case PT_PXSPACE: /* POSIX space */
4886 for (fi = min;; fi++)
4887 {
4888 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4890 if (fi >= max) RRETURN(MATCH_NOMATCH);
4891 if (eptr >= md->end_subject)
4892 {
4893 SCHECK_PARTIAL();
4894 RRETURN(MATCH_NOMATCH);
4895 }
4896 GETCHARINCTEST(c, eptr);
4897 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4898 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4899 == prop_fail_result)
4900 RRETURN(MATCH_NOMATCH);
4901 }
4902 /* Control never gets here */
4903
4904 case PT_WORD:
4905 for (fi = min;; fi++)
4906 {
4907 int category;
4908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4910 if (fi >= max) RRETURN(MATCH_NOMATCH);
4911 if (eptr >= md->end_subject)
4912 {
4913 SCHECK_PARTIAL();
4914 RRETURN(MATCH_NOMATCH);
4915 }
4916 GETCHARINCTEST(c, eptr);
4917 category = UCD_CATEGORY(c);
4918 if ((category == ucp_L ||
4919 category == ucp_N ||
4920 c == CHAR_UNDERSCORE)
4921 == prop_fail_result)
4922 RRETURN(MATCH_NOMATCH);
4923 }
4924 /* Control never gets here */
4925
4926 case PT_CLIST:
4927 for (fi = min;; fi++)
4928 {
4929 const pcre_uint32 *cp;
4930 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4931 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4932 if (fi >= max) RRETURN(MATCH_NOMATCH);
4933 if (eptr >= md->end_subject)
4934 {
4935 SCHECK_PARTIAL();
4936 RRETURN(MATCH_NOMATCH);
4937 }
4938 GETCHARINCTEST(c, eptr);
4939 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
4940 for (;;)
4941 {
4942 if (c < *cp)
4943 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4944 if (c == *cp++)
4945 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4946 }
4947 }
4948 /* Control never gets here */
4949
4950 /* This should never occur */
4951 default:
4952 RRETURN(PCRE_ERROR_INTERNAL);
4953 }
4954 }
4955
4956 /* Match extended Unicode sequences. We will get here only if the
4957 support is in the binary; otherwise a compile-time error occurs. */
4958
4959 else if (ctype == OP_EXTUNI)
4960 {
4961 for (fi = min;; fi++)
4962 {
4963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4965 if (fi >= max) RRETURN(MATCH_NOMATCH);
4966 if (eptr >= md->end_subject)
4967 {
4968 SCHECK_PARTIAL();
4969 RRETURN(MATCH_NOMATCH);
4970 }
4971 else
4972 {
4973 int lgb, rgb;
4974 GETCHARINCTEST(c, eptr);
4975 lgb = UCD_GRAPHBREAK(c);
4976 while (eptr < md->end_subject)
4977 {
4978 int len = 1;
4979 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4980 rgb = UCD_GRAPHBREAK(c);
4981 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4982 lgb = rgb;
4983 eptr += len;
4984 }
4985 }
4986 CHECK_PARTIAL();
4987 }
4988 }
4989 else
4990 #endif /* SUPPORT_UCP */
4991
4992 #ifdef SUPPORT_UTF
4993 if (utf)
4994 {
4995 for (fi = min;; fi++)
4996 {
4997 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4999 if (fi >= max) RRETURN(MATCH_NOMATCH);
5000 if (eptr >= md->end_subject)
5001 {
5002 SCHECK_PARTIAL();
5003 RRETURN(MATCH_NOMATCH);
5004 }
5005 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5006 RRETURN(MATCH_NOMATCH);
5007 GETCHARINC(c, eptr);
5008 switch(ctype)
5009 {
5010 case OP_ANY: /* This is the non-NL case */
5011 if (md->partial != 0 && /* Take care with CRLF partial */
5012 eptr >= md->end_subject &&
5013 NLBLOCK->nltype == NLTYPE_FIXED &&
5014 NLBLOCK->nllen == 2 &&
5015 c == NLBLOCK->nl[0])
5016 {
5017 md->hitend = TRUE;
5018 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5019 }
5020 break;
5021
5022 case OP_ALLANY:
5023 case OP_ANYBYTE:
5024 break;
5025
5026 case OP_ANYNL:
5027 switch(c)
5028 {
5029 default: RRETURN(MATCH_NOMATCH);
5030 case CHAR_CR:
5031 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5032 break;
5033
5034 case CHAR_LF:
5035 break;
5036
5037 case CHAR_VT:
5038 case CHAR_FF:
5039 case CHAR_NEL:
5040 #ifndef EBCDIC
5041 case 0x2028:
5042 case 0x2029:
5043 #endif /* Not EBCDIC */
5044 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5045 break;
5046 }
5047 break;
5048
5049 case OP_NOT_HSPACE:
5050 switch(c)
5051 {
5052 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5053 default: break;
5054 }
5055 break;
5056
5057 case OP_HSPACE:
5058 switch(c)
5059 {
5060 HSPACE_CASES: break;
5061 default: RRETURN(MATCH_NOMATCH);
5062 }
5063 break;
5064
5065 case OP_NOT_VSPACE:
5066 switch(c)
5067 {
5068 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5069 default: break;
5070 }
5071 break;
5072
5073 case OP_VSPACE:
5074 switch(c)
5075 {
5076 VSPACE_CASES: break;
5077 default: RRETURN(MATCH_NOMATCH);
5078 }
5079 break;
5080
5081 case OP_NOT_DIGIT:
5082 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5083 RRETURN(MATCH_NOMATCH);
5084 break;
5085
5086 case OP_DIGIT:
5087 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5088 RRETURN(MATCH_NOMATCH);
5089 break;
5090
5091 case OP_NOT_WHITESPACE:
5092 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5093 RRETURN(MATCH_NOMATCH);
5094 break;
5095
5096 case OP_WHITESPACE:
5097 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5098 RRETURN(MATCH_NOMATCH);
5099 break;
5100
5101 case OP_NOT_WORDCHAR:
5102 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5103 RRETURN(MATCH_NOMATCH);
5104 break;
5105
5106 case OP_WORDCHAR:
5107 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5108 RRETURN(MATCH_NOMATCH);
5109 break;
5110
5111 default:
5112 RRETURN(PCRE_ERROR_INTERNAL);
5113 }
5114 }
5115 }
5116 else
5117 #endif
5118 /* Not UTF mode */
5119 {
5120 for (fi = min;; fi++)
5121 {
5122 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5123 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5124 if (fi >= max) RRETURN(MATCH_NOMATCH);
5125 if (eptr >= md->end_subject)
5126 {
5127 SCHECK_PARTIAL();
5128 RRETURN(MATCH_NOMATCH);
5129 }
5130 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5131 RRETURN(MATCH_NOMATCH);
5132 c = *eptr++;
5133 switch(ctype)
5134 {
5135 case OP_ANY: /* This is the non-NL case */
5136 if (md->partial != 0 && /* Take care with CRLF partial */
5137 eptr >= md->end_subject &&
5138 NLBLOCK->nltype == NLTYPE_FIXED &&
5139 NLBLOCK->nllen == 2 &&
5140 c == NLBLOCK->nl[0])
5141 {
5142 md->hitend = TRUE;
5143 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5144 }
5145 break;
5146
5147 case OP_ALLANY:
5148 case OP_ANYBYTE:
5149 break;
5150
5151 case OP_ANYNL:
5152 switch(c)
5153 {
5154 default: RRETURN(MATCH_NOMATCH);
5155 case CHAR_CR:
5156 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5157 break;
5158
5159 case CHAR_LF:
5160 break;
5161
5162 case CHAR_VT:
5163 case CHAR_FF:
5164 case CHAR_NEL:
5165 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5166 case 0x2028:
5167 case 0x2029:
5168 #endif
5169 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5170 break;
5171 }
5172 break;
5173
5174 case OP_NOT_HSPACE:
5175 switch(c)
5176 {
5177 default: break;
5178 HSPACE_BYTE_CASES:
5179 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5180 HSPACE_MULTIBYTE_CASES:
5181 #endif
5182 RRETURN(MATCH_NOMATCH);
5183 }
5184 break;
5185
5186 case OP_HSPACE:
5187 switch(c)
5188 {
5189 default: RRETURN(MATCH_NOMATCH);
5190 HSPACE_BYTE_CASES:
5191 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5192 HSPACE_MULTIBYTE_CASES:
5193 #endif
5194 break;
5195 }
5196 break;
5197
5198 case OP_NOT_VSPACE:
5199 switch(c)
5200 {
5201 default: break;
5202 VSPACE_BYTE_CASES:
5203 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5204 VSPACE_MULTIBYTE_CASES:
5205 #endif
5206 RRETURN(MATCH_NOMATCH);
5207 }
5208 break;
5209
5210 case OP_VSPACE:
5211 switch(c)
5212 {
5213 default: RRETURN(MATCH_NOMATCH);
5214 VSPACE_BYTE_CASES:
5215 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5216 VSPACE_MULTIBYTE_CASES:
5217 #endif
5218 break;
5219 }
5220 break;
5221
5222 case OP_NOT_DIGIT:
5223 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5224 break;
5225
5226 case OP_DIGIT:
5227 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5228 break;
5229
5230 case OP_NOT_WHITESPACE:
5231 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5232 break;
5233
5234 case OP_WHITESPACE:
5235 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5236 break;
5237
5238 case OP_NOT_WORDCHAR:
5239 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5240 break;
5241
5242 case OP_WORDCHAR:
5243 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5244 break;
5245
5246 default:
5247 RRETURN(PCRE_ERROR_INTERNAL);
5248 }
5249 }
5250 }
5251 /* Control never gets here */
5252 }
5253
5254 /* If maximizing, it is worth using inline code for speed, doing the type
5255 test once at the start (i.e. keep it out of the loop). Again, keep the
5256 UTF-8 and UCP stuff separate. */
5257
5258 else
5259 {
5260 pp = eptr; /* Remember where we started */
5261
5262 #ifdef SUPPORT_UCP
5263 if (prop_type >= 0)
5264 {
5265 switch(prop_type)
5266 {
5267 case PT_ANY:
5268 for (i = min; i < max; i++)
5269 {
5270 int len = 1;
5271 if (eptr >= md->end_subject)
5272 {
5273 SCHECK_PARTIAL();
5274 break;
5275 }
5276 GETCHARLENTEST(c, eptr, len);
5277 if (prop_fail_result) break;
5278 eptr+= len;
5279 }
5280 break;
5281
5282 case PT_LAMP:
5283 for (i = min; i < max; i++)
5284 {
5285 int chartype;
5286 int len = 1;
5287 if (eptr >= md->end_subject)
5288 {
5289 SCHECK_PARTIAL();
5290 break;
5291 }
5292 GETCHARLENTEST(c, eptr, len);
5293 chartype = UCD_CHARTYPE(c);
5294 if ((chartype == ucp_Lu ||
5295 chartype == ucp_Ll ||
5296 chartype == ucp_Lt) == prop_fail_result)
5297 break;
5298 eptr+= len;
5299 }
5300 break;
5301
5302 case PT_GC:
5303 for (i = min; i < max; i++)
5304 {
5305 int len = 1;
5306 if (eptr >= md->end_subject)
5307 {
5308 SCHECK_PARTIAL();
5309 break;
5310 }
5311 GETCHARLENTEST(c, eptr, len);
5312 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5313 eptr+= len;
5314 }
5315 break;
5316
5317 case PT_PC:
5318 for (i = min; i < max; i++)
5319 {
5320 int len = 1;
5321 if (eptr >= md->end_subject)
5322 {
5323 SCHECK_PARTIAL();
5324 break;
5325 }
5326 GETCHARLENTEST(c, eptr, len);
5327 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5328 eptr+= len;
5329 }
5330 break;
5331
5332 case PT_SC:
5333 for (i = min; i < max; i++)
5334 {
5335 int len = 1;
5336 if (eptr >= md->end_subject)
5337 {
5338 SCHECK_PARTIAL();
5339 break;
5340 }
5341 GETCHARLENTEST(c, eptr, len);
5342 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5343 eptr+= len;
5344 }
5345 break;
5346
5347 case PT_ALNUM:
5348 for (i = min; i < max; i++)
5349 {
5350 int category;
5351 int len = 1;
5352 if (eptr >= md->end_subject)
5353 {
5354 SCHECK_PARTIAL();
5355 break;
5356 }
5357 GETCHARLENTEST(c, eptr, len);
5358 category = UCD_CATEGORY(c);
5359 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5360 break;
5361 eptr+= len;
5362 }
5363 break;
5364
5365 case PT_SPACE: /* Perl space */
5366 for (i = min; i < max; i++)
5367 {
5368 int len = 1;
5369 if (eptr >= md->end_subject)
5370 {
5371 SCHECK_PARTIAL();
5372 break;
5373 }
5374 GETCHARLENTEST(c, eptr, len);
5375 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5376 c == CHAR_FF || c == CHAR_CR)
5377 == prop_fail_result)
5378 break;
5379 eptr+= len;
5380 }
5381 break;
5382
5383 case PT_PXSPACE: /* POSIX space */
5384 for (i = min; i < max; i++)
5385 {
5386 int len = 1;
5387 if (eptr >= md->end_subject)
5388 {
5389 SCHECK_PARTIAL();
5390 break;
5391 }
5392 GETCHARLENTEST(c, eptr, len);
5393 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5394 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5395 == prop_fail_result)
5396 break;
5397 eptr+= len;
5398 }
5399 break;
5400
5401 case PT_WORD:
5402 for (i = min; i < max; i++)
5403 {
5404 int category;
5405 int len = 1;
5406 if (eptr >= md->end_subject)
5407 {
5408 SCHECK_PARTIAL();
5409 break;
5410 }
5411 GETCHARLENTEST(c, eptr, len);
5412 category = UCD_CATEGORY(c);
5413 if ((category == ucp_L || category == ucp_N ||
5414 c == CHAR_UNDERSCORE) == prop_fail_result)
5415 break;
5416 eptr+= len;
5417 }
5418 break;
5419
5420 case PT_CLIST:
5421 for (i = min; i < max; i++)
5422 {
5423 const pcre_uint32 *cp;
5424 int len = 1;
5425 if (eptr >= md->end_subject)
5426 {
5427 SCHECK_PARTIAL();
5428 break;
5429 }
5430 GETCHARLENTEST(c, eptr, len);
5431 cp = PRIV(ucd_caseless_sets) + UCD_CASESET(c);
5432 for (;;)
5433 {
5434 if (c < *cp)
5435 { if (prop_fail_result) break; else goto GOT_MAX; }
5436 if (c == *cp++)
5437 { if (prop_fail_result) goto GOT_MAX; else break; }
5438 }
5439 eptr += len;
5440 }
5441 GOT_MAX:
5442 break;
5443
5444 default:
5445 RRETURN(PCRE_ERROR_INTERNAL);
5446 }
5447
5448 /* eptr is now past the end of the maximum run */
5449
5450 if (possessive) continue;
5451 for(;;)
5452 {
5453 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5454 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5455 if (eptr-- == pp) break; /* Stop if tried at original pos */
5456 if (utf) BACKCHAR(eptr);
5457 }
5458 }
5459
5460 /* Match extended Unicode sequences. We will get here only if the
5461 support is in the binary; otherwise a compile-time error occurs. */
5462
5463 else if (ctype == OP_EXTUNI)
5464 {
5465 for (i = min; i < max; i++)
5466 {
5467 if (eptr >= md->end_subject)
5468 {
5469 SCHECK_PARTIAL();
5470 break;
5471 }
5472 else
5473 {
5474 int lgb, rgb;
5475 GETCHARINCTEST(c, eptr);
5476 lgb = UCD_GRAPHBREAK(c);
5477 while (eptr < md->end_subject)
5478 {
5479 int len = 1;
5480 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5481 rgb = UCD_GRAPHBREAK(c);
5482 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5483 lgb = rgb;
5484 eptr += len;
5485 }
5486 }
5487 CHECK_PARTIAL();
5488 }
5489
5490 /* eptr is now past the end of the maximum run */
5491
5492 if (possessive) continue;
5493
5494 for(;;)
5495 {
5496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5498 if (eptr-- == pp) break; /* Stop if tried at original pos */
5499 for (;;) /* Move back over one extended */
5500 {
5501 if (!utf) c = *eptr; else
5502 {
5503 BACKCHAR(eptr);
5504 GETCHAR(c, eptr);
5505 }
5506 if (UCD_CATEGORY(c) != ucp_M) break;
5507 eptr--;
5508 }
5509 }
5510 }
5511
5512 else
5513 #endif /* SUPPORT_UCP */
5514
5515 #ifdef SUPPORT_UTF
5516 if (utf)
5517 {
5518 switch(ctype)
5519 {
5520 case OP_ANY:
5521 if (max < INT_MAX)
5522 {
5523 for (i = min; i < max; i++)
5524 {
5525 if (eptr >= md->end_subject)
5526 {
5527 SCHECK_PARTIAL();
5528 break;
5529 }
5530 if (IS_NEWLINE(eptr)) break;
5531 if (md->partial != 0 && /* Take care with CRLF partial */
5532 eptr + 1 >= md->end_subject &&
5533 NLBLOCK->nltype == NLTYPE_FIXED &&
5534 NLBLOCK->nllen == 2 &&
5535 *eptr == NLBLOCK->nl[0])
5536 {
5537 md->hitend = TRUE;
5538 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5539 }
5540 eptr++;
5541 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5542 }
5543 }
5544
5545 /* Handle unlimited UTF-8 repeat */
5546
5547 else
5548 {
5549 for (i = min; i < max; i++)
5550 {
5551 if (eptr >= md->end_subject)
5552 {
5553 SCHECK_PARTIAL();
5554 break;
5555 }
5556 if (IS_NEWLINE(eptr)) break;
5557 if (md->partial != 0 && /* Take care with CRLF partial */
5558 eptr + 1 >= md->end_subject &&
5559 NLBLOCK->nltype == NLTYPE_FIXED &&
5560 NLBLOCK->nllen == 2 &&
5561 *eptr == NLBLOCK->nl[0])
5562 {
5563 md->hitend = TRUE;
5564 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5565 }
5566 eptr++;
5567 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5568 }
5569 }
5570 break;
5571
5572 case OP_ALLANY:
5573 if (max < INT_MAX)
5574 {
5575 for (i = min; i < max; i++)
5576 {
5577 if (eptr >= md->end_subject)
5578 {
5579 SCHECK_PARTIAL();
5580 break;
5581 }
5582 eptr++;
5583 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5584 }
5585 }
5586 else
5587 {
5588 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5589 SCHECK_PARTIAL();
5590 }
5591 break;
5592
5593 /* The byte case is the same as non-UTF8 */
5594
5595 case OP_ANYBYTE:
5596 c = max - min;
5597 if (c > (unsigned int)(md->end_subject - eptr))
5598 {
5599 eptr = md->end_subject;
5600 SCHECK_PARTIAL();
5601 }
5602 else eptr += c;
5603 break;
5604
5605 case OP_ANYNL:
5606 for (i = min; i < max; i++)
5607 {
5608 int len = 1;
5609 if (eptr >= md->end_subject)
5610 {
5611 SCHECK_PARTIAL();
5612 break;
5613 }
5614 GETCHARLEN(c, eptr, len);
5615 if (c == CHAR_CR)
5616 {
5617 if (++eptr >= md->end_subject) break;
5618 if (*eptr == CHAR_LF) eptr++;
5619 }
5620 else
5621 {
5622 if (c != CHAR_LF &&
5623 (md->bsr_anycrlf ||
5624 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5625 #ifndef EBCDIC
5626 && c != 0x2028 && c != 0x2029
5627 #endif /* Not EBCDIC */
5628 )))
5629 break;
5630 eptr += len;
5631 }
5632 }
5633 break;
5634
5635 case OP_NOT_HSPACE:
5636 case OP_HSPACE:
5637 for (i = min; i < max; i++)
5638 {
5639 BOOL gotspace;
5640 int len = 1;
5641 if (eptr >= md->end_subject)
5642 {
5643 SCHECK_PARTIAL();
5644 break;
5645 }
5646 GETCHARLEN(c, eptr, len);
5647 switch(c)
5648 {
5649 HSPACE_CASES: gotspace = TRUE; break;
5650 default: gotspace = FALSE; break;
5651 }
5652 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5653 eptr += len;
5654 }
5655 break;
5656
5657 case OP_NOT_VSPACE:
5658 case OP_VSPACE:
5659 for (i = min; i < max; i++)
5660 {
5661 BOOL gotspace;
5662 int len = 1;
5663 if (eptr >= md->end_subject)
5664 {
5665 SCHECK_PARTIAL();
5666 break;
5667 }
5668 GETCHARLEN(c, eptr, len);
5669 switch(c)
5670 {
5671 VSPACE_CASES: gotspace = TRUE; break;
5672 default: gotspace = FALSE; break;
5673 }
5674 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5675 eptr += len;
5676 }
5677 break;
5678
5679 case OP_NOT_DIGIT:
5680 for (i = min; i < max; i++)
5681 {
5682 int len = 1;
5683 if (eptr >= md->end_subject)
5684 {
5685 SCHECK_PARTIAL();
5686 break;
5687 }
5688 GETCHARLEN(c, eptr, len);
5689 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5690 eptr+= len;
5691 }
5692 break;
5693
5694 case OP_DIGIT:
5695 for (i = min; i < max; i++)
5696 {
5697 int len = 1;
5698 if (eptr >= md->end_subject)
5699 {
5700 SCHECK_PARTIAL();
5701 break;
5702 }
5703 GETCHARLEN(c, eptr, len);
5704 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5705 eptr+= len;
5706 }
5707 break;
5708
5709 case OP_NOT_WHITESPACE:
5710 for (i = min; i < max; i++)
5711 {
5712 int len = 1;
5713 if (eptr >= md->end_subject)
5714 {
5715 SCHECK_PARTIAL();
5716 break;
5717 }
5718 GETCHARLEN(c, eptr, len);
5719 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5720 eptr+= len;
5721 }
5722 break;
5723
5724 case OP_WHITESPACE:
5725 for (i = min; i < max; i++)
5726 {
5727 int len = 1;
5728 if (eptr >= md->end_subject)
5729 {
5730 SCHECK_PARTIAL();
5731 break;
5732 }
5733 GETCHARLEN(c, eptr, len);
5734 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5735 eptr+= len;
5736 }
5737 break;
5738
5739 case OP_NOT_WORDCHAR:
5740 for (i = min; i < max; i++)
5741 {
5742 int len = 1;
5743 if (eptr >= md->end_subject)
5744 {
5745 SCHECK_PARTIAL();
5746 break;
5747 }
5748 GETCHARLEN(c, eptr, len);
5749 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5750 eptr+= len;
5751 }
5752 break;
5753
5754 case OP_WORDCHAR:
5755 for (i = min; i < max; i++)
5756 {
5757 int len = 1;
5758 if (eptr >= md->end_subject)
5759 {
5760 SCHECK_PARTIAL();
5761 break;
5762 }
5763 GETCHARLEN(c, eptr, len);
5764 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5765 eptr+= len;
5766 }
5767 break;
5768
5769 default:
5770 RRETURN(PCRE_ERROR_INTERNAL);
5771 }
5772
5773 /* eptr is now past the end of the maximum run. If possessive, we are
5774 done (no backing up). Otherwise, match at this position; anything other
5775 than no match is immediately returned. For nomatch, back up one
5776 character, unless we are matching \R and the last thing matched was
5777 \r\n, in which case, back up two bytes. */
5778
5779 if (possessive) continue;
5780 for(;;)
5781 {
5782 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5784 if (eptr-- == pp) break; /* Stop if tried at original pos */
5785 BACKCHAR(eptr);
5786 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_NL &&
5787 eptr[-1] == CHAR_CR) eptr--;
5788 }
5789 }
5790 else
5791 #endif /* SUPPORT_UTF */
5792 /* Not UTF mode */
5793 {
5794 switch(ctype)
5795 {
5796 case OP_ANY:
5797 for (i = min; i < max; i++)
5798 {
5799 if (eptr >= md->end_subject)
5800 {
5801 SCHECK_PARTIAL();
5802 break;
5803 }
5804 if (IS_NEWLINE(eptr)) break;
5805 if (md->partial != 0 && /* Take care with CRLF partial */
5806 eptr + 1 >= md->end_subject &&
5807 NLBLOCK->nltype == NLTYPE_FIXED &&
5808 NLBLOCK->nllen == 2 &&
5809 *eptr == NLBLOCK->nl[0])
5810 {
5811 md->hitend = TRUE;
5812 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5813 }
5814 eptr++;
5815 }
5816 break;
5817
5818 case OP_ALLANY:
5819 case OP_ANYBYTE:
5820 c = max - min;
5821 if (c > (unsigned int)(md->end_subject - eptr))
5822 {
5823 eptr = md->end_subject;
5824 SCHECK_PARTIAL();
5825 }
5826 else eptr += c;
5827 break;
5828
5829 case OP_ANYNL:
5830 for (i = min; i < max; i++)
5831 {
5832 if (eptr >= md->end_subject)
5833 {
5834 SCHECK_PARTIAL();
5835 break;
5836 }
5837 c = *eptr;
5838 if (c == CHAR_CR)
5839 {
5840 if (++eptr >= md->end_subject) break;
5841 if (*eptr == CHAR_LF) eptr++;
5842 }
5843 else
5844 {
5845 if (c != CHAR_LF && (md->bsr_anycrlf ||
5846 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5847 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5848 && c != 0x2028 && c != 0x2029
5849 #endif
5850 ))) break;
5851 eptr++;
5852 }
5853 }
5854 break;
5855
5856 case OP_NOT_HSPACE:
5857 for (i = min; i < max; i++)
5858 {
5859 if (eptr >= md->end_subject)
5860 {
5861 SCHECK_PARTIAL();
5862 break;
5863 }
5864 switch(*eptr)
5865 {
5866 default: eptr++; break;
5867 HSPACE_BYTE_CASES:
5868 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5869 HSPACE_MULTIBYTE_CASES:
5870 #endif
5871 goto ENDLOOP00;
5872 }
5873 }
5874 ENDLOOP00:
5875 break;
5876
5877 case OP_HSPACE:
5878 for (i = min; i < max; i++)
5879 {
5880 if (eptr >= md->end_subject)
5881 {
5882 SCHECK_PARTIAL();
5883 break;
5884 }
5885 switch(*eptr)
5886 {
5887 default: goto ENDLOOP01;
5888 HSPACE_BYTE_CASES:
5889 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5890 HSPACE_MULTIBYTE_CASES:
5891 #endif
5892 eptr++; break;
5893 }
5894 }
5895 ENDLOOP01:
5896 break;
5897
5898 case OP_NOT_VSPACE:
5899 for (i = min; i < max; i++)
5900 {
5901 if (eptr >= md->end_subject)
5902 {
5903 SCHECK_PARTIAL();
5904 break;
5905 }
5906 switch(*eptr)
5907 {
5908 default: eptr++; break;
5909 VSPACE_BYTE_CASES:
5910 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5911 VSPACE_MULTIBYTE_CASES:
5912 #endif
5913 goto ENDLOOP02;
5914 }
5915 }
5916 ENDLOOP02:
5917 break;
5918
5919 case OP_VSPACE:
5920 for (i = min; i < max; i++)
5921 {
5922 if (eptr >= md->end_subject)
5923 {
5924 SCHECK_PARTIAL();
5925 break;
5926 }
5927 switch(*eptr)
5928 {
5929 default: goto ENDLOOP03;
5930 VSPACE_BYTE_CASES:
5931 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5932 VSPACE_MULTIBYTE_CASES:
5933 #endif
5934 eptr++; break;
5935 }
5936 }
5937 ENDLOOP03:
5938 break;
5939
5940 case OP_NOT_DIGIT:
5941 for (i = min; i < max; i++)
5942 {
5943 if (eptr >= md->end_subject)
5944 {
5945 SCHECK_PARTIAL();
5946 break;
5947 }
5948 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5949 eptr++;
5950 }
5951 break;
5952
5953 case OP_DIGIT:
5954 for (i = min; i < max; i++)
5955 {
5956 if (eptr >= md->end_subject)
5957 {
5958 SCHECK_PARTIAL();
5959 break;
5960 }
5961 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5962 eptr++;
5963 }
5964 break;
5965
5966 case OP_NOT_WHITESPACE:
5967 for (i = min; i < max; i++)
5968 {
5969 if (eptr >= md->end_subject)
5970 {
5971 SCHECK_PARTIAL();
5972 break;
5973 }
5974 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5975 eptr++;
5976 }
5977 break;
5978
5979 case OP_WHITESPACE:
5980 for (i = min; i < max; i++)
5981 {
5982 if (eptr >= md->end_subject)
5983 {
5984 SCHECK_PARTIAL();
5985 break;
5986 }
5987 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5988 eptr++;
5989 }
5990 break;
5991
5992 case OP_NOT_WORDCHAR:
5993 for (i = min; i < max; i++)
5994 {
5995 if (eptr >= md->end_subject)
5996 {
5997 SCHECK_PARTIAL();
5998 break;
5999 }
6000 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6001 eptr++;
6002 }
6003 break;
6004
6005 case OP_WORDCHAR:
6006 for (i = min; i < max; i++)
6007 {
6008 if (eptr >= md->end_subject)
6009 {
6010 SCHECK_PARTIAL();
6011 break;
6012 }
6013 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6014 eptr++;
6015 }
6016 break;
6017
6018 default:
6019 RRETURN(PCRE_ERROR_INTERNAL);
6020 }
6021
6022 /* eptr is now past the end of the maximum run. If possessive, we are
6023 done (no backing up). Otherwise, match at this position; anything other
6024 than no match is immediately returned. For nomatch, back up one
6025 character (byte), unless we are matching \R and the last thing matched
6026 was \r\n, in which case, back up two bytes. */
6027
6028 if (possessive) continue;
6029 while (eptr >= pp)
6030 {
6031 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6033 eptr--;
6034 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6035 eptr[-1] == CHAR_CR) eptr--;
6036 }
6037 }
6038
6039 /* Get here if we can't make it match with any permitted repetitions */
6040
6041 RRETURN(MATCH_NOMATCH);
6042 }
6043 /* Control never gets here */
6044
6045 /* There's been some horrible disaster. Arrival here can only mean there is
6046 something seriously wrong in the code above or the OP_xxx definitions. */
6047
6048 default:
6049 DPRINTF(("Unknown opcode %d\n", *ecode));
6050 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6051 }
6052
6053 /* Do not stick any code in here without much thought; it is assumed
6054 that "continue" in the code above comes out to here to repeat the main
6055 loop. */
6056
6057 } /* End of main loop */
6058 /* Control never reaches here */
6059
6060
6061 /* When compiling to use the heap rather than the stack for recursive calls to
6062 match(), the RRETURN() macro jumps here. The number that is saved in
6063 frame->Xwhere indicates which label we actually want to return to. */
6064
6065 #ifdef NO_RECURSE
6066 #define LBL(val) case val: goto L_RM##val;
6067 HEAP_RETURN:
6068 switch (frame->Xwhere)
6069 {
6070 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6071 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6072 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6073 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6074 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6075 LBL(65) LBL(66)
6076 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6077 LBL(21)
6078 #endif
6079 #ifdef SUPPORT_UTF
6080 LBL(16) LBL(18) LBL(20)
6081 LBL(22) LBL(23) LBL(28) LBL(30)
6082 LBL(32) LBL(34) LBL(42) LBL(46)
6083 #ifdef SUPPORT_UCP
6084 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6085 LBL(59) LBL(60) LBL(61) LBL(62)
6086 #endif /* SUPPORT_UCP */
6087 #endif /* SUPPORT_UTF */
6088 default:
6089 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6090
6091 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6092
6093 return PCRE_ERROR_INTERNAL;
6094 }
6095 #undef LBL
6096 #endif /* NO_RECURSE */
6097 }
6098
6099
6100 /***************************************************************************
6101 ****************************************************************************
6102 RECURSION IN THE match() FUNCTION
6103
6104 Undefine all the macros that were defined above to handle this. */
6105
6106 #ifdef NO_RECURSE
6107 #undef eptr
6108 #undef ecode
6109 #undef mstart
6110 #undef offset_top
6111 #undef eptrb
6112 #undef flags
6113
6114 #undef callpat
6115 #undef charptr
6116 #undef data
6117 #undef next
6118 #undef pp
6119 #undef prev
6120 #undef saved_eptr
6121
6122 #undef new_recursive
6123
6124 #undef cur_is_word
6125 #undef condition
6126 #undef prev_is_word
6127
6128 #undef ctype
6129 #undef length
6130 #undef max
6131 #undef min
6132 #undef number
6133 #undef offset
6134 #undef op
6135 #undef save_capture_last
6136 #undef save_offset1
6137 #undef save_offset2
6138 #undef save_offset3
6139 #undef stacksave
6140
6141 #undef newptrb
6142
6143 #endif
6144
6145 /* These two are defined as macros whether or not NO_RECURSE is set, so they are undefined unconditionally */
6146
6147 #undef fc
6148 #undef fi
6149
6150 /***************************************************************************
6151 ***************************************************************************/
6152
6153
6154 #ifdef NO_RECURSE
6155 /*************************************************
6156 * Release allocated heap frames *
6157 *************************************************/
6158
6159 /* This function releases all the allocated frames. The base frame is on the
6160 machine stack, and so must not be freed.
6161
6162 Argument: the address of the base frame
6163 Returns: nothing
6164 */
6165
6166 static void
6167 release_match_heapframes (heapframe *frame_base)
6168 {
6169 heapframe *nextframe = frame_base->Xnextframe;
6170 while (nextframe != NULL)
6171 {
6172 heapframe *oldframe = nextframe;
6173 nextframe = nextframe->Xnextframe;
6174 (PUBL(stack_free))(oldframe);
6175 }
6176 }
6177 #endif
6178
6179
6180 /*************************************************
6181 * Execute a Regular Expression *
6182 *************************************************/
6183
6184 /* This function applies a compiled re to a subject string and picks out
6185 portions of the string if it matches. Two elements in the vector are set for
6186 each substring: the offsets to the start and end of the substring.
6187
6188 Arguments:
6189 argument_re points to the compiled expression
6190 extra_data points to extra data or is NULL
6191 subject points to the subject string
6192 length length of subject string (may contain binary zeros)
6193 start_offset where to start in the subject string
6194 options option bits
6195 offsets points to a vector of ints to be filled in with offsets
6196 offsetcount the number of elements in the vector
6197
6198 Returns: > 0 => success; value is the number of elements filled in
6199 = 0 => success, but offsets is not big enough
6200 -1 => failed to match
6201 < -1 => some kind of unexpected problem
6202 */
6203
6204 #if defined COMPILE_PCRE8
6205 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6206 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6207 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6208 int offsetcount)
6209 #elif defined COMPILE_PCRE16
6210 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6211 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6212 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6213 int offsetcount)
6214 #elif defined COMPILE_PCRE32
6215 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6216 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6217 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6218 int offsetcount)
6219 #endif
6220 {
6221 int rc, ocount, arg_offset_max;
6222 int newline;
6223 BOOL using_temporary_offsets = FALSE;
6224 BOOL anchored;
6225 BOOL startline;
6226 BOOL firstline;
6227 BOOL utf;
6228 BOOL has_first_char = FALSE;
6229 BOOL has_req_char = FALSE;
6230 pcre_uchar first_char = 0;
6231 pcre_uchar first_char2 = 0;
6232 pcre_uchar req_char = 0;
6233 pcre_uchar req_char2 = 0;
6234 match_data match_block;
6235 match_data *md = &match_block;
6236 const pcre_uint8 *tables;
6237 const pcre_uint8 *start_bits = NULL;
6238 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6239 PCRE_PUCHAR end_subject;
6240 PCRE_PUCHAR start_partial = NULL;
6241 PCRE_PUCHAR req_char_ptr = start_match - 1;
6242
6243 const pcre_study_data *study;
6244 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6245
6246 #ifdef NO_RECURSE
6247 heapframe frame_zero;
6248 frame_zero.Xprevframe = NULL; /* Marks the top level */
6249 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6250 md->match_frames_base = &frame_zero;
6251 #endif
6252
6253 /* Check for the special magic call that measures the size of the stack used
6254 per recursive call of match(). Without the funny casting for sizeof, a Windows
6255 compiler gave this error: "unary minus operator applied to unsigned type,
6256 result still unsigned". Hopefully the cast fixes that. */
6257
6258 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6259 start_offset == -999)
6260 #ifdef NO_RECURSE
6261 return -((int)sizeof(heapframe));
6262 #else
6263 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6264 #endif
6265
6266 /* Plausibility checks */
6267
6268 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6269 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6270 return PCRE_ERROR_NULL;
6271 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6272 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6273
6274 /* Check that the first field in the block is the magic number. If it is not,
6275 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6276 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6277 means that the pattern is likely compiled with different endianness. */
6278
6279 if (re->magic_number != MAGIC_NUMBER)
6280 return re->magic_number == REVERSED_MAGIC_NUMBER?
6281 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6282 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6283
6284 /* These two settings are used in the code for checking a UTF-8 string that
6285 follows immediately afterwards. Other values in the md block are used only
6286 during "normal" pcre_exec() processing, not when the JIT support is in use,
6287 so they are set up later. */
6288
6289 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6290 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6291 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6292 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6293
6294 /* Check a UTF-8 string if required. Pass back the character offset and error
6295 code for an invalid string if a results vector is available. */
6296
6297 #ifdef SUPPORT_UTF
6298 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6299 {
6300 int erroroffset;
6301 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6302 if (errorcode != 0)
6303 {
6304 if (offsetcount >= 2)
6305 {
6306 offsets[0] = erroroffset;
6307 offsets[1] = errorcode;
6308 }
6309 #if defined COMPILE_PCRE8
6310 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6311 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6312 #elif defined COMPILE_PCRE16
6313 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6314 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6315 #elif defined COMPILE_PCRE32
6316 return PCRE_ERROR_BADUTF32;
6317 #endif
6318 }
6319 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6320 /* Check that a start_offset points to the start of a UTF character. */
6321 if (start_offset > 0 && start_offset < length &&
6322 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6323 return PCRE_ERROR_BADUTF8_OFFSET;
6324 #endif
6325 }
6326 #endif
6327
6328 /* If the pattern was successfully studied with JIT support, run the JIT
6329 executable instead of the rest of this function. Most options must be set at
6330 compile time for the JIT code to be usable. Fallback to the normal code path if
6331 an unsupported flag is set. */
6332
6333 #ifdef SUPPORT_JIT
6334 if (extra_data != NULL
6335 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6336 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6337 && extra_data->executable_jit != NULL
6338 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6339 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART |
6340 PCRE_PARTIAL_SOFT | PCRE_PARTIAL_HARD)) == 0)
6341 {
6342 rc = PRIV(jit_exec)(re, extra_data, (const pcre_uchar *)subject, length,
6343 start_offset, options, offsets, offsetcount);
6344
6345 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6346 mode is not compiled. In this case we simply fallback to interpreter. */
6347
6348 if (rc != PCRE_ERROR_NULL) return rc;
6349 }
6350 #endif
6351
6352 /* Carry on with non-JIT matching. This information is for finding all the
6353 numbers associated with a given name, for condition testing. */
6354
6355 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6356 md->name_count = re->name_count;
6357 md->name_entry_size = re->name_entry_size;
6358
6359 /* Fish out the optional data from the extra_data structure, first setting
6360 the default values. */
6361
6362 study = NULL;
6363 md->match_limit = MATCH_LIMIT;
6364 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6365 md->callout_data = NULL;
6366
6367 /* The table pointer is always in native byte order. */
6368
6369 tables = re->tables;
6370
6371 if (extra_data != NULL)
6372 {
6373 register unsigned int flags = extra_data->flags;
6374 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6375 study = (const pcre_study_data *)extra_data->study_data;
6376 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6377 md->match_limit = extra_data->match_limit;
6378 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6379 md->match_limit_recursion = extra_data->match_limit_recursion;
6380 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6381 md->callout_data = extra_data->callout_data;
6382 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6383 }
6384
6385 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6386 is a feature that makes it possible to save compiled regex and re-use them
6387 in other programs later. */
6388
6389 if (tables == NULL) tables = PRIV(default_tables);
6390
6391 /* Set up other data */
6392
6393 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6394 startline = (re->flags & PCRE_STARTLINE) != 0;
6395 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6396
6397 /* The code starts after the real_pcre block and the capture name table. */
6398
6399 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6400 re->name_count * re->name_entry_size;
6401
6402 md->start_subject = (PCRE_PUCHAR)subject;
6403 md->start_offset = start_offset;
6404 md->end_subject = md->start_subject + length;
6405 end_subject = md->end_subject;
6406
6407 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6408 md->use_ucp = (re->options & PCRE_UCP) != 0;
6409 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6410 md->ignore_skip_arg = FALSE;
6411
6412 /* Some options are unpacked into BOOL variables in the hope that testing
6413 them will be faster than individual option bits. */
6414
6415 md->notbol = (options & PCRE_NOTBOL) != 0;
6416 md->noteol = (options & PCRE_NOTEOL) != 0;
6417 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6418 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6419
6420 md->hitend = FALSE;
6421 md->mark = md->nomatch_mark = NULL; /* In case never set */
6422
6423 md->recursive = NULL; /* No recursion at top level */
6424 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6425
6426 md->lcc = tables + lcc_offset;
6427 md->fcc = tables + fcc_offset;
6428 md->ctypes = tables + ctypes_offset;
6429
6430 /* Handle different \R options. */
6431
6432 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6433 {
6434 case 0:
6435 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6436 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6437 else
6438 #ifdef BSR_ANYCRLF
6439 md->bsr_anycrlf = TRUE;
6440 #else
6441 md->bsr_anycrlf = FALSE;
6442 #endif
6443 break;
6444
6445 case PCRE_BSR_ANYCRLF:
6446 md->bsr_anycrlf = TRUE;
6447 break;
6448
6449 case PCRE_BSR_UNICODE:
6450 md->bsr_anycrlf = FALSE;
6451 break;
6452
6453 default: return PCRE_ERROR_BADNEWLINE;
6454 }
6455
6456 /* Handle different types of newline. The three bits give eight cases. If
6457 nothing is set at run time, whatever was used at compile time applies. */
6458
6459 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6460 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6461 {
6462 case 0: newline = NEWLINE; break; /* Compile-time default */
6463 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6464 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6465 case PCRE_NEWLINE_CR+
6466 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6467 case PCRE_NEWLINE_ANY: newline = -1; break;
6468 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6469 default: return PCRE_ERROR_BADNEWLINE;
6470 }
6471
6472 if (newline == -2)
6473 {
6474 md->nltype = NLTYPE_ANYCRLF;
6475 }
6476 else if (newline < 0)
6477 {
6478 md->nltype = NLTYPE_ANY;
6479 }
6480 else
6481 {
6482 md->nltype = NLTYPE_FIXED;
6483 if (newline > 255)
6484 {
6485 md->nllen = 2;
6486 md->nl[0] = (newline >> 8) & 255;
6487 md->nl[1] = newline & 255;
6488 }
6489 else
6490 {
6491 md->nllen = 1;
6492 md->nl[0] = newline;
6493 }
6494 }
6495
6496 /* Partial matching was originally supported only for a restricted set of
6497 regexes; from release 8.00 there are no restrictions, but the bits are still
6498 defined (though never set). So there's no harm in leaving this code. */
6499
6500 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6501 return PCRE_ERROR_BADPARTIAL;
6502
6503 /* If the expression has got more back references than the offsets supplied can
6504 hold, we get a temporary chunk of working store to use during the matching.
6505 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6506 of 3. */
6507
6508 ocount = offsetcount - (offsetcount % 3);
6509 arg_offset_max = (2*ocount)/3;
6510
6511 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6512 {
6513 ocount = re->top_backref * 3 + 3;
6514 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6515 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6516 using_temporary_offsets = TRUE;
6517 DPRINTF(("Got memory to hold back references\n"));
6518 }
6519 else md->offset_vector = offsets;
6520
6521 md->offset_end = ocount;
6522 md->offset_max = (2*ocount)/3;
6523 md->offset_overflow = FALSE;
6524 md->capture_last = -1;
6525
6526 /* Reset the working variable associated with each extraction. These should
6527 never be used unless previously set, but they get saved and restored, and so we
6528 initialize them to avoid reading uninitialized locations. Also, unset the
6529 offsets for the matched string. This is really just for tidiness with callouts,
6530 in case they inspect these fields. */
6531
6532 if (md->offset_vector != NULL)
6533 {
6534 register int *iptr = md->offset_vector + ocount;
6535 register int *iend = iptr - re->top_bracket;
6536 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6537 while (--iptr >= iend) *iptr = -1;
6538 md->offset_vector[0] = md->offset_vector[1] = -1;
6539 }
6540
6541 /* Set up the first character to match, if available. The first_char value is
6542 never set for an anchored regular expression, but the anchoring may be forced
6543 at run time, so we have to test for anchoring. The first char may be unset for
6544 an unanchored pattern, of course. If there's no first char and the pattern was
6545 studied, there may be a bitmap of possible first characters. */
6546
6547 if (!anchored)
6548 {
6549 if ((re->flags & PCRE_FIRSTSET) != 0)
6550 {
6551 has_first_char = TRUE;
6552 first_char = first_char2 = (pcre_uchar)(re->first_char);
6553 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6554 {
6555 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6556 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6557 if (utf && first_char > 127)
6558 first_char2 = UCD_OTHERCASE(first_char);
6559 #endif
6560 }
6561 }
6562 else
6563 if (!startline && study != NULL &&
6564 (study->flags & PCRE_STUDY_MAPPED) != 0)
6565 start_bits = study->start_bits;
6566 }
6567
6568 /* For anchored or unanchored matches, there may be a "last known required
6569 character" set. */
6570
6571 if ((re->flags & PCRE_REQCHSET) != 0)
6572 {
6573 has_req_char = TRUE;
6574 req_char = req_char2 = (pcre_uchar)(re->req_char);
6575 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6576 {
6577 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6578 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6579 if (utf && req_char > 127)
6580 req_char2 = UCD_OTHERCASE(req_char);
6581 #endif
6582 }
6583 }
6584
6585
6586 /* ==========================================================================*/
6587
6588 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6589 the loop runs just once. */
6590
6591 for(;;)
6592 {
6593 PCRE_PUCHAR save_end_subject = end_subject;
6594 PCRE_PUCHAR new_start_match;
6595
6596 /* If firstline is TRUE, the start of the match is constrained to the first
6597 line of a multiline string. That is, the match must be before or at the first
6598 newline. Implement this by temporarily adjusting end_subject so that we stop
6599 scanning at a newline. If the match fails at the newline, later code breaks
6600 this loop. */
6601
6602 if (firstline)
6603 {
6604 PCRE_PUCHAR t = start_match;
6605 #ifdef SUPPORT_UTF
6606 if (utf)
6607 {
6608 while (t < md->end_subject && !IS_NEWLINE(t))
6609 {
6610 t++;
6611 ACROSSCHAR(t < end_subject, *t, t++);
6612 }
6613 }
6614 else
6615 #endif
6616 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6617 end_subject = t;
6618 }
6619
6620 /* There are some optimizations that avoid running the match if a known
6621 starting point is not found, or if a known later character is not present.
6622 However, there is an option that disables these, for testing and for ensuring
6623 that all callouts do actually occur. The option can be set in the regex by
6624 (*NO_START_OPT) or passed in match-time options. */
6625
6626 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6627 {
6628 /* Advance to a unique first char if there is one. */
6629
6630 if (has_first_char)
6631 {
6632 if (first_char != first_char2)
6633 while (start_match < end_subject &&
6634 *start_match != first_char && *start_match != first_char2)
6635 start_match++;
6636 else
6637 while (start_match < end_subject && *start_match != first_char)
6638 start_match++;
6639 }
6640
6641 /* Or to just after a linebreak for a multiline match */
6642
6643 else if (startline)
6644 {
6645 if (start_match > md->start_subject + start_offset)
6646 {
6647 #ifdef SUPPORT_UTF
6648 if (utf)
6649 {
6650 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6651 {
6652 start_match++;
6653 ACROSSCHAR(start_match < end_subject, *start_match,
6654 start_match++);
6655 }
6656 }
6657 else
6658 #endif
6659 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6660 start_match++;
6661
6662 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6663 and we are now at a LF, advance the match position by one more character.
6664 */
6665
6666 if (start_match[-1] == CHAR_CR &&
6667 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6668 start_match < end_subject &&
6669 *start_match == CHAR_NL)
6670 start_match++;
6671 }
6672 }
6673
6674 /* Or to a non-unique first byte after study */
6675
6676 else if (start_bits != NULL)
6677 {
6678 while (start_match < end_subject)
6679 {
6680 register unsigned int c = *start_match;
6681 #ifndef COMPILE_PCRE8
6682 if (c > 255) c = 255;
6683 #endif
6684 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6685 {
6686 start_match++;
6687 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6688 /* In non 8-bit mode, the iteration will stop for
6689 characters > 255 at the beginning or not stop at all. */
6690 if (utf)
6691 ACROSSCHAR(start_match < end_subject, *start_match,
6692 start_match++);
6693 #endif
6694 }
6695 else break;
6696 }
6697 }
6698 } /* Starting optimizations */
6699
6700 /* Restore fudged end_subject */
6701
6702 end_subject = save_end_subject;
6703
6704 /* The following two optimizations are disabled for partial matching or if
6705 disabling is explicitly requested. */
6706
6707 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6708 {
6709 /* If the pattern was studied, a minimum subject length may be set. This is
6710 a lower bound; no actual string of that length may actually match the
6711 pattern. Although the value is, strictly, in characters, we treat it as
6712 bytes to avoid spending too much time in this optimization. */
6713
6714 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6715 (pcre_uint32)(end_subject - start_match) < study->minlength)
6716 {
6717 rc = MATCH_NOMATCH;
6718 break;
6719 }
6720
6721 /* If req_char is set, we know that that character must appear in the
6722 subject for the match to succeed. If the first character is set, req_char
6723 must be later in the subject; otherwise the test starts at the match point.
6724 This optimization can save a huge amount of backtracking in patterns with
6725 nested unlimited repeats that aren't going to match. Writing separate code
6726 for cased/caseless versions makes it go faster, as does using an
6727 autoincrement and backing off on a match.
6728
6729 HOWEVER: when the subject string is very, very long, searching to its end
6730 can take a long time, and give bad performance on quite ordinary patterns.
6731 This showed up when somebody was matching something like /^\d+C/ on a
6732 32-megabyte string... so we don't do this when the string is sufficiently
6733 long. */
6734
6735 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6736 {
6737 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6738
6739 /* We don't need to repeat the search if we haven't yet reached the
6740 place we found it at last time. */
6741
6742 if (p > req_char_ptr)
6743 {
6744 if (req_char != req_char2)
6745 {
6746 while (p < end_subject)
6747 {
6748 register int pp = *p++;
6749 if (pp == req_char || pp == req_char2) { p--; break; }
6750 }
6751 }
6752 else
6753 {
6754 while (p < end_subject)
6755 {
6756 if (*p++ == req_char) { p--; break; }
6757 }
6758 }
6759
6760 /* If we can't find the required character, break the matching loop,
6761 forcing a match failure. */
6762
6763 if (p >= end_subject)
6764 {