/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 892 - (show annotations)
Wed Jan 18 17:23:20 2012 UTC (7 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 211958 byte(s)
Put top level heap frame on the stack.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
/* Two parallel six-entry tables: position k in rep_min and position k in
rep_max together describe one repeat form. Read as (min,max) pairs, with 0 in
rep_max meaning "no upper limit": entries 0-1 are (0,unlimited), entries 2-3
are (1,unlimited), entries 4-5 are (0,1).
NOTE(review): the adjacent duplicates presumably correspond to the greedy and
minimizing variants of each quantifier, indexed by opcode offset - confirm at
the use sites, which are not in this part of the file. */
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 {
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 if (eptr + length > md->end_subject) return -1;
221 while (length-- > 0) if (*p++ != *eptr++) return -1;
222 }
223
224 return (int)(eptr - eptr_start);
225 }
226
227
228
229 /***************************************************************************
230 ****************************************************************************
231 RECURSION IN THE match() FUNCTION
232
233 The match() function is highly recursive, though not every recursive call
234 increases the recursive depth. Nevertheless, some regular expressions can cause
235 it to recurse to a great depth. I was writing for Unix, so I just let it call
236 itself recursively. This uses the stack for saving everything that has to be
237 saved for a recursive call. On Unix, the stack can be large, and this works
238 fine.
239
240 It turns out that on some non-Unix-like systems there are problems with
241 programs that use a lot of stack. (This despite the fact that every last chip
242 has oodles of memory these days, and techniques for extending the stack have
243 been known for decades.) So....
244
245 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246 calls by keeping local variables that need to be preserved in blocks of memory
247 obtained from malloc() instead of on the stack. Macros are used to
248 achieve this so that the actual code doesn't look very different to what it
249 always used to.
250
251 The original heap-recursive code used longjmp(). However, it seems that this
252 can be very slow on some operating systems. Following a suggestion from Stan
253 Switzer, the use of longjmp() has been abolished, at the cost of having to
254 provide a unique number for each call to RMATCH. There is no way of generating
255 a sequence of numbers at compile time in C. I have given them names, to make
256 them stand out more clearly.
257
258 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 tests. Furthermore, not using longjmp() means that local dynamic variables
261 don't have indeterminate values; this has meant that the frame size can be
262 reduced because the result can be "passed back" by straight setting of the
263 variable instead of being passed in the frame.
264 ****************************************************************************
265 ***************************************************************************/
266
267 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268 below must be updated in sync. */
269
/* One identifier per RMATCH call site, numbered consecutively from 1 (RM1)
to 66 (RM66). The identifier is passed as the last ("rw") argument of RMATCH.
In the NO_RECURSE build RMATCH stores it in the frame's Xwhere field and also
pastes it into a label name (L_RM1, L_RM2, ...) that marks where execution
resumes; in the recursive build the argument is ignored. */
270 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
271 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
272 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
273 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
274 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
275 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
276 RM61, RM62, RM63, RM64, RM65, RM66 };
277
278 /* These versions of the macros use the stack, as normal. There are debugging
279 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 actually used in this definition. */
281
282 #ifndef NO_RECURSE
283 #define REGISTER register
284
285 #ifdef PCRE_DEBUG
/* Debugging RMATCH: a real recursive call to match(), bracketed by printf()
calls that report the source line. The result is stored in rrc. Besides its
own arguments the expansion uses mstart and rdepth from the scope in which it
appears, passing rdepth+1 as the new depth. The rw argument is not used. */
286 #define RMATCH(ra,rb,rc,rd,re,rw) \
287 { \
288 printf("match() called in line %d\n", __LINE__); \
289 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
290 printf("to line %d\n", __LINE__); \
291 }
/* Debugging RRETURN: prints the value and source line, then returns. Note
that ra appears twice in the expansion, so it is evaluated twice. */
292 #define RRETURN(ra) \
293 { \
294 printf("match() returned %d from line %d ", ra, __LINE__); \
295 return ra; \
296 }
297 #else
/* Production versions: RMATCH is a plain recursive call whose result goes
into rrc (no trailing semicolon is supplied by the macro), and RRETURN is a
plain return statement. */
298 #define RMATCH(ra,rb,rc,rd,re,rw) \
299 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
300 #define RRETURN(ra) return ra
301 #endif
302
303 #else
304
305
306 /* These versions of the macros manage a private stack on the heap. Note that
307 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308 argument of match(), which never changes. */
309
310 #define REGISTER
311
/* NO_RECURSE version of RMATCH: simulates a call to match() without using the
C stack. In order, the expansion
  - obtains a new heapframe through PUBL(stack_malloc), leaving via
    RRETURN(PCRE_ERROR_NOMEMORY) if that fails;
  - records rw in the CURRENT frame's Xwhere, so the resume point is known
    when the simulated call comes back;
  - fills in the new frame's argument fields from ra, rb, the current mstart,
    rc and re, and sets its depth to the current frame's Xrdepth plus one;
  - links the new frame to the current one through Xprevframe and makes it
    the current frame;
  - jumps to the HEAP_RECURSE label.
The label L_<rw>, produced by token pasting, is placed immediately after the
jump and is where execution continues after the simulated return. Because a
label is defined, each rw value may be used at only one call site. The rd
argument is not referenced. */
312 #define RMATCH(ra,rb,rc,rd,re,rw)\
313 {\
314 heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
315 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
316 frame->Xwhere = rw; \
317 newframe->Xeptr = ra;\
318 newframe->Xecode = rb;\
319 newframe->Xmstart = mstart;\
320 newframe->Xoffset_top = rc;\
321 newframe->Xeptrb = re;\
322 newframe->Xrdepth = frame->Xrdepth + 1;\
323 newframe->Xprevframe = frame;\
324 frame = newframe;\
325 DPRINTF(("restarting from line %d\n", __LINE__));\
326 goto HEAP_RECURSE;\
327 L_##rw:\
328 DPRINTF(("jumped back to line %d\n", __LINE__));\
329 }
330
/* NO_RECURSE version of RRETURN: simulates a return from match(). The current
frame is unlinked and released through PUBL(stack_free), except when it is
frame_zero, which is not released. If a previous frame exists, ra is placed in
rrc and control jumps to HEAP_RETURN; otherwise (the unlinked frame was the
top-level one, whose Xprevframe is NULL) the function really returns ra. Note
that ra is evaluated after the old frame has been released and "frame" has
been switched to the previous frame. */
331 #define RRETURN(ra)\
332 {\
333 heapframe *oldframe = frame;\
334 frame = oldframe->Xprevframe;\
335 if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);\
336 if (frame != NULL)\
337 {\
338 rrc = ra;\
339 goto HEAP_RETURN;\
340 }\
341 return ra;\
342 }
343
344
345 /* Structure for remembering the local variables in a private frame */
346
/* One simulated activation record of match() for the NO_RECURSE build. Each
RMATCH creates one of these; inside match() the fields are reached through
macros that carry the same name without the leading X (for example "eptr" for
frame->Xeptr), so the body of match() reads the same in both builds. */
347 typedef struct heapframe {
348 struct heapframe *Xprevframe; /* Frame of the "caller"; NULL marks the top level */
349
350 /* Function arguments that may change. These are the per-call copies of the
match() arguments; md is not stored here. */
351
352 PCRE_PUCHAR Xeptr;
353 const pcre_uchar *Xecode;
354 PCRE_PUCHAR Xmstart;
355 int Xoffset_top;
356 eptrblock *Xeptrb;
357 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's value + 1 */
358
359 /* Function local variables. These are the locals of match() that have to
keep their values across an RMATCH. */
360
361 PCRE_PUCHAR Xcallpat;
362 #ifdef SUPPORT_UTF
363 PCRE_PUCHAR Xcharptr;
364 #endif
365 PCRE_PUCHAR Xdata; /* Also used under the name save_mark */
366 PCRE_PUCHAR Xnext;
367 PCRE_PUCHAR Xpp;
368 PCRE_PUCHAR Xprev;
369 PCRE_PUCHAR Xsaved_eptr;
370
371 recursion_info Xnew_recursive;
372
373 BOOL Xcur_is_word; /* Also used under the name allow_zero */
374 BOOL Xcondition; /* Also used under the names cbegroup and condassert */
375 BOOL Xprev_is_word; /* Also used under the name matched_once */
376
377 #ifdef SUPPORT_UCP
378 int Xprop_type;
379 int Xprop_value;
380 int Xprop_fail_result;
381 int Xoclength;
382 pcre_uchar Xocchars[6];
383 #endif
384
385 int Xcodelink; /* Also used under the name code_offset */
386 int Xctype;
387 unsigned int Xfc; /* "fc"; in the recursive build fc is just the local c */
388 int Xfi; /* "fi"; in the recursive build fi is just the local i */
389 int Xlength;
390 int Xmax;
391 int Xmin;
392 int Xnumber; /* Also used under the name foc */
393 int Xoffset;
394 int Xop;
395 int Xsave_capture_last;
396 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397 int Xstacksave[REC_STACK_SAVE_MAX];
398
399 eptrblock Xnewptrb; /* Block that match() links onto the eptrb chain for MATCH_CBEGROUP */
400
401 /* Where to jump back to: the RMn value that RMATCH stored before starting
the simulated call made from this frame. */
402
403 int Xwhere;
404
405 } heapframe;
406
407 #endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414
415 /*************************************************
416 * Match from current position *
417 *************************************************/
418
419 /* This function is called recursively in many circumstances. Whenever it
420 returns a negative (error) response, the outer incarnation must also return the
421 same response. */
422
423 /* These macros pack up tests that are used for partial matching, and which
424 appear several times in the code. We set the "hit end" flag if the pointer is
425 at the end of the subject and also past the start of the subject (i.e.
426 something has been matched). For hard partial matching, we then return
427 immediately. The second one is used when we already know we are past the end of
428 the subject. */
429
/* CHECK_PARTIAL: does nothing unless md->partial is non-zero, eptr has
reached md->end_subject, and eptr is beyond md->start_used_ptr. When all three
hold it sets md->hitend and, if md->partial is greater than 1, leaves match()
through RRETURN(PCRE_ERROR_PARTIAL). The macro uses eptr, md and RRETURN from
the scope in which it appears. It expands to a bare "if" statement, so an
"else" written directly after a use would attach to that "if". */
430 #define CHECK_PARTIAL()\
431 if (md->partial != 0 && eptr >= md->end_subject && \
432 eptr > md->start_used_ptr) \
433 { \
434 md->hitend = TRUE; \
435 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
436 }
437
/* SCHECK_PARTIAL: the same as CHECK_PARTIAL but without the comparison of
eptr against md->end_subject, for call sites that have already established
that the end of the subject was reached. */
438 #define SCHECK_PARTIAL()\
439 if (md->partial != 0 && eptr > md->start_used_ptr) \
440 { \
441 md->hitend = TRUE; \
442 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
443 }
444
445
446 /* Performance note: It might be tempting to extract commonly used fields from
447 the md structure (e.g. utf, end_subject) into individual variables to improve
448 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449 made performance worse.
450
451 Arguments:
452 eptr pointer to current character in subject
453 ecode pointer to current position in compiled code
454 mstart pointer to the current match start position (can be modified
455 by encountering \K)
456 offset_top current top pointer
457 md pointer to "static" info for the match
458 eptrb pointer to chain of blocks containing eptr at start of
459 brackets - for testing for empty matches
460 rdepth the recursion depth
461
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
467 */
468
469 static int
470 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 unsigned int rdepth)
473 {
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
477
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf; /* Local copy of UTF flag for speed */
482
483 BOOL minimize, possessive; /* Quantifier options */
484 BOOL caseless;
485 int condcode;
486
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
489 frame on the stack here; subsequent instantiations are obtained from the heap
490 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
491 the top-level on the stack rather than malloc-ing them all gives a performance
492 boost in many cases where there is not much "recursion". */
493
494 #ifdef NO_RECURSE
495 heapframe frame_zero;
496 heapframe *frame = &frame_zero;
497 frame->Xprevframe = NULL; /* Marks the top level */
498
499 /* Copy in the original argument variables */
500
501 frame->Xeptr = eptr;
502 frame->Xecode = ecode;
503 frame->Xmstart = mstart;
504 frame->Xoffset_top = offset_top;
505 frame->Xeptrb = eptrb;
506 frame->Xrdepth = rdepth;
507
508 /* This is where control jumps back to to effect "recursion" */
509
510 HEAP_RECURSE:
511
512 /* Macros make the argument variables come from the current frame */
513
514 #define eptr frame->Xeptr
515 #define ecode frame->Xecode
516 #define mstart frame->Xmstart
517 #define offset_top frame->Xoffset_top
518 #define eptrb frame->Xeptrb
519 #define rdepth frame->Xrdepth
520
521 /* Ditto for the local variables */
522
523 #ifdef SUPPORT_UTF
524 #define charptr frame->Xcharptr
525 #endif
526 #define callpat frame->Xcallpat
527 #define codelink frame->Xcodelink
528 #define data frame->Xdata
529 #define next frame->Xnext
530 #define pp frame->Xpp
531 #define prev frame->Xprev
532 #define saved_eptr frame->Xsaved_eptr
533
534 #define new_recursive frame->Xnew_recursive
535
536 #define cur_is_word frame->Xcur_is_word
537 #define condition frame->Xcondition
538 #define prev_is_word frame->Xprev_is_word
539
540 #ifdef SUPPORT_UCP
541 #define prop_type frame->Xprop_type
542 #define prop_value frame->Xprop_value
543 #define prop_fail_result frame->Xprop_fail_result
544 #define oclength frame->Xoclength
545 #define occhars frame->Xocchars
546 #endif
547
548 #define ctype frame->Xctype
549 #define fc frame->Xfc
550 #define fi frame->Xfi
551 #define length frame->Xlength
552 #define max frame->Xmax
553 #define min frame->Xmin
554 #define number frame->Xnumber
555 #define offset frame->Xoffset
556 #define op frame->Xop
557 #define save_capture_last frame->Xsave_capture_last
558 #define save_offset1 frame->Xsave_offset1
559 #define save_offset2 frame->Xsave_offset2
560 #define save_offset3 frame->Xsave_offset3
561 #define stacksave frame->Xstacksave
562
563 #define newptrb frame->Xnewptrb
564
565 /* When recursion is being used, local variables are allocated on the stack and
566 get preserved during recursion in the normal way. In this environment, fi and
567 i, and fc and c, can be the same variables. */
568
569 #else /* NO_RECURSE not defined */
570 #define fi i
571 #define fc c
572
573 /* Many of the following variables are used only in small blocks of the code.
574 My normal style of coding would have declared them within each of those blocks.
575 However, in order to accommodate the version of this code that uses an external
576 "stack" implemented on the heap, it is easier to declare them all here, so the
577 declarations can be cut out in a block. The only declarations within blocks
578 below are for variables that do not have to be preserved over a recursive call
579 to RMATCH(). */
580
581 #ifdef SUPPORT_UTF
582 const pcre_uchar *charptr;
583 #endif
584 const pcre_uchar *callpat;
585 const pcre_uchar *data;
586 const pcre_uchar *next;
587 PCRE_PUCHAR pp;
588 const pcre_uchar *prev;
589 PCRE_PUCHAR saved_eptr;
590
591 recursion_info new_recursive;
592
593 BOOL cur_is_word;
594 BOOL condition;
595 BOOL prev_is_word;
596
597 #ifdef SUPPORT_UCP
598 int prop_type;
599 int prop_value;
600 int prop_fail_result;
601 int oclength;
602 pcre_uchar occhars[6];
603 #endif
604
605 int codelink;
606 int ctype;
607 int length;
608 int max;
609 int min;
610 int number;
611 int offset;
612 int op;
613 int save_capture_last;
614 int save_offset1, save_offset2, save_offset3;
615 int stacksave[REC_STACK_SAVE_MAX];
616
617 eptrblock newptrb;
618 #endif /* NO_RECURSE */
619
620 /* To save space on the stack and in the heap frame, I have doubled up on some
621 of the local variables that are used only in localised parts of the code, but
622 still need to be preserved over recursive calls of match(). These macros define
623 the alternative names that are used. */
624
625 #define allow_zero cur_is_word
626 #define cbegroup condition
627 #define code_offset codelink
628 #define condassert condition
629 #define matched_once prev_is_word
630 #define foc number
631 #define save_mark data
632
633 /* These statements are here to stop the compiler complaining about uninitialized
634 variables. */
635
636 #ifdef SUPPORT_UCP
637 prop_value = 0;
638 prop_fail_result = 0;
639 #endif
640
641
642 /* This label is used for tail recursion, which is used in a few cases even
643 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
644 used. Thanks to Ian Taylor for noticing this possibility and sending the
645 original patch. */
646
647 TAIL_RECURSE:
648
649 /* OK, now we can get on with the real code of the function. Recursive calls
650 are specified by the macro RMATCH and RRETURN is used to return. When
651 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
652 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
653 defined). However, RMATCH isn't like a function call because it's quite a
654 complicated macro. It has to be used in one particular way. This shouldn't,
655 however, impact performance when true recursion is being used. */
656
657 #ifdef SUPPORT_UTF
658 utf = md->utf; /* Local copy of the flag */
659 #else
660 utf = FALSE;
661 #endif
662
663 /* First check that we haven't called match() too many times, or that we
664 haven't exceeded the recursive call limit. */
665
666 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
667 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
668
669 /* At the start of a group with an unlimited repeat that may match an empty
670 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
671 done this way to save having to use another function argument, which would take
672 up space on the stack. See also MATCH_CONDASSERT below.
673
674 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
675 such remembered pointers, to be checked when we hit the closing ket, in order
676 to break infinite loops that match no characters. When match() is called in
677 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
678 NOT be used with tail recursion, because the memory block that is used is on
679 the stack, so a new one may be required for each match(). */
680
681 if (md->match_function_type == MATCH_CBEGROUP)
682 {
683 newptrb.epb_saved_eptr = eptr;
684 newptrb.epb_prev = eptrb;
685 eptrb = &newptrb;
686 md->match_function_type = 0;
687 }
688
689 /* Now start processing the opcodes. */
690
691 for (;;)
692 {
693 minimize = possessive = FALSE;
694 op = *ecode;
695
696 switch(op)
697 {
698 case OP_MARK:
699 md->nomatch_mark = ecode + 2;
700 md->mark = NULL; /* In case previously set by assertion */
701 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
702 eptrb, RM55);
703 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
704 md->mark == NULL) md->mark = ecode + 2;
705
706 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
707 argument, and we must check whether that argument matches this MARK's
708 argument. It is passed back in md->start_match_ptr (an overloading of that
709 variable). If it does match, we reset that variable to the current subject
710 position and return MATCH_SKIP. Otherwise, pass back the return code
711 unaltered. */
712
713 else if (rrc == MATCH_SKIP_ARG &&
714 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
715 {
716 md->start_match_ptr = eptr;
717 RRETURN(MATCH_SKIP);
718 }
719 RRETURN(rrc);
720
721 case OP_FAIL:
722 RRETURN(MATCH_NOMATCH);
723
724 /* COMMIT overrides PRUNE, SKIP, and THEN */
725
726 case OP_COMMIT:
727 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
728 eptrb, RM52);
729 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
730 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
731 rrc != MATCH_THEN)
732 RRETURN(rrc);
733 RRETURN(MATCH_COMMIT);
734
735 /* PRUNE overrides THEN */
736
737 case OP_PRUNE:
738 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
739 eptrb, RM51);
740 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
741 RRETURN(MATCH_PRUNE);
742
743 case OP_PRUNE_ARG:
744 md->nomatch_mark = ecode + 2;
745 md->mark = NULL; /* In case previously set by assertion */
746 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
747 eptrb, RM56);
748 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
749 md->mark == NULL) md->mark = ecode + 2;
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 RRETURN(MATCH_PRUNE);
752
753 /* SKIP overrides PRUNE and THEN */
754
755 case OP_SKIP:
756 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
757 eptrb, RM53);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
759 RRETURN(rrc);
760 md->start_match_ptr = eptr; /* Pass back current position */
761 RRETURN(MATCH_SKIP);
762
763 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
764 nomatch_mark. There is a flag that disables this opcode when re-matching a
765 pattern that ended with a SKIP for which there was not a matching MARK. */
766
767 case OP_SKIP_ARG:
768 if (md->ignore_skip_arg)
769 {
770 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
771 break;
772 }
773 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it causes a rematch
781 with the md->ignore_skip_arg flag set. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
787 the branch in which it occurs can be determined. Overload the start of
788 match pointer to do this. */
789
790 case OP_THEN:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
792 eptrb, RM54);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode;
795 RRETURN(MATCH_THEN);
796
797 case OP_THEN_ARG:
798 md->nomatch_mark = ecode + 2;
799 md->mark = NULL; /* In case previously set by assertion */
800 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
801 md, eptrb, RM58);
802 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
803 md->mark == NULL) md->mark = ecode + 2;
804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
805 md->start_match_ptr = ecode;
806 RRETURN(MATCH_THEN);
807
808 /* Handle an atomic group that does not contain any capturing parentheses.
809 This can be handled like an assertion. Prior to 8.13, all atomic groups
810 were handled this way. In 8.13, the code was changed as below for ONCE, so
811 that backups pass through the group and thereby reset captured values.
812 However, this uses a lot more stack, so in 8.20, atomic groups that do not
813 contain any captures generate OP_ONCE_NC, which can be handled in the old,
814 less stack intensive way.
815
816 Check the alternative branches in turn - the matching won't pass the KET
817 for this kind of subpattern. If any one branch matches, we carry on as at
818 the end of a normal bracket, leaving the subject pointer, but resetting
819 the start-of-match value in case it was changed by \K. */
820
821 case OP_ONCE_NC:
822 prev = ecode;
823 saved_eptr = eptr;
824 save_mark = md->mark;
825 do
826 {
827 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
828 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
829 {
830 mstart = md->start_match_ptr;
831 break;
832 }
833 if (rrc == MATCH_THEN)
834 {
835 next = ecode + GET(ecode,1);
836 if (md->start_match_ptr < next &&
837 (*ecode == OP_ALT || *next == OP_ALT))
838 rrc = MATCH_NOMATCH;
839 }
840
841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
842 ecode += GET(ecode,1);
843 md->mark = save_mark;
844 }
845 while (*ecode == OP_ALT);
846
847 /* If hit the end of the group (which could be repeated), fail */
848
849 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
850
851 /* Continue as from after the group, updating the offsets high water
852 mark, since extracts may have been taken. */
853
854 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
855
856 offset_top = md->end_offset_top;
857 eptr = md->end_match_ptr;
858
859 /* For a non-repeating ket, just continue at this level. This also
860 happens for a repeating ket if no characters were matched in the group.
861 This is the forcible breaking of infinite loops as implemented in Perl
862 5.005. */
863
864 if (*ecode == OP_KET || eptr == saved_eptr)
865 {
866 ecode += 1+LINK_SIZE;
867 break;
868 }
869
870 /* The repeating kets try the rest of the pattern or restart from the
871 preceding bracket, in the appropriate order. The second "call" of match()
872 uses tail recursion, to avoid using another stack frame. */
873
874 if (*ecode == OP_KETRMIN)
875 {
876 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
878 ecode = prev;
879 goto TAIL_RECURSE;
880 }
881 else /* OP_KETRMAX */
882 {
883 md->match_function_type = MATCH_CBEGROUP;
884 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
886 ecode += 1 + LINK_SIZE;
887 goto TAIL_RECURSE;
888 }
889 /* Control never gets here */
890
891 /* Handle a capturing bracket, other than those that are possessive with an
892 unlimited repeat. If there is space in the offset vector, save the current
893 subject position in the working slot at the top of the vector. We mustn't
894 change the current values of the data slot, because they may be set from a
895 previous iteration of this group, and be referred to by a reference inside
896 the group. A failure to match might occur after the group has succeeded,
897 if something later on doesn't match. For this reason, we need to restore
898 the working value and also the values of the final offsets, in case they
899 were set by a previous iteration of the same bracket.
900
901 If there isn't enough space in the offset vector, treat this as if it were
902 a non-capturing bracket. Don't worry about setting the flag for the error
903 case here; that is handled in the code for KET. */
904
905 case OP_CBRA:
906 case OP_SCBRA:
907 number = GET2(ecode, 1+LINK_SIZE);
908 offset = number << 1;
909
910 #ifdef PCRE_DEBUG
911 printf("start bracket %d\n", number);
912 printf("subject=");
913 pchars(eptr, 16, TRUE, md);
914 printf("\n");
915 #endif
916
917 if (offset < md->offset_max)
918 {
919 save_offset1 = md->offset_vector[offset];
920 save_offset2 = md->offset_vector[offset+1];
921 save_offset3 = md->offset_vector[md->offset_end - number];
922 save_capture_last = md->capture_last;
923 save_mark = md->mark;
924
925 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
926 md->offset_vector[md->offset_end - number] =
927 (int)(eptr - md->start_subject);
928
929 for (;;)
930 {
931 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
932 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
933 eptrb, RM1);
934 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
935
936 /* If we backed up to a THEN, check whether it is within the current
937 branch by comparing the address of the THEN that is passed back with
938 the end of the branch. If it is within the current branch, and the
939 branch is one of two or more alternatives (it either starts or ends
940 with OP_ALT), we have reached the limit of THEN's action, so convert
941 the return code to NOMATCH, which will cause normal backtracking to
942 happen from now on. Otherwise, THEN is passed back to an outer
943 alternative. This implements Perl's treatment of parenthesized groups,
944 where a group not containing | does not affect the current alternative,
945 that is, (X) is NOT the same as (X|(*F)). */
946
947 if (rrc == MATCH_THEN)
948 {
949 next = ecode + GET(ecode,1);
950 if (md->start_match_ptr < next &&
951 (*ecode == OP_ALT || *next == OP_ALT))
952 rrc = MATCH_NOMATCH;
953 }
954
955 /* Anything other than NOMATCH is passed back. */
956
957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
958 md->capture_last = save_capture_last;
959 ecode += GET(ecode, 1);
960 md->mark = save_mark;
961 if (*ecode != OP_ALT) break;
962 }
963
964 DPRINTF(("bracket %d failed\n", number));
965 md->offset_vector[offset] = save_offset1;
966 md->offset_vector[offset+1] = save_offset2;
967 md->offset_vector[md->offset_end - number] = save_offset3;
968
969 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
970
971 RRETURN(rrc);
972 }
973
974 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
975 as a non-capturing bracket. */
976
977 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
978 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
979
980 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
981
982 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
983 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
984
985 /* Non-capturing or atomic group, except for possessive with unlimited
986 repeat and ONCE group with no captures. Loop for all the alternatives.
987
988 When we get to the final alternative within the brackets, we used to return
989 the result of a recursive call to match() whatever happened so it was
990 possible to reduce stack usage by turning this into a tail recursion,
991 except in the case of a possibly empty group. However, now that there is
992 the possibility of (*THEN) occurring in the final alternative, this
993 optimization is no longer always possible.
994
995 We can optimize if we know there are no (*THEN)s in the pattern; at present
996 this is the best that can be done.
997
998 MATCH_ONCE is returned when the end of an atomic group is successfully
999 reached, but subsequent matching fails. It passes back up the tree (causing
1000 captured values to be reset) until the original atomic group level is
1001 reached. This is tested by comparing md->once_target with the start of the
1002 group. At this point, the return is converted into MATCH_NOMATCH so that
1003 previous backup points can be taken. */
1004
1005 case OP_ONCE:
1006 case OP_BRA:
1007 case OP_SBRA:
1008 DPRINTF(("start non-capturing bracket\n"));
1009
1010 for (;;)
1011 {
1012 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1013
1014 /* If this is not a possibly empty group, and there are no (*THEN)s in
1015 the pattern, and this is the final alternative, optimize as described
1016 above. */
1017
1018 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1019 {
1020 ecode += PRIV(OP_lengths)[*ecode];
1021 goto TAIL_RECURSE;
1022 }
1023
1024 /* In all other cases, we have to make another call to match(). */
1025
1026 save_mark = md->mark;
1027 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1028 RM2);
1029
1030 /* See comment in the code for capturing groups above about handling
1031 THEN. */
1032
1033 if (rrc == MATCH_THEN)
1034 {
1035 next = ecode + GET(ecode,1);
1036 if (md->start_match_ptr < next &&
1037 (*ecode == OP_ALT || *next == OP_ALT))
1038 rrc = MATCH_NOMATCH;
1039 }
1040
1041 if (rrc != MATCH_NOMATCH)
1042 {
1043 if (rrc == MATCH_ONCE)
1044 {
1045 const pcre_uchar *scode = ecode;
1046 if (*scode != OP_ONCE) /* If not at start, find it */
1047 {
1048 while (*scode == OP_ALT) scode += GET(scode, 1);
1049 scode -= GET(scode, 1);
1050 }
1051 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1052 }
1053 RRETURN(rrc);
1054 }
1055 ecode += GET(ecode, 1);
1056 md->mark = save_mark;
1057 if (*ecode != OP_ALT) break;
1058 }
1059
1060 RRETURN(MATCH_NOMATCH);
1061
1062 /* Handle possessive capturing brackets with an unlimited repeat. We come
1063 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1064 handled similarly to the normal case above. However, the matching is
1065 different. The end of these brackets will always be OP_KETRPOS, which
1066 returns MATCH_KETRPOS without going further in the pattern. By this means
1067 we can handle the group by iteration rather than recursion, thereby
1068 reducing the amount of stack needed. */
1069
1070 case OP_CBRAPOS:
1071 case OP_SCBRAPOS:
1072 allow_zero = FALSE;
1073
1074 POSSESSIVE_CAPTURE:
1075 number = GET2(ecode, 1+LINK_SIZE);
1076 offset = number << 1;
1077
1078 #ifdef PCRE_DEBUG
1079 printf("start possessive bracket %d\n", number);
1080 printf("subject=");
1081 pchars(eptr, 16, TRUE, md);
1082 printf("\n");
1083 #endif
1084
1085 if (offset < md->offset_max)
1086 {
1087 matched_once = FALSE;
1088 code_offset = (int)(ecode - md->start_code);
1089
1090 save_offset1 = md->offset_vector[offset];
1091 save_offset2 = md->offset_vector[offset+1];
1092 save_offset3 = md->offset_vector[md->offset_end - number];
1093 save_capture_last = md->capture_last;
1094
1095 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1096
1097 /* Each time round the loop, save the current subject position for use
1098 when the group matches. For MATCH_MATCH, the group has matched, so we
1099 restart it with a new subject starting position, remembering that we had
1100 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1101 usual. If we haven't matched any alternatives in any iteration, check to
1102 see if a previous iteration matched. If so, the group has matched;
1103 continue from afterwards. Otherwise it has failed; restore the previous
1104 capture values before returning NOMATCH. */
1105
1106 for (;;)
1107 {
1108 md->offset_vector[md->offset_end - number] =
1109 (int)(eptr - md->start_subject);
1110 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1111 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1112 eptrb, RM63);
1113 if (rrc == MATCH_KETRPOS)
1114 {
1115 offset_top = md->end_offset_top;
1116 eptr = md->end_match_ptr;
1117 ecode = md->start_code + code_offset;
1118 save_capture_last = md->capture_last;
1119 matched_once = TRUE;
1120 continue;
1121 }
1122
1123 /* See comment in the code for capturing groups above about handling
1124 THEN. */
1125
1126 if (rrc == MATCH_THEN)
1127 {
1128 next = ecode + GET(ecode,1);
1129 if (md->start_match_ptr < next &&
1130 (*ecode == OP_ALT || *next == OP_ALT))
1131 rrc = MATCH_NOMATCH;
1132 }
1133
1134 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1135 md->capture_last = save_capture_last;
1136 ecode += GET(ecode, 1);
1137 if (*ecode != OP_ALT) break;
1138 }
1139
1140 if (!matched_once)
1141 {
1142 md->offset_vector[offset] = save_offset1;
1143 md->offset_vector[offset+1] = save_offset2;
1144 md->offset_vector[md->offset_end - number] = save_offset3;
1145 }
1146
1147 if (allow_zero || matched_once)
1148 {
1149 ecode += 1 + LINK_SIZE;
1150 break;
1151 }
1152
1153 RRETURN(MATCH_NOMATCH);
1154 }
1155
1156 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1157 as a non-capturing bracket. */
1158
1159 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1160 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1161
1162 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1163
1164 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1165 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1166
1167 /* Non-capturing possessive bracket with unlimited repeat. We come here
1168 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1169 without the capturing complication. It is written out separately for speed
1170 and cleanliness. */
1171
1172 case OP_BRAPOS:
1173 case OP_SBRAPOS:
1174 allow_zero = FALSE;
1175
1176 POSSESSIVE_NON_CAPTURE:
1177 matched_once = FALSE;
1178 code_offset = (int)(ecode - md->start_code);
1179
1180 for (;;)
1181 {
1182 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1183 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1184 eptrb, RM48);
1185 if (rrc == MATCH_KETRPOS)
1186 {
1187 offset_top = md->end_offset_top;
1188 eptr = md->end_match_ptr;
1189 ecode = md->start_code + code_offset;
1190 matched_once = TRUE;
1191 continue;
1192 }
1193
1194 /* See comment in the code for capturing groups above about handling
1195 THEN. */
1196
1197 if (rrc == MATCH_THEN)
1198 {
1199 next = ecode + GET(ecode,1);
1200 if (md->start_match_ptr < next &&
1201 (*ecode == OP_ALT || *next == OP_ALT))
1202 rrc = MATCH_NOMATCH;
1203 }
1204
1205 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1206 ecode += GET(ecode, 1);
1207 if (*ecode != OP_ALT) break;
1208 }
1209
1210 if (matched_once || allow_zero)
1211 {
1212 ecode += 1 + LINK_SIZE;
1213 break;
1214 }
1215 RRETURN(MATCH_NOMATCH);
1216
1217 /* Control never reaches here. */
1218
1219 /* Conditional group: compilation checked that there are no more than
1220 two branches. If the condition is false, skipping the first branch takes us
1221 past the end if there is only one branch, but that's OK because that is
1222 exactly what going to the ket would do. */
1223
1224 case OP_COND:
1225 case OP_SCOND:
1226 codelink = GET(ecode, 1);
1227
1228 /* Because of the way auto-callout works during compile, a callout item is
1229 inserted between OP_COND and an assertion condition. */
1230
1231 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1232 {
1233 if (PUBL(callout) != NULL)
1234 {
1235 PUBL(callout_block) cb;
1236 cb.version = 2; /* Version 2 of the callout block */
1237 cb.callout_number = ecode[LINK_SIZE+2];
1238 cb.offset_vector = md->offset_vector;
1239 #ifdef COMPILE_PCRE8
1240 cb.subject = (PCRE_SPTR)md->start_subject;
1241 #else
1242 cb.subject = (PCRE_SPTR16)md->start_subject;
1243 #endif
1244 cb.subject_length = (int)(md->end_subject - md->start_subject);
1245 cb.start_match = (int)(mstart - md->start_subject);
1246 cb.current_position = (int)(eptr - md->start_subject);
1247 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1248 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1249 cb.capture_top = offset_top/2;
1250 cb.capture_last = md->capture_last;
1251 cb.callout_data = md->callout_data;
1252 cb.mark = md->nomatch_mark;
1253 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1254 if (rrc < 0) RRETURN(rrc);
1255 }
1256 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1257 }
1258
1259 condcode = ecode[LINK_SIZE+1];
1260
1261 /* Now see what the actual condition is */
1262
1263 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1264 {
1265 if (md->recursive == NULL) /* Not recursing => FALSE */
1266 {
1267 condition = FALSE;
1268 ecode += GET(ecode, 1);
1269 }
1270 else
1271 {
1272 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1273 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1274
1275 /* If the test is for recursion into a specific subpattern, and it is
1276 false, but the test was set up by name, scan the table to see if the
1277 name refers to any other numbers, and test them. The condition is true
1278 if any one is set. */
1279
1280 if (!condition && condcode == OP_NRREF)
1281 {
1282 pcre_uchar *slotA = md->name_table;
1283 for (i = 0; i < md->name_count; i++)
1284 {
1285 if (GET2(slotA, 0) == recno) break;
1286 slotA += md->name_entry_size;
1287 }
1288
1289 /* Found a name for the number - there can be only one; duplicate
1290 names for different numbers are allowed, but not vice versa. First
1291 scan down for duplicates. */
1292
1293 if (i < md->name_count)
1294 {
1295 pcre_uchar *slotB = slotA;
1296 while (slotB > md->name_table)
1297 {
1298 slotB -= md->name_entry_size;
1299 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1300 {
1301 condition = GET2(slotB, 0) == md->recursive->group_num;
1302 if (condition) break;
1303 }
1304 else break;
1305 }
1306
1307 /* Scan up for duplicates */
1308
1309 if (!condition)
1310 {
1311 slotB = slotA;
1312 for (i++; i < md->name_count; i++)
1313 {
1314 slotB += md->name_entry_size;
1315 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1316 {
1317 condition = GET2(slotB, 0) == md->recursive->group_num;
1318 if (condition) break;
1319 }
1320 else break;
1321 }
1322 }
1323 }
1324 }
1325
1326 /* Choose branch according to the condition */
1327
1328 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1329 }
1330 }
1331
1332 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1333 {
1334 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1335 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1336
1337 /* If the numbered capture is unset, but the reference was by name,
1338 scan the table to see if the name refers to any other numbers, and test
1339 them. The condition is true if any one is set. This is tediously similar
1340 to the code above, but not close enough to try to amalgamate. */
1341
1342 if (!condition && condcode == OP_NCREF)
1343 {
1344 int refno = offset >> 1;
1345 pcre_uchar *slotA = md->name_table;
1346
1347 for (i = 0; i < md->name_count; i++)
1348 {
1349 if (GET2(slotA, 0) == refno) break;
1350 slotA += md->name_entry_size;
1351 }
1352
1353 /* Found a name for the number - there can be only one; duplicate names
1354 for different numbers are allowed, but not vice versa. First scan down
1355 for duplicates. */
1356
1357 if (i < md->name_count)
1358 {
1359 pcre_uchar *slotB = slotA;
1360 while (slotB > md->name_table)
1361 {
1362 slotB -= md->name_entry_size;
1363 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1364 {
1365 offset = GET2(slotB, 0) << 1;
1366 condition = offset < offset_top &&
1367 md->offset_vector[offset] >= 0;
1368 if (condition) break;
1369 }
1370 else break;
1371 }
1372
1373 /* Scan up for duplicates */
1374
1375 if (!condition)
1376 {
1377 slotB = slotA;
1378 for (i++; i < md->name_count; i++)
1379 {
1380 slotB += md->name_entry_size;
1381 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1382 {
1383 offset = GET2(slotB, 0) << 1;
1384 condition = offset < offset_top &&
1385 md->offset_vector[offset] >= 0;
1386 if (condition) break;
1387 }
1388 else break;
1389 }
1390 }
1391 }
1392 }
1393
1394 /* Choose branch according to the condition */
1395
1396 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1397 }
1398
1399 else if (condcode == OP_DEF) /* DEFINE - always false */
1400 {
1401 condition = FALSE;
1402 ecode += GET(ecode, 1);
1403 }
1404
1405 /* The condition is an assertion. Call match() to evaluate it - setting
1406 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1407 an assertion. */
1408
1409 else
1410 {
1411 md->match_function_type = MATCH_CONDASSERT;
1412 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1413 if (rrc == MATCH_MATCH)
1414 {
1415 if (md->end_offset_top > offset_top)
1416 offset_top = md->end_offset_top; /* Captures may have happened */
1417 condition = TRUE;
1418 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1419 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1420 }
1421
1422 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1423 assertion; it is therefore treated as NOMATCH. */
1424
1425 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1426 {
1427 RRETURN(rrc); /* Need braces because of following else */
1428 }
1429 else
1430 {
1431 condition = FALSE;
1432 ecode += codelink;
1433 }
1434 }
1435
1436 /* We are now at the branch that is to be obeyed. As there is only one, can
1437 use tail recursion to avoid using another stack frame, except when there is
1438 unlimited repeat of a possibly empty group. In the latter case, a recursive
1439 call to match() is always required, unless the second alternative doesn't
1440 exist, in which case we can just plough on. Note that, for compatibility
1441 with Perl, the | in a conditional group is NOT treated as creating two
1442 alternatives. If a THEN is encountered in the branch, it propagates out to
1443 the enclosing alternative (unless nested in a deeper set of alternatives,
1444 of course). */
1445
1446 if (condition || *ecode == OP_ALT)
1447 {
1448 if (op != OP_SCOND)
1449 {
1450 ecode += 1 + LINK_SIZE;
1451 goto TAIL_RECURSE;
1452 }
1453
1454 md->match_function_type = MATCH_CBEGROUP;
1455 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1456 RRETURN(rrc);
1457 }
1458
1459 /* Condition false & no alternative; continue after the group. */
1460
1461 else
1462 {
1463 ecode += 1 + LINK_SIZE;
1464 }
1465 break;
1466
1467
1468 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1469 to close any currently open capturing brackets. */
1470
1471 case OP_CLOSE:
1472 number = GET2(ecode, 1);
1473 offset = number << 1;
1474
1475 #ifdef PCRE_DEBUG
1476 printf("end bracket %d at *ACCEPT", number);
1477 printf("\n");
1478 #endif
1479
1480 md->capture_last = number;
1481 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1482 {
1483 md->offset_vector[offset] =
1484 md->offset_vector[md->offset_end - number];
1485 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1486 if (offset_top <= offset) offset_top = offset + 2;
1487 }
1488 ecode += 1 + IMM2_SIZE;
1489 break;
1490
1491
1492 /* End of the pattern, either real or forced. */
1493
1494 case OP_END:
1495 case OP_ACCEPT:
1496 case OP_ASSERT_ACCEPT:
1497
1498 /* If we have matched an empty string, fail if not in an assertion and not
1499 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1500 is set and we have matched at the start of the subject. In both cases,
1501 backtracking will then try other alternatives, if any. */
1502
1503 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1504 md->recursive == NULL &&
1505 (md->notempty ||
1506 (md->notempty_atstart &&
1507 mstart == md->start_subject + md->start_offset)))
1508 RRETURN(MATCH_NOMATCH);
1509
1510 /* Otherwise, we have a match. */
1511
1512 md->end_match_ptr = eptr; /* Record where we ended */
1513 md->end_offset_top = offset_top; /* and how many extracts were taken */
1514 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1515
1516 /* For some reason, the macros don't work properly if an expression is
1517 given as the argument to RRETURN when the heap is in use. */
1518
1519 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1520 RRETURN(rrc);
1521
1522 /* Assertion brackets. Check the alternative branches in turn - the
1523 matching won't pass the KET for an assertion. If any one branch matches,
1524 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1525 start of each branch to move the current point backwards, so the code at
1526 this level is identical to the lookahead case. When the assertion is part
1527 of a condition, we want to return immediately afterwards. The caller of
1528 this incarnation of the match() function will have set MATCH_CONDASSERT in
1529 md->match_function_type, and one of these opcodes will be the first opcode
1530 that is processed. We use a local variable that is preserved over calls to
1531 match() to remember this case. */
1532
1533 case OP_ASSERT:
1534 case OP_ASSERTBACK:
1535 save_mark = md->mark;
1536 if (md->match_function_type == MATCH_CONDASSERT)
1537 {
1538 condassert = TRUE;
1539 md->match_function_type = 0;
1540 }
1541 else condassert = FALSE;
1542
1543 do
1544 {
1545 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1546 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1547 {
1548 mstart = md->start_match_ptr; /* In case \K reset it */
1549 break;
1550 }
1551
1552 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1553 as NOMATCH. */
1554
1555 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1556 ecode += GET(ecode, 1);
1557 md->mark = save_mark;
1558 }
1559 while (*ecode == OP_ALT);
1560
1561 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1562
1563 /* If checking an assertion for a condition, return MATCH_MATCH. */
1564
1565 if (condassert) RRETURN(MATCH_MATCH);
1566
1567 /* Continue from after the assertion, updating the offsets high water
1568 mark, since extracts may have been taken during the assertion. */
1569
1570 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1571 ecode += 1 + LINK_SIZE;
1572 offset_top = md->end_offset_top;
1573 continue;
1574
1575 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1576 PRUNE, or COMMIT means we must assume failure without checking subsequent
1577 branches. */
1578
1579 case OP_ASSERT_NOT:
1580 case OP_ASSERTBACK_NOT:
1581 save_mark = md->mark;
1582 if (md->match_function_type == MATCH_CONDASSERT)
1583 {
1584 condassert = TRUE;
1585 md->match_function_type = 0;
1586 }
1587 else condassert = FALSE;
1588
1589 do
1590 {
1591 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1592 md->mark = save_mark;
1593 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1594 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1595 {
1596 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1597 break;
1598 }
1599
1600 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1601 as NOMATCH. */
1602
1603 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1604 ecode += GET(ecode,1);
1605 }
1606 while (*ecode == OP_ALT);
1607
1608 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1609
1610 ecode += 1 + LINK_SIZE;
1611 continue;
1612
1613 /* Move the subject pointer back. This occurs only at the start of
1614 each branch of a lookbehind assertion. If we are too close to the start to
1615 move back, this match function fails. When working with UTF-8 we move
1616 back a number of characters, not bytes. */
1617
1618 case OP_REVERSE:
1619 #ifdef SUPPORT_UTF
1620 if (utf)
1621 {
1622 i = GET(ecode, 1);
1623 while (i-- > 0)
1624 {
1625 eptr--;
1626 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1627 BACKCHAR(eptr);
1628 }
1629 }
1630 else
1631 #endif
1632
1633 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1634
1635 {
1636 eptr -= GET(ecode, 1);
1637 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1638 }
1639
1640 /* Save the earliest consulted character, then skip to next op code */
1641
1642 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1643 ecode += 1 + LINK_SIZE;
1644 break;
1645
1646 /* The callout item calls an external function, if one is provided, passing
1647 details of the match so far. This is mainly for debugging, though the
1648 function is able to force a failure. */
1649
1650 case OP_CALLOUT:
1651 if (PUBL(callout) != NULL)
1652 {
1653 PUBL(callout_block) cb;
1654 cb.version = 2; /* Version 2 of the callout block */
1655 cb.callout_number = ecode[1];
1656 cb.offset_vector = md->offset_vector;
1657 #ifdef COMPILE_PCRE8
1658 cb.subject = (PCRE_SPTR)md->start_subject;
1659 #else
1660 cb.subject = (PCRE_SPTR16)md->start_subject;
1661 #endif
1662 cb.subject_length = (int)(md->end_subject - md->start_subject);
1663 cb.start_match = (int)(mstart - md->start_subject);
1664 cb.current_position = (int)(eptr - md->start_subject);
1665 cb.pattern_position = GET(ecode, 2);
1666 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1667 cb.capture_top = offset_top/2;
1668 cb.capture_last = md->capture_last;
1669 cb.callout_data = md->callout_data;
1670 cb.mark = md->nomatch_mark;
1671 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1672 if (rrc < 0) RRETURN(rrc);
1673 }
1674 ecode += 2 + 2*LINK_SIZE;
1675 break;
1676
1677 /* Recursion either matches the current regex, or some subexpression. The
1678 offset data is the offset to the starting bracket from the start of the
1679 whole pattern. (This is so that it works from duplicated subpatterns.)
1680
1681 The state of the capturing groups is preserved over recursion, and
1682 re-instated afterwards. We don't know how many are started and not yet
1683 finished (offset_top records the completed total) so we just have to save
1684 all the potential data. There may be up to 65535 such values, which is too
1685 large to put on the stack, but using malloc for small numbers seems
1686 expensive. As a compromise, the stack is used when there are no more than
1687 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1688
1689 There are also other values that have to be saved. We use a chained
1690 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1691 for the original version of this logic. It has, however, been hacked around
1692 a lot, so he is not to blame for the current way it works. */
1693
1694 case OP_RECURSE:
1695 {
1696 recursion_info *ri;
1697 int recno;
1698
1699 callpat = md->start_code + GET(ecode, 1);
1700 recno = (callpat == md->start_code)? 0 :
1701 GET2(callpat, 1 + LINK_SIZE);
1702
1703 /* Check for repeating a recursion without advancing the subject pointer.
1704 This should catch convoluted mutual recursions. (Some simple cases are
1705 caught at compile time.) */
1706
1707 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1708 if (recno == ri->group_num && eptr == ri->subject_position)
1709 RRETURN(PCRE_ERROR_RECURSELOOP);
1710
1711 /* Add to "recursing stack" */
1712
1713 new_recursive.group_num = recno;
1714 new_recursive.subject_position = eptr;
1715 new_recursive.prevrec = md->recursive;
1716 md->recursive = &new_recursive;
1717
1718 /* Where to continue from afterwards */
1719
1720 ecode += 1 + LINK_SIZE;
1721
1722 /* Now save the offset data */
1723
1724 new_recursive.saved_max = md->offset_end;
1725 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1726 new_recursive.offset_save = stacksave;
1727 else
1728 {
1729 new_recursive.offset_save =
1730 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1731 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1732 }
1733 memcpy(new_recursive.offset_save, md->offset_vector,
1734 new_recursive.saved_max * sizeof(int));
1735
1736 /* OK, now we can do the recursion. After processing each alternative,
1737 restore the offset data. If there were nested recursions, md->recursive
1738 might be changed, so reset it before looping. */
1739
1740 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1741 cbegroup = (*callpat >= OP_SBRA);
1742 do
1743 {
1744 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1745 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1746 md, eptrb, RM6);
1747 memcpy(md->offset_vector, new_recursive.offset_save,
1748 new_recursive.saved_max * sizeof(int));
1749 md->recursive = new_recursive.prevrec;
1750 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1751 {
1752 DPRINTF(("Recursion matched\n"));
1753 if (new_recursive.offset_save != stacksave)
1754 (PUBL(free))(new_recursive.offset_save);
1755
1756 /* Set where we got to in the subject, and reset the start in case
1757 it was changed by \K. This *is* propagated back out of a recursion,
1758 for Perl compatibility. */
1759
1760 eptr = md->end_match_ptr;
1761 mstart = md->start_match_ptr;
1762 goto RECURSION_MATCHED; /* Exit loop; end processing */
1763 }
1764
1765 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1766 as NOMATCH. */
1767
1768 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1769 {
1770 DPRINTF(("Recursion gave error %d\n", rrc));
1771 if (new_recursive.offset_save != stacksave)
1772 (PUBL(free))(new_recursive.offset_save);
1773 RRETURN(rrc);
1774 }
1775
1776 md->recursive = &new_recursive;
1777 callpat += GET(callpat, 1);
1778 }
1779 while (*callpat == OP_ALT);
1780
1781 DPRINTF(("Recursion didn't match\n"));
1782 md->recursive = new_recursive.prevrec;
1783 if (new_recursive.offset_save != stacksave)
1784 (PUBL(free))(new_recursive.offset_save);
1785 RRETURN(MATCH_NOMATCH);
1786 }
1787
1788 RECURSION_MATCHED:
1789 break;
1790
1791 /* An alternation is the end of a branch; scan along to find the end of the
1792 bracketed group and go to there. */
1793
1794 case OP_ALT:
1795 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1796 break;
1797
1798 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1799 indicating that it may occur zero times. It may repeat infinitely, or not
1800 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1801 with fixed upper repeat limits are compiled as a number of copies, with the
1802 optional ones preceded by BRAZERO or BRAMINZERO. */
1803
1804 case OP_BRAZERO:
1805 next = ecode + 1;
1806 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1808 do next += GET(next, 1); while (*next == OP_ALT);
1809 ecode = next + 1 + LINK_SIZE;
1810 break;
1811
1812 case OP_BRAMINZERO:
1813 next = ecode + 1;
1814 do next += GET(next, 1); while (*next == OP_ALT);
1815 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1817 ecode++;
1818 break;
1819
1820 case OP_SKIPZERO:
1821 next = ecode+1;
1822 do next += GET(next,1); while (*next == OP_ALT);
1823 ecode = next + 1 + LINK_SIZE;
1824 break;
1825
1826 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1827 here; just jump to the group, with allow_zero set TRUE. */
1828
1829 case OP_BRAPOSZERO:
1830 op = *(++ecode);
1831 allow_zero = TRUE;
1832 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1833 goto POSSESSIVE_NON_CAPTURE;
1834
1835 /* End of a group, repeated or non-repeating. */
1836
1837 case OP_KET:
1838 case OP_KETRMIN:
1839 case OP_KETRMAX:
1840 case OP_KETRPOS:
1841 prev = ecode - GET(ecode, 1);
1842
1843 /* If this was a group that remembered the subject start, in order to break
1844 infinite repeats of empty string matches, retrieve the subject start from
1845 the chain. Otherwise, set it NULL. */
1846
1847 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1848 {
1849 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1850 eptrb = eptrb->epb_prev; /* Backup to previous group */
1851 }
1852 else saved_eptr = NULL;
1853
1854 /* If we are at the end of an assertion group or a non-capturing atomic
1855 group, stop matching and return MATCH_MATCH, but record the current high
1856 water mark for use by positive assertions. We also need to record the match
1857 start in case it was changed by \K. */
1858
1859 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1860 *prev == OP_ONCE_NC)
1861 {
1862 md->end_match_ptr = eptr; /* For ONCE_NC */
1863 md->end_offset_top = offset_top;
1864 md->start_match_ptr = mstart;
1865 RRETURN(MATCH_MATCH); /* Sets md->mark */
1866 }
1867
1868 /* For capturing groups we have to check the group number back at the start
1869 and if necessary complete handling an extraction by setting the offsets and
1870 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1871 into group 0, so it won't be picked up here. Instead, we catch it when the
1872 OP_END is reached. Other recursion is handled here. We just have to record
1873 the current subject position and start match pointer and give a MATCH
1874 return. */
1875
1876 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1877 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1878 {
1879 number = GET2(prev, 1+LINK_SIZE);
1880 offset = number << 1;
1881
1882 #ifdef PCRE_DEBUG
1883 printf("end bracket %d", number);
1884 printf("\n");
1885 #endif
1886
1887 /* Handle a recursively called group. */
1888
1889 if (md->recursive != NULL && md->recursive->group_num == number)
1890 {
1891 md->end_match_ptr = eptr;
1892 md->start_match_ptr = mstart;
1893 RRETURN(MATCH_MATCH);
1894 }
1895
1896 /* Deal with capturing */
1897
1898 md->capture_last = number;
1899 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1900 {
1901 /* If offset is greater than offset_top, it means that we are
1902 "skipping" a capturing group, and that group's offsets must be marked
1903 unset. In earlier versions of PCRE, all the offsets were unset at the
1904 start of matching, but this doesn't work because atomic groups and
1905 assertions can cause a value to be set that should later be unset.
1906 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1907 part of the atomic group, but this is not on the final matching path,
1908 so must be unset when 2 is set. (If there is no group 2, there is no
1909 problem, because offset_top will then be 2, indicating no capture.) */
1910
1911 if (offset > offset_top)
1912 {
1913 register int *iptr = md->offset_vector + offset_top;
1914 register int *iend = md->offset_vector + offset;
1915 while (iptr < iend) *iptr++ = -1;
1916 }
1917
1918 /* Now make the extraction */
1919
1920 md->offset_vector[offset] =
1921 md->offset_vector[md->offset_end - number];
1922 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1923 if (offset_top <= offset) offset_top = offset + 2;
1924 }
1925 }
1926
1927 /* For an ordinary non-repeating ket, just continue at this level. This
1928 also happens for a repeating ket if no characters were matched in the
1929 group. This is the forcible breaking of infinite loops as implemented in
1930 Perl 5.005. For a non-repeating atomic group that includes captures,
1931 establish a backup point by processing the rest of the pattern at a lower
1932 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1933 original OP_ONCE level, thereby bypassing intermediate backup points, but
1934 resetting any captures that happened along the way. */
1935
1936 if (*ecode == OP_KET || eptr == saved_eptr)
1937 {
1938 if (*prev == OP_ONCE)
1939 {
1940 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1942 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1943 RRETURN(MATCH_ONCE);
1944 }
1945 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1946 break;
1947 }
1948
1949 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1950 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1951 at a time from the outer level, thus saving stack. */
1952
1953 if (*ecode == OP_KETRPOS)
1954 {
1955 md->end_match_ptr = eptr;
1956 md->end_offset_top = offset_top;
1957 RRETURN(MATCH_KETRPOS);
1958 }
1959
1960 /* The normal repeating kets try the rest of the pattern or restart from
1961 the preceding bracket, in the appropriate order. In the second case, we can
1962 use tail recursion to avoid using another stack frame, unless we have an
1963 atomic group or an unlimited repeat of a group that can match an empty
1964 string. */
1965
1966 if (*ecode == OP_KETRMIN)
1967 {
1968 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1970 if (*prev == OP_ONCE)
1971 {
1972 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1973 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1974 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1975 RRETURN(MATCH_ONCE);
1976 }
1977 if (*prev >= OP_SBRA) /* Could match an empty string */
1978 {
1979 md->match_function_type = MATCH_CBEGROUP;
1980 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1981 RRETURN(rrc);
1982 }
1983 ecode = prev;
1984 goto TAIL_RECURSE;
1985 }
1986 else /* OP_KETRMAX */
1987 {
1988 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1989 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1990 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1992 if (*prev == OP_ONCE)
1993 {
1994 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1996 md->once_target = prev;
1997 RRETURN(MATCH_ONCE);
1998 }
1999 ecode += 1 + LINK_SIZE;
2000 goto TAIL_RECURSE;
2001 }
2002 /* Control never gets here */
2003
2004 /* Not multiline mode: start of subject assertion, unless notbol. */
2005
2006 case OP_CIRC:
2007 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2008
2009 /* Start of subject assertion */
2010
2011 case OP_SOD:
2012 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2013 ecode++;
2014 break;
2015
2016 /* Multiline mode: start of subject unless notbol, or after any newline. */
2017
2018 case OP_CIRCM:
2019 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2020 if (eptr != md->start_subject &&
2021 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2022 RRETURN(MATCH_NOMATCH);
2023 ecode++;
2024 break;
2025
2026 /* Start of match assertion */
2027
2028 case OP_SOM:
2029 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2030 ecode++;
2031 break;
2032
2033 /* Reset the start of match point */
2034
2035 case OP_SET_SOM:
2036 mstart = eptr;
2037 ecode++;
2038 break;
2039
2040 /* Multiline mode: assert before any newline, or before end of subject
2041 unless noteol is set. */
2042
2043 case OP_DOLLM:
2044 if (eptr < md->end_subject)
2045 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2046 else
2047 {
2048 if (md->noteol) RRETURN(MATCH_NOMATCH);
2049 SCHECK_PARTIAL();
2050 }
2051 ecode++;
2052 break;
2053
2054 /* Not multiline mode: assert before a terminating newline or before end of
2055 subject unless noteol is set. */
2056
2057 case OP_DOLL:
2058 if (md->noteol) RRETURN(MATCH_NOMATCH);
2059 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2060
2061 /* ... else fall through for endonly */
2062
2063 /* End of subject assertion (\z) */
2064
2065 case OP_EOD:
2066 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2067 SCHECK_PARTIAL();
2068 ecode++;
2069 break;
2070
2071 /* End of subject or ending \n assertion (\Z) */
2072
2073 case OP_EODN:
2074 ASSERT_NL_OR_EOS:
2075 if (eptr < md->end_subject &&
2076 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2077 RRETURN(MATCH_NOMATCH);
2078
2079 /* Either at end of string or \n before end. */
2080
2081 SCHECK_PARTIAL();
2082 ecode++;
2083 break;
2084
2085 /* Word boundary assertions */
2086
2087 case OP_NOT_WORD_BOUNDARY:
2088 case OP_WORD_BOUNDARY:
2089 {
2090
2091 /* Find out if the previous and current characters are "word" characters.
2092 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2093 be "non-word" characters. Remember the earliest consulted character for
2094 partial matching. */
2095
2096 #ifdef SUPPORT_UTF
2097 if (utf)
2098 {
2099 /* Get status of previous character */
2100
2101 if (eptr == md->start_subject) prev_is_word = FALSE; else
2102 {
2103 PCRE_PUCHAR lastptr = eptr - 1;
2104 BACKCHAR(lastptr);
2105 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2106 GETCHAR(c, lastptr);
2107 #ifdef SUPPORT_UCP
2108 if (md->use_ucp)
2109 {
2110 if (c == '_') prev_is_word = TRUE; else
2111 {
2112 int cat = UCD_CATEGORY(c);
2113 prev_is_word = (cat == ucp_L || cat == ucp_N);
2114 }
2115 }
2116 else
2117 #endif
2118 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2119 }
2120
2121 /* Get status of next character */
2122
2123 if (eptr >= md->end_subject)
2124 {
2125 SCHECK_PARTIAL();
2126 cur_is_word = FALSE;
2127 }
2128 else
2129 {
2130 GETCHAR(c, eptr);
2131 #ifdef SUPPORT_UCP
2132 if (md->use_ucp)
2133 {
2134 if (c == '_') cur_is_word = TRUE; else
2135 {
2136 int cat = UCD_CATEGORY(c);
2137 cur_is_word = (cat == ucp_L || cat == ucp_N);
2138 }
2139 }
2140 else
2141 #endif
2142 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2143 }
2144 }
2145 else
2146 #endif
2147
2148 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2149 consistency with the behaviour of \w we do use it in this case. */
2150
2151 {
2152 /* Get status of previous character */
2153
2154 if (eptr == md->start_subject) prev_is_word = FALSE; else
2155 {
2156 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2157 #ifdef SUPPORT_UCP
2158 if (md->use_ucp)
2159 {
2160 c = eptr[-1];
2161 if (c == '_') prev_is_word = TRUE; else
2162 {
2163 int cat = UCD_CATEGORY(c);
2164 prev_is_word = (cat == ucp_L || cat == ucp_N);
2165 }
2166 }
2167 else
2168 #endif
2169 prev_is_word = MAX_255(eptr[-1])
2170 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2171 }
2172
2173 /* Get status of next character */
2174
2175 if (eptr >= md->end_subject)
2176 {
2177 SCHECK_PARTIAL();
2178 cur_is_word = FALSE;
2179 }
2180 else
2181 #ifdef SUPPORT_UCP
2182 if (md->use_ucp)
2183 {
2184 c = *eptr;
2185 if (c == '_') cur_is_word = TRUE; else
2186 {
2187 int cat = UCD_CATEGORY(c);
2188 cur_is_word = (cat == ucp_L || cat == ucp_N);
2189 }
2190 }
2191 else
2192 #endif
2193 cur_is_word = MAX_255(*eptr)
2194 && ((md->ctypes[*eptr] & ctype_word) != 0);
2195 }
2196
2197 /* Now see if the situation is what we want */
2198
2199 if ((*ecode++ == OP_WORD_BOUNDARY)?
2200 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2201 RRETURN(MATCH_NOMATCH);
2202 }
2203 break;
2204
2205 /* Match a single character type; inline for speed */
2206
2207 case OP_ANY:
2208 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2209 /* Fall through */
2210
2211 case OP_ALLANY:
2212 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2213 { /* not be updated before SCHECK_PARTIAL. */
2214 SCHECK_PARTIAL();
2215 RRETURN(MATCH_NOMATCH);
2216 }
2217 eptr++;
2218 #ifdef SUPPORT_UTF
2219 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2220 #endif
2221 ecode++;
2222 break;
2223
2224 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2225 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2226
2227 case OP_ANYBYTE:
2228 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2229 { /* not be updated before SCHECK_PARTIAL. */
2230 SCHECK_PARTIAL();
2231 RRETURN(MATCH_NOMATCH);
2232 }
2233 eptr++;
2234 ecode++;
2235 break;
2236
2237 case OP_NOT_DIGIT:
2238 if (eptr >= md->end_subject)
2239 {
2240 SCHECK_PARTIAL();
2241 RRETURN(MATCH_NOMATCH);
2242 }
2243 GETCHARINCTEST(c, eptr);
2244 if (
2245 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2246 c < 256 &&
2247 #endif
2248 (md->ctypes[c] & ctype_digit) != 0
2249 )
2250 RRETURN(MATCH_NOMATCH);
2251 ecode++;
2252 break;
2253
2254 case OP_DIGIT:
2255 if (eptr >= md->end_subject)
2256 {
2257 SCHECK_PARTIAL();
2258 RRETURN(MATCH_NOMATCH);
2259 }
2260 GETCHARINCTEST(c, eptr);
2261 if (
2262 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2263 c > 255 ||
2264 #endif
2265 (md->ctypes[c] & ctype_digit) == 0
2266 )
2267 RRETURN(MATCH_NOMATCH);
2268 ecode++;
2269 break;
2270
2271 case OP_NOT_WHITESPACE:
2272 if (eptr >= md->end_subject)
2273 {
2274 SCHECK_PARTIAL();
2275 RRETURN(MATCH_NOMATCH);
2276 }
2277 GETCHARINCTEST(c, eptr);
2278 if (
2279 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2280 c < 256 &&
2281 #endif
2282 (md->ctypes[c] & ctype_space) != 0
2283 )
2284 RRETURN(MATCH_NOMATCH);
2285 ecode++;
2286 break;
2287
2288 case OP_WHITESPACE:
2289 if (eptr >= md->end_subject)
2290 {
2291 SCHECK_PARTIAL();
2292 RRETURN(MATCH_NOMATCH);
2293 }
2294 GETCHARINCTEST(c, eptr);
2295 if (
2296 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2297 c > 255 ||
2298 #endif
2299 (md->ctypes[c] & ctype_space) == 0
2300 )
2301 RRETURN(MATCH_NOMATCH);
2302 ecode++;
2303 break;
2304
2305 case OP_NOT_WORDCHAR:
2306 if (eptr >= md->end_subject)
2307 {
2308 SCHECK_PARTIAL();
2309 RRETURN(MATCH_NOMATCH);
2310 }
2311 GETCHARINCTEST(c, eptr);
2312 if (
2313 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2314 c < 256 &&
2315 #endif
2316 (md->ctypes[c] & ctype_word) != 0
2317 )
2318 RRETURN(MATCH_NOMATCH);
2319 ecode++;
2320 break;
2321
2322 case OP_WORDCHAR:
2323 if (eptr >= md->end_subject)
2324 {
2325 SCHECK_PARTIAL();
2326 RRETURN(MATCH_NOMATCH);
2327 }
2328 GETCHARINCTEST(c, eptr);
2329 if (
2330 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2331 c > 255 ||
2332 #endif
2333 (md->ctypes[c] & ctype_word) == 0
2334 )
2335 RRETURN(MATCH_NOMATCH);
2336 ecode++;
2337 break;
2338
2339 case OP_ANYNL:
2340 if (eptr >= md->end_subject)
2341 {
2342 SCHECK_PARTIAL();
2343 RRETURN(MATCH_NOMATCH);
2344 }
2345 GETCHARINCTEST(c, eptr);
2346 switch(c)
2347 {
2348 default: RRETURN(MATCH_NOMATCH);
2349
2350 case 0x000d:
2351 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2352 break;
2353
2354 case 0x000a:
2355 break;
2356
2357 case 0x000b:
2358 case 0x000c:
2359 case 0x0085:
2360 case 0x2028:
2361 case 0x2029:
2362 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2363 break;
2364 }
2365 ecode++;
2366 break;
2367
2368 case OP_NOT_HSPACE:
2369 if (eptr >= md->end_subject)
2370 {
2371 SCHECK_PARTIAL();
2372 RRETURN(MATCH_NOMATCH);
2373 }
2374 GETCHARINCTEST(c, eptr);
2375 switch(c)
2376 {
2377 default: break;
2378 case 0x09: /* HT */
2379 case 0x20: /* SPACE */
2380 case 0xa0: /* NBSP */
2381 case 0x1680: /* OGHAM SPACE MARK */
2382 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2383 case 0x2000: /* EN QUAD */
2384 case 0x2001: /* EM QUAD */
2385 case 0x2002: /* EN SPACE */
2386 case 0x2003: /* EM SPACE */
2387 case 0x2004: /* THREE-PER-EM SPACE */
2388 case 0x2005: /* FOUR-PER-EM SPACE */
2389 case 0x2006: /* SIX-PER-EM SPACE */
2390 case 0x2007: /* FIGURE SPACE */
2391 case 0x2008: /* PUNCTUATION SPACE */
2392 case 0x2009: /* THIN SPACE */
2393 case 0x200A: /* HAIR SPACE */
2394 case 0x202f: /* NARROW NO-BREAK SPACE */
2395 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2396 case 0x3000: /* IDEOGRAPHIC SPACE */
2397 RRETURN(MATCH_NOMATCH);
2398 }
2399 ecode++;
2400 break;
2401
2402 case OP_HSPACE:
2403 if (eptr >= md->end_subject)
2404 {
2405 SCHECK_PARTIAL();
2406 RRETURN(MATCH_NOMATCH);
2407 }
2408 GETCHARINCTEST(c, eptr);
2409 switch(c)
2410 {
2411 default: RRETURN(MATCH_NOMATCH);
2412 case 0x09: /* HT */
2413 case 0x20: /* SPACE */
2414 case 0xa0: /* NBSP */
2415 case 0x1680: /* OGHAM SPACE MARK */
2416 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2417 case 0x2000: /* EN QUAD */
2418 case 0x2001: /* EM QUAD */
2419 case 0x2002: /* EN SPACE */
2420 case 0x2003: /* EM SPACE */
2421 case 0x2004: /* THREE-PER-EM SPACE */
2422 case 0x2005: /* FOUR-PER-EM SPACE */
2423 case 0x2006: /* SIX-PER-EM SPACE */
2424 case 0x2007: /* FIGURE SPACE */
2425 case 0x2008: /* PUNCTUATION SPACE */
2426 case 0x2009: /* THIN SPACE */
2427 case 0x200A: /* HAIR SPACE */
2428 case 0x202f: /* NARROW NO-BREAK SPACE */
2429 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2430 case 0x3000: /* IDEOGRAPHIC SPACE */
2431 break;
2432 }
2433 ecode++;
2434 break;
2435
2436 case OP_NOT_VSPACE:
2437 if (eptr >= md->end_subject)
2438 {
2439 SCHECK_PARTIAL();
2440 RRETURN(MATCH_NOMATCH);
2441 }
2442 GETCHARINCTEST(c, eptr);
2443 switch(c)
2444 {
2445 default: break;
2446 case 0x0a: /* LF */
2447 case 0x0b: /* VT */
2448 case 0x0c: /* FF */
2449 case 0x0d: /* CR */
2450 case 0x85: /* NEL */
2451 case 0x2028: /* LINE SEPARATOR */
2452 case 0x2029: /* PARAGRAPH SEPARATOR */
2453 RRETURN(MATCH_NOMATCH);
2454 }
2455 ecode++;
2456 break;
2457
2458 case OP_VSPACE:
2459 if (eptr >= md->end_subject)
2460 {
2461 SCHECK_PARTIAL();
2462 RRETURN(MATCH_NOMATCH);
2463 }
2464 GETCHARINCTEST(c, eptr);
2465 switch(c)
2466 {
2467 default: RRETURN(MATCH_NOMATCH);
2468 case 0x0a: /* LF */
2469 case 0x0b: /* VT */
2470 case 0x0c: /* FF */
2471 case 0x0d: /* CR */
2472 case 0x85: /* NEL */
2473 case 0x2028: /* LINE SEPARATOR */
2474 case 0x2029: /* PARAGRAPH SEPARATOR */
2475 break;
2476 }
2477 ecode++;
2478 break;
2479
2480 #ifdef SUPPORT_UCP
2481 /* Check the next character by Unicode property. We will get here only
2482 if the support is in the binary; otherwise a compile-time error occurs. */
2483
2484 case OP_PROP:
2485 case OP_NOTPROP:
2486 if (eptr >= md->end_subject)
2487 {
2488 SCHECK_PARTIAL();
2489 RRETURN(MATCH_NOMATCH);
2490 }
2491 GETCHARINCTEST(c, eptr);
2492 {
2493 const ucd_record *prop = GET_UCD(c);
2494
2495 switch(ecode[1])
2496 {
2497 case PT_ANY:
2498 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2499 break;
2500
2501 case PT_LAMP:
2502 if ((prop->chartype == ucp_Lu ||
2503 prop->chartype == ucp_Ll ||
2504 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2505 RRETURN(MATCH_NOMATCH);
2506 break;
2507
2508 case PT_GC:
2509 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2510 RRETURN(MATCH_NOMATCH);
2511 break;
2512
2513 case PT_PC:
2514 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2515 RRETURN(MATCH_NOMATCH);
2516 break;
2517
2518 case PT_SC:
2519 if ((ecode[2] != prop->script) == (op == OP_PROP))
2520 RRETURN(MATCH_NOMATCH);
2521 break;
2522
2523 /* These are specials */
2524
2525 case PT_ALNUM:
2526 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2527 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2528 RRETURN(MATCH_NOMATCH);
2529 break;
2530
2531 case PT_SPACE: /* Perl space */
2532 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2533 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2534 == (op == OP_NOTPROP))
2535 RRETURN(MATCH_NOMATCH);
2536 break;
2537
2538 case PT_PXSPACE: /* POSIX space */
2539 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2540 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2541 c == CHAR_FF || c == CHAR_CR)
2542 == (op == OP_NOTPROP))
2543 RRETURN(MATCH_NOMATCH);
2544 break;
2545
2546 case PT_WORD:
2547 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2548 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2549 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2550 RRETURN(MATCH_NOMATCH);
2551 break;
2552
2553 /* This should never occur */
2554
2555 default:
2556 RRETURN(PCRE_ERROR_INTERNAL);
2557 }
2558
2559 ecode += 3;
2560 }
2561 break;
2562
2563 /* Match an extended Unicode sequence. We will get here only if the support
2564 is in the binary; otherwise a compile-time error occurs. */
2565
2566 case OP_EXTUNI:
2567 if (eptr >= md->end_subject)
2568 {
2569 SCHECK_PARTIAL();
2570 RRETURN(MATCH_NOMATCH);
2571 }
2572 GETCHARINCTEST(c, eptr);
2573 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2574 while (eptr < md->end_subject)
2575 {
2576 int len = 1;
2577 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2578 if (UCD_CATEGORY(c) != ucp_M) break;
2579 eptr += len;
2580 }
2581 ecode++;
2582 break;
2583 #endif
2584
2585
2586 /* Match a back reference, possibly repeatedly. Look past the end of the
2587 item to see if there is repeat information following. The code is similar
2588 to that for character classes, but repeated for efficiency. Then obey
2589 similar code to character type repeats - written out again for speed.
2590 However, if the referenced string is the empty string, always treat
2591 it as matched, any number of times (otherwise there could be infinite
2592 loops). */
2593
2594 case OP_REF:
2595 case OP_REFI:
2596 caseless = op == OP_REFI;
2597 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2598 ecode += 1 + IMM2_SIZE;
2599
2600 /* If the reference is unset, there are two possibilities:
2601
2602 (a) In the default, Perl-compatible state, set the length negative;
2603 this ensures that every attempt at a match fails. We can't just fail
2604 here, because of the possibility of quantifiers with zero minima.
2605
2606 (b) If the JavaScript compatibility flag is set, set the length to zero
2607 so that the back reference matches an empty string.
2608
2609 Otherwise, set the length to the length of what was matched by the
2610 referenced subpattern. */
2611
2612 if (offset >= offset_top || md->offset_vector[offset] < 0)
2613 length = (md->jscript_compat)? 0 : -1;
2614 else
2615 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2616
2617 /* Set up for repetition, or handle the non-repeated case */
2618
2619 switch (*ecode)
2620 {
2621 case OP_CRSTAR:
2622 case OP_CRMINSTAR:
2623 case OP_CRPLUS:
2624 case OP_CRMINPLUS:
2625 case OP_CRQUERY:
2626 case OP_CRMINQUERY:
2627 c = *ecode++ - OP_CRSTAR;
2628 minimize = (c & 1) != 0;
2629 min = rep_min[c]; /* Pick up values from tables; */
2630 max = rep_max[c]; /* zero for max => infinity */
2631 if (max == 0) max = INT_MAX;
2632 break;
2633
2634 case OP_CRRANGE:
2635 case OP_CRMINRANGE:
2636 minimize = (*ecode == OP_CRMINRANGE);
2637 min = GET2(ecode, 1);
2638 max = GET2(ecode, 1 + IMM2_SIZE);
2639 if (max == 0) max = INT_MAX;
2640 ecode += 1 + 2 * IMM2_SIZE;
2641 break;
2642
2643 default: /* No repeat follows */
2644 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2645 {
2646 CHECK_PARTIAL();
2647 RRETURN(MATCH_NOMATCH);
2648 }
2649 eptr += length;
2650 continue; /* With the main loop */
2651 }
2652
2653 /* Handle repeated back references. If the length of the reference is
2654 zero, just continue with the main loop. If the length is negative, it
2655 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2656 zero, we can continue at the same level without recursion. For any other
2657 minimum, carrying on will result in NOMATCH. */
2658
2659 if (length == 0) continue;
2660 if (length < 0 && min == 0) continue;
2661
2662 /* First, ensure the minimum number of matches are present. We get back
2663 the length of the reference string explicitly rather than passing the
2664 address of eptr, so that eptr can be a register variable. */
2665
2666 for (i = 1; i <= min; i++)
2667 {
2668 int slength;
2669 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2670 {
2671 CHECK_PARTIAL();
2672 RRETURN(MATCH_NOMATCH);
2673 }
2674 eptr += slength;
2675 }
2676
2677 /* If min = max, continue at the same level without recursion.
2678 They are not both allowed to be zero. */
2679
2680 if (min == max) continue;
2681
2682 /* If minimizing, keep trying and advancing the pointer */
2683
2684 if (minimize)
2685 {
2686 for (fi = min;; fi++)
2687 {
2688 int slength;
2689 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2690 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2691 if (fi >= max) RRETURN(MATCH_NOMATCH);
2692 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2693 {
2694 CHECK_PARTIAL();
2695 RRETURN(MATCH_NOMATCH);
2696 }
2697 eptr += slength;
2698 }
2699 /* Control never gets here */
2700 }
2701
2702 /* If maximizing, find the longest string and work backwards */
2703
2704 else
2705 {
2706 pp = eptr;
2707 for (i = min; i < max; i++)
2708 {
2709 int slength;
2710 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2711 {
2712 CHECK_PARTIAL();
2713 break;
2714 }
2715 eptr += slength;
2716 }
2717 while (eptr >= pp)
2718 {
2719 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2720 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2721 eptr -= length;
2722 }
2723 RRETURN(MATCH_NOMATCH);
2724 }
2725 /* Control never gets here */
2726
2727 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2728 used when all the characters in the class have values in the range 0-255,
2729 and either the matching is caseful, or the characters are in the range
2730 0-127 when UTF-8 processing is enabled. The only difference between
2731 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2732 encountered.
2733
2734 First, look past the end of the item to see if there is repeat information
2735 following. Then obey similar code to character type repeats - written out
2736 again for speed. */
2737
2738 case OP_NCLASS:
2739 case OP_CLASS:
2740 {
2741 /* The data variable is saved across frames, so the byte map needs to
2742 be stored there. */
2743 #define BYTE_MAP ((pcre_uint8 *)data)
2744 data = ecode + 1; /* Save for matching */
2745 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2746
2747 switch (*ecode)
2748 {
2749 case OP_CRSTAR:
2750 case OP_CRMINSTAR:
2751 case OP_CRPLUS:
2752 case OP_CRMINPLUS:
2753 case OP_CRQUERY:
2754 case OP_CRMINQUERY:
2755 c = *ecode++ - OP_CRSTAR;
2756 minimize = (c & 1) != 0;
2757 min = rep_min[c]; /* Pick up values from tables; */
2758 max = rep_max[c]; /* zero for max => infinity */
2759 if (max == 0) max = INT_MAX;
2760 break;
2761
2762 case OP_CRRANGE:
2763 case OP_CRMINRANGE:
2764 minimize = (*ecode == OP_CRMINRANGE);
2765 min = GET2(ecode, 1);
2766 max = GET2(ecode, 1 + IMM2_SIZE);
2767 if (max == 0) max = INT_MAX;
2768 ecode += 1 + 2 * IMM2_SIZE;
2769 break;
2770
2771 default: /* No repeat follows */
2772 min = max = 1;
2773 break;
2774 }
2775
2776 /* First, ensure the minimum number of matches are present. */
2777
2778 #ifdef SUPPORT_UTF
2779 if (utf)
2780 {
2781 for (i = 1; i <= min; i++)
2782 {
2783 if (eptr >= md->end_subject)
2784 {
2785 SCHECK_PARTIAL();
2786 RRETURN(MATCH_NOMATCH);
2787 }
2788 GETCHARINC(c, eptr);
2789 if (c > 255)
2790 {
2791 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2792 }
2793 else
2794 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2795 }
2796 }
2797 else
2798 #endif
2799 /* Not UTF mode */
2800 {
2801 for (i = 1; i <= min; i++)
2802 {
2803 if (eptr >= md->end_subject)
2804 {
2805 SCHECK_PARTIAL();
2806 RRETURN(MATCH_NOMATCH);
2807 }
2808 c = *eptr++;
2809 #ifndef COMPILE_PCRE8
2810 if (c > 255)
2811 {
2812 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2813 }
2814 else
2815 #endif
2816 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2817 }
2818 }
2819
2820 /* If max == min we can continue with the main loop without the
2821 need to recurse. */
2822
2823 if (min == max) continue;
2824
2825 /* If minimizing, keep testing the rest of the expression and advancing
2826 the pointer while it matches the class. */
2827
2828 if (minimize)
2829 {
2830 #ifdef SUPPORT_UTF
2831 if (utf)
2832 {
2833 for (fi = min;; fi++)
2834 {
2835 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2837 if (fi >= max) RRETURN(MATCH_NOMATCH);
2838 if (eptr >= md->end_subject)
2839 {
2840 SCHECK_PARTIAL();
2841 RRETURN(MATCH_NOMATCH);
2842 }
2843 GETCHARINC(c, eptr);
2844 if (c > 255)
2845 {
2846 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2847 }
2848 else
2849 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2850 }
2851 }
2852 else
2853 #endif
2854 /* Not UTF mode */
2855 {
2856 for (fi = min;; fi++)
2857 {
2858 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2860 if (fi >= max) RRETURN(MATCH_NOMATCH);
2861 if (eptr >= md->end_subject)
2862 {
2863 SCHECK_PARTIAL();
2864 RRETURN(MATCH_NOMATCH);
2865 }
2866 c = *eptr++;
2867 #ifndef COMPILE_PCRE8
2868 if (c > 255)
2869 {
2870 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2871 }
2872 else
2873 #endif
2874 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2875 }
2876 }
2877 /* Control never gets here */
2878 }
2879
2880 /* If maximizing, find the longest possible run, then work backwards. */
2881
2882 else
2883 {
2884 pp = eptr;
2885
2886 #ifdef SUPPORT_UTF
2887 if (utf)
2888 {
2889 for (i = min; i < max; i++)
2890 {
2891 int len = 1;
2892 if (eptr >= md->end_subject)
2893 {
2894 SCHECK_PARTIAL();
2895 break;
2896 }
2897 GETCHARLEN(c, eptr, len);
2898 if (c > 255)
2899 {
2900 if (op == OP_CLASS) break;
2901 }
2902 else
2903 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2904 eptr += len;
2905 }
2906 for (;;)
2907 {
2908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910 if (eptr-- == pp) break; /* Stop if tried at original pos */
2911 BACKCHAR(eptr);
2912 }
2913 }
2914 else
2915 #endif
2916 /* Not UTF mode */
2917 {
2918 for (i = min; i < max; i++)
2919 {
2920 if (eptr >= md->end_subject)
2921 {
2922 SCHECK_PARTIAL();
2923 break;
2924 }
2925 c = *eptr;
2926 #ifndef COMPILE_PCRE8
2927 if (c > 255)
2928 {
2929 if (op == OP_CLASS) break;
2930 }
2931 else
2932 #endif
2933 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2934 eptr++;
2935 }
2936 while (eptr >= pp)
2937 {
2938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2940 eptr--;
2941 }
2942 }
2943
2944 RRETURN(MATCH_NOMATCH);
2945 }
2946 #undef BYTE_MAP
2947 }
2948 /* Control never gets here */
2949
2950
2951 /* Match an extended character class. This opcode is encountered only
2952 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2953 mode, because Unicode properties are supported in non-UTF-8 mode. */
2954
2955 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2956 case OP_XCLASS:
2957 {
2958 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2959 ecode += GET(ecode, 1); /* Advance past the item */
2960
2961 switch (*ecode)
2962 {
2963 case OP_CRSTAR:
2964 case OP_CRMINSTAR:
2965 case OP_CRPLUS:
2966 case OP_CRMINPLUS:
2967 case OP_CRQUERY:
2968 case OP_CRMINQUERY:
2969 c = *ecode++ - OP_CRSTAR;
2970 minimize = (c & 1) != 0;
2971 min = rep_min[c]; /* Pick up values from tables; */
2972 max = rep_max[c]; /* zero for max => infinity */
2973 if (max == 0) max = INT_MAX;
2974 break;
2975
2976 case OP_CRRANGE:
2977 case OP_CRMINRANGE:
2978 minimize = (*ecode == OP_CRMINRANGE);
2979 min = GET2(ecode, 1);
2980 max = GET2(ecode, 1 + IMM2_SIZE);
2981 if (max == 0) max = INT_MAX;
2982 ecode += 1 + 2 * IMM2_SIZE;
2983 break;
2984
2985 default: /* No repeat follows */
2986 min = max = 1;
2987 break;
2988 }
2989
2990 /* First, ensure the minimum number of matches are present. */
2991
2992 for (i = 1; i <= min; i++)
2993 {
2994 if (eptr >= md->end_subject)
2995 {
2996 SCHECK_PARTIAL();
2997 RRETURN(MATCH_NOMATCH);
2998 }
2999 GETCHARINCTEST(c, eptr);
3000 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3001 }
3002
3003 /* If max == min we can continue with the main loop without the
3004 need to recurse. */
3005
3006 if (min == max) continue;
3007
3008 /* If minimizing, keep testing the rest of the expression and advancing
3009 the pointer while it matches the class. */
3010
3011 if (minimize)
3012 {
3013 for (fi = min;; fi++)
3014 {
3015 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3017 if (fi >= max) RRETURN(MATCH_NOMATCH);
3018 if (eptr >= md->end_subject)
3019 {
3020 SCHECK_PARTIAL();
3021 RRETURN(MATCH_NOMATCH);
3022 }
3023 GETCHARINCTEST(c, eptr);
3024 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3025 }
3026 /* Control never gets here */
3027 }
3028
3029 /* If maximizing, find the longest possible run, then work backwards. */
3030
3031 else
3032 {
3033 pp = eptr;
3034 for (i = min; i < max; i++)
3035 {
3036 int len = 1;
3037 if (eptr >= md->end_subject)
3038 {
3039 SCHECK_PARTIAL();
3040 break;
3041 }
3042 #ifdef SUPPORT_UTF
3043 GETCHARLENTEST(c, eptr, len);
3044 #else
3045 c = *eptr;
3046 #endif
3047 if (!PRIV(xclass)(c, data, utf)) break;
3048 eptr += len;
3049 }
3050 for(;;)
3051 {
3052 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3053 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3054 if (eptr-- == pp) break; /* Stop if tried at original pos */
3055 #ifdef SUPPORT_UTF
3056 if (utf) BACKCHAR(eptr);
3057 #endif
3058 }
3059 RRETURN(MATCH_NOMATCH);
3060 }
3061
3062 /* Control never gets here */
3063 }
3064 #endif /* End of XCLASS */
3065
3066 /* Match a single character, casefully */
3067
3068 case OP_CHAR:
3069 #ifdef SUPPORT_UTF
3070 if (utf)
3071 {
3072 length = 1;
3073 ecode++;
3074 GETCHARLEN(fc, ecode, length);
3075 if (length > md->end_subject - eptr)
3076 {
3077 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3078 RRETURN(MATCH_NOMATCH);
3079 }
3080 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3081 }
3082 else
3083 #endif
3084 /* Not UTF mode */
3085 {
3086 if (md->end_subject - eptr < 1)
3087 {
3088 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3089 RRETURN(MATCH_NOMATCH);
3090 }
3091 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3092 ecode += 2;
3093 }
3094 break;
3095
3096 /* Match a single character, caselessly. If we are at the end of the
3097 subject, give up immediately. */
3098
3099 case OP_CHARI:
3100 if (eptr >= md->end_subject)
3101 {
3102 SCHECK_PARTIAL();
3103 RRETURN(MATCH_NOMATCH);
3104 }
3105
3106 #ifdef SUPPORT_UTF
3107 if (utf)
3108 {
3109 length = 1;
3110 ecode++;
3111 GETCHARLEN(fc, ecode, length);
3112
3113 /* If the pattern character's value is < 128, we have only one byte, and
3114 we know that its other case must also be one byte long, so we can use the
3115 fast lookup table. We know that there is at least one byte left in the
3116 subject. */
3117
3118 if (fc < 128)
3119 {
3120 if (md->lcc[fc]
3121 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3122 ecode++;
3123 eptr++;
3124 }
3125
3126 /* Otherwise we must pick up the subject character. Note that we cannot
3127 use the value of "length" to check for sufficient bytes left, because the
3128 other case of the character may have more or fewer bytes. */
3129
3130 else
3131 {
3132 unsigned int dc;
3133 GETCHARINC(dc, eptr);
3134 ecode += length;
3135
3136 /* If we have Unicode property support, we can use it to test the other
3137 case of the character, if there is one. */
3138
3139 if (fc != dc)
3140 {
3141 #ifdef SUPPORT_UCP
3142 if (dc != UCD_OTHERCASE(fc))
3143 #endif
3144 RRETURN(MATCH_NOMATCH);
3145 }
3146 }
3147 }
3148 else
3149 #endif /* SUPPORT_UTF */
3150
3151 /* Not UTF mode */
3152 {
3153 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3154 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3155 eptr++;
3156 ecode += 2;
3157 }
3158 break;
3159
3160 /* Match a single character repeatedly. */
3161
3162 case OP_EXACT:
3163 case OP_EXACTI:
3164 min = max = GET2(ecode, 1);
3165 ecode += 1 + IMM2_SIZE;
3166 goto REPEATCHAR;
3167
3168 case OP_POSUPTO:
3169 case OP_POSUPTOI:
3170 possessive = TRUE;
3171 /* Fall through */
3172
3173 case OP_UPTO:
3174 case OP_UPTOI:
3175 case OP_MINUPTO:
3176 case OP_MINUPTOI:
3177 min = 0;
3178 max = GET2(ecode, 1);
3179 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3180 ecode += 1 + IMM2_SIZE;
3181 goto REPEATCHAR;
3182
3183 case OP_POSSTAR:
3184 case OP_POSSTARI:
3185 possessive = TRUE;
3186 min = 0;
3187 max = INT_MAX;
3188 ecode++;
3189 goto REPEATCHAR;
3190
3191 case OP_POSPLUS:
3192 case OP_POSPLUSI:
3193 possessive = TRUE;
3194 min = 1;
3195 max = INT_MAX;
3196 ecode++;
3197 goto REPEATCHAR;
3198
3199 case OP_POSQUERY:
3200 case OP_POSQUERYI:
3201 possessive = TRUE;
3202 min = 0;
3203 max = 1;
3204 ecode++;
3205 goto REPEATCHAR;
3206
3207 case OP_STAR:
3208 case OP_STARI:
3209 case OP_MINSTAR:
3210 case OP_MINSTARI:
3211 case OP_PLUS:
3212 case OP_PLUSI:
3213 case OP_MINPLUS:
3214 case OP_MINPLUSI:
3215 case OP_QUERY:
3216 case OP_QUERYI:
3217 case OP_MINQUERY:
3218 case OP_MINQUERYI:
3219 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3220 minimize = (c & 1) != 0;
3221 min = rep_min[c]; /* Pick up values from tables; */
3222 max = rep_max[c]; /* zero for max => infinity */
3223 if (max == 0) max = INT_MAX;
3224
3225 /* Common code for all repeated single-character matches. */
3226
3227 REPEATCHAR:
3228 #ifdef SUPPORT_UTF
3229 if (utf)
3230 {
3231 length = 1;
3232 charptr = ecode;
3233 GETCHARLEN(fc, ecode, length);
3234 ecode += length;
3235
3236 /* Handle multibyte character matching specially here. There is
3237 support for caseless matching if UCP support is present. */
3238
3239 if (length > 1)
3240 {
3241 #ifdef SUPPORT_UCP
3242 unsigned int othercase;
3243 if (op >= OP_STARI && /* Caseless */
3244 (othercase = UCD_OTHERCASE(fc)) != fc)
3245 oclength = PRIV(ord2utf)(othercase, occhars);
3246 else oclength = 0;
3247 #endif /* SUPPORT_UCP */
3248
3249 for (i = 1; i <= min; i++)
3250 {
3251 if (eptr <= md->end_subject - length &&
3252 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3253 #ifdef SUPPORT_UCP
3254 else if (oclength > 0 &&
3255 eptr <= md->end_subject - oclength &&
3256 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3257 #endif /* SUPPORT_UCP */
3258 else
3259 {
3260 CHECK_PARTIAL();
3261 RRETURN(MATCH_NOMATCH);
3262 }
3263 }
3264
3265 if (min == max) continue;
3266
3267 if (minimize)
3268 {
3269 for (fi = min;; fi++)
3270 {
3271 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3272 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3273 if (fi >= max) RRETURN(MATCH_NOMATCH);
3274 if (eptr <= md->end_subject - length &&
3275 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3276 #ifdef SUPPORT_UCP
3277 else if (oclength > 0 &&
3278 eptr <= md->end_subject - oclength &&
3279 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3280 #endif /* SUPPORT_UCP */
3281 else
3282 {
3283 CHECK_PARTIAL();
3284 RRETURN(MATCH_NOMATCH);
3285 }
3286 }
3287 /* Control never gets here */
3288 }
3289
3290 else /* Maximize */
3291 {
3292 pp = eptr;
3293 for (i = min; i < max; i++)
3294 {
3295 if (eptr <= md->end_subject - length &&
3296 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3297 #ifdef SUPPORT_UCP
3298 else if (oclength > 0 &&
3299 eptr <= md->end_subject - oclength &&
3300 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3301 #endif /* SUPPORT_UCP */
3302 else
3303 {
3304 CHECK_PARTIAL();
3305 break;
3306 }
3307 }
3308
3309 if (possessive) continue;
3310
3311 for(;;)
3312 {
3313 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3314 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3315 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3316 #ifdef SUPPORT_UCP
3317 eptr--;
3318 BACKCHAR(eptr);
3319 #else /* without SUPPORT_UCP */
3320 eptr -= length;
3321 #endif /* SUPPORT_UCP */
3322 }
3323 }
3324 /* Control never gets here */
3325 }
3326
3327 /* If the length of a UTF-8 character is 1, we fall through here, and
3328 obey the code as for non-UTF-8 characters below, though in this case the
3329 value of fc will always be < 128. */
3330 }
3331 else
3332 #endif /* SUPPORT_UTF */
3333 /* When not in UTF-8 mode, load a single-byte character. */
3334 fc = *ecode++;
3335
3336 /* The value of fc at this point is always one character, though we may
3337 or may not be in UTF mode. The code is duplicated for the caseless and
3338 caseful cases, for speed, since matching characters is likely to be quite
3339 common. First, ensure the minimum number of matches are present. If min =
3340 max, continue at the same level without recursing. Otherwise, if
3341 minimizing, keep trying the rest of the expression and advancing one
3342 matching character if failing, up to the maximum. Alternatively, if
3343 maximizing, find the maximum number of characters and work backwards. */
3344
3345 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3346 max, eptr));
3347
3348 if (op >= OP_STARI) /* Caseless */
3349 {
3350 #ifdef COMPILE_PCRE8
3351 /* fc must be < 128 if UTF is enabled. */
3352 foc = md->fcc[fc];
3353 #else
3354 #ifdef SUPPORT_UTF
3355 #ifdef SUPPORT_UCP
3356 if (utf && fc > 127)
3357 foc = UCD_OTHERCASE(fc);
3358 #else
3359 if (utf && fc > 127)
3360 foc = fc;
3361 #endif /* SUPPORT_UCP */
3362 else
3363 #endif /* SUPPORT_UTF */
3364 foc = TABLE_GET(fc, md->fcc, fc);
3365 #endif /* COMPILE_PCRE8 */
3366
3367 for (i = 1; i <= min; i++)
3368 {
3369 if (eptr >= md->end_subject)
3370 {
3371 SCHECK_PARTIAL();
3372 RRETURN(MATCH_NOMATCH);
3373 }
3374 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3375 eptr++;
3376 }
3377 if (min == max) continue;
3378 if (minimize)
3379 {
3380 for (fi = min;; fi++)
3381 {
3382 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3383 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3384 if (fi >= max) RRETURN(MATCH_NOMATCH);
3385 if (eptr >= md->end_subject)
3386 {
3387 SCHECK_PARTIAL();
3388 RRETURN(MATCH_NOMATCH);
3389 }
3390 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3391 eptr++;
3392 }
3393 /* Control never gets here */
3394 }
3395 else /* Maximize */
3396 {
3397 pp = eptr;
3398 for (i = min; i < max; i++)
3399 {
3400 if (eptr >= md->end_subject)
3401 {
3402 SCHECK_PARTIAL();
3403 break;
3404 }
3405 if (fc != *eptr && foc != *eptr) break;
3406 eptr++;
3407 }
3408
3409 if (possessive) continue;
3410
3411 while (eptr >= pp)
3412 {
3413 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3414 eptr--;
3415 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416 }
3417 RRETURN(MATCH_NOMATCH);
3418 }
3419 /* Control never gets here */
3420 }
3421
3422 /* Caseful comparisons (multi-byte UTF characters were handled above) */
3423
3424 else
3425 {
3426 for (i = 1; i <= min; i++)
3427 {
3428 if (eptr >= md->end_subject)
3429 {
3430 SCHECK_PARTIAL();
3431 RRETURN(MATCH_NOMATCH);
3432 }
3433 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3434 }
3435
3436 if (min == max) continue;
3437
3438 if (minimize)
3439 {
3440 for (fi = min;; fi++)
3441 {
3442 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444 if (fi >= max) RRETURN(MATCH_NOMATCH);
3445 if (eptr >= md->end_subject)
3446 {
3447 SCHECK_PARTIAL();
3448 RRETURN(MATCH_NOMATCH);
3449 }
3450 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3451 }
3452 /* Control never gets here */
3453 }
3454 else /* Maximize */
3455 {
3456 pp = eptr;
3457 for (i = min; i < max; i++)
3458 {
3459 if (eptr >= md->end_subject)
3460 {
3461 SCHECK_PARTIAL();
3462 break;
3463 }
3464 if (fc != *eptr) break;
3465 eptr++;
3466 }
3467 if (possessive) continue;
3468
3469 while (eptr >= pp)
3470 {
3471 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3472 eptr--;
3473 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3474 }
3475 RRETURN(MATCH_NOMATCH);
3476 }
3477 }
3478 /* Control never gets here */
3479
3480 /* Match a negated single one-byte character. The character we are
3481 checking can be multibyte. */
3482
3483 case OP_NOT:
3484 case OP_NOTI:
3485 if (eptr >= md->end_subject)
3486 {
3487 SCHECK_PARTIAL();
3488 RRETURN(MATCH_NOMATCH);
3489 }
3490 ecode++;
3491 GETCHARINCTEST(c, eptr);
3492 if (op == OP_NOTI) /* The caseless case */
3493 {
3494 register int ch, och;
3495 ch = *ecode++;
3496 #ifdef COMPILE_PCRE8
3497 /* ch must be < 128 if UTF is enabled. */
3498 och = md->fcc[ch];
3499 #else
3500 #ifdef SUPPORT_UTF
3501 #ifdef SUPPORT_UCP
3502 if (utf && ch > 127)
3503 och = UCD_OTHERCASE(ch);
3504 #else
3505 if (utf && ch > 127)
3506 och = ch;
3507 #endif /* SUPPORT_UCP */
3508 else
3509 #endif /* SUPPORT_UTF */
3510 och = TABLE_GET(ch, md->fcc, ch);
3511 #endif /* COMPILE_PCRE8 */
3512 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3513 }
3514 else /* Caseful */
3515 {
3516 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3517 }
3518 break;
3519
3520 /* Match a negated single one-byte character repeatedly. This is almost a
3521 repeat of the code for a repeated single character, but I haven't found a
3522 nice way of commoning these up that doesn't require a test of the
3523 positive/negative option for each character match. Maybe that wouldn't add
3524 very much to the time taken, but character matching *is* what this is all
3525 about... */
3526
3527 case OP_NOTEXACT:
3528 case OP_NOTEXACTI:
3529 min = max = GET2(ecode, 1);
3530 ecode += 1 + IMM2_SIZE;
3531 goto REPEATNOTCHAR;
3532
3533 case OP_NOTUPTO:
3534 case OP_NOTUPTOI:
3535 case OP_NOTMINUPTO:
3536 case OP_NOTMINUPTOI:
3537 min = 0;
3538 max = GET2(ecode, 1);
3539 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3540 ecode += 1 + IMM2_SIZE;
3541 goto REPEATNOTCHAR;
3542
3543 case OP_NOTPOSSTAR:
3544 case OP_NOTPOSSTARI:
3545 possessive = TRUE;
3546 min = 0;
3547 max = INT_MAX;
3548 ecode++;
3549 goto REPEATNOTCHAR;
3550
3551 case OP_NOTPOSPLUS:
3552 case OP_NOTPOSPLUSI:
3553 possessive = TRUE;
3554 min = 1;
3555 max = INT_MAX;
3556 ecode++;
3557 goto REPEATNOTCHAR;
3558
3559 case OP_NOTPOSQUERY:
3560 case OP_NOTPOSQUERYI:
3561 possessive = TRUE;
3562 min = 0;
3563 max = 1;
3564 ecode++;
3565 goto REPEATNOTCHAR;
3566
3567 case OP_NOTPOSUPTO:
3568 case OP_NOTPOSUPTOI:
3569 possessive = TRUE;
3570 min = 0;
3571 max = GET2(ecode, 1);
3572 ecode += 1 + IMM2_SIZE;
3573 goto REPEATNOTCHAR;
3574
3575 case OP_NOTSTAR:
3576 case OP_NOTSTARI:
3577 case OP_NOTMINSTAR:
3578 case OP_NOTMINSTARI:
3579 case OP_NOTPLUS:
3580 case OP_NOTPLUSI:
3581 case OP_NOTMINPLUS:
3582 case OP_NOTMINPLUSI:
3583 case OP_NOTQUERY:
3584 case OP_NOTQUERYI:
3585 case OP_NOTMINQUERY:
3586 case OP_NOTMINQUERYI:
3587 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3588 minimize = (c & 1) != 0;
3589 min = rep_min[c]; /* Pick up values from tables; */
3590 max = rep_max[c]; /* zero for max => infinity */
3591 if (max == 0) max = INT_MAX;
3592
3593 /* Common code for all repeated single-byte matches. */
3594
3595 REPEATNOTCHAR:
3596 fc = *ecode++;
3597
3598 /* The code is duplicated for the caseless and caseful cases, for speed,
3599 since matching characters is likely to be quite common. First, ensure the
3600 minimum number of matches are present. If min = max, continue at the same
3601 level without recursing. Otherwise, if minimizing, keep trying the rest of
3602 the expression and advancing one matching character if failing, up to the
3603 maximum. Alternatively, if maximizing, find the maximum number of
3604 characters and work backwards. */
3605
3606 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3607 max, eptr));
3608
3609 if (op >= OP_NOTSTARI) /* Caseless */
3610 {
3611 #ifdef COMPILE_PCRE8
3612 /* fc must be < 128 if UTF is enabled. */
3613 foc = md->fcc[fc];
3614 #else
3615 #ifdef SUPPORT_UTF
3616 #ifdef SUPPORT_UCP
3617 if (utf && fc > 127)
3618 foc = UCD_OTHERCASE(fc);
3619 #else
3620 if (utf && fc > 127)
3621 foc = fc;
3622 #endif /* SUPPORT_UCP */
3623 else
3624 #endif /* SUPPORT_UTF */
3625 foc = TABLE_GET(fc, md->fcc, fc);
3626 #endif /* COMPILE_PCRE8 */
3627
3628 #ifdef SUPPORT_UTF
3629 if (utf)
3630 {
3631 register unsigned int d;
3632 for (i = 1; i <= min; i++)
3633 {
3634 if (eptr >= md->end_subject)
3635 {
3636 SCHECK_PARTIAL();
3637 RRETURN(MATCH_NOMATCH);
3638 }
3639 GETCHARINC(d, eptr);
3640 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3641 }
3642 }
3643 else
3644 #endif
3645 /* Not UTF mode */
3646 {
3647 for (i = 1; i <= min; i++)
3648 {
3649 if (eptr >= md->end_subject)
3650 {
3651 SCHECK_PARTIAL();
3652 RRETURN(MATCH_NOMATCH);
3653 }
3654 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3655 eptr++;
3656 }
3657 }
3658
3659 if (min == max) continue;
3660
3661 if (minimize)
3662 {
3663 #ifdef SUPPORT_UTF
3664 if (utf)
3665 {
3666 register unsigned int d;
3667 for (fi = min;; fi++)
3668 {
3669 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3670 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3671 if (fi >= max) RRETURN(MATCH_NOMATCH);
3672 if (eptr >= md->end_subject)
3673 {
3674 SCHECK_PARTIAL();
3675 RRETURN(MATCH_NOMATCH);
3676 }
3677 GETCHARINC(d, eptr);
3678 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3679 }
3680 }
3681 else
3682 #endif
3683 /* Not UTF mode */
3684 {
3685 for (fi = min;; fi++)
3686 {
3687 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3689 if (fi >= max) RRETURN(MATCH_NOMATCH);
3690 if (eptr >= md->end_subject)
3691 {
3692 SCHECK_PARTIAL();
3693 RRETURN(MATCH_NOMATCH);
3694 }
3695 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3696 eptr++;
3697 }
3698 }
3699 /* Control never gets here */
3700 }
3701
3702 /* Maximize case */
3703
3704 else
3705 {
3706 pp = eptr;
3707
3708 #ifdef SUPPORT_UTF
3709 if (utf)
3710 {
3711 register unsigned int d;
3712 for (i = min; i < max; i++)
3713 {
3714 int len = 1;
3715 if (eptr >= md->end_subject)
3716 {
3717 SCHECK_PARTIAL();
3718 break;
3719 }
3720 GETCHARLEN(d, eptr, len);
3721 if (fc == d || foc == d) break;
3722 eptr += len;
3723 }
3724 if (possessive) continue;
3725 for(;;)
3726 {
3727 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3728 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3729 if (eptr-- == pp) break; /* Stop if tried at original pos */
3730 BACKCHAR(eptr);
3731 }
3732 }
3733 else
3734 #endif
3735 /* Not UTF mode */
3736 {
3737 for (i = min; i < max; i++)
3738 {
3739 if (eptr >= md->end_subject)
3740 {
3741 SCHECK_PARTIAL();
3742 break;
3743 }
3744 if (fc == *eptr || foc == *eptr) break;
3745 eptr++;
3746 }
3747 if (possessive) continue;
3748 while (eptr >= pp)
3749 {
3750 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3751 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3752 eptr--;
3753 }
3754 }
3755
3756 RRETURN(MATCH_NOMATCH);
3757 }
3758 /* Control never gets here */
3759 }
3760
3761 /* Caseful comparisons */
3762
3763 else
3764 {
3765 #ifdef SUPPORT_UTF
3766 if (utf)
3767 {
3768 register unsigned int d;
3769 for (i = 1; i <= min; i++)
3770 {
3771 if (eptr >= md->end_subject)
3772 {
3773 SCHECK_PARTIAL();
3774 RRETURN(MATCH_NOMATCH);
3775 }
3776 GETCHARINC(d, eptr);
3777 if (fc == d) RRETURN(MATCH_NOMATCH);
3778 }
3779 }
3780 else
3781 #endif
3782 /* Not UTF mode */
3783 {
3784 for (i = 1; i <= min; i++)
3785 {
3786 if (eptr >= md->end_subject)
3787 {
3788 SCHECK_PARTIAL();
3789 RRETURN(MATCH_NOMATCH);
3790 }
3791 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3792 }
3793 }
3794
3795 if (min == max) continue;
3796
3797 if (minimize)
3798 {
3799 #ifdef SUPPORT_UTF
3800 if (utf)
3801 {
3802 register unsigned int d;
3803 for (fi = min;; fi++)
3804 {
3805 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3807 if (fi >= max) RRETURN(MATCH_NOMATCH);
3808 if (eptr >= md->end_subject)
3809 {
3810 SCHECK_PARTIAL();
3811 RRETURN(MATCH_NOMATCH);
3812 }
3813 GETCHARINC(d, eptr);
3814 if (fc == d) RRETURN(MATCH_NOMATCH);
3815 }
3816 }
3817 else
3818 #endif
3819 /* Not UTF mode */
3820 {
3821 for (fi = min;; fi++)
3822 {
3823 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3825 if (fi >= max) RRETURN(MATCH_NOMATCH);
3826 if (eptr >= md->end_subject)
3827 {
3828 SCHECK_PARTIAL();
3829 RRETURN(MATCH_NOMATCH);
3830 }
3831 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3832 }
3833 }
3834 /* Control never gets here */
3835 }
3836
3837 /* Maximize case */
3838
3839 else
3840 {
3841 pp = eptr;
3842
3843 #ifdef SUPPORT_UTF
3844 if (utf)
3845 {
3846 register unsigned int d;
3847 for (i = min; i < max; i++)
3848 {
3849 int len = 1;
3850 if (eptr >= md->end_subject)
3851 {
3852 SCHECK_PARTIAL();
3853 break;
3854 }
3855 GETCHARLEN(d, eptr, len);
3856 if (fc == d) break;
3857 eptr += len;
3858 }
3859 if (possessive) continue;
3860 for(;;)
3861 {
3862 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3864 if (eptr-- == pp) break; /* Stop if tried at original pos */
3865 BACKCHAR(eptr);
3866 }
3867 }
3868 else
3869 #endif
3870 /* Not UTF mode */
3871 {
3872 for (i = min; i < max; i++)
3873 {
3874 if (eptr >= md->end_subject)
3875 {
3876 SCHECK_PARTIAL();
3877 break;
3878 }
3879 if (fc == *eptr) break;
3880 eptr++;
3881 }
3882 if (possessive) continue;
3883 while (eptr >= pp)
3884 {
3885 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3887 eptr--;
3888 }
3889 }
3890
3891 RRETURN(MATCH_NOMATCH);
3892 }
3893 }
3894 /* Control never gets here */
3895
3896 /* Match a single character type repeatedly; several different opcodes
3897 share code. This is very similar to the code for single characters, but we
3898 repeat it in the interests of efficiency. */
3899
3900 case OP_TYPEEXACT:
3901 min = max = GET2(ecode, 1);
3902 minimize = TRUE;
3903 ecode += 1 + IMM2_SIZE;
3904 goto REPEATTYPE;
3905
3906 case OP_TYPEUPTO:
3907 case OP_TYPEMINUPTO:
3908 min = 0;
3909 max = GET2(ecode, 1);
3910 minimize = *ecode == OP_TYPEMINUPTO;
3911 ecode += 1 + IMM2_SIZE;
3912 goto REPEATTYPE;
3913
3914 case OP_TYPEPOSSTAR:
3915 possessive = TRUE;
3916 min = 0;
3917 max = INT_MAX;
3918 ecode++;
3919 goto REPEATTYPE;
3920
3921 case OP_TYPEPOSPLUS:
3922 possessive = TRUE;
3923 min = 1;
3924 max = INT_MAX;
3925 ecode++;
3926 goto REPEATTYPE;
3927
3928 case OP_TYPEPOSQUERY:
3929 possessive = TRUE;
3930 min = 0;
3931 max = 1;
3932 ecode++;
3933 goto REPEATTYPE;
3934
3935 case OP_TYPEPOSUPTO:
3936 possessive = TRUE;
3937 min = 0;
3938 max = GET2(ecode, 1);
3939 ecode += 1 + IMM2_SIZE;
3940 goto REPEATTYPE;
3941
3942 case OP_TYPESTAR:
3943 case OP_TYPEMINSTAR:
3944 case OP_TYPEPLUS:
3945 case OP_TYPEMINPLUS:
3946 case OP_TYPEQUERY:
3947 case OP_TYPEMINQUERY:
3948 c = *ecode++ - OP_TYPESTAR;
3949 minimize = (c & 1) != 0;
3950 min = rep_min[c]; /* Pick up values from tables; */
3951 max = rep_max[c]; /* zero for max => infinity */
3952 if (max == 0) max = INT_MAX;
3953
3954 /* Common code for all repeated single character type matches. Note that
3955 in UTF-8 mode, '.' matches a character of any length; of the other types,
3956 only OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR are limited to one-byte characters. */
3957
3958 REPEATTYPE:
3959 ctype = *ecode++; /* Code for the character type */
3960
3961 #ifdef SUPPORT_UCP
3962 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3963 {
3964 prop_fail_result = ctype == OP_NOTPROP;
3965 prop_type = *ecode++;
3966 prop_value = *ecode++;
3967 }
3968 else prop_type = -1;
3969 #endif
3970
3971 /* First, ensure the minimum number of matches are present. Use inline
3972 code for maximizing the speed, and do the type test once at the start
3973 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3974 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3975 and single-bytes. */
3976
3977 if (min > 0)
3978 {
3979 #ifdef SUPPORT_UCP
3980 if (prop_type >= 0)
3981 {
3982 switch(prop_type)
3983 {
3984 case PT_ANY:
3985 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3986 for (i = 1; i <= min; i++)
3987 {
3988 if (eptr >= md->end_subject)
3989 {
3990 SCHECK_PARTIAL();
3991 RRETURN(MATCH_NOMATCH);
3992 }
3993 GETCHARINCTEST(c, eptr);
3994 }
3995 break;
3996
3997 case PT_LAMP:
3998 for (i = 1; i <= min; i++)
3999 {
4000 int chartype;
4001 if (eptr >= md->end_subject)
4002 {
4003 SCHECK_PARTIAL();
4004 RRETURN(MATCH_NOMATCH);
4005 }
4006 GETCHARINCTEST(c, eptr);
4007 chartype = UCD_CHARTYPE(c);
4008 if ((chartype == ucp_Lu ||
4009 chartype == ucp_Ll ||
4010 chartype == ucp_Lt) == prop_fail_result)
4011 RRETURN(MATCH_NOMATCH);
4012 }
4013 break;
4014
4015 case PT_GC:
4016 for (i = 1; i <= min; i++)
4017 {
4018 if (eptr >= md->end_subject)
4019 {
4020 SCHECK_PARTIAL();
4021 RRETURN(MATCH_NOMATCH);
4022 }
4023 GETCHARINCTEST(c, eptr);
4024 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4025 RRETURN(MATCH_NOMATCH);
4026 }
4027 break;
4028
4029 case PT_PC:
4030 for (i = 1; i <= min; i++)
4031 {
4032 if (eptr >= md->end_subject)
4033 {
4034 SCHECK_PARTIAL();
4035 RRETURN(MATCH_NOMATCH);
4036 }
4037 GETCHARINCTEST(c, eptr);
4038 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4039 RRETURN(MATCH_NOMATCH);
4040 }
4041 break;
4042
4043 case PT_SC:
4044 for (i = 1; i <= min; i++)
4045 {
4046 if (eptr >= md->end_subject)
4047 {
4048 SCHECK_PARTIAL();
4049 RRETURN(MATCH_NOMATCH);
4050 }
4051 GETCHARINCTEST(c, eptr);
4052 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4053 RRETURN(MATCH_NOMATCH);
4054 }
4055 break;
4056
4057 case PT_ALNUM:
4058 for (i = 1; i <= min; i++)
4059 {
4060 int category;
4061 if (eptr >= md->end_subject)
4062 {
4063 SCHECK_PARTIAL();
4064 RRETURN(MATCH_NOMATCH);
4065 }
4066 GETCHARINCTEST(c, eptr);
4067 category = UCD_CATEGORY(c);
4068 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4069 RRETURN(MATCH_NOMATCH);
4070 }
4071 break;
4072
4073 case PT_SPACE: /* Perl space */
4074 for (i = 1; i <= min; i++)
4075 {
4076 if (eptr >= md->end_subject)
4077 {
4078 SCHECK_PARTIAL();
4079 RRETURN(MATCH_NOMATCH);
4080 }
4081 GETCHARINCTEST(c, eptr);
4082 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4083 c == CHAR_FF || c == CHAR_CR)
4084 == prop_fail_result)
4085 RRETURN(MATCH_NOMATCH);
4086 }
4087 break;
4088
4089 case PT_PXSPACE: /* POSIX space */
4090 for (i = 1; i <= min; i++)
4091 {
4092 if (eptr >= md->end_subject)
4093 {
4094 SCHECK_PARTIAL();
4095 RRETURN(MATCH_NOMATCH);
4096 }
4097 GETCHARINCTEST(c, eptr);
4098 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4099 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4100 == prop_fail_result)
4101 RRETURN(MATCH_NOMATCH);
4102 }
4103 break;
4104
4105 case PT_WORD:
4106 for (i = 1; i <= min; i++)
4107 {
4108 int category;
4109 if (eptr >= md->end_subject)
4110 {
4111 SCHECK_PARTIAL();
4112 RRETURN(MATCH_NOMATCH);
4113 }
4114 GETCHARINCTEST(c, eptr);
4115 category = UCD_CATEGORY(c);
4116 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4117 == prop_fail_result)
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 break;
4121
4122 /* This should not occur */
4123
4124 default:
4125 RRETURN(PCRE_ERROR_INTERNAL);
4126 }
4127 }
4128
4129 /* Match extended Unicode sequences. We will get here only if the
4130 support is in the binary; otherwise a compile-time error occurs. */
4131
4132 else if (ctype == OP_EXTUNI)
4133 {
4134 for (i = 1; i <= min; i++)
4135 {
4136 if (eptr >= md->end_subject)
4137 {
4138 SCHECK_PARTIAL();
4139 RRETURN(MATCH_NOMATCH);
4140 }
4141 GETCHARINCTEST(c, eptr);
4142 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4143 while (eptr < md->end_subject)
4144 {
4145 int len = 1;
4146 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4147 if (UCD_CATEGORY(c) != ucp_M) break;
4148 eptr += len;
4149 }
4150 }
4151 }
4152
4153 else
4154 #endif /* SUPPORT_UCP */
4155
4156 /* Handle all other cases when the coding is UTF-8 */
4157
4158 #ifdef SUPPORT_UTF
4159 if (utf) switch(ctype)
4160 {
4161 case OP_ANY:
4162 for (i = 1; i <= min; i++)
4163 {
4164 if (eptr >= md->end_subject)
4165 {
4166 SCHECK_PARTIAL();
4167 RRETURN(MATCH_NOMATCH);
4168 }
4169 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4170 eptr++;
4171 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4172 }
4173 break;
4174
4175 case OP_ALLANY:
4176 for (i = 1; i <= min; i++)
4177 {
4178 if (eptr >= md->end_subject)
4179 {
4180 SCHECK_PARTIAL();
4181 RRETURN(MATCH_NOMATCH);
4182 }
4183 eptr++;
4184 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4185 }
4186 break;
4187
4188 case OP_ANYBYTE:
4189 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4190 eptr += min;
4191 break;
4192
4193 case OP_ANYNL:
4194 for (i = 1; i <= min; i++)
4195 {
4196 if (eptr >= md->end_subject)
4197 {
4198 SCHECK_PARTIAL();
4199 RRETURN(MATCH_NOMATCH);
4200 }
4201 GETCHARINC(c, eptr);
4202 switch(c)
4203 {
4204 default: RRETURN(MATCH_NOMATCH);
4205
4206 case 0x000d:
4207 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4208 break;
4209
4210 case 0x000a:
4211 break;
4212
4213 case 0x000b:
4214 case 0x000c:
4215 case 0x0085:
4216 case 0x2028:
4217 case 0x2029:
4218 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4219 break;
4220 }
4221 }
4222 break;
4223
4224 case OP_NOT_HSPACE:
4225 for (i = 1; i <= min; i++)
4226 {
4227 if (eptr >= md->end_subject)
4228 {
4229 SCHECK_PARTIAL();
4230 RRETURN(MATCH_NOMATCH);
4231 }
4232 GETCHARINC(c, eptr);
4233 switch(c)
4234 {
4235 default: break;
4236 case 0x09: /* HT */
4237 case 0x20: /* SPACE */
4238 case 0xa0: /* NBSP */
4239 case 0x1680: /* OGHAM SPACE MARK */
4240 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4241 case 0x2000: /* EN QUAD */
4242 case 0x2001: /* EM QUAD */
4243 case 0x2002: /* EN SPACE */
4244 case 0x2003: /* EM SPACE */
4245 case 0x2004: /* THREE-PER-EM SPACE */
4246 case 0x2005: /* FOUR-PER-EM SPACE */
4247 case 0x2006: /* SIX-PER-EM SPACE */
4248 case 0x2007: /* FIGURE SPACE */
4249 case 0x2008: /* PUNCTUATION SPACE */
4250 case 0x2009: /* THIN SPACE */
4251 case 0x200A: /* HAIR SPACE */
4252 case 0x202f: /* NARROW NO-BREAK SPACE */
4253 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4254 case 0x3000: /* IDEOGRAPHIC SPACE */
4255 RRETURN(MATCH_NOMATCH);
4256 }
4257 }
4258 break;
4259
4260 case OP_HSPACE:
4261 for (i = 1; i <= min; i++)
4262 {
4263 if (eptr >= md->end_subject)
4264 {
4265 SCHECK_PARTIAL();
4266 RRETURN(MATCH_NOMATCH);
4267 }
4268 GETCHARINC(c, eptr);
4269 switch(c)
4270 {
4271 default: RRETURN(MATCH_NOMATCH);
4272 case 0x09: /* HT */
4273 case 0x20: /* SPACE */
4274 case 0xa0: /* NBSP */
4275 case 0x1680: /* OGHAM SPACE MARK */
4276 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4277 case 0x2000: /* EN QUAD */
4278 case 0x2001: /* EM QUAD */
4279 case 0x2002: /* EN SPACE */
4280 case 0x2003: /* EM SPACE */
4281 case 0x2004: /* THREE-PER-EM SPACE */
4282 case 0x2005: /* FOUR-PER-EM SPACE */
4283 case 0x2006: /* SIX-PER-EM SPACE */
4284 case 0x2007: /* FIGURE SPACE */
4285 case 0x2008: /* PUNCTUATION SPACE */
4286 case 0x2009: /* THIN SPACE */
4287 case 0x200A: /* HAIR SPACE */
4288 case 0x202f: /* NARROW NO-BREAK SPACE */
4289 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4290 case 0x3000: /* IDEOGRAPHIC SPACE */
4291 break;
4292 }
4293 }
4294 break;
4295
4296 case OP_NOT_VSPACE:
4297 for (i = 1; i <= min; i++)
4298 {
4299 if (eptr >= md->end_subject)
4300 {
4301 SCHECK_PARTIAL();
4302 RRETURN(MATCH_NOMATCH);
4303 }
4304 GETCHARINC(c, eptr);
4305 switch(c)
4306 {
4307 default: break;
4308 case 0x0a: /* LF */
4309 case 0x0b: /* VT */
4310 case 0x0c: /* FF */
4311 case 0x0d: /* CR */
4312 case 0x85: /* NEL */
4313 case 0x2028: /* LINE SEPARATOR */
4314 case 0x2029: /* PARAGRAPH SEPARATOR */
4315 RRETURN(MATCH_NOMATCH);
4316 }
4317 }
4318 break;
4319
4320 case OP_VSPACE:
4321 for (i = 1; i <= min; i++)
4322 {
4323 if (eptr >= md->end_subject)
4324 {
4325 SCHECK_PARTIAL();
4326 RRETURN(MATCH_NOMATCH);
4327 }
4328 GETCHARINC(c, eptr);
4329 switch(c)
4330 {
4331 default: RRETURN(MATCH_NOMATCH);
4332 case 0x0a: /* LF */
4333 case 0x0b: /* VT */
4334 case 0x0c: /* FF */
4335 case 0x0d: /* CR */
4336 case 0x85: /* NEL */
4337 case 0x2028: /* LINE SEPARATOR */
4338 case 0x2029: /* PARAGRAPH SEPARATOR */
4339 break;
4340 }
4341 }
4342 break;
4343
4344 case OP_NOT_DIGIT:
4345 for (i = 1; i <= min; i++)
4346 {
4347 if (eptr >= md->end_subject)
4348 {
4349 SCHECK_PARTIAL();
4350 RRETURN(MATCH_NOMATCH);
4351 }
4352 GETCHARINC(c, eptr);
4353 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4354 RRETURN(MATCH_NOMATCH);
4355 }
4356 break;
4357
4358 case OP_DIGIT:
4359 for (i = 1; i <= min; i++)
4360 {
4361 if (eptr >= md->end_subject)
4362 {
4363 SCHECK_PARTIAL();
4364 RRETURN(MATCH_NOMATCH);
4365 }
4366 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4367 RRETURN(MATCH_NOMATCH);
4368 eptr++;
4369 /* No need to skip more bytes - we know it's a 1-byte character */
4370 }
4371 break;
4372
4373 case OP_NOT_WHITESPACE:
4374 for (i = 1; i <= min; i++)
4375 {
4376 if (eptr >= md->end_subject)
4377 {
4378 SCHECK_PARTIAL();
4379 RRETURN(MATCH_NOMATCH);
4380 }
4381 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4382 RRETURN(MATCH_NOMATCH);
4383 eptr++;
4384 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4385 }
4386 break;
4387
4388 case OP_WHITESPACE:
4389 for (i = 1; i <= min; i++)
4390 {
4391 if (eptr >= md->end_subject)
4392 {
4393 SCHECK_PARTIAL();
4394 RRETURN(MATCH_NOMATCH);
4395 }
4396 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4397 RRETURN(MATCH_NOMATCH);
4398 eptr++;
4399 /* No need to skip more bytes - we know it's a 1-byte character */
4400 }
4401 break;
4402
4403 case OP_NOT_WORDCHAR:
4404 for (i = 1; i <= min; i++)
4405 {
4406 if (eptr >= md->end_subject)
4407 {
4408 SCHECK_PARTIAL();
4409 RRETURN(MATCH_NOMATCH);
4410 }
4411 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4412 RRETURN(MATCH_NOMATCH);
4413 eptr++;
4414 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4415 }
4416 break;
4417
4418 case OP_WORDCHAR:
4419 for (i = 1; i <= min; i++)
4420 {
4421 if (eptr >= md->end_subject)
4422 {
4423 SCHECK_PARTIAL();
4424 RRETURN(MATCH_NOMATCH);
4425 }
4426 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4427 RRETURN(MATCH_NOMATCH);
4428 eptr++;
4429 /* No need to skip more bytes - we know it's a 1-byte character */
4430 }
4431 break;
4432
4433 default:
4434 RRETURN(PCRE_ERROR_INTERNAL);
4435 } /* End switch(ctype) */
4436
4437 else
4438 #endif /* SUPPORT_UTF */
4439
4440 /* Code for the non-UTF case for minimum matching of operators other
4441 than OP_PROP and OP_NOTPROP. */
4442
4443 switch(ctype)
4444 {
4445 case OP_ANY:
4446 for (i = 1; i <= min; i++)
4447 {
4448 if (eptr >= md->end_subject)
4449 {
4450 SCHECK_PARTIAL();
4451 RRETURN(MATCH_NOMATCH);
4452 }
4453 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4454 eptr++;
4455 }
4456 break;
4457
4458 case OP_ALLANY:
4459 if (eptr > md->end_subject - min)
4460 {
4461 SCHECK_PARTIAL();
4462 RRETURN(MATCH_NOMATCH);
4463 }
4464 eptr += min;
4465 break;
4466
4467 case OP_ANYBYTE:
4468 if (eptr > md->end_subject - min)
4469 {
4470 SCHECK_PARTIAL();
4471 RRETURN(MATCH_NOMATCH);
4472 }
4473 eptr += min;
4474 break;
4475
4476 case OP_ANYNL:
4477 for (i = 1; i <= min; i++)
4478 {
4479 if (eptr >= md->end_subject)
4480 {
4481 SCHECK_PARTIAL();
4482 RRETURN(MATCH_NOMATCH);
4483 }
4484 switch(*eptr++)
4485 {
4486 default: RRETURN(MATCH_NOMATCH);
4487
4488 case 0x000d:
4489 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4490 break;
4491
4492 case 0x000a:
4493 break;
4494
4495 case 0x000b:
4496 case 0x000c:
4497 case 0x0085:
4498 #ifdef COMPILE_PCRE16
4499 case 0x2028:
4500 case 0x2029:
4501 #endif
4502 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4503 break;
4504 }
4505 }
4506 break;
4507
4508 case OP_NOT_HSPACE:
4509 for (i = 1; i <= min; i++)
4510 {
4511 if (eptr >= md->end_subject)
4512 {
4513 SCHECK_PARTIAL();
4514 RRETURN(MATCH_NOMATCH);
4515 }
4516 switch(*eptr++)
4517 {
4518 default: break;
4519 case 0x09: /* HT */
4520 case 0x20: /* SPACE */
4521 case 0xa0: /* NBSP */
4522 #ifdef COMPILE_PCRE16
4523 case 0x1680: /* OGHAM SPACE MARK */
4524 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4525 case 0x2000: /* EN QUAD */
4526 case 0x2001: /* EM QUAD */
4527 case 0x2002: /* EN SPACE */
4528 case 0x2003: /* EM SPACE */
4529 case 0x2004: /* THREE-PER-EM SPACE */
4530 case 0x2005: /* FOUR-PER-EM SPACE */
4531 case 0x2006: /* SIX-PER-EM SPACE */
4532 case 0x2007: /* FIGURE SPACE */
4533 case 0x2008: /* PUNCTUATION SPACE */
4534 case 0x2009: /* THIN SPACE */
4535 case 0x200A: /* HAIR SPACE */
4536 case 0x202f: /* NARROW NO-BREAK SPACE */
4537 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4538 case 0x3000: /* IDEOGRAPHIC SPACE */
4539 #endif
4540 RRETURN(MATCH_NOMATCH);
4541 }
4542 }
4543 break;
4544
4545 case OP_HSPACE:
4546 for (i = 1; i <= min; i++)
4547 {
4548 if (eptr >= md->end_subject)
4549 {
4550 SCHECK_PARTIAL();
4551 RRETURN(MATCH_NOMATCH);
4552 }
4553 switch(*eptr++)
4554 {
4555 default: RRETURN(MATCH_NOMATCH);
4556 case 0x09: /* HT */
4557 case 0x20: /* SPACE */
4558 case 0xa0: /* NBSP */
4559 #ifdef COMPILE_PCRE16
4560 case 0x1680: /* OGHAM SPACE MARK */
4561 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4562 case 0x2000: /* EN QUAD */
4563 case 0x2001: /* EM QUAD */
4564 case 0x2002: /* EN SPACE */
4565 case 0x2003: /* EM SPACE */
4566 case 0x2004: /* THREE-PER-EM SPACE */
4567 case 0x2005: /* FOUR-PER-EM SPACE */
4568 case 0x2006: /* SIX-PER-EM SPACE */
4569 case 0x2007: /* FIGURE SPACE */
4570 case 0x2008: /* PUNCTUATION SPACE */
4571 case 0x2009: /* THIN SPACE */
4572 case 0x200A: /* HAIR SPACE */
4573 case 0x202f: /* NARROW NO-BREAK SPACE */
4574 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4575 case 0x3000: /* IDEOGRAPHIC SPACE */
4576 #endif
4577 break;
4578 }
4579 }
4580 break;
4581
4582 case OP_NOT_VSPACE:
4583 for (i = 1; i <= min; i++)
4584 {
4585 if (eptr >= md->end_subject)
4586 {
4587 SCHECK_PARTIAL();
4588 RRETURN(MATCH_NOMATCH);
4589 }
4590 switch(*eptr++)
4591 {
4592 default: break;
4593 case 0x0a: /* LF */
4594 case 0x0b: /* VT */
4595 case 0x0c: /* FF */
4596 case 0x0d: /* CR */
4597 case 0x85: /* NEL */
4598 #ifdef COMPILE_PCRE16
4599 case 0x2028: /* LINE SEPARATOR */
4600 case 0x2029: /* PARAGRAPH SEPARATOR */
4601 #endif
4602 RRETURN(MATCH_NOMATCH);
4603 }
4604 }
4605 break;
4606
4607 case OP_VSPACE:
4608 for (i = 1; i <= min; i++)
4609 {
4610 if (eptr >= md->end_subject)
4611 {
4612 SCHECK_PARTIAL();
4613 RRETURN(MATCH_NOMATCH);
4614 }
4615 switch(*eptr++)
4616 {
4617 default: RRETURN(MATCH_NOMATCH);
4618 case 0x0a: /* LF */
4619 case 0x0b: /* VT */
4620 case 0x0c: /* FF */
4621 case 0x0d: /* CR */
4622 case 0x85: /* NEL */
4623 #ifdef COMPILE_PCRE16
4624 case 0x2028: /* LINE SEPARATOR */
4625 case 0x2029: /* PARAGRAPH SEPARATOR */
4626 #endif
4627 break;
4628 }
4629 }
4630 break;
4631
4632 case OP_NOT_DIGIT:
4633 for (i = 1; i <= min; i++)
4634 {
4635 if (eptr >= md->end_subject)
4636 {
4637 SCHECK_PARTIAL();
4638 RRETURN(MATCH_NOMATCH);
4639 }
4640 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4641 RRETURN(MATCH_NOMATCH);
4642 eptr++;
4643 }
4644 break;
4645
4646 case OP_DIGIT:
4647 for (i = 1; i <= min; i++)
4648 {
4649 if (eptr >= md->end_subject)
4650 {
4651 SCHECK_PARTIAL();
4652 RRETURN(MATCH_NOMATCH);
4653 }
4654 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4655 RRETURN(MATCH_NOMATCH);
4656 eptr++;
4657 }
4658 break;
4659
4660 case OP_NOT_WHITESPACE:
4661 for (i = 1; i <= min; i++)
4662 {
4663 if (eptr >= md->end_subject)
4664 {
4665 SCHECK_PARTIAL();
4666 RRETURN(MATCH_NOMATCH);
4667 }
4668 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4669 RRETURN(MATCH_NOMATCH);
4670 eptr++;
4671 }
4672 break;
4673
4674 case OP_WHITESPACE:
4675 for (i = 1; i <= min; i++)
4676 {
4677 if (eptr >= md->end_subject)
4678 {
4679 SCHECK_PARTIAL();
4680 RRETURN(MATCH_NOMATCH);
4681 }
4682 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4683 RRETURN(MATCH_NOMATCH);
4684 eptr++;
4685 }
4686 break;
4687
4688 case OP_NOT_WORDCHAR:
4689 for (i = 1; i <= min; i++)
4690 {
4691 if (eptr >= md->end_subject)
4692 {
4693 SCHECK_PARTIAL();
4694 RRETURN(MATCH_NOMATCH);
4695 }
4696 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4697 RRETURN(MATCH_NOMATCH);
4698 eptr++;
4699 }
4700 break;
4701
4702 case OP_WORDCHAR:
4703 for (i = 1; i <= min; i++)
4704 {
4705 if (eptr >= md->end_subject)
4706 {
4707 SCHECK_PARTIAL();
4708 RRETURN(MATCH_NOMATCH);
4709 }
4710 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4711 RRETURN(MATCH_NOMATCH);
4712 eptr++;
4713 }
4714 break;
4715
4716 default:
4717 RRETURN(PCRE_ERROR_INTERNAL);
4718 }
4719 }
4720
4721 /* If min = max, continue at the same level without recursing */
4722
4723 if (min == max) continue;
4724
4725 /* If minimizing, we have to test the rest of the pattern before each
4726 subsequent match. Again, separate the UTF case for speed, and also
4727 separate the UCP cases. */
4728
4729 if (minimize)
4730 {
4731 #ifdef SUPPORT_UCP
4732 if (prop_type >= 0)
4733 {
4734 switch(prop_type)
4735 {
4736 case PT_ANY:
4737 for (fi = min;; fi++)
4738 {
4739 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4741 if (fi >= max) RRETURN(MATCH_NOMATCH);
4742 if (eptr >= md->end_subject)
4743 {
4744 SCHECK_PARTIAL();
4745 RRETURN(MATCH_NOMATCH);
4746 }
4747 GETCHARINCTEST(c, eptr);
4748 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4749 }
4750 /* Control never gets here */
4751
4752 case PT_LAMP:
4753 for (fi = min;; fi++)
4754 {
4755 int chartype;
4756 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4757 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4758 if (fi >= max) RRETURN(MATCH_NOMATCH);
4759 if (eptr >= md->end_subject)
4760 {
4761 SCHECK_PARTIAL();
4762 RRETURN(MATCH_NOMATCH);
4763 }
4764 GETCHARINCTEST(c, eptr);
4765 chartype = UCD_CHARTYPE(c);
4766 if ((chartype == ucp_Lu ||
4767 chartype == ucp_Ll ||
4768 chartype == ucp_Lt) == prop_fail_result)
4769 RRETURN(MATCH_NOMATCH);
4770 }
4771 /* Control never gets here */
4772
4773 case PT_GC:
4774 for (fi = min;; fi++)
4775 {
4776 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4777 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4778 if (fi >= max) RRETURN(MATCH_NOMATCH);
4779 if (eptr >= md->end_subject)
4780 {
4781 SCHECK_PARTIAL();
4782 RRETURN(MATCH_NOMATCH);
4783 }
4784 GETCHARINCTEST(c, eptr);
4785 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4786 RRETURN(MATCH_NOMATCH);
4787 }
4788 /* Control never gets here */
4789
4790 case PT_PC:
4791 for (fi = min;; fi++)
4792 {
4793 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4795 if (fi >= max) RRETURN(MATCH_NOMATCH);
4796 if (eptr >= md->end_subject)
4797 {
4798 SCHECK_PARTIAL();
4799 RRETURN(MATCH_NOMATCH);
4800 }
4801 GETCHARINCTEST(c, eptr);
4802 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4803 RRETURN(MATCH_NOMATCH);
4804 }
4805 /* Control never gets here */
4806
4807 case PT_SC:
4808 for (fi = min;; fi++)
4809 {
4810 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4812 if (fi >= max) RRETURN(MATCH_NOMATCH);
4813 if (eptr >= md->end_subject)
4814 {
4815 SCHECK_PARTIAL();
4816 RRETURN(MATCH_NOMATCH);
4817 }
4818 GETCHARINCTEST(c, eptr);
4819 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4820 RRETURN(MATCH_NOMATCH);
4821 }
4822 /* Control never gets here */
4823
4824 case PT_ALNUM:
4825 for (fi = min;; fi++)
4826 {
4827 int category;
4828 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4830 if (fi >= max) RRETURN(MATCH_NOMATCH);
4831 if (eptr >= md->end_subject)
4832 {
4833 SCHECK_PARTIAL();
4834 RRETURN(MATCH_NOMATCH);
4835 }
4836 GETCHARINCTEST(c, eptr);
4837 category = UCD_CATEGORY(c);
4838 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4839 RRETURN(MATCH_NOMATCH);
4840 }
4841 /* Control never gets here */
4842
4843 case PT_SPACE: /* Perl space */
4844 for (fi = min;; fi++)
4845 {
4846 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4848 if (fi >= max) RRETURN(MATCH_NOMATCH);
4849 if (eptr >= md->end_subject)
4850 {
4851 SCHECK_PARTIAL();
4852 RRETURN(MATCH_NOMATCH);
4853 }
4854 GETCHARINCTEST(c, eptr);
4855 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4856 c == CHAR_FF || c == CHAR_CR)
4857 == prop_fail_result)
4858 RRETURN(MATCH_NOMATCH);
4859 }
4860 /* Control never gets here */
4861
4862 case PT_PXSPACE: /* POSIX space */
4863 for (fi = min;; fi++)
4864 {
4865 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4867 if (fi >= max) RRETURN(MATCH_NOMATCH);
4868 if (eptr >= md->end_subject)
4869 {
4870 SCHECK_PARTIAL();
4871 RRETURN(MATCH_NOMATCH);
4872 }
4873 GETCHARINCTEST(c, eptr);
4874 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4875 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4876 == prop_fail_result)
4877 RRETURN(MATCH_NOMATCH);
4878 }
4879 /* Control never gets here */
4880
4881 case PT_WORD:
4882 for (fi = min;; fi++)
4883 {
4884 int category;
4885 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4887 if (fi >= max) RRETURN(MATCH_NOMATCH);
4888 if (eptr >= md->end_subject)
4889 {
4890 SCHECK_PARTIAL();
4891 RRETURN(MATCH_NOMATCH);
4892 }
4893 GETCHARINCTEST(c, eptr);
4894 category = UCD_CATEGORY(c);
4895 if ((category == ucp_L ||
4896 category == ucp_N ||
4897 c == CHAR_UNDERSCORE)
4898 == prop_fail_result)
4899 RRETURN(MATCH_NOMATCH);
4900 }
4901 /* Control never gets here */
4902
4903 /* This should never occur */
4904
4905 default:
4906 RRETURN(PCRE_ERROR_INTERNAL);
4907 }
4908 }
4909
4910 /* Match extended Unicode sequences. We will get here only if the
4911 support is in the binary; otherwise a compile-time error occurs. */
4912
4913 else if (ctype == OP_EXTUNI)
4914 {
4915 for (fi = min;; fi++)
4916 {
4917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4919 if (fi >= max) RRETURN(MATCH_NOMATCH);
4920 if (eptr >= md->end_subject)
4921 {
4922 SCHECK_PARTIAL();
4923 RRETURN(MATCH_NOMATCH);
4924 }
4925 GETCHARINCTEST(c, eptr);
4926 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4927 while (eptr < md->end_subject)
4928 {
4929 int len = 1;
4930 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4931 if (UCD_CATEGORY(c) != ucp_M) break;
4932 eptr += len;
4933 }
4934 }
4935 }
4936 else
4937 #endif /* SUPPORT_UCP */
4938
4939 #ifdef SUPPORT_UTF
4940 if (utf)
4941 {
4942 for (fi = min;; fi++)
4943 {
4944 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4946 if (fi >= max) RRETURN(MATCH_NOMATCH);
4947 if (eptr >= md->end_subject)
4948 {
4949 SCHECK_PARTIAL();
4950 RRETURN(MATCH_NOMATCH);
4951 }
4952 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4953 RRETURN(MATCH_NOMATCH);
4954 GETCHARINC(c, eptr);
4955 switch(ctype)
4956 {
4957 case OP_ANY: /* This is the non-NL case */
4958 case OP_ALLANY:
4959 case OP_ANYBYTE:
4960 break;
4961
4962 case OP_ANYNL:
4963 switch(c)
4964 {
4965 default: RRETURN(MATCH_NOMATCH);
4966 case 0x000d:
4967 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4968 break;
4969 case 0x000a:
4970 break;
4971
4972 case 0x000b:
4973 case 0x000c:
4974 case 0x0085:
4975 case 0x2028:
4976 case 0x2029:
4977 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4978 break;
4979 }
4980 break;
4981
4982 case OP_NOT_HSPACE:
4983 switch(c)
4984 {
4985 default: break;
4986 case 0x09: /* HT */
4987 case 0x20: /* SPACE */
4988 case 0xa0: /* NBSP */
4989 case 0x1680: /* OGHAM SPACE MARK */
4990 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4991 case 0x2000: /* EN QUAD */
4992 case 0x2001: /* EM QUAD */
4993 case 0x2002: /* EN SPACE */
4994 case 0x2003: /* EM SPACE */
4995 case 0x2004: /* THREE-PER-EM SPACE */
4996 case 0x2005: /* FOUR-PER-EM SPACE */
4997 case 0x2006: /* SIX-PER-EM SPACE */
4998 case 0x2007: /* FIGURE SPACE */
4999 case 0x2008: /* PUNCTUATION SPACE */
5000 case 0x2009: /* THIN SPACE */
5001 case 0x200A: /* HAIR SPACE */
5002 case 0x202f: /* NARROW NO-BREAK SPACE */
5003 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5004 case 0x3000: /* IDEOGRAPHIC SPACE */
5005 RRETURN(MATCH_NOMATCH);
5006 }
5007 break;
5008
5009 case OP_HSPACE:
5010 switch(c)
5011 {
5012 default: RRETURN(MATCH_NOMATCH);
5013 case 0x09: /* HT */
5014 case 0x20: /* SPACE */
5015 case 0xa0: /* NBSP */
5016 case 0x1680: /* OGHAM SPACE MARK */
5017 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5018 case 0x2000: /* EN QUAD */
5019 case 0x2001: /* EM QUAD */
5020 case 0x2002: /* EN SPACE */
5021 case 0x2003: /* EM SPACE */
5022 case 0x2004: /* THREE-PER-EM SPACE */
5023 case 0x2005: /* FOUR-PER-EM SPACE */
5024 case 0x2006: /* SIX-PER-EM SPACE */
5025 case 0x2007: /* FIGURE SPACE */
5026 case 0x2008: /* PUNCTUATION SPACE */
5027 case 0x2009: /* THIN SPACE */
5028 case 0x200A: /* HAIR SPACE */
5029 case 0x202f: /* NARROW NO-BREAK SPACE */
5030 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5031 case 0x3000: /* IDEOGRAPHIC SPACE */
5032 break;
5033 }
5034 break;
5035
5036 case OP_NOT_VSPACE:
5037 switch(c)
5038 {
5039 default: break;
5040 case 0x0a: /* LF */
5041 case 0x0b: /* VT */
5042 case 0x0c: /* FF */
5043 case 0x0d: /* CR */
5044 case 0x85: /* NEL */
5045 case 0x2028: /* LINE SEPARATOR */
5046 case 0x2029: /* PARAGRAPH SEPARATOR */
5047 RRETURN(MATCH_NOMATCH);
5048 }
5049 break;
5050
5051 case OP_VSPACE:
5052 switch(c)
5053 {
5054 default: RRETURN(MATCH_NOMATCH);
5055 case 0x0a: /* LF */
5056 case 0x0b: /* VT */
5057 case 0x0c: /* FF */
5058 case 0x0d: /* CR */
5059 case 0x85: /* NEL */
5060 case 0x2028: /* LINE SEPARATOR */
5061 case 0x2029: /* PARAGRAPH SEPARATOR */
5062 break;
5063 }
5064 break;
5065
5066 case OP_NOT_DIGIT:
5067 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5068 RRETURN(MATCH_NOMATCH);
5069 break;
5070
5071 case OP_DIGIT:
5072 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5073 RRETURN(MATCH_NOMATCH);
5074 break;
5075
5076 case OP_NOT_WHITESPACE:
5077 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5078 RRETURN(MATCH_NOMATCH);
5079 break;
5080
5081 case OP_WHITESPACE:
5082 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5083 RRETURN(MATCH_NOMATCH);
5084 break;
5085
5086 case OP_NOT_WORDCHAR:
5087 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5088 RRETURN(MATCH_NOMATCH);
5089 break;
5090
5091 case OP_WORDCHAR:
5092 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5093 RRETURN(MATCH_NOMATCH);
5094 break;
5095
5096 default:
5097 RRETURN(PCRE_ERROR_INTERNAL);
5098 }
5099 }
5100 }
5101 else
5102 #endif
5103 /* Not UTF mode */
5104 {
5105 for (fi = min;; fi++)
5106 {
5107 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5108 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5109 if (fi >= max) RRETURN(MATCH_NOMATCH);
5110 if (eptr >= md->end_subject)
5111 {
5112 SCHECK_PARTIAL();
5113 RRETURN(MATCH_NOMATCH);
5114 }
5115 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5116 RRETURN(MATCH_NOMATCH);
5117 c = *eptr++;
5118 switch(ctype)
5119 {
5120 case OP_ANY: /* This is the non-NL case */
5121 case OP_ALLANY:
5122 case OP_ANYBYTE:
5123 break;
5124
5125 case OP_ANYNL:
5126 switch(c)
5127 {
5128 default: RRETURN(MATCH_NOMATCH);
5129 case 0x000d:
5130 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5131 break;
5132
5133 case 0x000a:
5134 break;
5135
5136 case 0x000b:
5137 case 0x000c:
5138 case 0x0085:
5139 #ifdef COMPILE_PCRE16
5140 case 0x2028:
5141 case 0x2029:
5142 #endif
5143 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5144 break;
5145 }
5146 break;
5147
5148 case OP_NOT_HSPACE:
5149 switch(c)
5150 {
5151 default: break;
5152 case 0x09: /* HT */
5153 case 0x20: /* SPACE */
5154 case 0xa0: /* NBSP */
5155 #ifdef COMPILE_PCRE16
5156 case 0x1680: /* OGHAM SPACE MARK */
5157 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5158 case 0x2000: /* EN QUAD */
5159 case 0x2001: /* EM QUAD */
5160 case 0x2002: /* EN SPACE */
5161 case 0x2003: /* EM SPACE */
5162 case 0x2004: /* THREE-PER-EM SPACE */
5163 case 0x2005: /* FOUR-PER-EM SPACE */
5164 case 0x2006: /* SIX-PER-EM SPACE */
5165 case 0x2007: /* FIGURE SPACE */
5166 case 0x2008: /* PUNCTUATION SPACE */
5167 case 0x2009: /* THIN SPACE */
5168 case 0x200A: /* HAIR SPACE */
5169 case 0x202f: /* NARROW NO-BREAK SPACE */
5170 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5171 case 0x3000: /* IDEOGRAPHIC SPACE */
5172 #endif
5173 RRETURN(MATCH_NOMATCH);
5174 }
5175 break;
5176
5177 case OP_HSPACE:
5178 switch(c)
5179 {
5180 default: RRETURN(MATCH_NOMATCH);
5181 case 0x09: /* HT */
5182 case 0x20: /* SPACE */
5183 case 0xa0: /* NBSP */
5184 #ifdef COMPILE_PCRE16
5185 case 0x1680: /* OGHAM SPACE MARK */
5186 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5187 case 0x2000: /* EN QUAD */
5188 case 0x2001: /* EM QUAD */
5189 case 0x2002: /* EN SPACE */
5190 case 0x2003: /* EM SPACE */
5191 case 0x2004: /* THREE-PER-EM SPACE */
5192 case 0x2005: /* FOUR-PER-EM SPACE */
5193 case 0x2006: /* SIX-PER-EM SPACE */
5194 case 0x2007: /* FIGURE SPACE */
5195 case 0x2008: /* PUNCTUATION SPACE */
5196 case 0x2009: /* THIN SPACE */
5197 case 0x200A: /* HAIR SPACE */
5198 case 0x202f: /* NARROW NO-BREAK SPACE */
5199 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5200 case 0x3000: /* IDEOGRAPHIC SPACE */
5201 #endif
5202 break;
5203 }
5204 break;
5205
5206 case OP_NOT_VSPACE:
5207 switch(c)
5208 {
5209 default: break;
5210 case 0x0a: /* LF */
5211 case 0x0b: /* VT */
5212 case 0x0c: /* FF */
5213 case 0x0d: /* CR */
5214 case 0x85: /* NEL */
5215 #ifdef COMPILE_PCRE16
5216 case 0x2028: /* LINE SEPARATOR */
5217 case 0x2029: /* PARAGRAPH SEPARATOR */
5218 #endif
5219 RRETURN(MATCH_NOMATCH);
5220 }
5221 break;
5222
5223 case OP_VSPACE:
5224 switch(c)
5225 {
5226 default: RRETURN(MATCH_NOMATCH);
5227 case 0x0a: /* LF */
5228 case 0x0b: /* VT */
5229 case 0x0c: /* FF */
5230 case 0x0d: /* CR */
5231 case 0x85: /* NEL */
5232 #ifdef COMPILE_PCRE16
5233 case 0x2028: /* LINE SEPARATOR */
5234 case 0x2029: /* PARAGRAPH SEPARATOR */
5235 #endif
5236 break;
5237 }
5238 break;
5239
5240 case OP_NOT_DIGIT:
5241 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5242 break;
5243
5244 case OP_DIGIT:
5245 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5246 break;
5247
5248 case OP_NOT_WHITESPACE:
5249 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5250 break;
5251
5252 case OP_WHITESPACE:
5253 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5254 break;
5255
5256 case OP_NOT_WORDCHAR:
5257 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5258 break;
5259
5260 case OP_WORDCHAR:
5261 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5262 break;
5263
5264 default:
5265 RRETURN(PCRE_ERROR_INTERNAL);
5266 }
5267 }
5268 }
5269 /* Control never gets here */
5270 }
5271
5272 /* If maximizing, it is worth using inline code for speed, doing the type
5273 test once at the start (i.e. keep it out of the loop). Again, keep the
5274 UTF and UCP stuff separate. */
5275
5276 else
5277 {
5278 pp = eptr; /* Remember where we started */
5279
5280 #ifdef SUPPORT_UCP
5281 if (prop_type >= 0)
5282 {
5283 switch(prop_type)
5284 {
5285 case PT_ANY:
5286 for (i = min; i < max; i++)
5287 {
5288 int len = 1;
5289 if (eptr >= md->end_subject)
5290 {
5291 SCHECK_PARTIAL();
5292 break;
5293 }
5294 GETCHARLENTEST(c, eptr, len);
5295 if (prop_fail_result) break;
5296 eptr+= len;
5297 }
5298 break;
5299
5300 case PT_LAMP:
5301 for (i = min; i < max; i++)
5302 {
5303 int chartype;
5304 int len = 1;
5305 if (eptr >= md->end_subject)
5306 {
5307 SCHECK_PARTIAL();
5308 break;
5309 }
5310 GETCHARLENTEST(c, eptr, len);
5311 chartype = UCD_CHARTYPE(c);
5312 if ((chartype == ucp_Lu ||
5313 chartype == ucp_Ll ||
5314 chartype == ucp_Lt) == prop_fail_result)
5315 break;
5316 eptr+= len;
5317 }
5318 break;
5319
5320 case PT_GC:
5321 for (i = min; i < max; i++)
5322 {
5323 int len = 1;
5324 if (eptr >= md->end_subject)
5325 {
5326 SCHECK_PARTIAL();
5327 break;
5328 }
5329 GETCHARLENTEST(c, eptr, len);
5330 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5331 eptr+= len;
5332 }
5333 break;
5334
5335 case PT_PC:
5336 for (i = min; i < max; i++)
5337 {
5338 int len = 1;
5339 if (eptr >= md->end_subject)
5340 {
5341 SCHECK_PARTIAL();
5342 break;
5343 }
5344 GETCHARLENTEST(c, eptr, len);
5345 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5346 eptr+= len;
5347 }
5348 break;
5349
5350 case PT_SC:
5351 for (i = min; i < max; i++)
5352 {
5353 int len = 1;
5354 if (eptr >= md->end_subject)
5355 {
5356 SCHECK_PARTIAL();
5357 break;
5358 }
5359 GETCHARLENTEST(c, eptr, len);
5360 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5361 eptr+= len;
5362 }
5363 break;
5364
5365 case PT_ALNUM:
5366 for (i = min; i < max; i++)
5367 {
5368 int category;
5369 int len = 1;
5370 if (eptr >= md->end_subject)
5371 {
5372 SCHECK_PARTIAL();
5373 break;
5374 }
5375 GETCHARLENTEST(c, eptr, len);
5376 category = UCD_CATEGORY(c);
5377 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5378 break;
5379 eptr+= len;
5380 }
5381 break;
5382
5383 case PT_SPACE: /* Perl space */
5384 for (i = min; i < max; i++)
5385 {
5386 int len = 1;
5387 if (eptr >= md->end_subject)
5388 {
5389 SCHECK_PARTIAL();
5390 break;
5391 }
5392 GETCHARLENTEST(c, eptr, len);
5393 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5394 c == CHAR_FF || c == CHAR_CR)
5395 == prop_fail_result)
5396 break;
5397 eptr+= len;
5398 }
5399 break;
5400
5401 case PT_PXSPACE: /* POSIX space */
5402 for (i = min; i < max; i++)
5403 {
5404 int len = 1;
5405 if (eptr >= md->end_subject)
5406 {
5407 SCHECK_PARTIAL();
5408 break;
5409 }
5410 GETCHARLENTEST(c, eptr, len);
5411 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5412 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5413 == prop_fail_result)
5414 break;
5415 eptr+= len;
5416 }
5417 break;
5418
5419 case PT_WORD:
5420 for (i = min; i < max; i++)
5421 {
5422 int category;
5423 int len = 1;
5424 if (eptr >= md->end_subject)
5425 {
5426 SCHECK_PARTIAL();
5427 break;
5428 }
5429 GETCHARLENTEST(c, eptr, len);
5430 category = UCD_CATEGORY(c);
5431 if ((category == ucp_L || category == ucp_N ||
5432 c == CHAR_UNDERSCORE) == prop_fail_result)
5433 break;
5434 eptr+= len;
5435 }
5436 break;
5437
5438 default:
5439 RRETURN(PCRE_ERROR_INTERNAL);
5440 }
5441
5442 /* eptr is now past the end of the maximum run */
5443
5444 if (possessive) continue;
5445 for(;;)
5446 {
5447 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5449 if (eptr-- == pp) break; /* Stop if tried at original pos */
5450 if (utf) BACKCHAR(eptr);
5451 }
5452 }
5453
5454 /* Match extended Unicode sequences. We will get here only if the
5455 support is in the binary; otherwise a compile-time error occurs. */
5456
5457 else if (ctype == OP_EXTUNI)
5458 {
5459 for (i = min; i < max; i++)
5460 {
5461 int len = 1;
5462 if (eptr >= md->end_subject)
5463 {
5464 SCHECK_PARTIAL();
5465 break;
5466 }
5467 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5468 if (UCD_CATEGORY(c) == ucp_M) break;
5469 eptr += len;
5470 while (eptr < md->end_subject)
5471 {
5472 len = 1;
5473 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5474 if (UCD_CATEGORY(c) != ucp_M) break;
5475 eptr += len;
5476 }
5477 }
5478
5479 /* eptr is now past the end of the maximum run */
5480
5481 if (possessive) continue;
5482
5483 for(;;)
5484 {
5485 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5486 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5487 if (eptr-- == pp) break; /* Stop if tried at original pos */
5488 for (;;) /* Move back over one extended */
5489 {
5490 if (!utf) c = *eptr; else
5491 {
5492 BACKCHAR(eptr);
5493 GETCHAR(c, eptr);
5494 }
5495 if (UCD_CATEGORY(c) != ucp_M) break;
5496 eptr--;
5497 }
5498 }
5499 }
5500
5501 else
5502 #endif /* SUPPORT_UCP */
5503
5504 #ifdef SUPPORT_UTF
5505 if (utf)
5506 {
5507 switch(ctype)
5508 {
5509 case OP_ANY:
5510 if (max < INT_MAX)
5511 {
5512 for (i = min; i < max; i++)
5513 {
5514 if (eptr >= md->end_subject)
5515 {
5516 SCHECK_PARTIAL();
5517 break;
5518 }
5519 if (IS_NEWLINE(eptr)) break;
5520 eptr++;
5521 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5522 }
5523 }
5524
5525 /* Handle unlimited UTF-8 repeat */
5526
5527 else
5528 {
5529 for (i = min; i < max; i++)
5530 {
5531 if (eptr >= md->end_subject)
5532 {
5533 SCHECK_PARTIAL();
5534 break;
5535 }
5536 if (IS_NEWLINE(eptr)) break;
5537 eptr++;
5538 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5539 }
5540 }
5541 break;
5542
5543 case OP_ALLANY:
5544 if (max < INT_MAX)
5545 {
5546 for (i = min; i < max; i++)
5547 {
5548 if (eptr >= md->end_subject)
5549 {
5550 SCHECK_PARTIAL();
5551 break;
5552 }
5553 eptr++;
5554 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5555 }
5556 }
5557 else
5558 {
5559 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5560 SCHECK_PARTIAL();
5561 }
5562 break;
5563
5564 /* The byte case is the same as non-UTF8 */
5565
5566 case OP_ANYBYTE:
5567 c = max - min;
5568 if (c > (unsigned int)(md->end_subject - eptr))
5569 {
5570 eptr = md->end_subject;
5571 SCHECK_PARTIAL();
5572 }
5573 else eptr += c;
5574 break;
5575
5576 case OP_ANYNL:
5577 for (i = min; i < max; i++)
5578 {
5579 int len = 1;
5580 if (eptr >= md->end_subject)
5581 {
5582 SCHECK_PARTIAL();
5583 break;
5584 }
5585 GETCHARLEN(c, eptr, len);
5586 if (c == 0x000d)
5587 {
5588 if (++eptr >= md->end_subject) break;
5589 if (*eptr == 0x000a) eptr++;
5590 }
5591 else
5592 {
5593 if (c != 0x000a &&
5594 (md->bsr_anycrlf ||
5595 (c != 0x000b && c != 0x000c &&
5596 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5597 break;
5598 eptr += len;
5599 }
5600 }
5601 break;
5602
5603 case OP_NOT_HSPACE:
5604 case OP_HSPACE:
5605 for (i = min; i < max; i++)
5606 {
5607 BOOL gotspace;
5608 int len = 1;
5609 if (eptr >= md->end_subject)
5610 {
5611 SCHECK_PARTIAL();
5612 break;
5613 }
5614 GETCHARLEN(c, eptr, len);
5615 switch(c)
5616 {
5617 default: gotspace = FALSE; break;
5618 case 0x09: /* HT */
5619 case 0x20: /* SPACE */
5620 case 0xa0: /* NBSP */
5621 case 0x1680: /* OGHAM SPACE MARK */
5622 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5623 case 0x2000: /* EN QUAD */
5624 case 0x2001: /* EM QUAD */
5625 case 0x2002: /* EN SPACE */
5626 case 0x2003: /* EM SPACE */
5627 case 0x2004: /* THREE-PER-EM SPACE */
5628 case 0x2005: /* FOUR-PER-EM SPACE */
5629 case 0x2006: /* SIX-PER-EM SPACE */
5630 case 0x2007: /* FIGURE SPACE */
5631 case 0x2008: /* PUNCTUATION SPACE */
5632 case 0x2009: /* THIN SPACE */
5633 case 0x200A: /* HAIR SPACE */
5634 case 0x202f: /* NARROW NO-BREAK SPACE */
5635 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5636 case 0x3000: /* IDEOGRAPHIC SPACE */
5637 gotspace = TRUE;
5638 break;
5639 }
5640 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5641 eptr += len;
5642 }
5643 break;
5644
5645 case OP_NOT_VSPACE:
5646 case OP_VSPACE:
5647 for (i = min; i < max; i++)
5648 {
5649 BOOL gotspace;
5650 int len = 1;
5651 if (eptr >= md->end_subject)
5652 {
5653 SCHECK_PARTIAL();
5654 break;
5655 }
5656 GETCHARLEN(c, eptr, len);
5657 switch(c)
5658 {
5659 default: gotspace = FALSE; break;
5660 case 0x0a: /* LF */
5661 case 0x0b: /* VT */
5662 case 0x0c: /* FF */
5663 case 0x0d: /* CR */
5664 case 0x85: /* NEL */
5665 case 0x2028: /* LINE SEPARATOR */
5666 case 0x2029: /* PARAGRAPH SEPARATOR */
5667 gotspace = TRUE;
5668 break;
5669 }
5670 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5671 eptr += len;
5672 }
5673 break;
5674
5675 case OP_NOT_DIGIT:
5676 for (i = min; i < max; i++)
5677 {
5678 int len = 1;
5679 if (eptr >= md->end_subject)
5680 {
5681 SCHECK_PARTIAL();
5682 break;
5683 }
5684 GETCHARLEN(c, eptr, len);
5685 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5686 eptr+= len;
5687 }
5688 break;
5689
5690 case OP_DIGIT:
5691 for (i = min; i < max; i++)
5692 {
5693 int len = 1;
5694 if (eptr >= md->end_subject)
5695 {
5696 SCHECK_PARTIAL();
5697 break;
5698 }
5699 GETCHARLEN(c, eptr, len);
5700 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5701 eptr+= len;
5702 }
5703 break;
5704
5705 case OP_NOT_WHITESPACE:
5706 for (i = min; i < max; i++)
5707 {
5708 int len = 1;
5709 if (eptr >= md->end_subject)
5710 {
5711 SCHECK_PARTIAL();
5712 break;
5713 }
5714 GETCHARLEN(c, eptr, len);
5715 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5716 eptr+= len;
5717 }
5718 break;
5719
5720 case OP_WHITESPACE:
5721 for (i = min; i < max; i++)
5722 {
5723 int len = 1;
5724 if (eptr >= md->end_subject)
5725 {
5726 SCHECK_PARTIAL();
5727 break;
5728 }
5729 GETCHARLEN(c, eptr, len);
5730 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5731 eptr+= len;
5732 }
5733 break;
5734
5735 case OP_NOT_WORDCHAR:
5736 for (i = min; i < max; i++)
5737 {
5738 int len = 1;
5739 if (eptr >= md->end_subject)
5740 {
5741 SCHECK_PARTIAL();
5742 break;
5743 }
5744 GETCHARLEN(c, eptr, len);
5745 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5746 eptr+= len;
5747 }
5748 break;
5749
5750 case OP_WORDCHAR:
5751 for (i = min; i < max; i++)
5752 {
5753 int len = 1;
5754 if (eptr >= md->end_subject)
5755 {
5756 SCHECK_PARTIAL();
5757 break;
5758 }
5759 GETCHARLEN(c, eptr, len);
5760 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5761 eptr+= len;
5762 }
5763 break;
5764
5765 default:
5766 RRETURN(PCRE_ERROR_INTERNAL);
5767 }
5768
5769 /* eptr is now past the end of the maximum run. If possessive, we are
5770 done (no backing up). Otherwise, match at this position; anything other
5771 than no match is immediately returned. For nomatch, back up one
5772 character, unless we are matching \R and the last thing matched was
5773 \r\n, in which case, back up two bytes. */
5774
5775 if (possessive) continue;
5776 for(;;)
5777 {
5778 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5779 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5780 if (eptr-- == pp) break; /* Stop if tried at original pos */
5781 BACKCHAR(eptr);
5782 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5783 eptr[-1] == '\r') eptr--;
5784 }
5785 }
5786 else
5787 #endif /* SUPPORT_UTF */
5788 /* Not UTF mode */
5789 {
5790 switch(ctype)
5791 {
5792 case OP_ANY:
5793 for (i = min; i < max; i++)
5794 {
5795 if (eptr >= md->end_subject)
5796 {
5797 SCHECK_PARTIAL();
5798 break;
5799 }
5800 if (IS_NEWLINE(eptr)) break;
5801 eptr++;
5802 }
5803 break;
5804
5805 case OP_ALLANY:
5806 case OP_ANYBYTE:
5807 c = max - min;
5808 if (c > (unsigned int)(md->end_subject - eptr))
5809 {
5810 eptr = md->end_subject;
5811 SCHECK_PARTIAL();
5812 }
5813 else eptr += c;
5814 break;
5815
5816 case OP_ANYNL:
5817 for (i = min; i < max; i++)
5818 {
5819 if (eptr >= md->end_subject)
5820 {
5821 SCHECK_PARTIAL();
5822 break;
5823 }
5824 c = *eptr;
5825 if (c == 0x000d)
5826 {
5827 if (++eptr >= md->end_subject) break;
5828 if (*eptr == 0x000a) eptr++;
5829 }
5830 else
5831 {
5832 if (c != 0x000a && (md->bsr_anycrlf ||
5833 (c != 0x000b && c != 0x000c && c != 0x0085
5834 #ifdef COMPILE_PCRE16
5835 && c != 0x2028 && c != 0x2029
5836 #endif
5837 ))) break;
5838 eptr++;
5839 }
5840 }
5841 break;
5842
5843 case OP_NOT_HSPACE:
5844 for (i = min; i < max; i++)
5845 {
5846 if (eptr >= md->end_subject)
5847 {
5848 SCHECK_PARTIAL();
5849 break;
5850 }
5851 c = *eptr;
5852 if (c == 0x09 || c == 0x20 || c == 0xa0
5853 #ifdef COMPILE_PCRE16
5854 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
5855 || c == 0x202f || c == 0x205f || c == 0x3000
5856 #endif
5857 ) break;
5858 eptr++;
5859 }
5860 break;
5861
5862 case OP_HSPACE:
5863 for (i = min; i < max; i++)
5864 {
5865 if (eptr >= md->end_subject)
5866 {
5867 SCHECK_PARTIAL();
5868 break;
5869 }
5870 c = *eptr;
5871 if (c != 0x09 && c != 0x20 && c != 0xa0
5872 #ifdef COMPILE_PCRE16
5873 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
5874 && c != 0x202f && c != 0x205f && c != 0x3000
5875 #endif
5876 ) break;
5877 eptr++;
5878 }
5879 break;
5880
5881 case OP_NOT_VSPACE:
5882 for (i = min; i < max; i++)
5883 {
5884 if (eptr >= md->end_subject)
5885 {
5886 SCHECK_PARTIAL();
5887 break;
5888 }
5889 c = *eptr;
5890 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
5891 #ifdef COMPILE_PCRE16
5892 || c == 0x2028 || c == 0x2029
5893 #endif
5894 ) break;
5895 eptr++;
5896 }
5897 break;
5898
5899 case OP_VSPACE:
5900 for (i = min; i < max; i++)
5901 {
5902 if (eptr >= md->end_subject)
5903 {
5904 SCHECK_PARTIAL();
5905 break;
5906 }
5907 c = *eptr;
5908 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
5909 #ifdef COMPILE_PCRE16
5910 && c != 0x2028 && c != 0x2029
5911 #endif
5912 ) break;
5913 eptr++;
5914 }
5915 break;
5916
5917 case OP_NOT_DIGIT:
5918 for (i = min; i < max; i++)
5919 {
5920 if (eptr >= md->end_subject)
5921 {
5922 SCHECK_PARTIAL();
5923 break;
5924 }
5925 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5926 eptr++;
5927 }
5928 break;
5929
5930 case OP_DIGIT:
5931 for (i = min; i < max; i++)
5932 {
5933 if (eptr >= md->end_subject)
5934 {
5935 SCHECK_PARTIAL();
5936 break;
5937 }
5938 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5939 eptr++;
5940 }
5941 break;
5942
5943 case OP_NOT_WHITESPACE:
5944 for (i = min; i < max; i++)
5945 {
5946 if (eptr >= md->end_subject)
5947 {
5948 SCHECK_PARTIAL();
5949 break;
5950 }
5951 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5952 eptr++;
5953 }
5954 break;
5955
5956 case OP_WHITESPACE:
5957 for (i = min; i < max; i++)
5958 {
5959 if (eptr >= md->end_subject)
5960 {
5961 SCHECK_PARTIAL();
5962 break;
5963 }
5964 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5965 eptr++;
5966 }
5967 break;
5968
5969 case OP_NOT_WORDCHAR:
5970 for (i = min; i < max; i++)
5971 {
5972 if (eptr >= md->end_subject)
5973 {
5974 SCHECK_PARTIAL();
5975 break;
5976 }
5977 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5978 eptr++;
5979 }
5980 break;
5981
5982 case OP_WORDCHAR:
5983 for (i = min; i < max; i++)
5984 {
5985 if (eptr >= md->end_subject)
5986 {
5987 SCHECK_PARTIAL();
5988 break;
5989 }
5990 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
5991 eptr++;
5992 }
5993 break;
5994
5995 default:
5996 RRETURN(PCRE_ERROR_INTERNAL);
5997 }
5998
5999 /* eptr is now past the end of the maximum run. If possessive, we are
6000 done (no backing up). Otherwise, match at this position; anything other
6001 than no match is immediately returned. For nomatch, back up one
6002 character (byte), unless we are matching \R and the last thing matched
6003 was \r\n, in which case, back up two bytes. */
6004
6005 if (possessive) continue;
6006 while (eptr >= pp)
6007 {
6008 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6010 eptr--;
6011 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6012 eptr[-1] == '\r') eptr--;
6013 }
6014 }
6015
6016 /* Get here if we can't make it match with any permitted repetitions */
6017
6018 RRETURN(MATCH_NOMATCH);
6019 }
6020 /* Control never gets here */
6021
6022 /* There's been some horrible disaster. Arrival here can only mean there is
6023 something seriously wrong in the code above or the OP_xxx definitions. */
6024
6025 default:
6026 DPRINTF(("Unknown opcode %d\n", *ecode));
6027 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6028 }
6029
6030 /* Do not stick any code in here without much thought; it is assumed
6031 that "continue" in the code above comes out to here to repeat the main
6032 loop. */
6033
6034 } /* End of main loop */
6035 /* Control never reaches here */
6036
6037
6038 /* When compiling to use the heap rather than the stack for recursive calls to
6039 match(), the RRETURN() macro jumps here. The number that is saved in
6040 frame->Xwhere indicates which label we actually want to return to. */
6041
6042 #ifdef NO_RECURSE
6043 #define LBL(val) case val: goto L_RM##val;
6044 HEAP_RETURN:
6045 switch (frame->Xwhere)
6046 {
6047 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6048 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6049 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6050 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6051 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6052 LBL(65) LBL(66)
6053 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6054 LBL(21)
6055 #endif
6056 #ifdef SUPPORT_UTF
6057 LBL(16) LBL(18) LBL(20)
6058 LBL(22) LBL(23) LBL(28) LBL(30)
6059 LBL(32) LBL(34) LBL(42) LBL(46)
6060 #ifdef SUPPORT_UCP
6061 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6062 LBL(59) LBL(60) LBL(61) LBL(62)
6063 #endif /* SUPPORT_UCP */
6064 #endif /* SUPPORT_UTF */
6065 default:
6066 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6067
6068 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6069
6070 return PCRE_ERROR_INTERNAL;
6071 }
6072 #undef LBL
6073 #endif /* NO_RECURSE */
6074 }
6075
6076
6077 /***************************************************************************
6078 ****************************************************************************
6079 RECURSION IN THE match() FUNCTION
6080
6081 Undefine all the macros that were defined above to handle this. */
6082
6083 #ifdef NO_RECURSE /* these names are macros only in the heap-frame build */
6084 #undef eptr
6085 #undef ecode
6086 #undef mstart
6087 #undef offset_top
6088 #undef eptrb
6089 #undef flags
6090
6091 #undef callpat
6092 #undef charptr
6093 #undef data
6094 #undef next
6095 #undef pp
6096 #undef prev
6097 #undef saved_eptr
6098
6099 #undef new_recursive
6100
6101 #undef cur_is_word
6102 #undef condition
6103 #undef prev_is_word
6104
6105 #undef ctype
6106 #undef length
6107 #undef max
6108 #undef min
6109 #undef number
6110 #undef offset
6111 #undef op
6112 #undef save_capture_last
6113 #undef save_offset1
6114 #undef save_offset2
6115 #undef save_offset3
6116 #undef stacksave
6117
6118 #undef newptrb
6119
6120 #endif /* NO_RECURSE */
6121
6122 /* These two are defined as macros in both cases */
6123
6124 #undef fc
6125 #undef fi
6126
6127 /***************************************************************************
6128 ***************************************************************************/
6129
6130
6131
6132 /*************************************************
6133 * Execute a Regular Expression *
6134 *************************************************/
6135
6136 /* This function applies a compiled re to a subject string and picks out
6137 portions of the string if it matches. Two elements in the vector are set for
6138 each substring: the offsets to the start and end of the substring.
6139
6140 Arguments:
6141 argument_re points to the compiled expression
6142 extra_data points to extra data or is NULL
6143 subject points to the subject string
6144 length length of subject string (may contain binary zeros)
6145 start_offset where to start in the subject string
6146 options option bits
6147 offsets points to a vector of ints to be filled in with offsets
6148 offsetcount the number of elements in the vector
6149
6150 Returns: > 0 => success; value is the number of elements filled in
6151 = 0 => success, but offsets is not big enough
6152 -1 => failed to match
6153 < -1 => some kind of unexpected problem
6154 */
6155
6156 #ifdef COMPILE_PCRE8
6157 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6158 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6159 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6160 int offsetcount)
6161 #else
6162 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6163 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6164 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6165 int offsetcount)
6166 #endif
6167 {
6168 int rc, ocount, arg_offset_max;
6169 int newline;
6170 BOOL using_temporary_offsets = FALSE;
6171 BOOL anchored;
6172 BOOL startline;
6173 BOOL firstline;
6174 BOOL utf;
6175 BOOL has_first_char = FALSE;
6176 BOOL has_req_char = FALSE;
6177 pcre_uchar first_char = 0;
6178 pcre_uchar first_char2 = 0;
6179 pcre_uchar req_char = 0;
6180 pcre_uchar req_char2 = 0;
6181 match_data match_block;
6182 match_data *md = &match_block;
6183 const pcre_uint8 *tables;
6184 const pcre_uint8 *start_bits = NULL;
6185 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6186 PCRE_PUCHAR end_subject;
6187 PCRE_PUCHAR start_partial = NULL;
6188 PCRE_PUCHAR req_char_ptr = start_match - 1;
6189
6190 const pcre_study_data *study;
6191 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6192
6193 /* Plausibility checks */
6194
6195 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6196 if (re == NULL || subject == NULL ||
6197 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6198 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6199 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6200
6201 /* These two settings are used in the code for checking a UTF-8 string that
6202 follows immediately afterwards. Other values in the md block are used only
6203 during "normal" pcre_exec() processing, not when the JIT support is in use,
6204 so they are set up later. */
6205
6206 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6207 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6208 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6209 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6210
6211 /* Check a UTF-8 (or, in the 16-bit library, UTF-16) string if required. Pass
6212 back the character offset and error code for an invalid string if a results vector is available. */
6213
6214 #ifdef SUPPORT_UTF
6215 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6216 {
6217 int erroroffset;
6218 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6219 if (errorcode != 0)
6220 {
6221 if (offsetcount >= 2)
6222 {
6223 offsets[0] = erroroffset;
6224 offsets[1] = errorcode;
6225 }
6226 #ifdef COMPILE_PCRE16
6227 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6228 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6229 #else
6230 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6231 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6232 #endif
6233 }
6234
6235 /* Check that a start_offset points to the start of a UTF character. */
6236 if (start_offset > 0 && start_offset < length &&
6237 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6238 return PCRE_ERROR_BADUTF8_OFFSET;
6239 }
6240 #endif
6241
6242 /* If the pattern was successfully studied with JIT support, run the JIT
6243 executable instead of the rest of this function. Most options must be set at
6244 compile time for the JIT code to be usable. Fall back to the normal code path if
6245 an unsupported flag is set. In particular, JIT does not support partial
6246 matching. */
6247
6248 #ifdef SUPPORT_JIT
6249 if (extra_data != NULL
6250 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6251 && extra_data->executable_jit != NULL
6252 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6253 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6254 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6255 return PRIV(jit_exec)(re, extra_data->executable_jit,
6256 (const pcre_uchar *)subject, length, start_offset, options,
6257 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6258 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6259 #endif
6260
6261 /* Carry on with non-JIT matching. This information is for finding all the
6262 numbers associated with a given name, for condition testing. */
6263
6264 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6265 md->name_count = re->name_count;
6266 md->name_entry_size = re->name_entry_size;
6267
6268 /* Fish out the optional data from the extra_data structure, first setting
6269 the default values. */
6270
6271 study = NULL;
6272 md->match_limit = MATCH_LIMIT;
6273 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6274 md->callout_data = NULL;
6275
6276 /* The table pointer is always in native byte order. */
6277
6278 tables = re->tables;
6279
6280 if (extra_data != NULL)
6281 {
6282 register unsigned int flags = extra_data->flags;
6283 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6284 study = (const pcre_study_data *)extra_data->study_data;
6285 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6286 md->match_limit = extra_data->match_limit;
6287 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6288 md->match_limit_recursion = extra_data->match_limit_recursion;
6289 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6290 md->callout_data = extra_data->callout_data;
6291 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6292 }
6293
6294 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6295 is a feature that makes it possible to save compiled regex and re-use them
6296 in other programs later. */
6297
6298 if (tables == NULL) tables = PRIV(default_tables);
6299
6300 /* Check that the first field in the block is the magic number. If it is not,
6301 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6302 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6303 means that the pattern is likely compiled with different endianness. */
6304
6305 if (re->magic_number != MAGIC_NUMBER)
6306 return re->magic_number == REVERSED_MAGIC_NUMBER?
6307 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6308 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6309
6310 /* Set up other data */
6311
6312 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6313 startline = (re->flags & PCRE_STARTLINE) != 0;
6314 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6315
6316 /* The code starts after the real_pcre block and the capture name table. */
6317
6318 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6319 re->name_count * re->name_entry_size;
6320
6321 md->start_subject = (PCRE_PUCHAR)subject;
6322 md->start_offset = start_offset;
6323 md->end_subject = md->start_subject + length;
6324 end_subject = md->end_subject;
6325
6326 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6327 md->use_ucp = (re->options & PCRE_UCP) != 0;
6328 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6329 md->ignore_skip_arg = FALSE;
6330
6331 /* Some options are unpacked into BOOL variables in the hope that testing
6332 them will be faster than individual option bits. */
6333
6334 md->notbol = (options & PCRE_NOTBOL) != 0;
6335 md->noteol = (options & PCRE_NOTEOL) != 0;
6336 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6337 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6338
6339 md->hitend = FALSE;
6340 md->mark = md->nomatch_mark = NULL; /* In case never set */
6341
6342 md->recursive = NULL; /* No recursion at top level */
6343 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6344
6345 md->lcc = tables + lcc_offset;
6346 md->fcc = tables + fcc_offset;
6347 md->ctypes = tables + ctypes_offset;
6348
6349 /* Handle different \R options. */
6350
6351 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6352 {
6353 case 0:
6354 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6355 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6356 else
6357 #ifdef BSR_ANYCRLF
6358 md->bsr_anycrlf = TRUE;
6359 #else
6360 md->bsr_anycrlf = FALSE;
6361 #endif
6362 break;
6363
6364 case PCRE_BSR_ANYCRLF:
6365 md->bsr_anycrlf = TRUE;
6366 break;
6367
6368 case PCRE_BSR_UNICODE:
6369 md->bsr_anycrlf = FALSE;
6370 break;
6371
6372 default: return PCRE_ERROR_BADNEWLINE;
6373 }
6374
6375 /* Handle different types of newline. The three bits give eight cases. If
6376 nothing is set at run time, whatever was used at compile time applies. */
6377
6378 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6379 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6380 {
6381 case 0: newline = NEWLINE; break; /* Compile-time default */
6382 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6383 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6384 case PCRE_NEWLINE_CR+
6385 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6386 case PCRE_NEWLINE_ANY: newline = -1; break;
6387 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6388 default: return PCRE_ERROR_BADNEWLINE;
6389 }
6390
6391 if (newline == -2)
6392 {
6393 md->nltype = NLTYPE_ANYCRLF;
6394 }
6395 else if (newline < 0)
6396 {
6397 md->nltype = NLTYPE_ANY;
6398 }
6399 else
6400 {
6401 md->nltype = NLTYPE_FIXED;
6402 if (newline > 255)
6403 {
6404 md->nllen = 2;
6405 md->nl[0] = (newline >> 8) & 255;
6406 md->nl[1] = newline & 255;
6407 }
6408 else
6409 {
6410 md->nllen = 1;
6411 md->nl[0] = newline;
6412 }
6413 }
6414
6415 /* Partial matching was originally supported only for a restricted set of
6416 regexes; from release 8.00 there are no restrictions, but the bits are still
6417 defined (though never set). So there's no harm in leaving this code. */
6418
6419 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6420 return PCRE_ERROR_BADPARTIAL;
6421
6422 /* If the expression has got more back references than the offsets supplied can
6423 hold, we get a temporary chunk of working store to use during the matching.
6424 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6425 of 3. */
6426
6427 ocount = offsetcount - (offsetcount % 3);
6428 arg_offset_max = (2*ocount)/3;
6429
6430 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6431 {
6432 ocount = re->top_backref * 3 + 3;
6433 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6434 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6435 using_temporary_offsets = TRUE;
6436 DPRINTF(("Got memory to hold back references\n"));
6437 }
6438 else md->offset_vector = offsets;
6439
6440 md->offset_end = ocount;
6441 md->offset_max = (2*ocount)/3;
6442 md->offset_overflow = FALSE;
6443 md->capture_last = -1;
6444
6445 /* Reset the working variable associated with each extraction. These should
6446 never be used unless previously set, but they get saved and restored, and so we
6447 initialize them to avoid reading uninitialized locations. Also, unset the
6448 offsets for the matched string. This is really just for tidiness with callouts,
6449 in case they inspect these fields. */
6450
6451 if (md->offset_vector != NULL)
6452 {
6453 register int *iptr = md->offset_vector + ocount;
6454 register int *iend = iptr - re->top_bracket;
6455 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6456 while (--iptr >= iend) *iptr = -1;
6457 md->offset_vector[0] = md->offset_vector[1] = -1;
6458 }
6459
6460 /* Set up the first character to match, if available. The first_char value is
6461 never set for an anchored regular expression, but the anchoring may be forced
6462 at run time, so we have to test for anchoring. The first char may be unset for
6463 an unanchored pattern, of course. If there's no first char and the pattern was
6464 studied, there may be a bitmap of possible first characters. */
6465
6466 if (!anchored)
6467 {
6468 if ((re->flags & PCRE_FIRSTSET) != 0)
6469 {
6470 has_first_char = TRUE;
6471 first_char = first_char2 = re->first_char;
6472 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6473 {
6474 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6475 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6476 if (utf && first_char > 127)
6477 first_char2 = UCD_OTHERCASE(first_char);
6478 #endif
6479 }
6480 }
6481 else
6482 if (!startline && study != NULL &&
6483 (study->flags & PCRE_STUDY_MAPPED) != 0)
6484 start_bits = study->start_bits;
6485 }
6486
6487 /* For anchored or unanchored matches, there may be a "last known required
6488 character" set. */
6489
6490 if ((re->flags & PCRE_REQCHSET) != 0)
6491 {
6492 has_req_char = TRUE;
6493 req_char = req_char2 = re->req_char;
6494 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6495 {
6496 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6497 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6498 if (utf && req_char > 127)
6499 req_char2 = UCD_OTHERCASE(req_char);
6500 #endif
6501 }
6502 }
6503
6504
6505 /* ==========================================================================*/
6506
6507 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6508 the loop runs just once. */
6509
6510 for(;;)
6511 {
6512 PCRE_PUCHAR save_end_subject = end_subject;
6513 PCRE_PUCHAR new_start_match;
6514
6515 /* If firstline is TRUE, the start of the match is constrained to the first
6516 line of a multiline string. That is, the match must be before or at the first
6517 newline. Implement this by temporarily adjusting end_subject so that we stop
6518 scanning at a newline. If the match fails at the newline, later code breaks
6519 this loop. */
6520
6521 if (firstline)
6522 {
6523 PCRE_PUCHAR t = start_match;
6524 #ifdef SUPPORT_UTF
6525 if (utf)
6526 {
6527 while (t < md->end_subject && !IS_NEWLINE(t))
6528 {
6529 t++;
6530 ACROSSCHAR(t < end_subject, *t, t++);
6531 }
6532 }
6533 else
6534 #endif
6535 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6536 end_subject = t;
6537 }
6538
6539 /* There are some optimizations that avoid running the match if a known
6540 starting point is not found, or if a known later character is not present.
6541 However, there is an option that disables these, for testing and for ensuring
6542 that all callouts do actually occur. The option can be set in the regex by
6543 (*NO_START_OPT) or passed in match-time options. */
6544
6545 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6546 {
6547 /* Advance to a unique first char if there is one. */
6548
6549 if (has_first_char)
6550 {
6551 if (first_char != first_char2)
6552 while (start_match < end_subject &&
6553 *start_match != first_char && *start_match != first_char2)
6554 start_match++;
6555 else
6556 while (start_match < end_subject && *start_match != first_char)
6557 start_match++;
6558 }
6559
6560 /* Or to just after a linebreak for a multiline match */
6561
6562 else if (startline)
6563 {
6564 if (start_match > md->start_subject + start_offset)
6565 {
6566 #ifdef SUPPORT_UTF
6567 if (utf)
6568 {
6569 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6570 {
6571 start_match++;
6572 ACROSSCHAR(start_match < end_subject, *start_match,
6573 start_match++);
6574 }
6575 }
6576 else
6577 #endif
6578 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6579 start_match++;
6580
6581 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6582 and we are now at a LF, advance the match position by one more character.
6583 */
6584
6585 if (start_match[-1] == CHAR_CR &&
6586 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6587 start_match < end_subject &&
6588 *start_match == CHAR_NL)
6589 start_match++;
6590 }
6591 }
6592
6593 /* Or to a non-unique first byte after study */
6594
6595 else if (start_bits != NULL)
6596 {
6597 while (start_match < end_subject)
6598 {
6599 register unsigned int c = *start_match;
6600 #ifndef COMPILE_PCRE8
6601 if (c > 255) c = 255;
6602 #endif
6603 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6604 {
6605 start_match++;
6606 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6607 /* In non 8-bit mode, the iteration will stop for
6608 characters > 255 at the beginning or not stop at all. */
6609 if (utf)
6610 ACROSSCHAR(start_match < end_subject, *start_match,
6611 start_match++);
6612 #endif
6613 }
6614 else break;
6615 }
6616 }
6617 } /* Starting optimizations */
6618
6619 /* Restore fudged end_subject */
6620
6621 end_subject = save_end_subject;
6622
6623 /* The following two optimizations are disabled for partial matching or if
6624 disabling is explicitly requested. */
6625
6626 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6627 {
6628 /* If the pattern was studied, a minimum subject length may be set. This is
6629 a lower bound; there may be no string of exactly that length that matches the
6630 pattern. Although the value is, strictly, in characters, we treat it as
6631 bytes to avoid spending too much time in this optimization. */
6632
6633 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6634 (pcre_uint32)(end_subject - start_match) < study->minlength)
6635 {
6636 rc = MATCH_NOMATCH;
6637 break;
6638 }
6639
6640 /* If req_char is set, we know that that character must appear in the
6641 subject for the match to succeed. If the first character is set, req_char
6642 must be later in the subject; otherwise the test starts at the match point.
6643 This optimization can save a huge amount of backtracking in patterns with
6644 nested unlimited repeats that aren't going to match. Writing separate code
6645 for cased/caseless versions makes it go faster, as does using an
6646 autoincrement and backing off on a match.
6647
6648 HOWEVER: when the subject string is very, very long, searching to its end
6649 can take a long time, and give bad performance on quite ordinary patterns.
6650 This showed up when somebody was matching something like /^\d+C/ on a
6651 32-megabyte string... so we don't do this when the string is sufficiently
6652 long. */
6653
6654 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6655 {
6656 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6657
6658 /* We don't need to repeat the search if we haven't yet reached the
6659 place we found it at last time. */
6660
6661 if (p > req_char_ptr)
6662 {
6663 if (req_char != req_char2)
6664 {
6665 while (p < end_subject)
6666 {
6667 register int pp = *p++;
6668 if (pp == req_char || pp == req_char2) { p--; break; }
6669 }
6670 }
6671 else
6672 {
6673 while (p < end_subject)
6674 {
6675 if (*p++ == req_char) { p--; break; }
6676 }
6677 }
6678
6679 /* If we can't find the required character, break the matching loop,
6680 forcing a match failure. */
6681
6682 if (p >= end_subject)
6683 {
6684 rc = MATCH_NOMATCH;
6685 break;
6686 }
6687
6688 /* If we have found the required character, save the point where we
6689 found it, so that we don't search again next time round the loop if
6690 the start hasn't passed this character yet. */
6691
6692 req_char_ptr = p;
6693 }
6694 }
6695 }
6696
6697 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6698 printf(">>>> Match against: ");
6699 pchars(start_match, end_subject - start_match, TRUE, md);
6700 printf("\n");
6701 #endif
6702
6703 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6704 first starting point for which a partial match was found. */
6705
6706 md->start_match_ptr = start_match;
6707 md->start_used_ptr = start_match;
6708 md->match_call_count = 0;
6709 md->match_function_type = 0;
6710 md->end_offset_top = 0;
6711 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6712 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6713
6714 switch(rc)
6715 {
6716 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6717 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6718 entirely. The only way we can do that is to re-do the match at the same
6719 point, with a flag to force SKIP with an argument to be ignored. Just
6720 treating this case as NOMATCH does not work because it does not check other
6721 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6722
6723 case MATCH_SKIP_ARG:
6724 new_start_match = start_match;
6725 md->ignore_skip_arg = TRUE;
6726 break;
6727
6728 /* SKIP passes back the next starting point explicitly, but if it is the
6729 same as the match we have just done, treat it as NOMATCH. */
6730
6731 case MATCH_SKIP:
6732 if (md->start_match_ptr != start_match)
6733 {
6734 new_start_match = md->start_match_ptr;
6735 break;
6736 }
6737 /* Fall through */
6738
6739 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6740 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6741
6742 case MATCH_NOMATCH:
6743 case MATCH_PRUNE:
6744 case MATCH_THEN:
6745