/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory | Revision Log


Revision 882 - (show annotations)
Sun Jan 15 18:45:27 2012 UTC (7 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 211676 byte(s)
Fix nested *MARK bug (nothing shown for /(?=(*:x))((*:y)q|)/ etc.)
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Two special kinds of call to match() are signalled by a value stored in
md->match_function_type instead of by an extra argument. This avoids another
stack variable, because stack usage is to be discouraged. */

#define MATCH_CONDASSERT     1   /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2   /* Could-be-empty unlimited repeat group */

/* Non-error results of match(). Error results are the externally defined
PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Special results that are used only internally by match(). They are far
enough below zero to keep clear of the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_KETRPOS      (-997)
#define MATCH_ONCE         (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)

/* The largest number of offset-vector ints that are saved on the stack for a
recursive call; a bigger offset vector is saved using malloc instead. Keep this
a multiple of 3, because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* Minimum and maximum counts for the common repeats. In rep_max, 0 stands for
"no upper limit", so the six entries pair up as {0,inf}, {0,inf}, {1,inf},
{1,inf}, {0,1}, {0,1}. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if the requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 {
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 if (eptr + length > md->end_subject) return -1;
221 while (length-- > 0) if (*p++ != *eptr++) return -1;
222 }
223
224 return (int)(eptr - eptr_start);
225 }
226
227
228
229 /***************************************************************************
230 ****************************************************************************
231 RECURSION IN THE match() FUNCTION
232
233 The match() function is highly recursive, though not every recursive call
234 increases the recursive depth. Nevertheless, some regular expressions can cause
235 it to recurse to a great depth. I was writing for Unix, so I just let it call
236 itself recursively. This uses the stack for saving everything that has to be
237 saved for a recursive call. On Unix, the stack can be large, and this works
238 fine.
239
240 It turns out that on some non-Unix-like systems there are problems with
241 programs that use a lot of stack. (This despite the fact that every last chip
242 has oodles of memory these days, and techniques for extending the stack have
243 been known for decades.) So....
244
245 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246 calls by keeping local variables that need to be preserved in blocks of memory
247 obtained from malloc() instead of on the stack. Macros are used to
248 achieve this so that the actual code doesn't look very different to what it
249 always used to.
250
251 The original heap-recursive code used longjmp(). However, it seems that this
252 can be very slow on some operating systems. Following a suggestion from Stan
253 Switzer, the use of longjmp() has been abolished, at the cost of having to
254 provide a unique number for each call to RMATCH. There is no way of generating
255 a sequence of numbers at compile time in C. I have given them names, to make
256 them stand out more clearly.
257
258 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 tests. Furthermore, not using longjmp() means that local dynamic variables
261 don't have indeterminate values; this has meant that the frame size can be
262 reduced because the result can be "passed back" by straight setting of the
263 variable instead of being passed in the frame.
264 ****************************************************************************
265 ***************************************************************************/
266
/* Numbers for RMATCH calls: each RMn value tags one place from which RMATCH is
invoked. In the NO_RECURSE versions of the macros the tag is saved in the
frame's Xwhere field and is also pasted into a label name (L_RMn) at the point
of the call. When this list is changed, the code at HEAP_RETURN below must be
updated in sync. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54,
  RM55,    RM56, RM57, RM58, RM59, RM60,
  RM61,    RM62, RM63, RM64, RM65, RM66
};
277
278 /* These versions of the macros use the stack, as normal. There are debugging
279 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 actually used in this definition. */
281
282 #ifndef NO_RECURSE
/* With real recursion, the variables marked REGISTER can be register
variables. */

#define REGISTER register

/* RMATCH(subject pointer, code pointer, offset top, match data, eptr block,
call tag) calls match() one level deeper and leaves the result in rrc. The
current mstart and rdepth are picked up from the calling scope. The call tag is
accepted but not used by these definitions. RRETURN(value) returns value from
the current incarnation of match(). */

#ifdef PCRE_DEBUG

/* Debugging versions: report each call and each return using printf(). */

#define RMATCH(r_eptr,r_ecode,r_otop,r_md,r_eptrb,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,r_otop,r_md,r_eptrb,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", r_value, __LINE__); \
  return r_value; \
  }

#else

/* Production versions: a plain call and a plain return. */

#define RMATCH(r_eptr,r_ecode,r_otop,r_md,r_eptrb,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,r_otop,r_md,r_eptrb,rdepth+1)

#define RRETURN(r_value) return r_value

#endif
302
303 #else
304
305
306 /* These versions of the macros manage a private stack on the heap. Note that
307 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308 argument of match(), which never changes. */
309
/* Frame variables live in heap memory in this build, so REGISTER expands to
nothing. */

#define REGISTER

/* RMATCH simulates a recursive call of match(). It obtains a new frame with
stack_malloc, records in the current frame (Xwhere) the tag rw of this call,
copies the "arguments" into the new frame, makes it the current frame, and
jumps to HEAP_RECURSE. The label L_<rw> that it defines is where execution
resumes after the simulated return.

Arguments:
  ra   subject pointer for the new frame (Xeptr)
  rb   code pointer for the new frame (Xecode)
  rc   offset top for the new frame (Xoffset_top)
  rd   not used: it is the md argument of match(), which never changes
  re   eptr block pointer for the new frame (Xeptrb)
  rw   one of the RMn tags, unique to the calling site

If no memory can be obtained, the current frame returns PCRE_ERROR_NOMEMORY by
means of RRETURN. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). It frees the current frame and
makes the previous one current. If there is a previous frame, the value is put
into rrc and control goes to HEAP_RETURN; otherwise this was the top level, and
the value is returned from the real function.

match() also uses RRETURN when the allocation of its very first frame fails, at
which point frame is NULL. That case must return at once, without touching the
frame: there is nothing to unlink or free. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe == NULL) return ra;\
  frame = oldframe->Xprevframe;\
  (PUBL(stack_free))(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
343
344
345 /* Structure for remembering the local variables in a private frame */
346
347 typedef struct heapframe {
348 struct heapframe *Xprevframe;
349
350 /* Function arguments that may change */
351
352 PCRE_PUCHAR Xeptr;
353 const pcre_uchar *Xecode;
354 PCRE_PUCHAR Xmstart;
355 int Xoffset_top;
356 eptrblock *Xeptrb;
357 unsigned int Xrdepth;
358
359 /* Function local variables */
360
361 PCRE_PUCHAR Xcallpat;
362 #ifdef SUPPORT_UTF
363 PCRE_PUCHAR Xcharptr;
364 #endif
365 PCRE_PUCHAR Xdata;
366 PCRE_PUCHAR Xnext;
367 PCRE_PUCHAR Xpp;
368 PCRE_PUCHAR Xprev;
369 PCRE_PUCHAR Xsaved_eptr;
370
371 recursion_info Xnew_recursive;
372
373 BOOL Xcur_is_word;
374 BOOL Xcondition;
375 BOOL Xprev_is_word;
376
377 #ifdef SUPPORT_UCP
378 int Xprop_type;
379 int Xprop_value;
380 int Xprop_fail_result;
381 int Xoclength;
382 pcre_uchar Xocchars[6];
383 #endif
384
385 int Xcodelink;
386 int Xctype;
387 unsigned int Xfc;
388 int Xfi;
389 int Xlength;
390 int Xmax;
391 int Xmin;
392 int Xnumber;
393 int Xoffset;
394 int Xop;
395 int Xsave_capture_last;
396 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397 int Xstacksave[REC_STACK_SAVE_MAX];
398
399 eptrblock Xnewptrb;
400
401 /* Where to jump back to */
402
403 int Xwhere;
404
405 } heapframe;
406
407 #endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414
415 /*************************************************
416 * Match from current position *
417 *************************************************/
418
419 /* This function is called recursively in many circumstances. Whenever it
420 returns a negative (error) response, the outer incarnation must also return the
421 same response. */
422
/* These two macros wrap up the partial-matching test, which is needed at
several places in match(). Both do nothing unless md->partial is non-zero and
the subject pointer is beyond md->start_used_ptr (that is, something has been
inspected). When they fire, they set md->hitend, and if md->partial is greater
than 1 (hard partial matching) they return PCRE_ERROR_PARTIAL at once by means
of RRETURN.

CHECK_PARTIAL() also requires eptr to have reached md->end_subject.
SCHECK_PARTIAL() omits that test; it is for places where it is already known
that the end of the subject has been reached.

Each macro expands to a bare "if" statement. It should therefore be used as a
complete statement of its own, and not as the unbraced body of an "if" that has
an "else". */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
444
445
446 /* Performance note: It might be tempting to extract commonly used fields from
447 the md structure (e.g. utf, end_subject) into individual variables to improve
448 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449 made performance worse.
450
451 Arguments:
452 eptr pointer to current character in subject
453 ecode pointer to current position in compiled code
454 mstart pointer to the current match start position (can be modified
455 by encountering \K)
456 offset_top current top pointer
457 md pointer to "static" info for the match
458 eptrb pointer to chain of blocks containing eptr at start of
459 brackets - for testing for empty matches
460 rdepth the recursion depth
461
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
467 */
468
469 static int
470 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 unsigned int rdepth)
473 {
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
477
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf; /* Local copy of UTF flag for speed */
482
483 BOOL minimize, possessive; /* Quantifier options */
484 BOOL caseless;
485 int condcode;
486
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame" which is obtained from
489 heap storage. Set up the top-level frame here; others are obtained from the
490 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491
492 #ifdef NO_RECURSE
493 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
494 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 frame->Xprevframe = NULL; /* Marks the top level */
496
497 /* Copy in the original argument variables */
498
499 frame->Xeptr = eptr;
500 frame->Xecode = ecode;
501 frame->Xmstart = mstart;
502 frame->Xoffset_top = offset_top;
503 frame->Xeptrb = eptrb;
504 frame->Xrdepth = rdepth;
505
506 /* This is where control jumps back to in order to effect "recursion" */
507
508 HEAP_RECURSE:
509
510 /* Macros make the argument variables come from the current frame */
511
512 #define eptr frame->Xeptr
513 #define ecode frame->Xecode
514 #define mstart frame->Xmstart
515 #define offset_top frame->Xoffset_top
516 #define eptrb frame->Xeptrb
517 #define rdepth frame->Xrdepth
518
519 /* Ditto for the local variables */
520
521 #ifdef SUPPORT_UTF
522 #define charptr frame->Xcharptr
523 #endif
524 #define callpat frame->Xcallpat
525 #define codelink frame->Xcodelink
526 #define data frame->Xdata
527 #define next frame->Xnext
528 #define pp frame->Xpp
529 #define prev frame->Xprev
530 #define saved_eptr frame->Xsaved_eptr
531
532 #define new_recursive frame->Xnew_recursive
533
534 #define cur_is_word frame->Xcur_is_word
535 #define condition frame->Xcondition
536 #define prev_is_word frame->Xprev_is_word
537
538 #ifdef SUPPORT_UCP
539 #define prop_type frame->Xprop_type
540 #define prop_value frame->Xprop_value
541 #define prop_fail_result frame->Xprop_fail_result
542 #define oclength frame->Xoclength
543 #define occhars frame->Xocchars
544 #endif
545
546 #define ctype frame->Xctype
547 #define fc frame->Xfc
548 #define fi frame->Xfi
549 #define length frame->Xlength
550 #define max frame->Xmax
551 #define min frame->Xmin
552 #define number frame->Xnumber
553 #define offset frame->Xoffset
554 #define op frame->Xop
555 #define save_capture_last frame->Xsave_capture_last
556 #define save_offset1 frame->Xsave_offset1
557 #define save_offset2 frame->Xsave_offset2
558 #define save_offset3 frame->Xsave_offset3
559 #define stacksave frame->Xstacksave
560
561 #define newptrb frame->Xnewptrb
562
563 /* When recursion is being used, local variables are allocated on the stack and
564 get preserved during recursion in the normal way. In this environment, fi and
565 i, and fc and c, can be the same variables. */
566
567 #else /* NO_RECURSE not defined */
568 #define fi i
569 #define fc c
570
571 /* Many of the following variables are used only in small blocks of the code.
572 My normal style of coding would have declared them within each of those blocks.
573 However, in order to accommodate the version of this code that uses an external
574 "stack" implemented on the heap, it is easier to declare them all here, so the
575 declarations can be cut out in a block. The only declarations within blocks
576 below are for variables that do not have to be preserved over a recursive call
577 to RMATCH(). */
578
579 #ifdef SUPPORT_UTF
580 const pcre_uchar *charptr;
581 #endif
582 const pcre_uchar *callpat;
583 const pcre_uchar *data;
584 const pcre_uchar *next;
585 PCRE_PUCHAR pp;
586 const pcre_uchar *prev;
587 PCRE_PUCHAR saved_eptr;
588
589 recursion_info new_recursive;
590
591 BOOL cur_is_word;
592 BOOL condition;
593 BOOL prev_is_word;
594
595 #ifdef SUPPORT_UCP
596 int prop_type;
597 int prop_value;
598 int prop_fail_result;
599 int oclength;
600 pcre_uchar occhars[6];
601 #endif
602
603 int codelink;
604 int ctype;
605 int length;
606 int max;
607 int min;
608 int number;
609 int offset;
610 int op;
611 int save_capture_last;
612 int save_offset1, save_offset2, save_offset3;
613 int stacksave[REC_STACK_SAVE_MAX];
614
615 eptrblock newptrb;
616 #endif /* NO_RECURSE */
617
618 /* To save space on the stack and in the heap frame, I have doubled up on some
619 of the local variables that are used only in localised parts of the code, but
620 still need to be preserved over recursive calls of match(). These macros define
621 the alternative names that are used. */
622
623 #define allow_zero cur_is_word
624 #define cbegroup condition
625 #define code_offset codelink
626 #define condassert condition
627 #define matched_once prev_is_word
628 #define foc number
629 #define save_mark data
630
631 /* These statements are here to stop the compiler complaining about uninitialized
632 variables. */
633
634 #ifdef SUPPORT_UCP
635 prop_value = 0;
636 prop_fail_result = 0;
637 #endif
638
639
640 /* This label is used for tail recursion, which is used in a few cases even
641 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
642 used. Thanks to Ian Taylor for noticing this possibility and sending the
643 original patch. */
644
645 TAIL_RECURSE:
646
647 /* OK, now we can get on with the real code of the function. Recursive calls
648 are specified by the macro RMATCH and RRETURN is used to return. When
649 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
650 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
651 defined). However, RMATCH isn't like a function call because it's quite a
652 complicated macro. It has to be used in one particular way. This shouldn't,
653 however, impact performance when true recursion is being used. */
654
655 #ifdef SUPPORT_UTF
656 utf = md->utf; /* Local copy of the flag */
657 #else
658 utf = FALSE;
659 #endif
660
661 /* First check that we haven't called match() too many times, or that we
662 haven't exceeded the recursive call limit. */
663
664 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
665 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
666
667 /* At the start of a group with an unlimited repeat that may match an empty
668 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
669 done this way to save having to use another function argument, which would take
670 up space on the stack. See also MATCH_CONDASSERT below.
671
672 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
673 such remembered pointers, to be checked when we hit the closing ket, in order
674 to break infinite loops that match no characters. When match() is called in
675 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
676 NOT be used with tail recursion, because the memory block that is used is on
677 the stack, so a new one may be required for each match(). */
678
679 if (md->match_function_type == MATCH_CBEGROUP)
680 {
681 newptrb.epb_saved_eptr = eptr;
682 newptrb.epb_prev = eptrb;
683 eptrb = &newptrb;
684 md->match_function_type = 0;
685 }
686
687 /* Now start processing the opcodes. */
688
689 for (;;)
690 {
691 minimize = possessive = FALSE;
692 op = *ecode;
693
694 switch(op)
695 {
696 case OP_MARK:
697 md->nomatch_mark = ecode + 2;
698 md->mark = NULL; /* In case previously set by assertion */
699 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
700 eptrb, RM55);
701 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
702 md->mark == NULL) md->mark = ecode + 2;
703
704 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
705 argument, and we must check whether that argument matches this MARK's
706 argument. It is passed back in md->start_match_ptr (an overloading of that
707 variable). If it does match, we reset that variable to the current subject
708 position and return MATCH_SKIP. Otherwise, pass back the return code
709 unaltered. */
710
711 else if (rrc == MATCH_SKIP_ARG &&
712 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
713 {
714 md->start_match_ptr = eptr;
715 RRETURN(MATCH_SKIP);
716 }
717 RRETURN(rrc);
718
719 case OP_FAIL:
720 RRETURN(MATCH_NOMATCH);
721
722 /* COMMIT overrides PRUNE, SKIP, and THEN */
723
724 case OP_COMMIT:
725 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
726 eptrb, RM52);
727 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
728 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
729 rrc != MATCH_THEN)
730 RRETURN(rrc);
731 RRETURN(MATCH_COMMIT);
732
733 /* PRUNE overrides THEN */
734
735 case OP_PRUNE:
736 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
737 eptrb, RM51);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 RRETURN(MATCH_PRUNE);
740
741 case OP_PRUNE_ARG:
742 md->nomatch_mark = ecode + 2;
743 md->mark = NULL; /* In case previously set by assertion */
744 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
745 eptrb, RM56);
746 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
747 md->mark == NULL) md->mark = ecode + 2;
748 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
749 RRETURN(MATCH_PRUNE);
750
751 /* SKIP overrides PRUNE and THEN */
752
753 case OP_SKIP:
754 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
755 eptrb, RM53);
756 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
757 RRETURN(rrc);
758 md->start_match_ptr = eptr; /* Pass back current position */
759 RRETURN(MATCH_SKIP);
760
761 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
762 nomatch_mark. There is a flag that disables this opcode when re-matching a
763 pattern that ended with a SKIP for which there was not a matching MARK. */
764
765 case OP_SKIP_ARG:
766 if (md->ignore_skip_arg)
767 {
768 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
769 break;
770 }
771 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
772 eptrb, RM57);
773 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
774 RRETURN(rrc);
775
776 /* Pass back the current skip name by overloading md->start_match_ptr and
777 returning the special MATCH_SKIP_ARG return code. This will either be
778 caught by a matching MARK, or get to the top, where it causes a rematch
779 with the md->ignore_skip_arg flag set. */
780
781 md->start_match_ptr = ecode + 2;
782 RRETURN(MATCH_SKIP_ARG);
783
784 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
785 the branch in which it occurs can be determined. Overload the start of
786 match pointer to do this. */
787
788 case OP_THEN:
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790 eptrb, RM54);
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 md->start_match_ptr = ecode;
793 RRETURN(MATCH_THEN);
794
795 case OP_THEN_ARG:
796 md->nomatch_mark = ecode + 2;
797 md->mark = NULL; /* In case previously set by assertion */
798 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
799 md, eptrb, RM58);
800 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
801 md->mark == NULL) md->mark = ecode + 2;
802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
803 md->start_match_ptr = ecode;
804 RRETURN(MATCH_THEN);
805
806 /* Handle an atomic group that does not contain any capturing parentheses.
807 This can be handled like an assertion. Prior to 8.13, all atomic groups
808 were handled this way. In 8.13, the code was changed as below for ONCE, so
809 that backups pass through the group and thereby reset captured values.
810 However, this uses a lot more stack, so in 8.20, atomic groups that do not
811 contain any captures generate OP_ONCE_NC, which can be handled in the old,
812 less stack intensive way.
813
814 Check the alternative branches in turn - the matching won't pass the KET
815 for this kind of subpattern. If any one branch matches, we carry on as at
816 the end of a normal bracket, leaving the subject pointer, but resetting
817 the start-of-match value in case it was changed by \K. */
818
819 case OP_ONCE_NC:
820 prev = ecode;
821 saved_eptr = eptr;
822 save_mark = md->mark;
823 do
824 {
825 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
826 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
827 {
828 mstart = md->start_match_ptr;
829 break;
830 }
831 if (rrc == MATCH_THEN)
832 {
833 next = ecode + GET(ecode,1);
834 if (md->start_match_ptr < next &&
835 (*ecode == OP_ALT || *next == OP_ALT))
836 rrc = MATCH_NOMATCH;
837 }
838
839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
840 ecode += GET(ecode,1);
841 md->mark = save_mark;
842 }
843 while (*ecode == OP_ALT);
844
845 /* If hit the end of the group (which could be repeated), fail */
846
847 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
848
849 /* Continue as from after the group, updating the offsets high water
850 mark, since extracts may have been taken. */
851
852 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
853
854 offset_top = md->end_offset_top;
855 eptr = md->end_match_ptr;
856
857 /* For a non-repeating ket, just continue at this level. This also
858 happens for a repeating ket if no characters were matched in the group.
859 This is the forcible breaking of infinite loops as implemented in Perl
860 5.005. */
861
862 if (*ecode == OP_KET || eptr == saved_eptr)
863 {
864 ecode += 1+LINK_SIZE;
865 break;
866 }
867
868 /* The repeating kets try the rest of the pattern or restart from the
869 preceding bracket, in the appropriate order. The second "call" of match()
870 uses tail recursion, to avoid using another stack frame. */
871
872 if (*ecode == OP_KETRMIN)
873 {
874 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
875 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
876 ecode = prev;
877 goto TAIL_RECURSE;
878 }
879 else /* OP_KETRMAX */
880 {
881 md->match_function_type = MATCH_CBEGROUP;
882 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
884 ecode += 1 + LINK_SIZE;
885 goto TAIL_RECURSE;
886 }
887 /* Control never gets here */
888
889 /* Handle a capturing bracket, other than those that are possessive with an
890 unlimited repeat. If there is space in the offset vector, save the current
891 subject position in the working slot at the top of the vector. We mustn't
892 change the current values of the data slot, because they may be set from a
893 previous iteration of this group, and be referred to by a reference inside
894 the group. A failure to match might occur after the group has succeeded,
895 if something later on doesn't match. For this reason, we need to restore
896 the working value and also the values of the final offsets, in case they
897 were set by a previous iteration of the same bracket.
898
899 If there isn't enough space in the offset vector, treat this as if it were
900 a non-capturing bracket. Don't worry about setting the flag for the error
901 case here; that is handled in the code for KET. */
902
903 case OP_CBRA:
904 case OP_SCBRA:
905 number = GET2(ecode, 1+LINK_SIZE);
906 offset = number << 1;
907
908 #ifdef PCRE_DEBUG
909 printf("start bracket %d\n", number);
910 printf("subject=");
911 pchars(eptr, 16, TRUE, md);
912 printf("\n");
913 #endif
914
915 if (offset < md->offset_max)
916 {
917 save_offset1 = md->offset_vector[offset];
918 save_offset2 = md->offset_vector[offset+1];
919 save_offset3 = md->offset_vector[md->offset_end - number];
920 save_capture_last = md->capture_last;
921 save_mark = md->mark;
922
923 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
924 md->offset_vector[md->offset_end - number] =
925 (int)(eptr - md->start_subject);
926
927 for (;;)
928 {
929 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
930 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
931 eptrb, RM1);
932 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
933
934 /* If we backed up to a THEN, check whether it is within the current
935 branch by comparing the address of the THEN that is passed back with
936 the end of the branch. If it is within the current branch, and the
937 branch is one of two or more alternatives (it either starts or ends
938 with OP_ALT), we have reached the limit of THEN's action, so convert
939 the return code to NOMATCH, which will cause normal backtracking to
940 happen from now on. Otherwise, THEN is passed back to an outer
941 alternative. This implements Perl's treatment of parenthesized groups,
942 where a group not containing | does not affect the current alternative,
943 that is, (X) is NOT the same as (X|(*F)). */
944
945 if (rrc == MATCH_THEN)
946 {
947 next = ecode + GET(ecode,1);
948 if (md->start_match_ptr < next &&
949 (*ecode == OP_ALT || *next == OP_ALT))
950 rrc = MATCH_NOMATCH;
951 }
952
953 /* Anything other than NOMATCH is passed back. */
954
955 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
956 md->capture_last = save_capture_last;
957 ecode += GET(ecode, 1);
958 md->mark = save_mark;
959 if (*ecode != OP_ALT) break;
960 }
961
962 DPRINTF(("bracket %d failed\n", number));
963 md->offset_vector[offset] = save_offset1;
964 md->offset_vector[offset+1] = save_offset2;
965 md->offset_vector[md->offset_end - number] = save_offset3;
966
967 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
968
969 RRETURN(rrc);
970 }
971
972 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
973 as a non-capturing bracket. */
974
975 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
976 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
977
978 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
979
980 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
981 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
982
983 /* Non-capturing or atomic group, except for possessive with unlimited
984 repeat and ONCE group with no captures. Loop for all the alternatives.
985
986 When we get to the final alternative within the brackets, we used to return
987 the result of a recursive call to match() whatever happened so it was
988 possible to reduce stack usage by turning this into a tail recursion,
989 except in the case of a possibly empty group. However, now that there is
990 the possibility of (*THEN) occurring in the final alternative, this
991 optimization is no longer always possible.
992
993 We can optimize if we know there are no (*THEN)s in the pattern; at present
994 this is the best that can be done.
995
996 MATCH_ONCE is returned when the end of an atomic group is successfully
997 reached, but subsequent matching fails. It passes back up the tree (causing
998 captured values to be reset) until the original atomic group level is
999 reached. This is tested by comparing md->once_target with the start of the
1000 group. At this point, the return is converted into MATCH_NOMATCH so that
1001 previous backup points can be taken. */
1002
1003 case OP_ONCE:
1004 case OP_BRA:
1005 case OP_SBRA:
1006 DPRINTF(("start non-capturing bracket\n"));
1007
1008 for (;;)
1009 {
1010 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1011
1012 /* If this is not a possibly empty group, and there are no (*THEN)s in
1013 the pattern, and this is the final alternative, optimize as described
1014 above. */
1015
1016 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1017 {
1018 ecode += PRIV(OP_lengths)[*ecode];
1019 goto TAIL_RECURSE;
1020 }
1021
1022 /* In all other cases, we have to make another call to match(). */
1023
1024 save_mark = md->mark;
1025 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1026 RM2);
1027
1028 /* See comment in the code for capturing groups above about handling
1029 THEN. */
1030
1031 if (rrc == MATCH_THEN)
1032 {
1033 next = ecode + GET(ecode,1);
1034 if (md->start_match_ptr < next &&
1035 (*ecode == OP_ALT || *next == OP_ALT))
1036 rrc = MATCH_NOMATCH;
1037 }
1038
1039 if (rrc != MATCH_NOMATCH)
1040 {
1041 if (rrc == MATCH_ONCE)
1042 {
1043 const pcre_uchar *scode = ecode;
1044 if (*scode != OP_ONCE) /* If not at start, find it */
1045 {
1046 while (*scode == OP_ALT) scode += GET(scode, 1);
1047 scode -= GET(scode, 1);
1048 }
1049 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1050 }
1051 RRETURN(rrc);
1052 }
1053 ecode += GET(ecode, 1);
1054 md->mark = save_mark;
1055 if (*ecode != OP_ALT) break;
1056 }
1057
1058 RRETURN(MATCH_NOMATCH);
1059
1060 /* Handle possessive capturing brackets with an unlimited repeat. We come
1061 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1062 handled similarly to the normal case above. However, the matching is
1063 different. The end of these brackets will always be OP_KETRPOS, which
1064 returns MATCH_KETRPOS without going further in the pattern. By this means
1065 we can handle the group by iteration rather than recursion, thereby
1066 reducing the amount of stack needed. */
1067
1068 case OP_CBRAPOS:
1069 case OP_SCBRAPOS:
1070 allow_zero = FALSE;
1071
1072 POSSESSIVE_CAPTURE:
1073 number = GET2(ecode, 1+LINK_SIZE);
1074 offset = number << 1;
1075
1076 #ifdef PCRE_DEBUG
1077 printf("start possessive bracket %d\n", number);
1078 printf("subject=");
1079 pchars(eptr, 16, TRUE, md);
1080 printf("\n");
1081 #endif
1082
1083 if (offset < md->offset_max)
1084 {
1085 matched_once = FALSE;
1086 code_offset = (int)(ecode - md->start_code);
1087
1088 save_offset1 = md->offset_vector[offset];
1089 save_offset2 = md->offset_vector[offset+1];
1090 save_offset3 = md->offset_vector[md->offset_end - number];
1091 save_capture_last = md->capture_last;
1092
1093 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1094
1095 /* Each time round the loop, save the current subject position for use
1096 when the group matches. For MATCH_MATCH, the group has matched, so we
1097 restart it with a new subject starting position, remembering that we had
1098 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1099 usual. If we haven't matched any alternatives in any iteration, check to
1100 see if a previous iteration matched. If so, the group has matched;
1101 continue from afterwards. Otherwise it has failed; restore the previous
1102 capture values before returning NOMATCH. */
1103
1104 for (;;)
1105 {
1106 md->offset_vector[md->offset_end - number] =
1107 (int)(eptr - md->start_subject);
1108 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1109 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1110 eptrb, RM63);
1111 if (rrc == MATCH_KETRPOS)
1112 {
1113 offset_top = md->end_offset_top;
1114 eptr = md->end_match_ptr;
1115 ecode = md->start_code + code_offset;
1116 save_capture_last = md->capture_last;
1117 matched_once = TRUE;
1118 continue;
1119 }
1120
1121 /* See comment in the code for capturing groups above about handling
1122 THEN. */
1123
1124 if (rrc == MATCH_THEN)
1125 {
1126 next = ecode + GET(ecode,1);
1127 if (md->start_match_ptr < next &&
1128 (*ecode == OP_ALT || *next == OP_ALT))
1129 rrc = MATCH_NOMATCH;
1130 }
1131
1132 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1133 md->capture_last = save_capture_last;
1134 ecode += GET(ecode, 1);
1135 if (*ecode != OP_ALT) break;
1136 }
1137
1138 if (!matched_once)
1139 {
1140 md->offset_vector[offset] = save_offset1;
1141 md->offset_vector[offset+1] = save_offset2;
1142 md->offset_vector[md->offset_end - number] = save_offset3;
1143 }
1144
1145 if (allow_zero || matched_once)
1146 {
1147 ecode += 1 + LINK_SIZE;
1148 break;
1149 }
1150
1151 RRETURN(MATCH_NOMATCH);
1152 }
1153
1154 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1155 as a non-capturing bracket. */
1156
1157 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1158 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1159
1160 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1161
1162 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1163 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1164
1165 /* Non-capturing possessive bracket with unlimited repeat. We come here
1166 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1167 without the capturing complication. It is written out separately for speed
1168 and cleanliness. */
1169
1170 case OP_BRAPOS:
1171 case OP_SBRAPOS:
1172 allow_zero = FALSE;
1173
1174 POSSESSIVE_NON_CAPTURE:
1175 matched_once = FALSE;
1176 code_offset = (int)(ecode - md->start_code);
1177
1178 for (;;)
1179 {
1180 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1181 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1182 eptrb, RM48);
1183 if (rrc == MATCH_KETRPOS)
1184 {
1185 offset_top = md->end_offset_top;
1186 eptr = md->end_match_ptr;
1187 ecode = md->start_code + code_offset;
1188 matched_once = TRUE;
1189 continue;
1190 }
1191
1192 /* See comment in the code for capturing groups above about handling
1193 THEN. */
1194
1195 if (rrc == MATCH_THEN)
1196 {
1197 next = ecode + GET(ecode,1);
1198 if (md->start_match_ptr < next &&
1199 (*ecode == OP_ALT || *next == OP_ALT))
1200 rrc = MATCH_NOMATCH;
1201 }
1202
1203 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1204 ecode += GET(ecode, 1);
1205 if (*ecode != OP_ALT) break;
1206 }
1207
1208 if (matched_once || allow_zero)
1209 {
1210 ecode += 1 + LINK_SIZE;
1211 break;
1212 }
1213 RRETURN(MATCH_NOMATCH);
1214
1215 /* Control never reaches here. */
1216
1217 /* Conditional group: compilation checked that there are no more than
1218 two branches. If the condition is false, skipping the first branch takes us
1219 past the end if there is only one branch, but that's OK because that is
1220 exactly what going to the ket would do. */
1221
1222 case OP_COND:
1223 case OP_SCOND:
1224 codelink = GET(ecode, 1);
1225
1226 /* Because of the way auto-callout works during compile, a callout item is
1227 inserted between OP_COND and an assertion condition. */
1228
1229 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1230 {
1231 if (PUBL(callout) != NULL)
1232 {
1233 PUBL(callout_block) cb;
1234 cb.version = 2; /* Version 2 of the callout block */
1235 cb.callout_number = ecode[LINK_SIZE+2];
1236 cb.offset_vector = md->offset_vector;
1237 #ifdef COMPILE_PCRE8
1238 cb.subject = (PCRE_SPTR)md->start_subject;
1239 #else
1240 cb.subject = (PCRE_SPTR16)md->start_subject;
1241 #endif
1242 cb.subject_length = (int)(md->end_subject - md->start_subject);
1243 cb.start_match = (int)(mstart - md->start_subject);
1244 cb.current_position = (int)(eptr - md->start_subject);
1245 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1246 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1247 cb.capture_top = offset_top/2;
1248 cb.capture_last = md->capture_last;
1249 cb.callout_data = md->callout_data;
1250 cb.mark = md->nomatch_mark;
1251 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1252 if (rrc < 0) RRETURN(rrc);
1253 }
1254 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1255 }
1256
1257 condcode = ecode[LINK_SIZE+1];
1258
1259 /* Now see what the actual condition is */
1260
1261 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1262 {
1263 if (md->recursive == NULL) /* Not recursing => FALSE */
1264 {
1265 condition = FALSE;
1266 ecode += GET(ecode, 1);
1267 }
1268 else
1269 {
1270 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1271 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1272
1273 /* If the test is for recursion into a specific subpattern, and it is
1274 false, but the test was set up by name, scan the table to see if the
1275 name refers to any other numbers, and test them. The condition is true
1276 if any one is set. */
1277
1278 if (!condition && condcode == OP_NRREF)
1279 {
1280 pcre_uchar *slotA = md->name_table;
1281 for (i = 0; i < md->name_count; i++)
1282 {
1283 if (GET2(slotA, 0) == recno) break;
1284 slotA += md->name_entry_size;
1285 }
1286
1287 /* Found a name for the number - there can be only one; duplicate
1288 names for different numbers are allowed, but not vice versa. First
1289 scan down for duplicates. */
1290
1291 if (i < md->name_count)
1292 {
1293 pcre_uchar *slotB = slotA;
1294 while (slotB > md->name_table)
1295 {
1296 slotB -= md->name_entry_size;
1297 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1298 {
1299 condition = GET2(slotB, 0) == md->recursive->group_num;
1300 if (condition) break;
1301 }
1302 else break;
1303 }
1304
1305 /* Scan up for duplicates */
1306
1307 if (!condition)
1308 {
1309 slotB = slotA;
1310 for (i++; i < md->name_count; i++)
1311 {
1312 slotB += md->name_entry_size;
1313 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1314 {
1315 condition = GET2(slotB, 0) == md->recursive->group_num;
1316 if (condition) break;
1317 }
1318 else break;
1319 }
1320 }
1321 }
1322 }
1323
1324 /* Choose branch according to the condition */
1325
1326 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1327 }
1328 }
1329
1330 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1331 {
1332 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1333 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1334
1335 /* If the numbered capture is unset, but the reference was by name,
1336 scan the table to see if the name refers to any other numbers, and test
1337 them. The condition is true if any one is set. This is tediously similar
1338 to the code above, but not close enough to try to amalgamate. */
1339
1340 if (!condition && condcode == OP_NCREF)
1341 {
1342 int refno = offset >> 1;
1343 pcre_uchar *slotA = md->name_table;
1344
1345 for (i = 0; i < md->name_count; i++)
1346 {
1347 if (GET2(slotA, 0) == refno) break;
1348 slotA += md->name_entry_size;
1349 }
1350
1351 /* Found a name for the number - there can be only one; duplicate names
1352 for different numbers are allowed, but not vice versa. First scan down
1353 for duplicates. */
1354
1355 if (i < md->name_count)
1356 {
1357 pcre_uchar *slotB = slotA;
1358 while (slotB > md->name_table)
1359 {
1360 slotB -= md->name_entry_size;
1361 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1362 {
1363 offset = GET2(slotB, 0) << 1;
1364 condition = offset < offset_top &&
1365 md->offset_vector[offset] >= 0;
1366 if (condition) break;
1367 }
1368 else break;
1369 }
1370
1371 /* Scan up for duplicates */
1372
1373 if (!condition)
1374 {
1375 slotB = slotA;
1376 for (i++; i < md->name_count; i++)
1377 {
1378 slotB += md->name_entry_size;
1379 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1380 {
1381 offset = GET2(slotB, 0) << 1;
1382 condition = offset < offset_top &&
1383 md->offset_vector[offset] >= 0;
1384 if (condition) break;
1385 }
1386 else break;
1387 }
1388 }
1389 }
1390 }
1391
1392 /* Choose branch according to the condition */
1393
1394 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1395 }
1396
1397 else if (condcode == OP_DEF) /* DEFINE - always false */
1398 {
1399 condition = FALSE;
1400 ecode += GET(ecode, 1);
1401 }
1402
1403 /* The condition is an assertion. Call match() to evaluate it - setting
1404 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1405 an assertion. */
1406
1407 else
1408 {
1409 md->match_function_type = MATCH_CONDASSERT;
1410 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1411 if (rrc == MATCH_MATCH)
1412 {
1413 if (md->end_offset_top > offset_top)
1414 offset_top = md->end_offset_top; /* Captures may have happened */
1415 condition = TRUE;
1416 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1417 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1418 }
1419
1420 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1421 assertion; it is therefore treated as NOMATCH. */
1422
1423 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1424 {
1425 RRETURN(rrc); /* Need braces because of following else */
1426 }
1427 else
1428 {
1429 condition = FALSE;
1430 ecode += codelink;
1431 }
1432 }
1433
1434 /* We are now at the branch that is to be obeyed. As there is only one, can
1435 use tail recursion to avoid using another stack frame, except when there is
1436 unlimited repeat of a possibly empty group. In the latter case, a recursive
1437 call to match() is always required, unless the second alternative doesn't
1438 exist, in which case we can just plough on. Note that, for compatibility
1439 with Perl, the | in a conditional group is NOT treated as creating two
1440 alternatives. If a THEN is encountered in the branch, it propagates out to
1441 the enclosing alternative (unless nested in a deeper set of alternatives,
1442 of course). */
1443
1444 if (condition || *ecode == OP_ALT)
1445 {
1446 if (op != OP_SCOND)
1447 {
1448 ecode += 1 + LINK_SIZE;
1449 goto TAIL_RECURSE;
1450 }
1451
1452 md->match_function_type = MATCH_CBEGROUP;
1453 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1454 RRETURN(rrc);
1455 }
1456
1457 /* Condition false & no alternative; continue after the group. */
1458
1459 else
1460 {
1461 ecode += 1 + LINK_SIZE;
1462 }
1463 break;
1464
1465
1466 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1467 to close any currently open capturing brackets. */
1468
1469 case OP_CLOSE:
1470 number = GET2(ecode, 1);
1471 offset = number << 1;
1472
1473 #ifdef PCRE_DEBUG
1474 printf("end bracket %d at *ACCEPT", number);
1475 printf("\n");
1476 #endif
1477
1478 md->capture_last = number;
1479 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1480 {
1481 md->offset_vector[offset] =
1482 md->offset_vector[md->offset_end - number];
1483 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1484 if (offset_top <= offset) offset_top = offset + 2;
1485 }
1486 ecode += 1 + IMM2_SIZE;
1487 break;
1488
1489
1490 /* End of the pattern, either real or forced. */
1491
1492 case OP_END:
1493 case OP_ACCEPT:
1494 case OP_ASSERT_ACCEPT:
1495
1496 /* If we have matched an empty string, fail if not in an assertion and not
1497 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1498 is set and we have matched at the start of the subject. In both cases,
1499 backtracking will then try other alternatives, if any. */
1500
1501 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1502 md->recursive == NULL &&
1503 (md->notempty ||
1504 (md->notempty_atstart &&
1505 mstart == md->start_subject + md->start_offset)))
1506 RRETURN(MATCH_NOMATCH);
1507
1508 /* Otherwise, we have a match. */
1509
1510 md->end_match_ptr = eptr; /* Record where we ended */
1511 md->end_offset_top = offset_top; /* and how many extracts were taken */
1512 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1513
1514 /* For some reason, the macros don't work properly if an expression is
1515 given as the argument to RRETURN when the heap is in use. */
1516
1517 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1518 RRETURN(rrc);
1519
1520 /* Assertion brackets. Check the alternative branches in turn - the
1521 matching won't pass the KET for an assertion. If any one branch matches,
1522 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1523 start of each branch to move the current point backwards, so the code at
1524 this level is identical to the lookahead case. When the assertion is part
1525 of a condition, we want to return immediately afterwards. The caller of
1526 this incarnation of the match() function will have set MATCH_CONDASSERT in
1527 md->match_function_type, and one of these opcodes will be the first opcode
1528 that is processed. We use a local variable that is preserved over calls to
1529 match() to remember this case. */
1530
1531 case OP_ASSERT:
1532 case OP_ASSERTBACK:
1533 if (md->match_function_type == MATCH_CONDASSERT)
1534 {
1535 condassert = TRUE;
1536 md->match_function_type = 0;
1537 }
1538 else condassert = FALSE;
1539
1540 do
1541 {
1542 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1543 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1544 {
1545 mstart = md->start_match_ptr; /* In case \K reset it */
1546 break;
1547 }
1548
1549 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1550 as NOMATCH. */
1551
1552 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1553 ecode += GET(ecode, 1);
1554 }
1555 while (*ecode == OP_ALT);
1556
1557 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1558
1559 /* If checking an assertion for a condition, return MATCH_MATCH. */
1560
1561 if (condassert) RRETURN(MATCH_MATCH);
1562
1563 /* Continue from after the assertion, updating the offsets high water
1564 mark, since extracts may have been taken during the assertion. */
1565
1566 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1567 ecode += 1 + LINK_SIZE;
1568 offset_top = md->end_offset_top;
1569 continue;
1570
1571 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1572 PRUNE, or COMMIT means we must assume failure without checking subsequent
1573 branches. */
1574
1575 case OP_ASSERT_NOT:
1576 case OP_ASSERTBACK_NOT:
1577 if (md->match_function_type == MATCH_CONDASSERT)
1578 {
1579 condassert = TRUE;
1580 md->match_function_type = 0;
1581 }
1582 else condassert = FALSE;
1583
1584 do
1585 {
1586 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1587 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1588 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1589 {
1590 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1591 break;
1592 }
1593
1594 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1595 as NOMATCH. */
1596
1597 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1598 ecode += GET(ecode,1);
1599 }
1600 while (*ecode == OP_ALT);
1601
1602 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1603
1604 ecode += 1 + LINK_SIZE;
1605 continue;
1606
1607 /* Move the subject pointer back. This occurs only at the start of
1608 each branch of a lookbehind assertion. If we are too close to the start to
1609 move back, this match function fails. When working with UTF-8 we move
1610 back a number of characters, not bytes. */
1611
1612 case OP_REVERSE:
1613 #ifdef SUPPORT_UTF
1614 if (utf)
1615 {
1616 i = GET(ecode, 1);
1617 while (i-- > 0)
1618 {
1619 eptr--;
1620 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1621 BACKCHAR(eptr);
1622 }
1623 }
1624 else
1625 #endif
1626
1627 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1628
1629 {
1630 eptr -= GET(ecode, 1);
1631 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1632 }
1633
1634 /* Save the earliest consulted character, then skip to next op code */
1635
1636 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1637 ecode += 1 + LINK_SIZE;
1638 break;
1639
1640 /* The callout item calls an external function, if one is provided, passing
1641 details of the match so far. This is mainly for debugging, though the
1642 function is able to force a failure. */
1643
1644 case OP_CALLOUT:
1645 if (PUBL(callout) != NULL)
1646 {
1647 PUBL(callout_block) cb;
1648 cb.version = 2; /* Version 2 of the callout block */
1649 cb.callout_number = ecode[1];
1650 cb.offset_vector = md->offset_vector;
1651 #ifdef COMPILE_PCRE8
1652 cb.subject = (PCRE_SPTR)md->start_subject;
1653 #else
1654 cb.subject = (PCRE_SPTR16)md->start_subject;
1655 #endif
1656 cb.subject_length = (int)(md->end_subject - md->start_subject);
1657 cb.start_match = (int)(mstart - md->start_subject);
1658 cb.current_position = (int)(eptr - md->start_subject);
1659 cb.pattern_position = GET(ecode, 2);
1660 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1661 cb.capture_top = offset_top/2;
1662 cb.capture_last = md->capture_last;
1663 cb.callout_data = md->callout_data;
1664 cb.mark = md->nomatch_mark;
1665 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1666 if (rrc < 0) RRETURN(rrc);
1667 }
1668 ecode += 2 + 2*LINK_SIZE;
1669 break;
1670
1671 /* Recursion either matches the current regex, or some subexpression. The
1672 offset data is the offset to the starting bracket from the start of the
1673 whole pattern. (This is so that it works from duplicated subpatterns.)
1674
1675 The state of the capturing groups is preserved over recursion, and
1676 re-instated afterwards. We don't know how many are started and not yet
1677 finished (offset_top records the completed total) so we just have to save
1678 all the potential data. There may be up to 65535 such values, which is too
1679 large to put on the stack, but using malloc for small numbers seems
1680 expensive. As a compromise, the stack is used when there are no more than
1681 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1682
1683 There are also other values that have to be saved. We use a chained
1684 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1685 for the original version of this logic. It has, however, been hacked around
1686 a lot, so he is not to blame for the current way it works. */
1687
1688 case OP_RECURSE:
1689 {
1690 recursion_info *ri;
1691 int recno;
1692
1693 callpat = md->start_code + GET(ecode, 1);
1694 recno = (callpat == md->start_code)? 0 :
1695 GET2(callpat, 1 + LINK_SIZE);
1696
1697 /* Check for repeating a recursion without advancing the subject pointer.
1698 This should catch convoluted mutual recursions. (Some simple cases are
1699 caught at compile time.) */
1700
1701 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1702 if (recno == ri->group_num && eptr == ri->subject_position)
1703 RRETURN(PCRE_ERROR_RECURSELOOP);
1704
1705 /* Add to "recursing stack" */
1706
1707 new_recursive.group_num = recno;
1708 new_recursive.subject_position = eptr;
1709 new_recursive.prevrec = md->recursive;
1710 md->recursive = &new_recursive;
1711
1712 /* Where to continue from afterwards */
1713
1714 ecode += 1 + LINK_SIZE;
1715
1716 /* Now save the offset data */
1717
1718 new_recursive.saved_max = md->offset_end;
1719 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1720 new_recursive.offset_save = stacksave;
1721 else
1722 {
1723 new_recursive.offset_save =
1724 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1725 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1726 }
1727 memcpy(new_recursive.offset_save, md->offset_vector,
1728 new_recursive.saved_max * sizeof(int));
1729
1730 /* OK, now we can do the recursion. After processing each alternative,
1731 restore the offset data. If there were nested recursions, md->recursive
1732 might be changed, so reset it before looping. */
1733
1734 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1735 cbegroup = (*callpat >= OP_SBRA);
1736 do
1737 {
1738 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1739 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1740 md, eptrb, RM6);
1741 memcpy(md->offset_vector, new_recursive.offset_save,
1742 new_recursive.saved_max * sizeof(int));
1743 md->recursive = new_recursive.prevrec;
1744 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1745 {
1746 DPRINTF(("Recursion matched\n"));
1747 if (new_recursive.offset_save != stacksave)
1748 (PUBL(free))(new_recursive.offset_save);
1749
1750 /* Set where we got to in the subject, and reset the start in case
1751 it was changed by \K. This *is* propagated back out of a recursion,
1752 for Perl compatibility. */
1753
1754 eptr = md->end_match_ptr;
1755 mstart = md->start_match_ptr;
1756 goto RECURSION_MATCHED; /* Exit loop; end processing */
1757 }
1758
1759 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1760 as NOMATCH. */
1761
1762 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1763 {
1764 DPRINTF(("Recursion gave error %d\n", rrc));
1765 if (new_recursive.offset_save != stacksave)
1766 (PUBL(free))(new_recursive.offset_save);
1767 RRETURN(rrc);
1768 }
1769
1770 md->recursive = &new_recursive;
1771 callpat += GET(callpat, 1);
1772 }
1773 while (*callpat == OP_ALT);
1774
1775 DPRINTF(("Recursion didn't match\n"));
1776 md->recursive = new_recursive.prevrec;
1777 if (new_recursive.offset_save != stacksave)
1778 (PUBL(free))(new_recursive.offset_save);
1779 RRETURN(MATCH_NOMATCH);
1780 }
1781
1782 RECURSION_MATCHED:
1783 break;
1784
1785 /* An alternation is the end of a branch; scan along to find the end of the
1786 bracketed group and go to there. */
1787
1788 case OP_ALT:
1789 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1790 break;
1791
1792 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1793 indicating that it may occur zero times. It may repeat infinitely, or not
1794 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1795 with fixed upper repeat limits are compiled as a number of copies, with the
1796 optional ones preceded by BRAZERO or BRAMINZERO. */
1797
1798 case OP_BRAZERO:
1799 next = ecode + 1;
1800 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1802 do next += GET(next, 1); while (*next == OP_ALT);
1803 ecode = next + 1 + LINK_SIZE;
1804 break;
1805
1806 case OP_BRAMINZERO:
1807 next = ecode + 1;
1808 do next += GET(next, 1); while (*next == OP_ALT);
1809 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1811 ecode++;
1812 break;
1813
1814 case OP_SKIPZERO:
1815 next = ecode+1;
1816 do next += GET(next,1); while (*next == OP_ALT);
1817 ecode = next + 1 + LINK_SIZE;
1818 break;
1819
1820 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1821 here; just jump to the group, with allow_zero set TRUE. */
1822
1823 case OP_BRAPOSZERO:
1824 op = *(++ecode);
1825 allow_zero = TRUE;
1826 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1827 goto POSSESSIVE_NON_CAPTURE;
1828
1829 /* End of a group, repeated or non-repeating. */
1830
1831 case OP_KET:
1832 case OP_KETRMIN:
1833 case OP_KETRMAX:
1834 case OP_KETRPOS:
1835 prev = ecode - GET(ecode, 1);
1836
1837 /* If this was a group that remembered the subject start, in order to break
1838 infinite repeats of empty string matches, retrieve the subject start from
1839 the chain. Otherwise, set it NULL. */
1840
1841 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1842 {
1843 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1844 eptrb = eptrb->epb_prev; /* Backup to previous group */
1845 }
1846 else saved_eptr = NULL;
1847
1848 /* If we are at the end of an assertion group or a non-capturing atomic
1849 group, stop matching and return MATCH_MATCH, but record the current high
1850 water mark for use by positive assertions. We also need to record the match
1851 start in case it was changed by \K. */
1852
1853 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1854 *prev == OP_ONCE_NC)
1855 {
1856 md->end_match_ptr = eptr; /* For ONCE_NC */
1857 md->end_offset_top = offset_top;
1858 md->start_match_ptr = mstart;
1859 RRETURN(MATCH_MATCH); /* Sets md->mark */
1860 }
1861
1862 /* For capturing groups we have to check the group number back at the start
1863 and if necessary complete handling an extraction by setting the offsets and
1864 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1865 into group 0, so it won't be picked up here. Instead, we catch it when the
1866 OP_END is reached. Other recursion is handled here. We just have to record
1867 the current subject position and start match pointer and give a MATCH
1868 return. */
1869
1870 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1871 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1872 {
1873 number = GET2(prev, 1+LINK_SIZE);
1874 offset = number << 1;
1875
1876 #ifdef PCRE_DEBUG
1877 printf("end bracket %d", number);
1878 printf("\n");
1879 #endif
1880
1881 /* Handle a recursively called group. */
1882
1883 if (md->recursive != NULL && md->recursive->group_num == number)
1884 {
1885 md->end_match_ptr = eptr;
1886 md->start_match_ptr = mstart;
1887 RRETURN(MATCH_MATCH);
1888 }
1889
1890 /* Deal with capturing */
1891
1892 md->capture_last = number;
1893 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1894 {
1895 /* If offset is greater than offset_top, it means that we are
1896 "skipping" a capturing group, and that group's offsets must be marked
1897 unset. In earlier versions of PCRE, all the offsets were unset at the
1898 start of matching, but this doesn't work because atomic groups and
1899 assertions can cause a value to be set that should later be unset.
1900 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1901 part of the atomic group, but this is not on the final matching path,
1902 so must be unset when 2 is set. (If there is no group 2, there is no
1903 problem, because offset_top will then be 2, indicating no capture.) */
1904
1905 if (offset > offset_top)
1906 {
1907 register int *iptr = md->offset_vector + offset_top;
1908 register int *iend = md->offset_vector + offset;
1909 while (iptr < iend) *iptr++ = -1;
1910 }
1911
1912 /* Now make the extraction */
1913
1914 md->offset_vector[offset] =
1915 md->offset_vector[md->offset_end - number];
1916 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1917 if (offset_top <= offset) offset_top = offset + 2;
1918 }
1919 }
1920
1921 /* For an ordinary non-repeating ket, just continue at this level. This
1922 also happens for a repeating ket if no characters were matched in the
1923 group. This is the forcible breaking of infinite loops as implemented in
1924 Perl 5.005. For a non-repeating atomic group that includes captures,
1925 establish a backup point by processing the rest of the pattern at a lower
1926 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1927 original OP_ONCE level, thereby bypassing intermediate backup points, but
1928 resetting any captures that happened along the way. */
1929
1930 if (*ecode == OP_KET || eptr == saved_eptr)
1931 {
1932 if (*prev == OP_ONCE)
1933 {
1934 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1936 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1937 RRETURN(MATCH_ONCE);
1938 }
1939 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1940 break;
1941 }
1942
1943 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1944 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1945 at a time from the outer level, thus saving stack. */
1946
1947 if (*ecode == OP_KETRPOS)
1948 {
1949 md->end_match_ptr = eptr;
1950 md->end_offset_top = offset_top;
1951 RRETURN(MATCH_KETRPOS);
1952 }
1953
1954 /* The normal repeating kets try the rest of the pattern or restart from
1955 the preceding bracket, in the appropriate order. In the second case, we can
1956 use tail recursion to avoid using another stack frame, unless we have an
1957 atomic group or an unlimited repeat of a group that can match an empty
1958 string. */
1959
1960 if (*ecode == OP_KETRMIN)
1961 {
1962 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1963 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1964 if (*prev == OP_ONCE)
1965 {
1966 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1969 RRETURN(MATCH_ONCE);
1970 }
1971 if (*prev >= OP_SBRA) /* Could match an empty string */
1972 {
1973 md->match_function_type = MATCH_CBEGROUP;
1974 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1975 RRETURN(rrc);
1976 }
1977 ecode = prev;
1978 goto TAIL_RECURSE;
1979 }
1980 else /* OP_KETRMAX */
1981 {
1982 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1983 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1984 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1985 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1986 if (*prev == OP_ONCE)
1987 {
1988 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1989 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1990 md->once_target = prev;
1991 RRETURN(MATCH_ONCE);
1992 }
1993 ecode += 1 + LINK_SIZE;
1994 goto TAIL_RECURSE;
1995 }
1996 /* Control never gets here */
1997
1998 /* Not multiline mode: start of subject assertion, unless notbol. */
1999
2000 case OP_CIRC:
2001 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2002
2003 /* Start of subject assertion */
2004
2005 case OP_SOD:
2006 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2007 ecode++;
2008 break;
2009
2010 /* Multiline mode: start of subject unless notbol, or after any newline. */
2011
2012 case OP_CIRCM:
2013 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2014 if (eptr != md->start_subject &&
2015 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2016 RRETURN(MATCH_NOMATCH);
2017 ecode++;
2018 break;
2019
2020 /* Start of match assertion */
2021
2022 case OP_SOM:
2023 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2024 ecode++;
2025 break;
2026
2027 /* Reset the start of match point */
2028
2029 case OP_SET_SOM:
2030 mstart = eptr;
2031 ecode++;
2032 break;
2033
2034 /* Multiline mode: assert before any newline, or before end of subject
2035 unless noteol is set. */
2036
2037 case OP_DOLLM:
2038 if (eptr < md->end_subject)
2039 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2040 else
2041 {
2042 if (md->noteol) RRETURN(MATCH_NOMATCH);
2043 SCHECK_PARTIAL();
2044 }
2045 ecode++;
2046 break;
2047
2048 /* Not multiline mode: assert before a terminating newline or before end of
2049 subject unless noteol is set. */
2050
2051 case OP_DOLL:
2052 if (md->noteol) RRETURN(MATCH_NOMATCH);
2053 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2054
2055 /* ... else fall through for endonly */
2056
2057 /* End of subject assertion (\z) */
2058
2059 case OP_EOD:
2060 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2061 SCHECK_PARTIAL();
2062 ecode++;
2063 break;
2064
2065 /* End of subject or ending \n assertion (\Z) */
2066
2067 case OP_EODN:
2068 ASSERT_NL_OR_EOS:
2069 if (eptr < md->end_subject &&
2070 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2071 RRETURN(MATCH_NOMATCH);
2072
2073 /* Either at end of string or \n before end. */
2074
2075 SCHECK_PARTIAL();
2076 ecode++;
2077 break;
2078
2079 /* Word boundary assertions */
2080
2081 case OP_NOT_WORD_BOUNDARY:
2082 case OP_WORD_BOUNDARY:
2083 {
2084
2085 /* Find out if the previous and current characters are "word" characters.
2086 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2087 be "non-word" characters. Remember the earliest consulted character for
2088 partial matching. */
2089
2090 #ifdef SUPPORT_UTF
2091 if (utf)
2092 {
2093 /* Get status of previous character */
2094
2095 if (eptr == md->start_subject) prev_is_word = FALSE; else
2096 {
2097 PCRE_PUCHAR lastptr = eptr - 1;
2098 BACKCHAR(lastptr);
2099 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2100 GETCHAR(c, lastptr);
2101 #ifdef SUPPORT_UCP
2102 if (md->use_ucp)
2103 {
2104 if (c == '_') prev_is_word = TRUE; else
2105 {
2106 int cat = UCD_CATEGORY(c);
2107 prev_is_word = (cat == ucp_L || cat == ucp_N);
2108 }
2109 }
2110 else
2111 #endif
2112 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2113 }
2114
2115 /* Get status of next character */
2116
2117 if (eptr >= md->end_subject)
2118 {
2119 SCHECK_PARTIAL();
2120 cur_is_word = FALSE;
2121 }
2122 else
2123 {
2124 GETCHAR(c, eptr);
2125 #ifdef SUPPORT_UCP
2126 if (md->use_ucp)
2127 {
2128 if (c == '_') cur_is_word = TRUE; else
2129 {
2130 int cat = UCD_CATEGORY(c);
2131 cur_is_word = (cat == ucp_L || cat == ucp_N);
2132 }
2133 }
2134 else
2135 #endif
2136 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2137 }
2138 }
2139 else
2140 #endif
2141
2142 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2143 consistency with the behaviour of \w we do use it in this case. */
2144
2145 {
2146 /* Get status of previous character */
2147
2148 if (eptr == md->start_subject) prev_is_word = FALSE; else
2149 {
2150 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2151 #ifdef SUPPORT_UCP
2152 if (md->use_ucp)
2153 {
2154 c = eptr[-1];
2155 if (c == '_') prev_is_word = TRUE; else
2156 {
2157 int cat = UCD_CATEGORY(c);
2158 prev_is_word = (cat == ucp_L || cat == ucp_N);
2159 }
2160 }
2161 else
2162 #endif
2163 prev_is_word = MAX_255(eptr[-1])
2164 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2165 }
2166
2167 /* Get status of next character */
2168
2169 if (eptr >= md->end_subject)
2170 {
2171 SCHECK_PARTIAL();
2172 cur_is_word = FALSE;
2173 }
2174 else
2175 #ifdef SUPPORT_UCP
2176 if (md->use_ucp)
2177 {
2178 c = *eptr;
2179 if (c == '_') cur_is_word = TRUE; else
2180 {
2181 int cat = UCD_CATEGORY(c);
2182 cur_is_word = (cat == ucp_L || cat == ucp_N);
2183 }
2184 }
2185 else
2186 #endif
2187 cur_is_word = MAX_255(*eptr)
2188 && ((md->ctypes[*eptr] & ctype_word) != 0);
2189 }
2190
2191 /* Now see if the situation is what we want */
2192
2193 if ((*ecode++ == OP_WORD_BOUNDARY)?
2194 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2195 RRETURN(MATCH_NOMATCH);
2196 }
2197 break;
2198
2199 /* Match a single character type; inline for speed */
2200
2201 case OP_ANY:
2202 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2203 /* Fall through */
2204
2205 case OP_ALLANY:
2206 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2207 { /* not be updated before SCHECK_PARTIAL. */
2208 SCHECK_PARTIAL();
2209 RRETURN(MATCH_NOMATCH);
2210 }
2211 eptr++;
2212 #ifdef SUPPORT_UTF
2213 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2214 #endif
2215 ecode++;
2216 break;
2217
2218 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2219 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2220
2221 case OP_ANYBYTE:
2222 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2223 { /* not be updated before SCHECK_PARTIAL. */
2224 SCHECK_PARTIAL();
2225 RRETURN(MATCH_NOMATCH);
2226 }
2227 eptr++;
2228 ecode++;
2229 break;
2230
2231 case OP_NOT_DIGIT:
2232 if (eptr >= md->end_subject)
2233 {
2234 SCHECK_PARTIAL();
2235 RRETURN(MATCH_NOMATCH);
2236 }
2237 GETCHARINCTEST(c, eptr);
2238 if (
2239 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2240 c < 256 &&
2241 #endif
2242 (md->ctypes[c] & ctype_digit) != 0
2243 )
2244 RRETURN(MATCH_NOMATCH);
2245 ecode++;
2246 break;
2247
2248 case OP_DIGIT:
2249 if (eptr >= md->end_subject)
2250 {
2251 SCHECK_PARTIAL();
2252 RRETURN(MATCH_NOMATCH);
2253 }
2254 GETCHARINCTEST(c, eptr);
2255 if (
2256 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2257 c > 255 ||
2258 #endif
2259 (md->ctypes[c] & ctype_digit) == 0
2260 )
2261 RRETURN(MATCH_NOMATCH);
2262 ecode++;
2263 break;
2264
2265 case OP_NOT_WHITESPACE:
2266 if (eptr >= md->end_subject)
2267 {
2268 SCHECK_PARTIAL();
2269 RRETURN(MATCH_NOMATCH);
2270 }
2271 GETCHARINCTEST(c, eptr);
2272 if (
2273 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2274 c < 256 &&
2275 #endif
2276 (md->ctypes[c] & ctype_space) != 0
2277 )
2278 RRETURN(MATCH_NOMATCH);
2279 ecode++;
2280 break;
2281
2282 case OP_WHITESPACE:
2283 if (eptr >= md->end_subject)
2284 {
2285 SCHECK_PARTIAL();
2286 RRETURN(MATCH_NOMATCH);
2287 }
2288 GETCHARINCTEST(c, eptr);
2289 if (
2290 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2291 c > 255 ||
2292 #endif
2293 (md->ctypes[c] & ctype_space) == 0
2294 )
2295 RRETURN(MATCH_NOMATCH);
2296 ecode++;
2297 break;
2298
2299 case OP_NOT_WORDCHAR:
2300 if (eptr >= md->end_subject)
2301 {
2302 SCHECK_PARTIAL();
2303 RRETURN(MATCH_NOMATCH);
2304 }
2305 GETCHARINCTEST(c, eptr);
2306 if (
2307 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2308 c < 256 &&
2309 #endif
2310 (md->ctypes[c] & ctype_word) != 0
2311 )
2312 RRETURN(MATCH_NOMATCH);
2313 ecode++;
2314 break;
2315
2316 case OP_WORDCHAR:
2317 if (eptr >= md->end_subject)
2318 {
2319 SCHECK_PARTIAL();
2320 RRETURN(MATCH_NOMATCH);
2321 }
2322 GETCHARINCTEST(c, eptr);
2323 if (
2324 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2325 c > 255 ||
2326 #endif
2327 (md->ctypes[c] & ctype_word) == 0
2328 )
2329 RRETURN(MATCH_NOMATCH);
2330 ecode++;
2331 break;
2332
2333 case OP_ANYNL:
2334 if (eptr >= md->end_subject)
2335 {
2336 SCHECK_PARTIAL();
2337 RRETURN(MATCH_NOMATCH);
2338 }
2339 GETCHARINCTEST(c, eptr);
2340 switch(c)
2341 {
2342 default: RRETURN(MATCH_NOMATCH);
2343
2344 case 0x000d:
2345 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2346 break;
2347
2348 case 0x000a:
2349 break;
2350
2351 case 0x000b:
2352 case 0x000c:
2353 case 0x0085:
2354 case 0x2028:
2355 case 0x2029:
2356 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2357 break;
2358 }
2359 ecode++;
2360 break;
2361
2362 case OP_NOT_HSPACE:
2363 if (eptr >= md->end_subject)
2364 {
2365 SCHECK_PARTIAL();
2366 RRETURN(MATCH_NOMATCH);
2367 }
2368 GETCHARINCTEST(c, eptr);
2369 switch(c)
2370 {
2371 default: break;
2372 case 0x09: /* HT */
2373 case 0x20: /* SPACE */
2374 case 0xa0: /* NBSP */
2375 case 0x1680: /* OGHAM SPACE MARK */
2376 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2377 case 0x2000: /* EN QUAD */
2378 case 0x2001: /* EM QUAD */
2379 case 0x2002: /* EN SPACE */
2380 case 0x2003: /* EM SPACE */
2381 case 0x2004: /* THREE-PER-EM SPACE */
2382 case 0x2005: /* FOUR-PER-EM SPACE */
2383 case 0x2006: /* SIX-PER-EM SPACE */
2384 case 0x2007: /* FIGURE SPACE */
2385 case 0x2008: /* PUNCTUATION SPACE */
2386 case 0x2009: /* THIN SPACE */
2387 case 0x200A: /* HAIR SPACE */
2388 case 0x202f: /* NARROW NO-BREAK SPACE */
2389 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2390 case 0x3000: /* IDEOGRAPHIC SPACE */
2391 RRETURN(MATCH_NOMATCH);
2392 }
2393 ecode++;
2394 break;
2395
2396 case OP_HSPACE:
2397 if (eptr >= md->end_subject)
2398 {
2399 SCHECK_PARTIAL();
2400 RRETURN(MATCH_NOMATCH);
2401 }
2402 GETCHARINCTEST(c, eptr);
2403 switch(c)
2404 {
2405 default: RRETURN(MATCH_NOMATCH);
2406 case 0x09: /* HT */
2407 case 0x20: /* SPACE */
2408 case 0xa0: /* NBSP */
2409 case 0x1680: /* OGHAM SPACE MARK */
2410 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2411 case 0x2000: /* EN QUAD */
2412 case 0x2001: /* EM QUAD */
2413 case 0x2002: /* EN SPACE */
2414 case 0x2003: /* EM SPACE */
2415 case 0x2004: /* THREE-PER-EM SPACE */
2416 case 0x2005: /* FOUR-PER-EM SPACE */
2417 case 0x2006: /* SIX-PER-EM SPACE */
2418 case 0x2007: /* FIGURE SPACE */
2419 case 0x2008: /* PUNCTUATION SPACE */
2420 case 0x2009: /* THIN SPACE */
2421 case 0x200A: /* HAIR SPACE */
2422 case 0x202f: /* NARROW NO-BREAK SPACE */
2423 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2424 case 0x3000: /* IDEOGRAPHIC SPACE */
2425 break;
2426 }
2427 ecode++;
2428 break;
2429
2430 case OP_NOT_VSPACE:
2431 if (eptr >= md->end_subject)
2432 {
2433 SCHECK_PARTIAL();
2434 RRETURN(MATCH_NOMATCH);
2435 }
2436 GETCHARINCTEST(c, eptr);
2437 switch(c)
2438 {
2439 default: break;
2440 case 0x0a: /* LF */
2441 case 0x0b: /* VT */
2442 case 0x0c: /* FF */
2443 case 0x0d: /* CR */
2444 case 0x85: /* NEL */
2445 case 0x2028: /* LINE SEPARATOR */
2446 case 0x2029: /* PARAGRAPH SEPARATOR */
2447 RRETURN(MATCH_NOMATCH);
2448 }
2449 ecode++;
2450 break;
2451
2452 case OP_VSPACE:
2453 if (eptr >= md->end_subject)
2454 {
2455 SCHECK_PARTIAL();
2456 RRETURN(MATCH_NOMATCH);
2457 }
2458 GETCHARINCTEST(c, eptr);
2459 switch(c)
2460 {
2461 default: RRETURN(MATCH_NOMATCH);
2462 case 0x0a: /* LF */
2463 case 0x0b: /* VT */
2464 case 0x0c: /* FF */
2465 case 0x0d: /* CR */
2466 case 0x85: /* NEL */
2467 case 0x2028: /* LINE SEPARATOR */
2468 case 0x2029: /* PARAGRAPH SEPARATOR */
2469 break;
2470 }
2471 ecode++;
2472 break;
2473
2474 #ifdef SUPPORT_UCP
2475 /* Check the next character by Unicode property. We will get here only
2476 if the support is in the binary; otherwise a compile-time error occurs. */
2477
2478 case OP_PROP:
2479 case OP_NOTPROP:
2480 if (eptr >= md->end_subject)
2481 {
2482 SCHECK_PARTIAL();
2483 RRETURN(MATCH_NOMATCH);
2484 }
2485 GETCHARINCTEST(c, eptr);
2486 {
2487 const ucd_record *prop = GET_UCD(c);
2488
2489 switch(ecode[1])
2490 {
2491 case PT_ANY:
2492 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2493 break;
2494
2495 case PT_LAMP:
2496 if ((prop->chartype == ucp_Lu ||
2497 prop->chartype == ucp_Ll ||
2498 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2499 RRETURN(MATCH_NOMATCH);
2500 break;
2501
2502 case PT_GC:
2503 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2504 RRETURN(MATCH_NOMATCH);
2505 break;
2506
2507 case PT_PC:
2508 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2509 RRETURN(MATCH_NOMATCH);
2510 break;
2511
2512 case PT_SC:
2513 if ((ecode[2] != prop->script) == (op == OP_PROP))
2514 RRETURN(MATCH_NOMATCH);
2515 break;
2516
2517 /* These are specials */
2518
2519 case PT_ALNUM:
2520 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2521 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2522 RRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 case PT_SPACE: /* Perl space */
2526 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2527 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2528 == (op == OP_NOTPROP))
2529 RRETURN(MATCH_NOMATCH);
2530 break;
2531
2532 case PT_PXSPACE: /* POSIX space */
2533 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2534 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2535 c == CHAR_FF || c == CHAR_CR)
2536 == (op == OP_NOTPROP))
2537 RRETURN(MATCH_NOMATCH);
2538 break;
2539
2540 case PT_WORD:
2541 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2542 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2543 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2544 RRETURN(MATCH_NOMATCH);
2545 break;
2546
2547 /* This should never occur */
2548
2549 default:
2550 RRETURN(PCRE_ERROR_INTERNAL);
2551 }
2552
2553 ecode += 3;
2554 }
2555 break;
2556
2557 /* Match an extended Unicode sequence. We will get here only if the support
2558 is in the binary; otherwise a compile-time error occurs. */
2559
2560 case OP_EXTUNI:
2561 if (eptr >= md->end_subject)
2562 {
2563 SCHECK_PARTIAL();
2564 RRETURN(MATCH_NOMATCH);
2565 }
2566 GETCHARINCTEST(c, eptr);
2567 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2568 while (eptr < md->end_subject)
2569 {
2570 int len = 1;
2571 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2572 if (UCD_CATEGORY(c) != ucp_M) break;
2573 eptr += len;
2574 }
2575 ecode++;
2576 break;
2577 #endif
2578
2579
2580 /* Match a back reference, possibly repeatedly. Look past the end of the
2581 item to see if there is repeat information following. The code is similar
2582 to that for character classes, but repeated for efficiency. Then obey
2583 similar code to character type repeats - written out again for speed.
2584 However, if the referenced string is the empty string, always treat
2585 it as matched, any number of times (otherwise there could be infinite
2586 loops). */
2587
2588 case OP_REF:
2589 case OP_REFI:
2590 caseless = op == OP_REFI;
2591 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2592 ecode += 1 + IMM2_SIZE;
2593
2594 /* If the reference is unset, there are two possibilities:
2595
2596 (a) In the default, Perl-compatible state, set the length negative;
2597 this ensures that every attempt at a match fails. We can't just fail
2598 here, because of the possibility of quantifiers with zero minima.
2599
2600 (b) If the JavaScript compatibility flag is set, set the length to zero
2601 so that the back reference matches an empty string.
2602
2603 Otherwise, set the length to the length of what was matched by the
2604 referenced subpattern. */
2605
2606 if (offset >= offset_top || md->offset_vector[offset] < 0)
2607 length = (md->jscript_compat)? 0 : -1;
2608 else
2609 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2610
2611 /* Set up for repetition, or handle the non-repeated case */
2612
2613 switch (*ecode)
2614 {
2615 case OP_CRSTAR:
2616 case OP_CRMINSTAR:
2617 case OP_CRPLUS:
2618 case OP_CRMINPLUS:
2619 case OP_CRQUERY:
2620 case OP_CRMINQUERY:
2621 c = *ecode++ - OP_CRSTAR;
2622 minimize = (c & 1) != 0;
2623 min = rep_min[c]; /* Pick up values from tables; */
2624 max = rep_max[c]; /* zero for max => infinity */
2625 if (max == 0) max = INT_MAX;
2626 break;
2627
2628 case OP_CRRANGE:
2629 case OP_CRMINRANGE:
2630 minimize = (*ecode == OP_CRMINRANGE);
2631 min = GET2(ecode, 1);
2632 max = GET2(ecode, 1 + IMM2_SIZE);
2633 if (max == 0) max = INT_MAX;
2634 ecode += 1 + 2 * IMM2_SIZE;
2635 break;
2636
2637 default: /* No repeat follows */
2638 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2639 {
2640 CHECK_PARTIAL();
2641 RRETURN(MATCH_NOMATCH);
2642 }
2643 eptr += length;
2644 continue; /* With the main loop */
2645 }
2646
2647 /* Handle repeated back references. If the length of the reference is
2648 zero, just continue with the main loop. If the length is negative, it
2649 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2650 zero, we can continue at the same level without recursion. For any other
2651 minimum, carrying on will result in NOMATCH. */
2652
2653 if (length == 0) continue;
2654 if (length < 0 && min == 0) continue;
2655
2656 /* First, ensure the minimum number of matches are present. We get back
2657 the length of the reference string explicitly rather than passing the
2658 address of eptr, so that eptr can be a register variable. */
2659
2660 for (i = 1; i <= min; i++)
2661 {
2662 int slength;
2663 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2664 {
2665 CHECK_PARTIAL();
2666 RRETURN(MATCH_NOMATCH);
2667 }
2668 eptr += slength;
2669 }
2670
2671 /* If min = max, continue at the same level without recursion.
2672 They are not both allowed to be zero. */
2673
2674 if (min == max) continue;
2675
2676 /* If minimizing, keep trying and advancing the pointer */
2677
2678 if (minimize)
2679 {
2680 for (fi = min;; fi++)
2681 {
2682 int slength;
2683 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2684 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2685 if (fi >= max) RRETURN(MATCH_NOMATCH);
2686 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2687 {
2688 CHECK_PARTIAL();
2689 RRETURN(MATCH_NOMATCH);
2690 }
2691 eptr += slength;
2692 }
2693 /* Control never gets here */
2694 }
2695
2696 /* If maximizing, find the longest string and work backwards */
2697
2698 else
2699 {
2700 pp = eptr;
2701 for (i = min; i < max; i++)
2702 {
2703 int slength;
2704 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2705 {
2706 CHECK_PARTIAL();
2707 break;
2708 }
2709 eptr += slength;
2710 }
2711 while (eptr >= pp)
2712 {
2713 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2714 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2715 eptr -= length;
2716 }
2717 RRETURN(MATCH_NOMATCH);
2718 }
2719 /* Control never gets here */
2720
2721 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2722 used when all the characters in the class have values in the range 0-255,
2723 and either the matching is caseful, or the characters are in the range
2724 0-127 when UTF-8 processing is enabled. The only difference between
2725 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2726 encountered.
2727
2728 First, look past the end of the item to see if there is repeat information
2729 following. Then obey similar code to character type repeats - written out
2730 again for speed. */
2731
2732 case OP_NCLASS:
2733 case OP_CLASS:
2734 {
2735 /* The data variable is saved across frames, so the byte map needs to
2736 be stored there. */
2737 #define BYTE_MAP ((pcre_uint8 *)data)
2738 data = ecode + 1; /* Save for matching */
2739 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2740
2741 switch (*ecode)
2742 {
2743 case OP_CRSTAR:
2744 case OP_CRMINSTAR:
2745 case OP_CRPLUS:
2746 case OP_CRMINPLUS:
2747 case OP_CRQUERY:
2748 case OP_CRMINQUERY:
2749 c = *ecode++ - OP_CRSTAR;
2750 minimize = (c & 1) != 0;
2751 min = rep_min[c]; /* Pick up values from tables; */
2752 max = rep_max[c]; /* zero for max => infinity */
2753 if (max == 0) max = INT_MAX;
2754 break;
2755
2756 case OP_CRRANGE:
2757 case OP_CRMINRANGE:
2758 minimize = (*ecode == OP_CRMINRANGE);
2759 min = GET2(ecode, 1);
2760 max = GET2(ecode, 1 + IMM2_SIZE);
2761 if (max == 0) max = INT_MAX;
2762 ecode += 1 + 2 * IMM2_SIZE;
2763 break;
2764
2765 default: /* No repeat follows */
2766 min = max = 1;
2767 break;
2768 }
2769
2770 /* First, ensure the minimum number of matches are present. */
2771
2772 #ifdef SUPPORT_UTF
2773 if (utf)
2774 {
2775 for (i = 1; i <= min; i++)
2776 {
2777 if (eptr >= md->end_subject)
2778 {
2779 SCHECK_PARTIAL();
2780 RRETURN(MATCH_NOMATCH);
2781 }
2782 GETCHARINC(c, eptr);
2783 if (c > 255)
2784 {
2785 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2786 }
2787 else
2788 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2789 }
2790 }
2791 else
2792 #endif
2793 /* Not UTF mode */
2794 {
2795 for (i = 1; i <= min; i++)
2796 {
2797 if (eptr >= md->end_subject)
2798 {
2799 SCHECK_PARTIAL();
2800 RRETURN(MATCH_NOMATCH);
2801 }
2802 c = *eptr++;
2803 #ifndef COMPILE_PCRE8
2804 if (c > 255)
2805 {
2806 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2807 }
2808 else
2809 #endif
2810 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2811 }
2812 }
2813
2814 /* If max == min we can continue with the main loop without the
2815 need to recurse. */
2816
2817 if (min == max) continue;
2818
2819 /* If minimizing, keep testing the rest of the expression and advancing
2820 the pointer while it matches the class. */
2821
2822 if (minimize)
2823 {
2824 #ifdef SUPPORT_UTF
2825 if (utf)
2826 {
2827 for (fi = min;; fi++)
2828 {
2829 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2831 if (fi >= max) RRETURN(MATCH_NOMATCH);
2832 if (eptr >= md->end_subject)
2833 {
2834 SCHECK_PARTIAL();
2835 RRETURN(MATCH_NOMATCH);
2836 }
2837 GETCHARINC(c, eptr);
2838 if (c > 255)
2839 {
2840 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2841 }
2842 else
2843 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2844 }
2845 }
2846 else
2847 #endif
2848 /* Not UTF mode */
2849 {
2850 for (fi = min;; fi++)
2851 {
2852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2854 if (fi >= max) RRETURN(MATCH_NOMATCH);
2855 if (eptr >= md->end_subject)
2856 {
2857 SCHECK_PARTIAL();
2858 RRETURN(MATCH_NOMATCH);
2859 }
2860 c = *eptr++;
2861 #ifndef COMPILE_PCRE8
2862 if (c > 255)
2863 {
2864 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2865 }
2866 else
2867 #endif
2868 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2869 }
2870 }
2871 /* Control never gets here */
2872 }
2873
2874 /* If maximizing, find the longest possible run, then work backwards. */
2875
2876 else
2877 {
2878 pp = eptr;
2879
2880 #ifdef SUPPORT_UTF
2881 if (utf)
2882 {
2883 for (i = min; i < max; i++)
2884 {
2885 int len = 1;
2886 if (eptr >= md->end_subject)
2887 {
2888 SCHECK_PARTIAL();
2889 break;
2890 }
2891 GETCHARLEN(c, eptr, len);
2892 if (c > 255)
2893 {
2894 if (op == OP_CLASS) break;
2895 }
2896 else
2897 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2898 eptr += len;
2899 }
2900 for (;;)
2901 {
2902 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2904 if (eptr-- == pp) break; /* Stop if tried at original pos */
2905 BACKCHAR(eptr);
2906 }
2907 }
2908 else
2909 #endif
2910 /* Not UTF mode */
2911 {
2912 for (i = min; i < max; i++)
2913 {
2914 if (eptr >= md->end_subject)
2915 {
2916 SCHECK_PARTIAL();
2917 break;
2918 }
2919 c = *eptr;
2920 #ifndef COMPILE_PCRE8
2921 if (c > 255)
2922 {
2923 if (op == OP_CLASS) break;
2924 }
2925 else
2926 #endif
2927 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2928 eptr++;
2929 }
2930 while (eptr >= pp)
2931 {
2932 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2934 eptr--;
2935 }
2936 }
2937
2938 RRETURN(MATCH_NOMATCH);
2939 }
2940 #undef BYTE_MAP
2941 }
2942 /* Control never gets here */
2943
2944
2945 /* Match an extended character class. This opcode is encountered only
2946 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2947 mode, because Unicode properties are supported in non-UTF-8 mode. */
2948
2949 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2950 case OP_XCLASS:
2951 {
2952 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2953 ecode += GET(ecode, 1); /* Advance past the item */
2954
2955 switch (*ecode)
2956 {
2957 case OP_CRSTAR:
2958 case OP_CRMINSTAR:
2959 case OP_CRPLUS:
2960 case OP_CRMINPLUS:
2961 case OP_CRQUERY:
2962 case OP_CRMINQUERY:
2963 c = *ecode++ - OP_CRSTAR;
2964 minimize = (c & 1) != 0;
2965 min = rep_min[c]; /* Pick up values from tables; */
2966 max = rep_max[c]; /* zero for max => infinity */
2967 if (max == 0) max = INT_MAX;
2968 break;
2969
2970 case OP_CRRANGE:
2971 case OP_CRMINRANGE:
2972 minimize = (*ecode == OP_CRMINRANGE);
2973 min = GET2(ecode, 1);
2974 max = GET2(ecode, 1 + IMM2_SIZE);
2975 if (max == 0) max = INT_MAX;
2976 ecode += 1 + 2 * IMM2_SIZE;
2977 break;
2978
2979 default: /* No repeat follows */
2980 min = max = 1;
2981 break;
2982 }
2983
2984 /* First, ensure the minimum number of matches are present. */
2985
2986 for (i = 1; i <= min; i++)
2987 {
2988 if (eptr >= md->end_subject)
2989 {
2990 SCHECK_PARTIAL();
2991 RRETURN(MATCH_NOMATCH);
2992 }
2993 GETCHARINCTEST(c, eptr);
2994 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2995 }
2996
2997 /* If max == min we can continue with the main loop without the
2998 need to recurse. */
2999
3000 if (min == max) continue;
3001
3002 /* If minimizing, keep testing the rest of the expression and advancing
3003 the pointer while it matches the class. */
3004
3005 if (minimize)
3006 {
3007 for (fi = min;; fi++)
3008 {
3009 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3011 if (fi >= max) RRETURN(MATCH_NOMATCH);
3012 if (eptr >= md->end_subject)
3013 {
3014 SCHECK_PARTIAL();
3015 RRETURN(MATCH_NOMATCH);
3016 }
3017 GETCHARINCTEST(c, eptr);
3018 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3019 }
3020 /* Control never gets here */
3021 }
3022
3023 /* If maximizing, find the longest possible run, then work backwards. */
3024
3025 else
3026 {
3027 pp = eptr;
3028 for (i = min; i < max; i++)
3029 {
3030 int len = 1;
3031 if (eptr >= md->end_subject)
3032 {
3033 SCHECK_PARTIAL();
3034 break;
3035 }
3036 #ifdef SUPPORT_UTF
3037 GETCHARLENTEST(c, eptr, len);
3038 #else
3039 c = *eptr;
3040 #endif
3041 if (!PRIV(xclass)(c, data, utf)) break;
3042 eptr += len;
3043 }
3044 for(;;)
3045 {
3046 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3047 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3048 if (eptr-- == pp) break; /* Stop if tried at original pos */
3049 #ifdef SUPPORT_UTF
3050 if (utf) BACKCHAR(eptr);
3051 #endif
3052 }
3053 RRETURN(MATCH_NOMATCH);
3054 }
3055
3056 /* Control never gets here */
3057 }
3058 #endif /* End of XCLASS */
3059
3060 /* Match a single character, casefully */
3061
3062 case OP_CHAR:
3063 #ifdef SUPPORT_UTF
3064 if (utf)
3065 {
3066 length = 1;
3067 ecode++;
3068 GETCHARLEN(fc, ecode, length);
3069 if (length > md->end_subject - eptr)
3070 {
3071 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3072 RRETURN(MATCH_NOMATCH);
3073 }
3074 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3075 }
3076 else
3077 #endif
3078 /* Not UTF mode */
3079 {
3080 if (md->end_subject - eptr < 1)
3081 {
3082 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3083 RRETURN(MATCH_NOMATCH);
3084 }
3085 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3086 ecode += 2;
3087 }
3088 break;
3089
3090 /* Match a single character, caselessly. If we are at the end of the
3091 subject, give up immediately. */
3092
3093 case OP_CHARI:
3094 if (eptr >= md->end_subject)
3095 {
3096 SCHECK_PARTIAL();
3097 RRETURN(MATCH_NOMATCH);
3098 }
3099
3100 #ifdef SUPPORT_UTF
3101 if (utf)
3102 {
3103 length = 1;
3104 ecode++;
3105 GETCHARLEN(fc, ecode, length);
3106
3107 /* If the pattern character's value is < 128, we have only one byte, and
3108 we know that its other case must also be one byte long, so we can use the
3109 fast lookup table. We know that there is at least one byte left in the
3110 subject. */
3111
3112 if (fc < 128)
3113 {
3114 if (md->lcc[fc]
3115 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3116 ecode++;
3117 eptr++;
3118 }
3119
3120 /* Otherwise we must pick up the subject character. Note that we cannot
3121 use the value of "length" to check for sufficient bytes left, because the
3122 other case of the character may have more or fewer bytes. */
3123
3124 else
3125 {
3126 unsigned int dc;
3127 GETCHARINC(dc, eptr);
3128 ecode += length;
3129
3130 /* If we have Unicode property support, we can use it to test the other
3131 case of the character, if there is one. */
3132
3133 if (fc != dc)
3134 {
3135 #ifdef SUPPORT_UCP
3136 if (dc != UCD_OTHERCASE(fc))
3137 #endif
3138 RRETURN(MATCH_NOMATCH);
3139 }
3140 }
3141 }
3142 else
3143 #endif /* SUPPORT_UTF */
3144
3145 /* Not UTF mode */
3146 {
3147 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3148 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3149 eptr++;
3150 ecode += 2;
3151 }
3152 break;
3153
3154 /* Match a single character repeatedly. */
3155
3156 case OP_EXACT:
3157 case OP_EXACTI:
3158 min = max = GET2(ecode, 1);
3159 ecode += 1 + IMM2_SIZE;
3160 goto REPEATCHAR;
3161
3162 case OP_POSUPTO:
3163 case OP_POSUPTOI:
3164 possessive = TRUE;
3165 /* Fall through */
3166
3167 case OP_UPTO:
3168 case OP_UPTOI:
3169 case OP_MINUPTO:
3170 case OP_MINUPTOI:
3171 min = 0;
3172 max = GET2(ecode, 1);
3173 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3174 ecode += 1 + IMM2_SIZE;
3175 goto REPEATCHAR;
3176
3177 case OP_POSSTAR:
3178 case OP_POSSTARI:
3179 possessive = TRUE;
3180 min = 0;
3181 max = INT_MAX;
3182 ecode++;
3183 goto REPEATCHAR;
3184
3185 case OP_POSPLUS:
3186 case OP_POSPLUSI:
3187 possessive = TRUE;
3188 min = 1;
3189 max = INT_MAX;
3190 ecode++;
3191 goto REPEATCHAR;
3192
3193 case OP_POSQUERY:
3194 case OP_POSQUERYI:
3195 possessive = TRUE;
3196 min = 0;
3197 max = 1;
3198 ecode++;
3199 goto REPEATCHAR;
3200
3201 case OP_STAR:
3202 case OP_STARI:
3203 case OP_MINSTAR:
3204 case OP_MINSTARI:
3205 case OP_PLUS:
3206 case OP_PLUSI:
3207 case OP_MINPLUS:
3208 case OP_MINPLUSI:
3209 case OP_QUERY:
3210 case OP_QUERYI:
3211 case OP_MINQUERY:
3212 case OP_MINQUERYI:
3213 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3214 minimize = (c & 1) != 0;
3215 min = rep_min[c]; /* Pick up values from tables; */
3216 max = rep_max[c]; /* zero for max => infinity */
3217 if (max == 0) max = INT_MAX;
3218
3219 /* Common code for all repeated single-character matches. */
3220
3221 REPEATCHAR:
3222 #ifdef SUPPORT_UTF
3223 if (utf)
3224 {
3225 length = 1;
3226 charptr = ecode;
3227 GETCHARLEN(fc, ecode, length);
3228 ecode += length;
3229
3230 /* Handle multibyte character matching specially here. There is
3231 support for caseless matching if UCP support is present. */
3232
3233 if (length > 1)
3234 {
3235 #ifdef SUPPORT_UCP
3236 unsigned int othercase;
3237 if (op >= OP_STARI && /* Caseless */
3238 (othercase = UCD_OTHERCASE(fc)) != fc)
3239 oclength = PRIV(ord2utf)(othercase, occhars);
3240 else oclength = 0;
3241 #endif /* SUPPORT_UCP */
3242
3243 for (i = 1; i <= min; i++)
3244 {
3245 if (eptr <= md->end_subject - length &&
3246 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3247 #ifdef SUPPORT_UCP
3248 else if (oclength > 0 &&
3249 eptr <= md->end_subject - oclength &&
3250 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3251 #endif /* SUPPORT_UCP */
3252 else
3253 {
3254 CHECK_PARTIAL();
3255 RRETURN(MATCH_NOMATCH);
3256 }
3257 }
3258
3259 if (min == max) continue;
3260
3261 if (minimize)
3262 {
3263 for (fi = min;; fi++)
3264 {
3265 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3266 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3267 if (fi >= max) RRETURN(MATCH_NOMATCH);
3268 if (eptr <= md->end_subject - length &&
3269 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3270 #ifdef SUPPORT_UCP
3271 else if (oclength > 0 &&
3272 eptr <= md->end_subject - oclength &&
3273 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3274 #endif /* SUPPORT_UCP */
3275 else
3276 {
3277 CHECK_PARTIAL();
3278 RRETURN(MATCH_NOMATCH);
3279 }
3280 }
3281 /* Control never gets here */
3282 }
3283
3284 else /* Maximize */
3285 {
3286 pp = eptr;
3287 for (i = min; i < max; i++)
3288 {
3289 if (eptr <= md->end_subject - length &&
3290 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3291 #ifdef SUPPORT_UCP
3292 else if (oclength > 0 &&
3293 eptr <= md->end_subject - oclength &&
3294 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3295 #endif /* SUPPORT_UCP */
3296 else
3297 {
3298 CHECK_PARTIAL();
3299 break;
3300 }
3301 }
3302
3303 if (possessive) continue;
3304
3305 for(;;)
3306 {
3307 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3308 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3309 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3310 #ifdef SUPPORT_UCP
3311 eptr--;
3312 BACKCHAR(eptr);
3313 #else /* without SUPPORT_UCP */
3314 eptr -= length;
3315 #endif /* SUPPORT_UCP */
3316 }
3317 }
3318 /* Control never gets here */
3319 }
3320
3321 /* If the length of a UTF-8 character is 1, we fall through here, and
3322 obey the code as for non-UTF-8 characters below, though in this case the
3323 value of fc will always be < 128. */
3324 }
3325 else
3326 #endif /* SUPPORT_UTF */
3327 /* When not in UTF-8 mode, load a single-byte character. */
3328 fc = *ecode++;
3329
3330 /* The value of fc at this point is always one character, though we may
3331 or may not be in UTF mode. The code is duplicated for the caseless and
3332 caseful cases, for speed, since matching characters is likely to be quite
3333 common. First, ensure the minimum number of matches are present. If min =
3334 max, continue at the same level without recursing. Otherwise, if
3335 minimizing, keep trying the rest of the expression and advancing one
3336 matching character if failing, up to the maximum. Alternatively, if
3337 maximizing, find the maximum number of characters and work backwards. */
3338
3339 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3340 max, eptr));
3341
3342 if (op >= OP_STARI) /* Caseless */
3343 {
3344 #ifdef COMPILE_PCRE8
3345 /* fc must be < 128 if UTF is enabled. */
3346 foc = md->fcc[fc];
3347 #else
3348 #ifdef SUPPORT_UTF
3349 #ifdef SUPPORT_UCP
3350 if (utf && fc > 127)
3351 foc = UCD_OTHERCASE(fc);
3352 #else
3353 if (utf && fc > 127)
3354 foc = fc;
3355 #endif /* SUPPORT_UCP */
3356 else
3357 #endif /* SUPPORT_UTF */
3358 foc = TABLE_GET(fc, md->fcc, fc);
3359 #endif /* COMPILE_PCRE8 */
3360
3361 for (i = 1; i <= min; i++)
3362 {
3363 if (eptr >= md->end_subject)
3364 {
3365 SCHECK_PARTIAL();
3366 RRETURN(MATCH_NOMATCH);
3367 }
3368 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3369 eptr++;
3370 }
3371 if (min == max) continue;
3372 if (minimize)
3373 {
3374 for (fi = min;; fi++)
3375 {
3376 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3377 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3378 if (fi >= max) RRETURN(MATCH_NOMATCH);
3379 if (eptr >= md->end_subject)
3380 {
3381 SCHECK_PARTIAL();
3382 RRETURN(MATCH_NOMATCH);
3383 }
3384 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3385 eptr++;
3386 }
3387 /* Control never gets here */
3388 }
3389 else /* Maximize */
3390 {
3391 pp = eptr;
3392 for (i = min; i < max; i++)
3393 {
3394 if (eptr >= md->end_subject)
3395 {
3396 SCHECK_PARTIAL();
3397 break;
3398 }
3399 if (fc != *eptr && foc != *eptr) break;
3400 eptr++;
3401 }
3402
3403 if (possessive) continue;
3404
3405 while (eptr >= pp)
3406 {
3407 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3408 eptr--;
3409 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3410 }
3411 RRETURN(MATCH_NOMATCH);
3412 }
3413 /* Control never gets here */
3414 }
3415
3416 /* Caseful comparisons (includes all multi-byte characters) */
3417
3418 else
3419 {
3420 for (i = 1; i <= min; i++)
3421 {
3422 if (eptr >= md->end_subject)
3423 {
3424 SCHECK_PARTIAL();
3425 RRETURN(MATCH_NOMATCH);
3426 }
3427 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3428 }
3429
3430 if (min == max) continue;
3431
3432 if (minimize)
3433 {
3434 for (fi = min;; fi++)
3435 {
3436 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3437 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438 if (fi >= max) RRETURN(MATCH_NOMATCH);
3439 if (eptr >= md->end_subject)
3440 {
3441 SCHECK_PARTIAL();
3442 RRETURN(MATCH_NOMATCH);
3443 }
3444 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3445 }
3446 /* Control never gets here */
3447 }
3448 else /* Maximize */
3449 {
3450 pp = eptr;
3451 for (i = min; i < max; i++)
3452 {
3453 if (eptr >= md->end_subject)
3454 {
3455 SCHECK_PARTIAL();
3456 break;
3457 }
3458 if (fc != *eptr) break;
3459 eptr++;
3460 }
3461 if (possessive) continue;
3462
3463 while (eptr >= pp)
3464 {
3465 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3466 eptr--;
3467 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3468 }
3469 RRETURN(MATCH_NOMATCH);
3470 }
3471 }
3472 /* Control never gets here */
3473
3474 /* Match a negated single one-byte character. The character we are
3475 checking can be multibyte. */
3476
3477 case OP_NOT:
3478 case OP_NOTI:
3479 if (eptr >= md->end_subject)
3480 {
3481 SCHECK_PARTIAL();
3482 RRETURN(MATCH_NOMATCH);
3483 }
3484 ecode++;
3485 GETCHARINCTEST(c, eptr);
3486 if (op == OP_NOTI) /* The caseless case */
3487 {
3488 register int ch, och;
3489 ch = *ecode++;
3490 #ifdef COMPILE_PCRE8
3491 /* ch must be < 128 if UTF is enabled. */
3492 och = md->fcc[ch];
3493 #else
3494 #ifdef SUPPORT_UTF
3495 #ifdef SUPPORT_UCP
3496 if (utf && ch > 127)
3497 och = UCD_OTHERCASE(ch);
3498 #else
3499 if (utf && ch > 127)
3500 och = ch;
3501 #endif /* SUPPORT_UCP */
3502 else
3503 #endif /* SUPPORT_UTF */
3504 och = TABLE_GET(ch, md->fcc, ch);
3505 #endif /* COMPILE_PCRE8 */
3506 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3507 }
3508 else /* Caseful */
3509 {
3510 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3511 }
3512 break;
3513
3514 /* Match a negated single one-byte character repeatedly. This is almost a
3515 repeat of the code for a repeated single character, but I haven't found a
3516 nice way of commoning these up that doesn't require a test of the
3517 positive/negative option for each character match. Maybe that wouldn't add
3518 very much to the time taken, but character matching *is* what this is all
3519 about... */
3520
3521 case OP_NOTEXACT:
3522 case OP_NOTEXACTI:
3523 min = max = GET2(ecode, 1);
3524 ecode += 1 + IMM2_SIZE;
3525 goto REPEATNOTCHAR;
3526
3527 case OP_NOTUPTO:
3528 case OP_NOTUPTOI:
3529 case OP_NOTMINUPTO:
3530 case OP_NOTMINUPTOI:
3531 min = 0;
3532 max = GET2(ecode, 1);
3533 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3534 ecode += 1 + IMM2_SIZE;
3535 goto REPEATNOTCHAR;
3536
3537 case OP_NOTPOSSTAR:
3538 case OP_NOTPOSSTARI:
3539 possessive = TRUE;
3540 min = 0;
3541 max = INT_MAX;
3542 ecode++;
3543 goto REPEATNOTCHAR;
3544
3545 case OP_NOTPOSPLUS:
3546 case OP_NOTPOSPLUSI:
3547 possessive = TRUE;
3548 min = 1;
3549 max = INT_MAX;
3550 ecode++;
3551 goto REPEATNOTCHAR;
3552
3553 case OP_NOTPOSQUERY:
3554 case OP_NOTPOSQUERYI:
3555 possessive = TRUE;
3556 min = 0;
3557 max = 1;
3558 ecode++;
3559 goto REPEATNOTCHAR;
3560
3561 case OP_NOTPOSUPTO:
3562 case OP_NOTPOSUPTOI:
3563 possessive = TRUE;
3564 min = 0;
3565 max = GET2(ecode, 1);
3566 ecode += 1 + IMM2_SIZE;
3567 goto REPEATNOTCHAR;
3568
3569 case OP_NOTSTAR:
3570 case OP_NOTSTARI:
3571 case OP_NOTMINSTAR:
3572 case OP_NOTMINSTARI:
3573 case OP_NOTPLUS:
3574 case OP_NOTPLUSI:
3575 case OP_NOTMINPLUS:
3576 case OP_NOTMINPLUSI:
3577 case OP_NOTQUERY:
3578 case OP_NOTQUERYI:
3579 case OP_NOTMINQUERY:
3580 case OP_NOTMINQUERYI:
3581 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3582 minimize = (c & 1) != 0;
3583 min = rep_min[c]; /* Pick up values from tables; */
3584 max = rep_max[c]; /* zero for max => infinity */
3585 if (max == 0) max = INT_MAX;
3586
3587 /* Common code for all repeated single-byte matches. */
3588
3589 REPEATNOTCHAR:
3590 fc = *ecode++;
3591
3592 /* The code is duplicated for the caseless and caseful cases, for speed,
3593 since matching characters is likely to be quite common. First, ensure the
3594 minimum number of matches are present. If min = max, continue at the same
3595 level without recursing. Otherwise, if minimizing, keep trying the rest of
3596 the expression and advancing one matching character if failing, up to the
3597 maximum. Alternatively, if maximizing, find the maximum number of
3598 characters and work backwards. */
3599
3600 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3601 max, eptr));
3602
3603 if (op >= OP_NOTSTARI) /* Caseless */
3604 {
3605 #ifdef COMPILE_PCRE8
3606 /* fc must be < 128 if UTF is enabled. */
3607 foc = md->fcc[fc];
3608 #else
3609 #ifdef SUPPORT_UTF
3610 #ifdef SUPPORT_UCP
3611 if (utf && fc > 127)
3612 foc = UCD_OTHERCASE(fc);
3613 #else
3614 if (utf && fc > 127)
3615 foc = fc;
3616 #endif /* SUPPORT_UCP */
3617 else
3618 #endif /* SUPPORT_UTF */
3619 foc = TABLE_GET(fc, md->fcc, fc);
3620 #endif /* COMPILE_PCRE8 */
3621
3622 #ifdef SUPPORT_UTF
3623 if (utf)
3624 {
3625 register unsigned int d;
3626 for (i = 1; i <= min; i++)
3627 {
3628 if (eptr >= md->end_subject)
3629 {
3630 SCHECK_PARTIAL();
3631 RRETURN(MATCH_NOMATCH);
3632 }
3633 GETCHARINC(d, eptr);
3634 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3635 }
3636 }
3637 else
3638 #endif
3639 /* Not UTF mode */
3640 {
3641 for (i = 1; i <= min; i++)
3642 {
3643 if (eptr >= md->end_subject)
3644 {
3645 SCHECK_PARTIAL();
3646 RRETURN(MATCH_NOMATCH);
3647 }
3648 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3649 eptr++;
3650 }
3651 }
3652
3653 if (min == max) continue;
3654
3655 if (minimize)
3656 {
3657 #ifdef SUPPORT_UTF
3658 if (utf)
3659 {
3660 register unsigned int d;
3661 for (fi = min;; fi++)
3662 {
3663 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3664 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3665 if (fi >= max) RRETURN(MATCH_NOMATCH);
3666 if (eptr >= md->end_subject)
3667 {
3668 SCHECK_PARTIAL();
3669 RRETURN(MATCH_NOMATCH);
3670 }
3671 GETCHARINC(d, eptr);
3672 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3673 }
3674 }
3675 else
3676 #endif
3677 /* Not UTF mode */
3678 {
3679 for (fi = min;; fi++)
3680 {
3681 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3682 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3683 if (fi >= max) RRETURN(MATCH_NOMATCH);
3684 if (eptr >= md->end_subject)
3685 {
3686 SCHECK_PARTIAL();
3687 RRETURN(MATCH_NOMATCH);
3688 }
3689 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3690 eptr++;
3691 }
3692 }
3693 /* Control never gets here */
3694 }
3695
3696 /* Maximize case */
3697
3698 else
3699 {
3700 pp = eptr;
3701
3702 #ifdef SUPPORT_UTF
3703 if (utf)
3704 {
3705 register unsigned int d;
3706 for (i = min; i < max; i++)
3707 {
3708 int len = 1;
3709 if (eptr >= md->end_subject)
3710 {
3711 SCHECK_PARTIAL();
3712 break;
3713 }
3714 GETCHARLEN(d, eptr, len);
3715 if (fc == d || foc == d) break;
3716 eptr += len;
3717 }
3718 if (possessive) continue;
3719 for(;;)
3720 {
3721 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3723 if (eptr-- == pp) break; /* Stop if tried at original pos */
3724 BACKCHAR(eptr);
3725 }
3726 }
3727 else
3728 #endif
3729 /* Not UTF mode */
3730 {
3731 for (i = min; i < max; i++)
3732 {
3733 if (eptr >= md->end_subject)
3734 {
3735 SCHECK_PARTIAL();
3736 break;
3737 }
3738 if (fc == *eptr || foc == *eptr) break;
3739 eptr++;
3740 }
3741 if (possessive) continue;
3742 while (eptr >= pp)
3743 {
3744 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3745 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3746 eptr--;
3747 }
3748 }
3749
3750 RRETURN(MATCH_NOMATCH);
3751 }
3752 /* Control never gets here */
3753 }
3754
3755 /* Caseful comparisons */
3756
3757 else
3758 {
3759 #ifdef SUPPORT_UTF
3760 if (utf)
3761 {
3762 register unsigned int d;
3763 for (i = 1; i <= min; i++)
3764 {
3765 if (eptr >= md->end_subject)
3766 {
3767 SCHECK_PARTIAL();
3768 RRETURN(MATCH_NOMATCH);
3769 }
3770 GETCHARINC(d, eptr);
3771 if (fc == d) RRETURN(MATCH_NOMATCH);
3772 }
3773 }
3774 else
3775 #endif
3776 /* Not UTF mode */
3777 {
3778 for (i = 1; i <= min; i++)
3779 {
3780 if (eptr >= md->end_subject)
3781 {
3782 SCHECK_PARTIAL();
3783 RRETURN(MATCH_NOMATCH);
3784 }
3785 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3786 }
3787 }
3788
3789 if (min == max) continue;
3790
3791 if (minimize)
3792 {
3793 #ifdef SUPPORT_UTF
3794 if (utf)
3795 {
3796 register unsigned int d;
3797 for (fi = min;; fi++)
3798 {
3799 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3800 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3801 if (fi >= max) RRETURN(MATCH_NOMATCH);
3802 if (eptr >= md->end_subject)
3803 {
3804 SCHECK_PARTIAL();
3805 RRETURN(MATCH_NOMATCH);
3806 }
3807 GETCHARINC(d, eptr);
3808 if (fc == d) RRETURN(MATCH_NOMATCH);
3809 }
3810 }
3811 else
3812 #endif
3813 /* Not UTF mode */
3814 {
3815 for (fi = min;; fi++)
3816 {
3817 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3818 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3819 if (fi >= max) RRETURN(MATCH_NOMATCH);
3820 if (eptr >= md->end_subject)
3821 {
3822 SCHECK_PARTIAL();
3823 RRETURN(MATCH_NOMATCH);
3824 }
3825 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3826 }
3827 }
3828 /* Control never gets here */
3829 }
3830
3831 /* Maximize case */
3832
3833 else
3834 {
3835 pp = eptr;
3836
3837 #ifdef SUPPORT_UTF
3838 if (utf)
3839 {
3840 register unsigned int d;
3841 for (i = min; i < max; i++)
3842 {
3843 int len = 1;
3844 if (eptr >= md->end_subject)
3845 {
3846 SCHECK_PARTIAL();
3847 break;
3848 }
3849 GETCHARLEN(d, eptr, len);
3850 if (fc == d) break;
3851 eptr += len;
3852 }
3853 if (possessive) continue;
3854 for(;;)
3855 {
3856 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858 if (eptr-- == pp) break; /* Stop if tried at original pos */
3859 BACKCHAR(eptr);
3860 }
3861 }
3862 else
3863 #endif
3864 /* Not UTF mode */
3865 {
3866 for (i = min; i < max; i++)
3867 {
3868 if (eptr >= md->end_subject)
3869 {
3870 SCHECK_PARTIAL();
3871 break;
3872 }
3873 if (fc == *eptr) break;
3874 eptr++;
3875 }
3876 if (possessive) continue;
3877 while (eptr >= pp)
3878 {
3879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3881 eptr--;
3882 }
3883 }
3884
3885 RRETURN(MATCH_NOMATCH);
3886 }
3887 }
3888 /* Control never gets here */
3889
3890 /* Match a single character type repeatedly; several different opcodes
3891 share code. This is very similar to the code for single characters, but we
3892 repeat it in the interests of efficiency. */
3893
3894 case OP_TYPEEXACT:
3895 min = max = GET2(ecode, 1);
3896 minimize = TRUE;
3897 ecode += 1 + IMM2_SIZE;
3898 goto REPEATTYPE;
3899
3900 case OP_TYPEUPTO:
3901 case OP_TYPEMINUPTO:
3902 min = 0;
3903 max = GET2(ecode, 1);
3904 minimize = *ecode == OP_TYPEMINUPTO;
3905 ecode += 1 + IMM2_SIZE;
3906 goto REPEATTYPE;
3907
3908 case OP_TYPEPOSSTAR:
3909 possessive = TRUE;
3910 min = 0;
3911 max = INT_MAX;
3912 ecode++;
3913 goto REPEATTYPE;
3914
3915 case OP_TYPEPOSPLUS:
3916 possessive = TRUE;
3917 min = 1;
3918 max = INT_MAX;
3919 ecode++;
3920 goto REPEATTYPE;
3921
3922 case OP_TYPEPOSQUERY:
3923 possessive = TRUE;
3924 min = 0;
3925 max = 1;
3926 ecode++;
3927 goto REPEATTYPE;
3928
3929 case OP_TYPEPOSUPTO:
3930 possessive = TRUE;
3931 min = 0;
3932 max = GET2(ecode, 1);
3933 ecode += 1 + IMM2_SIZE;
3934 goto REPEATTYPE;
3935
3936 case OP_TYPESTAR:
3937 case OP_TYPEMINSTAR:
3938 case OP_TYPEPLUS:
3939 case OP_TYPEMINPLUS:
3940 case OP_TYPEQUERY:
3941 case OP_TYPEMINQUERY:
3942 c = *ecode++ - OP_TYPESTAR;
3943 minimize = (c & 1) != 0;
3944 min = rep_min[c]; /* Pick up values from tables; */
3945 max = rep_max[c]; /* zero for max => infinity */
3946 if (max == 0) max = INT_MAX;
3947
3948 /* Common code for all repeated single character type matches. Note that
3949 in UTF-8 mode, '.' matches a character of any length, but for the other
3950 character types, the valid characters are all one-byte long. */
3951
3952 REPEATTYPE:
3953 ctype = *ecode++; /* Code for the character type */
3954
3955 #ifdef SUPPORT_UCP
3956 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3957 {
3958 prop_fail_result = ctype == OP_NOTPROP;
3959 prop_type = *ecode++;
3960 prop_value = *ecode++;
3961 }
3962 else prop_type = -1;
3963 #endif
3964
3965 /* First, ensure the minimum number of matches are present. Use inline
3966 code for maximizing the speed, and do the type test once at the start
3967 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3968 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3969 and single-bytes. */
3970
3971 if (min > 0)
3972 {
3973 #ifdef SUPPORT_UCP
3974 if (prop_type >= 0)
3975 {
3976 switch(prop_type)
3977 {
3978 case PT_ANY:
3979 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3980 for (i = 1; i <= min; i++)
3981 {
3982 if (eptr >= md->end_subject)
3983 {
3984 SCHECK_PARTIAL();
3985 RRETURN(MATCH_NOMATCH);
3986 }
3987 GETCHARINCTEST(c, eptr);
3988 }
3989 break;
3990
3991 case PT_LAMP:
3992 for (i = 1; i <= min; i++)
3993 {
3994 int chartype;
3995 if (eptr >= md->end_subject)
3996 {
3997 SCHECK_PARTIAL();
3998 RRETURN(MATCH_NOMATCH);
3999 }
4000 GETCHARINCTEST(c, eptr);
4001 chartype = UCD_CHARTYPE(c);
4002 if ((chartype == ucp_Lu ||
4003 chartype == ucp_Ll ||
4004 chartype == ucp_Lt) == prop_fail_result)
4005 RRETURN(MATCH_NOMATCH);
4006 }
4007 break;
4008
4009 case PT_GC:
4010 for (i = 1; i <= min; i++)
4011 {
4012 if (eptr >= md->end_subject)
4013 {
4014 SCHECK_PARTIAL();
4015 RRETURN(MATCH_NOMATCH);
4016 }
4017 GETCHARINCTEST(c, eptr);
4018 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4019 RRETURN(MATCH_NOMATCH);
4020 }
4021 break;
4022
4023 case PT_PC:
4024 for (i = 1; i <= min; i++)
4025 {
4026 if (eptr >= md->end_subject)
4027 {
4028 SCHECK_PARTIAL();
4029 RRETURN(MATCH_NOMATCH);
4030 }
4031 GETCHARINCTEST(c, eptr);
4032 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4033 RRETURN(MATCH_NOMATCH);
4034 }
4035 break;
4036
4037 case PT_SC:
4038 for (i = 1; i <= min; i++)
4039 {
4040 if (eptr >= md->end_subject)
4041 {
4042 SCHECK_PARTIAL();
4043 RRETURN(MATCH_NOMATCH);
4044 }
4045 GETCHARINCTEST(c, eptr);
4046 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4047 RRETURN(MATCH_NOMATCH);
4048 }
4049 break;
4050
4051 case PT_ALNUM:
4052 for (i = 1; i <= min; i++)
4053 {
4054 int category;
4055 if (eptr >= md->end_subject)
4056 {
4057 SCHECK_PARTIAL();
4058 RRETURN(MATCH_NOMATCH);
4059 }
4060 GETCHARINCTEST(c, eptr);
4061 category = UCD_CATEGORY(c);
4062 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4063 RRETURN(MATCH_NOMATCH);
4064 }
4065 break;
4066
4067 case PT_SPACE: /* Perl space */
4068 for (i = 1; i <= min; i++)
4069 {
4070 if (eptr >= md->end_subject)
4071 {
4072 SCHECK_PARTIAL();
4073 RRETURN(MATCH_NOMATCH);
4074 }
4075 GETCHARINCTEST(c, eptr);
4076 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4077 c == CHAR_FF || c == CHAR_CR)
4078 == prop_fail_result)
4079 RRETURN(MATCH_NOMATCH);
4080 }
4081 break;
4082
4083 case PT_PXSPACE: /* POSIX space */
4084 for (i = 1; i <= min; i++)
4085 {
4086 if (eptr >= md->end_subject)
4087 {
4088 SCHECK_PARTIAL();
4089 RRETURN(MATCH_NOMATCH);
4090 }
4091 GETCHARINCTEST(c, eptr);
4092 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4093 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4094 == prop_fail_result)
4095 RRETURN(MATCH_NOMATCH);
4096 }
4097 break;
4098
4099 case PT_WORD:
4100 for (i = 1; i <= min; i++)
4101 {
4102 int category;
4103 if (eptr >= md->end_subject)
4104 {
4105 SCHECK_PARTIAL();
4106 RRETURN(MATCH_NOMATCH);
4107 }
4108 GETCHARINCTEST(c, eptr);
4109 category = UCD_CATEGORY(c);
4110 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4111 == prop_fail_result)
4112 RRETURN(MATCH_NOMATCH);
4113 }
4114 break;
4115
4116 /* This should not occur */
4117
4118 default:
4119 RRETURN(PCRE_ERROR_INTERNAL);
4120 }
4121 }
4122
4123 /* Match extended Unicode sequences. We will get here only if the
4124 support is in the binary; otherwise a compile-time error occurs. */
4125
4126 else if (ctype == OP_EXTUNI)
4127 {
4128 for (i = 1; i <= min; i++)
4129 {
4130 if (eptr >= md->end_subject)
4131 {
4132 SCHECK_PARTIAL();
4133 RRETURN(MATCH_NOMATCH);
4134 }
4135 GETCHARINCTEST(c, eptr);
4136 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4137 while (eptr < md->end_subject)
4138 {
4139 int len = 1;
4140 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4141 if (UCD_CATEGORY(c) != ucp_M) break;
4142 eptr += len;
4143 }
4144 }
4145 }
4146
4147 else
4148 #endif /* SUPPORT_UCP */
4149
4150 /* Handle all other cases when the coding is UTF-8 */
4151
4152 #ifdef SUPPORT_UTF
4153 if (utf) switch(ctype)
4154 {
4155 case OP_ANY:
4156 for (i = 1; i <= min; i++)
4157 {
4158 if (eptr >= md->end_subject)
4159 {
4160 SCHECK_PARTIAL();
4161 RRETURN(MATCH_NOMATCH);
4162 }
4163 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4164 eptr++;
4165 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4166 }
4167 break;
4168
4169 case OP_ALLANY:
4170 for (i = 1; i <= min; i++)
4171 {
4172 if (eptr >= md->end_subject)
4173 {
4174 SCHECK_PARTIAL();
4175 RRETURN(MATCH_NOMATCH);
4176 }
4177 eptr++;
4178 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4179 }
4180 break;
4181
4182 case OP_ANYBYTE:
4183 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4184 eptr += min;
4185 break;
4186
4187 case OP_ANYNL:
4188 for (i = 1; i <= min; i++)
4189 {
4190 if (eptr >= md->end_subject)
4191 {
4192 SCHECK_PARTIAL();
4193 RRETURN(MATCH_NOMATCH);
4194 }
4195 GETCHARINC(c, eptr);
4196 switch(c)
4197 {
4198 default: RRETURN(MATCH_NOMATCH);
4199
4200 case 0x000d:
4201 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4202 break;
4203
4204 case 0x000a:
4205 break;
4206
4207 case 0x000b:
4208 case 0x000c:
4209 case 0x0085:
4210 case 0x2028:
4211 case 0x2029:
4212 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4213 break;
4214 }
4215 }
4216 break;
4217
4218 case OP_NOT_HSPACE:
4219 for (i = 1; i <= min; i++)
4220 {
4221 if (eptr >= md->end_subject)
4222 {
4223 SCHECK_PARTIAL();
4224 RRETURN(MATCH_NOMATCH);
4225 }
4226 GETCHARINC(c, eptr);
4227 switch(c)
4228 {
4229 default: break;
4230 case 0x09: /* HT */
4231 case 0x20: /* SPACE */
4232 case 0xa0: /* NBSP */
4233 case 0x1680: /* OGHAM SPACE MARK */
4234 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4235 case 0x2000: /* EN QUAD */
4236 case 0x2001: /* EM QUAD */
4237 case 0x2002: /* EN SPACE */
4238 case 0x2003: /* EM SPACE */
4239 case 0x2004: /* THREE-PER-EM SPACE */
4240 case 0x2005: /* FOUR-PER-EM SPACE */
4241 case 0x2006: /* SIX-PER-EM SPACE */
4242 case 0x2007: /* FIGURE SPACE */
4243 case 0x2008: /* PUNCTUATION SPACE */
4244 case 0x2009: /* THIN SPACE */
4245 case 0x200A: /* HAIR SPACE */
4246 case 0x202f: /* NARROW NO-BREAK SPACE */
4247 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4248 case 0x3000: /* IDEOGRAPHIC SPACE */
4249 RRETURN(MATCH_NOMATCH);
4250 }
4251 }
4252 break;
4253
4254 case OP_HSPACE:
4255 for (i = 1; i <= min; i++)
4256 {
4257 if (eptr >= md->end_subject)
4258 {
4259 SCHECK_PARTIAL();
4260 RRETURN(MATCH_NOMATCH);
4261 }
4262 GETCHARINC(c, eptr);
4263 switch(c)
4264 {
4265 default: RRETURN(MATCH_NOMATCH);
4266 case 0x09: /* HT */
4267 case 0x20: /* SPACE */
4268 case 0xa0: /* NBSP */
4269 case 0x1680: /* OGHAM SPACE MARK */
4270 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4271 case 0x2000: /* EN QUAD */
4272 case 0x2001: /* EM QUAD */
4273 case 0x2002: /* EN SPACE */
4274 case 0x2003: /* EM SPACE */
4275 case 0x2004: /* THREE-PER-EM SPACE */
4276 case 0x2005: /* FOUR-PER-EM SPACE */
4277 case 0x2006: /* SIX-PER-EM SPACE */
4278 case 0x2007: /* FIGURE SPACE */
4279 case 0x2008: /* PUNCTUATION SPACE */
4280 case 0x2009: /* THIN SPACE */
4281 case 0x200A: /* HAIR SPACE */
4282 case 0x202f: /* NARROW NO-BREAK SPACE */
4283 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4284 case 0x3000: /* IDEOGRAPHIC SPACE */
4285 break;
4286 }
4287 }
4288 break;
4289
4290 case OP_NOT_VSPACE:
4291 for (i = 1; i <= min; i++)
4292 {
4293 if (eptr >= md->end_subject)
4294 {
4295 SCHECK_PARTIAL();
4296 RRETURN(MATCH_NOMATCH);
4297 }
4298 GETCHARINC(c, eptr);
4299 switch(c)
4300 {
4301 default: break;
4302 case 0x0a: /* LF */
4303 case 0x0b: /* VT */
4304 case 0x0c: /* FF */
4305 case 0x0d: /* CR */
4306 case 0x85: /* NEL */
4307 case 0x2028: /* LINE SEPARATOR */
4308 case 0x2029: /* PARAGRAPH SEPARATOR */
4309 RRETURN(MATCH_NOMATCH);
4310 }
4311 }
4312 break;
4313
4314 case OP_VSPACE:
4315 for (i = 1; i <= min; i++)
4316 {
4317 if (eptr >= md->end_subject)
4318 {
4319 SCHECK_PARTIAL();
4320 RRETURN(MATCH_NOMATCH);
4321 }
4322 GETCHARINC(c, eptr);
4323 switch(c)
4324 {
4325 default: RRETURN(MATCH_NOMATCH);
4326 case 0x0a: /* LF */
4327 case 0x0b: /* VT */
4328 case 0x0c: /* FF */
4329 case 0x0d: /* CR */
4330 case 0x85: /* NEL */
4331 case 0x2028: /* LINE SEPARATOR */
4332 case 0x2029: /* PARAGRAPH SEPARATOR */
4333 break;
4334 }
4335 }
4336 break;
4337
4338 case OP_NOT_DIGIT:
4339 for (i = 1; i <= min; i++)
4340 {
4341 if (eptr >= md->end_subject)
4342 {
4343 SCHECK_PARTIAL();
4344 RRETURN(MATCH_NOMATCH);
4345 }
4346 GETCHARINC(c, eptr);
4347 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4348 RRETURN(MATCH_NOMATCH);
4349 }
4350 break;
4351
4352 case OP_DIGIT:
4353 for (i = 1; i <= min; i++)
4354 {
4355 if (eptr >= md->end_subject)
4356 {
4357 SCHECK_PARTIAL();
4358 RRETURN(MATCH_NOMATCH);
4359 }
4360 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4361 RRETURN(MATCH_NOMATCH);
4362 eptr++;
4363 /* No need to skip more bytes - we know it's a 1-byte character */
4364 }
4365 break;
4366
4367 case OP_NOT_WHITESPACE:
4368 for (i = 1; i <= min; i++)
4369 {
4370 if (eptr >= md->end_subject)
4371 {
4372 SCHECK_PARTIAL();
4373 RRETURN(MATCH_NOMATCH);
4374 }
4375 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4376 RRETURN(MATCH_NOMATCH);
4377 eptr++;
4378 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4379 }
4380 break;
4381
4382 case OP_WHITESPACE:
4383 for (i = 1; i <= min; i++)
4384 {
4385 if (eptr >= md->end_subject)
4386 {
4387 SCHECK_PARTIAL();
4388 RRETURN(MATCH_NOMATCH);
4389 }
4390 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4391 RRETURN(MATCH_NOMATCH);
4392 eptr++;
4393 /* No need to skip more bytes - we know it's a 1-byte character */
4394 }
4395 break;
4396
4397 case OP_NOT_WORDCHAR:
4398 for (i = 1; i <= min; i++)
4399 {
4400 if (eptr >= md->end_subject)
4401 {
4402 SCHECK_PARTIAL();
4403 RRETURN(MATCH_NOMATCH);
4404 }
4405 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4406 RRETURN(MATCH_NOMATCH);
4407 eptr++;
4408 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4409 }
4410 break;
4411
4412 case OP_WORDCHAR:
4413 for (i = 1; i <= min; i++)
4414 {
4415 if (eptr >= md->end_subject)
4416 {
4417 SCHECK_PARTIAL();
4418 RRETURN(MATCH_NOMATCH);
4419 }
4420 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4421 RRETURN(MATCH_NOMATCH);
4422 eptr++;
4423 /* No need to skip more bytes - we know it's a 1-byte character */
4424 }
4425 break;
4426
4427 default:
4428 RRETURN(PCRE_ERROR_INTERNAL);
4429 } /* End switch(ctype) */
4430
4431 else
4432 #endif /* SUPPORT_UTF */
4433
4434 /* Code for the non-UTF-8 case for minimum matching of operators other
4435 than OP_PROP and OP_NOTPROP. */
4436
4437 switch(ctype)
4438 {
4439 case OP_ANY:
4440 for (i = 1; i <= min; i++)
4441 {
4442 if (eptr >= md->end_subject)
4443 {
4444 SCHECK_PARTIAL();
4445 RRETURN(MATCH_NOMATCH);
4446 }
4447 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4448 eptr++;
4449 }
4450 break;
4451
4452 case OP_ALLANY:
4453 if (eptr > md->end_subject - min)
4454 {
4455 SCHECK_PARTIAL();
4456 RRETURN(MATCH_NOMATCH);
4457 }
4458 eptr += min;
4459 break;
4460
4461 case OP_ANYBYTE:
4462 if (eptr > md->end_subject - min)
4463 {
4464 SCHECK_PARTIAL();
4465 RRETURN(MATCH_NOMATCH);
4466 }
4467 eptr += min;
4468 break;
4469
4470 case OP_ANYNL:
4471 for (i = 1; i <= min; i++)
4472 {
4473 if (eptr >= md->end_subject)
4474 {
4475 SCHECK_PARTIAL();
4476 RRETURN(MATCH_NOMATCH);
4477 }
4478 switch(*eptr++)
4479 {
4480 default: RRETURN(MATCH_NOMATCH);
4481
4482 case 0x000d:
4483 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4484 break;
4485
4486 case 0x000a:
4487 break;
4488
4489 case 0x000b:
4490 case 0x000c:
4491 case 0x0085:
4492 #ifdef COMPILE_PCRE16
4493 case 0x2028:
4494 case 0x2029:
4495 #endif
4496 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4497 break;
4498 }
4499 }
4500 break;
4501
4502 case OP_NOT_HSPACE:
4503 for (i = 1; i <= min; i++)
4504 {
4505 if (eptr >= md->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 RRETURN(MATCH_NOMATCH);
4509 }
4510 switch(*eptr++)
4511 {
4512 default: break;
4513 case 0x09: /* HT */
4514 case 0x20: /* SPACE */
4515 case 0xa0: /* NBSP */
4516 #ifdef COMPILE_PCRE16
4517 case 0x1680: /* OGHAM SPACE MARK */
4518 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4519 case 0x2000: /* EN QUAD */
4520 case 0x2001: /* EM QUAD */
4521 case 0x2002: /* EN SPACE */
4522 case 0x2003: /* EM SPACE */
4523 case 0x2004: /* THREE-PER-EM SPACE */
4524 case 0x2005: /* FOUR-PER-EM SPACE */
4525 case 0x2006: /* SIX-PER-EM SPACE */
4526 case 0x2007: /* FIGURE SPACE */
4527 case 0x2008: /* PUNCTUATION SPACE */
4528 case 0x2009: /* THIN SPACE */
4529 case 0x200A: /* HAIR SPACE */
4530 case 0x202f: /* NARROW NO-BREAK SPACE */
4531 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4532 case 0x3000: /* IDEOGRAPHIC SPACE */
4533 #endif
4534 RRETURN(MATCH_NOMATCH);
4535 }
4536 }
4537 break;
4538
4539 case OP_HSPACE:
4540 for (i = 1; i <= min; i++)
4541 {
4542 if (eptr >= md->end_subject)
4543 {
4544 SCHECK_PARTIAL();
4545 RRETURN(MATCH_NOMATCH);
4546 }
4547 switch(*eptr++)
4548 {
4549 default: RRETURN(MATCH_NOMATCH);
4550 case 0x09: /* HT */
4551 case 0x20: /* SPACE */
4552 case 0xa0: /* NBSP */
4553 #ifdef COMPILE_PCRE16
4554 case 0x1680: /* OGHAM SPACE MARK */
4555 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4556 case 0x2000: /* EN QUAD */
4557 case 0x2001: /* EM QUAD */
4558 case 0x2002: /* EN SPACE */
4559 case 0x2003: /* EM SPACE */
4560 case 0x2004: /* THREE-PER-EM SPACE */
4561 case 0x2005: /* FOUR-PER-EM SPACE */
4562 case 0x2006: /* SIX-PER-EM SPACE */
4563 case 0x2007: /* FIGURE SPACE */
4564 case 0x2008: /* PUNCTUATION SPACE */
4565 case 0x2009: /* THIN SPACE */
4566 case 0x200A: /* HAIR SPACE */
4567 case 0x202f: /* NARROW NO-BREAK SPACE */
4568 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4569 case 0x3000: /* IDEOGRAPHIC SPACE */
4570 #endif
4571 break;
4572 }
4573 }
4574 break;
4575
4576 case OP_NOT_VSPACE:
4577 for (i = 1; i <= min; i++)
4578 {
4579 if (eptr >= md->end_subject)
4580 {
4581 SCHECK_PARTIAL();
4582 RRETURN(MATCH_NOMATCH);
4583 }
4584 switch(*eptr++)
4585 {
4586 default: break;
4587 case 0x0a: /* LF */
4588 case 0x0b: /* VT */
4589 case 0x0c: /* FF */
4590 case 0x0d: /* CR */
4591 case 0x85: /* NEL */
4592 #ifdef COMPILE_PCRE16
4593 case 0x2028: /* LINE SEPARATOR */
4594 case 0x2029: /* PARAGRAPH SEPARATOR */
4595 #endif
4596 RRETURN(MATCH_NOMATCH);
4597 }
4598 }
4599 break;
4600
4601 case OP_VSPACE:
4602 for (i = 1; i <= min; i++)
4603 {
4604 if (eptr >= md->end_subject)
4605 {
4606 SCHECK_PARTIAL();
4607 RRETURN(MATCH_NOMATCH);
4608 }
4609 switch(*eptr++)
4610 {
4611 default: RRETURN(MATCH_NOMATCH);
4612 case 0x0a: /* LF */
4613 case 0x0b: /* VT */
4614 case 0x0c: /* FF */
4615 case 0x0d: /* CR */
4616 case 0x85: /* NEL */
4617 #ifdef COMPILE_PCRE16
4618 case 0x2028: /* LINE SEPARATOR */
4619 case 0x2029: /* PARAGRAPH SEPARATOR */
4620 #endif
4621 break;
4622 }
4623 }
4624 break;
4625
4626 case OP_NOT_DIGIT:
4627 for (i = 1; i <= min; i++)
4628 {
4629 if (eptr >= md->end_subject)
4630 {
4631 SCHECK_PARTIAL();
4632 RRETURN(MATCH_NOMATCH);
4633 }
4634 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4635 RRETURN(MATCH_NOMATCH);
4636 eptr++;
4637 }
4638 break;
4639
4640 case OP_DIGIT:
4641 for (i = 1; i <= min; i++)
4642 {
4643 if (eptr >= md->end_subject)
4644 {
4645 SCHECK_PARTIAL();
4646 RRETURN(MATCH_NOMATCH);
4647 }
4648 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4649 RRETURN(MATCH_NOMATCH);
4650 eptr++;
4651 }
4652 break;
4653
4654 case OP_NOT_WHITESPACE:
4655 for (i = 1; i <= min; i++)
4656 {
4657 if (eptr >= md->end_subject)
4658 {
4659 SCHECK_PARTIAL();
4660 RRETURN(MATCH_NOMATCH);
4661 }
4662 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4663 RRETURN(MATCH_NOMATCH);
4664 eptr++;
4665 }
4666 break;
4667
4668 case OP_WHITESPACE:
4669 for (i = 1; i <= min; i++)
4670 {
4671 if (eptr >= md->end_subject)
4672 {
4673 SCHECK_PARTIAL();
4674 RRETURN(MATCH_NOMATCH);
4675 }
4676 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4677 RRETURN(MATCH_NOMATCH);
4678 eptr++;
4679 }
4680 break;
4681
4682 case OP_NOT_WORDCHAR:
4683 for (i = 1; i <= min; i++)
4684 {
4685 if (eptr >= md->end_subject)
4686 {
4687 SCHECK_PARTIAL();
4688 RRETURN(MATCH_NOMATCH);
4689 }
4690 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4691 RRETURN(MATCH_NOMATCH);
4692 eptr++;
4693 }
4694 break;
4695
4696 case OP_WORDCHAR:
4697 for (i = 1; i <= min; i++)
4698 {
4699 if (eptr >= md->end_subject)
4700 {
4701 SCHECK_PARTIAL();
4702 RRETURN(MATCH_NOMATCH);
4703 }
4704 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4705 RRETURN(MATCH_NOMATCH);
4706 eptr++;
4707 }
4708 break;
4709
4710 default:
4711 RRETURN(PCRE_ERROR_INTERNAL);
4712 }
4713 }
4714
4715 /* If min = max, continue at the same level without recursing */
4716
4717 if (min == max) continue;
4718
4719 /* If minimizing, we have to test the rest of the pattern before each
4720 subsequent match. Again, separate the UTF-8 case for speed, and also
4721 separate the UCP cases. */
4722
4723 if (minimize)
4724 {
4725 #ifdef SUPPORT_UCP
4726 if (prop_type >= 0)
4727 {
4728 switch(prop_type)
4729 {
4730 case PT_ANY:
4731 for (fi = min;; fi++)
4732 {
4733 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4734 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4735 if (fi >= max) RRETURN(MATCH_NOMATCH);
4736 if (eptr >= md->end_subject)
4737 {
4738 SCHECK_PARTIAL();
4739 RRETURN(MATCH_NOMATCH);
4740 }
4741 GETCHARINCTEST(c, eptr);
4742 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4743 }
4744 /* Control never gets here */
4745
4746 case PT_LAMP:
4747 for (fi = min;; fi++)
4748 {
4749 int chartype;
4750 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4751 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4752 if (fi >= max) RRETURN(MATCH_NOMATCH);
4753 if (eptr >= md->end_subject)
4754 {
4755 SCHECK_PARTIAL();
4756 RRETURN(MATCH_NOMATCH);
4757 }
4758 GETCHARINCTEST(c, eptr);
4759 chartype = UCD_CHARTYPE(c);
4760 if ((chartype == ucp_Lu ||
4761 chartype == ucp_Ll ||
4762 chartype == ucp_Lt) == prop_fail_result)
4763 RRETURN(MATCH_NOMATCH);
4764 }
4765 /* Control never gets here */
4766
4767 case PT_GC:
4768 for (fi = min;; fi++)
4769 {
4770 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4771 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4772 if (fi >= max) RRETURN(MATCH_NOMATCH);
4773 if (eptr >= md->end_subject)
4774 {
4775 SCHECK_PARTIAL();
4776 RRETURN(MATCH_NOMATCH);
4777 }
4778 GETCHARINCTEST(c, eptr);
4779 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4780 RRETURN(MATCH_NOMATCH);
4781 }
4782 /* Control never gets here */
4783
4784 case PT_PC:
4785 for (fi = min;; fi++)
4786 {
4787 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4788 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4789 if (fi >= max) RRETURN(MATCH_NOMATCH);
4790 if (eptr >= md->end_subject)
4791 {
4792 SCHECK_PARTIAL();
4793 RRETURN(MATCH_NOMATCH);
4794 }
4795 GETCHARINCTEST(c, eptr);
4796 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4797 RRETURN(MATCH_NOMATCH);
4798 }
4799 /* Control never gets here */
4800
4801 case PT_SC:
4802 for (fi = min;; fi++)
4803 {
4804 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4805 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4806 if (fi >= max) RRETURN(MATCH_NOMATCH);
4807 if (eptr >= md->end_subject)
4808 {
4809 SCHECK_PARTIAL();
4810 RRETURN(MATCH_NOMATCH);
4811 }
4812 GETCHARINCTEST(c, eptr);
4813 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4814 RRETURN(MATCH_NOMATCH);
4815 }
4816 /* Control never gets here */
4817
4818 case PT_ALNUM:
4819 for (fi = min;; fi++)
4820 {
4821 int category;
4822 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4824 if (fi >= max) RRETURN(MATCH_NOMATCH);
4825 if (eptr >= md->end_subject)
4826 {
4827 SCHECK_PARTIAL();
4828 RRETURN(MATCH_NOMATCH);
4829 }
4830 GETCHARINCTEST(c, eptr);
4831 category = UCD_CATEGORY(c);
4832 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4833 RRETURN(MATCH_NOMATCH);
4834 }
4835 /* Control never gets here */
4836
4837 case PT_SPACE: /* Perl space */
4838 for (fi = min;; fi++)
4839 {
4840 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4842 if (fi >= max) RRETURN(MATCH_NOMATCH);
4843 if (eptr >= md->end_subject)
4844 {
4845 SCHECK_PARTIAL();
4846 RRETURN(MATCH_NOMATCH);
4847 }
4848 GETCHARINCTEST(c, eptr);
4849 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4850 c == CHAR_FF || c == CHAR_CR)
4851 == prop_fail_result)
4852 RRETURN(MATCH_NOMATCH);
4853 }
4854 /* Control never gets here */
4855
4856 case PT_PXSPACE: /* POSIX space */
4857 for (fi = min;; fi++)
4858 {
4859 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4860 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4861 if (fi >= max) RRETURN(MATCH_NOMATCH);
4862 if (eptr >= md->end_subject)
4863 {
4864 SCHECK_PARTIAL();
4865 RRETURN(MATCH_NOMATCH);
4866 }
4867 GETCHARINCTEST(c, eptr);
4868 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4869 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4870 == prop_fail_result)
4871 RRETURN(MATCH_NOMATCH);
4872 }
4873 /* Control never gets here */
4874
4875 case PT_WORD:
4876 for (fi = min;; fi++)
4877 {
4878 int category;
4879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4881 if (fi >= max) RRETURN(MATCH_NOMATCH);
4882 if (eptr >= md->end_subject)
4883 {
4884 SCHECK_PARTIAL();
4885 RRETURN(MATCH_NOMATCH);
4886 }
4887 GETCHARINCTEST(c, eptr);
4888 category = UCD_CATEGORY(c);
4889 if ((category == ucp_L ||
4890 category == ucp_N ||
4891 c == CHAR_UNDERSCORE)
4892 == prop_fail_result)
4893 RRETURN(MATCH_NOMATCH);
4894 }
4895 /* Control never gets here */
4896
4897 /* This should never occur */
4898
4899 default:
4900 RRETURN(PCRE_ERROR_INTERNAL);
4901 }
4902 }
4903
4904 /* Match extended Unicode sequences. We will get here only if the
4905 support is in the binary; otherwise a compile-time error occurs. */
4906
4907 else if (ctype == OP_EXTUNI)
4908 {
4909 for (fi = min;; fi++)
4910 {
4911 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4912 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4913 if (fi >= max) RRETURN(MATCH_NOMATCH);
4914 if (eptr >= md->end_subject)
4915 {
4916 SCHECK_PARTIAL();
4917 RRETURN(MATCH_NOMATCH);
4918 }
4919 GETCHARINCTEST(c, eptr);
4920 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4921 while (eptr < md->end_subject)
4922 {
4923 int len = 1;
4924 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4925 if (UCD_CATEGORY(c) != ucp_M) break;
4926 eptr += len;
4927 }
4928 }
4929 }
4930 else
4931 #endif /* SUPPORT_UCP */
4932
4933 #ifdef SUPPORT_UTF
4934 if (utf)
4935 {
4936 for (fi = min;; fi++)
4937 {
4938 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4940 if (fi >= max) RRETURN(MATCH_NOMATCH);
4941 if (eptr >= md->end_subject)
4942 {
4943 SCHECK_PARTIAL();
4944 RRETURN(MATCH_NOMATCH);
4945 }
4946 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4947 RRETURN(MATCH_NOMATCH);
4948 GETCHARINC(c, eptr);
4949 switch(ctype)
4950 {
4951 case OP_ANY: /* This is the non-NL case */
4952 case OP_ALLANY:
4953 case OP_ANYBYTE:
4954 break;
4955
4956 case OP_ANYNL:
4957 switch(c)
4958 {
4959 default: RRETURN(MATCH_NOMATCH);
4960 case 0x000d:
4961 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4962 break;
4963 case 0x000a:
4964 break;
4965
4966 case 0x000b:
4967 case 0x000c:
4968 case 0x0085:
4969 case 0x2028:
4970 case 0x2029:
4971 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4972 break;
4973 }
4974 break;
4975
4976 case OP_NOT_HSPACE:
4977 switch(c)
4978 {
4979 default: break;
4980 case 0x09: /* HT */
4981 case 0x20: /* SPACE */
4982 case 0xa0: /* NBSP */
4983 case 0x1680: /* OGHAM SPACE MARK */
4984 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4985 case 0x2000: /* EN QUAD */
4986 case 0x2001: /* EM QUAD */
4987 case 0x2002: /* EN SPACE */
4988 case 0x2003: /* EM SPACE */
4989 case 0x2004: /* THREE-PER-EM SPACE */
4990 case 0x2005: /* FOUR-PER-EM SPACE */
4991 case 0x2006: /* SIX-PER-EM SPACE */
4992 case 0x2007: /* FIGURE SPACE */
4993 case 0x2008: /* PUNCTUATION SPACE */
4994 case 0x2009: /* THIN SPACE */
4995 case 0x200A: /* HAIR SPACE */
4996 case 0x202f: /* NARROW NO-BREAK SPACE */
4997 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4998 case 0x3000: /* IDEOGRAPHIC SPACE */
4999 RRETURN(MATCH_NOMATCH);
5000 }
5001 break;
5002
5003 case OP_HSPACE:
5004 switch(c)
5005 {
5006 default: RRETURN(MATCH_NOMATCH);
5007 case 0x09: /* HT */
5008 case 0x20: /* SPACE */
5009 case 0xa0: /* NBSP */
5010 case 0x1680: /* OGHAM SPACE MARK */
5011 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5012 case 0x2000: /* EN QUAD */
5013 case 0x2001: /* EM QUAD */
5014 case 0x2002: /* EN SPACE */
5015 case 0x2003: /* EM SPACE */
5016 case 0x2004: /* THREE-PER-EM SPACE */
5017 case 0x2005: /* FOUR-PER-EM SPACE */
5018 case 0x2006: /* SIX-PER-EM SPACE */
5019 case 0x2007: /* FIGURE SPACE */
5020 case 0x2008: /* PUNCTUATION SPACE */
5021 case 0x2009: /* THIN SPACE */
5022 case 0x200A: /* HAIR SPACE */
5023 case 0x202f: /* NARROW NO-BREAK SPACE */
5024 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5025 case 0x3000: /* IDEOGRAPHIC SPACE */
5026 break;
5027 }
5028 break;
5029
5030 case OP_NOT_VSPACE:
5031 switch(c)
5032 {
5033 default: break;
5034 case 0x0a: /* LF */
5035 case 0x0b: /* VT */
5036 case 0x0c: /* FF */
5037 case 0x0d: /* CR */
5038 case 0x85: /* NEL */
5039 case 0x2028: /* LINE SEPARATOR */
5040 case 0x2029: /* PARAGRAPH SEPARATOR */
5041 RRETURN(MATCH_NOMATCH);
5042 }
5043 break;
5044
5045 case OP_VSPACE:
5046 switch(c)
5047 {
5048 default: RRETURN(MATCH_NOMATCH);
5049 case 0x0a: /* LF */
5050 case 0x0b: /* VT */
5051 case 0x0c: /* FF */
5052 case 0x0d: /* CR */
5053 case 0x85: /* NEL */
5054 case 0x2028: /* LINE SEPARATOR */
5055 case 0x2029: /* PARAGRAPH SEPARATOR */
5056 break;
5057 }
5058 break;
5059
5060 case OP_NOT_DIGIT:
5061 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5062 RRETURN(MATCH_NOMATCH);
5063 break;
5064
5065 case OP_DIGIT:
5066 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5067 RRETURN(MATCH_NOMATCH);
5068 break;
5069
5070 case OP_NOT_WHITESPACE:
5071 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5072 RRETURN(MATCH_NOMATCH);
5073 break;
5074
5075 case OP_WHITESPACE:
5076 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5077 RRETURN(MATCH_NOMATCH);
5078 break;
5079
5080 case OP_NOT_WORDCHAR:
5081 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5082 RRETURN(MATCH_NOMATCH);
5083 break;
5084
5085 case OP_WORDCHAR:
5086 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5087 RRETURN(MATCH_NOMATCH);
5088 break;
5089
5090 default:
5091 RRETURN(PCRE_ERROR_INTERNAL);
5092 }
5093 }
5094 }
5095 else
5096 #endif
5097 /* Not UTF mode */
5098 {
5099 for (fi = min;; fi++)
5100 {
5101 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5102 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5103 if (fi >= max) RRETURN(MATCH_NOMATCH);
5104 if (eptr >= md->end_subject)
5105 {
5106 SCHECK_PARTIAL();
5107 RRETURN(MATCH_NOMATCH);
5108 }
5109 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5110 RRETURN(MATCH_NOMATCH);
5111 c = *eptr++;
5112 switch(ctype)
5113 {
5114 case OP_ANY: /* This is the non-NL case */
5115 case OP_ALLANY:
5116 case OP_ANYBYTE:
5117 break;
5118
5119 case OP_ANYNL:
5120 switch(c)
5121 {
5122 default: RRETURN(MATCH_NOMATCH);
5123 case 0x000d:
5124 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5125 break;
5126
5127 case 0x000a:
5128 break;
5129
5130 case 0x000b:
5131 case 0x000c:
5132 case 0x0085:
5133 #ifdef COMPILE_PCRE16
5134 case 0x2028:
5135 case 0x2029:
5136 #endif
5137 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5138 break;
5139 }
5140 break;
5141
5142 case OP_NOT_HSPACE:
5143 switch(c)
5144 {
5145 default: break;
5146 case 0x09: /* HT */
5147 case 0x20: /* SPACE */
5148 case 0xa0: /* NBSP */
5149 #ifdef COMPILE_PCRE16
5150 case 0x1680: /* OGHAM SPACE MARK */
5151 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5152 case 0x2000: /* EN QUAD */
5153 case 0x2001: /* EM QUAD */
5154 case 0x2002: /* EN SPACE */
5155 case 0x2003: /* EM SPACE */
5156 case 0x2004: /* THREE-PER-EM SPACE */
5157 case 0x2005: /* FOUR-PER-EM SPACE */
5158 case 0x2006: /* SIX-PER-EM SPACE */
5159 case 0x2007: /* FIGURE SPACE */
5160 case 0x2008: /* PUNCTUATION SPACE */
5161 case 0x2009: /* THIN SPACE */
5162 case 0x200A: /* HAIR SPACE */
5163 case 0x202f: /* NARROW NO-BREAK SPACE */
5164 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5165 case 0x3000: /* IDEOGRAPHIC SPACE */
5166 #endif
5167 RRETURN(MATCH_NOMATCH);
5168 }
5169 break;
5170
5171 case OP_HSPACE:
5172 switch(c)
5173 {
5174 default: RRETURN(MATCH_NOMATCH);
5175 case 0x09: /* HT */
5176 case 0x20: /* SPACE */
5177 case 0xa0: /* NBSP */
5178 #ifdef COMPILE_PCRE16
5179 case 0x1680: /* OGHAM SPACE MARK */
5180 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5181 case 0x2000: /* EN QUAD */
5182 case 0x2001: /* EM QUAD */
5183 case 0x2002: /* EN SPACE */
5184 case 0x2003: /* EM SPACE */
5185 case 0x2004: /* THREE-PER-EM SPACE */
5186 case 0x2005: /* FOUR-PER-EM SPACE */
5187 case 0x2006: /* SIX-PER-EM SPACE */
5188 case 0x2007: /* FIGURE SPACE */
5189 case 0x2008: /* PUNCTUATION SPACE */
5190 case 0x2009: /* THIN SPACE */
5191 case 0x200A: /* HAIR SPACE */
5192 case 0x202f: /* NARROW NO-BREAK SPACE */
5193 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5194 case 0x3000: /* IDEOGRAPHIC SPACE */
5195 #endif
5196 break;
5197 }
5198 break;
5199
5200 case OP_NOT_VSPACE:
5201 switch(c)
5202 {
5203 default: break;
5204 case 0x0a: /* LF */
5205 case 0x0b: /* VT */
5206 case 0x0c: /* FF */
5207 case 0x0d: /* CR */
5208 case 0x85: /* NEL */
5209 #ifdef COMPILE_PCRE16
5210 case 0x2028: /* LINE SEPARATOR */
5211 case 0x2029: /* PARAGRAPH SEPARATOR */
5212 #endif
5213 RRETURN(MATCH_NOMATCH);
5214 }
5215 break;
5216
5217 case OP_VSPACE:
5218 switch(c)
5219 {
5220 default: RRETURN(MATCH_NOMATCH);
5221 case 0x0a: /* LF */
5222 case 0x0b: /* VT */
5223 case 0x0c: /* FF */
5224 case 0x0d: /* CR */
5225 case 0x85: /* NEL */
5226 #ifdef COMPILE_PCRE16
5227 case 0x2028: /* LINE SEPARATOR */
5228 case 0x2029: /* PARAGRAPH SEPARATOR */
5229 #endif
5230 break;
5231 }
5232 break;
5233
5234 case OP_NOT_DIGIT:
5235 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5236 break;
5237
5238 case OP_DIGIT:
5239 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5240 break;
5241
5242 case OP_NOT_WHITESPACE:
5243 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5244 break;
5245
5246 case OP_WHITESPACE:
5247 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5248 break;
5249
5250 case OP_NOT_WORDCHAR:
5251 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5252 break;
5253
5254 case OP_WORDCHAR:
5255 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5256 break;
5257
5258 default:
5259 RRETURN(PCRE_ERROR_INTERNAL);
5260 }
5261 }
5262 }
5263 /* Control never gets here */
5264 }
5265
5266 /* If maximizing, it is worth using inline code for speed, doing the type
5267 test once at the start (i.e. keep it out of the loop). Again, keep the
5268 UTF-8 and UCP stuff separate. */
5269
5270 else
5271 {
5272 pp = eptr; /* Remember where we started */
5273
5274 #ifdef SUPPORT_UCP
5275 if (prop_type >= 0)
5276 {
5277 switch(prop_type)
5278 {
5279 case PT_ANY:
5280 for (i = min; i < max; i++)
5281 {
5282 int len = 1;
5283 if (eptr >= md->end_subject)
5284 {
5285 SCHECK_PARTIAL();
5286 break;
5287 }
5288 GETCHARLENTEST(c, eptr, len);
5289 if (prop_fail_result) break;
5290 eptr+= len;
5291 }
5292 break;
5293
5294 case PT_LAMP:
5295 for (i = min; i < max; i++)
5296 {
5297 int chartype;
5298 int len = 1;
5299 if (eptr >= md->end_subject)
5300 {
5301 SCHECK_PARTIAL();
5302 break;
5303 }
5304 GETCHARLENTEST(c, eptr, len);
5305 chartype = UCD_CHARTYPE(c);
5306 if ((chartype == ucp_Lu ||
5307 chartype == ucp_Ll ||
5308 chartype == ucp_Lt) == prop_fail_result)
5309 break;
5310 eptr+= len;
5311 }
5312 break;
5313
5314 case PT_GC:
5315 for (i = min; i < max; i++)
5316 {
5317 int len = 1;
5318 if (eptr >= md->end_subject)
5319 {
5320 SCHECK_PARTIAL();
5321 break;
5322 }
5323 GETCHARLENTEST(c, eptr, len);
5324 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5325 eptr+= len;
5326 }
5327 break;
5328
5329 case PT_PC:
5330 for (i = min; i < max; i++)
5331 {
5332 int len = 1;
5333 if (eptr >= md->end_subject)
5334 {
5335 SCHECK_PARTIAL();
5336 break;
5337 }
5338 GETCHARLENTEST(c, eptr, len);
5339 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5340 eptr+= len;
5341 }
5342 break;
5343
5344 case PT_SC:
5345 for (i = min; i < max; i++)
5346 {
5347 int len = 1;
5348 if (eptr >= md->end_subject)
5349 {
5350 SCHECK_PARTIAL();
5351 break;
5352 }
5353 GETCHARLENTEST(c, eptr, len);
5354 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5355 eptr+= len;
5356 }
5357 break;
5358
5359 case PT_ALNUM:
5360 for (i = min; i < max; i++)
5361 {
5362 int category;
5363 int len = 1;
5364 if (eptr >= md->end_subject)
5365 {
5366 SCHECK_PARTIAL();
5367 break;
5368 }
5369 GETCHARLENTEST(c, eptr, len);
5370 category = UCD_CATEGORY(c);
5371 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5372 break;
5373 eptr+= len;
5374 }
5375 break;
5376
5377 case PT_SPACE: /* Perl space */
5378 for (i = min; i < max; i++)
5379 {
5380 int len = 1;
5381 if (eptr >= md->end_subject)
5382 {
5383 SCHECK_PARTIAL();
5384 break;
5385 }
5386 GETCHARLENTEST(c, eptr, len);
5387 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5388 c == CHAR_FF || c == CHAR_CR)
5389 == prop_fail_result)
5390 break;
5391 eptr+= len;
5392 }
5393 break;
5394
5395 case PT_PXSPACE: /* POSIX space */
5396 for (i = min; i < max; i++)
5397 {
5398 int len = 1;
5399 if (eptr >= md->end_subject)
5400 {
5401 SCHECK_PARTIAL();
5402 break;
5403 }
5404 GETCHARLENTEST(c, eptr, len);
5405 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5406 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5407 == prop_fail_result)
5408 break;
5409 eptr+= len;
5410 }
5411 break;
5412
5413 case PT_WORD:
5414 for (i = min; i < max; i++)
5415 {
5416 int category;
5417 int len = 1;
5418 if (eptr >= md->end_subject)
5419 {
5420 SCHECK_PARTIAL();
5421 break;
5422 }
5423 GETCHARLENTEST(c, eptr, len);
5424 category = UCD_CATEGORY(c);
5425 if ((category == ucp_L || category == ucp_N ||
5426 c == CHAR_UNDERSCORE) == prop_fail_result)
5427 break;
5428 eptr+= len;
5429 }
5430 break;
5431
5432 default:
5433 RRETURN(PCRE_ERROR_INTERNAL);
5434 }
5435
5436 /* eptr is now past the end of the maximum run */
5437
5438 if (possessive) continue;
5439 for(;;)
5440 {
5441 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5442 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5443 if (eptr-- == pp) break; /* Stop if tried at original pos */
5444 if (utf) BACKCHAR(eptr);
5445 }
5446 }
5447
5448 /* Match extended Unicode sequences. We will get here only if the
5449 support is in the binary; otherwise a compile-time error occurs. */
5450
5451 else if (ctype == OP_EXTUNI)
5452 {
5453 for (i = min; i < max; i++)
5454 {
5455 int len = 1;
5456 if (eptr >= md->end_subject)
5457 {
5458 SCHECK_PARTIAL();
5459 break;
5460 }
5461 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5462 if (UCD_CATEGORY(c) == ucp_M) break;
5463 eptr += len;
5464 while (eptr < md->end_subject)
5465 {
5466 len = 1;
5467 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5468 if (UCD_CATEGORY(c) != ucp_M) break;
5469 eptr += len;
5470 }
5471 }
5472
5473 /* eptr is now past the end of the maximum run */
5474
5475 if (possessive) continue;
5476
5477 for(;;)
5478 {
5479 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5480 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5481 if (eptr-- == pp) break; /* Stop if tried at original pos */
5482 for (;;) /* Move back over one extended */
5483 {
5484 if (!utf) c = *eptr; else
5485 {
5486 BACKCHAR(eptr);
5487 GETCHAR(c, eptr);
5488 }
5489 if (UCD_CATEGORY(c) != ucp_M) break;
5490 eptr--;
5491 }
5492 }
5493 }
5494
5495 else
5496 #endif /* SUPPORT_UCP */
5497
5498 #ifdef SUPPORT_UTF
5499 if (utf)
5500 {
5501 switch(ctype)
5502 {
5503 case OP_ANY:
5504 if (max < INT_MAX)
5505 {
5506 for (i = min; i < max; i++)
5507 {
5508 if (eptr >= md->end_subject)
5509 {
5510 SCHECK_PARTIAL();
5511 break;
5512 }
5513 if (IS_NEWLINE(eptr)) break;
5514 eptr++;
5515 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5516 }
5517 }
5518
5519 /* Handle unlimited UTF-8 repeat */
5520
5521 else
5522 {
5523 for (i = min; i < max; i++)
5524 {
5525 if (eptr >= md->end_subject)
5526 {
5527 SCHECK_PARTIAL();
5528 break;
5529 }
5530 if (IS_NEWLINE(eptr)) break;
5531 eptr++;
5532 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5533 }
5534 }
5535 break;
5536
5537 case OP_ALLANY:
5538 if (max < INT_MAX)
5539 {
5540 for (i = min; i < max; i++)
5541 {
5542 if (eptr >= md->end_subject)
5543 {
5544 SCHECK_PARTIAL();
5545 break;
5546 }
5547 eptr++;
5548 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5549 }
5550 }
5551 else
5552 {
5553 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5554 SCHECK_PARTIAL();
5555 }
5556 break;
5557
5558 /* The byte case is the same as non-UTF8 */
5559
5560 case OP_ANYBYTE:
5561 c = max - min;
5562 if (c > (unsigned int)(md->end_subject - eptr))
5563 {
5564 eptr = md->end_subject;
5565 SCHECK_PARTIAL();
5566 }
5567 else eptr += c;
5568 break;
5569
5570 case OP_ANYNL:
5571 for (i = min; i < max; i++)
5572 {
5573 int len = 1;
5574 if (eptr >= md->end_subject)
5575 {
5576 SCHECK_PARTIAL();
5577 break;
5578 }
5579 GETCHARLEN(c, eptr, len);
5580 if (c == 0x000d)
5581 {
5582 if (++eptr >= md->end_subject) break;
5583 if (*eptr == 0x000a) eptr++;
5584 }
5585 else
5586 {
5587 if (c != 0x000a &&
5588 (md->bsr_anycrlf ||
5589 (c != 0x000b && c != 0x000c &&
5590 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5591 break;
5592 eptr += len;
5593 }
5594 }
5595 break;
5596
5597 case OP_NOT_HSPACE:
5598 case OP_HSPACE:
5599 for (i = min; i < max; i++)
5600 {
5601 BOOL gotspace;
5602 int len = 1;
5603 if (eptr >= md->end_subject)
5604 {
5605 SCHECK_PARTIAL();
5606 break;
5607 }
5608 GETCHARLEN(c, eptr, len);
5609 switch(c)
5610 {
5611 default: gotspace = FALSE; break;
5612 case 0x09: /* HT */
5613 case 0x20: /* SPACE */
5614 case 0xa0: /* NBSP */
5615 case 0x1680: /* OGHAM SPACE MARK */
5616 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5617 case 0x2000: /* EN QUAD */
5618 case 0x2001: /* EM QUAD */
5619 case 0x2002: /* EN SPACE */
5620 case 0x2003: /* EM SPACE */
5621 case 0x2004: /* THREE-PER-EM SPACE */
5622 case 0x2005: /* FOUR-PER-EM SPACE */
5623 case 0x2006: /* SIX-PER-EM SPACE */
5624 case 0x2007: /* FIGURE SPACE */
5625 case 0x2008: /* PUNCTUATION SPACE */
5626 case 0x2009: /* THIN SPACE */
5627 case 0x200A: /* HAIR SPACE */
5628 case 0x202f: /* NARROW NO-BREAK SPACE */
5629 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5630 case 0x3000: /* IDEOGRAPHIC SPACE */
5631 gotspace = TRUE;
5632 break;
5633 }
5634 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5635 eptr += len;
5636 }
5637 break;
5638
5639 case OP_NOT_VSPACE:
5640 case OP_VSPACE:
5641 for (i = min; i < max; i++)
5642 {
5643 BOOL gotspace;
5644 int len = 1;
5645 if (eptr >= md->end_subject)
5646 {
5647 SCHECK_PARTIAL();
5648 break;
5649 }
5650 GETCHARLEN(c, eptr, len);
5651 switch(c)
5652 {
5653 default: gotspace = FALSE; break;
5654 case 0x0a: /* LF */
5655 case 0x0b: /* VT */
5656 case 0x0c: /* FF */
5657 case 0x0d: /* CR */
5658 case 0x85: /* NEL */
5659 case 0x2028: /* LINE SEPARATOR */
5660 case 0x2029: /* PARAGRAPH SEPARATOR */
5661 gotspace = TRUE;
5662 break;
5663 }
5664 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5665 eptr += len;
5666 }
5667 break;
5668
5669 case OP_NOT_DIGIT:
5670 for (i = min; i < max; i++)
5671 {
5672 int len = 1;
5673 if (eptr >= md->end_subject)
5674 {
5675 SCHECK_PARTIAL();
5676 break;
5677 }
5678 GETCHARLEN(c, eptr, len);
5679 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5680 eptr+= len;
5681 }
5682 break;
5683
5684 case OP_DIGIT:
5685 for (i = min; i < max; i++)
5686 {
5687 int len = 1;
5688 if (eptr >= md->end_subject)
5689 {
5690 SCHECK_PARTIAL();
5691 break;
5692 }
5693 GETCHARLEN(c, eptr, len);
5694 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5695 eptr+= len;
5696 }
5697 break;
5698
5699 case OP_NOT_WHITESPACE:
5700 for (i = min; i < max; i++)
5701 {
5702 int len = 1;
5703 if (eptr >= md->end_subject)
5704 {
5705 SCHECK_PARTIAL();
5706 break;
5707 }
5708 GETCHARLEN(c, eptr, len);
5709 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5710 eptr+= len;
5711 }
5712 break;
5713
5714 case OP_WHITESPACE:
5715 for (i = min; i < max; i++)
5716 {
5717 int len = 1;
5718 if (eptr >= md->end_subject)
5719 {
5720 SCHECK_PARTIAL();
5721 break;
5722 }
5723 GETCHARLEN(c, eptr, len);
5724 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5725 eptr+= len;
5726 }
5727 break;
5728
5729 case OP_NOT_WORDCHAR:
5730 for (i = min; i < max; i++)
5731 {
5732 int len = 1;
5733 if (eptr >= md->end_subject)
5734 {
5735 SCHECK_PARTIAL();
5736 break;
5737 }
5738 GETCHARLEN(c, eptr, len);
5739 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5740 eptr+= len;
5741 }
5742 break;
5743
5744 case OP_WORDCHAR:
5745 for (i = min; i < max; i++)
5746 {
5747 int len = 1;
5748 if (eptr >= md->end_subject)
5749 {
5750 SCHECK_PARTIAL();
5751 break;
5752 }
5753 GETCHARLEN(c, eptr, len);
5754 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5755 eptr+= len;
5756 }
5757 break;
5758
5759 default:
5760 RRETURN(PCRE_ERROR_INTERNAL);
5761 }
5762
5763 /* eptr is now past the end of the maximum run. If possessive, we are
5764 done (no backing up). Otherwise, match at this position; anything other
5765 than no match is immediately returned. For nomatch, back up one
5766 character, unless we are matching \R and the last thing matched was
5767 \r\n, in which case, back up two bytes. */
5768
5769 if (possessive) continue;
5770 for(;;)
5771 {
5772 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5774 if (eptr-- == pp) break; /* Stop if tried at original pos */
5775 BACKCHAR(eptr);
5776 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5777 eptr[-1] == '\r') eptr--;
5778 }
5779 }
5780 else
5781 #endif /* SUPPORT_UTF */
5782 /* Not UTF mode */
5783 {
5784 switch(ctype)
5785 {
5786 case OP_ANY:
5787 for (i = min; i < max; i++)
5788 {
5789 if (eptr >= md->end_subject)
5790 {
5791 SCHECK_PARTIAL();
5792 break;
5793 }
5794 if (IS_NEWLINE(eptr)) break;
5795 eptr++;
5796 }
5797 break;
5798
5799 case OP_ALLANY:
5800 case OP_ANYBYTE:
5801 c = max - min;
5802 if (c > (unsigned int)(md->end_subject - eptr))
5803 {
5804 eptr = md->end_subject;
5805 SCHECK_PARTIAL();
5806 }
5807 else eptr += c;
5808 break;
5809
5810 case OP_ANYNL:
5811 for (i = min; i < max; i++)
5812 {
5813 if (eptr >= md->end_subject)
5814 {
5815 SCHECK_PARTIAL();
5816 break;
5817 }
5818 c = *eptr;
5819 if (c == 0x000d)
5820 {
5821 if (++eptr >= md->end_subject) break;
5822 if (*eptr == 0x000a) eptr++;
5823 }
5824 else
5825 {
5826 if (c != 0x000a && (md->bsr_anycrlf ||
5827 (c != 0x000b && c != 0x000c && c != 0x0085
5828 #ifdef COMPILE_PCRE16
5829 && c != 0x2028 && c != 0x2029
5830 #endif
5831 ))) break;
5832 eptr++;
5833 }
5834 }
5835 break;
5836
5837 case OP_NOT_HSPACE:
5838 for (i = min; i < max; i++)
5839 {
5840 if (eptr >= md->end_subject)
5841 {
5842 SCHECK_PARTIAL();
5843 break;
5844 }
5845 c = *eptr;
5846 if (c == 0x09 || c == 0x20 || c == 0xa0
5847 #ifdef COMPILE_PCRE16
5848 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
5849 || c == 0x202f || c == 0x205f || c == 0x3000
5850 #endif
5851 ) break;
5852 eptr++;
5853 }
5854 break;
5855
5856 case OP_HSPACE:
5857 for (i = min; i < max; i++)
5858 {
5859 if (eptr >= md->end_subject)
5860 {
5861 SCHECK_PARTIAL();
5862 break;
5863 }
5864 c = *eptr;
5865 if (c != 0x09 && c != 0x20 && c != 0xa0
5866 #ifdef COMPILE_PCRE16
5867 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
5868 && c != 0x202f && c != 0x205f && c != 0x3000
5869 #endif
5870 ) break;
5871 eptr++;
5872 }
5873 break;
5874
5875 case OP_NOT_VSPACE:
5876 for (i = min; i < max; i++)
5877 {
5878 if (eptr >= md->end_subject)
5879 {
5880 SCHECK_PARTIAL();
5881 break;
5882 }
5883 c = *eptr;
5884 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
5885 #ifdef COMPILE_PCRE16
5886 || c == 0x2028 || c == 0x2029
5887 #endif
5888 ) break;
5889 eptr++;
5890 }
5891 break;
5892
5893 case OP_VSPACE:
5894 for (i = min; i < max; i++)
5895 {
5896 if (eptr >= md->end_subject)
5897 {
5898 SCHECK_PARTIAL();
5899 break;
5900 }
5901 c = *eptr;
5902 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
5903 #ifdef COMPILE_PCRE16
5904 && c != 0x2028 && c != 0x2029
5905 #endif
5906 ) break;
5907 eptr++;
5908 }
5909 break;
5910
5911 case OP_NOT_DIGIT:
5912 for (i = min; i < max; i++)
5913 {
5914 if (eptr >= md->end_subject)
5915 {
5916 SCHECK_PARTIAL();
5917 break;
5918 }
5919 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5920 eptr++;
5921 }
5922 break;
5923
5924 case OP_DIGIT:
5925 for (i = min; i < max; i++)
5926 {
5927 if (eptr >= md->end_subject)
5928 {
5929 SCHECK_PARTIAL();
5930 break;
5931 }
5932 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5933 eptr++;
5934 }
5935 break;
5936
5937 case OP_NOT_WHITESPACE:
5938 for (i = min; i < max; i++)
5939 {
5940 if (eptr >= md->end_subject)
5941 {
5942 SCHECK_PARTIAL();
5943 break;
5944 }
5945 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5946 eptr++;
5947 }
5948 break;
5949
5950 case OP_WHITESPACE:
5951 for (i = min; i < max; i++)
5952 {
5953 if (eptr >= md->end_subject)
5954 {
5955 SCHECK_PARTIAL();
5956 break;
5957 }
5958 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5959 eptr++;
5960 }
5961 break;
5962
5963 case OP_NOT_WORDCHAR:
5964 for (i = min; i < max; i++)
5965 {
5966 if (eptr >= md->end_subject)
5967 {
5968 SCHECK_PARTIAL();
5969 break;
5970 }
5971 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5972 eptr++;
5973 }
5974 break;
5975
5976 case OP_WORDCHAR:
5977 for (i = min; i < max; i++)
5978 {
5979 if (eptr >= md->end_subject)
5980 {
5981 SCHECK_PARTIAL();
5982 break;
5983 }
5984 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
5985 eptr++;
5986 }
5987 break;
5988
5989 default:
5990 RRETURN(PCRE_ERROR_INTERNAL);
5991 }
5992
5993 /* eptr is now past the end of the maximum run. If possessive, we are
5994 done (no backing up). Otherwise, match at this position; anything other
5995 than no match is immediately returned. For nomatch, back up one
5996 character (byte), unless we are matching \R and the last thing matched
5997 was \r\n, in which case, back up two bytes. */
5998
5999 if (possessive) continue;
6000 while (eptr >= pp)
6001 {
6002 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6004 eptr--;
6005 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
6006 eptr[-1] == '\r') eptr--;
6007 }
6008 }
6009
6010 /* Get here if we can't make it match with any permitted repetitions */
6011
6012 RRETURN(MATCH_NOMATCH);
6013 }
6014 /* Control never gets here */
6015
6016 /* There's been some horrible disaster. Arrival here can only mean there is
6017 something seriously wrong in the code above or the OP_xxx definitions. */
6018
6019 default:
6020 DPRINTF(("Unknown opcode %d\n", *ecode));
6021 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6022 }
6023
6024 /* Do not stick any code in here without much thought; it is assumed
6025 that "continue" in the code above comes out to here to repeat the main
6026 loop. */
6027
6028 } /* End of main loop */
6029 /* Control never reaches here */
6030
6031
6032 /* When compiling to use the heap rather than the stack for recursive calls to
6033 match(), the RRETURN() macro jumps here. The number that is saved in
6034 frame->Xwhere indicates which label we actually want to return to. */
6035
6036 #ifdef NO_RECURSE
6037 #define LBL(val) case val: goto L_RM##val;
6038 HEAP_RETURN:
6039 switch (frame->Xwhere)
6040 {
6041 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6042 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6043 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6044 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6045 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6046 LBL(65) LBL(66)
6047 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6048 LBL(21)
6049 #endif
6050 #ifdef SUPPORT_UTF
6051 LBL(16) LBL(18) LBL(20)
6052 LBL(22) LBL(23) LBL(28) LBL(30)
6053 LBL(32) LBL(34) LBL(42) LBL(46)
6054 #ifdef SUPPORT_UCP
6055 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6056 LBL(59) LBL(60) LBL(61) LBL(62)
6057 #endif /* SUPPORT_UCP */
6058 #endif /* SUPPORT_UTF */
6059 default:
6060 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6061
6062 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6063
6064 return PCRE_ERROR_INTERNAL;
6065 }
6066 #undef LBL
6067 #endif /* NO_RECURSE */
6068 }
6069
6070
6071 /***************************************************************************
6072 ****************************************************************************
6073 RECURSION IN THE match() FUNCTION
6074
6075 Undefine all the macros that were defined above to handle this. */
6076
6077 #ifdef NO_RECURSE /* heap-based recursion build: undefine the macros set up above for match() */
6078 #undef eptr
6079 #undef ecode
6080 #undef mstart
6081 #undef offset_top
6082 #undef eptrb
6083 #undef flags
6084
6085 #undef callpat
6086 #undef charptr
6087 #undef data
6088 #undef next
6089 #undef pp
6090 #undef prev
6091 #undef saved_eptr
6092
6093 #undef new_recursive
6094
6095 #undef cur_is_word
6096 #undef condition
6097 #undef prev_is_word
6098
6099 #undef ctype
6100 #undef length
6101 #undef max
6102 #undef min
6103 #undef number
6104 #undef offset
6105 #undef op
6106 #undef save_capture_last
6107 #undef save_offset1
6108 #undef save_offset2
6109 #undef save_offset3
6110 #undef stacksave
6111
6112 #undef newptrb
6113
6114 #endif /* NO_RECURSE */
6115
6116 /* fc and fi are defined as macros whether or not NO_RECURSE is set, so they are undefined unconditionally */
6117
6118 #undef fc
6119 #undef fi
6120
6121 /***************************************************************************
6122 ***************************************************************************/
6123
6124
6125
6126 /*************************************************
6127 * Execute a Regular Expression *
6128 *************************************************/
6129
6130 /* This function applies a compiled re to a subject string and picks out
6131 portions of the string if it matches. Two elements in the vector are set for
6132 each substring: the offsets to the start and end of the substring.
6133
6134 Arguments:
6135 argument_re points to the compiled expression
6136 extra_data points to extra data or is NULL
6137 subject points to the subject string
6138 length length of subject string (may contain binary zeros)
6139 start_offset where to start in the subject string
6140 options option bits
6141 offsets points to a vector of ints to be filled in with offsets
6142 offsetcount the number of elements in the vector
6143
6144 Returns: > 0 => success; value is the number of elements filled in
6145 = 0 => success, but offsets is not big enough
6146 -1 => failed to match
6147 < -1 => some kind of unexpected problem
6148 */
6149
6150 #ifdef COMPILE_PCRE8
6151 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6152 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6153 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6154 int offsetcount)
6155 #else
6156 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6157 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6158 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6159 int offsetcount)
6160 #endif
6161 {
6162 int rc, ocount, arg_offset_max;
6163 int newline;
6164 BOOL using_temporary_offsets = FALSE;
6165 BOOL anchored;
6166 BOOL startline;
6167 BOOL firstline;
6168 BOOL utf;
6169 BOOL has_first_char = FALSE;
6170 BOOL has_req_char = FALSE;
6171 pcre_uchar first_char = 0;
6172 pcre_uchar first_char2 = 0;
6173 pcre_uchar req_char = 0;
6174 pcre_uchar req_char2 = 0;
6175 match_data match_block;
6176 match_data *md = &match_block;
6177 const pcre_uint8 *tables;
6178 const pcre_uint8 *start_bits = NULL;
6179 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6180 PCRE_PUCHAR end_subject;
6181 PCRE_PUCHAR start_partial = NULL;
6182 PCRE_PUCHAR req_char_ptr = start_match - 1;
6183
6184 const pcre_study_data *study;
6185 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6186
6187 /* Plausibility checks */
6188
6189 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6190 if (re == NULL || subject == NULL ||
6191 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6192 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6193 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6194
6195 /* These two settings are used in the code for checking a UTF-8 string that
6196 follows immediately afterwards. Other values in the md block are used only
6197 during "normal" pcre_exec() processing, not when the JIT support is in use,
6198 so they are set up later. */
6199
6200 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6201 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6202 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6203 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6204
6205 /* Check a UTF-8 string if required. Pass back the character offset and error
6206 code for an invalid string if a results vector is available. */
6207
6208 #ifdef SUPPORT_UTF
6209 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6210 {
6211 int erroroffset;
6212 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6213 if (errorcode != 0)
6214 {
6215 if (offsetcount >= 2)
6216 {
6217 offsets[0] = erroroffset;
6218 offsets[1] = errorcode;
6219 }
6220 #ifdef COMPILE_PCRE16
6221 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6222 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6223 #else
6224 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6225 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6226 #endif
6227 }
6228
6229 /* Check that a start_offset points to the start of a UTF character. */
6230 if (start_offset > 0 && start_offset < length &&
6231 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6232 return PCRE_ERROR_BADUTF8_OFFSET;
6233 }
6234 #endif
6235
6236 /* If the pattern was successfully studied with JIT support, run the JIT
6237 executable instead of the rest of this function. Most options must be set at
6238 compile time for the JIT code to be usable. Fallback to the normal code path if
6239 an unsupported flag is set. In particular, JIT does not support partial
6240 matching. */
6241
6242 #ifdef SUPPORT_JIT
6243 if (extra_data != NULL
6244 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6245 && extra_data->executable_jit != NULL
6246 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6247 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6248 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6249 return PRIV(jit_exec)(re, extra_data->executable_jit,
6250 (const pcre_uchar *)subject, length, start_offset, options,
6251 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6252 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6253 #endif
6254
6255 /* Carry on with non-JIT matching. This information is for finding all the
6256 numbers associated with a given name, for condition testing. */
6257
6258 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6259 md->name_count = re->name_count;
6260 md->name_entry_size = re->name_entry_size;
6261
6262 /* Fish out the optional data from the extra_data structure, first setting
6263 the default values. */
6264
6265 study = NULL;
6266 md->match_limit = MATCH_LIMIT;
6267 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6268 md->callout_data = NULL;
6269
6270 /* The table pointer is always in native byte order. */
6271
6272 tables = re->tables;
6273
6274 if (extra_data != NULL)
6275 {
6276 register unsigned int flags = extra_data->flags;
6277 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6278 study = (const pcre_study_data *)extra_data->study_data;
6279 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6280 md->match_limit = extra_data->match_limit;
6281 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6282 md->match_limit_recursion = extra_data->match_limit_recursion;
6283 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6284 md->callout_data = extra_data->callout_data;
6285 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6286 }
6287
6288 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6289 is a feature that makes it possible to save compiled regexes and re-use them
6290 in other programs later. */
6291
6292 if (tables == NULL) tables = PRIV(default_tables);
6293
6294 /* Check that the first field in the block is the magic number. If it is not,
6295 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6296 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6297 means that the pattern is likely compiled with different endianness. */
6298
6299 if (re->magic_number != MAGIC_NUMBER)
6300 return re->magic_number == REVERSED_MAGIC_NUMBER?
6301 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6302 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6303
6304 /* Set up other data */
6305
6306 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6307 startline = (re->flags & PCRE_STARTLINE) != 0;
6308 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6309
6310 /* The code starts after the real_pcre block and the capture name table. */
6311
6312 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6313 re->name_count * re->name_entry_size;
6314
6315 md->start_subject = (PCRE_PUCHAR)subject;
6316 md->start_offset = start_offset;
6317 md->end_subject = md->start_subject + length;
6318 end_subject = md->end_subject;
6319
6320 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6321 md->use_ucp = (re->options & PCRE_UCP) != 0;
6322 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6323 md->ignore_skip_arg = FALSE;
6324
6325 /* Some options are unpacked into BOOL variables in the hope that testing
6326 them will be faster than individual option bits. */
6327
6328 md->notbol = (options & PCRE_NOTBOL) != 0;
6329 md->noteol = (options & PCRE_NOTEOL) != 0;
6330 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6331 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6332
6333 md->hitend = FALSE;
6334 md->mark = md->nomatch_mark = NULL; /* In case never set */
6335
6336 md->recursive = NULL; /* No recursion at top level */
6337 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6338
6339 md->lcc = tables + lcc_offset;
6340 md->fcc = tables + fcc_offset;
6341 md->ctypes = tables + ctypes_offset;
6342
6343 /* Handle different \R options. */
6344
6345 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6346 {
6347 case 0:
6348 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6349 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6350 else
6351 #ifdef BSR_ANYCRLF
6352 md->bsr_anycrlf = TRUE;
6353 #else
6354 md->bsr_anycrlf = FALSE;
6355 #endif
6356 break;
6357
6358 case PCRE_BSR_ANYCRLF:
6359 md->bsr_anycrlf = TRUE;
6360 break;
6361
6362 case PCRE_BSR_UNICODE:
6363 md->bsr_anycrlf = FALSE;
6364 break;
6365
6366 default: return PCRE_ERROR_BADNEWLINE;
6367 }
6368
6369 /* Handle different types of newline. The three bits give eight cases. If
6370 nothing is set at run time, whatever was used at compile time applies. */
6371
6372 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6373 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6374 {
6375 case 0: newline = NEWLINE; break; /* Compile-time default */
6376 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6377 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6378 case PCRE_NEWLINE_CR+
6379 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6380 case PCRE_NEWLINE_ANY: newline = -1; break;
6381 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6382 default: return PCRE_ERROR_BADNEWLINE;
6383 }
6384
6385 if (newline == -2)
6386 {
6387 md->nltype = NLTYPE_ANYCRLF;
6388 }
6389 else if (newline < 0)
6390 {
6391 md->nltype = NLTYPE_ANY;
6392 }
6393 else
6394 {
6395 md->nltype = NLTYPE_FIXED;
6396 if (newline > 255)
6397 {
6398 md->nllen = 2;
6399 md->nl[0] = (newline >> 8) & 255;
6400 md->nl[1] = newline & 255;
6401 }
6402 else
6403 {
6404 md->nllen = 1;
6405 md->nl[0] = newline;
6406 }
6407 }
6408
6409 /* Partial matching was originally supported only for a restricted set of
6410 regexes; from release 8.00 there are no restrictions, but the bits are still
6411 defined (though never set). So there's no harm in leaving this code. */
6412
6413 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6414 return PCRE_ERROR_BADPARTIAL;
6415
6416 /* If the expression has got more back references than the offsets supplied can
6417 hold, we get a temporary chunk of working store to use during the matching.
6418 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6419 of 3. */
6420
6421 ocount = offsetcount - (offsetcount % 3);
6422 arg_offset_max = (2*ocount)/3;
6423
6424 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6425 {
6426 ocount = re->top_backref * 3 + 3;
6427 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6428 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6429 using_temporary_offsets = TRUE;
6430 DPRINTF(("Got memory to hold back references\n"));
6431 }
6432 else md->offset_vector = offsets;
6433
6434 md->offset_end = ocount;
6435 md->offset_max = (2*ocount)/3;
6436 md->offset_overflow = FALSE;
6437 md->capture_last = -1;
6438
6439 /* Reset the working variable associated with each extraction. These should
6440 never be used unless previously set, but they get saved and restored, and so we
6441 initialize them to avoid reading uninitialized locations. Also, unset the
6442 offsets for the matched string. This is really just for tidiness with callouts,
6443 in case they inspect these fields. */
6444
6445 if (md->offset_vector != NULL)
6446 {
6447 register int *iptr = md->offset_vector + ocount;
6448 register int *iend = iptr - re->top_bracket;
6449 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6450 while (--iptr >= iend) *iptr = -1;
6451 md->offset_vector[0] = md->offset_vector[1] = -1;
6452 }
6453
6454 /* Set up the first character to match, if available. The first_char value is
6455 never set for an anchored regular expression, but the anchoring may be forced
6456 at run time, so we have to test for anchoring. The first char may be unset for
6457 an unanchored pattern, of course. If there's no first char and the pattern was
6458 studied, there may be a bitmap of possible first characters. */
6459
6460 if (!anchored)
6461 {
6462 if ((re->flags & PCRE_FIRSTSET) != 0)
6463 {
6464 has_first_char = TRUE;
6465 first_char = first_char2 = re->first_char;
6466 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6467 {
6468 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6469 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6470 if (utf && first_char > 127)
6471 first_char2 = UCD_OTHERCASE(first_char);
6472 #endif
6473 }
6474 }
6475 else
6476 if (!startline && study != NULL &&
6477 (study->flags & PCRE_STUDY_MAPPED) != 0)
6478 start_bits = study->start_bits;
6479 }
6480
6481 /* For anchored or unanchored matches, there may be a "last known required
6482 character" set. */
6483
6484 if ((re->flags & PCRE_REQCHSET) != 0)
6485 {
6486 has_req_char = TRUE;
6487 req_char = req_char2 = re->req_char;
6488 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6489 {
6490 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6491 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6492 if (utf && req_char > 127)
6493 req_char2 = UCD_OTHERCASE(req_char);
6494 #endif
6495 }
6496 }
6497
6498
6499 /* ==========================================================================*/
6500
6501 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6502 the loop runs just once. */
6503
6504 for(;;)
6505 {
6506 PCRE_PUCHAR save_end_subject = end_subject;
6507 PCRE_PUCHAR new_start_match;
6508
6509 /* If firstline is TRUE, the start of the match is constrained to the first
6510 line of a multiline string. That is, the match must be before or at the first
6511 newline. Implement this by temporarily adjusting end_subject so that we stop
6512 scanning at a newline. If the match fails at the newline, later code breaks
6513 this loop. */
6514
6515 if (firstline)
6516 {
6517 PCRE_PUCHAR t = start_match;
6518 #ifdef SUPPORT_UTF
6519 if (utf)
6520 {
6521 while (t < md->end_subject && !IS_NEWLINE(t))
6522 {
6523 t++;
6524 ACROSSCHAR(t < end_subject, *t, t++);
6525 }
6526 }
6527 else
6528 #endif
6529 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6530 end_subject = t;
6531 }
6532
6533 /* There are some optimizations that avoid running the match if a known
6534 starting point is not found, or if a known later character is not present.
6535 However, there is an option that disables these, for testing and for ensuring
6536 that all callouts do actually occur. The option can be set in the regex by
6537 (*NO_START_OPT) or passed in match-time options. */
6538
6539 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6540 {
6541 /* Advance to a unique first char if there is one. */
6542
6543 if (has_first_char)
6544 {
6545 if (first_char != first_char2)
6546 while (start_match < end_subject &&
6547 *start_match != first_char && *start_match != first_char2)
6548 start_match++;
6549 else
6550 while (start_match < end_subject && *start_match != first_char)
6551 start_match++;
6552 }
6553
6554 /* Or to just after a linebreak for a multiline match */
6555
6556 else if (startline)
6557 {
6558 if (start_match > md->start_subject + start_offset)
6559 {
6560 #ifdef SUPPORT_UTF
6561 if (utf)
6562 {
6563 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6564 {
6565 start_match++;
6566 ACROSSCHAR(start_match < end_subject, *start_match,
6567 start_match++);
6568 }
6569 }
6570 else
6571 #endif
6572 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6573 start_match++;
6574
6575 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6576 and we are now at a LF, advance the match position by one more character.
6577 */
6578
6579 if (start_match[-1] == CHAR_CR &&
6580 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6581 start_match < end_subject &&
6582 *start_match == CHAR_NL)
6583 start_match++;
6584 }
6585 }
6586
6587 /* Or to a non-unique first byte after study */
6588
6589 else if (start_bits != NULL)
6590 {
6591 while (start_match < end_subject)
6592 {
6593 register unsigned int c = *start_match;
6594 #ifndef COMPILE_PCRE8
6595 if (c > 255) c = 255;
6596 #endif
6597 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6598 {
6599 start_match++;
6600 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6601 /* In non 8-bit mode, the iteration will stop for
6602 characters > 255 at the beginning or not stop at all. */
6603 if (utf)
6604 ACROSSCHAR(start_match < end_subject, *start_match,
6605 start_match++);
6606 #endif
6607 }
6608 else break;
6609 }
6610 }
6611 } /* Starting optimizations */
6612
6613 /* Restore fudged end_subject */
6614
6615 end_subject = save_end_subject;
6616
6617 /* The following two optimizations are disabled for partial matching or if
6618 disabling is explicitly requested. */
6619
6620 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6621 {
6622 /* If the pattern was studied, a minimum subject length may be set. This is
6623 a lower bound; it is possible that no string of that length actually matches the
6624 pattern. Although the value is, strictly, in characters, we treat it as
6625 bytes to avoid spending too much time in this optimization. */
6626
6627 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6628 (pcre_uint32)(end_subject - start_match) < study->minlength)
6629 {
6630 rc = MATCH_NOMATCH;
6631 break;
6632 }
6633
6634 /* If req_char is set, we know that that character must appear in the
6635 subject for the match to succeed. If the first character is set, req_char
6636 must be later in the subject; otherwise the test starts at the match point.
6637 This optimization can save a huge amount of backtracking in patterns with
6638 nested unlimited repeats that aren't going to match. Writing separate code
6639 for cased/caseless versions makes it go faster, as does using an
6640 autoincrement and backing off on a match.
6641
6642 HOWEVER: when the subject string is very, very long, searching to its end
6643 can take a long time, and give bad performance on quite ordinary patterns.
6644 This showed up when somebody was matching something like /^\d+C/ on a
6645 32-megabyte string... so we don't do this when the string is sufficiently
6646 long. */
6647
6648 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6649 {
6650 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6651
6652 /* We don't need to repeat the search if we haven't yet reached the
6653 place we found it at last time. */
6654
6655 if (p > req_char_ptr)
6656 {
6657 if (req_char != req_char2)
6658 {
6659 while (p < end_subject)
6660 {
6661 register int pp = *p++;
6662 if (pp == req_char || pp == req_char2) { p--; break; }
6663 }
6664 }
6665 else
6666 {
6667 while (p < end_subject)
6668 {
6669 if (*p++ == req_char) { p--; break; }
6670 }
6671 }
6672
6673 /* If we can't find the required character, break the matching loop,
6674 forcing a match failure. */
6675
6676 if (p >= end_subject)
6677 {
6678 rc = MATCH_NOMATCH;
6679 break;
6680 }
6681
6682 /* If we have found the required character, save the point where we
6683 found it, so that we don't search again next time round the loop if
6684 the start hasn't passed this character yet. */
6685
6686 req_char_ptr = p;
6687 }
6688 }
6689 }
6690
6691 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6692 printf(">>>> Match against: ");
6693 pchars(start_match, end_subject - start_match, TRUE, md);
6694 printf("\n");
6695 #endif
6696
6697 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6698 first starting point for which a partial match was found. */
6699
6700 md->start_match_ptr = start_match;
6701 md->start_used_ptr = start_match;
6702 md->match_call_count = 0;
6703 md->match_function_type = 0;
6704 md->end_offset_top = 0;
6705 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6706 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6707
6708 switch(rc)
6709 {
6710 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6711 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6712 entirely. The only way we can do that is to re-do the match at the same
6713 point, with a flag to force SKIP with an argument to be ignored. Just
6714 treating this case as NOMATCH does not work because it does not check other
6715 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6716
6717 case MATCH_SKIP_ARG:
6718 new_start_match = start_match;
6719 md->ignore_skip_arg = TRUE;
6720 break;
6721
6722 /* SKIP passes back the next starting point explicitly, but if it is the
6723 same as the match we have just done, treat it as NOMATCH. */
6724
6725 case MATCH_SKIP:
6726 if (md->start_match_ptr != start_match)
6727 {
6728 new_start_match = md->start_match_ptr;
6729 break;
6730 }
6731 /* Fall through */
6732
6733 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6734 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6735
6736 case MATCH_NOMATCH:
6737 case MATCH_PRUNE:
6738 case MATCH_THEN:
6739 md->ignore_skip_arg = FALSE;
6740 new_start_match = start_match + 1;
6741 #ifdef SUPPORT_UTF
6742 if (utf)
6743 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6744 new_start_match++);
6745 #endif