/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 821 - (show annotations)
Fri Dec 23 16:38:13 2011 UTC (7 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 206168 byte(s)
More pcretest 16-bit updates; also a bug fix in pcre_exec.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged.

The caller stores one of these values in md->match_function_type immediately
before invoking match(); when match() acts on MATCH_CBEGROUP it sets the field
back to 0. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes.

The values are consecutive, from -999 upwards. The opcode handlers in match()
return them to signal to outer levels that a backtracking verb or similar
construct has been passed through (for example OP_COMMIT returns MATCH_COMMIT
and OP_SKIP returns MATCH_SKIP). */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */

/* Read pairwise, the six entries give the ranges {0,inf}, {0,inf}, {1,inf},
{1,inf}, {0,1}, {0,1}. NOTE(review): the code that indexes these tables is not
in this part of the file; presumably each pair covers the greedy and minimizing
form of one of the *, + and ? repeats - confirm against the opcode ordering. */
92
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if the requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 {
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 if (eptr + length > md->end_subject) return -1;
221 while (length-- > 0) if (*p++ != *eptr++) return -1;
222 }
223
224 return (int)(eptr - eptr_start);
225 }
226
227
228
229 /***************************************************************************
230 ****************************************************************************
231 RECURSION IN THE match() FUNCTION
232
233 The match() function is highly recursive, though not every recursive call
234 increases the recursive depth. Nevertheless, some regular expressions can cause
235 it to recurse to a great depth. I was writing for Unix, so I just let it call
236 itself recursively. This uses the stack for saving everything that has to be
237 saved for a recursive call. On Unix, the stack can be large, and this works
238 fine.
239
240 It turns out that on some non-Unix-like systems there are problems with
241 programs that use a lot of stack. (This despite the fact that every last chip
242 has oodles of memory these days, and techniques for extending the stack have
243 been known for decades.) So....
244
245 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246 calls by keeping local variables that need to be preserved in blocks of memory
247 obtained from malloc() instead of on the stack. Macros are used to
248 achieve this so that the actual code doesn't look very different to what it
249 always used to.
250
251 The original heap-recursive code used longjmp(). However, it seems that this
252 can be very slow on some operating systems. Following a suggestion from Stan
253 Switzer, the use of longjmp() has been abolished, at the cost of having to
254 provide a unique number for each call to RMATCH. There is no way of generating
255 a sequence of numbers at compile time in C. I have given them names, to make
256 them stand out more clearly.
257
258 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 tests. Furthermore, not using longjmp() means that local dynamic variables
261 don't have indeterminate values; this has meant that the frame size can be
262 reduced because the result can be "passed back" by straight setting of the
263 variable instead of being passed in the frame.
264 ****************************************************************************
265 ***************************************************************************/
266
267 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268 below must be updated in sync.

Each enumerator is passed as the final ("rw") argument of one RMATCH call. The
heap-based RMATCH stores it in the current frame's Xwhere field and also pastes
it into the name of the L_<rw> label that follows the simulated call; the
stack-based RMATCH does not use it. Numbering starts at 1. */
269
270 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
271 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
272 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
273 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
274 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
275 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
276 RM61, RM62, RM63, RM64, RM65, RM66 };
277
278 /* These versions of the macros use the stack, as normal. There are debugging
279 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 actually used in this definition. */
281
282 #ifndef NO_RECURSE
283 #define REGISTER register
284
/* Stack-based RMATCH and RRETURN. RMATCH calls match() directly and leaves the
result in rrc; besides its own arguments it passes mstart and rdepth+1, which
are taken from the scope in which the macro is used. RRETURN is a plain return.
The PCRE_DEBUG variants additionally print the source line of the call and of
the return. Note that the debug variants expand to brace-enclosed blocks while
the production variants expand to a single expression or statement, and that
the debug RRETURN evaluates its argument twice. */
285 #ifdef PCRE_DEBUG
286 #define RMATCH(ra,rb,rc,rd,re,rw) \
287 { \
288 printf("match() called in line %d\n", __LINE__); \
289 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
290 printf("to line %d\n", __LINE__); \
291 }
292 #define RRETURN(ra) \
293 { \
294 printf("match() returned %d from line %d ", ra, __LINE__); \
295 return ra; \
296 }
297 #else
298 #define RMATCH(ra,rb,rc,rd,re,rw) \
299 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
300 #define RRETURN(ra) return ra
301 #endif
302
303 #else
304
305
306 /* These versions of the macros manage a private stack on the heap. Note that
307 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308 argument of match(), which never changes.

RMATCH obtains a new heapframe from PUBL(stack_malloc) and goes through RRETURN
with PCRE_ERROR_NOMEMORY if that fails. Otherwise it records rw in the current
frame's Xwhere, fills the new frame from ra, rb, mstart, rc and re, gives it a
depth one greater than the current frame's, links it back to the current frame,
makes it the current frame, and jumps to HEAP_RECURSE. The L_<rw> label defined
by the macro marks the point at which execution carries on after the simulated
call.

RRETURN releases the current frame with PUBL(stack_free) and makes its
predecessor current. If there is a predecessor, the value is placed in rrc and
control goes to HEAP_RETURN; if not, the value is returned from the function.
Note that "ra" is evaluated only after the frame has been released and
switched, so it must not depend on anything held in the frame being given up. */
309
310 #define REGISTER
311
312 #define RMATCH(ra,rb,rc,rd,re,rw)\
313 {\
314 heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
315 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
316 frame->Xwhere = rw; \
317 newframe->Xeptr = ra;\
318 newframe->Xecode = rb;\
319 newframe->Xmstart = mstart;\
320 newframe->Xoffset_top = rc;\
321 newframe->Xeptrb = re;\
322 newframe->Xrdepth = frame->Xrdepth + 1;\
323 newframe->Xprevframe = frame;\
324 frame = newframe;\
325 DPRINTF(("restarting from line %d\n", __LINE__));\
326 goto HEAP_RECURSE;\
327 L_##rw:\
328 DPRINTF(("jumped back to line %d\n", __LINE__));\
329 }
330
331 #define RRETURN(ra)\
332 {\
333 heapframe *oldframe = frame;\
334 frame = oldframe->Xprevframe;\
335 (PUBL(stack_free))(oldframe);\
336 if (frame != NULL)\
337 {\
338 rrc = ra;\
339 goto HEAP_RETURN;\
340 }\
341 return ra;\
342 }
343
344
345 /* Structure for remembering the local variables in a private frame

One heapframe stands in for one activation of match() when NO_RECURSE is
defined. Inside match() each field is reached through a macro with the same
name minus the leading X (for example eptr is frame->Xeptr), so the fields
here have to be kept in step with that list of macros. */
346
347 typedef struct heapframe {
/* Link to the frame of the caller; NULL marks the top-level frame. */
348 struct heapframe *Xprevframe;
349
350 /* Function arguments that may change */
351
352 PCRE_PUCHAR Xeptr;
353 const pcre_uchar *Xecode;
354 PCRE_PUCHAR Xmstart;
355 int Xoffset_top;
356 eptrblock *Xeptrb;
/* Depth of simulated recursion; RMATCH sets it to the parent's depth + 1. */
357 unsigned int Xrdepth;
358
359 /* Function local variables */
360
361 PCRE_PUCHAR Xcallpat;
362 #ifdef SUPPORT_UTF
363 PCRE_PUCHAR Xcharptr;
364 #endif
365 PCRE_PUCHAR Xdata;
366 PCRE_PUCHAR Xnext;
367 PCRE_PUCHAR Xpp;
368 PCRE_PUCHAR Xprev;
369 PCRE_PUCHAR Xsaved_eptr;
370
371 recursion_info Xnew_recursive;
372
373 BOOL Xcur_is_word;
374 BOOL Xcondition;
375 BOOL Xprev_is_word;
376
377 #ifdef SUPPORT_UCP
378 int Xprop_type;
379 int Xprop_value;
380 int Xprop_fail_result;
381 int Xoclength;
382 pcre_uchar Xocchars[6];
383 #endif
384
385 int Xcodelink;
386 int Xctype;
/* In the recursive build fc and fi are aliases of the locals c and i; here
they need storage of their own in the frame. */
387 unsigned int Xfc;
388 int Xfi;
389 int Xlength;
390 int Xmax;
391 int Xmin;
392 int Xnumber;
393 int Xoffset;
394 int Xop;
395 int Xsave_capture_last;
396 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397 int Xstacksave[REC_STACK_SAVE_MAX];
398
/* Block that match() links into the eptrb chain for MATCH_CBEGROUP calls. */
399 eptrblock Xnewptrb;
400
401 /* Where to jump back to: the RMn value stored by RMATCH before it switches to
the new frame. */
402
403 int Xwhere;
404
405 } heapframe;
406
407 #endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414
415 /*************************************************
416 * Match from current position *
417 *************************************************/
418
419 /* This function is called recursively in many circumstances. Whenever it
420 returns a negative (error) response, the outer incarnation must also return the
421 same response. */
422
423 /* These macros pack up tests that are used for partial matching, and which
424 appear several times in the code. We set the "hit end" flag if the pointer is
425 at the end of the subject and also past the start of the subject (i.e.
426 something has been matched). For hard partial matching, we then return
427 immediately. The second one is used when we already know we are past the end of
428 the subject.

Both macros refer to md, eptr and RRETURN by name, so those must be available
wherever they are used. "Past the start" is tested against md->start_used_ptr,
and a value of md->partial greater than 1 is what selects the immediate
PCRE_ERROR_PARTIAL return. Each macro expands to a bare "if" statement with no
enclosing block, so an invocation must not be followed directly by "else". */
429
430 #define CHECK_PARTIAL()\
431 if (md->partial != 0 && eptr >= md->end_subject && \
432 eptr > md->start_used_ptr) \
433 { \
434 md->hitend = TRUE; \
435 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
436 }
437
438 #define SCHECK_PARTIAL()\
439 if (md->partial != 0 && eptr > md->start_used_ptr) \
440 { \
441 md->hitend = TRUE; \
442 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
443 }
444
445
446 /* Performance note: It might be tempting to extract commonly used fields from
447 the md structure (e.g. utf, end_subject) into individual variables to improve
448 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449 made performance worse.
450
451 Arguments:
452 eptr pointer to current character in subject
453 ecode pointer to current position in compiled code
454 mstart pointer to the current match start position (can be modified
455 by encountering \K)
456 offset_top current top pointer
457 md pointer to "static" info for the match
458 eptrb pointer to chain of blocks containing eptr at start of
459 brackets - for testing for empty matches
460 rdepth the recursion depth
461
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
467 */
468
469 static int
470 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 unsigned int rdepth)
473 {
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
477
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf; /* Local copy of UTF flag for speed */
482
483 BOOL minimize, possessive; /* Quantifier options */
484 BOOL caseless;
485 int condcode;
486
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame" which is obtained from
489 heap storage. Set up the top-level frame here; others are obtained from the
490 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491
492 #ifdef NO_RECURSE
493 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
494 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 frame->Xprevframe = NULL; /* Marks the top level */
496
497 /* Copy in the original argument variables */
498
499 frame->Xeptr = eptr;
500 frame->Xecode = ecode;
501 frame->Xmstart = mstart;
502 frame->Xoffset_top = offset_top;
503 frame->Xeptrb = eptrb;
504 frame->Xrdepth = rdepth;
505
506 /* This is where control jumps back to to effect "recursion" */
507
508 HEAP_RECURSE:
509
510 /* Macros make the argument variables come from the current frame */
511
512 #define eptr frame->Xeptr
513 #define ecode frame->Xecode
514 #define mstart frame->Xmstart
515 #define offset_top frame->Xoffset_top
516 #define eptrb frame->Xeptrb
517 #define rdepth frame->Xrdepth
518
519 /* Ditto for the local variables */
520
521 #ifdef SUPPORT_UTF
522 #define charptr frame->Xcharptr
523 #endif
524 #define callpat frame->Xcallpat
525 #define codelink frame->Xcodelink
526 #define data frame->Xdata
527 #define next frame->Xnext
528 #define pp frame->Xpp
529 #define prev frame->Xprev
530 #define saved_eptr frame->Xsaved_eptr
531
532 #define new_recursive frame->Xnew_recursive
533
534 #define cur_is_word frame->Xcur_is_word
535 #define condition frame->Xcondition
536 #define prev_is_word frame->Xprev_is_word
537
538 #ifdef SUPPORT_UCP
539 #define prop_type frame->Xprop_type
540 #define prop_value frame->Xprop_value
541 #define prop_fail_result frame->Xprop_fail_result
542 #define oclength frame->Xoclength
543 #define occhars frame->Xocchars
544 #endif
545
546 #define ctype frame->Xctype
547 #define fc frame->Xfc
548 #define fi frame->Xfi
549 #define length frame->Xlength
550 #define max frame->Xmax
551 #define min frame->Xmin
552 #define number frame->Xnumber
553 #define offset frame->Xoffset
554 #define op frame->Xop
555 #define save_capture_last frame->Xsave_capture_last
556 #define save_offset1 frame->Xsave_offset1
557 #define save_offset2 frame->Xsave_offset2
558 #define save_offset3 frame->Xsave_offset3
559 #define stacksave frame->Xstacksave
560
561 #define newptrb frame->Xnewptrb
562
563 /* When recursion is being used, local variables are allocated on the stack and
564 get preserved during recursion in the normal way. In this environment, fi and
565 i, and fc and c, can be the same variables. */
566
567 #else /* NO_RECURSE not defined */
568 #define fi i
569 #define fc c
570
571 /* Many of the following variables are used only in small blocks of the code.
572 My normal style of coding would have declared them within each of those blocks.
573 However, in order to accommodate the version of this code that uses an external
574 "stack" implemented on the heap, it is easier to declare them all here, so the
575 declarations can be cut out in a block. The only declarations within blocks
576 below are for variables that do not have to be preserved over a recursive call
577 to RMATCH(). */
578
579 #ifdef SUPPORT_UTF
580 const pcre_uchar *charptr;
581 #endif
582 const pcre_uchar *callpat;
583 const pcre_uchar *data;
584 const pcre_uchar *next;
585 PCRE_PUCHAR pp;
586 const pcre_uchar *prev;
587 PCRE_PUCHAR saved_eptr;
588
589 recursion_info new_recursive;
590
591 BOOL cur_is_word;
592 BOOL condition;
593 BOOL prev_is_word;
594
595 #ifdef SUPPORT_UCP
596 int prop_type;
597 int prop_value;
598 int prop_fail_result;
599 int oclength;
600 pcre_uchar occhars[6];
601 #endif
602
603 int codelink;
604 int ctype;
605 int length;
606 int max;
607 int min;
608 int number;
609 int offset;
610 int op;
611 int save_capture_last;
612 int save_offset1, save_offset2, save_offset3;
613 int stacksave[REC_STACK_SAVE_MAX];
614
615 eptrblock newptrb;
616 #endif /* NO_RECURSE */
617
618 /* To save space on the stack and in the heap frame, I have doubled up on some
619 of the local variables that are used only in localised parts of the code, but
620 still need to be preserved over recursive calls of match(). These macros define
621 the alternative names that are used. */
622
623 #define allow_zero cur_is_word
624 #define cbegroup condition
625 #define code_offset codelink
626 #define condassert condition
627 #define matched_once prev_is_word
628 #define foc number
629
630 /* These statements are here to stop the compiler complaining about unitialized
631 variables. */
632
633 #ifdef SUPPORT_UCP
634 prop_value = 0;
635 prop_fail_result = 0;
636 #endif
637
638
639 /* This label is used for tail recursion, which is used in a few cases even
640 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
641 used. Thanks to Ian Taylor for noticing this possibility and sending the
642 original patch. */
643
644 TAIL_RECURSE:
645
646 /* OK, now we can get on with the real code of the function. Recursive calls
647 are specified by the macro RMATCH and RRETURN is used to return. When
648 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
649 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
650 defined). However, RMATCH isn't like a function call because it's quite a
651 complicated macro. It has to be used in one particular way. This shouldn't,
652 however, impact performance when true recursion is being used. */
653
654 #ifdef SUPPORT_UTF
655 utf = md->utf; /* Local copy of the flag */
656 #else
657 utf = FALSE;
658 #endif
659
660 /* First check that we haven't called match() too many times, or that we
661 haven't exceeded the recursive call limit. */
662
663 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
664 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
665
666 /* At the start of a group with an unlimited repeat that may match an empty
667 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
668 done this way to save having to use another function argument, which would take
669 up space on the stack. See also MATCH_CONDASSERT below.
670
671 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
672 such remembered pointers, to be checked when we hit the closing ket, in order
673 to break infinite loops that match no characters. When match() is called in
674 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
675 NOT be used with tail recursion, because the memory block that is used is on
676 the stack, so a new one may be required for each match(). */
677
678 if (md->match_function_type == MATCH_CBEGROUP)
679 {
680 newptrb.epb_saved_eptr = eptr;
681 newptrb.epb_prev = eptrb;
682 eptrb = &newptrb;
683 md->match_function_type = 0;
684 }
685
686 /* Now start processing the opcodes. */
687
688 for (;;)
689 {
690 minimize = possessive = FALSE;
691 op = *ecode;
692
693 switch(op)
694 {
695 case OP_MARK:
696 md->nomatch_mark = ecode + 2;
697 md->mark = NULL; /* In case previously set by assertion */
698 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
699 eptrb, RM55);
700 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
701 md->mark == NULL) md->mark = ecode + 2;
702
703 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
704 argument, and we must check whether that argument matches this MARK's
705 argument. It is passed back in md->start_match_ptr (an overloading of that
706 variable). If it does match, we reset that variable to the current subject
707 position and return MATCH_SKIP. Otherwise, pass back the return code
708 unaltered. */
709
710 else if (rrc == MATCH_SKIP_ARG &&
711 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
712 {
713 md->start_match_ptr = eptr;
714 RRETURN(MATCH_SKIP);
715 }
716 RRETURN(rrc);
717
718 case OP_FAIL:
719 RRETURN(MATCH_NOMATCH);
720
721 /* COMMIT overrides PRUNE, SKIP, and THEN */
722
723 case OP_COMMIT:
724 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
725 eptrb, RM52);
726 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
727 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
728 rrc != MATCH_THEN)
729 RRETURN(rrc);
730 RRETURN(MATCH_COMMIT);
731
732 /* PRUNE overrides THEN */
733
734 case OP_PRUNE:
735 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
736 eptrb, RM51);
737 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
738 RRETURN(MATCH_PRUNE);
739
740 case OP_PRUNE_ARG:
741 md->nomatch_mark = ecode + 2;
742 md->mark = NULL; /* In case previously set by assertion */
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 eptrb, RM56);
745 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746 md->mark == NULL) md->mark = ecode + 2;
747 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
748 RRETURN(MATCH_PRUNE);
749
750 /* SKIP overrides PRUNE and THEN */
751
752 case OP_SKIP:
753 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 eptrb, RM53);
755 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
756 RRETURN(rrc);
757 md->start_match_ptr = eptr; /* Pass back current position */
758 RRETURN(MATCH_SKIP);
759
760 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
761 nomatch_mark. There is a flag that disables this opcode when re-matching a
762 pattern that ended with a SKIP for which there was not a matching MARK. */
763
764 case OP_SKIP_ARG:
765 if (md->ignore_skip_arg)
766 {
767 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
768 break;
769 }
770 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
771 eptrb, RM57);
772 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
773 RRETURN(rrc);
774
775 /* Pass back the current skip name by overloading md->start_match_ptr and
776 returning the special MATCH_SKIP_ARG return code. This will either be
777 caught by a matching MARK, or get to the top, where it causes a rematch
778 with the md->ignore_skip_arg flag set. */
779
780 md->start_match_ptr = ecode + 2;
781 RRETURN(MATCH_SKIP_ARG);
782
783 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
784 the branch in which it occurs can be determined. Overload the start of
785 match pointer to do this. */
786
787 case OP_THEN:
788 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
789 eptrb, RM54);
790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
791 md->start_match_ptr = ecode;
792 RRETURN(MATCH_THEN);
793
794 case OP_THEN_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
798 md, eptrb, RM58);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode;
803 RRETURN(MATCH_THEN);
804
805 /* Handle an atomic group that does not contain any capturing parentheses.
806 This can be handled like an assertion. Prior to 8.13, all atomic groups
807 were handled this way. In 8.13, the code was changed as below for ONCE, so
808 that backups pass through the group and thereby reset captured values.
809 However, this uses a lot more stack, so in 8.20, atomic groups that do not
810 contain any captures generate OP_ONCE_NC, which can be handled in the old,
811 less stack intensive way.
812
813 Check the alternative branches in turn - the matching won't pass the KET
814 for this kind of subpattern. If any one branch matches, we carry on as at
815 the end of a normal bracket, leaving the subject pointer, but resetting
816 the start-of-match value in case it was changed by \K. */
817
818 case OP_ONCE_NC:
819 prev = ecode;
820 saved_eptr = eptr;
821 do
822 {
823 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
824 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
825 {
826 mstart = md->start_match_ptr;
827 break;
828 }
829 if (rrc == MATCH_THEN)
830 {
831 next = ecode + GET(ecode,1);
832 if (md->start_match_ptr < next &&
833 (*ecode == OP_ALT || *next == OP_ALT))
834 rrc = MATCH_NOMATCH;
835 }
836
837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
838 ecode += GET(ecode,1);
839 }
840 while (*ecode == OP_ALT);
841
842 /* If hit the end of the group (which could be repeated), fail */
843
844 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
845
846 /* Continue as from after the group, updating the offsets high water
847 mark, since extracts may have been taken. */
848
849 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
850
851 offset_top = md->end_offset_top;
852 eptr = md->end_match_ptr;
853
854 /* For a non-repeating ket, just continue at this level. This also
855 happens for a repeating ket if no characters were matched in the group.
856 This is the forcible breaking of infinite loops as implemented in Perl
857 5.005. */
858
859 if (*ecode == OP_KET || eptr == saved_eptr)
860 {
861 ecode += 1+LINK_SIZE;
862 break;
863 }
864
865 /* The repeating kets try the rest of the pattern or restart from the
866 preceding bracket, in the appropriate order. The second "call" of match()
867 uses tail recursion, to avoid using another stack frame. */
868
869 if (*ecode == OP_KETRMIN)
870 {
871 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873 ecode = prev;
874 goto TAIL_RECURSE;
875 }
876 else /* OP_KETRMAX */
877 {
878 md->match_function_type = MATCH_CBEGROUP;
879 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
881 ecode += 1 + LINK_SIZE;
882 goto TAIL_RECURSE;
883 }
884 /* Control never gets here */
885
886 /* Handle a capturing bracket, other than those that are possessive with an
887 unlimited repeat. If there is space in the offset vector, save the current
888 subject position in the working slot at the top of the vector. We mustn't
889 change the current values of the data slot, because they may be set from a
890 previous iteration of this group, and be referred to by a reference inside
891 the group. A failure to match might occur after the group has succeeded,
892 if something later on doesn't match. For this reason, we need to restore
893 the working value and also the values of the final offsets, in case they
894 were set by a previous iteration of the same bracket.
895
896 If there isn't enough space in the offset vector, treat this as if it were
897 a non-capturing bracket. Don't worry about setting the flag for the error
898 case here; that is handled in the code for KET. */
899
900 case OP_CBRA:
901 case OP_SCBRA:
902 number = GET2(ecode, 1+LINK_SIZE);
903 offset = number << 1;
904
905 #ifdef PCRE_DEBUG
906 printf("start bracket %d\n", number);
907 printf("subject=");
908 pchars(eptr, 16, TRUE, md);
909 printf("\n");
910 #endif
911
912 if (offset < md->offset_max)
913 {
914 save_offset1 = md->offset_vector[offset];
915 save_offset2 = md->offset_vector[offset+1];
916 save_offset3 = md->offset_vector[md->offset_end - number];
917 save_capture_last = md->capture_last;
918
919 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
920 md->offset_vector[md->offset_end - number] =
921 (int)(eptr - md->start_subject);
922
923 for (;;)
924 {
925 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
926 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
927 eptrb, RM1);
928 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
929
930 /* If we backed up to a THEN, check whether it is within the current
931 branch by comparing the address of the THEN that is passed back with
932 the end of the branch. If it is within the current branch, and the
933 branch is one of two or more alternatives (it either starts or ends
934 with OP_ALT), we have reached the limit of THEN's action, so convert
935 the return code to NOMATCH, which will cause normal backtracking to
936 happen from now on. Otherwise, THEN is passed back to an outer
937 alternative. This implements Perl's treatment of parenthesized groups,
938 where a group not containing | does not affect the current alternative,
939 that is, (X) is NOT the same as (X|(*F)). */
940
941 if (rrc == MATCH_THEN)
942 {
943 next = ecode + GET(ecode,1);
944 if (md->start_match_ptr < next &&
945 (*ecode == OP_ALT || *next == OP_ALT))
946 rrc = MATCH_NOMATCH;
947 }
948
949 /* Anything other than NOMATCH is passed back. */
950
951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
952 md->capture_last = save_capture_last;
953 ecode += GET(ecode, 1);
954 if (*ecode != OP_ALT) break;
955 }
956
957 DPRINTF(("bracket %d failed\n", number));
958 md->offset_vector[offset] = save_offset1;
959 md->offset_vector[offset+1] = save_offset2;
960 md->offset_vector[md->offset_end - number] = save_offset3;
961
962 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
963
964 RRETURN(rrc);
965 }
966
967 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
968 as a non-capturing bracket. */
969
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972
973 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
974
975 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
976 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
977
978 /* Non-capturing or atomic group, except for possessive with unlimited
979 repeat and ONCE group with no captures. Loop for all the alternatives.
980
981 When we get to the final alternative within the brackets, we used to return
982 the result of a recursive call to match() whatever happened so it was
983 possible to reduce stack usage by turning this into a tail recursion,
984 except in the case of a possibly empty group. However, now that there is
985 the possibility of (*THEN) occurring in the final alternative, this
986 optimization is no longer always possible.
987
988 We can optimize if we know there are no (*THEN)s in the pattern; at present
989 this is the best that can be done.
990
991 MATCH_ONCE is returned when the end of an atomic group is successfully
992 reached, but subsequent matching fails. It passes back up the tree (causing
993 captured values to be reset) until the original atomic group level is
994 reached. This is tested by comparing md->once_target with the start of the
995 group. At this point, the return is converted into MATCH_NOMATCH so that
996 previous backup points can be taken. */
997
998 case OP_ONCE:
999 case OP_BRA:
1000 case OP_SBRA:
1001 DPRINTF(("start non-capturing bracket\n"));
1002
1003 for (;;)
1004 {
1005 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1006
1007 /* If this is not a possibly empty group, and there are no (*THEN)s in
1008 the pattern, and this is the final alternative, optimize as described
1009 above. */
1010
1011 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1012 {
1013 ecode += PRIV(OP_lengths)[*ecode];
1014 goto TAIL_RECURSE;
1015 }
1016
1017 /* In all other cases, we have to make another call to match(). */
1018
1019 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1020 RM2);
1021
1022 /* See comment in the code for capturing groups above about handling
1023 THEN. */
1024
1025 if (rrc == MATCH_THEN)
1026 {
1027 next = ecode + GET(ecode,1);
1028 if (md->start_match_ptr < next &&
1029 (*ecode == OP_ALT || *next == OP_ALT))
1030 rrc = MATCH_NOMATCH;
1031 }
1032
1033 if (rrc != MATCH_NOMATCH)
1034 {
1035 if (rrc == MATCH_ONCE)
1036 {
1037 const pcre_uchar *scode = ecode;
1038 if (*scode != OP_ONCE) /* If not at start, find it */
1039 {
1040 while (*scode == OP_ALT) scode += GET(scode, 1);
1041 scode -= GET(scode, 1);
1042 }
1043 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1044 }
1045 RRETURN(rrc);
1046 }
1047 ecode += GET(ecode, 1);
1048 if (*ecode != OP_ALT) break;
1049 }
1050
1051 RRETURN(MATCH_NOMATCH);
1052
1053 /* Handle possessive capturing brackets with an unlimited repeat. We come
1054 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1055 handled similarly to the normal case above. However, the matching is
1056 different. The end of these brackets will always be OP_KETRPOS, which
1057 returns MATCH_KETRPOS without going further in the pattern. By this means
1058 we can handle the group by iteration rather than recursion, thereby
1059 reducing the amount of stack needed. */
1060
1061 case OP_CBRAPOS:
1062 case OP_SCBRAPOS:
1063 allow_zero = FALSE;
1064
1065 POSSESSIVE_CAPTURE:
1066 number = GET2(ecode, 1+LINK_SIZE);
1067 offset = number << 1;
1068
1069 #ifdef PCRE_DEBUG
1070 printf("start possessive bracket %d\n", number);
1071 printf("subject=");
1072 pchars(eptr, 16, TRUE, md);
1073 printf("\n");
1074 #endif
1075
1076 if (offset < md->offset_max)
1077 {
1078 matched_once = FALSE;
1079 code_offset = (int)(ecode - md->start_code);
1080
1081 save_offset1 = md->offset_vector[offset];
1082 save_offset2 = md->offset_vector[offset+1];
1083 save_offset3 = md->offset_vector[md->offset_end - number];
1084 save_capture_last = md->capture_last;
1085
1086 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1087
1088 /* Each time round the loop, save the current subject position for use
1089 when the group matches. For MATCH_MATCH, the group has matched, so we
1090 restart it with a new subject starting position, remembering that we had
1091 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1092 usual. If we haven't matched any alternatives in any iteration, check to
1093 see if a previous iteration matched. If so, the group has matched;
1094 continue from afterwards. Otherwise it has failed; restore the previous
1095 capture values before returning NOMATCH. */
1096
1097 for (;;)
1098 {
1099 md->offset_vector[md->offset_end - number] =
1100 (int)(eptr - md->start_subject);
1101 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1102 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1103 eptrb, RM63);
1104 if (rrc == MATCH_KETRPOS)
1105 {
1106 offset_top = md->end_offset_top;
1107 eptr = md->end_match_ptr;
1108 ecode = md->start_code + code_offset;
1109 save_capture_last = md->capture_last;
1110 matched_once = TRUE;
1111 continue;
1112 }
1113
1114 /* See comment in the code for capturing groups above about handling
1115 THEN. */
1116
1117 if (rrc == MATCH_THEN)
1118 {
1119 next = ecode + GET(ecode,1);
1120 if (md->start_match_ptr < next &&
1121 (*ecode == OP_ALT || *next == OP_ALT))
1122 rrc = MATCH_NOMATCH;
1123 }
1124
1125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1126 md->capture_last = save_capture_last;
1127 ecode += GET(ecode, 1);
1128 if (*ecode != OP_ALT) break;
1129 }
1130
1131 if (!matched_once)
1132 {
1133 md->offset_vector[offset] = save_offset1;
1134 md->offset_vector[offset+1] = save_offset2;
1135 md->offset_vector[md->offset_end - number] = save_offset3;
1136 }
1137
1138 if (allow_zero || matched_once)
1139 {
1140 ecode += 1 + LINK_SIZE;
1141 break;
1142 }
1143
1144 RRETURN(MATCH_NOMATCH);
1145 }
1146
1147 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1148 as a non-capturing bracket. */
1149
1150 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152
1153 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1154
1155 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1156 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1157
1158 /* Non-capturing possessive bracket with unlimited repeat. We come here
1159 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1160 without the capturing complication. It is written out separately for speed
1161 and cleanliness. */
1162
1163 case OP_BRAPOS:
1164 case OP_SBRAPOS:
1165 allow_zero = FALSE;
1166
1167 POSSESSIVE_NON_CAPTURE:
1168 matched_once = FALSE;
1169 code_offset = (int)(ecode - md->start_code);
1170
1171 for (;;)
1172 {
1173 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1174 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1175 eptrb, RM48);
1176 if (rrc == MATCH_KETRPOS)
1177 {
1178 offset_top = md->end_offset_top;
1179 eptr = md->end_match_ptr;
1180 ecode = md->start_code + code_offset;
1181 matched_once = TRUE;
1182 continue;
1183 }
1184
1185 /* See comment in the code for capturing groups above about handling
1186 THEN. */
1187
1188 if (rrc == MATCH_THEN)
1189 {
1190 next = ecode + GET(ecode,1);
1191 if (md->start_match_ptr < next &&
1192 (*ecode == OP_ALT || *next == OP_ALT))
1193 rrc = MATCH_NOMATCH;
1194 }
1195
1196 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197 ecode += GET(ecode, 1);
1198 if (*ecode != OP_ALT) break;
1199 }
1200
1201 if (matched_once || allow_zero)
1202 {
1203 ecode += 1 + LINK_SIZE;
1204 break;
1205 }
1206 RRETURN(MATCH_NOMATCH);
1207
1208 /* Control never reaches here. */
1209
1210 /* Conditional group: compilation checked that there are no more than
1211 two branches. If the condition is false, skipping the first branch takes us
1212 past the end if there is only one branch, but that's OK because that is
1213 exactly what going to the ket would do. */
1214
1215 case OP_COND:
1216 case OP_SCOND:
1217 codelink = GET(ecode, 1);
1218
1219 /* Because of the way auto-callout works during compile, a callout item is
1220 inserted between OP_COND and an assertion condition. */
1221
1222 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1223 {
1224 if (PUBL(callout) != NULL)
1225 {
1226 pcre_callout_block cb;
1227 cb.version = 2; /* Version 2 of the callout block */
1228 cb.callout_number = ecode[LINK_SIZE+2];
1229 cb.offset_vector = md->offset_vector;
1230 cb.subject = (PCRE_SPTR)md->start_subject;
1231 cb.subject_length = (int)(md->end_subject - md->start_subject);
1232 cb.start_match = (int)(mstart - md->start_subject);
1233 cb.current_position = (int)(eptr - md->start_subject);
1234 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1235 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1236 cb.capture_top = offset_top/2;
1237 cb.capture_last = md->capture_last;
1238 cb.callout_data = md->callout_data;
1239 cb.mark = md->nomatch_mark;
1240 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1241 if (rrc < 0) RRETURN(rrc);
1242 }
1243 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1244 }
1245
1246 condcode = ecode[LINK_SIZE+1];
1247
1248 /* Now see what the actual condition is */
1249
1250 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1251 {
1252 if (md->recursive == NULL) /* Not recursing => FALSE */
1253 {
1254 condition = FALSE;
1255 ecode += GET(ecode, 1);
1256 }
1257 else
1258 {
1259 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1260 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1261
1262 /* If the test is for recursion into a specific subpattern, and it is
1263 false, but the test was set up by name, scan the table to see if the
1264 name refers to any other numbers, and test them. The condition is true
1265 if any one is set. */
1266
1267 if (!condition && condcode == OP_NRREF)
1268 {
1269 pcre_uchar *slotA = md->name_table;
1270 for (i = 0; i < md->name_count; i++)
1271 {
1272 if (GET2(slotA, 0) == recno) break;
1273 slotA += md->name_entry_size;
1274 }
1275
1276 /* Found a name for the number - there can be only one; duplicate
1277 names for different numbers are allowed, but not vice versa. First
1278 scan down for duplicates. */
1279
1280 if (i < md->name_count)
1281 {
1282 pcre_uchar *slotB = slotA;
1283 while (slotB > md->name_table)
1284 {
1285 slotB -= md->name_entry_size;
1286 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1287 {
1288 condition = GET2(slotB, 0) == md->recursive->group_num;
1289 if (condition) break;
1290 }
1291 else break;
1292 }
1293
1294 /* Scan up for duplicates */
1295
1296 if (!condition)
1297 {
1298 slotB = slotA;
1299 for (i++; i < md->name_count; i++)
1300 {
1301 slotB += md->name_entry_size;
1302 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1303 {
1304 condition = GET2(slotB, 0) == md->recursive->group_num;
1305 if (condition) break;
1306 }
1307 else break;
1308 }
1309 }
1310 }
1311 }
1312
1313 /* Choose branch according to the condition */
1314
1315 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1316 }
1317 }
1318
1319 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1320 {
1321 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1322 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1323
1324 /* If the numbered capture is unset, but the reference was by name,
1325 scan the table to see if the name refers to any other numbers, and test
1326 them. The condition is true if any one is set. This is tediously similar
1327 to the code above, but not close enough to try to amalgamate. */
1328
1329 if (!condition && condcode == OP_NCREF)
1330 {
1331 int refno = offset >> 1;
1332 pcre_uchar *slotA = md->name_table;
1333
1334 for (i = 0; i < md->name_count; i++)
1335 {
1336 if (GET2(slotA, 0) == refno) break;
1337 slotA += md->name_entry_size;
1338 }
1339
1340 /* Found a name for the number - there can be only one; duplicate names
1341 for different numbers are allowed, but not vice versa. First scan down
1342 for duplicates. */
1343
1344 if (i < md->name_count)
1345 {
1346 pcre_uchar *slotB = slotA;
1347 while (slotB > md->name_table)
1348 {
1349 slotB -= md->name_entry_size;
1350 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1351 {
1352 offset = GET2(slotB, 0) << 1;
1353 condition = offset < offset_top &&
1354 md->offset_vector[offset] >= 0;
1355 if (condition) break;
1356 }
1357 else break;
1358 }
1359
1360 /* Scan up for duplicates */
1361
1362 if (!condition)
1363 {
1364 slotB = slotA;
1365 for (i++; i < md->name_count; i++)
1366 {
1367 slotB += md->name_entry_size;
1368 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1369 {
1370 offset = GET2(slotB, 0) << 1;
1371 condition = offset < offset_top &&
1372 md->offset_vector[offset] >= 0;
1373 if (condition) break;
1374 }
1375 else break;
1376 }
1377 }
1378 }
1379 }
1380
1381 /* Choose branch according to the condition */
1382
1383 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1384 }
1385
1386 else if (condcode == OP_DEF) /* DEFINE - always false */
1387 {
1388 condition = FALSE;
1389 ecode += GET(ecode, 1);
1390 }
1391
1392 /* The condition is an assertion. Call match() to evaluate it - setting
1393 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1394 an assertion. */
1395
1396 else
1397 {
1398 md->match_function_type = MATCH_CONDASSERT;
1399 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1400 if (rrc == MATCH_MATCH)
1401 {
1402 if (md->end_offset_top > offset_top)
1403 offset_top = md->end_offset_top; /* Captures may have happened */
1404 condition = TRUE;
1405 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1406 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1407 }
1408
1409 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1410 assertion; it is therefore treated as NOMATCH. */
1411
1412 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1413 {
1414 RRETURN(rrc); /* Need braces because of following else */
1415 }
1416 else
1417 {
1418 condition = FALSE;
1419 ecode += codelink;
1420 }
1421 }
1422
1423 /* We are now at the branch that is to be obeyed. As there is only one, can
1424 use tail recursion to avoid using another stack frame, except when there is
1425 unlimited repeat of a possibly empty group. In the latter case, a recursive
1426 call to match() is always required, unless the second alternative doesn't
1427 exist, in which case we can just plough on. Note that, for compatibility
1428 with Perl, the | in a conditional group is NOT treated as creating two
1429 alternatives. If a THEN is encountered in the branch, it propagates out to
1430 the enclosing alternative (unless nested in a deeper set of alternatives,
1431 of course). */
1432
1433 if (condition || *ecode == OP_ALT)
1434 {
1435 if (op != OP_SCOND)
1436 {
1437 ecode += 1 + LINK_SIZE;
1438 goto TAIL_RECURSE;
1439 }
1440
1441 md->match_function_type = MATCH_CBEGROUP;
1442 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1443 RRETURN(rrc);
1444 }
1445
1446 /* Condition false & no alternative; continue after the group. */
1447
1448 else
1449 {
1450 ecode += 1 + LINK_SIZE;
1451 }
1452 break;
1453
1454
1455 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1456 to close any currently open capturing brackets. */
1457
1458 case OP_CLOSE:
1459 number = GET2(ecode, 1);
1460 offset = number << 1;
1461
1462 #ifdef PCRE_DEBUG
1463 printf("end bracket %d at *ACCEPT", number);
1464 printf("\n");
1465 #endif
1466
1467 md->capture_last = number;
1468 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1469 {
1470 md->offset_vector[offset] =
1471 md->offset_vector[md->offset_end - number];
1472 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1473 if (offset_top <= offset) offset_top = offset + 2;
1474 }
1475 ecode += 1 + IMM2_SIZE;
1476 break;
1477
1478
1479 /* End of the pattern, either real or forced. */
1480
1481 case OP_END:
1482 case OP_ACCEPT:
1483 case OP_ASSERT_ACCEPT:
1484
1485 /* If we have matched an empty string, fail if not in an assertion and not
1486 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1487 is set and we have matched at the start of the subject. In both cases,
1488 backtracking will then try other alternatives, if any. */
1489
1490 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1491 md->recursive == NULL &&
1492 (md->notempty ||
1493 (md->notempty_atstart &&
1494 mstart == md->start_subject + md->start_offset)))
1495 RRETURN(MATCH_NOMATCH);
1496
1497 /* Otherwise, we have a match. */
1498
1499 md->end_match_ptr = eptr; /* Record where we ended */
1500 md->end_offset_top = offset_top; /* and how many extracts were taken */
1501 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1502
1503 /* For some reason, the macros don't work properly if an expression is
1504 given as the argument to RRETURN when the heap is in use. */
1505
1506 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1507 RRETURN(rrc);
1508
1509 /* Assertion brackets. Check the alternative branches in turn - the
1510 matching won't pass the KET for an assertion. If any one branch matches,
1511 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1512 start of each branch to move the current point backwards, so the code at
1513 this level is identical to the lookahead case. When the assertion is part
1514 of a condition, we want to return immediately afterwards. The caller of
1515 this incarnation of the match() function will have set MATCH_CONDASSERT in
1516 md->match_function_type, and one of these opcodes will be the first opcode
1517 that is processed. We use a local variable that is preserved over calls to
1518 match() to remember this case. */
1519
1520 case OP_ASSERT:
1521 case OP_ASSERTBACK:
1522 if (md->match_function_type == MATCH_CONDASSERT)
1523 {
1524 condassert = TRUE;
1525 md->match_function_type = 0;
1526 }
1527 else condassert = FALSE;
1528
1529 do
1530 {
1531 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1532 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1533 {
1534 mstart = md->start_match_ptr; /* In case \K reset it */
1535 break;
1536 }
1537
1538 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1539 as NOMATCH. */
1540
1541 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1542 ecode += GET(ecode, 1);
1543 }
1544 while (*ecode == OP_ALT);
1545
1546 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1547
1548 /* If checking an assertion for a condition, return MATCH_MATCH. */
1549
1550 if (condassert) RRETURN(MATCH_MATCH);
1551
1552 /* Continue from after the assertion, updating the offsets high water
1553 mark, since extracts may have been taken during the assertion. */
1554
1555 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1556 ecode += 1 + LINK_SIZE;
1557 offset_top = md->end_offset_top;
1558 continue;
1559
1560 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1561 PRUNE, or COMMIT means we must assume failure without checking subsequent
1562 branches. */
1563
1564 case OP_ASSERT_NOT:
1565 case OP_ASSERTBACK_NOT:
1566 if (md->match_function_type == MATCH_CONDASSERT)
1567 {
1568 condassert = TRUE;
1569 md->match_function_type = 0;
1570 }
1571 else condassert = FALSE;
1572
1573 do
1574 {
1575 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1576 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1577 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1578 {
1579 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1580 break;
1581 }
1582
1583 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1584 as NOMATCH. */
1585
1586 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1587 ecode += GET(ecode,1);
1588 }
1589 while (*ecode == OP_ALT);
1590
1591 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1592
1593 ecode += 1 + LINK_SIZE;
1594 continue;
1595
1596 /* Move the subject pointer back. This occurs only at the start of
1597 each branch of a lookbehind assertion. If we are too close to the start to
1598 move back, this match function fails. When working with UTF-8 we move
1599 back a number of characters, not bytes. */
1600
1601 case OP_REVERSE:
1602 #ifdef SUPPORT_UTF
1603 if (utf)
1604 {
1605 i = GET(ecode, 1);
1606 while (i-- > 0)
1607 {
1608 eptr--;
1609 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1610 BACKCHAR(eptr);
1611 }
1612 }
1613 else
1614 #endif
1615
1616 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1617
1618 {
1619 eptr -= GET(ecode, 1);
1620 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1621 }
1622
1623 /* Save the earliest consulted character, then skip to next op code */
1624
1625 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1626 ecode += 1 + LINK_SIZE;
1627 break;
1628
1629 /* The callout item calls an external function, if one is provided, passing
1630 details of the match so far. This is mainly for debugging, though the
1631 function is able to force a failure. */
1632
1633 case OP_CALLOUT:
1634 if (PUBL(callout) != NULL)
1635 {
1636 pcre_callout_block cb;
1637 cb.version = 2; /* Version 2 of the callout block */
1638 cb.callout_number = ecode[1];
1639 cb.offset_vector = md->offset_vector;
1640 cb.subject = (PCRE_SPTR)md->start_subject;
1641 cb.subject_length = (int)(md->end_subject - md->start_subject);
1642 cb.start_match = (int)(mstart - md->start_subject);
1643 cb.current_position = (int)(eptr - md->start_subject);
1644 cb.pattern_position = GET(ecode, 2);
1645 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1646 cb.capture_top = offset_top/2;
1647 cb.capture_last = md->capture_last;
1648 cb.callout_data = md->callout_data;
1649 cb.mark = md->nomatch_mark;
1650 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1651 if (rrc < 0) RRETURN(rrc);
1652 }
1653 ecode += 2 + 2*LINK_SIZE;
1654 break;
1655
1656 /* Recursion either matches the current regex, or some subexpression. The
1657 offset data is the offset to the starting bracket from the start of the
1658 whole pattern. (This is so that it works from duplicated subpatterns.)
1659
1660 The state of the capturing groups is preserved over recursion, and
1661 re-instated afterwards. We don't know how many are started and not yet
1662 finished (offset_top records the completed total) so we just have to save
1663 all the potential data. There may be up to 65535 such values, which is too
1664 large to put on the stack, but using malloc for small numbers seems
1665 expensive. As a compromise, the stack is used when there are no more than
1666 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1667
1668 There are also other values that have to be saved. We use a chained
1669 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1670 for the original version of this logic. It has, however, been hacked around
1671 a lot, so he is not to blame for the current way it works. */
1672
1673 case OP_RECURSE:
1674 {
1675 recursion_info *ri;
1676 int recno;
1677
1678 callpat = md->start_code + GET(ecode, 1);
1679 recno = (callpat == md->start_code)? 0 :
1680 GET2(callpat, 1 + LINK_SIZE);
1681
1682 /* Check for repeating a recursion without advancing the subject pointer.
1683 This should catch convoluted mutual recursions. (Some simple cases are
1684 caught at compile time.) */
1685
1686 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1687 if (recno == ri->group_num && eptr == ri->subject_position)
1688 RRETURN(PCRE_ERROR_RECURSELOOP);
1689
1690 /* Add to "recursing stack" */
1691
1692 new_recursive.group_num = recno;
1693 new_recursive.subject_position = eptr;
1694 new_recursive.prevrec = md->recursive;
1695 md->recursive = &new_recursive;
1696
1697 /* Where to continue from afterwards */
1698
1699 ecode += 1 + LINK_SIZE;
1700
1701 /* Now save the offset data */
1702
1703 new_recursive.saved_max = md->offset_end;
1704 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1705 new_recursive.offset_save = stacksave;
1706 else
1707 {
1708 new_recursive.offset_save =
1709 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1710 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1711 }
1712 memcpy(new_recursive.offset_save, md->offset_vector,
1713 new_recursive.saved_max * sizeof(int));
1714
1715 /* OK, now we can do the recursion. After processing each alternative,
1716 restore the offset data. If there were nested recursions, md->recursive
1717 might be changed, so reset it before looping. */
1718
1719 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1720 cbegroup = (*callpat >= OP_SBRA);
1721 do
1722 {
1723 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1724 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1725 md, eptrb, RM6);
1726 memcpy(md->offset_vector, new_recursive.offset_save,
1727 new_recursive.saved_max * sizeof(int));
1728 md->recursive = new_recursive.prevrec;
1729 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1730 {
1731 DPRINTF(("Recursion matched\n"));
1732 if (new_recursive.offset_save != stacksave)
1733 (PUBL(free))(new_recursive.offset_save);
1734
1735 /* Set where we got to in the subject, and reset the start in case
1736 it was changed by \K. This *is* propagated back out of a recursion,
1737 for Perl compatibility. */
1738
1739 eptr = md->end_match_ptr;
1740 mstart = md->start_match_ptr;
1741 goto RECURSION_MATCHED; /* Exit loop; end processing */
1742 }
1743
1744 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1745 as NOMATCH. */
1746
1747 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1748 {
1749 DPRINTF(("Recursion gave error %d\n", rrc));
1750 if (new_recursive.offset_save != stacksave)
1751 (PUBL(free))(new_recursive.offset_save);
1752 RRETURN(rrc);
1753 }
1754
1755 md->recursive = &new_recursive;
1756 callpat += GET(callpat, 1);
1757 }
1758 while (*callpat == OP_ALT);
1759
1760 DPRINTF(("Recursion didn't match\n"));
1761 md->recursive = new_recursive.prevrec;
1762 if (new_recursive.offset_save != stacksave)
1763 (PUBL(free))(new_recursive.offset_save);
1764 RRETURN(MATCH_NOMATCH);
1765 }
1766
1767 RECURSION_MATCHED:
1768 break;
1769
1770 /* An alternation is the end of a branch; scan along to find the end of the
1771 bracketed group and go to there. */
1772
1773 case OP_ALT:
1774 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1775 break;
1776
1777 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1778 indicating that it may occur zero times. It may repeat infinitely, or not
1779 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1780 with fixed upper repeat limits are compiled as a number of copies, with the
1781 optional ones preceded by BRAZERO or BRAMINZERO. */
1782
1783 case OP_BRAZERO:
1784 next = ecode + 1;
1785 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1787 do next += GET(next, 1); while (*next == OP_ALT);
1788 ecode = next + 1 + LINK_SIZE;
1789 break;
1790
1791 case OP_BRAMINZERO:
1792 next = ecode + 1;
1793 do next += GET(next, 1); while (*next == OP_ALT);
1794 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1796 ecode++;
1797 break;
1798
1799 case OP_SKIPZERO:
1800 next = ecode+1;
1801 do next += GET(next,1); while (*next == OP_ALT);
1802 ecode = next + 1 + LINK_SIZE;
1803 break;
1804
1805 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1806 here; just jump to the group, with allow_zero set TRUE. */
1807
1808 case OP_BRAPOSZERO:
1809 op = *(++ecode);
1810 allow_zero = TRUE;
1811 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1812 goto POSSESSIVE_NON_CAPTURE;
1813
1814 /* End of a group, repeated or non-repeating. */
1815
1816 case OP_KET:
1817 case OP_KETRMIN:
1818 case OP_KETRMAX:
1819 case OP_KETRPOS:
1820 prev = ecode - GET(ecode, 1);
1821
1822 /* If this was a group that remembered the subject start, in order to break
1823 infinite repeats of empty string matches, retrieve the subject start from
1824 the chain. Otherwise, set it NULL. */
1825
1826 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1827 {
1828 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1829 eptrb = eptrb->epb_prev; /* Backup to previous group */
1830 }
1831 else saved_eptr = NULL;
1832
1833 /* If we are at the end of an assertion group or a non-capturing atomic
1834 group, stop matching and return MATCH_MATCH, but record the current high
1835 water mark for use by positive assertions. We also need to record the match
1836 start in case it was changed by \K. */
1837
1838 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1839 *prev == OP_ONCE_NC)
1840 {
1841 md->end_match_ptr = eptr; /* For ONCE_NC */
1842 md->end_offset_top = offset_top;
1843 md->start_match_ptr = mstart;
1844 RRETURN(MATCH_MATCH); /* Sets md->mark */
1845 }
1846
1847 /* For capturing groups we have to check the group number back at the start
1848 and if necessary complete handling an extraction by setting the offsets and
1849 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1850 into group 0, so it won't be picked up here. Instead, we catch it when the
1851 OP_END is reached. Other recursion is handled here. We just have to record
1852 the current subject position and start match pointer and give a MATCH
1853 return. */
1854
1855 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1856 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1857 {
1858 number = GET2(prev, 1+LINK_SIZE);
1859 offset = number << 1;
1860
1861 #ifdef PCRE_DEBUG
1862 printf("end bracket %d", number);
1863 printf("\n");
1864 #endif
1865
1866 /* Handle a recursively called group. */
1867
1868 if (md->recursive != NULL && md->recursive->group_num == number)
1869 {
1870 md->end_match_ptr = eptr;
1871 md->start_match_ptr = mstart;
1872 RRETURN(MATCH_MATCH);
1873 }
1874
1875 /* Deal with capturing */
1876
1877 md->capture_last = number;
1878 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1879 {
1880 /* If offset is greater than offset_top, it means that we are
1881 "skipping" a capturing group, and that group's offsets must be marked
1882 unset. In earlier versions of PCRE, all the offsets were unset at the
1883 start of matching, but this doesn't work because atomic groups and
1884 assertions can cause a value to be set that should later be unset.
1885 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1886 part of the atomic group, but this is not on the final matching path,
1887 so must be unset when 2 is set. (If there is no group 2, there is no
1888 problem, because offset_top will then be 2, indicating no capture.) */
1889
1890 if (offset > offset_top)
1891 {
1892 register int *iptr = md->offset_vector + offset_top;
1893 register int *iend = md->offset_vector + offset;
1894 while (iptr < iend) *iptr++ = -1;
1895 }
1896
1897 /* Now make the extraction */
1898
1899 md->offset_vector[offset] =
1900 md->offset_vector[md->offset_end - number];
1901 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1902 if (offset_top <= offset) offset_top = offset + 2;
1903 }
1904 }
1905
1906 /* For an ordinary non-repeating ket, just continue at this level. This
1907 also happens for a repeating ket if no characters were matched in the
1908 group. This is the forcible breaking of infinite loops as implemented in
1909 Perl 5.005. For a non-repeating atomic group that includes captures,
1910 establish a backup point by processing the rest of the pattern at a lower
1911 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1912 original OP_ONCE level, thereby bypassing intermediate backup points, but
1913 resetting any captures that happened along the way. */
1914
1915 if (*ecode == OP_KET || eptr == saved_eptr)
1916 {
1917 if (*prev == OP_ONCE)
1918 {
1919 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1921 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1922 RRETURN(MATCH_ONCE);
1923 }
1924 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1925 break;
1926 }
1927
1928 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1929 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1930 at a time from the outer level, thus saving stack. */
1931
1932 if (*ecode == OP_KETRPOS)
1933 {
1934 md->end_match_ptr = eptr;
1935 md->end_offset_top = offset_top;
1936 RRETURN(MATCH_KETRPOS);
1937 }
1938
1939 /* The normal repeating kets try the rest of the pattern or restart from
1940 the preceding bracket, in the appropriate order. In the second case, we can
1941 use tail recursion to avoid using another stack frame, unless we have an
1942 atomic group or an unlimited repeat of a group that can match an empty
1943 string. */
1944
1945 if (*ecode == OP_KETRMIN)
1946 {
1947 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1949 if (*prev == OP_ONCE)
1950 {
1951 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1953 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1954 RRETURN(MATCH_ONCE);
1955 }
1956 if (*prev >= OP_SBRA) /* Could match an empty string */
1957 {
1958 md->match_function_type = MATCH_CBEGROUP;
1959 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1960 RRETURN(rrc);
1961 }
1962 ecode = prev;
1963 goto TAIL_RECURSE;
1964 }
1965 else /* OP_KETRMAX */
1966 {
1967 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1968 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1969 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971 if (*prev == OP_ONCE)
1972 {
1973 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1975 md->once_target = prev;
1976 RRETURN(MATCH_ONCE);
1977 }
1978 ecode += 1 + LINK_SIZE;
1979 goto TAIL_RECURSE;
1980 }
1981 /* Control never gets here */
1982
1983 /* Not multiline mode: start of subject assertion, unless notbol. */
1984
1985 case OP_CIRC:
1986 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1987
1988 /* Start of subject assertion */
1989
1990 case OP_SOD:
1991 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1992 ecode++;
1993 break;
1994
1995 /* Multiline mode: start of subject unless notbol, or after any newline. */
1996
1997 case OP_CIRCM:
1998 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1999 if (eptr != md->start_subject &&
2000 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2001 RRETURN(MATCH_NOMATCH);
2002 ecode++;
2003 break;
2004
2005 /* Start of match assertion */
2006
2007 case OP_SOM:
2008 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2009 ecode++;
2010 break;
2011
2012 /* Reset the start of match point */
2013
2014 case OP_SET_SOM:
2015 mstart = eptr;
2016 ecode++;
2017 break;
2018
2019 /* Multiline mode: assert before any newline, or before end of subject
2020 unless noteol is set. */
2021
2022 case OP_DOLLM:
2023 if (eptr < md->end_subject)
2024 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2025 else
2026 {
2027 if (md->noteol) RRETURN(MATCH_NOMATCH);
2028 SCHECK_PARTIAL();
2029 }
2030 ecode++;
2031 break;
2032
2033 /* Not multiline mode: assert before a terminating newline or before end of
2034 subject unless noteol is set. */
2035
2036 case OP_DOLL:
2037 if (md->noteol) RRETURN(MATCH_NOMATCH);
2038 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2039
2040 /* ... else fall through for endonly */
2041
2042 /* End of subject assertion (\z) */
2043
2044 case OP_EOD:
2045 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2046 SCHECK_PARTIAL();
2047 ecode++;
2048 break;
2049
2050 /* End of subject or ending \n assertion (\Z) */
2051
2052 case OP_EODN:
2053 ASSERT_NL_OR_EOS:
2054 if (eptr < md->end_subject &&
2055 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2056 RRETURN(MATCH_NOMATCH);
2057
2058 /* Either at end of string or \n before end. */
2059
2060 SCHECK_PARTIAL();
2061 ecode++;
2062 break;
2063
2064 /* Word boundary assertions */
2065
2066 case OP_NOT_WORD_BOUNDARY:
2067 case OP_WORD_BOUNDARY:
2068 {
2069
2070 /* Find out if the previous and current characters are "word" characters.
2071 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2072 be "non-word" characters. Remember the earliest consulted character for
2073 partial matching. */
2074
2075 #ifdef SUPPORT_UTF
2076 if (utf)
2077 {
2078 /* Get status of previous character */
2079
2080 if (eptr == md->start_subject) prev_is_word = FALSE; else
2081 {
2082 PCRE_PUCHAR lastptr = eptr - 1;
2083 BACKCHAR(lastptr);
2084 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2085 GETCHAR(c, lastptr);
2086 #ifdef SUPPORT_UCP
2087 if (md->use_ucp)
2088 {
2089 if (c == '_') prev_is_word = TRUE; else
2090 {
2091 int cat = UCD_CATEGORY(c);
2092 prev_is_word = (cat == ucp_L || cat == ucp_N);
2093 }
2094 }
2095 else
2096 #endif
2097 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2098 }
2099
2100 /* Get status of next character */
2101
2102 if (eptr >= md->end_subject)
2103 {
2104 SCHECK_PARTIAL();
2105 cur_is_word = FALSE;
2106 }
2107 else
2108 {
2109 GETCHAR(c, eptr);
2110 #ifdef SUPPORT_UCP
2111 if (md->use_ucp)
2112 {
2113 if (c == '_') cur_is_word = TRUE; else
2114 {
2115 int cat = UCD_CATEGORY(c);
2116 cur_is_word = (cat == ucp_L || cat == ucp_N);
2117 }
2118 }
2119 else
2120 #endif
2121 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2122 }
2123 }
2124 else
2125 #endif
2126
2127 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2128 consistency with the behaviour of \w we do use it in this case. */
2129
2130 {
2131 /* Get status of previous character */
2132
2133 if (eptr == md->start_subject) prev_is_word = FALSE; else
2134 {
2135 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2136 #ifdef SUPPORT_UCP
2137 if (md->use_ucp)
2138 {
2139 c = eptr[-1];
2140 if (c == '_') prev_is_word = TRUE; else
2141 {
2142 int cat = UCD_CATEGORY(c);
2143 prev_is_word = (cat == ucp_L || cat == ucp_N);
2144 }
2145 }
2146 else
2147 #endif
2148 prev_is_word = MAX_255(eptr[-1])
2149 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2150 }
2151
2152 /* Get status of next character */
2153
2154 if (eptr >= md->end_subject)
2155 {
2156 SCHECK_PARTIAL();
2157 cur_is_word = FALSE;
2158 }
2159 else
2160 #ifdef SUPPORT_UCP
2161 if (md->use_ucp)
2162 {
2163 c = *eptr;
2164 if (c == '_') cur_is_word = TRUE; else
2165 {
2166 int cat = UCD_CATEGORY(c);
2167 cur_is_word = (cat == ucp_L || cat == ucp_N);
2168 }
2169 }
2170 else
2171 #endif
2172 cur_is_word = MAX_255(*eptr)
2173 && ((md->ctypes[*eptr] & ctype_word) != 0);
2174 }
2175
2176 /* Now see if the situation is what we want */
2177
2178 if ((*ecode++ == OP_WORD_BOUNDARY)?
2179 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2180 RRETURN(MATCH_NOMATCH);
2181 }
2182 break;
2183
2184 /* Match a single character type; inline for speed */
2185
2186 case OP_ANY:
2187 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2188 /* Fall through */
2189
2190 case OP_ALLANY:
2191 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2192 { /* not be updated before SCHECK_PARTIAL. */
2193 SCHECK_PARTIAL();
2194 RRETURN(MATCH_NOMATCH);
2195 }
2196 eptr++;
2197 #ifdef SUPPORT_UTF
2198 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2199 #endif
2200 ecode++;
2201 break;
2202
2203 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2204 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2205
2206 case OP_ANYBYTE:
2207 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2208 { /* not be updated before SCHECK_PARTIAL. */
2209 SCHECK_PARTIAL();
2210 RRETURN(MATCH_NOMATCH);
2211 }
2212 eptr++;
2213 ecode++;
2214 break;
2215
2216 case OP_NOT_DIGIT:
2217 if (eptr >= md->end_subject)
2218 {
2219 SCHECK_PARTIAL();
2220 RRETURN(MATCH_NOMATCH);
2221 }
2222 GETCHARINCTEST(c, eptr);
2223 if (
2224 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2225 c < 256 &&
2226 #endif
2227 (md->ctypes[c] & ctype_digit) != 0
2228 )
2229 RRETURN(MATCH_NOMATCH);
2230 ecode++;
2231 break;
2232
2233 case OP_DIGIT:
2234 if (eptr >= md->end_subject)
2235 {
2236 SCHECK_PARTIAL();
2237 RRETURN(MATCH_NOMATCH);
2238 }
2239 GETCHARINCTEST(c, eptr);
2240 if (
2241 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2242 c > 255 ||
2243 #endif
2244 (md->ctypes[c] & ctype_digit) == 0
2245 )
2246 RRETURN(MATCH_NOMATCH);
2247 ecode++;
2248 break;
2249
2250 case OP_NOT_WHITESPACE:
2251 if (eptr >= md->end_subject)
2252 {
2253 SCHECK_PARTIAL();
2254 RRETURN(MATCH_NOMATCH);
2255 }
2256 GETCHARINCTEST(c, eptr);
2257 if (
2258 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2259 c < 256 &&
2260 #endif
2261 (md->ctypes[c] & ctype_space) != 0
2262 )
2263 RRETURN(MATCH_NOMATCH);
2264 ecode++;
2265 break;
2266
2267 case OP_WHITESPACE:
2268 if (eptr >= md->end_subject)
2269 {
2270 SCHECK_PARTIAL();
2271 RRETURN(MATCH_NOMATCH);
2272 }
2273 GETCHARINCTEST(c, eptr);
2274 if (
2275 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2276 c > 255 ||
2277 #endif
2278 (md->ctypes[c] & ctype_space) == 0
2279 )
2280 RRETURN(MATCH_NOMATCH);
2281 ecode++;
2282 break;
2283
2284 case OP_NOT_WORDCHAR:
2285 if (eptr >= md->end_subject)
2286 {
2287 SCHECK_PARTIAL();
2288 RRETURN(MATCH_NOMATCH);
2289 }
2290 GETCHARINCTEST(c, eptr);
2291 if (
2292 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2293 c < 256 &&
2294 #endif
2295 (md->ctypes[c] & ctype_word) != 0
2296 )
2297 RRETURN(MATCH_NOMATCH);
2298 ecode++;
2299 break;
2300
2301 case OP_WORDCHAR:
2302 if (eptr >= md->end_subject)
2303 {
2304 SCHECK_PARTIAL();
2305 RRETURN(MATCH_NOMATCH);
2306 }
2307 GETCHARINCTEST(c, eptr);
2308 if (
2309 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2310 c > 255 ||
2311 #endif
2312 (md->ctypes[c] & ctype_word) == 0
2313 )
2314 RRETURN(MATCH_NOMATCH);
2315 ecode++;
2316 break;
2317
2318 case OP_ANYNL:
2319 if (eptr >= md->end_subject)
2320 {
2321 SCHECK_PARTIAL();
2322 RRETURN(MATCH_NOMATCH);
2323 }
2324 GETCHARINCTEST(c, eptr);
2325 switch(c)
2326 {
2327 default: RRETURN(MATCH_NOMATCH);
2328
2329 case 0x000d:
2330 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2331 break;
2332
2333 case 0x000a:
2334 break;
2335
2336 case 0x000b:
2337 case 0x000c:
2338 case 0x0085:
2339 case 0x2028:
2340 case 0x2029:
2341 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2342 break;
2343 }
2344 ecode++;
2345 break;
2346
2347 case OP_NOT_HSPACE:
2348 if (eptr >= md->end_subject)
2349 {
2350 SCHECK_PARTIAL();
2351 RRETURN(MATCH_NOMATCH);
2352 }
2353 GETCHARINCTEST(c, eptr);
2354 switch(c)
2355 {
2356 default: break;
2357 case 0x09: /* HT */
2358 case 0x20: /* SPACE */
2359 case 0xa0: /* NBSP */
2360 case 0x1680: /* OGHAM SPACE MARK */
2361 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2362 case 0x2000: /* EN QUAD */
2363 case 0x2001: /* EM QUAD */
2364 case 0x2002: /* EN SPACE */
2365 case 0x2003: /* EM SPACE */
2366 case 0x2004: /* THREE-PER-EM SPACE */
2367 case 0x2005: /* FOUR-PER-EM SPACE */
2368 case 0x2006: /* SIX-PER-EM SPACE */
2369 case 0x2007: /* FIGURE SPACE */
2370 case 0x2008: /* PUNCTUATION SPACE */
2371 case 0x2009: /* THIN SPACE */
2372 case 0x200A: /* HAIR SPACE */
2373 case 0x202f: /* NARROW NO-BREAK SPACE */
2374 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2375 case 0x3000: /* IDEOGRAPHIC SPACE */
2376 RRETURN(MATCH_NOMATCH);
2377 }
2378 ecode++;
2379 break;
2380
2381 case OP_HSPACE:
2382 if (eptr >= md->end_subject)
2383 {
2384 SCHECK_PARTIAL();
2385 RRETURN(MATCH_NOMATCH);
2386 }
2387 GETCHARINCTEST(c, eptr);
2388 switch(c)
2389 {
2390 default: RRETURN(MATCH_NOMATCH);
2391 case 0x09: /* HT */
2392 case 0x20: /* SPACE */
2393 case 0xa0: /* NBSP */
2394 case 0x1680: /* OGHAM SPACE MARK */
2395 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2396 case 0x2000: /* EN QUAD */
2397 case 0x2001: /* EM QUAD */
2398 case 0x2002: /* EN SPACE */
2399 case 0x2003: /* EM SPACE */
2400 case 0x2004: /* THREE-PER-EM SPACE */
2401 case 0x2005: /* FOUR-PER-EM SPACE */
2402 case 0x2006: /* SIX-PER-EM SPACE */
2403 case 0x2007: /* FIGURE SPACE */
2404 case 0x2008: /* PUNCTUATION SPACE */
2405 case 0x2009: /* THIN SPACE */
2406 case 0x200A: /* HAIR SPACE */
2407 case 0x202f: /* NARROW NO-BREAK SPACE */
2408 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2409 case 0x3000: /* IDEOGRAPHIC SPACE */
2410 break;
2411 }
2412 ecode++;
2413 break;
2414
2415 case OP_NOT_VSPACE:
2416 if (eptr >= md->end_subject)
2417 {
2418 SCHECK_PARTIAL();
2419 RRETURN(MATCH_NOMATCH);
2420 }
2421 GETCHARINCTEST(c, eptr);
2422 switch(c)
2423 {
2424 default: break;
2425 case 0x0a: /* LF */
2426 case 0x0b: /* VT */
2427 case 0x0c: /* FF */
2428 case 0x0d: /* CR */
2429 case 0x85: /* NEL */
2430 case 0x2028: /* LINE SEPARATOR */
2431 case 0x2029: /* PARAGRAPH SEPARATOR */
2432 RRETURN(MATCH_NOMATCH);
2433 }
2434 ecode++;
2435 break;
2436
2437 case OP_VSPACE:
2438 if (eptr >= md->end_subject)
2439 {
2440 SCHECK_PARTIAL();
2441 RRETURN(MATCH_NOMATCH);
2442 }
2443 GETCHARINCTEST(c, eptr);
2444 switch(c)
2445 {
2446 default: RRETURN(MATCH_NOMATCH);
2447 case 0x0a: /* LF */
2448 case 0x0b: /* VT */
2449 case 0x0c: /* FF */
2450 case 0x0d: /* CR */
2451 case 0x85: /* NEL */
2452 case 0x2028: /* LINE SEPARATOR */
2453 case 0x2029: /* PARAGRAPH SEPARATOR */
2454 break;
2455 }
2456 ecode++;
2457 break;
2458
2459 #ifdef SUPPORT_UCP
2460 /* Check the next character by Unicode property. We will get here only
2461 if the support is in the binary; otherwise a compile-time error occurs. */
2462
2463 case OP_PROP:
2464 case OP_NOTPROP:
2465 if (eptr >= md->end_subject)
2466 {
2467 SCHECK_PARTIAL();
2468 RRETURN(MATCH_NOMATCH);
2469 }
2470 GETCHARINCTEST(c, eptr);
2471 {
2472 const ucd_record *prop = GET_UCD(c);
2473
2474 switch(ecode[1])
2475 {
2476 case PT_ANY:
2477 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2478 break;
2479
2480 case PT_LAMP:
2481 if ((prop->chartype == ucp_Lu ||
2482 prop->chartype == ucp_Ll ||
2483 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2484 RRETURN(MATCH_NOMATCH);
2485 break;
2486
2487 case PT_GC:
2488 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2489 RRETURN(MATCH_NOMATCH);
2490 break;
2491
2492 case PT_PC:
2493 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2494 RRETURN(MATCH_NOMATCH);
2495 break;
2496
2497 case PT_SC:
2498 if ((ecode[2] != prop->script) == (op == OP_PROP))
2499 RRETURN(MATCH_NOMATCH);
2500 break;
2501
2502 /* These are specials */
2503
2504 case PT_ALNUM:
2505 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2506 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2507 RRETURN(MATCH_NOMATCH);
2508 break;
2509
2510 case PT_SPACE: /* Perl space */
2511 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2512 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2513 == (op == OP_NOTPROP))
2514 RRETURN(MATCH_NOMATCH);
2515 break;
2516
2517 case PT_PXSPACE: /* POSIX space */
2518 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2519 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2520 c == CHAR_FF || c == CHAR_CR)
2521 == (op == OP_NOTPROP))
2522 RRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 case PT_WORD:
2526 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2527 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2528 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2529 RRETURN(MATCH_NOMATCH);
2530 break;
2531
2532 /* This should never occur */
2533
2534 default:
2535 RRETURN(PCRE_ERROR_INTERNAL);
2536 }
2537
2538 ecode += 3;
2539 }
2540 break;
2541
2542 /* Match an extended Unicode sequence. We will get here only if the support
2543 is in the binary; otherwise a compile-time error occurs. */
2544
2545 case OP_EXTUNI:
2546 if (eptr >= md->end_subject)
2547 {
2548 SCHECK_PARTIAL();
2549 RRETURN(MATCH_NOMATCH);
2550 }
2551 GETCHARINCTEST(c, eptr);
2552 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2553 while (eptr < md->end_subject)
2554 {
2555 int len = 1;
2556 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2557 if (UCD_CATEGORY(c) != ucp_M) break;
2558 eptr += len;
2559 }
2560 ecode++;
2561 break;
2562 #endif
2563
2564
2565 /* Match a back reference, possibly repeatedly. Look past the end of the
2566 item to see if there is repeat information following. The code is similar
2567 to that for character classes, but repeated for efficiency. Then obey
2568 similar code to character type repeats - written out again for speed.
2569 However, if the referenced string is the empty string, always treat
2570 it as matched, any number of times (otherwise there could be infinite
2571 loops). */
2572
2573 case OP_REF:
2574 case OP_REFI:
2575 caseless = op == OP_REFI;
2576 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2577 ecode += 1 + IMM2_SIZE;
2578
2579 /* If the reference is unset, there are two possibilities:
2580
2581 (a) In the default, Perl-compatible state, set the length negative;
2582 this ensures that every attempt at a match fails. We can't just fail
2583 here, because of the possibility of quantifiers with zero minima.
2584
2585 (b) If the JavaScript compatibility flag is set, set the length to zero
2586 so that the back reference matches an empty string.
2587
2588 Otherwise, set the length to the length of what was matched by the
2589 referenced subpattern. */
2590
2591 if (offset >= offset_top || md->offset_vector[offset] < 0)
2592 length = (md->jscript_compat)? 0 : -1;
2593 else
2594 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2595
2596 /* Set up for repetition, or handle the non-repeated case */
2597
2598 switch (*ecode)
2599 {
2600 case OP_CRSTAR:
2601 case OP_CRMINSTAR:
2602 case OP_CRPLUS:
2603 case OP_CRMINPLUS:
2604 case OP_CRQUERY:
2605 case OP_CRMINQUERY:
2606 c = *ecode++ - OP_CRSTAR;
2607 minimize = (c & 1) != 0;
2608 min = rep_min[c]; /* Pick up values from tables; */
2609 max = rep_max[c]; /* zero for max => infinity */
2610 if (max == 0) max = INT_MAX;
2611 break;
2612
2613 case OP_CRRANGE:
2614 case OP_CRMINRANGE:
2615 minimize = (*ecode == OP_CRMINRANGE);
2616 min = GET2(ecode, 1);
2617 max = GET2(ecode, 1 + IMM2_SIZE);
2618 if (max == 0) max = INT_MAX;
2619 ecode += 1 + 2 * IMM2_SIZE;
2620 break;
2621
2622 default: /* No repeat follows */
2623 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2624 {
2625 CHECK_PARTIAL();
2626 RRETURN(MATCH_NOMATCH);
2627 }
2628 eptr += length;
2629 continue; /* With the main loop */
2630 }
2631
2632 /* Handle repeated back references. If the length of the reference is
2633 zero, just continue with the main loop. */
2634
2635 if (length == 0) continue;
2636
2637 /* First, ensure the minimum number of matches are present. We get back
2638 the length of the reference string explicitly rather than passing the
2639 address of eptr, so that eptr can be a register variable. */
2640
2641 for (i = 1; i <= min; i++)
2642 {
2643 int slength;
2644 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2645 {
2646 CHECK_PARTIAL();
2647 RRETURN(MATCH_NOMATCH);
2648 }
2649 eptr += slength;
2650 }
2651
2652 /* If min = max, continue at the same level without recursion.
2653 They are not both allowed to be zero. */
2654
2655 if (min == max) continue;
2656
2657 /* If minimizing, keep trying and advancing the pointer */
2658
2659 if (minimize)
2660 {
2661 for (fi = min;; fi++)
2662 {
2663 int slength;
2664 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2666 if (fi >= max) RRETURN(MATCH_NOMATCH);
2667 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2668 {
2669 CHECK_PARTIAL();
2670 RRETURN(MATCH_NOMATCH);
2671 }
2672 eptr += slength;
2673 }
2674 /* Control never gets here */
2675 }
2676
2677 /* If maximizing, find the longest string and work backwards */
2678
2679 else
2680 {
2681 pp = eptr;
2682 for (i = min; i < max; i++)
2683 {
2684 int slength;
2685 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2686 {
2687 CHECK_PARTIAL();
2688 break;
2689 }
2690 eptr += slength;
2691 }
2692 while (eptr >= pp)
2693 {
2694 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2695 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2696 eptr -= length;
2697 }
2698 RRETURN(MATCH_NOMATCH);
2699 }
2700 /* Control never gets here */
2701
2702 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2703 used when all the characters in the class have values in the range 0-255,
2704 and either the matching is caseful, or the characters are in the range
2705 0-127 when UTF-8 processing is enabled. The only difference between
2706 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2707 encountered.
2708
2709 First, look past the end of the item to see if there is repeat information
2710 following. Then obey similar code to character type repeats - written out
2711 again for speed. */
2712
2713 case OP_NCLASS:
2714 case OP_CLASS:
2715 {
2716 /* The data variable is saved across frames, so the byte map needs to
2717 be stored there. */
2718 #define BYTE_MAP ((pcre_uint8 *)data)
2719 data = ecode + 1; /* Save for matching */
2720 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2721
2722 switch (*ecode)
2723 {
2724 case OP_CRSTAR:
2725 case OP_CRMINSTAR:
2726 case OP_CRPLUS:
2727 case OP_CRMINPLUS:
2728 case OP_CRQUERY:
2729 case OP_CRMINQUERY:
2730 c = *ecode++ - OP_CRSTAR;
2731 minimize = (c & 1) != 0;
2732 min = rep_min[c]; /* Pick up values from tables; */
2733 max = rep_max[c]; /* zero for max => infinity */
2734 if (max == 0) max = INT_MAX;
2735 break;
2736
2737 case OP_CRRANGE:
2738 case OP_CRMINRANGE:
2739 minimize = (*ecode == OP_CRMINRANGE);
2740 min = GET2(ecode, 1);
2741 max = GET2(ecode, 1 + IMM2_SIZE);
2742 if (max == 0) max = INT_MAX;
2743 ecode += 1 + 2 * IMM2_SIZE;
2744 break;
2745
2746 default: /* No repeat follows */
2747 min = max = 1;
2748 break;
2749 }
2750
2751 /* First, ensure the minimum number of matches are present. */
2752
2753 #ifdef SUPPORT_UTF
2754 if (utf)
2755 {
2756 for (i = 1; i <= min; i++)
2757 {
2758 if (eptr >= md->end_subject)
2759 {
2760 SCHECK_PARTIAL();
2761 RRETURN(MATCH_NOMATCH);
2762 }
2763 GETCHARINC(c, eptr);
2764 if (c > 255)
2765 {
2766 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2767 }
2768 else
2769 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2770 }
2771 }
2772 else
2773 #endif
2774 /* Not UTF mode */
2775 {
2776 for (i = 1; i <= min; i++)
2777 {
2778 if (eptr >= md->end_subject)
2779 {
2780 SCHECK_PARTIAL();
2781 RRETURN(MATCH_NOMATCH);
2782 }
2783 c = *eptr++;
2784 #ifndef COMPILE_PCRE8
2785 if (c > 255)
2786 {
2787 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2788 }
2789 else
2790 #endif
2791 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2792 }
2793 }
2794
2795 /* If max == min we can continue with the main loop without the
2796 need to recurse. */
2797
2798 if (min == max) continue;
2799
2800 /* If minimizing, keep testing the rest of the expression and advancing
2801 the pointer while it matches the class. */
2802
2803 if (minimize)
2804 {
2805 #ifdef SUPPORT_UTF
2806 if (utf)
2807 {
2808 for (fi = min;; fi++)
2809 {
2810 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2812 if (fi >= max) RRETURN(MATCH_NOMATCH);
2813 if (eptr >= md->end_subject)
2814 {
2815 SCHECK_PARTIAL();
2816 RRETURN(MATCH_NOMATCH);
2817 }
2818 GETCHARINC(c, eptr);
2819 if (c > 255)
2820 {
2821 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2822 }
2823 else
2824 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2825 }
2826 }
2827 else
2828 #endif
2829 /* Not UTF mode */
2830 {
2831 for (fi = min;; fi++)
2832 {
2833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2835 if (fi >= max) RRETURN(MATCH_NOMATCH);
2836 if (eptr >= md->end_subject)
2837 {
2838 SCHECK_PARTIAL();
2839 RRETURN(MATCH_NOMATCH);
2840 }
2841 c = *eptr++;
2842 #ifndef COMPILE_PCRE8
2843 if (c > 255)
2844 {
2845 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2846 }
2847 else
2848 #endif
2849 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2850 }
2851 }
2852 /* Control never gets here */
2853 }
2854
2855 /* If maximizing, find the longest possible run, then work backwards. */
2856
2857 else
2858 {
2859 pp = eptr;
2860
2861 #ifdef SUPPORT_UTF
2862 if (utf)
2863 {
2864 for (i = min; i < max; i++)
2865 {
2866 int len = 1;
2867 if (eptr >= md->end_subject)
2868 {
2869 SCHECK_PARTIAL();
2870 break;
2871 }
2872 GETCHARLEN(c, eptr, len);
2873 if (c > 255)
2874 {
2875 if (op == OP_CLASS) break;
2876 }
2877 else
2878 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2879 eptr += len;
2880 }
2881 for (;;)
2882 {
2883 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2885 if (eptr-- == pp) break; /* Stop if tried at original pos */
2886 BACKCHAR(eptr);
2887 }
2888 }
2889 else
2890 #endif
2891 /* Not UTF mode */
2892 {
2893 for (i = min; i < max; i++)
2894 {
2895 if (eptr >= md->end_subject)
2896 {
2897 SCHECK_PARTIAL();
2898 break;
2899 }
2900 c = *eptr;
2901 #ifndef COMPILE_PCRE8
2902 if (c > 255)
2903 {
2904 if (op == OP_CLASS) break;
2905 }
2906 else
2907 #endif
2908 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2909 eptr++;
2910 }
2911 while (eptr >= pp)
2912 {
2913 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2915 eptr--;
2916 }
2917 }
2918
2919 RRETURN(MATCH_NOMATCH);
2920 }
2921 #undef BYTE_MAP
2922 }
2923 /* Control never gets here */
2924
2925
2926 /* Match an extended character class. This opcode is encountered only
2927 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2928 mode, because Unicode properties are supported in non-UTF-8 mode. */
2929
2930 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2931 case OP_XCLASS:
2932 {
2933 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2934 ecode += GET(ecode, 1); /* Advance past the item */
2935
2936 switch (*ecode)
2937 {
2938 case OP_CRSTAR:
2939 case OP_CRMINSTAR:
2940 case OP_CRPLUS:
2941 case OP_CRMINPLUS:
2942 case OP_CRQUERY:
2943 case OP_CRMINQUERY:
2944 c = *ecode++ - OP_CRSTAR;
2945 minimize = (c & 1) != 0;
2946 min = rep_min[c]; /* Pick up values from tables; */
2947 max = rep_max[c]; /* zero for max => infinity */
2948 if (max == 0) max = INT_MAX;
2949 break;
2950
2951 case OP_CRRANGE:
2952 case OP_CRMINRANGE:
2953 minimize = (*ecode == OP_CRMINRANGE);
2954 min = GET2(ecode, 1);
2955 max = GET2(ecode, 1 + IMM2_SIZE);
2956 if (max == 0) max = INT_MAX;
2957 ecode += 1 + 2 * IMM2_SIZE;
2958 break;
2959
2960 default: /* No repeat follows */
2961 min = max = 1;
2962 break;
2963 }
2964
2965 /* First, ensure the minimum number of matches are present. */
2966
2967 for (i = 1; i <= min; i++)
2968 {
2969 if (eptr >= md->end_subject)
2970 {
2971 SCHECK_PARTIAL();
2972 RRETURN(MATCH_NOMATCH);
2973 }
2974 GETCHARINCTEST(c, eptr);
2975 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2976 }
2977
2978 /* If max == min we can continue with the main loop without the
2979 need to recurse. */
2980
2981 if (min == max) continue;
2982
2983 /* If minimizing, keep testing the rest of the expression and advancing
2984 the pointer while it matches the class. */
2985
2986 if (minimize)
2987 {
2988 for (fi = min;; fi++)
2989 {
2990 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2992 if (fi >= max) RRETURN(MATCH_NOMATCH);
2993 if (eptr >= md->end_subject)
2994 {
2995 SCHECK_PARTIAL();
2996 RRETURN(MATCH_NOMATCH);
2997 }
2998 GETCHARINCTEST(c, eptr);
2999 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3000 }
3001 /* Control never gets here */
3002 }
3003
3004 /* If maximizing, find the longest possible run, then work backwards. */
3005
3006 else
3007 {
3008 pp = eptr;
3009 for (i = min; i < max; i++)
3010 {
3011 int len = 1;
3012 if (eptr >= md->end_subject)
3013 {
3014 SCHECK_PARTIAL();
3015 break;
3016 }
3017 #ifdef SUPPORT_UTF
3018 GETCHARLENTEST(c, eptr, len);
3019 #else
3020 c = *eptr;
3021 #endif
3022 if (!PRIV(xclass)(c, data, utf)) break;
3023 eptr += len;
3024 }
3025 for(;;)
3026 {
3027 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3029 if (eptr-- == pp) break; /* Stop if tried at original pos */
3030 #ifdef SUPPORT_UTF
3031 if (utf) BACKCHAR(eptr);
3032 #endif
3033 }
3034 RRETURN(MATCH_NOMATCH);
3035 }
3036
3037 /* Control never gets here */
3038 }
3039 #endif /* End of XCLASS */
3040
3041 /* Match a single character, casefully */
3042
3043 case OP_CHAR:
3044 #ifdef SUPPORT_UTF
3045 if (utf)
3046 {
3047 length = 1;
3048 ecode++;
3049 GETCHARLEN(fc, ecode, length);
3050 if (length > md->end_subject - eptr)
3051 {
3052 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3053 RRETURN(MATCH_NOMATCH);
3054 }
3055 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3056 }
3057 else
3058 #endif
3059 /* Not UTF mode */
3060 {
3061 if (md->end_subject - eptr < 1)
3062 {
3063 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3064 RRETURN(MATCH_NOMATCH);
3065 }
3066 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3067 ecode += 2;
3068 }
3069 break;
3070
3071 /* Match a single character, caselessly. If we are at the end of the
3072 subject, give up immediately. */
3073
3074 case OP_CHARI:
3075 if (eptr >= md->end_subject)
3076 {
3077 SCHECK_PARTIAL();
3078 RRETURN(MATCH_NOMATCH);
3079 }
3080
3081 #ifdef SUPPORT_UTF
3082 if (utf)
3083 {
3084 length = 1;
3085 ecode++;
3086 GETCHARLEN(fc, ecode, length);
3087
3088 /* If the pattern character's value is < 128, we have only one byte, and
3089 we know that its other case must also be one byte long, so we can use the
3090 fast lookup table. We know that there is at least one byte left in the
3091 subject. */
3092
3093 if (fc < 128)
3094 {
3095 if (md->lcc[fc]
3096 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3097 ecode++;
3098 eptr++;
3099 }
3100
3101 /* Otherwise we must pick up the subject character. Note that we cannot
3102 use the value of "length" to check for sufficient bytes left, because the
3103 other case of the character may have more or fewer bytes. */
3104
3105 else
3106 {
3107 unsigned int dc;
3108 GETCHARINC(dc, eptr);
3109 ecode += length;
3110
3111 /* If we have Unicode property support, we can use it to test the other
3112 case of the character, if there is one. */
3113
3114 if (fc != dc)
3115 {
3116 #ifdef SUPPORT_UCP
3117 if (dc != UCD_OTHERCASE(fc))
3118 #endif
3119 RRETURN(MATCH_NOMATCH);
3120 }
3121 }
3122 }
3123 else
3124 #endif /* SUPPORT_UTF */
3125
3126 /* Not UTF mode */
3127 {
3128 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3129 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3130 eptr++;
3131 ecode += 2;
3132 }
3133 break;
3134
3135 /* Match a single character repeatedly. */
3136
3137 case OP_EXACT:
3138 case OP_EXACTI:
3139 min = max = GET2(ecode, 1);
3140 ecode += 1 + IMM2_SIZE;
3141 goto REPEATCHAR;
3142
3143 case OP_POSUPTO:
3144 case OP_POSUPTOI:
3145 possessive = TRUE;
3146 /* Fall through */
3147
3148 case OP_UPTO:
3149 case OP_UPTOI:
3150 case OP_MINUPTO:
3151 case OP_MINUPTOI:
3152 min = 0;
3153 max = GET2(ecode, 1);
3154 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3155 ecode += 1 + IMM2_SIZE;
3156 goto REPEATCHAR;
3157
3158 case OP_POSSTAR:
3159 case OP_POSSTARI:
3160 possessive = TRUE;
3161 min = 0;
3162 max = INT_MAX;
3163 ecode++;
3164 goto REPEATCHAR;
3165
3166 case OP_POSPLUS:
3167 case OP_POSPLUSI:
3168 possessive = TRUE;
3169 min = 1;
3170 max = INT_MAX;
3171 ecode++;
3172 goto REPEATCHAR;
3173
3174 case OP_POSQUERY:
3175 case OP_POSQUERYI:
3176 possessive = TRUE;
3177 min = 0;
3178 max = 1;
3179 ecode++;
3180 goto REPEATCHAR;
3181
3182 case OP_STAR:
3183 case OP_STARI:
3184 case OP_MINSTAR:
3185 case OP_MINSTARI:
3186 case OP_PLUS:
3187 case OP_PLUSI:
3188 case OP_MINPLUS:
3189 case OP_MINPLUSI:
3190 case OP_QUERY:
3191 case OP_QUERYI:
3192 case OP_MINQUERY:
3193 case OP_MINQUERYI:
3194 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3195 minimize = (c & 1) != 0;
3196 min = rep_min[c]; /* Pick up values from tables; */
3197 max = rep_max[c]; /* zero for max => infinity */
3198 if (max == 0) max = INT_MAX;
3199
3200 /* Common code for all repeated single-character matches. */
3201
3202 REPEATCHAR:
3203 #ifdef SUPPORT_UTF
3204 if (utf)
3205 {
3206 length = 1;
3207 charptr = ecode;
3208 GETCHARLEN(fc, ecode, length);
3209 ecode += length;
3210
3211 /* Handle multibyte character matching specially here. There is
3212 support for caseless matching if UCP support is present. */
3213
3214 if (length > 1)
3215 {
3216 #ifdef SUPPORT_UCP
3217 unsigned int othercase;
3218 if (op >= OP_STARI && /* Caseless */
3219 (othercase = UCD_OTHERCASE(fc)) != fc)
3220 oclength = PRIV(ord2utf)(othercase, occhars);
3221 else oclength = 0;
3222 #endif /* SUPPORT_UCP */
3223
3224 for (i = 1; i <= min; i++)
3225 {
3226 if (eptr <= md->end_subject - length &&
3227 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3228 #ifdef SUPPORT_UCP
3229 else if (oclength > 0 &&
3230 eptr <= md->end_subject - oclength &&
3231 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3232 #endif /* SUPPORT_UCP */
3233 else
3234 {
3235 CHECK_PARTIAL();
3236 RRETURN(MATCH_NOMATCH);
3237 }
3238 }
3239
3240 if (min == max) continue;
3241
3242 if (minimize)
3243 {
3244 for (fi = min;; fi++)
3245 {
3246 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3247 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3248 if (fi >= max) RRETURN(MATCH_NOMATCH);
3249 if (eptr <= md->end_subject - length &&
3250 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3251 #ifdef SUPPORT_UCP
3252 else if (oclength > 0 &&
3253 eptr <= md->end_subject - oclength &&
3254 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3255 #endif /* SUPPORT_UCP */
3256 else
3257 {
3258 CHECK_PARTIAL();
3259 RRETURN(MATCH_NOMATCH);
3260 }
3261 }
3262 /* Control never gets here */
3263 }
3264
3265 else /* Maximize */
3266 {
3267 pp = eptr;
3268 for (i = min; i < max; i++)
3269 {
3270 if (eptr <= md->end_subject - length &&
3271 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3272 #ifdef SUPPORT_UCP
3273 else if (oclength > 0 &&
3274 eptr <= md->end_subject - oclength &&
3275 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3276 #endif /* SUPPORT_UCP */
3277 else
3278 {
3279 CHECK_PARTIAL();
3280 break;
3281 }
3282 }
3283
3284 if (possessive) continue;
3285
3286 for(;;)
3287 {
3288 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3289 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3290 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3291 #ifdef SUPPORT_UCP
3292 eptr--;
3293 BACKCHAR(eptr);
3294 #else /* without SUPPORT_UCP */
3295 eptr -= length;
3296 #endif /* SUPPORT_UCP */
3297 }
3298 }
3299 /* Control never gets here */
3300 }
3301
3302 /* If the length of a UTF-8 character is 1, we fall through here, and
3303 obey the code as for non-UTF-8 characters below, though in this case the
3304 value of fc will always be < 128. */
3305 }
3306 else
3307 #endif /* SUPPORT_UTF */
3308 /* When not in UTF-8 mode, load a single-byte character. */
3309 fc = *ecode++;
3310
3311 /* The value of fc at this point is always one character, though we may
3312 or may not be in UTF mode. The code is duplicated for the caseless and
3313 caseful cases, for speed, since matching characters is likely to be quite
3314 common. First, ensure the minimum number of matches are present. If min =
3315 max, continue at the same level without recursing. Otherwise, if
3316 minimizing, keep trying the rest of the expression and advancing one
3317 matching character if failing, up to the maximum. Alternatively, if
3318 maximizing, find the maximum number of characters and work backwards. */
3319
3320 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3321 max, eptr));
3322
3323 if (op >= OP_STARI) /* Caseless */
3324 {
3325 #ifdef COMPILE_PCRE8
3326 /* fc must be < 128 if UTF is enabled. */
3327 foc = md->fcc[fc];
3328 #else
3329 #ifdef SUPPORT_UTF
3330 #ifdef SUPPORT_UCP
3331 if (utf && fc > 127)
3332 foc = UCD_OTHERCASE(fc);
3333 #else
3334 if (utf && fc > 127)
3335 foc = fc;
3336 #endif /* SUPPORT_UCP */
3337 else
3338 #endif /* SUPPORT_UTF */
3339 foc = TABLE_GET(fc, md->fcc, fc);
3340 #endif /* COMPILE_PCRE8 */
3341
3342 for (i = 1; i <= min; i++)
3343 {
3344 if (eptr >= md->end_subject)
3345 {
3346 SCHECK_PARTIAL();
3347 RRETURN(MATCH_NOMATCH);
3348 }
3349 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3350 eptr++;
3351 }
3352 if (min == max) continue;
3353 if (minimize)
3354 {
3355 for (fi = min;; fi++)
3356 {
3357 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3358 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3359 if (fi >= max) RRETURN(MATCH_NOMATCH);
3360 if (eptr >= md->end_subject)
3361 {
3362 SCHECK_PARTIAL();
3363 RRETURN(MATCH_NOMATCH);
3364 }
3365 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3366 eptr++;
3367 }
3368 /* Control never gets here */
3369 }
3370 else /* Maximize */
3371 {
3372 pp = eptr;
3373 for (i = min; i < max; i++)
3374 {
3375 if (eptr >= md->end_subject)
3376 {
3377 SCHECK_PARTIAL();
3378 break;
3379 }
3380 if (fc != *eptr && foc != *eptr) break;
3381 eptr++;
3382 }
3383
3384 if (possessive) continue;
3385
3386 while (eptr >= pp)
3387 {
3388 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3389 eptr--;
3390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3391 }
3392 RRETURN(MATCH_NOMATCH);
3393 }
3394 /* Control never gets here */
3395 }
3396
3397 /* Caseful comparisons (includes all multi-byte characters) */
3398
3399 else
3400 {
3401 for (i = 1; i <= min; i++)
3402 {
3403 if (eptr >= md->end_subject)
3404 {
3405 SCHECK_PARTIAL();
3406 RRETURN(MATCH_NOMATCH);
3407 }
3408 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3409 }
3410
3411 if (min == max) continue;
3412
3413 if (minimize)
3414 {
3415 for (fi = min;; fi++)
3416 {
3417 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3418 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419 if (fi >= max) RRETURN(MATCH_NOMATCH);
3420 if (eptr >= md->end_subject)
3421 {
3422 SCHECK_PARTIAL();
3423 RRETURN(MATCH_NOMATCH);
3424 }
3425 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3426 }
3427 /* Control never gets here */
3428 }
3429 else /* Maximize */
3430 {
3431 pp = eptr;
3432 for (i = min; i < max; i++)
3433 {
3434 if (eptr >= md->end_subject)
3435 {
3436 SCHECK_PARTIAL();
3437 break;
3438 }
3439 if (fc != *eptr) break;
3440 eptr++;
3441 }
3442 if (possessive) continue;
3443
3444 while (eptr >= pp)
3445 {
3446 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3447 eptr--;
3448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3449 }
3450 RRETURN(MATCH_NOMATCH);
3451 }
3452 }
3453 /* Control never gets here */
3454
3455 /* Match a negated single one-byte character. The character we are
3456 checking can be multibyte. */
3457
3458 case OP_NOT:
3459 case OP_NOTI:
3460 if (eptr >= md->end_subject)
3461 {
3462 SCHECK_PARTIAL();
3463 RRETURN(MATCH_NOMATCH);
3464 }
3465 ecode++;
3466 GETCHARINCTEST(c, eptr);
3467 if (op == OP_NOTI) /* The caseless case */
3468 {
3469 register int ch, och;
3470 ch = *ecode++;
3471 #ifdef COMPILE_PCRE8
3472 /* ch must be < 128 if UTF is enabled. */
3473 och = md->fcc[ch];
3474 #else
3475 #ifdef SUPPORT_UTF
3476 #ifdef SUPPORT_UCP
3477 if (utf && ch > 127)
3478 och = UCD_OTHERCASE(ch);
3479 #else
3480 if (utf && ch > 127)
3481 och = ch;
3482 #endif /* SUPPORT_UCP */
3483 else
3484 #endif /* SUPPORT_UTF */
3485 och = TABLE_GET(ch, md->fcc, ch);
3486 #endif /* COMPILE_PCRE8 */
3487 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3488 }
3489 else /* Caseful */
3490 {
3491 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3492 }
3493 break;
3494
3495 /* Match a negated single one-byte character repeatedly. This is almost a
3496 repeat of the code for a repeated single character, but I haven't found a
3497 nice way of commoning these up that doesn't require a test of the
3498 positive/negative option for each character match. Maybe that wouldn't add
3499 very much to the time taken, but character matching *is* what this is all
3500 about... */
3501
3502 case OP_NOTEXACT:
3503 case OP_NOTEXACTI:
3504 min = max = GET2(ecode, 1);
3505 ecode += 1 + IMM2_SIZE;
3506 goto REPEATNOTCHAR;
3507
3508 case OP_NOTUPTO:
3509 case OP_NOTUPTOI:
3510 case OP_NOTMINUPTO:
3511 case OP_NOTMINUPTOI:
3512 min = 0;
3513 max = GET2(ecode, 1);
3514 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3515 ecode += 1 + IMM2_SIZE;
3516 goto REPEATNOTCHAR;
3517
3518 case OP_NOTPOSSTAR:
3519 case OP_NOTPOSSTARI:
3520 possessive = TRUE;
3521 min = 0;
3522 max = INT_MAX;
3523 ecode++;
3524 goto REPEATNOTCHAR;
3525
3526 case OP_NOTPOSPLUS:
3527 case OP_NOTPOSPLUSI:
3528 possessive = TRUE;
3529 min = 1;
3530 max = INT_MAX;
3531 ecode++;
3532 goto REPEATNOTCHAR;
3533
3534 case OP_NOTPOSQUERY:
3535 case OP_NOTPOSQUERYI:
3536 possessive = TRUE;
3537 min = 0;
3538 max = 1;
3539 ecode++;
3540 goto REPEATNOTCHAR;
3541
3542 case OP_NOTPOSUPTO:
3543 case OP_NOTPOSUPTOI:
3544 possessive = TRUE;
3545 min = 0;
3546 max = GET2(ecode, 1);
3547 ecode += 1 + IMM2_SIZE;
3548 goto REPEATNOTCHAR;
3549
3550 case OP_NOTSTAR:
3551 case OP_NOTSTARI:
3552 case OP_NOTMINSTAR:
3553 case OP_NOTMINSTARI:
3554 case OP_NOTPLUS:
3555 case OP_NOTPLUSI:
3556 case OP_NOTMINPLUS:
3557 case OP_NOTMINPLUSI:
3558 case OP_NOTQUERY:
3559 case OP_NOTQUERYI:
3560 case OP_NOTMINQUERY:
3561 case OP_NOTMINQUERYI:
3562 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3563 minimize = (c & 1) != 0;
3564 min = rep_min[c]; /* Pick up values from tables; */
3565 max = rep_max[c]; /* zero for max => infinity */
3566 if (max == 0) max = INT_MAX;
3567
3568 /* Common code for all repeated single-byte matches. */
3569
3570 REPEATNOTCHAR:
3571 fc = *ecode++;
3572
3573 /* The code is duplicated for the caseless and caseful cases, for speed,
3574 since matching characters is likely to be quite common. First, ensure the
3575 minimum number of matches are present. If min = max, continue at the same
3576 level without recursing. Otherwise, if minimizing, keep trying the rest of
3577 the expression and advancing one matching character if failing, up to the
3578 maximum. Alternatively, if maximizing, find the maximum number of
3579 characters and work backwards. */
3580
3581 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3582 max, eptr));
3583
3584 if (op >= OP_NOTSTARI) /* Caseless */
3585 {
3586 #ifdef COMPILE_PCRE8
3587 /* fc must be < 128 if UTF is enabled. */
3588 foc = md->fcc[fc];
3589 #else
3590 #ifdef SUPPORT_UTF
3591 #ifdef SUPPORT_UCP
3592 if (utf && fc > 127)
3593 foc = UCD_OTHERCASE(fc);
3594 #else
3595 if (utf && fc > 127)
3596 foc = fc;
3597 #endif /* SUPPORT_UCP */
3598 else
3599 #endif /* SUPPORT_UTF */
3600 foc = TABLE_GET(fc, md->fcc, fc);
3601 #endif /* COMPILE_PCRE8 */
3602
3603 #ifdef SUPPORT_UTF
3604 if (utf)
3605 {
3606 register unsigned int d;
3607 for (i = 1; i <= min; i++)
3608 {
3609 if (eptr >= md->end_subject)
3610 {
3611 SCHECK_PARTIAL();
3612 RRETURN(MATCH_NOMATCH);
3613 }
3614 GETCHARINC(d, eptr);
3615 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3616 }
3617 }
3618 else
3619 #endif
3620 /* Not UTF mode */
3621 {
3622 for (i = 1; i <= min; i++)
3623 {
3624 if (eptr >= md->end_subject)
3625 {
3626 SCHECK_PARTIAL();
3627 RRETURN(MATCH_NOMATCH);
3628 }
3629 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3630 eptr++;
3631 }
3632 }
3633
3634 if (min == max) continue;
3635
3636 if (minimize)
3637 {
3638 #ifdef SUPPORT_UTF
3639 if (utf)
3640 {
3641 register unsigned int d;
3642 for (fi = min;; fi++)
3643 {
3644 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3645 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3646 if (fi >= max) RRETURN(MATCH_NOMATCH);
3647 if (eptr >= md->end_subject)
3648 {
3649 SCHECK_PARTIAL();
3650 RRETURN(MATCH_NOMATCH);
3651 }
3652 GETCHARINC(d, eptr);
3653 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3654 }
3655 }
3656 else
3657 #endif
3658 /* Not UTF mode */
3659 {
3660 for (fi = min;; fi++)
3661 {
3662 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3663 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3664 if (fi >= max) RRETURN(MATCH_NOMATCH);
3665 if (eptr >= md->end_subject)
3666 {
3667 SCHECK_PARTIAL();
3668 RRETURN(MATCH_NOMATCH);
3669 }
3670 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3671 eptr++;
3672 }
3673 }
3674 /* Control never gets here */
3675 }
3676
3677 /* Maximize case */
3678
3679 else
3680 {
3681 pp = eptr;
3682
3683 #ifdef SUPPORT_UTF
3684 if (utf)
3685 {
3686 register unsigned int d;
3687 for (i = min; i < max; i++)
3688 {
3689 int len = 1;
3690 if (eptr >= md->end_subject)
3691 {
3692 SCHECK_PARTIAL();
3693 break;
3694 }
3695 GETCHARLEN(d, eptr, len);
3696 if (fc == d || foc == d) break;
3697 eptr += len;
3698 }
3699 if (possessive) continue;
3700 for(;;)
3701 {
3702 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3703 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3704 if (eptr-- == pp) break; /* Stop if tried at original pos */
3705 BACKCHAR(eptr);
3706 }
3707 }
3708 else
3709 #endif
3710 /* Not UTF mode */
3711 {
3712 for (i = min; i < max; i++)
3713 {
3714 if (eptr >= md->end_subject)
3715 {
3716 SCHECK_PARTIAL();
3717 break;
3718 }
3719 if (fc == *eptr || foc == *eptr) break;
3720 eptr++;
3721 }
3722 if (possessive) continue;
3723 while (eptr >= pp)
3724 {
3725 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3726 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3727 eptr--;
3728 }
3729 }
3730
3731 RRETURN(MATCH_NOMATCH);
3732 }
3733 /* Control never gets here */
3734 }
3735
3736 /* Caseful comparisons */
3737
3738 else
3739 {
3740 #ifdef SUPPORT_UTF
3741 if (utf)
3742 {
3743 register unsigned int d;
3744 for (i = 1; i <= min; i++)
3745 {
3746 if (eptr >= md->end_subject)
3747 {
3748 SCHECK_PARTIAL();
3749 RRETURN(MATCH_NOMATCH);
3750 }
3751 GETCHARINC(d, eptr);
3752 if (fc == d) RRETURN(MATCH_NOMATCH);
3753 }
3754 }
3755 else
3756 #endif
3757 /* Not UTF mode */
3758 {
3759 for (i = 1; i <= min; i++)
3760 {
3761 if (eptr >= md->end_subject)
3762 {
3763 SCHECK_PARTIAL();
3764 RRETURN(MATCH_NOMATCH);
3765 }
3766 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3767 }
3768 }
3769
3770 if (min == max) continue;
3771
3772 if (minimize)
3773 {
3774 #ifdef SUPPORT_UTF
3775 if (utf)
3776 {
3777 register unsigned int d;
3778 for (fi = min;; fi++)
3779 {
3780 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3781 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3782 if (fi >= max) RRETURN(MATCH_NOMATCH);
3783 if (eptr >= md->end_subject)
3784 {
3785 SCHECK_PARTIAL();
3786 RRETURN(MATCH_NOMATCH);
3787 }
3788 GETCHARINC(d, eptr);
3789 if (fc == d) RRETURN(MATCH_NOMATCH);
3790 }
3791 }
3792 else
3793 #endif
3794 /* Not UTF mode */
3795 {
3796 for (fi = min;; fi++)
3797 {
3798 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3800 if (fi >= max) RRETURN(MATCH_NOMATCH);
3801 if (eptr >= md->end_subject)
3802 {
3803 SCHECK_PARTIAL();
3804 RRETURN(MATCH_NOMATCH);
3805 }
3806 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3807 }
3808 }
3809 /* Control never gets here */
3810 }
3811
3812 /* Maximize case */
3813
3814 else
3815 {
3816 pp = eptr;
3817
3818 #ifdef SUPPORT_UTF
3819 if (utf)
3820 {
3821 register unsigned int d;
3822 for (i = min; i < max; i++)
3823 {
3824 int len = 1;
3825 if (eptr >= md->end_subject)
3826 {
3827 SCHECK_PARTIAL();
3828 break;
3829 }
3830 GETCHARLEN(d, eptr, len);
3831 if (fc == d) break;
3832 eptr += len;
3833 }
3834 if (possessive) continue;
3835 for(;;)
3836 {
3837 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3838 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3839 if (eptr-- == pp) break; /* Stop if tried at original pos */
3840 BACKCHAR(eptr);
3841 }
3842 }
3843 else
3844 #endif
3845 /* Not UTF mode */
3846 {
3847 for (i = min; i < max; i++)
3848 {
3849 if (eptr >= md->end_subject)
3850 {
3851 SCHECK_PARTIAL();
3852 break;
3853 }
3854 if (fc == *eptr) break;
3855 eptr++;
3856 }
3857 if (possessive) continue;
3858 while (eptr >= pp)
3859 {
3860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3862 eptr--;
3863 }
3864 }
3865
3866 RRETURN(MATCH_NOMATCH);
3867 }
3868 }
3869 /* Control never gets here */
3870
3871 /* Match a single character type repeatedly; several different opcodes
3872 share code. This is very similar to the code for single characters, but we
3873 repeat it in the interests of efficiency. */
3874
3875 case OP_TYPEEXACT:
3876 min = max = GET2(ecode, 1);
3877 minimize = TRUE;
3878 ecode += 1 + IMM2_SIZE;
3879 goto REPEATTYPE;
3880
3881 case OP_TYPEUPTO:
3882 case OP_TYPEMINUPTO:
3883 min = 0;
3884 max = GET2(ecode, 1);
3885 minimize = *ecode == OP_TYPEMINUPTO;
3886 ecode += 1 + IMM2_SIZE;
3887 goto REPEATTYPE;
3888
3889 case OP_TYPEPOSSTAR:
3890 possessive = TRUE;
3891 min = 0;
3892 max = INT_MAX;
3893 ecode++;
3894 goto REPEATTYPE;
3895
3896 case OP_TYPEPOSPLUS:
3897 possessive = TRUE;
3898 min = 1;
3899 max = INT_MAX;
3900 ecode++;
3901 goto REPEATTYPE;
3902
3903 case OP_TYPEPOSQUERY:
3904 possessive = TRUE;
3905 min = 0;
3906 max = 1;
3907 ecode++;
3908 goto REPEATTYPE;
3909
3910 case OP_TYPEPOSUPTO:
3911 possessive = TRUE;
3912 min = 0;
3913 max = GET2(ecode, 1);
3914 ecode += 1 + IMM2_SIZE;
3915 goto REPEATTYPE;
3916
3917 case OP_TYPESTAR:
3918 case OP_TYPEMINSTAR:
3919 case OP_TYPEPLUS:
3920 case OP_TYPEMINPLUS:
3921 case OP_TYPEQUERY:
3922 case OP_TYPEMINQUERY:
3923 c = *ecode++ - OP_TYPESTAR;
3924 minimize = (c & 1) != 0;
3925 min = rep_min[c]; /* Pick up values from tables; */
3926 max = rep_max[c]; /* zero for max => infinity */
3927 if (max == 0) max = INT_MAX;
3928
3929 /* Common code for all repeated single character type matches. Note that
3930 in UTF-8 mode, '.' matches a character of any length, but for the other
3931 character types, the valid characters are all one-byte long. */
3932
3933 REPEATTYPE:
3934 ctype = *ecode++; /* Code for the character type */
3935
3936 #ifdef SUPPORT_UCP
3937 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3938 {
3939 prop_fail_result = ctype == OP_NOTPROP;
3940 prop_type = *ecode++;
3941 prop_value = *ecode++;
3942 }
3943 else prop_type = -1;
3944 #endif
3945
3946 /* First, ensure the minimum number of matches are present. Use inline
3947 code for maximizing the speed, and do the type test once at the start
3948 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3949 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3950 and single-bytes. */
3951
3952 if (min > 0)
3953 {
3954 #ifdef SUPPORT_UCP
3955 if (prop_type >= 0)
3956 {
3957 switch(prop_type)
3958 {
3959 case PT_ANY:
3960 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3961 for (i = 1; i <= min; i++)
3962 {
3963 if (eptr >= md->end_subject)
3964 {
3965 SCHECK_PARTIAL();
3966 RRETURN(MATCH_NOMATCH);
3967 }
3968 GETCHARINCTEST(c, eptr);
3969 }
3970 break;
3971
3972 case PT_LAMP:
3973 for (i = 1; i <= min; i++)
3974 {
3975 int chartype;
3976 if (eptr >= md->end_subject)
3977 {
3978 SCHECK_PARTIAL();
3979 RRETURN(MATCH_NOMATCH);
3980 }
3981 GETCHARINCTEST(c, eptr);
3982 chartype = UCD_CHARTYPE(c);
3983 if ((chartype == ucp_Lu ||
3984 chartype == ucp_Ll ||
3985 chartype == ucp_Lt) == prop_fail_result)
3986 RRETURN(MATCH_NOMATCH);
3987 }
3988 break;
3989
3990 case PT_GC:
3991 for (i = 1; i <= min; i++)
3992 {
3993 if (eptr >= md->end_subject)
3994 {
3995 SCHECK_PARTIAL();
3996 RRETURN(MATCH_NOMATCH);
3997 }
3998 GETCHARINCTEST(c, eptr);
3999 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4000 RRETURN(MATCH_NOMATCH);
4001 }
4002 break;
4003
4004 case PT_PC:
4005 for (i = 1; i <= min; i++)
4006 {
4007 if (eptr >= md->end_subject)
4008 {
4009 SCHECK_PARTIAL();
4010 RRETURN(MATCH_NOMATCH);
4011 }
4012 GETCHARINCTEST(c, eptr);
4013 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4014 RRETURN(MATCH_NOMATCH);
4015 }
4016 break;
4017
4018 case PT_SC:
4019 for (i = 1; i <= min; i++)
4020 {
4021 if (eptr >= md->end_subject)
4022 {
4023 SCHECK_PARTIAL();
4024 RRETURN(MATCH_NOMATCH);
4025 }
4026 GETCHARINCTEST(c, eptr);
4027 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4028 RRETURN(MATCH_NOMATCH);
4029 }
4030 break;
4031
4032 case PT_ALNUM:
4033 for (i = 1; i <= min; i++)
4034 {
4035 int category;
4036 if (eptr >= md->end_subject)
4037 {
4038 SCHECK_PARTIAL();
4039 RRETURN(MATCH_NOMATCH);
4040 }
4041 GETCHARINCTEST(c, eptr);
4042 category = UCD_CATEGORY(c);
4043 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4044 RRETURN(MATCH_NOMATCH);
4045 }
4046 break;
4047
4048 case PT_SPACE: /* Perl space */
4049 for (i = 1; i <= min; i++)
4050 {
4051 if (eptr >= md->end_subject)
4052 {
4053 SCHECK_PARTIAL();
4054 RRETURN(MATCH_NOMATCH);
4055 }
4056 GETCHARINCTEST(c, eptr);
4057 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4058 c == CHAR_FF || c == CHAR_CR)
4059 == prop_fail_result)
4060 RRETURN(MATCH_NOMATCH);
4061 }
4062 break;
4063
4064 case PT_PXSPACE: /* POSIX space */
4065 for (i = 1; i <= min; i++)
4066 {
4067 if (eptr >= md->end_subject)
4068 {
4069 SCHECK_PARTIAL();
4070 RRETURN(MATCH_NOMATCH);
4071 }
4072 GETCHARINCTEST(c, eptr);
4073 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4074 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4075 == prop_fail_result)
4076 RRETURN(MATCH_NOMATCH);
4077 }
4078 break;
4079
4080 case PT_WORD:
4081 for (i = 1; i <= min; i++)
4082 {
4083 int category;
4084 if (eptr >= md->end_subject)
4085 {
4086 SCHECK_PARTIAL();
4087 RRETURN(MATCH_NOMATCH);
4088 }
4089 GETCHARINCTEST(c, eptr);
4090 category = UCD_CATEGORY(c);
4091 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4092 == prop_fail_result)
4093 RRETURN(MATCH_NOMATCH);
4094 }
4095 break;
4096
4097 /* This should not occur */
4098
4099 default:
4100 RRETURN(PCRE_ERROR_INTERNAL);
4101 }
4102 }
4103
4104 /* Match extended Unicode sequences. We will get here only if the
4105 support is in the binary; otherwise a compile-time error occurs. */
4106
4107 else if (ctype == OP_EXTUNI)
4108 {
4109 for (i = 1; i <= min; i++)
4110 {
4111 if (eptr >= md->end_subject)
4112 {
4113 SCHECK_PARTIAL();
4114 RRETURN(MATCH_NOMATCH);
4115 }
4116 GETCHARINCTEST(c, eptr);
4117 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4118 while (eptr < md->end_subject)
4119 {
4120 int len = 1;
4121 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4122 if (UCD_CATEGORY(c) != ucp_M) break;
4123 eptr += len;
4124 }
4125 }
4126 }
4127
4128 else
4129 #endif /* SUPPORT_UCP */
4130
4131 /* Handle all other cases when the coding is UTF-8 */
4132
4133 #ifdef SUPPORT_UTF
4134 if (utf) switch(ctype)
4135 {
4136 case OP_ANY:
4137 for (i = 1; i <= min; i++)
4138 {
4139 if (eptr >= md->end_subject)
4140 {
4141 SCHECK_PARTIAL();
4142 RRETURN(MATCH_NOMATCH);
4143 }
4144 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4145 eptr++;
4146 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4147 }
4148 break;
4149
4150 case OP_ALLANY:
4151 for (i = 1; i <= min; i++)
4152 {
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 RRETURN(MATCH_NOMATCH);
4157 }
4158 eptr++;
4159 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4160 }
4161 break;
4162
4163 case OP_ANYBYTE:
4164 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4165 eptr += min;
4166 break;
4167
4168 case OP_ANYNL:
4169 for (i = 1; i <= min; i++)
4170 {
4171 if (eptr >= md->end_subject)
4172 {
4173 SCHECK_PARTIAL();
4174 RRETURN(MATCH_NOMATCH);
4175 }
4176 GETCHARINC(c, eptr);
4177 switch(c)
4178 {
4179 default: RRETURN(MATCH_NOMATCH);
4180
4181 case 0x000d:
4182 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4183 break;
4184
4185 case 0x000a:
4186 break;
4187
4188 case 0x000b:
4189 case 0x000c:
4190 case 0x0085:
4191 case 0x2028:
4192 case 0x2029:
4193 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4194 break;
4195 }
4196 }
4197 break;
4198
4199 case OP_NOT_HSPACE:
4200 for (i = 1; i <= min; i++)
4201 {
4202 if (eptr >= md->end_subject)
4203 {
4204 SCHECK_PARTIAL();
4205 RRETURN(MATCH_NOMATCH);
4206 }
4207 GETCHARINC(c, eptr);
4208 switch(c)
4209 {
4210 default: break;
4211 case 0x09: /* HT */
4212 case 0x20: /* SPACE */
4213 case 0xa0: /* NBSP */
4214 case 0x1680: /* OGHAM SPACE MARK */
4215 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4216 case 0x2000: /* EN QUAD */
4217 case 0x2001: /* EM QUAD */
4218 case 0x2002: /* EN SPACE */
4219 case 0x2003: /* EM SPACE */
4220 case 0x2004: /* THREE-PER-EM SPACE */
4221 case 0x2005: /* FOUR-PER-EM SPACE */
4222 case 0x2006: /* SIX-PER-EM SPACE */
4223 case 0x2007: /* FIGURE SPACE */
4224 case 0x2008: /* PUNCTUATION SPACE */
4225 case 0x2009: /* THIN SPACE */
4226 case 0x200A: /* HAIR SPACE */
4227 case 0x202f: /* NARROW NO-BREAK SPACE */
4228 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4229 case 0x3000: /* IDEOGRAPHIC SPACE */
4230 RRETURN(MATCH_NOMATCH);
4231 }
4232 }
4233 break;
4234
4235 case OP_HSPACE:
4236 for (i = 1; i <= min; i++)
4237 {
4238 if (eptr >= md->end_subject)
4239 {
4240 SCHECK_PARTIAL();
4241 RRETURN(MATCH_NOMATCH);
4242 }
4243 GETCHARINC(c, eptr);
4244 switch(c)
4245 {
4246 default: RRETURN(MATCH_NOMATCH);
4247 case 0x09: /* HT */
4248 case 0x20: /* SPACE */
4249 case 0xa0: /* NBSP */
4250 case 0x1680: /* OGHAM SPACE MARK */
4251 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4252 case 0x2000: /* EN QUAD */
4253 case 0x2001: /* EM QUAD */
4254 case 0x2002: /* EN SPACE */
4255 case 0x2003: /* EM SPACE */
4256 case 0x2004: /* THREE-PER-EM SPACE */
4257 case 0x2005: /* FOUR-PER-EM SPACE */
4258 case 0x2006: /* SIX-PER-EM SPACE */
4259 case 0x2007: /* FIGURE SPACE */
4260 case 0x2008: /* PUNCTUATION SPACE */
4261 case 0x2009: /* THIN SPACE */
4262 case 0x200A: /* HAIR SPACE */
4263 case 0x202f: /* NARROW NO-BREAK SPACE */
4264 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4265 case 0x3000: /* IDEOGRAPHIC SPACE */
4266 break;
4267 }
4268 }
4269 break;
4270
4271 case OP_NOT_VSPACE:
4272 for (i = 1; i <= min; i++)
4273 {
4274 if (eptr >= md->end_subject)
4275 {
4276 SCHECK_PARTIAL();
4277 RRETURN(MATCH_NOMATCH);
4278 }
4279 GETCHARINC(c, eptr);
4280 switch(c)
4281 {
4282 default: break;
4283 case 0x0a: /* LF */
4284 case 0x0b: /* VT */
4285 case 0x0c: /* FF */
4286 case 0x0d: /* CR */
4287 case 0x85: /* NEL */
4288 case 0x2028: /* LINE SEPARATOR */
4289 case 0x2029: /* PARAGRAPH SEPARATOR */
4290 RRETURN(MATCH_NOMATCH);
4291 }
4292 }
4293 break;
4294
4295 case OP_VSPACE:
4296 for (i = 1; i <= min; i++)
4297 {
4298 if (eptr >= md->end_subject)
4299 {
4300 SCHECK_PARTIAL();
4301 RRETURN(MATCH_NOMATCH);
4302 }
4303 GETCHARINC(c, eptr);
4304 switch(c)
4305 {
4306 default: RRETURN(MATCH_NOMATCH);
4307 case 0x0a: /* LF */
4308 case 0x0b: /* VT */
4309 case 0x0c: /* FF */
4310 case 0x0d: /* CR */
4311 case 0x85: /* NEL */
4312 case 0x2028: /* LINE SEPARATOR */
4313 case 0x2029: /* PARAGRAPH SEPARATOR */
4314 break;
4315 }
4316 }
4317 break;
4318
4319 case OP_NOT_DIGIT:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 RRETURN(MATCH_NOMATCH);
4326 }
4327 GETCHARINC(c, eptr);
4328 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4329 RRETURN(MATCH_NOMATCH);
4330 }
4331 break;
4332
4333 case OP_DIGIT:
4334 for (i = 1; i <= min; i++)
4335 {
4336 if (eptr >= md->end_subject)
4337 {
4338 SCHECK_PARTIAL();
4339 RRETURN(MATCH_NOMATCH);
4340 }
4341 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4342 RRETURN(MATCH_NOMATCH);
4343 eptr++;
4344 /* No need to skip more bytes - we know it's a 1-byte character */
4345 }
4346 break;
4347
4348 case OP_NOT_WHITESPACE:
4349 for (i = 1; i <= min; i++)
4350 {
4351 if (eptr >= md->end_subject)
4352 {
4353 SCHECK_PARTIAL();
4354 RRETURN(MATCH_NOMATCH);
4355 }
4356 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4357 RRETURN(MATCH_NOMATCH);
4358 eptr++;
4359 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4360 }
4361 break;
4362
4363 case OP_WHITESPACE:
4364 for (i = 1; i <= min; i++)
4365 {
4366 if (eptr >= md->end_subject)
4367 {
4368 SCHECK_PARTIAL();
4369 RRETURN(MATCH_NOMATCH);
4370 }
4371 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4372 RRETURN(MATCH_NOMATCH);
4373 eptr++;
4374 /* No need to skip more bytes - we know it's a 1-byte character */
4375 }
4376 break;
4377
4378 case OP_NOT_WORDCHAR:
4379 for (i = 1; i <= min; i++)
4380 {
4381 if (eptr >= md->end_subject)
4382 {
4383 SCHECK_PARTIAL();
4384 RRETURN(MATCH_NOMATCH);
4385 }
4386 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4387 RRETURN(MATCH_NOMATCH);
4388 eptr++;
4389 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4390 }
4391 break;
4392
4393 case OP_WORDCHAR:
4394 for (i = 1; i <= min; i++)
4395 {
4396 if (eptr >= md->end_subject)
4397 {
4398 SCHECK_PARTIAL();
4399 RRETURN(MATCH_NOMATCH);
4400 }
4401 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4402 RRETURN(MATCH_NOMATCH);
4403 eptr++;
4404 /* No need to skip more bytes - we know it's a 1-byte character */
4405 }
4406 break;
4407
4408 default:
4409 RRETURN(PCRE_ERROR_INTERNAL);
4410 } /* End switch(ctype) */
4411
4412 else
4413 #endif /* SUPPORT_UTF */
4414
4415 /* Code for the non-UTF-8 case for minimum matching of operators other
4416 than OP_PROP and OP_NOTPROP. */
4417
4418 switch(ctype)
4419 {
4420 case OP_ANY:
4421 for (i = 1; i <= min; i++)
4422 {
4423 if (eptr >= md->end_subject)
4424 {
4425 SCHECK_PARTIAL();
4426 RRETURN(MATCH_NOMATCH);
4427 }
4428 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4429 eptr++;
4430 }
4431 break;
4432
4433 case OP_ALLANY:
4434 if (eptr > md->end_subject - min)
4435 {
4436 SCHECK_PARTIAL();
4437 RRETURN(MATCH_NOMATCH);
4438 }
4439 eptr += min;
4440 break;
4441
4442 case OP_ANYBYTE:
4443 if (eptr > md->end_subject - min)
4444 {
4445 SCHECK_PARTIAL();
4446 RRETURN(MATCH_NOMATCH);
4447 }
4448 eptr += min;
4449 break;
4450
4451 case OP_ANYNL:
4452 for (i = 1; i <= min; i++)
4453 {
4454 if (eptr >= md->end_subject)
4455 {
4456 SCHECK_PARTIAL();
4457 RRETURN(MATCH_NOMATCH);
4458 }
4459 switch(*eptr++)
4460 {
4461 default: RRETURN(MATCH_NOMATCH);
4462
4463 case 0x000d:
4464 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4465 break;
4466
4467 case 0x000a:
4468 break;
4469
4470 case 0x000b:
4471 case 0x000c:
4472 case 0x0085:
4473 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4474 break;
4475 }
4476 }
4477 break;
4478
4479 case OP_NOT_HSPACE:
4480 for (i = 1; i <= min; i++)
4481 {
4482 if (eptr >= md->end_subject)
4483 {
4484 SCHECK_PARTIAL();
4485 RRETURN(MATCH_NOMATCH);
4486 }
4487 switch(*eptr++)
4488 {
4489 default: break;
4490 case 0x09: /* HT */
4491 case 0x20: /* SPACE */
4492 case 0xa0: /* NBSP */
4493 RRETURN(MATCH_NOMATCH);
4494 }
4495 }
4496 break;
4497
4498 case OP_HSPACE:
4499 for (i = 1; i <= min; i++)
4500 {
4501 if (eptr >= md->end_subject)
4502 {
4503 SCHECK_PARTIAL();
4504 RRETURN(MATCH_NOMATCH);
4505 }
4506 switch(*eptr++)
4507 {
4508 default: RRETURN(MATCH_NOMATCH);
4509 case 0x09: /* HT */
4510 case 0x20: /* SPACE */
4511 case 0xa0: /* NBSP */
4512 break;
4513 }
4514 }
4515 break;
4516
4517 case OP_NOT_VSPACE:
4518 for (i = 1; i <= min; i++)
4519 {
4520 if (eptr >= md->end_subject)
4521 {
4522 SCHECK_PARTIAL();
4523 RRETURN(MATCH_NOMATCH);
4524 }
4525 switch(*eptr++)
4526 {
4527 default: break;
4528 case 0x0a: /* LF */
4529 case 0x0b: /* VT */
4530 case 0x0c: /* FF */
4531 case 0x0d: /* CR */
4532 case 0x85: /* NEL */
4533 RRETURN(MATCH_NOMATCH);
4534 }
4535 }
4536 break;
4537
4538 case OP_VSPACE:
4539 for (i = 1; i <= min; i++)
4540 {
4541 if (eptr >= md->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 RRETURN(MATCH_NOMATCH);
4545 }
4546 switch(*eptr++)
4547 {
4548 default: RRETURN(MATCH_NOMATCH);
4549 case 0x0a: /* LF */
4550 case 0x0b: /* VT */
4551 case 0x0c: /* FF */
4552 case 0x0d: /* CR */
4553 case 0x85: /* NEL */
4554 break;
4555 }
4556 }
4557 break;
4558
4559 case OP_NOT_DIGIT:
4560 for (i = 1; i <= min; i++)
4561 {
4562 if (eptr >= md->end_subject)
4563 {
4564 SCHECK_PARTIAL();
4565 RRETURN(MATCH_NOMATCH);
4566 }
4567 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4568 RRETURN(MATCH_NOMATCH);
4569 eptr++;
4570 }
4571 break;
4572
4573 case OP_DIGIT:
4574 for (i = 1; i <= min; i++)
4575 {
4576 if (eptr >= md->end_subject)
4577 {
4578 SCHECK_PARTIAL();
4579 RRETURN(MATCH_NOMATCH);
4580 }
4581 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4582 RRETURN(MATCH_NOMATCH);
4583 eptr++;
4584 }
4585 break;
4586
4587 case OP_NOT_WHITESPACE:
4588 for (i = 1; i <= min; i++)
4589 {
4590 if (eptr >= md->end_subject)
4591 {
4592 SCHECK_PARTIAL();
4593 RRETURN(MATCH_NOMATCH);
4594 }
4595 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4596 RRETURN(MATCH_NOMATCH);
4597 eptr++;
4598 }
4599 break;
4600
4601 case OP_WHITESPACE:
4602 for (i = 1; i <= min; i++)
4603 {
4604 if (eptr >= md->end_subject)
4605 {
4606 SCHECK_PARTIAL();
4607 RRETURN(MATCH_NOMATCH);
4608 }
4609 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4610 RRETURN(MATCH_NOMATCH);
4611 eptr++;
4612 }
4613 break;
4614
4615 case OP_NOT_WORDCHAR:
4616 for (i = 1; i <= min; i++)
4617 {
4618 if (eptr >= md->end_subject)
4619 {
4620 SCHECK_PARTIAL();
4621 RRETURN(MATCH_NOMATCH);
4622 }
4623 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4624 RRETURN(MATCH_NOMATCH);
4625 eptr++;
4626 }
4627 break;
4628
4629 case OP_WORDCHAR:
4630 for (i = 1; i <= min; i++)
4631 {
4632 if (eptr >= md->end_subject)
4633 {
4634 SCHECK_PARTIAL();
4635 RRETURN(MATCH_NOMATCH);
4636 }
4637 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4638 RRETURN(MATCH_NOMATCH);
4639 eptr++;
4640 }
4641 break;
4642
4643 default:
4644 RRETURN(PCRE_ERROR_INTERNAL);
4645 }
4646 }
4647
4648 /* If min = max, continue at the same level without recursing */
4649
4650 if (min == max) continue;
4651
4652 /* If minimizing, we have to test the rest of the pattern before each
4653 subsequent match. Again, separate the UTF-8 case for speed, and also
4654 separate the UCP cases. */
4655
4656 if (minimize)
4657 {
4658 #ifdef SUPPORT_UCP
4659 if (prop_type >= 0)
4660 {
4661 switch(prop_type)
4662 {
4663 case PT_ANY:
4664 for (fi = min;; fi++)
4665 {
4666 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4668 if (fi >= max) RRETURN(MATCH_NOMATCH);
4669 if (eptr >= md->end_subject)
4670 {
4671 SCHECK_PARTIAL();
4672 RRETURN(MATCH_NOMATCH);
4673 }
4674 GETCHARINCTEST(c, eptr);
4675 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4676 }
4677 /* Control never gets here */
4678
4679 case PT_LAMP:
4680 for (fi = min;; fi++)
4681 {
4682 int chartype;
4683 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4684 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4685 if (fi >= max) RRETURN(MATCH_NOMATCH);
4686 if (eptr >= md->end_subject)
4687 {
4688 SCHECK_PARTIAL();
4689 RRETURN(MATCH_NOMATCH);
4690 }
4691 GETCHARINCTEST(c, eptr);
4692 chartype = UCD_CHARTYPE(c);
4693 if ((chartype == ucp_Lu ||
4694 chartype == ucp_Ll ||
4695 chartype == ucp_Lt) == prop_fail_result)
4696 RRETURN(MATCH_NOMATCH);
4697 }
4698 /* Control never gets here */
4699
4700 case PT_GC:
4701 for (fi = min;; fi++)
4702 {
4703 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4704 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4705 if (fi >= max) RRETURN(MATCH_NOMATCH);
4706 if (eptr >= md->end_subject)
4707 {
4708 SCHECK_PARTIAL();
4709 RRETURN(MATCH_NOMATCH);
4710 }
4711 GETCHARINCTEST(c, eptr);
4712 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4713 RRETURN(MATCH_NOMATCH);
4714 }
4715 /* Control never gets here */
4716
4717 case PT_PC:
4718 for (fi = min;; fi++)
4719 {
4720 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4722 if (fi >= max) RRETURN(MATCH_NOMATCH);
4723 if (eptr >= md->end_subject)
4724 {
4725 SCHECK_PARTIAL();
4726 RRETURN(MATCH_NOMATCH);
4727 }
4728 GETCHARINCTEST(c, eptr);
4729 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4730 RRETURN(MATCH_NOMATCH);
4731 }
4732 /* Control never gets here */
4733
4734 case PT_SC:
4735 for (fi = min;; fi++)
4736 {
4737 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4738 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4739 if (fi >= max) RRETURN(MATCH_NOMATCH);
4740 if (eptr >= md->end_subject)
4741 {
4742 SCHECK_PARTIAL();
4743 RRETURN(MATCH_NOMATCH);
4744 }
4745 GETCHARINCTEST(c, eptr);
4746 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4747 RRETURN(MATCH_NOMATCH);
4748 }
4749 /* Control never gets here */
4750
4751 case PT_ALNUM:
4752 for (fi = min;; fi++)
4753 {
4754 int category;
4755 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4756 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4757 if (fi >= max) RRETURN(MATCH_NOMATCH);
4758 if (eptr >= md->end_subject)
4759 {
4760 SCHECK_PARTIAL();
4761 RRETURN(MATCH_NOMATCH);
4762 }
4763 GETCHARINCTEST(c, eptr);
4764 category = UCD_CATEGORY(c);
4765 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4766 RRETURN(MATCH_NOMATCH);
4767 }
4768 /* Control never gets here */
4769
4770 case PT_SPACE: /* Perl space */
4771 for (fi = min;; fi++)
4772 {
4773 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4775 if (fi >= max) RRETURN(MATCH_NOMATCH);
4776 if (eptr >= md->end_subject)
4777 {
4778 SCHECK_PARTIAL();
4779 RRETURN(MATCH_NOMATCH);
4780 }
4781 GETCHARINCTEST(c, eptr);
4782 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4783 c == CHAR_FF || c == CHAR_CR)
4784 == prop_fail_result)
4785 RRETURN(MATCH_NOMATCH);
4786 }
4787 /* Control never gets here */
4788
4789 case PT_PXSPACE: /* POSIX space */
4790 for (fi = min;; fi++)
4791 {
4792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4794 if (fi >= max) RRETURN(MATCH_NOMATCH);
4795 if (eptr >= md->end_subject)
4796 {
4797 SCHECK_PARTIAL();
4798 RRETURN(MATCH_NOMATCH);
4799 }
4800 GETCHARINCTEST(c, eptr);
4801 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4802 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4803 == prop_fail_result)
4804 RRETURN(MATCH_NOMATCH);
4805 }
4806 /* Control never gets here */
4807
4808 case PT_WORD:
4809 for (fi = min;; fi++)
4810 {
4811 int category;
4812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4814 if (fi >= max) RRETURN(MATCH_NOMATCH);
4815 if (eptr >= md->end_subject)
4816 {
4817 SCHECK_PARTIAL();
4818 RRETURN(MATCH_NOMATCH);
4819 }
4820 GETCHARINCTEST(c, eptr);
4821 category = UCD_CATEGORY(c);
4822 if ((category == ucp_L ||
4823 category == ucp_N ||
4824 c == CHAR_UNDERSCORE)
4825 == prop_fail_result)
4826 RRETURN(MATCH_NOMATCH);
4827 }
4828 /* Control never gets here */
4829
4830 /* This should never occur */
4831
4832 default:
4833 RRETURN(PCRE_ERROR_INTERNAL);
4834 }
4835 }
4836
4837 /* Match extended Unicode sequences. We will get here only if the
4838 support is in the binary; otherwise a compile-time error occurs. */
4839
4840 else if (ctype == OP_EXTUNI)
4841 {
4842 for (fi = min;; fi++)
4843 {
4844 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4846 if (fi >= max) RRETURN(MATCH_NOMATCH);
4847 if (eptr >= md->end_subject)
4848 {
4849 SCHECK_PARTIAL();
4850 RRETURN(MATCH_NOMATCH);
4851 }
4852 GETCHARINCTEST(c, eptr);
4853 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4854 while (eptr < md->end_subject)
4855 {
4856 int len = 1;
4857 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4858 if (UCD_CATEGORY(c) != ucp_M) break;
4859 eptr += len;
4860 }
4861 }
4862 }
4863 else
4864 #endif /* SUPPORT_UCP */
4865
4866 #ifdef SUPPORT_UTF
4867 if (utf)
4868 {
4869 for (fi = min;; fi++)
4870 {
4871 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4873 if (fi >= max) RRETURN(MATCH_NOMATCH);
4874 if (eptr >= md->end_subject)
4875 {
4876 SCHECK_PARTIAL();
4877 RRETURN(MATCH_NOMATCH);
4878 }
4879 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4880 RRETURN(MATCH_NOMATCH);
4881 GETCHARINC(c, eptr);
4882 switch(ctype)
4883 {
4884 case OP_ANY: /* This is the non-NL case */
4885 case OP_ALLANY:
4886 case OP_ANYBYTE:
4887 break;
4888
4889 case OP_ANYNL:
4890 switch(c)
4891 {
4892 default: RRETURN(MATCH_NOMATCH);
4893 case 0x000d:
4894 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4895 break;
4896 case 0x000a:
4897 break;
4898
4899 case 0x000b:
4900 case 0x000c:
4901 case 0x0085:
4902 case 0x2028:
4903 case 0x2029:
4904 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4905 break;
4906 }
4907 break;
4908
4909 case OP_NOT_HSPACE:
4910 switch(c)
4911 {
4912 default: break;
4913 case 0x09: /* HT */
4914 case 0x20: /* SPACE */
4915 case 0xa0: /* NBSP */
4916 case 0x1680: /* OGHAM SPACE MARK */
4917 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4918 case 0x2000: /* EN QUAD */
4919 case 0x2001: /* EM QUAD */
4920 case 0x2002: /* EN SPACE */
4921 case 0x2003: /* EM SPACE */
4922 case 0x2004: /* THREE-PER-EM SPACE */
4923 case 0x2005: /* FOUR-PER-EM SPACE */
4924 case 0x2006: /* SIX-PER-EM SPACE */
4925 case 0x2007: /* FIGURE SPACE */
4926 case 0x2008: /* PUNCTUATION SPACE */
4927 case 0x2009: /* THIN SPACE */
4928 case 0x200A: /* HAIR SPACE */
4929 case 0x202f: /* NARROW NO-BREAK SPACE */
4930 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4931 case 0x3000: /* IDEOGRAPHIC SPACE */
4932 RRETURN(MATCH_NOMATCH);
4933 }
4934 break;
4935
4936 case OP_HSPACE:
4937 switch(c)
4938 {
4939 default: RRETURN(MATCH_NOMATCH);
4940 case 0x09: /* HT */
4941 case 0x20: /* SPACE */
4942 case 0xa0: /* NBSP */
4943 case 0x1680: /* OGHAM SPACE MARK */
4944 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4945 case 0x2000: /* EN QUAD */
4946 case 0x2001: /* EM QUAD */
4947 case 0x2002: /* EN SPACE */
4948 case 0x2003: /* EM SPACE */
4949 case 0x2004: /* THREE-PER-EM SPACE */
4950 case 0x2005: /* FOUR-PER-EM SPACE */
4951 case 0x2006: /* SIX-PER-EM SPACE */
4952 case 0x2007: /* FIGURE SPACE */
4953 case 0x2008: /* PUNCTUATION SPACE */
4954 case 0x2009: /* THIN SPACE */
4955 case 0x200A: /* HAIR SPACE */
4956 case 0x202f: /* NARROW NO-BREAK SPACE */
4957 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4958 case 0x3000: /* IDEOGRAPHIC SPACE */
4959 break;
4960 }
4961 break;
4962
4963 case OP_NOT_VSPACE:
4964 switch(c)
4965 {
4966 default: break;
4967 case 0x0a: /* LF */
4968 case 0x0b: /* VT */
4969 case 0x0c: /* FF */
4970 case 0x0d: /* CR */
4971 case 0x85: /* NEL */
4972 case 0x2028: /* LINE SEPARATOR */
4973 case 0x2029: /* PARAGRAPH SEPARATOR */
4974 RRETURN(MATCH_NOMATCH);
4975 }
4976 break;
4977
4978 case OP_VSPACE:
4979 switch(c)
4980 {
4981 default: RRETURN(MATCH_NOMATCH);
4982 case 0x0a: /* LF */
4983 case 0x0b: /* VT */
4984 case 0x0c: /* FF */
4985 case 0x0d: /* CR */
4986 case 0x85: /* NEL */
4987 case 0x2028: /* LINE SEPARATOR */
4988 case 0x2029: /* PARAGRAPH SEPARATOR */
4989 break;
4990 }
4991 break;
4992
4993 case OP_NOT_DIGIT:
4994 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4995 RRETURN(MATCH_NOMATCH);
4996 break;
4997
4998 case OP_DIGIT:
4999 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5000 RRETURN(MATCH_NOMATCH);
5001 break;
5002
5003 case OP_NOT_WHITESPACE:
5004 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5005 RRETURN(MATCH_NOMATCH);
5006 break;
5007
5008 case OP_WHITESPACE:
5009 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5010 RRETURN(MATCH_NOMATCH);
5011 break;
5012
5013 case OP_NOT_WORDCHAR:
5014 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5015 RRETURN(MATCH_NOMATCH);
5016 break;
5017
5018 case OP_WORDCHAR:
5019 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5020 RRETURN(MATCH_NOMATCH);
5021 break;
5022
5023 default:
5024 RRETURN(PCRE_ERROR_INTERNAL);
5025 }
5026 }
5027 }
5028 else
5029 #endif
5030 /* Not UTF mode */
5031 {
5032 for (fi = min;; fi++)
5033 {
5034 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5036 if (fi >= max) RRETURN(MATCH_NOMATCH);
5037 if (eptr >= md->end_subject)
5038 {
5039 SCHECK_PARTIAL();
5040 RRETURN(MATCH_NOMATCH);
5041 }
5042 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5043 RRETURN(MATCH_NOMATCH);
5044 c = *eptr++;
5045 switch(ctype)
5046 {
5047 case OP_ANY: /* This is the non-NL case */
5048 case OP_ALLANY:
5049 case OP_ANYBYTE:
5050 break;
5051
5052 case OP_ANYNL:
5053 switch(c)
5054 {
5055 default: RRETURN(MATCH_NOMATCH);
5056 case 0x000d:
5057 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5058 break;
5059
5060 case 0x000a:
5061 break;
5062
5063 case 0x000b:
5064 case 0x000c:
5065 case 0x0085:
5066 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5067 break;
5068 }
5069 break;
5070
5071 case OP_NOT_HSPACE:
5072 switch(c)
5073 {
5074 default: break;
5075 case 0x09: /* HT */
5076 case 0x20: /* SPACE */
5077 case 0xa0: /* NBSP */
5078 RRETURN(MATCH_NOMATCH);
5079 }
5080 break;
5081
5082 case OP_HSPACE:
5083 switch(c)
5084 {
5085 default: RRETURN(MATCH_NOMATCH);
5086 case 0x09: /* HT */
5087 case 0x20: /* SPACE */
5088 case 0xa0: /* NBSP */
5089 break;
5090 }
5091 break;
5092
5093 case OP_NOT_VSPACE:
5094 switch(c)
5095 {
5096 default: break;
5097 case 0x0a: /* LF */
5098 case 0x0b: /* VT */
5099 case 0x0c: /* FF */
5100 case 0x0d: /* CR */
5101 case 0x85: /* NEL */
5102 RRETURN(MATCH_NOMATCH);
5103 }
5104 break;
5105
5106 case OP_VSPACE:
5107 switch(c)
5108 {
5109 default: RRETURN(MATCH_NOMATCH);
5110 case 0x0a: /* LF */
5111 case 0x0b: /* VT */
5112 case 0x0c: /* FF */
5113 case 0x0d: /* CR */
5114 case 0x85: /* NEL */
5115 break;
5116 }
5117 break;
5118
5119 case OP_NOT_DIGIT:
5120 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5121 break;
5122
5123 case OP_DIGIT:
5124 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5125 break;
5126
5127 case OP_NOT_WHITESPACE:
5128 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5129 break;
5130
5131 case OP_WHITESPACE:
5132 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5133 break;
5134
5135 case OP_NOT_WORDCHAR:
5136 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5137 break;
5138
5139 case OP_WORDCHAR:
5140 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5141 break;
5142
5143 default:
5144 RRETURN(PCRE_ERROR_INTERNAL);
5145 }
5146 }
5147 }
5148 /* Control never gets here */
5149 }
5150
5151 /* If maximizing, it is worth using inline code for speed, doing the type
5152 test once at the start (i.e. keep it out of the loop). Again, keep the
5153 UTF-8 and UCP stuff separate. */
5154
5155 else
5156 {
5157 pp = eptr; /* Remember where we started */
5158
5159 #ifdef SUPPORT_UCP
5160 if (prop_type >= 0)
5161 {
5162 switch(prop_type)
5163 {
5164 case PT_ANY:
5165 for (i = min; i < max; i++)
5166 {
5167 int len = 1;
5168 if (eptr >= md->end_subject)
5169 {
5170 SCHECK_PARTIAL();
5171 break;
5172 }
5173 GETCHARLENTEST(c, eptr, len);
5174 if (prop_fail_result) break;
5175 eptr+= len;
5176 }
5177 break;
5178
5179 case PT_LAMP:
5180 for (i = min; i < max; i++)
5181 {
5182 int chartype;
5183 int len = 1;
5184 if (eptr >= md->end_subject)
5185 {
5186 SCHECK_PARTIAL();
5187 break;
5188 }
5189 GETCHARLENTEST(c, eptr, len);
5190 chartype = UCD_CHARTYPE(c);
5191 if ((chartype == ucp_Lu ||
5192 chartype == ucp_Ll ||
5193 chartype == ucp_Lt) == prop_fail_result)
5194 break;
5195 eptr+= len;
5196 }
5197 break;
5198
5199 case PT_GC:
5200 for (i = min; i < max; i++)
5201 {
5202 int len = 1;
5203 if (eptr >= md->end_subject)
5204 {
5205 SCHECK_PARTIAL();
5206 break;
5207 }
5208 GETCHARLENTEST(c, eptr, len);
5209 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5210 eptr+= len;
5211 }
5212 break;
5213
5214 case PT_PC:
5215 for (i = min; i < max; i++)
5216 {
5217 int len = 1;
5218 if (eptr >= md->end_subject)
5219 {
5220 SCHECK_PARTIAL();
5221 break;
5222 }
5223 GETCHARLENTEST(c, eptr, len);
5224 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5225 eptr+= len;
5226 }
5227 break;
5228
5229 case PT_SC:
5230 for (i = min; i < max; i++)
5231 {
5232 int len = 1;
5233 if (eptr >= md->end_subject)
5234 {
5235 SCHECK_PARTIAL();
5236 break;
5237 }
5238 GETCHARLENTEST(c, eptr, len);
5239 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5240 eptr+= len;
5241 }
5242 break;
5243
5244 case PT_ALNUM:
5245 for (i = min; i < max; i++)
5246 {
5247 int category;
5248 int len = 1;
5249 if (eptr >= md->end_subject)
5250 {
5251 SCHECK_PARTIAL();
5252 break;
5253 }
5254 GETCHARLENTEST(c, eptr, len);
5255 category = UCD_CATEGORY(c);
5256 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5257 break;
5258 eptr+= len;
5259 }
5260 break;
5261
5262 case PT_SPACE: /* Perl space */
5263 for (i = min; i < max; i++)
5264 {
5265 int len = 1;
5266 if (eptr >= md->end_subject)
5267 {
5268 SCHECK_PARTIAL();
5269 break;
5270 }
5271 GETCHARLENTEST(c, eptr, len);
5272 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5273 c == CHAR_FF || c == CHAR_CR)
5274 == prop_fail_result)
5275 break;
5276 eptr+= len;
5277 }
5278 break;
5279
5280 case PT_PXSPACE: /* POSIX space */
5281 for (i = min; i < max; i++)
5282 {
5283 int len = 1;
5284 if (eptr >= md->end_subject)
5285 {
5286 SCHECK_PARTIAL();
5287 break;
5288 }
5289 GETCHARLENTEST(c, eptr, len);
5290 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5291 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5292 == prop_fail_result)
5293 break;
5294 eptr+= len;
5295 }
5296 break;
5297
5298 case PT_WORD:
5299 for (i = min; i < max; i++)
5300 {
5301 int category;
5302 int len = 1;
5303 if (eptr >= md->end_subject)
5304 {
5305 SCHECK_PARTIAL();
5306 break;
5307 }
5308 GETCHARLENTEST(c, eptr, len);
5309 category = UCD_CATEGORY(c);
5310 if ((category == ucp_L || category == ucp_N ||
5311 c == CHAR_UNDERSCORE) == prop_fail_result)
5312 break;
5313 eptr+= len;
5314 }
5315 break;
5316
5317 default:
5318 RRETURN(PCRE_ERROR_INTERNAL);
5319 }
5320
5321 /* eptr is now past the end of the maximum run */
5322
5323 if (possessive) continue;
5324 for(;;)
5325 {
5326 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5327 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5328 if (eptr-- == pp) break; /* Stop if tried at original pos */
5329 if (utf) BACKCHAR(eptr);
5330 }
5331 }
5332
5333 /* Match extended Unicode sequences. We will get here only if the
5334 support is in the binary; otherwise a compile-time error occurs. */
5335
5336 else if (ctype == OP_EXTUNI)
5337 {
5338 for (i = min; i < max; i++)
5339 {
5340 int len = 1;
5341 if (eptr >= md->end_subject)
5342 {
5343 SCHECK_PARTIAL();
5344 break;
5345 }
5346 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5347 if (UCD_CATEGORY(c) == ucp_M) break;
5348 eptr += len;
5349 while (eptr < md->end_subject)
5350 {
5351 len = 1;
5352 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5353 if (UCD_CATEGORY(c) != ucp_M) break;
5354 eptr += len;
5355 }
5356 }
5357
5358 /* eptr is now past the end of the maximum run */
5359
5360 if (possessive) continue;
5361
5362 for(;;)
5363 {
5364 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5365 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5366 if (eptr-- == pp) break; /* Stop if tried at original pos */
5367 for (;;) /* Move back over one extended */
5368 {
5369 if (!utf) c = *eptr; else
5370 {
5371 BACKCHAR(eptr);
5372 GETCHAR(c, eptr);
5373 }
5374 if (UCD_CATEGORY(c) != ucp_M) break;
5375 eptr--;
5376 }
5377 }
5378 }
5379
5380 else
5381 #endif /* SUPPORT_UCP */
5382
5383 #ifdef SUPPORT_UTF
5384 if (utf)
5385 {
5386 switch(ctype)
5387 {
5388 case OP_ANY:
5389 if (max < INT_MAX)
5390 {
5391 for (i = min; i < max; i++)
5392 {
5393 if (eptr >= md->end_subject)
5394 {
5395 SCHECK_PARTIAL();
5396 break;
5397 }
5398 if (IS_NEWLINE(eptr)) break;
5399 eptr++;
5400 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5401 }
5402 }
5403
5404 /* Handle unlimited UTF-8 repeat */
5405
5406 else
5407 {
5408 for (i = min; i < max; i++)
5409 {
5410 if (eptr >= md->end_subject)
5411 {
5412 SCHECK_PARTIAL();
5413 break;
5414 }
5415 if (IS_NEWLINE(eptr)) break;
5416 eptr++;
5417 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5418 }
5419 }
5420 break;
5421
5422 case OP_ALLANY:
5423 if (max < INT_MAX)
5424 {
5425 for (i = min; i < max; i++)
5426 {
5427 if (eptr >= md->end_subject)
5428 {
5429 SCHECK_PARTIAL();
5430 break;
5431 }
5432 eptr++;
5433 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5434 }
5435 }
5436 else
5437 {
5438 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5439 SCHECK_PARTIAL();
5440 }
5441 break;
5442
5443 /* The byte case is the same as non-UTF8 */
5444
5445 case OP_ANYBYTE:
5446 c = max - min;
5447 if (c > (unsigned int)(md->end_subject - eptr))
5448 {
5449 eptr = md->end_subject;
5450 SCHECK_PARTIAL();
5451 }
5452 else eptr += c;
5453 break;
5454
5455 case OP_ANYNL:
5456 for (i = min; i < max; i++)
5457 {
5458 int len = 1;
5459 if (eptr >= md->end_subject)
5460 {
5461 SCHECK_PARTIAL();
5462 break;
5463 }
5464 GETCHARLEN(c, eptr, len);
5465 if (c == 0x000d)
5466 {
5467 if (++eptr >= md->end_subject) break;
5468 if (*eptr == 0x000a) eptr++;
5469 }
5470 else
5471 {
5472 if (c != 0x000a &&
5473 (md->bsr_anycrlf ||
5474 (c != 0x000b && c != 0x000c &&
5475 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5476 break;
5477 eptr += len;
5478 }
5479 }
5480 break;
5481
5482 case OP_NOT_HSPACE:
5483 case OP_HSPACE:
5484 for (i = min; i < max; i++)
5485 {
5486 BOOL gotspace;
5487 int len = 1;
5488 if (eptr >= md->end_subject)
5489 {
5490 SCHECK_PARTIAL();
5491 break;
5492 }
5493 GETCHARLEN(c, eptr, len);
5494 switch(c)
5495 {
5496 default: gotspace = FALSE; break;
5497 case 0x09: /* HT */
5498 case 0x20: /* SPACE */
5499 case 0xa0: /* NBSP */
5500 case 0x1680: /* OGHAM SPACE MARK */
5501 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5502 case 0x2000: /* EN QUAD */
5503 case 0x2001: /* EM QUAD */
5504 case 0x2002: /* EN SPACE */
5505 case 0x2003: /* EM SPACE */
5506 case 0x2004: /* THREE-PER-EM SPACE */
5507 case 0x2005: /* FOUR-PER-EM SPACE */
5508 case 0x2006: /* SIX-PER-EM SPACE */
5509 case 0x2007: /* FIGURE SPACE */
5510 case 0x2008: /* PUNCTUATION SPACE */
5511 case 0x2009: /* THIN SPACE */
5512 case 0x200A: /* HAIR SPACE */
5513 case 0x202f: /* NARROW NO-BREAK SPACE */
5514 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5515 case 0x3000: /* IDEOGRAPHIC SPACE */
5516 gotspace = TRUE;
5517 break;
5518 }
5519 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5520 eptr += len;
5521 }
5522 break;
5523
5524 case OP_NOT_VSPACE:
5525 case OP_VSPACE:
5526 for (i = min; i < max; i++)
5527 {
5528 BOOL gotspace;
5529 int len = 1;
5530 if (eptr >= md->end_subject)
5531 {
5532 SCHECK_PARTIAL();
5533 break;
5534 }
5535 GETCHARLEN(c, eptr, len);
5536 switch(c)
5537 {
5538 default: gotspace = FALSE; break;
5539 case 0x0a: /* LF */
5540 case 0x0b: /* VT */
5541 case 0x0c: /* FF */
5542 case 0x0d: /* CR */
5543 case 0x85: /* NEL */
5544 case 0x2028: /* LINE SEPARATOR */
5545 case 0x2029: /* PARAGRAPH SEPARATOR */
5546 gotspace = TRUE;
5547 break;
5548 }
5549 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5550 eptr += len;
5551 }
5552 break;
5553
5554 case OP_NOT_DIGIT:
5555 for (i = min; i < max; i++)
5556 {
5557 int len = 1;
5558 if (eptr >= md->end_subject)
5559 {
5560 SCHECK_PARTIAL();
5561 break;
5562 }
5563 GETCHARLEN(c, eptr, len);
5564 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5565 eptr+= len;
5566 }
5567 break;
5568
5569 case OP_DIGIT:
5570 for (i = min; i < max; i++)
5571 {
5572 int len = 1;
5573 if (eptr >= md->end_subject)
5574 {
5575 SCHECK_PARTIAL();
5576 break;
5577 }
5578 GETCHARLEN(c, eptr, len);
5579 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5580 eptr+= len;
5581 }
5582 break;
5583
5584 case OP_NOT_WHITESPACE:
5585 for (i = min; i < max; i++)
5586 {
5587 int len = 1;
5588 if (eptr >= md->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 GETCHARLEN(c, eptr, len);
5594 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5595 eptr+= len;
5596 }
5597 break;
5598
5599 case OP_WHITESPACE:
5600 for (i = min; i < max; i++)
5601 {
5602 int len = 1;
5603 if (eptr >= md->end_subject)
5604 {
5605 SCHECK_PARTIAL();
5606 break;
5607 }
5608 GETCHARLEN(c, eptr, len);
5609 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5610 eptr+= len;
5611 }
5612 break;
5613
5614 case OP_NOT_WORDCHAR:
5615 for (i = min; i < max; i++)
5616 {
5617 int len = 1;
5618 if (eptr >= md->end_subject)
5619 {
5620 SCHECK_PARTIAL();
5621 break;
5622 }
5623 GETCHARLEN(c, eptr, len);
5624 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5625 eptr+= len;
5626 }
5627 break;
5628
5629 case OP_WORDCHAR:
5630 for (i = min; i < max; i++)
5631 {
5632 int len = 1;
5633 if (eptr >= md->end_subject)
5634 {
5635 SCHECK_PARTIAL();
5636 break;
5637 }
5638 GETCHARLEN(c, eptr, len);
5639 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5640 eptr+= len;
5641 }
5642 break;
5643
5644 default:
5645 RRETURN(PCRE_ERROR_INTERNAL);
5646 }
5647
5648 /* eptr is now past the end of the maximum run. If possessive, we are
5649 done (no backing up). Otherwise, match at this position; anything other
5650 than no match is immediately returned. For nomatch, back up one
5651 character, unless we are matching \R and the last thing matched was
5652 \r\n, in which case, back up two bytes. */
5653
5654 if (possessive) continue;
5655 for(;;)
5656 {
5657 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5659 if (eptr-- == pp) break; /* Stop if tried at original pos */
5660 BACKCHAR(eptr);
5661 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5662 eptr[-1] == '\r') eptr--;
5663 }
5664 }
5665 else
5666 #endif /* SUPPORT_UTF */
5667 /* Not UTF mode */
5668 {
5669 switch(ctype)
5670 {
5671 case OP_ANY:
5672 for (i = min; i < max; i++)
5673 {
5674 if (eptr >= md->end_subject)
5675 {
5676 SCHECK_PARTIAL();
5677 break;
5678 }
5679 if (IS_NEWLINE(eptr)) break;
5680 eptr++;
5681 }
5682 break;
5683
5684 case OP_ALLANY:
5685 case OP_ANYBYTE:
5686 c = max - min;
5687 if (c > (unsigned int)(md->end_subject - eptr))
5688 {
5689 eptr = md->end_subject;
5690 SCHECK_PARTIAL();
5691 }
5692 else eptr += c;
5693 break;
5694
5695 case OP_ANYNL:
5696 for (i = min; i < max; i++)
5697 {
5698 if (eptr >= md->end_subject)
5699 {
5700 SCHECK_PARTIAL();
5701 break;
5702 }
5703 c = *eptr;
5704 if (c == 0x000d)
5705 {
5706 if (++eptr >= md->end_subject) break;
5707 if (*eptr == 0x000a) eptr++;
5708 }
5709 else
5710 {
5711 if (c != 0x000a &&
5712 (md->bsr_anycrlf ||
5713 (c != 0x000b && c != 0x000c && c != 0x0085)))
5714 break;
5715 eptr++;
5716 }
5717 }
5718 break;
5719
5720 case OP_NOT_HSPACE:
5721 for (i = min; i < max; i++)
5722 {
5723 if (eptr >= md->end_subject)
5724 {
5725 SCHECK_PARTIAL();
5726 break;
5727 }
5728 c = *eptr;
5729 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5730 eptr++;
5731 }
5732 break;
5733
5734 case OP_HSPACE:
5735 for (i = min; i < max; i++)
5736 {
5737 if (eptr >= md->end_subject)
5738 {
5739 SCHECK_PARTIAL();
5740 break;
5741 }
5742 c = *eptr;
5743 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5744 eptr++;
5745 }
5746 break;
5747
5748 case OP_NOT_VSPACE:
5749 for (i = min; i < max; i++)
5750 {
5751 if (eptr >= md->end_subject)
5752 {
5753 SCHECK_PARTIAL();
5754 break;
5755 }
5756 c = *eptr;
5757 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5758 break;
5759 eptr++;
5760 }
5761 break;
5762
5763 case OP_VSPACE:
5764 for (i = min; i < max; i++)
5765 {
5766 if (eptr >= md->end_subject)
5767 {
5768 SCHECK_PARTIAL();
5769 break;
5770 }
5771 c = *eptr;
5772 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5773 break;
5774 eptr++;
5775 }
5776 break;
5777
5778 case OP_NOT_DIGIT:
5779 for (i = min; i < max; i++)
5780 {
5781 if (eptr >= md->end_subject)
5782 {
5783 SCHECK_PARTIAL();
5784 break;
5785 }
5786 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5787 eptr++;
5788 }
5789 break;
5790
5791 case OP_DIGIT:
5792 for (i = min; i < max; i++)
5793 {
5794 if (eptr >= md->end_subject)
5795 {
5796 SCHECK_PARTIAL();
5797 break;
5798 }
5799 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5800 eptr++;
5801 }
5802 break;
5803
5804 case OP_NOT_WHITESPACE:
5805 for (i = min; i < max; i++)
5806 {
5807 if (eptr >= md->end_subject)
5808 {
5809 SCHECK_PARTIAL();
5810 break;
5811 }
5812 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5813 eptr++;
5814 }
5815 break;
5816
5817 case OP_WHITESPACE:
5818 for (i = min; i < max; i++)
5819 {
5820 if (eptr >= md->end_subject)
5821 {
5822 SCHECK_PARTIAL();
5823 break;
5824 }
5825 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5826 eptr++;
5827 }
5828 break;
5829
5830 case OP_NOT_WORDCHAR:
5831 for (i = min; i < max; i++)
5832 {
5833 if (eptr >= md->end_subject)
5834 {
5835 SCHECK_PARTIAL();
5836 break;
5837 }
5838 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5839 eptr++;
5840 }
5841 break;
5842
5843 case OP_WORDCHAR:
5844 for (i = min; i < max; i++)
5845 {
5846 if (eptr >= md->end_subject)
5847 {
5848 SCHECK_PARTIAL();
5849 break;
5850 }
5851 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
5852 eptr++;
5853 }
5854 break;
5855
5856 default:
5857 RRETURN(PCRE_ERROR_INTERNAL);
5858 }
5859
5860 /* eptr is now past the end of the maximum run. If possessive, we are
5861 done (no backing up). Otherwise, match at this position; anything other
5862 than no match is immediately returned. For nomatch, back up one
5863 character (byte), unless we are matching \R and the last thing matched
5864 was \r\n, in which case, back up two bytes. */
5865
5866 if (possessive) continue;
5867 while (eptr >= pp)
5868 {
5869 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5870 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5871 eptr--;
5872 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5873 eptr[-1] == '\r') eptr--;
5874 }
5875 }
5876
5877 /* Get here if we can't make it match with any permitted repetitions */
5878
5879 RRETURN(MATCH_NOMATCH);
5880 }
5881 /* Control never gets here */
5882
5883 /* There's been some horrible disaster. Arrival here can only mean there is
5884 something seriously wrong in the code above or the OP_xxx definitions. */
5885
5886 default:
5887 DPRINTF(("Unknown opcode %d\n", *ecode));
5888 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5889 }
5890
5891 /* Do not stick any code in here without much thought; it is assumed
5892 that "continue" in the code above comes out to here to repeat the main
5893 loop. */
5894
5895 } /* End of main loop */
5896 /* Control never reaches here */
5897
5898
5899 /* When compiling to use the heap rather than the stack for recursive calls to
5900 match(), the RRETURN() macro jumps here. The number that is saved in
5901 frame->Xwhere indicates which label we actually want to return to. */
5902
5903 #ifdef NO_RECURSE
5904 #define LBL(val) case val: goto L_RM##val;
5905 HEAP_RETURN:
5906 switch (frame->Xwhere)
5907 {
5908 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5909 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5910 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5911 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5912 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5913 LBL(65) LBL(66)
5914 #ifdef SUPPORT_UTF
5915 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5916 LBL(32) LBL(34) LBL(42) LBL(46)
5917 #ifdef SUPPORT_UCP
5918 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5919 LBL(59) LBL(60) LBL(61) LBL(62)
5920 #endif /* SUPPORT_UCP */
5921 #endif /* SUPPORT_UTF */
5922 default:
5923 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5924 return PCRE_ERROR_INTERNAL;
5925 }
5926 #undef LBL
5927 #endif /* NO_RECURSE */
5928 }
5929
5930
5931 /***************************************************************************
5932 ****************************************************************************
5933 RECURSION IN THE match() FUNCTION
5934
5935 Undefine all the macros that were defined above to handle this. */
5936
5937 #ifdef NO_RECURSE
/* The names in this guarded list are undefined only in the NO_RECURSE build;
their macro definitions are earlier in the file. Removing them matters because
later code in this file uses some of the same names (for example length, flags
and pp) as ordinary parameters and local variables. */
5938 #undef eptr
5939 #undef ecode
5940 #undef mstart
5941 #undef offset_top
5942 #undef eptrb
5943 #undef flags
5944
5945 #undef callpat
5946 #undef charptr
5947 #undef data
5948 #undef next
5949 #undef pp
5950 #undef prev
5951 #undef saved_eptr
5952
5953 #undef new_recursive
5954
5955 #undef cur_is_word
5956 #undef condition
5957 #undef prev_is_word
5958
5959 #undef ctype
5960 #undef length
5961 #undef max
5962 #undef min
5963 #undef number
5964 #undef offset
5965 #undef op
5966 #undef save_capture_last
5967 #undef save_offset1
5968 #undef save_offset2
5969 #undef save_offset3
5970 #undef stacksave
5971
5972 #undef newptrb
5973
5974 #endif /* NO_RECURSE */
5975
5976 /* fc and fi are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally */
5977
5978 #undef fc
5979 #undef fi
5980
5981 /***************************************************************************
5982 ***************************************************************************/
5983
5984
5985
5986 /*************************************************
5987 * Execute a Regular Expression *
5988 *************************************************/
5989
5990 /* This function applies a compiled re to a subject string and picks out
5991 portions of the string if it matches. Two elements in the vector are set for
5992 each substring: the offsets to the start and end of the substring.
5993
5994 Arguments:
5995 argument_re points to the compiled expression
5996 extra_data points to extra data or is NULL
5997 subject points to the subject string
5998 length length of subject string (may contain binary zeros)
5999 start_offset where to start in the subject string
6000 options option bits
6001 offsets points to a vector of ints to be filled in with offsets
6002 offsetcount the number of elements in the vector
6003
6004 Returns: > 0 => success; value is the number of elements filled in
6005 = 0 => success, but offsets is not big enough
6006 -1 => failed to match
6007 < -1 => some kind of unexpected problem
6008 */
6009
6010 #ifdef COMPILE_PCRE8
6011 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6012 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6013 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6014 int offsetcount)
6015 #else
6016 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6017 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
6018 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6019 int offsetcount)
6020 #endif
6021 {
6022 int rc, ocount, arg_offset_max;
6023 int newline;
6024 BOOL using_temporary_offsets = FALSE;
6025 BOOL anchored;
6026 BOOL startline;
6027 BOOL firstline;
6028 BOOL utf;
6029 BOOL has_first_char = FALSE;
6030 BOOL has_req_char = FALSE;
6031 pcre_uchar first_char = 0;
6032 pcre_uchar first_char2 = 0;
6033 pcre_uchar req_char = 0;
6034 pcre_uchar req_char2 = 0;
6035 match_data match_block;
6036 match_data *md = &match_block;
6037 const pcre_uint8 *tables;
6038 const pcre_uint8 *start_bits = NULL;
6039 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6040 PCRE_PUCHAR end_subject;
6041 PCRE_PUCHAR start_partial = NULL;
6042 PCRE_PUCHAR req_char_ptr = start_match - 1;
6043
6044 const pcre_study_data *study;
6045 const real_pcre *external_re = (const real_pcre *)argument_re;
6046 const real_pcre *re = external_re;
6047
6048 /* Plausibility checks */
6049
6050 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6051 if (re == NULL || subject == NULL ||
6052 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6053 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6054 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6055
6056 /* These two settings are used in the code for checking a UTF-8 string that
6057 follows immediately afterwards. Other values in the md block are used only
6058 during "normal" pcre_exec() processing, not when the JIT support is in use,
6059 so they are set up later. */
6060
6061 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6062 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6063 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6064 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6065
6066 /* Check a UTF-8 string if required. Pass back the character offset and error
6067 code for an invalid string if a results vector is available. */
6068
6069 #ifdef SUPPORT_UTF
6070 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6071 {
6072 int erroroffset;
6073 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6074 if (errorcode != 0)
6075 {
6076 if (offsetcount >= 2)
6077 {
6078 offsets[0] = erroroffset;
6079 offsets[1] = errorcode;
6080 }
6081 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6082 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6083 }
6084
6085 /* Check that a start_offset points to the start of a UTF character. */
6086 if (start_offset > 0 && start_offset < length &&
6087 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6088 return PCRE_ERROR_BADUTF8_OFFSET;
6089 }
6090 #endif
6091
6092 /* If the pattern was successfully studied with JIT support, run the JIT
6093 executable instead of the rest of this function. Most options must be set at
6094 compile time for the JIT code to be usable. Fallback to the normal code path if
6095 an unsupported flag is set. In particular, JIT does not support partial
6096 matching. */
6097
6098 #ifdef SUPPORT_JIT
6099 if (extra_data != NULL
6100 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6101 && extra_data->executable_jit != NULL
6102 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6103 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6104 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6105 return PRIV(jit_exec)(re, extra_data->executable_jit,
6106 (const pcre_uchar *)subject, length, start_offset, options,
6107 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6108 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6109 #endif
6110
6111 /* Carry on with non-JIT matching. This information is for finding all the
6112 numbers associated with a given name, for condition testing. */
6113
6114 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6115 md->name_count = re->name_count;
6116 md->name_entry_size = re->name_entry_size;
6117
6118 /* Fish out the optional data from the extra_data structure, first setting
6119 the default values. */
6120
6121 study = NULL;
6122 md->match_limit = MATCH_LIMIT;
6123 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6124 md->callout_data = NULL;
6125
6126 /* The table pointer is always in native byte order. */
6127
6128 tables = external_re->tables;
6129
6130 if (extra_data != NULL)
6131 {
6132 register unsigned int flags = extra_data->flags;
6133 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6134 study = (const pcre_study_data *)extra_data->study_data;
6135 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6136 md->match_limit = extra_data->match_limit;
6137 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6138 md->match_limit_recursion = extra_data->match_limit_recursion;
6139 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6140 md->callout_data = extra_data->callout_data;
6141 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6142 }
6143
6144 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6145 is a feature that makes it possible to save compiled regex and re-use them
6146 in other programs later. */
6147
6148 if (tables == NULL) tables = PRIV(default_tables);
6149
6150 /* Check that the first field in the block is the magic number. If it is not,
6151 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6152 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6153 means that the pattern is likely compiled with different endianness. */
6154
6155 if (re->magic_number != MAGIC_NUMBER)
6156 return re->magic_number == REVERSED_MAGIC_NUMBER?
6157 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6158 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6159
6160 /* Set up other data */
6161
6162 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6163 startline = (re->flags & PCRE_STARTLINE) != 0;
6164 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6165
6166 /* The code starts after the real_pcre block and the capture name table. */
6167
6168 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6169 re->name_count * re->name_entry_size;
6170
6171 md->start_subject = (PCRE_PUCHAR)subject;
6172 md->start_offset = start_offset;
6173 md->end_subject = md->start_subject + length;
6174 end_subject = md->end_subject;
6175
6176 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6177 md->use_ucp = (re->options & PCRE_UCP) != 0;
6178 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6179 md->ignore_skip_arg = FALSE;
6180
6181 /* Some options are unpacked into BOOL variables in the hope that testing
6182 them will be faster than individual option bits. */
6183
6184 md->notbol = (options & PCRE_NOTBOL) != 0;
6185 md->noteol = (options & PCRE_NOTEOL) != 0;
6186 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6187 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6188
6189 md->hitend = FALSE;
6190 md->mark = md->nomatch_mark = NULL; /* In case never set */
6191
6192 md->recursive = NULL; /* No recursion at top level */
6193 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6194
6195 md->lcc = tables + lcc_offset;
6196 md->fcc = tables + fcc_offset;
6197 md->ctypes = tables + ctypes_offset;
6198
6199 /* Handle different \R options. */
6200
6201 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6202 {
6203 case 0:
6204 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6205 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6206 else
6207 #ifdef BSR_ANYCRLF
6208 md->bsr_anycrlf = TRUE;
6209 #else
6210 md->bsr_anycrlf = FALSE;
6211 #endif
6212 break;
6213
6214 case PCRE_BSR_ANYCRLF:
6215 md->bsr_anycrlf = TRUE;
6216 break;
6217
6218 case PCRE_BSR_UNICODE:
6219 md->bsr_anycrlf = FALSE;
6220 break;
6221
6222 default: return PCRE_ERROR_BADNEWLINE;
6223 }
6224
6225 /* Handle different types of newline. The three bits give eight cases. If
6226 nothing is set at run time, whatever was used at compile time applies. */
6227
6228 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6229 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6230 {
6231 case 0: newline = NEWLINE; break; /* Compile-time default */
6232 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6233 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6234 case PCRE_NEWLINE_CR+
6235 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6236 case PCRE_NEWLINE_ANY: newline = -1; break;
6237 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6238 default: return PCRE_ERROR_BADNEWLINE;
6239 }
6240
6241 if (newline == -2)
6242 {
6243 md->nltype = NLTYPE_ANYCRLF;
6244 }
6245 else if (newline < 0)
6246 {
6247 md->nltype = NLTYPE_ANY;
6248 }
6249 else
6250 {
6251 md->nltype = NLTYPE_FIXED;
6252 if (newline > 255)
6253 {
6254 md->nllen = 2;
6255 md->nl[0] = (newline >> 8) & 255;
6256 md->nl[1] = newline & 255;
6257 }
6258 else
6259 {
6260 md->nllen = 1;
6261 md->nl[0] = newline;
6262 }
6263 }
6264
6265 /* Partial matching was originally supported only for a restricted set of
6266 regexes; from release 8.00 there are no restrictions, but the bits are still
6267 defined (though never set). So there's no harm in leaving this code. */
6268
6269 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6270 return PCRE_ERROR_BADPARTIAL;
6271
6272 /* If the expression has got more back references than the offsets supplied can
6273 hold, we get a temporary chunk of working store to use during the matching.
6274 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6275 of 3. */
6276
6277 ocount = offsetcount - (offsetcount % 3);
6278 arg_offset_max = (2*ocount)/3;
6279
6280 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6281 {
6282 ocount = re->top_backref * 3 + 3;
6283 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6284 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6285 using_temporary_offsets = TRUE;
6286 DPRINTF(("Got memory to hold back references\n"));
6287 }
6288 else md->offset_vector = offsets;
6289
6290 md->offset_end = ocount;
6291 md->offset_max = (2*ocount)/3;
6292 md->offset_overflow = FALSE;
6293 md->capture_last = -1;
6294
6295 /* Reset the working variable associated with each extraction. These should
6296 never be used unless previously set, but they get saved and restored, and so we
6297 initialize them to avoid reading uninitialized locations. Also, unset the
6298 offsets for the matched string. This is really just for tidiness with callouts,
6299 in case they inspect these fields. */
6300
6301 if (md->offset_vector != NULL)
6302 {
6303 register int *iptr = md->offset_vector + ocount;
6304 register int *iend = iptr - re->top_bracket;
6305 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6306 while (--iptr >= iend) *iptr = -1;
6307 md->offset_vector[0] = md->offset_vector[1] = -1;
6308 }
6309
6310 /* Set up the first character to match, if available. The first_char value is
6311 never set for an anchored regular expression, but the anchoring may be forced
6312 at run time, so we have to test for anchoring. The first char may be unset for
6313 an unanchored pattern, of course. If there's no first char and the pattern was
6314 studied, there may be a bitmap of possible first characters. */
6315
6316 if (!anchored)
6317 {
6318 if ((re->flags & PCRE_FIRSTSET) != 0)
6319 {
6320 has_first_char = TRUE;
6321 first_char = first_char2 = re->first_char;
6322 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6323 {
6324 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6325 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6326 if (utf && first_char > 127)
6327 first_char2 = UCD_OTHERCASE(first_char);
6328 #endif
6329 }
6330 }
6331 else
6332 if (!startline && study != NULL &&
6333 (study->flags & PCRE_STUDY_MAPPED) != 0)
6334 start_bits = study->start_bits;
6335 }
6336
6337 /* For anchored or unanchored matches, there may be a "last known required
6338 character" set. */
6339
6340 if ((re->flags & PCRE_REQCHSET) != 0)
6341 {
6342 has_req_char = TRUE;
6343 req_char = req_char2 = re->req_char;
6344 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6345 {
6346 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6347 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6348 if (utf && req_char > 127)
6349 req_char2 = UCD_OTHERCASE(req_char);
6350 #endif
6351 }
6352 }
6353
6354
6355 /* ==========================================================================*/
6356
6357 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6358 the loop runs just once. */
6359
6360 for(;;)
6361 {
6362 PCRE_PUCHAR save_end_subject = end_subject;
6363 PCRE_PUCHAR new_start_match;
6364
6365 /* If firstline is TRUE, the start of the match is constrained to the first
6366 line of a multiline string. That is, the match must be before or at the first
6367 newline. Implement this by temporarily adjusting end_subject so that we stop
6368 scanning at a newline. If the match fails at the newline, later code breaks
6369 this loop. */
6370
6371 if (firstline)
6372 {
6373 PCRE_PUCHAR t = start_match;
6374 #ifdef SUPPORT_UTF
6375 if (utf)
6376 {
6377 while (t < md->end_subject && !IS_NEWLINE(t))
6378 {
6379 t++;
6380 ACROSSCHAR(t < end_subject, *t, t++);
6381 }
6382 }
6383 else
6384 #endif
6385 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6386 end_subject = t;
6387 }
6388
6389 /* There are some optimizations that avoid running the match if a known
6390 starting point is not found, or if a known later character is not present.
6391 However, there is an option that disables these, for testing and for ensuring
6392 that all callouts do actually occur. The option can be set in the regex by
6393 (*NO_START_OPT) or passed in match-time options. */
6394
6395 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6396 {
6397 /* Advance to a unique first char if there is one. */
6398
6399 if (has_first_char)
6400 {
6401 if (first_char != first_char2)
6402 while (start_match < end_subject &&
6403 *start_match != first_char && *start_match != first_char2)
6404 start_match++;
6405 else
6406 while (start_match < end_subject && *start_match != first_char)
6407 start_match++;
6408 }
6409
6410 /* Or to just after a linebreak for a multiline match */
6411
6412 else if (startline)
6413 {
6414 if (start_match > md->start_subject + start_offset)
6415 {
6416 #ifdef SUPPORT_UTF
6417 if (utf)
6418 {
6419 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6420 {
6421 start_match++;
6422 ACROSSCHAR(start_match < end_subject, *start_match,
6423 start_match++);
6424 }
6425 }
6426 else
6427 #endif
6428 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6429 start_match++;
6430
6431 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6432 and we are now at a LF, advance the match position by one more character.
6433 */
6434
6435 if (start_match[-1] == CHAR_CR &&
6436 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6437 start_match < end_subject &&
6438 *start_match == CHAR_NL)
6439 start_match++;
6440 }
6441 }
6442
6443 /* Or to a non-unique first byte after study */
6444
6445 else if (start_bits != NULL)
6446 {
6447 while (start_match < end_subject)
6448 {
6449 register unsigned int c = *start_match;
6450 #ifndef COMPILE_PCRE8
6451 if (c > 255) c = 255;
6452 #endif
6453 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6454 {
6455 start_match++;
6456 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6457 /* In non 8-bit mode, the iteration will stop for
6458 characters > 255 at the beginning or not stop at all. */
6459 if (utf)
6460 ACROSSCHAR(start_match < end_subject, *start_match,
6461 start_match++);
6462 #endif
6463 }
6464 else break;
6465 }
6466 }
6467 } /* Starting optimizations */
6468
6469 /* Restore fudged end_subject */
6470
6471 end_subject = save_end_subject;
6472
6473 /* The following two optimizations are disabled for partial matching or if
6474 disabling is explicitly requested. */
6475
6476 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6477 {
6478 /* If the pattern was studied, a minimum subject length may be set. This is
6479 a lower bound; it is possible that no string of that length matches the
6480 pattern. Although the value is, strictly, in characters, we treat it as
6481 bytes to avoid spending too much time in this optimization. */
6482
6483 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6484 (pcre_uint32)(end_subject - start_match) < study->minlength)
6485 {
6486 rc = MATCH_NOMATCH;
6487 break;
6488 }
6489
6490 /* If req_char is set, we know that that character must appear in the
6491 subject for the match to succeed. If the first character is set, req_char
6492 must be later in the subject; otherwise the test starts at the match point.
6493 This optimization can save a huge amount of backtracking in patterns with
6494 nested unlimited repeats that aren't going to match. Writing separate code
6495 for cased/caseless versions makes it go faster, as does using an
6496 autoincrement and backing off on a match.
6497
6498 HOWEVER: when the subject string is very, very long, searching to its end
6499 can take a long time, and give bad performance on quite ordinary patterns.
6500 This showed up when somebody was matching something like /^\d+C/ on a
6501 32-megabyte string... so we don't do this when the string is sufficiently
6502 long. */
6503
6504 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6505 {
6506 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6507
6508 /* We don't need to repeat the search if we haven't yet reached the
6509 place we found it at last time. */
6510
6511 if (p > req_char_ptr)
6512 {
6513 if (req_char != req_char2)
6514 {
6515 while (p < end_subject)
6516 {
6517 register int pp = *p++;
6518 if (pp == req_char || pp == req_char2) { p--; break; }
6519 }
6520 }
6521 else
6522 {
6523 while (p < end_subject)
6524 {
6525 if (*p++ == req_char) { p--; break; }
6526 }
6527 }
6528
6529 /* If we can't find the required character, break the matching loop,
6530 forcing a match failure. */
6531
6532 if (p >= end_subject)
6533 {
6534 rc = MATCH_NOMATCH;
6535 break;
6536 }
6537
6538 /* If we have found the required character, save the point where we
6539 found it, so that we don't search again next time round the loop if
6540 the start hasn't passed this character yet. */
6541
6542 req_char_ptr = p;
6543 }
6544 }
6545 }
6546
6547 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6548 printf(">>>> Match against: ");
6549 pchars(start_match, end_subject - start_match, TRUE, md);
6550 printf("\n");
6551 #endif
6552
6553 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6554 first starting point for which a partial match was found. */
6555
6556 md->start_match_ptr = start_match;
6557 md->start_used_ptr = start_match;
6558 md->match_call_count = 0;
6559 md->match_function_type = 0;
6560 md->end_offset_top = 0;
6561 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6562 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6563
6564 switch(rc)
6565 {
6566 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6567 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6568 entirely. The only way we can do that is to re-do the match at the same
6569 point, with a flag to force SKIP with an argument to be ignored. Just
6570 treating this case as NOMATCH does not work because it does not check other
6571 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6572
6573 case MATCH_SKIP_ARG:
6574 new_start_match = start_match;
6575 md->ignore_skip_arg = TRUE;
6576 break;
6577
6578 /* SKIP passes back the next starting point explicitly, but if it is the
6579 same as the match we have just done, treat it as NOMATCH. */
6580
6581 case MATCH_SKIP:
6582 if (md->start_match_ptr != start_match)
6583 {
6584 new_start_match = md->start_match_ptr;
6585 break;
6586 }
6587 /* Fall through */
6588
6589 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6590 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6591
6592 case MATCH_NOMATCH:
6593 case MATCH_PRUNE:
6594 case MATCH_THEN:
6595 md->ignore_skip_arg = FALSE;
6596 new_start_match = start_match + 1;
6597 #ifdef SUPPORT_UTF
6598 if (utf)
6599 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6600 new_start_match++);
6601 #endif
6602 break;
6603
6604 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6605
6606 case MATCH_COMMIT:
6607 rc = MATCH_NOMATCH;
6608 goto ENDLOOP;
6609
6610 /* Any other return is either a match, or some kind of error. */
6611
6612 default:
6613 goto ENDLOOP;
6614 }
6615
6616 /* Control reaches here for the various types of "no match at this point"
6617 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6618
6619 rc = MATCH_NOMATCH;
6620
6621 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6622 newline in the subject (though it may continue over the newline). Therefore,
6623 if we have just failed to match, starting at a newline, do not continue. */
6624
6625 if (firstline && IS_NEWLINE(start_match)) break;
6626
6627 /* Advance to new matching position */
6628
6629 start_match = new_start_match;
6630
6631 /* Break the loop if the pattern is anchored or if we have passed the end of
6632 the subject. */
6633
6634 if (anchored || start_match > end_subject) break;
6635
6636 /* If we have just passed a CR and we are now at a LF, and the pattern does
6637 not contain any explicit matches for \r or \n, and the newline option is CRLF
6638 or ANY or ANYCRLF, advance the match position by one more character. In
6639 normal matching start_match will always be greater than the first position at
6640 this stage, but a failed *SKIP can cause a return at the same point, which is
6641 why the first test exists. */
6642
6643 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
6644 start_match[-1] == CHAR_CR &&
6645 start_match < end_subject &&
6646 *start_match == CHAR_NL &&
6647 (re->flags & PCRE_HASCRORLF) == 0 &&
6648 (md->nltype == NLTYPE_ANY ||
6649 md->nltype == NLTYPE_ANYCRLF ||
6650 md->nllen == 2))
6651 start_match++;
6652
6653 md->mark = NULL; /* Reset for start of next match attempt */
6654 } /* End of for(;;) "bumpalong" loop */
6655
6656 /* ==========================================================================*/
6657
6658 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6659 conditions is true:
6660
6661 (1) The pattern is anchored or the match was failed by (*COMMIT);
6662
6663 (2) We are past the end of the subject;
6664
6665 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6666 this option requests that a match occur at or before the first newline in
6667 the subject.
6668
6669 When we have a match and the offset vector is big enough to deal with any
6670 backreferences, captured substring offsets will already be set up. In the case
6671 where we had to get some local store to hold offsets for backreference
6672 processing, copy those that we can. In this case there need not be overflow if
6673 certain parts of the pattern were not used, even though there are more
6674 capturing parentheses than vector slots. */
6675
6676 ENDLOOP:
6677
6678 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6679 {
6680 if (using_temporary_offsets)
6681 {
6682 if (arg_offset_max >= 4)
6683 {
6684 memcpy(offsets + 2, md->offset_vector + 2,
6685 (arg_offset_max - 2) * sizeof(int));
6686 DPRINTF(("Copied offsets from temporary memory\n"));
6687 }
6688 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6689 DPRINTF(("Freeing temporary memory\n"));
6690 (PUBL(free))(md->offset_vector);
6691 }
6692
6693 /* Set the return code to the number of captured strings, or 0 if there were
6694 too many to fit into the vector. */
6695
6696 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6697 0 : md->end_offset_top/2;
6698
6699 /* If there is space in the offset vector, set any unused pairs at the end of
6700 the pattern to -1 for backwards compatibility. It is documented that this
6701 happens. In earlier versions, the whole set of potential capturing offsets
6702 was set to -1 each time round the loop, but this is handled differently now.
6703 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6704 those at the end that need unsetting here. We can't just unset them all at
6705 the start of the whole thing because they may get set in one branch that is
6706 not the final matching branch. */
6707
6708 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6709 {
6710 register int *iptr, *iend;
6711 int resetcount = 2 + re->top_bracket * 2;
6712 if (resetcount > offsetcount) resetcount = ocount;
6713 iptr = offsets + md->end_offset_top;
6714 iend = offsets + resetcount;
6715 while (iptr < iend) *iptr++ = -1;
6716 }
6717
6718 /* If there is space, set up the whole thing as substring 0. The value of
6719 md->start_match_ptr might be modified if \K was encountered on the successful
6720 matching path. */
6721
6722 if (offsetcount < 2) rc = 0; else
6723 {
6724 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6725 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6726 }
6727
6728 /* Return MARK data if requested */
6729
6730 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6731 *(extra_data->mark) = (unsigned char *)(md->mark);
6732 DPRINTF((">>>> returning %d\n", rc));
6733 return rc;
6734 }
6735
6736 /* Control gets here if there has been an error, or if the overall match
6737 attempt has failed at all permitted starting positions. */
6738
6739 if (using_temporary_offsets)
6740 {
6741 DPRINTF(("Freeing temporary memory\n"));
6742 (PUBL(free))(md->offset_vector);
6743 }
6744
6745 /* For anything other than nomatch or partial match, just return the code. */
6746