/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 825 - (show annotations)
Mon Dec 26 21:23:17 2011 UTC (7 years, 7 months ago) by zherczeg
File MIME type: text/plain
File size: 210935 byte(s)
Supporting all newlines, horizontal and vertical spaces in 16 bit mode
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 {
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 if (eptr + length > md->end_subject) return -1;
221 while (length-- > 0) if (*p++ != *eptr++) return -1;
222 }
223
224 return (int)(eptr - eptr_start);
225 }
226
227
228
229 /***************************************************************************
230 ****************************************************************************
231 RECURSION IN THE match() FUNCTION
232
233 The match() function is highly recursive, though not every recursive call
234 increases the recursive depth. Nevertheless, some regular expressions can cause
235 it to recurse to a great depth. I was writing for Unix, so I just let it call
236 itself recursively. This uses the stack for saving everything that has to be
237 saved for a recursive call. On Unix, the stack can be large, and this works
238 fine.
239
240 It turns out that on some non-Unix-like systems there are problems with
241 programs that use a lot of stack. (This despite the fact that every last chip
242 has oodles of memory these days, and techniques for extending the stack have
243 been known for decades.) So....
244
245 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246 calls by keeping local variables that need to be preserved in blocks of memory
247 obtained from malloc() instead of on the stack. Macros are used to
248 achieve this so that the actual code doesn't look very different to what it
249 always used to.
250
251 The original heap-recursive code used longjmp(). However, it seems that this
252 can be very slow on some operating systems. Following a suggestion from Stan
253 Switzer, the use of longjmp() has been abolished, at the cost of having to
254 provide a unique number for each call to RMATCH. There is no way of generating
255 a sequence of numbers at compile time in C. I have given them names, to make
256 them stand out more clearly.
257
258 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 tests. Furthermore, not using longjmp() means that local dynamic variables
261 don't have indeterminate values; this has meant that the frame size can be
262 reduced because the result can be "passed back" by straight setting of the
263 variable instead of being passed in the frame.
264 ****************************************************************************
265 ***************************************************************************/
266
267 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268 below must be updated in sync. */
269
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1 to 66. The first enumerator of each row carries its value explicitly as
a reading aid; the values are the same as with plain consecutive numbering. */

enum {
  RM1  = 1,  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10, RM11,
  RM12 = 12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, RM21, RM22,
  RM23 = 23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32, RM33,
  RM34 = 34, RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44,
  RM45 = 45, RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54, RM55,
  RM56 = 56, RM57, RM58, RM59, RM60, RM61, RM62, RM63, RM64, RM65, RM66
};
277
278 /* These versions of the macros use the stack, as normal. There are debugging
279 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 actually used in this definition. */
281
282 #ifndef NO_RECURSE
283 #define REGISTER register
284
285 #ifdef PCRE_DEBUG
286 #define RMATCH(ra,rb,rc,rd,re,rw) \
287 { \
288 printf("match() called in line %d\n", __LINE__); \
289 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
290 printf("to line %d\n", __LINE__); \
291 }
292 #define RRETURN(ra) \
293 { \
294 printf("match() returned %d from line %d ", ra, __LINE__); \
295 return ra; \
296 }
297 #else
298 #define RMATCH(ra,rb,rc,rd,re,rw) \
299 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
300 #define RRETURN(ra) return ra
301 #endif
302
303 #else
304
305
306 /* These versions of the macros manage a private stack on the heap. Note that
307 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308 argument of match(), which never changes. */
309
310 #define REGISTER
311
312 #define RMATCH(ra,rb,rc,rd,re,rw)\
313 {\
314 heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
315 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
316 frame->Xwhere = rw; \
317 newframe->Xeptr = ra;\
318 newframe->Xecode = rb;\
319 newframe->Xmstart = mstart;\
320 newframe->Xoffset_top = rc;\
321 newframe->Xeptrb = re;\
322 newframe->Xrdepth = frame->Xrdepth + 1;\
323 newframe->Xprevframe = frame;\
324 frame = newframe;\
325 DPRINTF(("restarting from line %d\n", __LINE__));\
326 goto HEAP_RECURSE;\
327 L_##rw:\
328 DPRINTF(("jumped back to line %d\n", __LINE__));\
329 }
330
/* Heap-frame version of "return". The current frame is released and "frame"
steps back to its predecessor. If a predecessor exists, the result is placed
in rrc and control jumps to HEAP_RETURN; otherwise ra is returned from the
function itself.

A NULL frame is tolerated, so the macro is safe to use when the allocation of
the very first frame has failed: nothing is freed and ra is simply returned.

Note that ra is evaluated after the frame has been released, so it must not
refer to anything that is stored in the frame. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (PUBL(stack_free))(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
343
344
345 /* Structure for remembering the local variables in a private frame */
346
347 typedef struct heapframe {
348 struct heapframe *Xprevframe;
349
350 /* Function arguments that may change */
351
352 PCRE_PUCHAR Xeptr;
353 const pcre_uchar *Xecode;
354 PCRE_PUCHAR Xmstart;
355 int Xoffset_top;
356 eptrblock *Xeptrb;
357 unsigned int Xrdepth;
358
359 /* Function local variables */
360
361 PCRE_PUCHAR Xcallpat;
362 #ifdef SUPPORT_UTF
363 PCRE_PUCHAR Xcharptr;
364 #endif
365 PCRE_PUCHAR Xdata;
366 PCRE_PUCHAR Xnext;
367 PCRE_PUCHAR Xpp;
368 PCRE_PUCHAR Xprev;
369 PCRE_PUCHAR Xsaved_eptr;
370
371 recursion_info Xnew_recursive;
372
373 BOOL Xcur_is_word;
374 BOOL Xcondition;
375 BOOL Xprev_is_word;
376
377 #ifdef SUPPORT_UCP
378 int Xprop_type;
379 int Xprop_value;
380 int Xprop_fail_result;
381 int Xoclength;
382 pcre_uchar Xocchars[6];
383 #endif
384
385 int Xcodelink;
386 int Xctype;
387 unsigned int Xfc;
388 int Xfi;
389 int Xlength;
390 int Xmax;
391 int Xmin;
392 int Xnumber;
393 int Xoffset;
394 int Xop;
395 int Xsave_capture_last;
396 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397 int Xstacksave[REC_STACK_SAVE_MAX];
398
399 eptrblock Xnewptrb;
400
401 /* Where to jump back to */
402
403 int Xwhere;
404
405 } heapframe;
406
407 #endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414
415 /*************************************************
416 * Match from current position *
417 *************************************************/
418
419 /* This function is called recursively in many circumstances. Whenever it
420 returns a negative (error) response, the outer incarnation must also return the
421 same response. */
422
423 /* These macros pack up tests that are used for partial matching, and which
424 appear several times in the code. We set the "hit end" flag if the pointer is
425 at the end of the subject and also past the start of the subject (i.e.
426 something has been matched). For hard partial matching, we then return
427 immediately. The second one is used when we already know we are past the end of
428 the subject. */
429
430 #define CHECK_PARTIAL()\
431 if (md->partial != 0 && eptr >= md->end_subject && \
432 eptr > md->start_used_ptr) \
433 { \
434 md->hitend = TRUE; \
435 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
436 }
437
438 #define SCHECK_PARTIAL()\
439 if (md->partial != 0 && eptr > md->start_used_ptr) \
440 { \
441 md->hitend = TRUE; \
442 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
443 }
444
445
446 /* Performance note: It might be tempting to extract commonly used fields from
447 the md structure (e.g. utf, end_subject) into individual variables to improve
448 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449 made performance worse.
450
451 Arguments:
452 eptr pointer to current character in subject
453 ecode pointer to current position in compiled code
454 mstart pointer to the current match start position (can be modified
455 by encountering \K)
456 offset_top current top pointer
457 md pointer to "static" info for the match
458 eptrb pointer to chain of blocks containing eptr at start of
459 brackets - for testing for empty matches
460 rdepth the recursion depth
461
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
467 */
468
469 static int
470 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 unsigned int rdepth)
473 {
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
477
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf; /* Local copy of UTF flag for speed */
482
483 BOOL minimize, possessive; /* Quantifier options */
484 BOOL caseless;
485 int condcode;
486
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame" which is obtained from
489 heap storage. Set up the top-level frame here; others are obtained from the
490 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491
492 #ifdef NO_RECURSE
493 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
494 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 frame->Xprevframe = NULL; /* Marks the top level */
496
497 /* Copy in the original argument variables */
498
499 frame->Xeptr = eptr;
500 frame->Xecode = ecode;
501 frame->Xmstart = mstart;
502 frame->Xoffset_top = offset_top;
503 frame->Xeptrb = eptrb;
504 frame->Xrdepth = rdepth;
505
506 /* This is where control jumps back to to effect "recursion" */
507
508 HEAP_RECURSE:
509
510 /* Macros make the argument variables come from the current frame */
511
512 #define eptr frame->Xeptr
513 #define ecode frame->Xecode
514 #define mstart frame->Xmstart
515 #define offset_top frame->Xoffset_top
516 #define eptrb frame->Xeptrb
517 #define rdepth frame->Xrdepth
518
519 /* Ditto for the local variables */
520
521 #ifdef SUPPORT_UTF
522 #define charptr frame->Xcharptr
523 #endif
524 #define callpat frame->Xcallpat
525 #define codelink frame->Xcodelink
526 #define data frame->Xdata
527 #define next frame->Xnext
528 #define pp frame->Xpp
529 #define prev frame->Xprev
530 #define saved_eptr frame->Xsaved_eptr
531
532 #define new_recursive frame->Xnew_recursive
533
534 #define cur_is_word frame->Xcur_is_word
535 #define condition frame->Xcondition
536 #define prev_is_word frame->Xprev_is_word
537
538 #ifdef SUPPORT_UCP
539 #define prop_type frame->Xprop_type
540 #define prop_value frame->Xprop_value
541 #define prop_fail_result frame->Xprop_fail_result
542 #define oclength frame->Xoclength
543 #define occhars frame->Xocchars
544 #endif
545
546 #define ctype frame->Xctype
547 #define fc frame->Xfc
548 #define fi frame->Xfi
549 #define length frame->Xlength
550 #define max frame->Xmax
551 #define min frame->Xmin
552 #define number frame->Xnumber
553 #define offset frame->Xoffset
554 #define op frame->Xop
555 #define save_capture_last frame->Xsave_capture_last
556 #define save_offset1 frame->Xsave_offset1
557 #define save_offset2 frame->Xsave_offset2
558 #define save_offset3 frame->Xsave_offset3
559 #define stacksave frame->Xstacksave
560
561 #define newptrb frame->Xnewptrb
562
563 /* When recursion is being used, local variables are allocated on the stack and
564 get preserved during recursion in the normal way. In this environment, fi and
565 i, and fc and c, can be the same variables. */
566
567 #else /* NO_RECURSE not defined */
568 #define fi i
569 #define fc c
570
571 /* Many of the following variables are used only in small blocks of the code.
572 My normal style of coding would have declared them within each of those blocks.
573 However, in order to accommodate the version of this code that uses an external
574 "stack" implemented on the heap, it is easier to declare them all here, so the
575 declarations can be cut out in a block. The only declarations within blocks
576 below are for variables that do not have to be preserved over a recursive call
577 to RMATCH(). */
578
579 #ifdef SUPPORT_UTF
580 const pcre_uchar *charptr;
581 #endif
582 const pcre_uchar *callpat;
583 const pcre_uchar *data;
584 const pcre_uchar *next;
585 PCRE_PUCHAR pp;
586 const pcre_uchar *prev;
587 PCRE_PUCHAR saved_eptr;
588
589 recursion_info new_recursive;
590
591 BOOL cur_is_word;
592 BOOL condition;
593 BOOL prev_is_word;
594
595 #ifdef SUPPORT_UCP
596 int prop_type;
597 int prop_value;
598 int prop_fail_result;
599 int oclength;
600 pcre_uchar occhars[6];
601 #endif
602
603 int codelink;
604 int ctype;
605 int length;
606 int max;
607 int min;
608 int number;
609 int offset;
610 int op;
611 int save_capture_last;
612 int save_offset1, save_offset2, save_offset3;
613 int stacksave[REC_STACK_SAVE_MAX];
614
615 eptrblock newptrb;
616 #endif /* NO_RECURSE */
617
618 /* To save space on the stack and in the heap frame, I have doubled up on some
619 of the local variables that are used only in localised parts of the code, but
620 still need to be preserved over recursive calls of match(). These macros define
621 the alternative names that are used. */
622
623 #define allow_zero cur_is_word
624 #define cbegroup condition
625 #define code_offset codelink
626 #define condassert condition
627 #define matched_once prev_is_word
628 #define foc number
629
630 /* These statements are here to stop the compiler complaining about unitialized
631 variables. */
632
633 #ifdef SUPPORT_UCP
634 prop_value = 0;
635 prop_fail_result = 0;
636 #endif
637
638
639 /* This label is used for tail recursion, which is used in a few cases even
640 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
641 used. Thanks to Ian Taylor for noticing this possibility and sending the
642 original patch. */
643
644 TAIL_RECURSE:
645
646 /* OK, now we can get on with the real code of the function. Recursive calls
647 are specified by the macro RMATCH and RRETURN is used to return. When
648 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
649 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
650 defined). However, RMATCH isn't like a function call because it's quite a
651 complicated macro. It has to be used in one particular way. This shouldn't,
652 however, impact performance when true recursion is being used. */
653
654 #ifdef SUPPORT_UTF
655 utf = md->utf; /* Local copy of the flag */
656 #else
657 utf = FALSE;
658 #endif
659
660 /* First check that we haven't called match() too many times, or that we
661 haven't exceeded the recursive call limit. */
662
663 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
664 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
665
666 /* At the start of a group with an unlimited repeat that may match an empty
667 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
668 done this way to save having to use another function argument, which would take
669 up space on the stack. See also MATCH_CONDASSERT below.
670
671 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
672 such remembered pointers, to be checked when we hit the closing ket, in order
673 to break infinite loops that match no characters. When match() is called in
674 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
675 NOT be used with tail recursion, because the memory block that is used is on
676 the stack, so a new one may be required for each match(). */
677
678 if (md->match_function_type == MATCH_CBEGROUP)
679 {
680 newptrb.epb_saved_eptr = eptr;
681 newptrb.epb_prev = eptrb;
682 eptrb = &newptrb;
683 md->match_function_type = 0;
684 }
685
686 /* Now start processing the opcodes. */
687
688 for (;;)
689 {
690 minimize = possessive = FALSE;
691 op = *ecode;
692
693 switch(op)
694 {
695 case OP_MARK:
696 md->nomatch_mark = ecode + 2;
697 md->mark = NULL; /* In case previously set by assertion */
698 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
699 eptrb, RM55);
700 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
701 md->mark == NULL) md->mark = ecode + 2;
702
703 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
704 argument, and we must check whether that argument matches this MARK's
705 argument. It is passed back in md->start_match_ptr (an overloading of that
706 variable). If it does match, we reset that variable to the current subject
707 position and return MATCH_SKIP. Otherwise, pass back the return code
708 unaltered. */
709
710 else if (rrc == MATCH_SKIP_ARG &&
711 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
712 {
713 md->start_match_ptr = eptr;
714 RRETURN(MATCH_SKIP);
715 }
716 RRETURN(rrc);
717
718 case OP_FAIL:
719 RRETURN(MATCH_NOMATCH);
720
721 /* COMMIT overrides PRUNE, SKIP, and THEN */
722
723 case OP_COMMIT:
724 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
725 eptrb, RM52);
726 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
727 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
728 rrc != MATCH_THEN)
729 RRETURN(rrc);
730 RRETURN(MATCH_COMMIT);
731
732 /* PRUNE overrides THEN */
733
734 case OP_PRUNE:
735 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
736 eptrb, RM51);
737 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
738 RRETURN(MATCH_PRUNE);
739
740 case OP_PRUNE_ARG:
741 md->nomatch_mark = ecode + 2;
742 md->mark = NULL; /* In case previously set by assertion */
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 eptrb, RM56);
745 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746 md->mark == NULL) md->mark = ecode + 2;
747 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
748 RRETURN(MATCH_PRUNE);
749
750 /* SKIP overrides PRUNE and THEN */
751
752 case OP_SKIP:
753 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 eptrb, RM53);
755 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
756 RRETURN(rrc);
757 md->start_match_ptr = eptr; /* Pass back current position */
758 RRETURN(MATCH_SKIP);
759
760 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
761 nomatch_mark. There is a flag that disables this opcode when re-matching a
762 pattern that ended with a SKIP for which there was not a matching MARK. */
763
764 case OP_SKIP_ARG:
765 if (md->ignore_skip_arg)
766 {
767 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
768 break;
769 }
770 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
771 eptrb, RM57);
772 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
773 RRETURN(rrc);
774
775 /* Pass back the current skip name by overloading md->start_match_ptr and
776 returning the special MATCH_SKIP_ARG return code. This will either be
777 caught by a matching MARK, or get to the top, where it causes a rematch
778 with the md->ignore_skip_arg flag set. */
779
780 md->start_match_ptr = ecode + 2;
781 RRETURN(MATCH_SKIP_ARG);
782
783 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
784 the branch in which it occurs can be determined. Overload the start of
785 match pointer to do this. */
786
787 case OP_THEN:
788 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
789 eptrb, RM54);
790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
791 md->start_match_ptr = ecode;
792 RRETURN(MATCH_THEN);
793
794 case OP_THEN_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
798 md, eptrb, RM58);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode;
803 RRETURN(MATCH_THEN);
804
805 /* Handle an atomic group that does not contain any capturing parentheses.
806 This can be handled like an assertion. Prior to 8.13, all atomic groups
807 were handled this way. In 8.13, the code was changed as below for ONCE, so
808 that backups pass through the group and thereby reset captured values.
809 However, this uses a lot more stack, so in 8.20, atomic groups that do not
810 contain any captures generate OP_ONCE_NC, which can be handled in the old,
811 less stack intensive way.
812
813 Check the alternative branches in turn - the matching won't pass the KET
814 for this kind of subpattern. If any one branch matches, we carry on as at
815 the end of a normal bracket, leaving the subject pointer, but resetting
816 the start-of-match value in case it was changed by \K. */
817
818 case OP_ONCE_NC:
819 prev = ecode;
820 saved_eptr = eptr;
821 do
822 {
823 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
824 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
825 {
826 mstart = md->start_match_ptr;
827 break;
828 }
829 if (rrc == MATCH_THEN)
830 {
831 next = ecode + GET(ecode,1);
832 if (md->start_match_ptr < next &&
833 (*ecode == OP_ALT || *next == OP_ALT))
834 rrc = MATCH_NOMATCH;
835 }
836
837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
838 ecode += GET(ecode,1);
839 }
840 while (*ecode == OP_ALT);
841
842 /* If hit the end of the group (which could be repeated), fail */
843
844 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
845
846 /* Continue as from after the group, updating the offsets high water
847 mark, since extracts may have been taken. */
848
849 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
850
851 offset_top = md->end_offset_top;
852 eptr = md->end_match_ptr;
853
854 /* For a non-repeating ket, just continue at this level. This also
855 happens for a repeating ket if no characters were matched in the group.
856 This is the forcible breaking of infinite loops as implemented in Perl
857 5.005. */
858
859 if (*ecode == OP_KET || eptr == saved_eptr)
860 {
861 ecode += 1+LINK_SIZE;
862 break;
863 }
864
865 /* The repeating kets try the rest of the pattern or restart from the
866 preceding bracket, in the appropriate order. The second "call" of match()
867 uses tail recursion, to avoid using another stack frame. */
868
869 if (*ecode == OP_KETRMIN)
870 {
871 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873 ecode = prev;
874 goto TAIL_RECURSE;
875 }
876 else /* OP_KETRMAX */
877 {
878 md->match_function_type = MATCH_CBEGROUP;
879 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
881 ecode += 1 + LINK_SIZE;
882 goto TAIL_RECURSE;
883 }
884 /* Control never gets here */
885
886 /* Handle a capturing bracket, other than those that are possessive with an
887 unlimited repeat. If there is space in the offset vector, save the current
888 subject position in the working slot at the top of the vector. We mustn't
889 change the current values of the data slot, because they may be set from a
890 previous iteration of this group, and be referred to by a reference inside
891 the group. A failure to match might occur after the group has succeeded,
892 if something later on doesn't match. For this reason, we need to restore
893 the working value and also the values of the final offsets, in case they
894 were set by a previous iteration of the same bracket.
895
896 If there isn't enough space in the offset vector, treat this as if it were
897 a non-capturing bracket. Don't worry about setting the flag for the error
898 case here; that is handled in the code for KET. */
899
900 case OP_CBRA:
901 case OP_SCBRA:
902 number = GET2(ecode, 1+LINK_SIZE);
903 offset = number << 1;
904
905 #ifdef PCRE_DEBUG
906 printf("start bracket %d\n", number);
907 printf("subject=");
908 pchars(eptr, 16, TRUE, md);
909 printf("\n");
910 #endif
911
912 if (offset < md->offset_max)
913 {
914 save_offset1 = md->offset_vector[offset];
915 save_offset2 = md->offset_vector[offset+1];
916 save_offset3 = md->offset_vector[md->offset_end - number];
917 save_capture_last = md->capture_last;
918
919 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
920 md->offset_vector[md->offset_end - number] =
921 (int)(eptr - md->start_subject);
922
923 for (;;)
924 {
925 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
926 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
927 eptrb, RM1);
928 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
929
930 /* If we backed up to a THEN, check whether it is within the current
931 branch by comparing the address of the THEN that is passed back with
932 the end of the branch. If it is within the current branch, and the
933 branch is one of two or more alternatives (it either starts or ends
934 with OP_ALT), we have reached the limit of THEN's action, so convert
935 the return code to NOMATCH, which will cause normal backtracking to
936 happen from now on. Otherwise, THEN is passed back to an outer
937 alternative. This implements Perl's treatment of parenthesized groups,
938 where a group not containing | does not affect the current alternative,
939 that is, (X) is NOT the same as (X|(*F)). */
940
941 if (rrc == MATCH_THEN)
942 {
943 next = ecode + GET(ecode,1);
944 if (md->start_match_ptr < next &&
945 (*ecode == OP_ALT || *next == OP_ALT))
946 rrc = MATCH_NOMATCH;
947 }
948
949 /* Anything other than NOMATCH is passed back. */
950
951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
952 md->capture_last = save_capture_last;
953 ecode += GET(ecode, 1);
954 if (*ecode != OP_ALT) break;
955 }
956
957 DPRINTF(("bracket %d failed\n", number));
958 md->offset_vector[offset] = save_offset1;
959 md->offset_vector[offset+1] = save_offset2;
960 md->offset_vector[md->offset_end - number] = save_offset3;
961
962 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
963
964 RRETURN(rrc);
965 }
966
967 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
968 as a non-capturing bracket. */
969
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972
973 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
974
975 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
976 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
977
978 /* Non-capturing or atomic group, except for possessive with unlimited
979 repeat and ONCE group with no captures. Loop for all the alternatives.
980
981 When we get to the final alternative within the brackets, we used to return
982 the result of a recursive call to match() whatever happened so it was
983 possible to reduce stack usage by turning this into a tail recursion,
984 except in the case of a possibly empty group. However, now that there is
985 the possibility of (*THEN) occurring in the final alternative, this
986 optimization is no longer always possible.
987
988 We can optimize if we know there are no (*THEN)s in the pattern; at present
989 this is the best that can be done.
990
991 MATCH_ONCE is returned when the end of an atomic group is successfully
992 reached, but subsequent matching fails. It passes back up the tree (causing
993 captured values to be reset) until the original atomic group level is
994 reached. This is tested by comparing md->once_target with the start of the
995 group. At this point, the return is converted into MATCH_NOMATCH so that
996 previous backup points can be taken. */
997
998 case OP_ONCE:
999 case OP_BRA:
1000 case OP_SBRA:
1001 DPRINTF(("start non-capturing bracket\n"));
1002
1003 for (;;)
1004 {
1005 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1006
1007 /* If this is not a possibly empty group, and there are no (*THEN)s in
1008 the pattern, and this is the final alternative, optimize as described
1009 above. */
1010
1011 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1012 {
1013 ecode += PRIV(OP_lengths)[*ecode];
1014 goto TAIL_RECURSE;
1015 }
1016
1017 /* In all other cases, we have to make another call to match(). */
1018
1019 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1020 RM2);
1021
1022 /* See comment in the code for capturing groups above about handling
1023 THEN. */
1024
1025 if (rrc == MATCH_THEN)
1026 {
1027 next = ecode + GET(ecode,1);
1028 if (md->start_match_ptr < next &&
1029 (*ecode == OP_ALT || *next == OP_ALT))
1030 rrc = MATCH_NOMATCH;
1031 }
1032
1033 if (rrc != MATCH_NOMATCH)
1034 {
1035 if (rrc == MATCH_ONCE)
1036 {
1037 const pcre_uchar *scode = ecode;
1038 if (*scode != OP_ONCE) /* If not at start, find it */
1039 {
1040 while (*scode == OP_ALT) scode += GET(scode, 1);
1041 scode -= GET(scode, 1);
1042 }
1043 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1044 }
1045 RRETURN(rrc);
1046 }
1047 ecode += GET(ecode, 1);
1048 if (*ecode != OP_ALT) break;
1049 }
1050
1051 RRETURN(MATCH_NOMATCH);
1052
1053 /* Handle possessive capturing brackets with an unlimited repeat. We come
1054 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1055 handled similarly to the normal case above. However, the matching is
1056 different. The end of these brackets will always be OP_KETRPOS, which
1057 returns MATCH_KETRPOS without going further in the pattern. By this means
1058 we can handle the group by iteration rather than recursion, thereby
1059 reducing the amount of stack needed. */
1060
1061 case OP_CBRAPOS:
1062 case OP_SCBRAPOS:
1063 allow_zero = FALSE;
1064
1065 POSSESSIVE_CAPTURE:
1066 number = GET2(ecode, 1+LINK_SIZE);
1067 offset = number << 1;
1068
1069 #ifdef PCRE_DEBUG
1070 printf("start possessive bracket %d\n", number);
1071 printf("subject=");
1072 pchars(eptr, 16, TRUE, md);
1073 printf("\n");
1074 #endif
1075
1076 if (offset < md->offset_max)
1077 {
1078 matched_once = FALSE;
1079 code_offset = (int)(ecode - md->start_code);
1080
1081 save_offset1 = md->offset_vector[offset];
1082 save_offset2 = md->offset_vector[offset+1];
1083 save_offset3 = md->offset_vector[md->offset_end - number];
1084 save_capture_last = md->capture_last;
1085
1086 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1087
1088 /* Each time round the loop, save the current subject position for use
1089 when the group matches. For MATCH_MATCH, the group has matched, so we
1090 restart it with a new subject starting position, remembering that we had
1091 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1092 usual. If we haven't matched any alternatives in any iteration, check to
1093 see if a previous iteration matched. If so, the group has matched;
1094 continue from afterwards. Otherwise it has failed; restore the previous
1095 capture values before returning NOMATCH. */
1096
1097 for (;;)
1098 {
1099 md->offset_vector[md->offset_end - number] =
1100 (int)(eptr - md->start_subject);
1101 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1102 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1103 eptrb, RM63);
1104 if (rrc == MATCH_KETRPOS)
1105 {
1106 offset_top = md->end_offset_top;
1107 eptr = md->end_match_ptr;
1108 ecode = md->start_code + code_offset;
1109 save_capture_last = md->capture_last;
1110 matched_once = TRUE;
1111 continue;
1112 }
1113
1114 /* See comment in the code for capturing groups above about handling
1115 THEN. */
1116
1117 if (rrc == MATCH_THEN)
1118 {
1119 next = ecode + GET(ecode,1);
1120 if (md->start_match_ptr < next &&
1121 (*ecode == OP_ALT || *next == OP_ALT))
1122 rrc = MATCH_NOMATCH;
1123 }
1124
1125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1126 md->capture_last = save_capture_last;
1127 ecode += GET(ecode, 1);
1128 if (*ecode != OP_ALT) break;
1129 }
1130
1131 if (!matched_once)
1132 {
1133 md->offset_vector[offset] = save_offset1;
1134 md->offset_vector[offset+1] = save_offset2;
1135 md->offset_vector[md->offset_end - number] = save_offset3;
1136 }
1137
1138 if (allow_zero || matched_once)
1139 {
1140 ecode += 1 + LINK_SIZE;
1141 break;
1142 }
1143
1144 RRETURN(MATCH_NOMATCH);
1145 }
1146
1147 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1148 as a non-capturing bracket. */
1149
1150 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152
1153 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1154
1155 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1156 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1157
1158 /* Non-capturing possessive bracket with unlimited repeat. We come here
1159 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1160 without the capturing complication. It is written out separately for speed
1161 and cleanliness. */
1162
1163 case OP_BRAPOS:
1164 case OP_SBRAPOS:
1165 allow_zero = FALSE;
1166
1167 POSSESSIVE_NON_CAPTURE:
1168 matched_once = FALSE;
1169 code_offset = (int)(ecode - md->start_code);
1170
1171 for (;;)
1172 {
1173 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1174 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1175 eptrb, RM48);
1176 if (rrc == MATCH_KETRPOS)
1177 {
1178 offset_top = md->end_offset_top;
1179 eptr = md->end_match_ptr;
1180 ecode = md->start_code + code_offset;
1181 matched_once = TRUE;
1182 continue;
1183 }
1184
1185 /* See comment in the code for capturing groups above about handling
1186 THEN. */
1187
1188 if (rrc == MATCH_THEN)
1189 {
1190 next = ecode + GET(ecode,1);
1191 if (md->start_match_ptr < next &&
1192 (*ecode == OP_ALT || *next == OP_ALT))
1193 rrc = MATCH_NOMATCH;
1194 }
1195
1196 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197 ecode += GET(ecode, 1);
1198 if (*ecode != OP_ALT) break;
1199 }
1200
1201 if (matched_once || allow_zero)
1202 {
1203 ecode += 1 + LINK_SIZE;
1204 break;
1205 }
1206 RRETURN(MATCH_NOMATCH);
1207
1208 /* Control never reaches here. */
1209
1210 /* Conditional group: compilation checked that there are no more than
1211 two branches. If the condition is false, skipping the first branch takes us
1212 past the end if there is only one branch, but that's OK because that is
1213 exactly what going to the ket would do. */
1214
1215 case OP_COND:
1216 case OP_SCOND:
1217 codelink = GET(ecode, 1);
1218
1219 /* Because of the way auto-callout works during compile, a callout item is
1220 inserted between OP_COND and an assertion condition. */
1221
1222 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1223 {
1224 if (PUBL(callout) != NULL)
1225 {
1226 pcre_callout_block cb;
1227 cb.version = 2; /* Version 2 of the callout block */
1228 cb.callout_number = ecode[LINK_SIZE+2];
1229 cb.offset_vector = md->offset_vector;
1230 cb.subject = (PCRE_SPTR)md->start_subject;
1231 cb.subject_length = (int)(md->end_subject - md->start_subject);
1232 cb.start_match = (int)(mstart - md->start_subject);
1233 cb.current_position = (int)(eptr - md->start_subject);
1234 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1235 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1236 cb.capture_top = offset_top/2;
1237 cb.capture_last = md->capture_last;
1238 cb.callout_data = md->callout_data;
1239 cb.mark = md->nomatch_mark;
1240 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1241 if (rrc < 0) RRETURN(rrc);
1242 }
1243 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1244 }
1245
1246 condcode = ecode[LINK_SIZE+1];
1247
1248 /* Now see what the actual condition is */
1249
1250 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1251 {
1252 if (md->recursive == NULL) /* Not recursing => FALSE */
1253 {
1254 condition = FALSE;
1255 ecode += GET(ecode, 1);
1256 }
1257 else
1258 {
1259 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1260 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1261
1262 /* If the test is for recursion into a specific subpattern, and it is
1263 false, but the test was set up by name, scan the table to see if the
1264 name refers to any other numbers, and test them. The condition is true
1265 if any one is set. */
1266
1267 if (!condition && condcode == OP_NRREF)
1268 {
1269 pcre_uchar *slotA = md->name_table;
1270 for (i = 0; i < md->name_count; i++)
1271 {
1272 if (GET2(slotA, 0) == recno) break;
1273 slotA += md->name_entry_size;
1274 }
1275
1276 /* Found a name for the number - there can be only one; duplicate
1277 names for different numbers are allowed, but not vice versa. First
1278 scan down for duplicates. */
1279
1280 if (i < md->name_count)
1281 {
1282 pcre_uchar *slotB = slotA;
1283 while (slotB > md->name_table)
1284 {
1285 slotB -= md->name_entry_size;
1286 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1287 {
1288 condition = GET2(slotB, 0) == md->recursive->group_num;
1289 if (condition) break;
1290 }
1291 else break;
1292 }
1293
1294 /* Scan up for duplicates */
1295
1296 if (!condition)
1297 {
1298 slotB = slotA;
1299 for (i++; i < md->name_count; i++)
1300 {
1301 slotB += md->name_entry_size;
1302 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1303 {
1304 condition = GET2(slotB, 0) == md->recursive->group_num;
1305 if (condition) break;
1306 }
1307 else break;
1308 }
1309 }
1310 }
1311 }
1312
1313 /* Choose branch according to the condition */
1314
1315 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1316 }
1317 }
1318
1319 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1320 {
1321 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1322 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1323
1324 /* If the numbered capture is unset, but the reference was by name,
1325 scan the table to see if the name refers to any other numbers, and test
1326 them. The condition is true if any one is set. This is tediously similar
1327 to the code above, but not close enough to try to amalgamate. */
1328
1329 if (!condition && condcode == OP_NCREF)
1330 {
1331 int refno = offset >> 1;
1332 pcre_uchar *slotA = md->name_table;
1333
1334 for (i = 0; i < md->name_count; i++)
1335 {
1336 if (GET2(slotA, 0) == refno) break;
1337 slotA += md->name_entry_size;
1338 }
1339
1340 /* Found a name for the number - there can be only one; duplicate names
1341 for different numbers are allowed, but not vice versa. First scan down
1342 for duplicates. */
1343
1344 if (i < md->name_count)
1345 {
1346 pcre_uchar *slotB = slotA;
1347 while (slotB > md->name_table)
1348 {
1349 slotB -= md->name_entry_size;
1350 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1351 {
1352 offset = GET2(slotB, 0) << 1;
1353 condition = offset < offset_top &&
1354 md->offset_vector[offset] >= 0;
1355 if (condition) break;
1356 }
1357 else break;
1358 }
1359
1360 /* Scan up for duplicates */
1361
1362 if (!condition)
1363 {
1364 slotB = slotA;
1365 for (i++; i < md->name_count; i++)
1366 {
1367 slotB += md->name_entry_size;
1368 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1369 {
1370 offset = GET2(slotB, 0) << 1;
1371 condition = offset < offset_top &&
1372 md->offset_vector[offset] >= 0;
1373 if (condition) break;
1374 }
1375 else break;
1376 }
1377 }
1378 }
1379 }
1380
1381 /* Choose branch according to the condition */
1382
1383 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1384 }
1385
1386 else if (condcode == OP_DEF) /* DEFINE - always false */
1387 {
1388 condition = FALSE;
1389 ecode += GET(ecode, 1);
1390 }
1391
1392 /* The condition is an assertion. Call match() to evaluate it - setting
1393 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1394 an assertion. */
1395
1396 else
1397 {
1398 md->match_function_type = MATCH_CONDASSERT;
1399 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1400 if (rrc == MATCH_MATCH)
1401 {
1402 if (md->end_offset_top > offset_top)
1403 offset_top = md->end_offset_top; /* Captures may have happened */
1404 condition = TRUE;
1405 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1406 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1407 }
1408
1409 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1410 assertion; it is therefore treated as NOMATCH. */
1411
1412 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1413 {
1414 RRETURN(rrc); /* Need braces because of following else */
1415 }
1416 else
1417 {
1418 condition = FALSE;
1419 ecode += codelink;
1420 }
1421 }
1422
1423 /* We are now at the branch that is to be obeyed. As there is only one, can
1424 use tail recursion to avoid using another stack frame, except when there is
1425 unlimited repeat of a possibly empty group. In the latter case, a recursive
1426 call to match() is always required, unless the second alternative doesn't
1427 exist, in which case we can just plough on. Note that, for compatibility
1428 with Perl, the | in a conditional group is NOT treated as creating two
1429 alternatives. If a THEN is encountered in the branch, it propagates out to
1430 the enclosing alternative (unless nested in a deeper set of alternatives,
1431 of course). */
1432
1433 if (condition || *ecode == OP_ALT)
1434 {
1435 if (op != OP_SCOND)
1436 {
1437 ecode += 1 + LINK_SIZE;
1438 goto TAIL_RECURSE;
1439 }
1440
1441 md->match_function_type = MATCH_CBEGROUP;
1442 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1443 RRETURN(rrc);
1444 }
1445
1446 /* Condition false & no alternative; continue after the group. */
1447
1448 else
1449 {
1450 ecode += 1 + LINK_SIZE;
1451 }
1452 break;
1453
1454
1455 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1456 to close any currently open capturing brackets. */
1457
1458 case OP_CLOSE:
1459 number = GET2(ecode, 1);
1460 offset = number << 1;
1461
1462 #ifdef PCRE_DEBUG
1463 printf("end bracket %d at *ACCEPT", number);
1464 printf("\n");
1465 #endif
1466
1467 md->capture_last = number;
1468 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1469 {
1470 md->offset_vector[offset] =
1471 md->offset_vector[md->offset_end - number];
1472 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1473 if (offset_top <= offset) offset_top = offset + 2;
1474 }
1475 ecode += 1 + IMM2_SIZE;
1476 break;
1477
1478
1479 /* End of the pattern, either real or forced. */
1480
1481 case OP_END:
1482 case OP_ACCEPT:
1483 case OP_ASSERT_ACCEPT:
1484
1485 /* If we have matched an empty string, fail if not in an assertion and not
1486 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1487 is set and we have matched at the start of the subject. In both cases,
1488 backtracking will then try other alternatives, if any. */
1489
1490 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1491 md->recursive == NULL &&
1492 (md->notempty ||
1493 (md->notempty_atstart &&
1494 mstart == md->start_subject + md->start_offset)))
1495 RRETURN(MATCH_NOMATCH);
1496
1497 /* Otherwise, we have a match. */
1498
1499 md->end_match_ptr = eptr; /* Record where we ended */
1500 md->end_offset_top = offset_top; /* and how many extracts were taken */
1501 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1502
1503 /* For some reason, the macros don't work properly if an expression is
1504 given as the argument to RRETURN when the heap is in use. */
1505
1506 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1507 RRETURN(rrc);
1508
1509 /* Assertion brackets. Check the alternative branches in turn - the
1510 matching won't pass the KET for an assertion. If any one branch matches,
1511 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1512 start of each branch to move the current point backwards, so the code at
1513 this level is identical to the lookahead case. When the assertion is part
1514 of a condition, we want to return immediately afterwards. The caller of
1515 this incarnation of the match() function will have set MATCH_CONDASSERT in
1516 md->match_function_type, and one of these opcodes will be the first opcode
1517 that is processed. We use a local variable that is preserved over calls to
1518 match() to remember this case. */
1519
1520 case OP_ASSERT:
1521 case OP_ASSERTBACK:
1522 if (md->match_function_type == MATCH_CONDASSERT)
1523 {
1524 condassert = TRUE;
1525 md->match_function_type = 0;
1526 }
1527 else condassert = FALSE;
1528
1529 do
1530 {
1531 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1532 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1533 {
1534 mstart = md->start_match_ptr; /* In case \K reset it */
1535 break;
1536 }
1537
1538 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1539 as NOMATCH. */
1540
1541 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1542 ecode += GET(ecode, 1);
1543 }
1544 while (*ecode == OP_ALT);
1545
1546 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1547
1548 /* If checking an assertion for a condition, return MATCH_MATCH. */
1549
1550 if (condassert) RRETURN(MATCH_MATCH);
1551
1552 /* Continue from after the assertion, updating the offsets high water
1553 mark, since extracts may have been taken during the assertion. */
1554
1555 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1556 ecode += 1 + LINK_SIZE;
1557 offset_top = md->end_offset_top;
1558 continue;
1559
1560 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1561 PRUNE, or COMMIT means we must assume failure without checking subsequent
1562 branches. */
1563
1564 case OP_ASSERT_NOT:
1565 case OP_ASSERTBACK_NOT:
1566 if (md->match_function_type == MATCH_CONDASSERT)
1567 {
1568 condassert = TRUE;
1569 md->match_function_type = 0;
1570 }
1571 else condassert = FALSE;
1572
1573 do
1574 {
1575 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1576 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1577 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1578 {
1579 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1580 break;
1581 }
1582
1583 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1584 as NOMATCH. */
1585
1586 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1587 ecode += GET(ecode,1);
1588 }
1589 while (*ecode == OP_ALT);
1590
1591 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1592
1593 ecode += 1 + LINK_SIZE;
1594 continue;
1595
1596 /* Move the subject pointer back. This occurs only at the start of
1597 each branch of a lookbehind assertion. If we are too close to the start to
1598 move back, this match function fails. When working with UTF-8 we move
1599 back a number of characters, not bytes. */
1600
1601 case OP_REVERSE:
1602 #ifdef SUPPORT_UTF
1603 if (utf)
1604 {
1605 i = GET(ecode, 1);
1606 while (i-- > 0)
1607 {
1608 eptr--;
1609 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1610 BACKCHAR(eptr);
1611 }
1612 }
1613 else
1614 #endif
1615
1616 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1617
1618 {
1619 eptr -= GET(ecode, 1);
1620 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1621 }
1622
1623 /* Save the earliest consulted character, then skip to next op code */
1624
1625 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1626 ecode += 1 + LINK_SIZE;
1627 break;
1628
1629 /* The callout item calls an external function, if one is provided, passing
1630 details of the match so far. This is mainly for debugging, though the
1631 function is able to force a failure. */
1632
1633 case OP_CALLOUT:
1634 if (PUBL(callout) != NULL)
1635 {
1636 pcre_callout_block cb;
1637 cb.version = 2; /* Version 2 of the callout block */
1638 cb.callout_number = ecode[1];
1639 cb.offset_vector = md->offset_vector;
1640 cb.subject = (PCRE_SPTR)md->start_subject;
1641 cb.subject_length = (int)(md->end_subject - md->start_subject);
1642 cb.start_match = (int)(mstart - md->start_subject);
1643 cb.current_position = (int)(eptr - md->start_subject);
1644 cb.pattern_position = GET(ecode, 2);
1645 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1646 cb.capture_top = offset_top/2;
1647 cb.capture_last = md->capture_last;
1648 cb.callout_data = md->callout_data;
1649 cb.mark = md->nomatch_mark;
1650 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1651 if (rrc < 0) RRETURN(rrc);
1652 }
1653 ecode += 2 + 2*LINK_SIZE;
1654 break;
1655
1656 /* Recursion either matches the current regex, or some subexpression. The
1657 offset data is the offset to the starting bracket from the start of the
1658 whole pattern. (This is so that it works from duplicated subpatterns.)
1659
1660 The state of the capturing groups is preserved over recursion, and
1661 re-instated afterwards. We don't know how many are started and not yet
1662 finished (offset_top records the completed total) so we just have to save
1663 all the potential data. There may be up to 65535 such values, which is too
1664 large to put on the stack, but using malloc for small numbers seems
1665 expensive. As a compromise, the stack is used when there are no more than
1666 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1667
1668 There are also other values that have to be saved. We use a chained
1669 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1670 for the original version of this logic. It has, however, been hacked around
1671 a lot, so he is not to blame for the current way it works. */
1672
1673 case OP_RECURSE:
1674 {
1675 recursion_info *ri;
1676 int recno;
1677
1678 callpat = md->start_code + GET(ecode, 1);
1679 recno = (callpat == md->start_code)? 0 :
1680 GET2(callpat, 1 + LINK_SIZE);
1681
1682 /* Check for repeating a recursion without advancing the subject pointer.
1683 This should catch convoluted mutual recursions. (Some simple cases are
1684 caught at compile time.) */
1685
1686 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1687 if (recno == ri->group_num && eptr == ri->subject_position)
1688 RRETURN(PCRE_ERROR_RECURSELOOP);
1689
1690 /* Add to "recursing stack" */
1691
1692 new_recursive.group_num = recno;
1693 new_recursive.subject_position = eptr;
1694 new_recursive.prevrec = md->recursive;
1695 md->recursive = &new_recursive;
1696
1697 /* Where to continue from afterwards */
1698
1699 ecode += 1 + LINK_SIZE;
1700
1701 /* Now save the offset data */
1702
1703 new_recursive.saved_max = md->offset_end;
1704 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1705 new_recursive.offset_save = stacksave;
1706 else
1707 {
1708 new_recursive.offset_save =
1709 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1710 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1711 }
1712 memcpy(new_recursive.offset_save, md->offset_vector,
1713 new_recursive.saved_max * sizeof(int));
1714
1715 /* OK, now we can do the recursion. After processing each alternative,
1716 restore the offset data. If there were nested recursions, md->recursive
1717 might be changed, so reset it before looping. */
1718
1719 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1720 cbegroup = (*callpat >= OP_SBRA);
1721 do
1722 {
1723 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1724 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1725 md, eptrb, RM6);
1726 memcpy(md->offset_vector, new_recursive.offset_save,
1727 new_recursive.saved_max * sizeof(int));
1728 md->recursive = new_recursive.prevrec;
1729 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1730 {
1731 DPRINTF(("Recursion matched\n"));
1732 if (new_recursive.offset_save != stacksave)
1733 (PUBL(free))(new_recursive.offset_save);
1734
1735 /* Set where we got to in the subject, and reset the start in case
1736 it was changed by \K. This *is* propagated back out of a recursion,
1737 for Perl compatibility. */
1738
1739 eptr = md->end_match_ptr;
1740 mstart = md->start_match_ptr;
1741 goto RECURSION_MATCHED; /* Exit loop; end processing */
1742 }
1743
1744 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1745 as NOMATCH. */
1746
1747 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1748 {
1749 DPRINTF(("Recursion gave error %d\n", rrc));
1750 if (new_recursive.offset_save != stacksave)
1751 (PUBL(free))(new_recursive.offset_save);
1752 RRETURN(rrc);
1753 }
1754
1755 md->recursive = &new_recursive;
1756 callpat += GET(callpat, 1);
1757 }
1758 while (*callpat == OP_ALT);
1759
1760 DPRINTF(("Recursion didn't match\n"));
1761 md->recursive = new_recursive.prevrec;
1762 if (new_recursive.offset_save != stacksave)
1763 (PUBL(free))(new_recursive.offset_save);
1764 RRETURN(MATCH_NOMATCH);
1765 }
1766
1767 RECURSION_MATCHED:
1768 break;
1769
1770 /* An alternation is the end of a branch; scan along to find the end of the
1771 bracketed group and go to there. */
1772
1773 case OP_ALT:
1774 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1775 break;
1776
1777 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1778 indicating that it may occur zero times. It may repeat infinitely, or not
1779 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1780 with fixed upper repeat limits are compiled as a number of copies, with the
1781 optional ones preceded by BRAZERO or BRAMINZERO. */
1782
1783 case OP_BRAZERO:
1784 next = ecode + 1;
1785 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1787 do next += GET(next, 1); while (*next == OP_ALT);
1788 ecode = next + 1 + LINK_SIZE;
1789 break;
1790
1791 case OP_BRAMINZERO:
1792 next = ecode + 1;
1793 do next += GET(next, 1); while (*next == OP_ALT);
1794 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1796 ecode++;
1797 break;
1798
1799 case OP_SKIPZERO:
1800 next = ecode+1;
1801 do next += GET(next,1); while (*next == OP_ALT);
1802 ecode = next + 1 + LINK_SIZE;
1803 break;
1804
1805 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1806 here; just jump to the group, with allow_zero set TRUE. */
1807
1808 case OP_BRAPOSZERO:
1809 op = *(++ecode);
1810 allow_zero = TRUE;
1811 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1812 goto POSSESSIVE_NON_CAPTURE;
1813
1814 /* End of a group, repeated or non-repeating. */
1815
1816 case OP_KET:
1817 case OP_KETRMIN:
1818 case OP_KETRMAX:
1819 case OP_KETRPOS:
1820 prev = ecode - GET(ecode, 1);
1821
1822 /* If this was a group that remembered the subject start, in order to break
1823 infinite repeats of empty string matches, retrieve the subject start from
1824 the chain. Otherwise, set it NULL. */
1825
1826 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1827 {
1828 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1829 eptrb = eptrb->epb_prev; /* Backup to previous group */
1830 }
1831 else saved_eptr = NULL;
1832
1833 /* If we are at the end of an assertion group or a non-capturing atomic
1834 group, stop matching and return MATCH_MATCH, but record the current high
1835 water mark for use by positive assertions. We also need to record the match
1836 start in case it was changed by \K. */
1837
1838 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1839 *prev == OP_ONCE_NC)
1840 {
1841 md->end_match_ptr = eptr; /* For ONCE_NC */
1842 md->end_offset_top = offset_top;
1843 md->start_match_ptr = mstart;
1844 RRETURN(MATCH_MATCH); /* Sets md->mark */
1845 }
1846
1847 /* For capturing groups we have to check the group number back at the start
1848 and if necessary complete handling an extraction by setting the offsets and
1849 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1850 into group 0, so it won't be picked up here. Instead, we catch it when the
1851 OP_END is reached. Other recursion is handled here. We just have to record
1852 the current subject position and start match pointer and give a MATCH
1853 return. */
1854
1855 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1856 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1857 {
1858 number = GET2(prev, 1+LINK_SIZE);
1859 offset = number << 1;
1860
1861 #ifdef PCRE_DEBUG
1862 printf("end bracket %d", number);
1863 printf("\n");
1864 #endif
1865
1866 /* Handle a recursively called group. */
1867
1868 if (md->recursive != NULL && md->recursive->group_num == number)
1869 {
1870 md->end_match_ptr = eptr;
1871 md->start_match_ptr = mstart;
1872 RRETURN(MATCH_MATCH);
1873 }
1874
1875 /* Deal with capturing */
1876
1877 md->capture_last = number;
1878 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1879 {
1880 /* If offset is greater than offset_top, it means that we are
1881 "skipping" a capturing group, and that group's offsets must be marked
1882 unset. In earlier versions of PCRE, all the offsets were unset at the
1883 start of matching, but this doesn't work because atomic groups and
1884 assertions can cause a value to be set that should later be unset.
1885 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1886 part of the atomic group, but this is not on the final matching path,
1887 so must be unset when 2 is set. (If there is no group 2, there is no
1888 problem, because offset_top will then be 2, indicating no capture.) */
1889
1890 if (offset > offset_top)
1891 {
1892 register int *iptr = md->offset_vector + offset_top;
1893 register int *iend = md->offset_vector + offset;
1894 while (iptr < iend) *iptr++ = -1;
1895 }
1896
1897 /* Now make the extraction */
1898
1899 md->offset_vector[offset] =
1900 md->offset_vector[md->offset_end - number];
1901 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1902 if (offset_top <= offset) offset_top = offset + 2;
1903 }
1904 }
1905
1906 /* For an ordinary non-repeating ket, just continue at this level. This
1907 also happens for a repeating ket if no characters were matched in the
1908 group. This is the forcible breaking of infinite loops as implemented in
1909 Perl 5.005. For a non-repeating atomic group that includes captures,
1910 establish a backup point by processing the rest of the pattern at a lower
1911 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1912 original OP_ONCE level, thereby bypassing intermediate backup points, but
1913 resetting any captures that happened along the way. */
1914
1915 if (*ecode == OP_KET || eptr == saved_eptr)
1916 {
1917 if (*prev == OP_ONCE)
1918 {
1919 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1921 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1922 RRETURN(MATCH_ONCE);
1923 }
1924 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1925 break;
1926 }
1927
1928 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1929 and return MATCH_KETRPOS. This makes it possible to do the repeats one
1930 at a time from the outer level, thus saving stack. */
1931
1932 if (*ecode == OP_KETRPOS)
1933 {
1934 md->end_match_ptr = eptr;
1935 md->end_offset_top = offset_top;
1936 RRETURN(MATCH_KETRPOS);
1937 }
1938
1939 /* The normal repeating kets try the rest of the pattern or restart from
1940 the preceding bracket, in the appropriate order. In the second case, we can
1941 use tail recursion to avoid using another stack frame, unless we have an
1942 atomic group or an unlimited repeat of a group that can match an empty
1943 string. */
1944
1945 if (*ecode == OP_KETRMIN)
1946 {
1947 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1949 if (*prev == OP_ONCE)
1950 {
1951 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1953 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1954 RRETURN(MATCH_ONCE);
1955 }
1956 if (*prev >= OP_SBRA) /* Could match an empty string */
1957 {
1958 md->match_function_type = MATCH_CBEGROUP;
1959 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1960 RRETURN(rrc);
1961 }
1962 ecode = prev;
1963 goto TAIL_RECURSE;
1964 }
1965 else /* OP_KETRMAX */
1966 {
1967 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1968 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1969 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971 if (*prev == OP_ONCE)
1972 {
1973 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1975 md->once_target = prev;
1976 RRETURN(MATCH_ONCE);
1977 }
1978 ecode += 1 + LINK_SIZE;
1979 goto TAIL_RECURSE;
1980 }
1981 /* Control never gets here */
1982
1983 /* Not multiline mode: start of subject assertion, unless notbol. */
1984
1985 case OP_CIRC:
1986 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1987
1988 /* Start of subject assertion */
1989
1990 case OP_SOD:
1991 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1992 ecode++;
1993 break;
1994
1995 /* Multiline mode: start of subject unless notbol, or after any newline. */
1996
1997 case OP_CIRCM:
1998 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1999 if (eptr != md->start_subject &&
2000 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2001 RRETURN(MATCH_NOMATCH);
2002 ecode++;
2003 break;
2004
2005 /* Start of match assertion */
2006
2007 case OP_SOM:
2008 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2009 ecode++;
2010 break;
2011
2012 /* Reset the start of match point */
2013
2014 case OP_SET_SOM:
2015 mstart = eptr;
2016 ecode++;
2017 break;
2018
2019 /* Multiline mode: assert before any newline, or before end of subject
2020 unless noteol is set. */
2021
2022 case OP_DOLLM:
2023 if (eptr < md->end_subject)
2024 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2025 else
2026 {
2027 if (md->noteol) RRETURN(MATCH_NOMATCH);
2028 SCHECK_PARTIAL();
2029 }
2030 ecode++;
2031 break;
2032
2033 /* Not multiline mode: assert before a terminating newline or before end of
2034 subject unless noteol is set. */
2035
2036 case OP_DOLL:
2037 if (md->noteol) RRETURN(MATCH_NOMATCH);
2038 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2039
2040 /* ... else fall through for endonly */
2041
2042 /* End of subject assertion (\z) */
2043
2044 case OP_EOD:
2045 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2046 SCHECK_PARTIAL();
2047 ecode++;
2048 break;
2049
2050 /* End of subject or ending \n assertion (\Z) */
2051
2052 case OP_EODN:
2053 ASSERT_NL_OR_EOS:
2054 if (eptr < md->end_subject &&
2055 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2056 RRETURN(MATCH_NOMATCH);
2057
2058 /* Either at end of string or \n before end. */
2059
2060 SCHECK_PARTIAL();
2061 ecode++;
2062 break;
2063
2064 /* Word boundary assertions */
2065
2066 case OP_NOT_WORD_BOUNDARY:
2067 case OP_WORD_BOUNDARY:
2068 {
2069
2070 /* Find out if the previous and current characters are "word" characters.
2071 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2072 be "non-word" characters. Remember the earliest consulted character for
2073 partial matching. */
2074
2075 #ifdef SUPPORT_UTF
2076 if (utf)
2077 {
2078 /* Get status of previous character */
2079
2080 if (eptr == md->start_subject) prev_is_word = FALSE; else
2081 {
2082 PCRE_PUCHAR lastptr = eptr - 1;
2083 BACKCHAR(lastptr);
2084 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2085 GETCHAR(c, lastptr);
2086 #ifdef SUPPORT_UCP
2087 if (md->use_ucp)
2088 {
2089 if (c == '_') prev_is_word = TRUE; else
2090 {
2091 int cat = UCD_CATEGORY(c);
2092 prev_is_word = (cat == ucp_L || cat == ucp_N);
2093 }
2094 }
2095 else
2096 #endif
2097 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2098 }
2099
2100 /* Get status of next character */
2101
2102 if (eptr >= md->end_subject)
2103 {
2104 SCHECK_PARTIAL();
2105 cur_is_word = FALSE;
2106 }
2107 else
2108 {
2109 GETCHAR(c, eptr);
2110 #ifdef SUPPORT_UCP
2111 if (md->use_ucp)
2112 {
2113 if (c == '_') cur_is_word = TRUE; else
2114 {
2115 int cat = UCD_CATEGORY(c);
2116 cur_is_word = (cat == ucp_L || cat == ucp_N);
2117 }
2118 }
2119 else
2120 #endif
2121 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2122 }
2123 }
2124 else
2125 #endif
2126
2127 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2128 consistency with the behaviour of \w we do use it in this case. */
2129
2130 {
2131 /* Get status of previous character */
2132
2133 if (eptr == md->start_subject) prev_is_word = FALSE; else
2134 {
2135 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2136 #ifdef SUPPORT_UCP
2137 if (md->use_ucp)
2138 {
2139 c = eptr[-1];
2140 if (c == '_') prev_is_word = TRUE; else
2141 {
2142 int cat = UCD_CATEGORY(c);
2143 prev_is_word = (cat == ucp_L || cat == ucp_N);
2144 }
2145 }
2146 else
2147 #endif
2148 prev_is_word = MAX_255(eptr[-1])
2149 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2150 }
2151
2152 /* Get status of next character */
2153
2154 if (eptr >= md->end_subject)
2155 {
2156 SCHECK_PARTIAL();
2157 cur_is_word = FALSE;
2158 }
2159 else
2160 #ifdef SUPPORT_UCP
2161 if (md->use_ucp)
2162 {
2163 c = *eptr;
2164 if (c == '_') cur_is_word = TRUE; else
2165 {
2166 int cat = UCD_CATEGORY(c);
2167 cur_is_word = (cat == ucp_L || cat == ucp_N);
2168 }
2169 }
2170 else
2171 #endif
2172 cur_is_word = MAX_255(*eptr)
2173 && ((md->ctypes[*eptr] & ctype_word) != 0);
2174 }
2175
2176 /* Now see if the situation is what we want */
2177
2178 if ((*ecode++ == OP_WORD_BOUNDARY)?
2179 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2180 RRETURN(MATCH_NOMATCH);
2181 }
2182 break;
2183
2184 /* Match a single character type; inline for speed */
2185
2186 case OP_ANY:
2187 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2188 /* Fall through */
2189
2190 case OP_ALLANY:
2191 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2192 { /* not be updated before SCHECK_PARTIAL. */
2193 SCHECK_PARTIAL();
2194 RRETURN(MATCH_NOMATCH);
2195 }
2196 eptr++;
2197 #ifdef SUPPORT_UTF
2198 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2199 #endif
2200 ecode++;
2201 break;
2202
2203 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2204 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2205
2206 case OP_ANYBYTE:
2207 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2208 { /* not be updated before SCHECK_PARTIAL. */
2209 SCHECK_PARTIAL();
2210 RRETURN(MATCH_NOMATCH);
2211 }
2212 eptr++;
2213 ecode++;
2214 break;
2215
2216 case OP_NOT_DIGIT:
2217 if (eptr >= md->end_subject)
2218 {
2219 SCHECK_PARTIAL();
2220 RRETURN(MATCH_NOMATCH);
2221 }
2222 GETCHARINCTEST(c, eptr);
2223 if (
2224 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2225 c < 256 &&
2226 #endif
2227 (md->ctypes[c] & ctype_digit) != 0
2228 )
2229 RRETURN(MATCH_NOMATCH);
2230 ecode++;
2231 break;
2232
2233 case OP_DIGIT:
2234 if (eptr >= md->end_subject)
2235 {
2236 SCHECK_PARTIAL();
2237 RRETURN(MATCH_NOMATCH);
2238 }
2239 GETCHARINCTEST(c, eptr);
2240 if (
2241 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2242 c > 255 ||
2243 #endif
2244 (md->ctypes[c] & ctype_digit) == 0
2245 )
2246 RRETURN(MATCH_NOMATCH);
2247 ecode++;
2248 break;
2249
2250 case OP_NOT_WHITESPACE:
2251 if (eptr >= md->end_subject)
2252 {
2253 SCHECK_PARTIAL();
2254 RRETURN(MATCH_NOMATCH);
2255 }
2256 GETCHARINCTEST(c, eptr);
2257 if (
2258 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2259 c < 256 &&
2260 #endif
2261 (md->ctypes[c] & ctype_space) != 0
2262 )
2263 RRETURN(MATCH_NOMATCH);
2264 ecode++;
2265 break;
2266
2267 case OP_WHITESPACE:
2268 if (eptr >= md->end_subject)
2269 {
2270 SCHECK_PARTIAL();
2271 RRETURN(MATCH_NOMATCH);
2272 }
2273 GETCHARINCTEST(c, eptr);
2274 if (
2275 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2276 c > 255 ||
2277 #endif
2278 (md->ctypes[c] & ctype_space) == 0
2279 )
2280 RRETURN(MATCH_NOMATCH);
2281 ecode++;
2282 break;
2283
2284 case OP_NOT_WORDCHAR:
2285 if (eptr >= md->end_subject)
2286 {
2287 SCHECK_PARTIAL();
2288 RRETURN(MATCH_NOMATCH);
2289 }
2290 GETCHARINCTEST(c, eptr);
2291 if (
2292 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2293 c < 256 &&
2294 #endif
2295 (md->ctypes[c] & ctype_word) != 0
2296 )
2297 RRETURN(MATCH_NOMATCH);
2298 ecode++;
2299 break;
2300
2301 case OP_WORDCHAR:
2302 if (eptr >= md->end_subject)
2303 {
2304 SCHECK_PARTIAL();
2305 RRETURN(MATCH_NOMATCH);
2306 }
2307 GETCHARINCTEST(c, eptr);
2308 if (
2309 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2310 c > 255 ||
2311 #endif
2312 (md->ctypes[c] & ctype_word) == 0
2313 )
2314 RRETURN(MATCH_NOMATCH);
2315 ecode++;
2316 break;
2317
2318 case OP_ANYNL:
2319 if (eptr >= md->end_subject)
2320 {
2321 SCHECK_PARTIAL();
2322 RRETURN(MATCH_NOMATCH);
2323 }
2324 GETCHARINCTEST(c, eptr);
2325 switch(c)
2326 {
2327 default: RRETURN(MATCH_NOMATCH);
2328
2329 case 0x000d:
2330 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2331 break;
2332
2333 case 0x000a:
2334 break;
2335
2336 case 0x000b:
2337 case 0x000c:
2338 case 0x0085:
2339 case 0x2028:
2340 case 0x2029:
2341 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2342 break;
2343 }
2344 ecode++;
2345 break;
2346
2347 case OP_NOT_HSPACE:
2348 if (eptr >= md->end_subject)
2349 {
2350 SCHECK_PARTIAL();
2351 RRETURN(MATCH_NOMATCH);
2352 }
2353 GETCHARINCTEST(c, eptr);
2354 switch(c)
2355 {
2356 default: break;
2357 case 0x09: /* HT */
2358 case 0x20: /* SPACE */
2359 case 0xa0: /* NBSP */
2360 case 0x1680: /* OGHAM SPACE MARK */
2361 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2362 case 0x2000: /* EN QUAD */
2363 case 0x2001: /* EM QUAD */
2364 case 0x2002: /* EN SPACE */
2365 case 0x2003: /* EM SPACE */
2366 case 0x2004: /* THREE-PER-EM SPACE */
2367 case 0x2005: /* FOUR-PER-EM SPACE */
2368 case 0x2006: /* SIX-PER-EM SPACE */
2369 case 0x2007: /* FIGURE SPACE */
2370 case 0x2008: /* PUNCTUATION SPACE */
2371 case 0x2009: /* THIN SPACE */
2372 case 0x200A: /* HAIR SPACE */
2373 case 0x202f: /* NARROW NO-BREAK SPACE */
2374 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2375 case 0x3000: /* IDEOGRAPHIC SPACE */
2376 RRETURN(MATCH_NOMATCH);
2377 }
2378 ecode++;
2379 break;
2380
2381 case OP_HSPACE:
2382 if (eptr >= md->end_subject)
2383 {
2384 SCHECK_PARTIAL();
2385 RRETURN(MATCH_NOMATCH);
2386 }
2387 GETCHARINCTEST(c, eptr);
2388 switch(c)
2389 {
2390 default: RRETURN(MATCH_NOMATCH);
2391 case 0x09: /* HT */
2392 case 0x20: /* SPACE */
2393 case 0xa0: /* NBSP */
2394 case 0x1680: /* OGHAM SPACE MARK */
2395 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2396 case 0x2000: /* EN QUAD */
2397 case 0x2001: /* EM QUAD */
2398 case 0x2002: /* EN SPACE */
2399 case 0x2003: /* EM SPACE */
2400 case 0x2004: /* THREE-PER-EM SPACE */
2401 case 0x2005: /* FOUR-PER-EM SPACE */
2402 case 0x2006: /* SIX-PER-EM SPACE */
2403 case 0x2007: /* FIGURE SPACE */
2404 case 0x2008: /* PUNCTUATION SPACE */
2405 case 0x2009: /* THIN SPACE */
2406 case 0x200A: /* HAIR SPACE */
2407 case 0x202f: /* NARROW NO-BREAK SPACE */
2408 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2409 case 0x3000: /* IDEOGRAPHIC SPACE */
2410 break;
2411 }
2412 ecode++;
2413 break;
2414
2415 case OP_NOT_VSPACE:
2416 if (eptr >= md->end_subject)
2417 {
2418 SCHECK_PARTIAL();
2419 RRETURN(MATCH_NOMATCH);
2420 }
2421 GETCHARINCTEST(c, eptr);
2422 switch(c)
2423 {
2424 default: break;
2425 case 0x0a: /* LF */
2426 case 0x0b: /* VT */
2427 case 0x0c: /* FF */
2428 case 0x0d: /* CR */
2429 case 0x85: /* NEL */
2430 case 0x2028: /* LINE SEPARATOR */
2431 case 0x2029: /* PARAGRAPH SEPARATOR */
2432 RRETURN(MATCH_NOMATCH);
2433 }
2434 ecode++;
2435 break;
2436
2437 case OP_VSPACE:
2438 if (eptr >= md->end_subject)
2439 {
2440 SCHECK_PARTIAL();
2441 RRETURN(MATCH_NOMATCH);
2442 }
2443 GETCHARINCTEST(c, eptr);
2444 switch(c)
2445 {
2446 default: RRETURN(MATCH_NOMATCH);
2447 case 0x0a: /* LF */
2448 case 0x0b: /* VT */
2449 case 0x0c: /* FF */
2450 case 0x0d: /* CR */
2451 case 0x85: /* NEL */
2452 case 0x2028: /* LINE SEPARATOR */
2453 case 0x2029: /* PARAGRAPH SEPARATOR */
2454 break;
2455 }
2456 ecode++;
2457 break;
2458
2459 #ifdef SUPPORT_UCP
2460 /* Check the next character by Unicode property. We will get here only
2461 if the support is in the binary; otherwise a compile-time error occurs. */
2462
2463 case OP_PROP:
2464 case OP_NOTPROP:
2465 if (eptr >= md->end_subject)
2466 {
2467 SCHECK_PARTIAL();
2468 RRETURN(MATCH_NOMATCH);
2469 }
2470 GETCHARINCTEST(c, eptr);
2471 {
2472 const ucd_record *prop = GET_UCD(c);
2473
2474 switch(ecode[1])
2475 {
2476 case PT_ANY:
2477 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2478 break;
2479
2480 case PT_LAMP:
2481 if ((prop->chartype == ucp_Lu ||
2482 prop->chartype == ucp_Ll ||
2483 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2484 RRETURN(MATCH_NOMATCH);
2485 break;
2486
2487 case PT_GC:
2488 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2489 RRETURN(MATCH_NOMATCH);
2490 break;
2491
2492 case PT_PC:
2493 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2494 RRETURN(MATCH_NOMATCH);
2495 break;
2496
2497 case PT_SC:
2498 if ((ecode[2] != prop->script) == (op == OP_PROP))
2499 RRETURN(MATCH_NOMATCH);
2500 break;
2501
2502 /* These are specials */
2503
2504 case PT_ALNUM:
2505 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2506 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2507 RRETURN(MATCH_NOMATCH);
2508 break;
2509
2510 case PT_SPACE: /* Perl space */
2511 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2512 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2513 == (op == OP_NOTPROP))
2514 RRETURN(MATCH_NOMATCH);
2515 break;
2516
2517 case PT_PXSPACE: /* POSIX space */
2518 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2519 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2520 c == CHAR_FF || c == CHAR_CR)
2521 == (op == OP_NOTPROP))
2522 RRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 case PT_WORD:
2526 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2527 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2528 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2529 RRETURN(MATCH_NOMATCH);
2530 break;
2531
2532 /* This should never occur */
2533
2534 default:
2535 RRETURN(PCRE_ERROR_INTERNAL);
2536 }
2537
2538 ecode += 3;
2539 }
2540 break;
2541
2542 /* Match an extended Unicode sequence. We will get here only if the support
2543 is in the binary; otherwise a compile-time error occurs. */
2544
2545 case OP_EXTUNI:
2546 if (eptr >= md->end_subject)
2547 {
2548 SCHECK_PARTIAL();
2549 RRETURN(MATCH_NOMATCH);
2550 }
2551 GETCHARINCTEST(c, eptr);
2552 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2553 while (eptr < md->end_subject)
2554 {
2555 int len = 1;
2556 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2557 if (UCD_CATEGORY(c) != ucp_M) break;
2558 eptr += len;
2559 }
2560 ecode++;
2561 break;
2562 #endif
2563
2564
2565 /* Match a back reference, possibly repeatedly. Look past the end of the
2566 item to see if there is repeat information following. The code is similar
2567 to that for character classes, but repeated for efficiency. Then obey
2568 similar code to character type repeats - written out again for speed.
2569 However, if the referenced string is the empty string, always treat
2570 it as matched, any number of times (otherwise there could be infinite
2571 loops). */
2572
2573 case OP_REF:
2574 case OP_REFI:
2575 caseless = op == OP_REFI;
2576 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2577 ecode += 1 + IMM2_SIZE;
2578
2579 /* If the reference is unset, there are two possibilities:
2580
2581 (a) In the default, Perl-compatible state, set the length negative;
2582 this ensures that every attempt at a match fails. We can't just fail
2583 here, because of the possibility of quantifiers with zero minima.
2584
2585 (b) If the JavaScript compatibility flag is set, set the length to zero
2586 so that the back reference matches an empty string.
2587
2588 Otherwise, set the length to the length of what was matched by the
2589 referenced subpattern. */
2590
2591 if (offset >= offset_top || md->offset_vector[offset] < 0)
2592 length = (md->jscript_compat)? 0 : -1;
2593 else
2594 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2595
2596 /* Set up for repetition, or handle the non-repeated case */
2597
2598 switch (*ecode)
2599 {
2600 case OP_CRSTAR:
2601 case OP_CRMINSTAR:
2602 case OP_CRPLUS:
2603 case OP_CRMINPLUS:
2604 case OP_CRQUERY:
2605 case OP_CRMINQUERY:
2606 c = *ecode++ - OP_CRSTAR;
2607 minimize = (c & 1) != 0;
2608 min = rep_min[c]; /* Pick up values from tables; */
2609 max = rep_max[c]; /* zero for max => infinity */
2610 if (max == 0) max = INT_MAX;
2611 break;
2612
2613 case OP_CRRANGE:
2614 case OP_CRMINRANGE:
2615 minimize = (*ecode == OP_CRMINRANGE);
2616 min = GET2(ecode, 1);
2617 max = GET2(ecode, 1 + IMM2_SIZE);
2618 if (max == 0) max = INT_MAX;
2619 ecode += 1 + 2 * IMM2_SIZE;
2620 break;
2621
2622 default: /* No repeat follows */
2623 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2624 {
2625 CHECK_PARTIAL();
2626 RRETURN(MATCH_NOMATCH);
2627 }
2628 eptr += length;
2629 continue; /* With the main loop */
2630 }
2631
2632 /* Handle repeated back references. If the length of the reference is
2633 zero, just continue with the main loop. */
2634
2635 if (length == 0) continue;
2636
2637 /* First, ensure the minimum number of matches are present. We get back
2638 the length of the reference string explicitly rather than passing the
2639 address of eptr, so that eptr can be a register variable. */
2640
2641 for (i = 1; i <= min; i++)
2642 {
2643 int slength;
2644 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2645 {
2646 CHECK_PARTIAL();
2647 RRETURN(MATCH_NOMATCH);
2648 }
2649 eptr += slength;
2650 }
2651
2652 /* If min = max, continue at the same level without recursion.
2653 They are not both allowed to be zero. */
2654
2655 if (min == max) continue;
2656
2657 /* If minimizing, keep trying and advancing the pointer */
2658
2659 if (minimize)
2660 {
2661 for (fi = min;; fi++)
2662 {
2663 int slength;
2664 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2666 if (fi >= max) RRETURN(MATCH_NOMATCH);
2667 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2668 {
2669 CHECK_PARTIAL();
2670 RRETURN(MATCH_NOMATCH);
2671 }
2672 eptr += slength;
2673 }
2674 /* Control never gets here */
2675 }
2676
2677 /* If maximizing, find the longest string and work backwards */
2678
2679 else
2680 {
2681 pp = eptr;
2682 for (i = min; i < max; i++)
2683 {
2684 int slength;
2685 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2686 {
2687 CHECK_PARTIAL();
2688 break;
2689 }
2690 eptr += slength;
2691 }
2692 while (eptr >= pp)
2693 {
2694 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2695 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2696 eptr -= length;
2697 }
2698 RRETURN(MATCH_NOMATCH);
2699 }
2700 /* Control never gets here */
2701
2702 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2703 used when all the characters in the class have values in the range 0-255,
2704 and either the matching is caseful, or the characters are in the range
2705 0-127 when UTF-8 processing is enabled. The only difference between
2706 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2707 encountered.
2708
2709 First, look past the end of the item to see if there is repeat information
2710 following. Then obey similar code to character type repeats - written out
2711 again for speed. */
2712
2713 case OP_NCLASS:
2714 case OP_CLASS:
2715 {
2716 /* The data variable is saved across frames, so the byte map needs to
2717 be stored there. */
2718 #define BYTE_MAP ((pcre_uint8 *)data)
2719 data = ecode + 1; /* Save for matching */
2720 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2721
2722 switch (*ecode)
2723 {
2724 case OP_CRSTAR:
2725 case OP_CRMINSTAR:
2726 case OP_CRPLUS:
2727 case OP_CRMINPLUS:
2728 case OP_CRQUERY:
2729 case OP_CRMINQUERY:
2730 c = *ecode++ - OP_CRSTAR;
2731 minimize = (c & 1) != 0;
2732 min = rep_min[c]; /* Pick up values from tables; */
2733 max = rep_max[c]; /* zero for max => infinity */
2734 if (max == 0) max = INT_MAX;
2735 break;
2736
2737 case OP_CRRANGE:
2738 case OP_CRMINRANGE:
2739 minimize = (*ecode == OP_CRMINRANGE);
2740 min = GET2(ecode, 1);
2741 max = GET2(ecode, 1 + IMM2_SIZE);
2742 if (max == 0) max = INT_MAX;
2743 ecode += 1 + 2 * IMM2_SIZE;
2744 break;
2745
2746 default: /* No repeat follows */
2747 min = max = 1;
2748 break;
2749 }
2750
2751 /* First, ensure the minimum number of matches are present. */
2752
2753 #ifdef SUPPORT_UTF
2754 if (utf)
2755 {
2756 for (i = 1; i <= min; i++)
2757 {
2758 if (eptr >= md->end_subject)
2759 {
2760 SCHECK_PARTIAL();
2761 RRETURN(MATCH_NOMATCH);
2762 }
2763 GETCHARINC(c, eptr);
2764 if (c > 255)
2765 {
2766 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2767 }
2768 else
2769 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2770 }
2771 }
2772 else
2773 #endif
2774 /* Not UTF mode */
2775 {
2776 for (i = 1; i <= min; i++)
2777 {
2778 if (eptr >= md->end_subject)
2779 {
2780 SCHECK_PARTIAL();
2781 RRETURN(MATCH_NOMATCH);
2782 }
2783 c = *eptr++;
2784 #ifndef COMPILE_PCRE8
2785 if (c > 255)
2786 {
2787 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2788 }
2789 else
2790 #endif
2791 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2792 }
2793 }
2794
2795 /* If max == min we can continue with the main loop without the
2796 need to recurse. */
2797
2798 if (min == max) continue;
2799
2800 /* If minimizing, keep testing the rest of the expression and advancing
2801 the pointer while it matches the class. */
2802
2803 if (minimize)
2804 {
2805 #ifdef SUPPORT_UTF
2806 if (utf)
2807 {
2808 for (fi = min;; fi++)
2809 {
2810 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2812 if (fi >= max) RRETURN(MATCH_NOMATCH);
2813 if (eptr >= md->end_subject)
2814 {
2815 SCHECK_PARTIAL();
2816 RRETURN(MATCH_NOMATCH);
2817 }
2818 GETCHARINC(c, eptr);
2819 if (c > 255)
2820 {
2821 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2822 }
2823 else
2824 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2825 }
2826 }
2827 else
2828 #endif
2829 /* Not UTF mode */
2830 {
2831 for (fi = min;; fi++)
2832 {
2833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2835 if (fi >= max) RRETURN(MATCH_NOMATCH);
2836 if (eptr >= md->end_subject)
2837 {
2838 SCHECK_PARTIAL();
2839 RRETURN(MATCH_NOMATCH);
2840 }
2841 c = *eptr++;
2842 #ifndef COMPILE_PCRE8
2843 if (c > 255)
2844 {
2845 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2846 }
2847 else
2848 #endif
2849 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2850 }
2851 }
2852 /* Control never gets here */
2853 }
2854
2855 /* If maximizing, find the longest possible run, then work backwards. */
2856
2857 else
2858 {
2859 pp = eptr;
2860
2861 #ifdef SUPPORT_UTF
2862 if (utf)
2863 {
2864 for (i = min; i < max; i++)
2865 {
2866 int len = 1;
2867 if (eptr >= md->end_subject)
2868 {
2869 SCHECK_PARTIAL();
2870 break;
2871 }
2872 GETCHARLEN(c, eptr, len);
2873 if (c > 255)
2874 {
2875 if (op == OP_CLASS) break;
2876 }
2877 else
2878 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2879 eptr += len;
2880 }
2881 for (;;)
2882 {
2883 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2885 if (eptr-- == pp) break; /* Stop if tried at original pos */
2886 BACKCHAR(eptr);
2887 }
2888 }
2889 else
2890 #endif
2891 /* Not UTF mode */
2892 {
2893 for (i = min; i < max; i++)
2894 {
2895 if (eptr >= md->end_subject)
2896 {
2897 SCHECK_PARTIAL();
2898 break;
2899 }
2900 c = *eptr;
2901 #ifndef COMPILE_PCRE8
2902 if (c > 255)
2903 {
2904 if (op == OP_CLASS) break;
2905 }
2906 else
2907 #endif
2908 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2909 eptr++;
2910 }
2911 while (eptr >= pp)
2912 {
2913 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2914 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2915 eptr--;
2916 }
2917 }
2918
2919 RRETURN(MATCH_NOMATCH);
2920 }
2921 #undef BYTE_MAP
2922 }
2923 /* Control never gets here */
2924
2925
2926 /* Match an extended character class. This opcode is encountered only
2927 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2928 mode, because Unicode properties are supported in non-UTF-8 mode. */
2929
2930 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2931 case OP_XCLASS:
2932 {
2933 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2934 ecode += GET(ecode, 1); /* Advance past the item */
2935
2936 switch (*ecode)
2937 {
2938 case OP_CRSTAR:
2939 case OP_CRMINSTAR:
2940 case OP_CRPLUS:
2941 case OP_CRMINPLUS:
2942 case OP_CRQUERY:
2943 case OP_CRMINQUERY:
2944 c = *ecode++ - OP_CRSTAR;
2945 minimize = (c & 1) != 0;
2946 min = rep_min[c]; /* Pick up values from tables; */
2947 max = rep_max[c]; /* zero for max => infinity */
2948 if (max == 0) max = INT_MAX;
2949 break;
2950
2951 case OP_CRRANGE:
2952 case OP_CRMINRANGE:
2953 minimize = (*ecode == OP_CRMINRANGE);
2954 min = GET2(ecode, 1);
2955 max = GET2(ecode, 1 + IMM2_SIZE);
2956 if (max == 0) max = INT_MAX;
2957 ecode += 1 + 2 * IMM2_SIZE;
2958 break;
2959
2960 default: /* No repeat follows */
2961 min = max = 1;
2962 break;
2963 }
2964
2965 /* First, ensure the minimum number of matches are present. */
2966
2967 for (i = 1; i <= min; i++)
2968 {
2969 if (eptr >= md->end_subject)
2970 {
2971 SCHECK_PARTIAL();
2972 RRETURN(MATCH_NOMATCH);
2973 }
2974 GETCHARINCTEST(c, eptr);
2975 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2976 }
2977
2978 /* If max == min we can continue with the main loop without the
2979 need to recurse. */
2980
2981 if (min == max) continue;
2982
2983 /* If minimizing, keep testing the rest of the expression and advancing
2984 the pointer while it matches the class. */
2985
2986 if (minimize)
2987 {
2988 for (fi = min;; fi++)
2989 {
2990 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2991 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2992 if (fi >= max) RRETURN(MATCH_NOMATCH);
2993 if (eptr >= md->end_subject)
2994 {
2995 SCHECK_PARTIAL();
2996 RRETURN(MATCH_NOMATCH);
2997 }
2998 GETCHARINCTEST(c, eptr);
2999 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3000 }
3001 /* Control never gets here */
3002 }
3003
3004 /* If maximizing, find the longest possible run, then work backwards. */
3005
3006 else
3007 {
3008 pp = eptr;
3009 for (i = min; i < max; i++)
3010 {
3011 int len = 1;
3012 if (eptr >= md->end_subject)
3013 {
3014 SCHECK_PARTIAL();
3015 break;
3016 }
3017 #ifdef SUPPORT_UTF
3018 GETCHARLENTEST(c, eptr, len);
3019 #else
3020 c = *eptr;
3021 #endif
3022 if (!PRIV(xclass)(c, data, utf)) break;
3023 eptr += len;
3024 }
3025 for(;;)
3026 {
3027 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3029 if (eptr-- == pp) break; /* Stop if tried at original pos */
3030 #ifdef SUPPORT_UTF
3031 if (utf) BACKCHAR(eptr);
3032 #endif
3033 }
3034 RRETURN(MATCH_NOMATCH);
3035 }
3036
3037 /* Control never gets here */
3038 }
3039 #endif /* End of XCLASS */
3040
3041 /* Match a single character, casefully */
3042
3043 case OP_CHAR:
3044 #ifdef SUPPORT_UTF
3045 if (utf)
3046 {
3047 length = 1;
3048 ecode++;
3049 GETCHARLEN(fc, ecode, length);
3050 if (length > md->end_subject - eptr)
3051 {
3052 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3053 RRETURN(MATCH_NOMATCH);
3054 }
3055 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3056 }
3057 else
3058 #endif
3059 /* Not UTF mode */
3060 {
3061 if (md->end_subject - eptr < 1)
3062 {
3063 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3064 RRETURN(MATCH_NOMATCH);
3065 }
3066 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3067 ecode += 2;
3068 }
3069 break;
3070
3071 /* Match a single character, caselessly. If we are at the end of the
3072 subject, give up immediately. */
3073
3074 case OP_CHARI:
3075 if (eptr >= md->end_subject)
3076 {
3077 SCHECK_PARTIAL();
3078 RRETURN(MATCH_NOMATCH);
3079 }
3080
3081 #ifdef SUPPORT_UTF
3082 if (utf)
3083 {
3084 length = 1;
3085 ecode++;
3086 GETCHARLEN(fc, ecode, length);
3087
3088 /* If the pattern character's value is < 128, we have only one byte, and
3089 we know that its other case must also be one byte long, so we can use the
3090 fast lookup table. We know that there is at least one byte left in the
3091 subject. */
3092
3093 if (fc < 128)
3094 {
3095 if (md->lcc[fc]
3096 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3097 ecode++;
3098 eptr++;
3099 }
3100
3101 /* Otherwise we must pick up the subject character. Note that we cannot
3102 use the value of "length" to check for sufficient bytes left, because the
3103 other case of the character may have more or fewer bytes. */
3104
3105 else
3106 {
3107 unsigned int dc;
3108 GETCHARINC(dc, eptr);
3109 ecode += length;
3110
3111 /* If we have Unicode property support, we can use it to test the other
3112 case of the character, if there is one. */
3113
3114 if (fc != dc)
3115 {
3116 #ifdef SUPPORT_UCP
3117 if (dc != UCD_OTHERCASE(fc))
3118 #endif
3119 RRETURN(MATCH_NOMATCH);
3120 }
3121 }
3122 }
3123 else
3124 #endif /* SUPPORT_UTF */
3125
3126 /* Not UTF mode */
3127 {
3128 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3129 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3130 eptr++;
3131 ecode += 2;
3132 }
3133 break;
3134
3135 /* Match a single character repeatedly. */
3136
3137 case OP_EXACT:
3138 case OP_EXACTI:
3139 min = max = GET2(ecode, 1);
3140 ecode += 1 + IMM2_SIZE;
3141 goto REPEATCHAR;
3142
3143 case OP_POSUPTO:
3144 case OP_POSUPTOI:
3145 possessive = TRUE;
3146 /* Fall through */
3147
3148 case OP_UPTO:
3149 case OP_UPTOI:
3150 case OP_MINUPTO:
3151 case OP_MINUPTOI:
3152 min = 0;
3153 max = GET2(ecode, 1);
3154 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3155 ecode += 1 + IMM2_SIZE;
3156 goto REPEATCHAR;
3157
3158 case OP_POSSTAR:
3159 case OP_POSSTARI:
3160 possessive = TRUE;
3161 min = 0;
3162 max = INT_MAX;
3163 ecode++;
3164 goto REPEATCHAR;
3165
3166 case OP_POSPLUS:
3167 case OP_POSPLUSI:
3168 possessive = TRUE;
3169 min = 1;
3170 max = INT_MAX;
3171 ecode++;
3172 goto REPEATCHAR;
3173
3174 case OP_POSQUERY:
3175 case OP_POSQUERYI:
3176 possessive = TRUE;
3177 min = 0;
3178 max = 1;
3179 ecode++;
3180 goto REPEATCHAR;
3181
3182 case OP_STAR:
3183 case OP_STARI:
3184 case OP_MINSTAR:
3185 case OP_MINSTARI:
3186 case OP_PLUS:
3187 case OP_PLUSI:
3188 case OP_MINPLUS:
3189 case OP_MINPLUSI:
3190 case OP_QUERY:
3191 case OP_QUERYI:
3192 case OP_MINQUERY:
3193 case OP_MINQUERYI:
3194 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3195 minimize = (c & 1) != 0;
3196 min = rep_min[c]; /* Pick up values from tables; */
3197 max = rep_max[c]; /* zero for max => infinity */
3198 if (max == 0) max = INT_MAX;
3199
3200 /* Common code for all repeated single-character matches. */
3201
3202 REPEATCHAR:
3203 #ifdef SUPPORT_UTF
3204 if (utf)
3205 {
3206 length = 1;
3207 charptr = ecode;
3208 GETCHARLEN(fc, ecode, length);
3209 ecode += length;
3210
3211 /* Handle multibyte character matching specially here. There is
3212 support for caseless matching if UCP support is present. */
3213
3214 if (length > 1)
3215 {
3216 #ifdef SUPPORT_UCP
3217 unsigned int othercase;
3218 if (op >= OP_STARI && /* Caseless */
3219 (othercase = UCD_OTHERCASE(fc)) != fc)
3220 oclength = PRIV(ord2utf)(othercase, occhars);
3221 else oclength = 0;
3222 #endif /* SUPPORT_UCP */
3223
3224 for (i = 1; i <= min; i++)
3225 {
3226 if (eptr <= md->end_subject - length &&
3227 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3228 #ifdef SUPPORT_UCP
3229 else if (oclength > 0 &&
3230 eptr <= md->end_subject - oclength &&
3231 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3232 #endif /* SUPPORT_UCP */
3233 else
3234 {
3235 CHECK_PARTIAL();
3236 RRETURN(MATCH_NOMATCH);
3237 }
3238 }
3239
3240 if (min == max) continue;
3241
3242 if (minimize)
3243 {
3244 for (fi = min;; fi++)
3245 {
3246 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3247 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3248 if (fi >= max) RRETURN(MATCH_NOMATCH);
3249 if (eptr <= md->end_subject - length &&
3250 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3251 #ifdef SUPPORT_UCP
3252 else if (oclength > 0 &&
3253 eptr <= md->end_subject - oclength &&
3254 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3255 #endif /* SUPPORT_UCP */
3256 else
3257 {
3258 CHECK_PARTIAL();
3259 RRETURN(MATCH_NOMATCH);
3260 }
3261 }
3262 /* Control never gets here */
3263 }
3264
3265 else /* Maximize */
3266 {
3267 pp = eptr;
3268 for (i = min; i < max; i++)
3269 {
3270 if (eptr <= md->end_subject - length &&
3271 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3272 #ifdef SUPPORT_UCP
3273 else if (oclength > 0 &&
3274 eptr <= md->end_subject - oclength &&
3275 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3276 #endif /* SUPPORT_UCP */
3277 else
3278 {
3279 CHECK_PARTIAL();
3280 break;
3281 }
3282 }
3283
3284 if (possessive) continue;
3285
3286 for(;;)
3287 {
3288 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3289 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3290 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3291 #ifdef SUPPORT_UCP
3292 eptr--;
3293 BACKCHAR(eptr);
3294 #else /* without SUPPORT_UCP */
3295 eptr -= length;
3296 #endif /* SUPPORT_UCP */
3297 }
3298 }
3299 /* Control never gets here */
3300 }
3301
3302 /* If the length of a UTF-8 character is 1, we fall through here, and
3303 obey the code as for non-UTF-8 characters below, though in this case the
3304 value of fc will always be < 128. */
3305 }
3306 else
3307 #endif /* SUPPORT_UTF */
3308 /* When not in UTF-8 mode, load a single-byte character. */
3309 fc = *ecode++;
3310
3311 /* The value of fc at this point is always one character, though we may
3312 or may not be in UTF mode. The code is duplicated for the caseless and
3313 caseful cases, for speed, since matching characters is likely to be quite
3314 common. First, ensure the minimum number of matches are present. If min =
3315 max, continue at the same level without recursing. Otherwise, if
3316 minimizing, keep trying the rest of the expression and advancing one
3317 matching character if failing, up to the maximum. Alternatively, if
3318 maximizing, find the maximum number of characters and work backwards. */
3319
3320 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3321 max, eptr));
3322
3323 if (op >= OP_STARI) /* Caseless */
3324 {
3325 #ifdef COMPILE_PCRE8
3326 /* fc must be < 128 if UTF is enabled. */
3327 foc = md->fcc[fc];
3328 #else
3329 #ifdef SUPPORT_UTF
3330 #ifdef SUPPORT_UCP
3331 if (utf && fc > 127)
3332 foc = UCD_OTHERCASE(fc);
3333 #else
3334 if (utf && fc > 127)
3335 foc = fc;
3336 #endif /* SUPPORT_UCP */
3337 else
3338 #endif /* SUPPORT_UTF */
3339 foc = TABLE_GET(fc, md->fcc, fc);
3340 #endif /* COMPILE_PCRE8 */
3341
3342 for (i = 1; i <= min; i++)
3343 {
3344 if (eptr >= md->end_subject)
3345 {
3346 SCHECK_PARTIAL();
3347 RRETURN(MATCH_NOMATCH);
3348 }
3349 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3350 eptr++;
3351 }
3352 if (min == max) continue;
3353 if (minimize)
3354 {
3355 for (fi = min;; fi++)
3356 {
3357 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3358 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3359 if (fi >= max) RRETURN(MATCH_NOMATCH);
3360 if (eptr >= md->end_subject)
3361 {
3362 SCHECK_PARTIAL();
3363 RRETURN(MATCH_NOMATCH);
3364 }
3365 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3366 eptr++;
3367 }
3368 /* Control never gets here */
3369 }
3370 else /* Maximize */
3371 {
3372 pp = eptr;
3373 for (i = min; i < max; i++)
3374 {
3375 if (eptr >= md->end_subject)
3376 {
3377 SCHECK_PARTIAL();
3378 break;
3379 }
3380 if (fc != *eptr && foc != *eptr) break;
3381 eptr++;
3382 }
3383
3384 if (possessive) continue;
3385
3386 while (eptr >= pp)
3387 {
3388 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3389 eptr--;
3390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3391 }
3392 RRETURN(MATCH_NOMATCH);
3393 }
3394 /* Control never gets here */
3395 }
3396
3397 /* Caseful comparisons (includes all multi-byte characters) */
3398
3399 else
3400 {
3401 for (i = 1; i <= min; i++)
3402 {
3403 if (eptr >= md->end_subject)
3404 {
3405 SCHECK_PARTIAL();
3406 RRETURN(MATCH_NOMATCH);
3407 }
3408 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3409 }
3410
3411 if (min == max) continue;
3412
3413 if (minimize)
3414 {
3415 for (fi = min;; fi++)
3416 {
3417 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3418 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419 if (fi >= max) RRETURN(MATCH_NOMATCH);
3420 if (eptr >= md->end_subject)
3421 {
3422 SCHECK_PARTIAL();
3423 RRETURN(MATCH_NOMATCH);
3424 }
3425 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3426 }
3427 /* Control never gets here */
3428 }
3429 else /* Maximize */
3430 {
3431 pp = eptr;
3432 for (i = min; i < max; i++)
3433 {
3434 if (eptr >= md->end_subject)
3435 {
3436 SCHECK_PARTIAL();
3437 break;
3438 }
3439 if (fc != *eptr) break;
3440 eptr++;
3441 }
3442 if (possessive) continue;
3443
3444 while (eptr >= pp)
3445 {
3446 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3447 eptr--;
3448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3449 }
3450 RRETURN(MATCH_NOMATCH);
3451 }
3452 }
3453 /* Control never gets here */
3454
3455 /* Match a negated single one-byte character. The character we are
3456 checking can be multibyte. */
3457
3458 case OP_NOT:
3459 case OP_NOTI:
3460 if (eptr >= md->end_subject)
3461 {
3462 SCHECK_PARTIAL();
3463 RRETURN(MATCH_NOMATCH);
3464 }
3465 ecode++;
3466 GETCHARINCTEST(c, eptr);
3467 if (op == OP_NOTI) /* The caseless case */
3468 {
3469 register int ch, och;
3470 ch = *ecode++;
3471 #ifdef COMPILE_PCRE8
3472 /* ch must be < 128 if UTF is enabled. */
3473 och = md->fcc[ch];
3474 #else
3475 #ifdef SUPPORT_UTF
3476 #ifdef SUPPORT_UCP
3477 if (utf && ch > 127)
3478 och = UCD_OTHERCASE(ch);
3479 #else
3480 if (utf && ch > 127)
3481 och = ch;
3482 #endif /* SUPPORT_UCP */
3483 else
3484 #endif /* SUPPORT_UTF */
3485 och = TABLE_GET(ch, md->fcc, ch);
3486 #endif /* COMPILE_PCRE8 */
3487 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3488 }
3489 else /* Caseful */
3490 {
3491 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3492 }
3493 break;
3494
3495 /* Match a negated single one-byte character repeatedly. This is almost a
3496 repeat of the code for a repeated single character, but I haven't found a
3497 nice way of commoning these up that doesn't require a test of the
3498 positive/negative option for each character match. Maybe that wouldn't add
3499 very much to the time taken, but character matching *is* what this is all
3500 about... */
3501
3502 case OP_NOTEXACT:
3503 case OP_NOTEXACTI:
3504 min = max = GET2(ecode, 1);
3505 ecode += 1 + IMM2_SIZE;
3506 goto REPEATNOTCHAR;
3507
3508 case OP_NOTUPTO:
3509 case OP_NOTUPTOI:
3510 case OP_NOTMINUPTO:
3511 case OP_NOTMINUPTOI:
3512 min = 0;
3513 max = GET2(ecode, 1);
3514 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3515 ecode += 1 + IMM2_SIZE;
3516 goto REPEATNOTCHAR;
3517
3518 case OP_NOTPOSSTAR:
3519 case OP_NOTPOSSTARI:
3520 possessive = TRUE;
3521 min = 0;
3522 max = INT_MAX;
3523 ecode++;
3524 goto REPEATNOTCHAR;
3525
3526 case OP_NOTPOSPLUS:
3527 case OP_NOTPOSPLUSI:
3528 possessive = TRUE;
3529 min = 1;
3530 max = INT_MAX;
3531 ecode++;
3532 goto REPEATNOTCHAR;
3533
3534 case OP_NOTPOSQUERY:
3535 case OP_NOTPOSQUERYI:
3536 possessive = TRUE;
3537 min = 0;
3538 max = 1;
3539 ecode++;
3540 goto REPEATNOTCHAR;
3541
3542 case OP_NOTPOSUPTO:
3543 case OP_NOTPOSUPTOI:
3544 possessive = TRUE;
3545 min = 0;
3546 max = GET2(ecode, 1);
3547 ecode += 1 + IMM2_SIZE;
3548 goto REPEATNOTCHAR;
3549
3550 case OP_NOTSTAR:
3551 case OP_NOTSTARI:
3552 case OP_NOTMINSTAR:
3553 case OP_NOTMINSTARI:
3554 case OP_NOTPLUS:
3555 case OP_NOTPLUSI:
3556 case OP_NOTMINPLUS:
3557 case OP_NOTMINPLUSI:
3558 case OP_NOTQUERY:
3559 case OP_NOTQUERYI:
3560 case OP_NOTMINQUERY:
3561 case OP_NOTMINQUERYI:
3562 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3563 minimize = (c & 1) != 0;
3564 min = rep_min[c]; /* Pick up values from tables; */
3565 max = rep_max[c]; /* zero for max => infinity */
3566 if (max == 0) max = INT_MAX;
3567
3568 /* Common code for all repeated single-byte matches. */
3569
3570 REPEATNOTCHAR:
3571 fc = *ecode++;
3572
3573 /* The code is duplicated for the caseless and caseful cases, for speed,
3574 since matching characters is likely to be quite common. First, ensure the
3575 minimum number of matches are present. If min = max, continue at the same
3576 level without recursing. Otherwise, if minimizing, keep trying the rest of
3577 the expression and advancing one matching character if failing, up to the
3578 maximum. Alternatively, if maximizing, find the maximum number of
3579 characters and work backwards. */
3580
3581 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3582 max, eptr));
3583
3584 if (op >= OP_NOTSTARI) /* Caseless */
3585 {
3586 #ifdef COMPILE_PCRE8
3587 /* fc must be < 128 if UTF is enabled. */
3588 foc = md->fcc[fc];
3589 #else
3590 #ifdef SUPPORT_UTF
3591 #ifdef SUPPORT_UCP
3592 if (utf && fc > 127)
3593 foc = UCD_OTHERCASE(fc);
3594 #else
3595 if (utf && fc > 127)
3596 foc = fc;
3597 #endif /* SUPPORT_UCP */
3598 else
3599 #endif /* SUPPORT_UTF */
3600 foc = TABLE_GET(fc, md->fcc, fc);
3601 #endif /* COMPILE_PCRE8 */
3602
3603 #ifdef SUPPORT_UTF
3604 if (utf)
3605 {
3606 register unsigned int d;
3607 for (i = 1; i <= min; i++)
3608 {
3609 if (eptr >= md->end_subject)
3610 {
3611 SCHECK_PARTIAL();
3612 RRETURN(MATCH_NOMATCH);
3613 }
3614 GETCHARINC(d, eptr);
3615 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3616 }
3617 }
3618 else
3619 #endif
3620 /* Not UTF mode */
3621 {
3622 for (i = 1; i <= min; i++)
3623 {
3624 if (eptr >= md->end_subject)
3625 {
3626 SCHECK_PARTIAL();
3627 RRETURN(MATCH_NOMATCH);
3628 }
3629 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3630 eptr++;
3631 }
3632 }
3633
3634 if (min == max) continue;
3635
3636 if (minimize)
3637 {
3638 #ifdef SUPPORT_UTF
3639 if (utf)
3640 {
3641 register unsigned int d;
3642 for (fi = min;; fi++)
3643 {
3644 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3645 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3646 if (fi >= max) RRETURN(MATCH_NOMATCH);
3647 if (eptr >= md->end_subject)
3648 {
3649 SCHECK_PARTIAL();
3650 RRETURN(MATCH_NOMATCH);
3651 }
3652 GETCHARINC(d, eptr);
3653 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3654 }
3655 }
3656 else
3657 #endif
3658 /* Not UTF mode */
3659 {
3660 for (fi = min;; fi++)
3661 {
3662 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3663 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3664 if (fi >= max) RRETURN(MATCH_NOMATCH);
3665 if (eptr >= md->end_subject)
3666 {
3667 SCHECK_PARTIAL();
3668 RRETURN(MATCH_NOMATCH);
3669 }
3670 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3671 eptr++;
3672 }
3673 }
3674 /* Control never gets here */
3675 }
3676
3677 /* Maximize case */
3678
3679 else
3680 {
3681 pp = eptr;
3682
3683 #ifdef SUPPORT_UTF
3684 if (utf)
3685 {
3686 register unsigned int d;
3687 for (i = min; i < max; i++)
3688 {
3689 int len = 1;
3690 if (eptr >= md->end_subject)
3691 {
3692 SCHECK_PARTIAL();
3693 break;
3694 }
3695 GETCHARLEN(d, eptr, len);
3696 if (fc == d || foc == d) break;
3697 eptr += len;
3698 }
3699 if (possessive) continue;
3700 for(;;)
3701 {
3702 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3703 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3704 if (eptr-- == pp) break; /* Stop if tried at original pos */
3705 BACKCHAR(eptr);
3706 }
3707 }
3708 else
3709 #endif
3710 /* Not UTF mode */
3711 {
3712 for (i = min; i < max; i++)
3713 {
3714 if (eptr >= md->end_subject)
3715 {
3716 SCHECK_PARTIAL();
3717 break;
3718 }
3719 if (fc == *eptr || foc == *eptr) break;
3720 eptr++;
3721 }
3722 if (possessive) continue;
3723 while (eptr >= pp)
3724 {
3725 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3726 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3727 eptr--;
3728 }
3729 }
3730
3731 RRETURN(MATCH_NOMATCH);
3732 }
3733 /* Control never gets here */
3734 }
3735
3736 /* Caseful comparisons */
3737
3738 else
3739 {
3740 #ifdef SUPPORT_UTF
3741 if (utf)
3742 {
3743 register unsigned int d;
3744 for (i = 1; i <= min; i++)
3745 {
3746 if (eptr >= md->end_subject)
3747 {
3748 SCHECK_PARTIAL();
3749 RRETURN(MATCH_NOMATCH);
3750 }
3751 GETCHARINC(d, eptr);
3752 if (fc == d) RRETURN(MATCH_NOMATCH);
3753 }
3754 }
3755 else
3756 #endif
3757 /* Not UTF mode */
3758 {
3759 for (i = 1; i <= min; i++)
3760 {
3761 if (eptr >= md->end_subject)
3762 {
3763 SCHECK_PARTIAL();
3764 RRETURN(MATCH_NOMATCH);
3765 }
3766 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3767 }
3768 }
3769
3770 if (min == max) continue;
3771
3772 if (minimize)
3773 {
3774 #ifdef SUPPORT_UTF
3775 if (utf)
3776 {
3777 register unsigned int d;
3778 for (fi = min;; fi++)
3779 {
3780 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3781 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3782 if (fi >= max) RRETURN(MATCH_NOMATCH);
3783 if (eptr >= md->end_subject)
3784 {
3785 SCHECK_PARTIAL();
3786 RRETURN(MATCH_NOMATCH);
3787 }
3788 GETCHARINC(d, eptr);
3789 if (fc == d) RRETURN(MATCH_NOMATCH);
3790 }
3791 }
3792 else
3793 #endif
3794 /* Not UTF mode */
3795 {
3796 for (fi = min;; fi++)
3797 {
3798 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3800 if (fi >= max) RRETURN(MATCH_NOMATCH);
3801 if (eptr >= md->end_subject)
3802 {
3803 SCHECK_PARTIAL();
3804 RRETURN(MATCH_NOMATCH);
3805 }
3806 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3807 }
3808 }
3809 /* Control never gets here */
3810 }
3811
3812 /* Maximize case */
3813
3814 else
3815 {
3816 pp = eptr;
3817
3818 #ifdef SUPPORT_UTF
3819 if (utf)
3820 {
3821 register unsigned int d;
3822 for (i = min; i < max; i++)
3823 {
3824 int len = 1;
3825 if (eptr >= md->end_subject)
3826 {
3827 SCHECK_PARTIAL();
3828 break;
3829 }
3830 GETCHARLEN(d, eptr, len);
3831 if (fc == d) break;
3832 eptr += len;
3833 }
3834 if (possessive) continue;
3835 for(;;)
3836 {
3837 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3838 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3839 if (eptr-- == pp) break; /* Stop if tried at original pos */
3840 BACKCHAR(eptr);
3841 }
3842 }
3843 else
3844 #endif
3845 /* Not UTF mode */
3846 {
3847 for (i = min; i < max; i++)
3848 {
3849 if (eptr >= md->end_subject)
3850 {
3851 SCHECK_PARTIAL();
3852 break;
3853 }
3854 if (fc == *eptr) break;
3855 eptr++;
3856 }
3857 if (possessive) continue;
3858 while (eptr >= pp)
3859 {
3860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3862 eptr--;
3863 }
3864 }
3865
3866 RRETURN(MATCH_NOMATCH);
3867 }
3868 }
3869 /* Control never gets here */
3870
3871 /* Match a single character type repeatedly; several different opcodes
3872 share code. This is very similar to the code for single characters, but we
3873 repeat it in the interests of efficiency. */
3874
3875 case OP_TYPEEXACT:
3876 min = max = GET2(ecode, 1);
3877 minimize = TRUE;
3878 ecode += 1 + IMM2_SIZE;
3879 goto REPEATTYPE;
3880
3881 case OP_TYPEUPTO:
3882 case OP_TYPEMINUPTO:
3883 min = 0;
3884 max = GET2(ecode, 1);
3885 minimize = *ecode == OP_TYPEMINUPTO;
3886 ecode += 1 + IMM2_SIZE;
3887 goto REPEATTYPE;
3888
3889 case OP_TYPEPOSSTAR:
3890 possessive = TRUE;
3891 min = 0;
3892 max = INT_MAX;
3893 ecode++;
3894 goto REPEATTYPE;
3895
3896 case OP_TYPEPOSPLUS:
3897 possessive = TRUE;
3898 min = 1;
3899 max = INT_MAX;
3900 ecode++;
3901 goto REPEATTYPE;
3902
3903 case OP_TYPEPOSQUERY:
3904 possessive = TRUE;
3905 min = 0;
3906 max = 1;
3907 ecode++;
3908 goto REPEATTYPE;
3909
3910 case OP_TYPEPOSUPTO:
3911 possessive = TRUE;
3912 min = 0;
3913 max = GET2(ecode, 1);
3914 ecode += 1 + IMM2_SIZE;
3915 goto REPEATTYPE;
3916
3917 case OP_TYPESTAR:
3918 case OP_TYPEMINSTAR:
3919 case OP_TYPEPLUS:
3920 case OP_TYPEMINPLUS:
3921 case OP_TYPEQUERY:
3922 case OP_TYPEMINQUERY:
3923 c = *ecode++ - OP_TYPESTAR;
3924 minimize = (c & 1) != 0;
3925 min = rep_min[c]; /* Pick up values from tables; */
3926 max = rep_max[c]; /* zero for max => infinity */
3927 if (max == 0) max = INT_MAX;
3928
3929 /* Common code for all repeated single character type matches. Note that
3930 in UTF-8 mode, '.' matches a character of any length, but for the other
3931 character types, the valid characters are all one-byte long. */
3932
3933 REPEATTYPE:
3934 ctype = *ecode++; /* Code for the character type */
3935
3936 #ifdef SUPPORT_UCP
3937 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3938 {
3939 prop_fail_result = ctype == OP_NOTPROP;
3940 prop_type = *ecode++;
3941 prop_value = *ecode++;
3942 }
3943 else prop_type = -1;
3944 #endif
3945
3946 /* First, ensure the minimum number of matches are present. Use inline
3947 code for maximizing the speed, and do the type test once at the start
3948 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3949 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3950 and single-bytes. */
3951
3952 if (min > 0)
3953 {
3954 #ifdef SUPPORT_UCP
3955 if (prop_type >= 0)
3956 {
3957 switch(prop_type)
3958 {
3959 case PT_ANY:
3960 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3961 for (i = 1; i <= min; i++)
3962 {
3963 if (eptr >= md->end_subject)
3964 {
3965 SCHECK_PARTIAL();
3966 RRETURN(MATCH_NOMATCH);
3967 }
3968 GETCHARINCTEST(c, eptr);
3969 }
3970 break;
3971
3972 case PT_LAMP:
3973 for (i = 1; i <= min; i++)
3974 {
3975 int chartype;
3976 if (eptr >= md->end_subject)
3977 {
3978 SCHECK_PARTIAL();
3979 RRETURN(MATCH_NOMATCH);
3980 }
3981 GETCHARINCTEST(c, eptr);
3982 chartype = UCD_CHARTYPE(c);
3983 if ((chartype == ucp_Lu ||
3984 chartype == ucp_Ll ||
3985 chartype == ucp_Lt) == prop_fail_result)
3986 RRETURN(MATCH_NOMATCH);
3987 }
3988 break;
3989
3990 case PT_GC:
3991 for (i = 1; i <= min; i++)
3992 {
3993 if (eptr >= md->end_subject)
3994 {
3995 SCHECK_PARTIAL();
3996 RRETURN(MATCH_NOMATCH);
3997 }
3998 GETCHARINCTEST(c, eptr);
3999 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4000 RRETURN(MATCH_NOMATCH);
4001 }
4002 break;
4003
4004 case PT_PC:
4005 for (i = 1; i <= min; i++)
4006 {
4007 if (eptr >= md->end_subject)
4008 {
4009 SCHECK_PARTIAL();
4010 RRETURN(MATCH_NOMATCH);
4011 }
4012 GETCHARINCTEST(c, eptr);
4013 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4014 RRETURN(MATCH_NOMATCH);
4015 }
4016 break;
4017
4018 case PT_SC:
4019 for (i = 1; i <= min; i++)
4020 {
4021 if (eptr >= md->end_subject)
4022 {
4023 SCHECK_PARTIAL();
4024 RRETURN(MATCH_NOMATCH);
4025 }
4026 GETCHARINCTEST(c, eptr);
4027 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4028 RRETURN(MATCH_NOMATCH);
4029 }
4030 break;
4031
4032 case PT_ALNUM:
4033 for (i = 1; i <= min; i++)
4034 {
4035 int category;
4036 if (eptr >= md->end_subject)
4037 {
4038 SCHECK_PARTIAL();
4039 RRETURN(MATCH_NOMATCH);
4040 }
4041 GETCHARINCTEST(c, eptr);
4042 category = UCD_CATEGORY(c);
4043 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4044 RRETURN(MATCH_NOMATCH);
4045 }
4046 break;
4047
4048 case PT_SPACE: /* Perl space */
4049 for (i = 1; i <= min; i++)
4050 {
4051 if (eptr >= md->end_subject)
4052 {
4053 SCHECK_PARTIAL();
4054 RRETURN(MATCH_NOMATCH);
4055 }
4056 GETCHARINCTEST(c, eptr);
4057 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4058 c == CHAR_FF || c == CHAR_CR)
4059 == prop_fail_result)
4060 RRETURN(MATCH_NOMATCH);
4061 }
4062 break;
4063
4064 case PT_PXSPACE: /* POSIX space */
4065 for (i = 1; i <= min; i++)
4066 {
4067 if (eptr >= md->end_subject)
4068 {
4069 SCHECK_PARTIAL();
4070 RRETURN(MATCH_NOMATCH);
4071 }
4072 GETCHARINCTEST(c, eptr);
4073 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4074 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4075 == prop_fail_result)
4076 RRETURN(MATCH_NOMATCH);
4077 }
4078 break;
4079
4080 case PT_WORD:
4081 for (i = 1; i <= min; i++)
4082 {
4083 int category;
4084 if (eptr >= md->end_subject)
4085 {
4086 SCHECK_PARTIAL();
4087 RRETURN(MATCH_NOMATCH);
4088 }
4089 GETCHARINCTEST(c, eptr);
4090 category = UCD_CATEGORY(c);
4091 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4092 == prop_fail_result)
4093 RRETURN(MATCH_NOMATCH);
4094 }
4095 break;
4096
4097 /* This should not occur */
4098
4099 default:
4100 RRETURN(PCRE_ERROR_INTERNAL);
4101 }
4102 }
4103
4104 /* Match extended Unicode sequences. We will get here only if the
4105 support is in the binary; otherwise a compile-time error occurs. */
4106
4107 else if (ctype == OP_EXTUNI)
4108 {
4109 for (i = 1; i <= min; i++)
4110 {
4111 if (eptr >= md->end_subject)
4112 {
4113 SCHECK_PARTIAL();
4114 RRETURN(MATCH_NOMATCH);
4115 }
4116 GETCHARINCTEST(c, eptr);
4117 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4118 while (eptr < md->end_subject)
4119 {
4120 int len = 1;
4121 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4122 if (UCD_CATEGORY(c) != ucp_M) break;
4123 eptr += len;
4124 }
4125 }
4126 }
4127
4128 else
4129 #endif /* SUPPORT_UCP */
4130
4131 /* Handle all other cases when the coding is UTF-8 */
4132
4133 #ifdef SUPPORT_UTF
4134 if (utf) switch(ctype)
4135 {
4136 case OP_ANY:
4137 for (i = 1; i <= min; i++)
4138 {
4139 if (eptr >= md->end_subject)
4140 {
4141 SCHECK_PARTIAL();
4142 RRETURN(MATCH_NOMATCH);
4143 }
4144 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4145 eptr++;
4146 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4147 }
4148 break;
4149
4150 case OP_ALLANY:
4151 for (i = 1; i <= min; i++)
4152 {
4153 if (eptr >= md->end_subject)
4154 {
4155 SCHECK_PARTIAL();
4156 RRETURN(MATCH_NOMATCH);
4157 }
4158 eptr++;
4159 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4160 }
4161 break;
4162
4163 case OP_ANYBYTE:
4164 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4165 eptr += min;
4166 break;
4167
4168 case OP_ANYNL:
4169 for (i = 1; i <= min; i++)
4170 {
4171 if (eptr >= md->end_subject)
4172 {
4173 SCHECK_PARTIAL();
4174 RRETURN(MATCH_NOMATCH);
4175 }
4176 GETCHARINC(c, eptr);
4177 switch(c)
4178 {
4179 default: RRETURN(MATCH_NOMATCH);
4180
4181 case 0x000d:
4182 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4183 break;
4184
4185 case 0x000a:
4186 break;
4187
4188 case 0x000b:
4189 case 0x000c:
4190 case 0x0085:
4191 case 0x2028:
4192 case 0x2029:
4193 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4194 break;
4195 }
4196 }
4197 break;
4198
4199 case OP_NOT_HSPACE:
4200 for (i = 1; i <= min; i++)
4201 {
4202 if (eptr >= md->end_subject)
4203 {
4204 SCHECK_PARTIAL();
4205 RRETURN(MATCH_NOMATCH);
4206 }
4207 GETCHARINC(c, eptr);
4208 switch(c)
4209 {
4210 default: break;
4211 case 0x09: /* HT */
4212 case 0x20: /* SPACE */
4213 case 0xa0: /* NBSP */
4214 case 0x1680: /* OGHAM SPACE MARK */
4215 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4216 case 0x2000: /* EN QUAD */
4217 case 0x2001: /* EM QUAD */
4218 case 0x2002: /* EN SPACE */
4219 case 0x2003: /* EM SPACE */
4220 case 0x2004: /* THREE-PER-EM SPACE */
4221 case 0x2005: /* FOUR-PER-EM SPACE */
4222 case 0x2006: /* SIX-PER-EM SPACE */
4223 case 0x2007: /* FIGURE SPACE */
4224 case 0x2008: /* PUNCTUATION SPACE */
4225 case 0x2009: /* THIN SPACE */
4226 case 0x200A: /* HAIR SPACE */
4227 case 0x202f: /* NARROW NO-BREAK SPACE */
4228 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4229 case 0x3000: /* IDEOGRAPHIC SPACE */
4230 RRETURN(MATCH_NOMATCH);
4231 }
4232 }
4233 break;
4234
4235 case OP_HSPACE:
4236 for (i = 1; i <= min; i++)
4237 {
4238 if (eptr >= md->end_subject)
4239 {
4240 SCHECK_PARTIAL();
4241 RRETURN(MATCH_NOMATCH);
4242 }
4243 GETCHARINC(c, eptr);
4244 switch(c)
4245 {
4246 default: RRETURN(MATCH_NOMATCH);
4247 case 0x09: /* HT */
4248 case 0x20: /* SPACE */
4249 case 0xa0: /* NBSP */
4250 case 0x1680: /* OGHAM SPACE MARK */
4251 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4252 case 0x2000: /* EN QUAD */
4253 case 0x2001: /* EM QUAD */
4254 case 0x2002: /* EN SPACE */
4255 case 0x2003: /* EM SPACE */
4256 case 0x2004: /* THREE-PER-EM SPACE */
4257 case 0x2005: /* FOUR-PER-EM SPACE */
4258 case 0x2006: /* SIX-PER-EM SPACE */
4259 case 0x2007: /* FIGURE SPACE */
4260 case 0x2008: /* PUNCTUATION SPACE */
4261 case 0x2009: /* THIN SPACE */
4262 case 0x200A: /* HAIR SPACE */
4263 case 0x202f: /* NARROW NO-BREAK SPACE */
4264 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4265 case 0x3000: /* IDEOGRAPHIC SPACE */
4266 break;
4267 }
4268 }
4269 break;
4270
4271 case OP_NOT_VSPACE:
4272 for (i = 1; i <= min; i++)
4273 {
4274 if (eptr >= md->end_subject)
4275 {
4276 SCHECK_PARTIAL();
4277 RRETURN(MATCH_NOMATCH);
4278 }
4279 GETCHARINC(c, eptr);
4280 switch(c)
4281 {
4282 default: break;
4283 case 0x0a: /* LF */
4284 case 0x0b: /* VT */
4285 case 0x0c: /* FF */
4286 case 0x0d: /* CR */
4287 case 0x85: /* NEL */
4288 case 0x2028: /* LINE SEPARATOR */
4289 case 0x2029: /* PARAGRAPH SEPARATOR */
4290 RRETURN(MATCH_NOMATCH);
4291 }
4292 }
4293 break;
4294
4295 case OP_VSPACE:
4296 for (i = 1; i <= min; i++)
4297 {
4298 if (eptr >= md->end_subject)
4299 {
4300 SCHECK_PARTIAL();
4301 RRETURN(MATCH_NOMATCH);
4302 }
4303 GETCHARINC(c, eptr);
4304 switch(c)
4305 {
4306 default: RRETURN(MATCH_NOMATCH);
4307 case 0x0a: /* LF */
4308 case 0x0b: /* VT */
4309 case 0x0c: /* FF */
4310 case 0x0d: /* CR */
4311 case 0x85: /* NEL */
4312 case 0x2028: /* LINE SEPARATOR */
4313 case 0x2029: /* PARAGRAPH SEPARATOR */
4314 break;
4315 }
4316 }
4317 break;
4318
4319 case OP_NOT_DIGIT:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 RRETURN(MATCH_NOMATCH);
4326 }
4327 GETCHARINC(c, eptr);
4328 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4329 RRETURN(MATCH_NOMATCH);
4330 }
4331 break;
4332
4333 case OP_DIGIT:
4334 for (i = 1; i <= min; i++)
4335 {
4336 if (eptr >= md->end_subject)
4337 {
4338 SCHECK_PARTIAL();
4339 RRETURN(MATCH_NOMATCH);
4340 }
4341 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4342 RRETURN(MATCH_NOMATCH);
4343 eptr++;
4344 /* No need to skip more bytes - we know it's a 1-byte character */
4345 }
4346 break;
4347
4348 case OP_NOT_WHITESPACE:
4349 for (i = 1; i <= min; i++)
4350 {
4351 if (eptr >= md->end_subject)
4352 {
4353 SCHECK_PARTIAL();
4354 RRETURN(MATCH_NOMATCH);
4355 }
4356 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4357 RRETURN(MATCH_NOMATCH);
4358 eptr++;
4359 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4360 }
4361 break;
4362
4363 case OP_WHITESPACE:
4364 for (i = 1; i <= min; i++)
4365 {
4366 if (eptr >= md->end_subject)
4367 {
4368 SCHECK_PARTIAL();
4369 RRETURN(MATCH_NOMATCH);
4370 }
4371 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4372 RRETURN(MATCH_NOMATCH);
4373 eptr++;
4374 /* No need to skip more bytes - we know it's a 1-byte character */
4375 }
4376 break;
4377
4378 case OP_NOT_WORDCHAR:
4379 for (i = 1; i <= min; i++)
4380 {
4381 if (eptr >= md->end_subject)
4382 {
4383 SCHECK_PARTIAL();
4384 RRETURN(MATCH_NOMATCH);
4385 }
4386 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4387 RRETURN(MATCH_NOMATCH);
4388 eptr++;
4389 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4390 }
4391 break;
4392
4393 case OP_WORDCHAR:
4394 for (i = 1; i <= min; i++)
4395 {
4396 if (eptr >= md->end_subject)
4397 {
4398 SCHECK_PARTIAL();
4399 RRETURN(MATCH_NOMATCH);
4400 }
4401 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4402 RRETURN(MATCH_NOMATCH);
4403 eptr++;
4404 /* No need to skip more bytes - we know it's a 1-byte character */
4405 }
4406 break;
4407
4408 default:
4409 RRETURN(PCRE_ERROR_INTERNAL);
4410 } /* End switch(ctype) */
4411
4412 else
4413 #endif /* SUPPORT_UTF */
4414
4415 /* Code for the non-UTF-8 case for minimum matching of operators other
4416 than OP_PROP and OP_NOTPROP. */
4417
4418 switch(ctype)
4419 {
4420 case OP_ANY:
4421 for (i = 1; i <= min; i++)
4422 {
4423 if (eptr >= md->end_subject)
4424 {
4425 SCHECK_PARTIAL();
4426 RRETURN(MATCH_NOMATCH);
4427 }
4428 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4429 eptr++;
4430 }
4431 break;
4432
4433 case OP_ALLANY:
4434 if (eptr > md->end_subject - min)
4435 {
4436 SCHECK_PARTIAL();
4437 RRETURN(MATCH_NOMATCH);
4438 }
4439 eptr += min;
4440 break;
4441
4442 case OP_ANYBYTE:
4443 if (eptr > md->end_subject - min)
4444 {
4445 SCHECK_PARTIAL();
4446 RRETURN(MATCH_NOMATCH);
4447 }
4448 eptr += min;
4449 break;
4450
4451 case OP_ANYNL:
4452 for (i = 1; i <= min; i++)
4453 {
4454 if (eptr >= md->end_subject)
4455 {
4456 SCHECK_PARTIAL();
4457 RRETURN(MATCH_NOMATCH);
4458 }
4459 switch(*eptr++)
4460 {
4461 default: RRETURN(MATCH_NOMATCH);
4462
4463 case 0x000d:
4464 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4465 break;
4466
4467 case 0x000a:
4468 break;
4469
4470 case 0x000b:
4471 case 0x000c:
4472 case 0x0085:
4473 #ifdef COMPILE_PCRE16
4474 case 0x2028:
4475 case 0x2029:
4476 #endif
4477 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4478 break;
4479 }
4480 }
4481 break;
4482
4483 case OP_NOT_HSPACE:
4484 for (i = 1; i <= min; i++)
4485 {
4486 if (eptr >= md->end_subject)
4487 {
4488 SCHECK_PARTIAL();
4489 RRETURN(MATCH_NOMATCH);
4490 }
4491 switch(*eptr++)
4492 {
4493 default: break;
4494 case 0x09: /* HT */
4495 case 0x20: /* SPACE */
4496 case 0xa0: /* NBSP */
4497 #ifdef COMPILE_PCRE16
4498 case 0x1680: /* OGHAM SPACE MARK */
4499 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4500 case 0x2000: /* EN QUAD */
4501 case 0x2001: /* EM QUAD */
4502 case 0x2002: /* EN SPACE */
4503 case 0x2003: /* EM SPACE */
4504 case 0x2004: /* THREE-PER-EM SPACE */
4505 case 0x2005: /* FOUR-PER-EM SPACE */
4506 case 0x2006: /* SIX-PER-EM SPACE */
4507 case 0x2007: /* FIGURE SPACE */
4508 case 0x2008: /* PUNCTUATION SPACE */
4509 case 0x2009: /* THIN SPACE */
4510 case 0x200A: /* HAIR SPACE */
4511 case 0x202f: /* NARROW NO-BREAK SPACE */
4512 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4513 case 0x3000: /* IDEOGRAPHIC SPACE */
4514 #endif
4515 RRETURN(MATCH_NOMATCH);
4516 }
4517 }
4518 break;
4519
4520 case OP_HSPACE:
4521 for (i = 1; i <= min; i++)
4522 {
4523 if (eptr >= md->end_subject)
4524 {
4525 SCHECK_PARTIAL();
4526 RRETURN(MATCH_NOMATCH);
4527 }
4528 switch(*eptr++)
4529 {
4530 default: RRETURN(MATCH_NOMATCH);
4531 case 0x09: /* HT */
4532 case 0x20: /* SPACE */
4533 case 0xa0: /* NBSP */
4534 #ifdef COMPILE_PCRE16
4535 case 0x1680: /* OGHAM SPACE MARK */
4536 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4537 case 0x2000: /* EN QUAD */
4538 case 0x2001: /* EM QUAD */
4539 case 0x2002: /* EN SPACE */
4540 case 0x2003: /* EM SPACE */
4541 case 0x2004: /* THREE-PER-EM SPACE */
4542 case 0x2005: /* FOUR-PER-EM SPACE */
4543 case 0x2006: /* SIX-PER-EM SPACE */
4544 case 0x2007: /* FIGURE SPACE */
4545 case 0x2008: /* PUNCTUATION SPACE */
4546 case 0x2009: /* THIN SPACE */
4547 case 0x200A: /* HAIR SPACE */
4548 case 0x202f: /* NARROW NO-BREAK SPACE */
4549 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4550 case 0x3000: /* IDEOGRAPHIC SPACE */
4551 #endif
4552 break;
4553 }
4554 }
4555 break;
4556
4557 case OP_NOT_VSPACE:
4558 for (i = 1; i <= min; i++)
4559 {
4560 if (eptr >= md->end_subject)
4561 {
4562 SCHECK_PARTIAL();
4563 RRETURN(MATCH_NOMATCH);
4564 }
4565 switch(*eptr++)
4566 {
4567 default: break;
4568 case 0x0a: /* LF */
4569 case 0x0b: /* VT */
4570 case 0x0c: /* FF */
4571 case 0x0d: /* CR */
4572 case 0x85: /* NEL */
4573 #ifdef COMPILE_PCRE16
4574 case 0x2028: /* LINE SEPARATOR */
4575 case 0x2029: /* PARAGRAPH SEPARATOR */
4576 #endif
4577 RRETURN(MATCH_NOMATCH);
4578 }
4579 }
4580 break;
4581
4582 case OP_VSPACE:
4583 for (i = 1; i <= min; i++)
4584 {
4585 if (eptr >= md->end_subject)
4586 {
4587 SCHECK_PARTIAL();
4588 RRETURN(MATCH_NOMATCH);
4589 }
4590 switch(*eptr++)
4591 {
4592 default: RRETURN(MATCH_NOMATCH);
4593 case 0x0a: /* LF */
4594 case 0x0b: /* VT */
4595 case 0x0c: /* FF */
4596 case 0x0d: /* CR */
4597 case 0x85: /* NEL */
4598 #ifdef COMPILE_PCRE16
4599 case 0x2028: /* LINE SEPARATOR */
4600 case 0x2029: /* PARAGRAPH SEPARATOR */
4601 #endif
4602 break;
4603 }
4604 }
4605 break;
4606
4607 case OP_NOT_DIGIT:
4608 for (i = 1; i <= min; i++)
4609 {
4610 if (eptr >= md->end_subject)
4611 {
4612 SCHECK_PARTIAL();
4613 RRETURN(MATCH_NOMATCH);
4614 }
4615 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4616 RRETURN(MATCH_NOMATCH);
4617 eptr++;
4618 }
4619 break;
4620
4621 case OP_DIGIT:
4622 for (i = 1; i <= min; i++)
4623 {
4624 if (eptr >= md->end_subject)
4625 {
4626 SCHECK_PARTIAL();
4627 RRETURN(MATCH_NOMATCH);
4628 }
4629 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4630 RRETURN(MATCH_NOMATCH);
4631 eptr++;
4632 }
4633 break;
4634
4635 case OP_NOT_WHITESPACE:
4636 for (i = 1; i <= min; i++)
4637 {
4638 if (eptr >= md->end_subject)
4639 {
4640 SCHECK_PARTIAL();
4641 RRETURN(MATCH_NOMATCH);
4642 }
4643 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4644 RRETURN(MATCH_NOMATCH);
4645 eptr++;
4646 }
4647 break;
4648
4649 case OP_WHITESPACE:
4650 for (i = 1; i <= min; i++)
4651 {
4652 if (eptr >= md->end_subject)
4653 {
4654 SCHECK_PARTIAL();
4655 RRETURN(MATCH_NOMATCH);
4656 }
4657 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4658 RRETURN(MATCH_NOMATCH);
4659 eptr++;
4660 }
4661 break;
4662
4663 case OP_NOT_WORDCHAR:
4664 for (i = 1; i <= min; i++)
4665 {
4666 if (eptr >= md->end_subject)
4667 {
4668 SCHECK_PARTIAL();
4669 RRETURN(MATCH_NOMATCH);
4670 }
4671 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4672 RRETURN(MATCH_NOMATCH);
4673 eptr++;
4674 }
4675 break;
4676
4677 case OP_WORDCHAR:
4678 for (i = 1; i <= min; i++)
4679 {
4680 if (eptr >= md->end_subject)
4681 {
4682 SCHECK_PARTIAL();
4683 RRETURN(MATCH_NOMATCH);
4684 }
4685 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4686 RRETURN(MATCH_NOMATCH);
4687 eptr++;
4688 }
4689 break;
4690
4691 default:
4692 RRETURN(PCRE_ERROR_INTERNAL);
4693 }
4694 }
4695
4696 /* If min = max, continue at the same level without recursing */
4697
4698 if (min == max) continue;
4699
4700 /* If minimizing, we have to test the rest of the pattern before each
4701 subsequent match. Again, separate the UTF-8 case for speed, and also
4702 separate the UCP cases. */
4703
4704 if (minimize)
4705 {
4706 #ifdef SUPPORT_UCP
4707 if (prop_type >= 0)
4708 {
4709 switch(prop_type)
4710 {
4711 case PT_ANY:
4712 for (fi = min;; fi++)
4713 {
4714 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4715 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4716 if (fi >= max) RRETURN(MATCH_NOMATCH);
4717 if (eptr >= md->end_subject)
4718 {
4719 SCHECK_PARTIAL();
4720 RRETURN(MATCH_NOMATCH);
4721 }
4722 GETCHARINCTEST(c, eptr);
4723 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4724 }
4725 /* Control never gets here */
4726
4727 case PT_LAMP:
4728 for (fi = min;; fi++)
4729 {
4730 int chartype;
4731 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4733 if (fi >= max) RRETURN(MATCH_NOMATCH);
4734 if (eptr >= md->end_subject)
4735 {
4736 SCHECK_PARTIAL();
4737 RRETURN(MATCH_NOMATCH);
4738 }
4739 GETCHARINCTEST(c, eptr);
4740 chartype = UCD_CHARTYPE(c);
4741 if ((chartype == ucp_Lu ||
4742 chartype == ucp_Ll ||
4743 chartype == ucp_Lt) == prop_fail_result)
4744 RRETURN(MATCH_NOMATCH);
4745 }
4746 /* Control never gets here */
4747
4748 case PT_GC:
4749 for (fi = min;; fi++)
4750 {
4751 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4753 if (fi >= max) RRETURN(MATCH_NOMATCH);
4754 if (eptr >= md->end_subject)
4755 {
4756 SCHECK_PARTIAL();
4757 RRETURN(MATCH_NOMATCH);
4758 }
4759 GETCHARINCTEST(c, eptr);
4760 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4761 RRETURN(MATCH_NOMATCH);
4762 }
4763 /* Control never gets here */
4764
4765 case PT_PC:
4766 for (fi = min;; fi++)
4767 {
4768 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4769 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4770 if (fi >= max) RRETURN(MATCH_NOMATCH);
4771 if (eptr >= md->end_subject)
4772 {
4773 SCHECK_PARTIAL();
4774 RRETURN(MATCH_NOMATCH);
4775 }
4776 GETCHARINCTEST(c, eptr);
4777 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4778 RRETURN(MATCH_NOMATCH);
4779 }
4780 /* Control never gets here */
4781
4782 case PT_SC:
4783 for (fi = min;; fi++)
4784 {
4785 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4787 if (fi >= max) RRETURN(MATCH_NOMATCH);
4788 if (eptr >= md->end_subject)
4789 {
4790 SCHECK_PARTIAL();
4791 RRETURN(MATCH_NOMATCH);
4792 }
4793 GETCHARINCTEST(c, eptr);
4794 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4795 RRETURN(MATCH_NOMATCH);
4796 }
4797 /* Control never gets here */
4798
4799 case PT_ALNUM:
4800 for (fi = min;; fi++)
4801 {
4802 int category;
4803 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4805 if (fi >= max) RRETURN(MATCH_NOMATCH);
4806 if (eptr >= md->end_subject)
4807 {
4808 SCHECK_PARTIAL();
4809 RRETURN(MATCH_NOMATCH);
4810 }
4811 GETCHARINCTEST(c, eptr);
4812 category = UCD_CATEGORY(c);
4813 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4814 RRETURN(MATCH_NOMATCH);
4815 }
4816 /* Control never gets here */
4817
4818 case PT_SPACE: /* Perl space */
4819 for (fi = min;; fi++)
4820 {
4821 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4823 if (fi >= max) RRETURN(MATCH_NOMATCH);
4824 if (eptr >= md->end_subject)
4825 {
4826 SCHECK_PARTIAL();
4827 RRETURN(MATCH_NOMATCH);
4828 }
4829 GETCHARINCTEST(c, eptr);
4830 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4831 c == CHAR_FF || c == CHAR_CR)
4832 == prop_fail_result)
4833 RRETURN(MATCH_NOMATCH);
4834 }
4835 /* Control never gets here */
4836
4837 case PT_PXSPACE: /* POSIX space */
4838 for (fi = min;; fi++)
4839 {
4840 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4842 if (fi >= max) RRETURN(MATCH_NOMATCH);
4843 if (eptr >= md->end_subject)
4844 {
4845 SCHECK_PARTIAL();
4846 RRETURN(MATCH_NOMATCH);
4847 }
4848 GETCHARINCTEST(c, eptr);
4849 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4850 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4851 == prop_fail_result)
4852 RRETURN(MATCH_NOMATCH);
4853 }
4854 /* Control never gets here */
4855
4856 case PT_WORD:
4857 for (fi = min;; fi++)
4858 {
4859 int category;
4860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4862 if (fi >= max) RRETURN(MATCH_NOMATCH);
4863 if (eptr >= md->end_subject)
4864 {
4865 SCHECK_PARTIAL();
4866 RRETURN(MATCH_NOMATCH);
4867 }
4868 GETCHARINCTEST(c, eptr);
4869 category = UCD_CATEGORY(c);
4870 if ((category == ucp_L ||
4871 category == ucp_N ||
4872 c == CHAR_UNDERSCORE)
4873 == prop_fail_result)
4874 RRETURN(MATCH_NOMATCH);
4875 }
4876 /* Control never gets here */
4877
4878 /* This should never occur */
4879
4880 default:
4881 RRETURN(PCRE_ERROR_INTERNAL);
4882 }
4883 }
4884
4885 /* Match extended Unicode sequences. We will get here only if the
4886 support is in the binary; otherwise a compile-time error occurs. */
4887
4888 else if (ctype == OP_EXTUNI)
4889 {
4890 for (fi = min;; fi++)
4891 {
4892 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4894 if (fi >= max) RRETURN(MATCH_NOMATCH);
4895 if (eptr >= md->end_subject)
4896 {
4897 SCHECK_PARTIAL();
4898 RRETURN(MATCH_NOMATCH);
4899 }
4900 GETCHARINCTEST(c, eptr);
4901 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4902 while (eptr < md->end_subject)
4903 {
4904 int len = 1;
4905 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4906 if (UCD_CATEGORY(c) != ucp_M) break;
4907 eptr += len;
4908 }
4909 }
4910 }
4911 else
4912 #endif /* SUPPORT_UCP */
4913
4914 #ifdef SUPPORT_UTF
4915 if (utf)
4916 {
4917 for (fi = min;; fi++)
4918 {
4919 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4921 if (fi >= max) RRETURN(MATCH_NOMATCH);
4922 if (eptr >= md->end_subject)
4923 {
4924 SCHECK_PARTIAL();
4925 RRETURN(MATCH_NOMATCH);
4926 }
4927 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4928 RRETURN(MATCH_NOMATCH);
4929 GETCHARINC(c, eptr);
4930 switch(ctype)
4931 {
4932 case OP_ANY: /* This is the non-NL case */
4933 case OP_ALLANY:
4934 case OP_ANYBYTE:
4935 break;
4936
4937 case OP_ANYNL:
4938 switch(c)
4939 {
4940 default: RRETURN(MATCH_NOMATCH);
4941 case 0x000d:
4942 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4943 break;
4944 case 0x000a:
4945 break;
4946
4947 case 0x000b:
4948 case 0x000c:
4949 case 0x0085:
4950 case 0x2028:
4951 case 0x2029:
4952 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4953 break;
4954 }
4955 break;
4956
4957 case OP_NOT_HSPACE:
4958 switch(c)
4959 {
4960 default: break;
4961 case 0x09: /* HT */
4962 case 0x20: /* SPACE */
4963 case 0xa0: /* NBSP */
4964 case 0x1680: /* OGHAM SPACE MARK */
4965 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4966 case 0x2000: /* EN QUAD */
4967 case 0x2001: /* EM QUAD */
4968 case 0x2002: /* EN SPACE */
4969 case 0x2003: /* EM SPACE */
4970 case 0x2004: /* THREE-PER-EM SPACE */
4971 case 0x2005: /* FOUR-PER-EM SPACE */
4972 case 0x2006: /* SIX-PER-EM SPACE */
4973 case 0x2007: /* FIGURE SPACE */
4974 case 0x2008: /* PUNCTUATION SPACE */
4975 case 0x2009: /* THIN SPACE */
4976 case 0x200A: /* HAIR SPACE */
4977 case 0x202f: /* NARROW NO-BREAK SPACE */
4978 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4979 case 0x3000: /* IDEOGRAPHIC SPACE */
4980 RRETURN(MATCH_NOMATCH);
4981 }
4982 break;
4983
4984 case OP_HSPACE:
4985 switch(c)
4986 {
4987 default: RRETURN(MATCH_NOMATCH);
4988 case 0x09: /* HT */
4989 case 0x20: /* SPACE */
4990 case 0xa0: /* NBSP */
4991 case 0x1680: /* OGHAM SPACE MARK */
4992 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4993 case 0x2000: /* EN QUAD */
4994 case 0x2001: /* EM QUAD */
4995 case 0x2002: /* EN SPACE */
4996 case 0x2003: /* EM SPACE */
4997 case 0x2004: /* THREE-PER-EM SPACE */
4998 case 0x2005: /* FOUR-PER-EM SPACE */
4999 case 0x2006: /* SIX-PER-EM SPACE */
5000 case 0x2007: /* FIGURE SPACE */
5001 case 0x2008: /* PUNCTUATION SPACE */
5002 case 0x2009: /* THIN SPACE */
5003 case 0x200A: /* HAIR SPACE */
5004 case 0x202f: /* NARROW NO-BREAK SPACE */
5005 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5006 case 0x3000: /* IDEOGRAPHIC SPACE */
5007 break;
5008 }
5009 break;
5010
5011 case OP_NOT_VSPACE:
5012 switch(c)
5013 {
5014 default: break;
5015 case 0x0a: /* LF */
5016 case 0x0b: /* VT */
5017 case 0x0c: /* FF */
5018 case 0x0d: /* CR */
5019 case 0x85: /* NEL */
5020 case 0x2028: /* LINE SEPARATOR */
5021 case 0x2029: /* PARAGRAPH SEPARATOR */
5022 RRETURN(MATCH_NOMATCH);
5023 }
5024 break;
5025
5026 case OP_VSPACE:
5027 switch(c)
5028 {
5029 default: RRETURN(MATCH_NOMATCH);
5030 case 0x0a: /* LF */
5031 case 0x0b: /* VT */
5032 case 0x0c: /* FF */
5033 case 0x0d: /* CR */
5034 case 0x85: /* NEL */
5035 case 0x2028: /* LINE SEPARATOR */
5036 case 0x2029: /* PARAGRAPH SEPARATOR */
5037 break;
5038 }
5039 break;
5040
5041 case OP_NOT_DIGIT:
5042 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5043 RRETURN(MATCH_NOMATCH);
5044 break;
5045
5046 case OP_DIGIT:
5047 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5048 RRETURN(MATCH_NOMATCH);
5049 break;
5050
5051 case OP_NOT_WHITESPACE:
5052 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5053 RRETURN(MATCH_NOMATCH);
5054 break;
5055
5056 case OP_WHITESPACE:
5057 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5058 RRETURN(MATCH_NOMATCH);
5059 break;
5060
5061 case OP_NOT_WORDCHAR:
5062 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5063 RRETURN(MATCH_NOMATCH);
5064 break;
5065
5066 case OP_WORDCHAR:
5067 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5068 RRETURN(MATCH_NOMATCH);
5069 break;
5070
5071 default:
5072 RRETURN(PCRE_ERROR_INTERNAL);
5073 }
5074 }
5075 }
5076 else
5077 #endif
5078 /* Not UTF mode */
5079 {
5080 for (fi = min;; fi++)
5081 {
5082 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5083 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5084 if (fi >= max) RRETURN(MATCH_NOMATCH);
5085 if (eptr >= md->end_subject)
5086 {
5087 SCHECK_PARTIAL();
5088 RRETURN(MATCH_NOMATCH);
5089 }
5090 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5091 RRETURN(MATCH_NOMATCH);
5092 c = *eptr++;
5093 switch(ctype)
5094 {
5095 case OP_ANY: /* This is the non-NL case */
5096 case OP_ALLANY:
5097 case OP_ANYBYTE:
5098 break;
5099
5100 case OP_ANYNL:
5101 switch(c)
5102 {
5103 default: RRETURN(MATCH_NOMATCH);
5104 case 0x000d:
5105 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5106 break;
5107
5108 case 0x000a:
5109 break;
5110
5111 case 0x000b:
5112 case 0x000c:
5113 case 0x0085:
5114 #ifdef COMPILE_PCRE16
5115 case 0x2028:
5116 case 0x2029:
5117 #endif
5118 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5119 break;
5120 }
5121 break;
5122
5123 case OP_NOT_HSPACE:
5124 switch(c)
5125 {
5126 default: break;
5127 case 0x09: /* HT */
5128 case 0x20: /* SPACE */
5129 case 0xa0: /* NBSP */
5130 #ifdef COMPILE_PCRE16
5131 case 0x1680: /* OGHAM SPACE MARK */
5132 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5133 case 0x2000: /* EN QUAD */
5134 case 0x2001: /* EM QUAD */
5135 case 0x2002: /* EN SPACE */
5136 case 0x2003: /* EM SPACE */
5137 case 0x2004: /* THREE-PER-EM SPACE */
5138 case 0x2005: /* FOUR-PER-EM SPACE */
5139 case 0x2006: /* SIX-PER-EM SPACE */
5140 case 0x2007: /* FIGURE SPACE */
5141 case 0x2008: /* PUNCTUATION SPACE */
5142 case 0x2009: /* THIN SPACE */
5143 case 0x200A: /* HAIR SPACE */
5144 case 0x202f: /* NARROW NO-BREAK SPACE */
5145 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5146 case 0x3000: /* IDEOGRAPHIC SPACE */
5147 #endif
5148 RRETURN(MATCH_NOMATCH);
5149 }
5150 break;
5151
5152 case OP_HSPACE:
5153 switch(c)
5154 {
5155 default: RRETURN(MATCH_NOMATCH);
5156 case 0x09: /* HT */
5157 case 0x20: /* SPACE */
5158 case 0xa0: /* NBSP */
5159 #ifdef COMPILE_PCRE16
5160 case 0x1680: /* OGHAM SPACE MARK */
5161 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5162 case 0x2000: /* EN QUAD */
5163 case 0x2001: /* EM QUAD */
5164 case 0x2002: /* EN SPACE */
5165 case 0x2003: /* EM SPACE */
5166 case 0x2004: /* THREE-PER-EM SPACE */
5167 case 0x2005: /* FOUR-PER-EM SPACE */
5168 case 0x2006: /* SIX-PER-EM SPACE */
5169 case 0x2007: /* FIGURE SPACE */
5170 case 0x2008: /* PUNCTUATION SPACE */
5171 case 0x2009: /* THIN SPACE */
5172 case 0x200A: /* HAIR SPACE */
5173 case 0x202f: /* NARROW NO-BREAK SPACE */
5174 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5175 case 0x3000: /* IDEOGRAPHIC SPACE */
5176 #endif
5177 break;
5178 }
5179 break;
5180
5181 case OP_NOT_VSPACE:
5182 switch(c)
5183 {
5184 default: break;
5185 case 0x0a: /* LF */
5186 case 0x0b: /* VT */
5187 case 0x0c: /* FF */
5188 case 0x0d: /* CR */
5189 case 0x85: /* NEL */
5190 #ifdef COMPILE_PCRE16
5191 case 0x2028: /* LINE SEPARATOR */
5192 case 0x2029: /* PARAGRAPH SEPARATOR */
5193 #endif
5194 RRETURN(MATCH_NOMATCH);
5195 }
5196 break;
5197
5198 case OP_VSPACE:
5199 switch(c)
5200 {
5201 default: RRETURN(MATCH_NOMATCH);
5202 case 0x0a: /* LF */
5203 case 0x0b: /* VT */
5204 case 0x0c: /* FF */
5205 case 0x0d: /* CR */
5206 case 0x85: /* NEL */
5207 #ifdef COMPILE_PCRE16
5208 case 0x2028: /* LINE SEPARATOR */
5209 case 0x2029: /* PARAGRAPH SEPARATOR */
5210 #endif
5211 break;
5212 }
5213 break;
5214
5215 case OP_NOT_DIGIT:
5216 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5217 break;
5218
5219 case OP_DIGIT:
5220 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5221 break;
5222
5223 case OP_NOT_WHITESPACE:
5224 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5225 break;
5226
5227 case OP_WHITESPACE:
5228 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5229 break;
5230
5231 case OP_NOT_WORDCHAR:
5232 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5233 break;
5234
5235 case OP_WORDCHAR:
5236 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5237 break;
5238
5239 default:
5240 RRETURN(PCRE_ERROR_INTERNAL);
5241 }
5242 }
5243 }
5244 /* Control never gets here */
5245 }
5246
5247 /* If maximizing, it is worth using inline code for speed, doing the type
5248 test once at the start (i.e. keep it out of the loop). Again, keep the
5249 UTF-8 and UCP stuff separate. */
5250
5251 else
5252 {
5253 pp = eptr; /* Remember where we started */
5254
5255 #ifdef SUPPORT_UCP
5256 if (prop_type >= 0)
5257 {
5258 switch(prop_type)
5259 {
5260 case PT_ANY:
5261 for (i = min; i < max; i++)
5262 {
5263 int len = 1;
5264 if (eptr >= md->end_subject)
5265 {
5266 SCHECK_PARTIAL();
5267 break;
5268 }
5269 GETCHARLENTEST(c, eptr, len);
5270 if (prop_fail_result) break;
5271 eptr+= len;
5272 }
5273 break;
5274
5275 case PT_LAMP:
5276 for (i = min; i < max; i++)
5277 {
5278 int chartype;
5279 int len = 1;
5280 if (eptr >= md->end_subject)
5281 {
5282 SCHECK_PARTIAL();
5283 break;
5284 }
5285 GETCHARLENTEST(c, eptr, len);
5286 chartype = UCD_CHARTYPE(c);
5287 if ((chartype == ucp_Lu ||
5288 chartype == ucp_Ll ||
5289 chartype == ucp_Lt) == prop_fail_result)
5290 break;
5291 eptr+= len;
5292 }
5293 break;
5294
5295 case PT_GC:
5296 for (i = min; i < max; i++)
5297 {
5298 int len = 1;
5299 if (eptr >= md->end_subject)
5300 {
5301 SCHECK_PARTIAL();
5302 break;
5303 }
5304 GETCHARLENTEST(c, eptr, len);
5305 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5306 eptr+= len;
5307 }
5308 break;
5309
5310 case PT_PC:
5311 for (i = min; i < max; i++)
5312 {
5313 int len = 1;
5314 if (eptr >= md->end_subject)
5315 {
5316 SCHECK_PARTIAL();
5317 break;
5318 }
5319 GETCHARLENTEST(c, eptr, len);
5320 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5321 eptr+= len;
5322 }
5323 break;
5324
5325 case PT_SC:
5326 for (i = min; i < max; i++)
5327 {
5328 int len = 1;
5329 if (eptr >= md->end_subject)
5330 {
5331 SCHECK_PARTIAL();
5332 break;
5333 }
5334 GETCHARLENTEST(c, eptr, len);
5335 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5336 eptr+= len;
5337 }
5338 break;
5339
5340 case PT_ALNUM:
5341 for (i = min; i < max; i++)
5342 {
5343 int category;
5344 int len = 1;
5345 if (eptr >= md->end_subject)
5346 {
5347 SCHECK_PARTIAL();
5348 break;
5349 }
5350 GETCHARLENTEST(c, eptr, len);
5351 category = UCD_CATEGORY(c);
5352 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5353 break;
5354 eptr+= len;
5355 }
5356 break;
5357
5358 case PT_SPACE: /* Perl space */
5359 for (i = min; i < max; i++)
5360 {
5361 int len = 1;
5362 if (eptr >= md->end_subject)
5363 {
5364 SCHECK_PARTIAL();
5365 break;
5366 }
5367 GETCHARLENTEST(c, eptr, len);
5368 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5369 c == CHAR_FF || c == CHAR_CR)
5370 == prop_fail_result)
5371 break;
5372 eptr+= len;
5373 }
5374 break;
5375
5376 case PT_PXSPACE: /* POSIX space */
5377 for (i = min; i < max; i++)
5378 {
5379 int len = 1;
5380 if (eptr >= md->end_subject)
5381 {
5382 SCHECK_PARTIAL();
5383 break;
5384 }
5385 GETCHARLENTEST(c, eptr, len);
5386 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5387 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5388 == prop_fail_result)
5389 break;
5390 eptr+= len;
5391 }
5392 break;
5393
5394 case PT_WORD:
5395 for (i = min; i < max; i++)
5396 {
5397 int category;
5398 int len = 1;
5399 if (eptr >= md->end_subject)
5400 {
5401 SCHECK_PARTIAL();
5402 break;
5403 }
5404 GETCHARLENTEST(c, eptr, len);
5405 category = UCD_CATEGORY(c);
5406 if ((category == ucp_L || category == ucp_N ||
5407 c == CHAR_UNDERSCORE) == prop_fail_result)
5408 break;
5409 eptr+= len;
5410 }
5411 break;
5412
5413 default:
5414 RRETURN(PCRE_ERROR_INTERNAL);
5415 }
5416
5417 /* eptr is now past the end of the maximum run */
5418
5419 if (possessive) continue;
5420 for(;;)
5421 {
5422 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5423 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5424 if (eptr-- == pp) break; /* Stop if tried at original pos */
5425 if (utf) BACKCHAR(eptr);
5426 }
5427 }
5428
5429 /* Match extended Unicode sequences. We will get here only if the
5430 support is in the binary; otherwise a compile-time error occurs. */
5431
5432 else if (ctype == OP_EXTUNI)
5433 {
5434 for (i = min; i < max; i++)
5435 {
5436 int len = 1;
5437 if (eptr >= md->end_subject)
5438 {
5439 SCHECK_PARTIAL();
5440 break;
5441 }
5442 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5443 if (UCD_CATEGORY(c) == ucp_M) break;
5444 eptr += len;
5445 while (eptr < md->end_subject)
5446 {
5447 len = 1;
5448 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5449 if (UCD_CATEGORY(c) != ucp_M) break;
5450 eptr += len;
5451 }
5452 }
5453
5454 /* eptr is now past the end of the maximum run */
5455
5456 if (possessive) continue;
5457
5458 for(;;)
5459 {
5460 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5461 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5462 if (eptr-- == pp) break; /* Stop if tried at original pos */
5463 for (;;) /* Move back over one extended */
5464 {
5465 if (!utf) c = *eptr; else
5466 {
5467 BACKCHAR(eptr);
5468 GETCHAR(c, eptr);
5469 }
5470 if (UCD_CATEGORY(c) != ucp_M) break;
5471 eptr--;
5472 }
5473 }
5474 }
5475
5476 else
5477 #endif /* SUPPORT_UCP */
5478
5479 #ifdef SUPPORT_UTF
5480 if (utf)
5481 {
5482 switch(ctype)
5483 {
5484 case OP_ANY:
5485 if (max < INT_MAX)
5486 {
5487 for (i = min; i < max; i++)
5488 {
5489 if (eptr >= md->end_subject)
5490 {
5491 SCHECK_PARTIAL();
5492 break;
5493 }
5494 if (IS_NEWLINE(eptr)) break;
5495 eptr++;
5496 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5497 }
5498 }
5499
5500 /* Handle unlimited UTF-8 repeat */
5501
5502 else
5503 {
5504 for (i = min; i < max; i++)
5505 {
5506 if (eptr >= md->end_subject)
5507 {
5508 SCHECK_PARTIAL();
5509 break;
5510 }
5511 if (IS_NEWLINE(eptr)) break;
5512 eptr++;
5513 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5514 }
5515 }
5516 break;
5517
5518 case OP_ALLANY:
5519 if (max < INT_MAX)
5520 {
5521 for (i = min; i < max; i++)
5522 {
5523 if (eptr >= md->end_subject)
5524 {
5525 SCHECK_PARTIAL();
5526 break;
5527 }
5528 eptr++;
5529 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5530 }
5531 }
5532 else
5533 {
5534 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5535 SCHECK_PARTIAL();
5536 }
5537 break;
5538
5539 /* The byte case is the same as non-UTF8 */
5540
5541 case OP_ANYBYTE:
5542 c = max - min;
5543 if (c > (unsigned int)(md->end_subject - eptr))
5544 {
5545 eptr = md->end_subject;
5546 SCHECK_PARTIAL();
5547 }
5548 else eptr += c;
5549 break;
5550
5551 case OP_ANYNL:
5552 for (i = min; i < max; i++)
5553 {
5554 int len = 1;
5555 if (eptr >= md->end_subject)
5556 {
5557 SCHECK_PARTIAL();
5558 break;
5559 }
5560 GETCHARLEN(c, eptr, len);
5561 if (c == 0x000d)
5562 {
5563 if (++eptr >= md->end_subject) break;
5564 if (*eptr == 0x000a) eptr++;
5565 }
5566 else
5567 {
5568 if (c != 0x000a &&
5569 (md->bsr_anycrlf ||
5570 (c != 0x000b && c != 0x000c &&
5571 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5572 break;
5573 eptr += len;
5574 }
5575 }
5576 break;
5577
5578 case OP_NOT_HSPACE:
5579 case OP_HSPACE:
5580 for (i = min; i < max; i++)
5581 {
5582 BOOL gotspace;
5583 int len = 1;
5584 if (eptr >= md->end_subject)
5585 {
5586 SCHECK_PARTIAL();
5587 break;
5588 }
5589 GETCHARLEN(c, eptr, len);
5590 switch(c)
5591 {
5592 default: gotspace = FALSE; break;
5593 case 0x09: /* HT */
5594 case 0x20: /* SPACE */
5595 case 0xa0: /* NBSP */
5596 case 0x1680: /* OGHAM SPACE MARK */
5597 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5598 case 0x2000: /* EN QUAD */
5599 case 0x2001: /* EM QUAD */
5600 case 0x2002: /* EN SPACE */
5601 case 0x2003: /* EM SPACE */
5602 case 0x2004: /* THREE-PER-EM SPACE */
5603 case 0x2005: /* FOUR-PER-EM SPACE */
5604 case 0x2006: /* SIX-PER-EM SPACE */
5605 case 0x2007: /* FIGURE SPACE */
5606 case 0x2008: /* PUNCTUATION SPACE */
5607 case 0x2009: /* THIN SPACE */
5608 case 0x200A: /* HAIR SPACE */
5609 case 0x202f: /* NARROW NO-BREAK SPACE */
5610 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5611 case 0x3000: /* IDEOGRAPHIC SPACE */
5612 gotspace = TRUE;
5613 break;
5614 }
5615 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5616 eptr += len;
5617 }
5618 break;
5619
5620 case OP_NOT_VSPACE:
5621 case OP_VSPACE:
5622 for (i = min; i < max; i++)
5623 {
5624 BOOL gotspace;
5625 int len = 1;
5626 if (eptr >= md->end_subject)
5627 {
5628 SCHECK_PARTIAL();
5629 break;
5630 }
5631 GETCHARLEN(c, eptr, len);
5632 switch(c)
5633 {
5634 default: gotspace = FALSE; break;
5635 case 0x0a: /* LF */
5636 case 0x0b: /* VT */
5637 case 0x0c: /* FF */
5638 case 0x0d: /* CR */
5639 case 0x85: /* NEL */
5640 case 0x2028: /* LINE SEPARATOR */
5641 case 0x2029: /* PARAGRAPH SEPARATOR */
5642 gotspace = TRUE;
5643 break;
5644 }
5645 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5646 eptr += len;
5647 }
5648 break;
5649
5650 case OP_NOT_DIGIT:
5651 for (i = min; i < max; i++)
5652 {
5653 int len = 1;
5654 if (eptr >= md->end_subject)
5655 {
5656 SCHECK_PARTIAL();
5657 break;
5658 }
5659 GETCHARLEN(c, eptr, len);
5660 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5661 eptr+= len;
5662 }
5663 break;
5664
5665 case OP_DIGIT:
5666 for (i = min; i < max; i++)
5667 {
5668 int len = 1;
5669 if (eptr >= md->end_subject)
5670 {
5671 SCHECK_PARTIAL();
5672 break;
5673 }
5674 GETCHARLEN(c, eptr, len);
5675 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5676 eptr+= len;
5677 }
5678 break;
5679
5680 case OP_NOT_WHITESPACE:
5681 for (i = min; i < max; i++)
5682 {
5683 int len = 1;
5684 if (eptr >= md->end_subject)
5685 {
5686 SCHECK_PARTIAL();
5687 break;
5688 }
5689 GETCHARLEN(c, eptr, len);
5690 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5691 eptr+= len;
5692 }
5693 break;
5694
5695 case OP_WHITESPACE:
5696 for (i = min; i < max; i++)
5697 {
5698 int len = 1;
5699 if (eptr >= md->end_subject)
5700 {
5701 SCHECK_PARTIAL();
5702 break;
5703 }
5704 GETCHARLEN(c, eptr, len);
5705 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5706 eptr+= len;
5707 }
5708 break;
5709
5710 case OP_NOT_WORDCHAR:
5711 for (i = min; i < max; i++)
5712 {
5713 int len = 1;
5714 if (eptr >= md->end_subject)
5715 {
5716 SCHECK_PARTIAL();
5717 break;
5718 }
5719 GETCHARLEN(c, eptr, len);
5720 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5721 eptr+= len;
5722 }
5723 break;
5724
5725 case OP_WORDCHAR:
5726 for (i = min; i < max; i++)
5727 {
5728 int len = 1;
5729 if (eptr >= md->end_subject)
5730 {
5731 SCHECK_PARTIAL();
5732 break;
5733 }
5734 GETCHARLEN(c, eptr, len);
5735 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5736 eptr+= len;
5737 }
5738 break;
5739
5740 default:
5741 RRETURN(PCRE_ERROR_INTERNAL);
5742 }
5743
5744 /* eptr is now past the end of the maximum run. If possessive, we are
5745 done (no backing up). Otherwise, match at this position; anything other
5746 than no match is immediately returned. For nomatch, back up one
5747 character, unless we are matching \R and the last thing matched was
5748 \r\n, in which case, back up two bytes. */
5749
5750 if (possessive) continue;
5751 for(;;)
5752 {
5753 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5754 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5755 if (eptr-- == pp) break; /* Stop if tried at original pos */
5756 BACKCHAR(eptr);
5757 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5758 eptr[-1] == '\r') eptr--;
5759 }
5760 }
5761 else
5762 #endif /* SUPPORT_UTF */
5763 /* Not UTF mode */
5764 {
5765 switch(ctype)
5766 {
5767 case OP_ANY:
5768 for (i = min; i < max; i++)
5769 {
5770 if (eptr >= md->end_subject)
5771 {
5772 SCHECK_PARTIAL();
5773 break;
5774 }
5775 if (IS_NEWLINE(eptr)) break;
5776 eptr++;
5777 }
5778 break;
5779
5780 case OP_ALLANY:
5781 case OP_ANYBYTE:
5782 c = max - min;
5783 if (c > (unsigned int)(md->end_subject - eptr))
5784 {
5785 eptr = md->end_subject;
5786 SCHECK_PARTIAL();
5787 }
5788 else eptr += c;
5789 break;
5790
5791 case OP_ANYNL:
5792 for (i = min; i < max; i++)
5793 {
5794 if (eptr >= md->end_subject)
5795 {
5796 SCHECK_PARTIAL();
5797 break;
5798 }
5799 c = *eptr;
5800 if (c == 0x000d)
5801 {
5802 if (++eptr >= md->end_subject) break;
5803 if (*eptr == 0x000a) eptr++;
5804 }
5805 else
5806 {
5807 if (c != 0x000a && (md->bsr_anycrlf ||
5808 (c != 0x000b && c != 0x000c && c != 0x0085
5809 #ifdef COMPILE_PCRE16
5810 && c != 0x2028 && c != 0x2029
5811 #endif
5812 ))) break;
5813 eptr++;
5814 }
5815 }
5816 break;
5817
5818 case OP_NOT_HSPACE:
5819 for (i = min; i < max; i++)
5820 {
5821 if (eptr >= md->end_subject)
5822 {
5823 SCHECK_PARTIAL();
5824 break;
5825 }
5826 c = *eptr;
5827 if (c == 0x09 || c == 0x20 || c == 0xa0
5828 #ifdef COMPILE_PCRE16
5829 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
5830 || c == 0x202f || c == 0x205f || c == 0x3000
5831 #endif
5832 ) break;
5833 eptr++;
5834 }
5835 break;
5836
5837 case OP_HSPACE:
5838 for (i = min; i < max; i++)
5839 {
5840 if (eptr >= md->end_subject)
5841 {
5842 SCHECK_PARTIAL();
5843 break;
5844 }
5845 c = *eptr;
5846 if (c != 0x09 && c != 0x20 && c != 0xa0
5847 #ifdef COMPILE_PCRE16
5848 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
5849 && c != 0x202f && c != 0x205f && c != 0x3000
5850 #endif
5851 ) break;
5852 eptr++;
5853 }
5854 break;
5855
5856 case OP_NOT_VSPACE:
5857 for (i = min; i < max; i++)
5858 {
5859 if (eptr >= md->end_subject)
5860 {
5861 SCHECK_PARTIAL();
5862 break;
5863 }
5864 c = *eptr;
5865 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
5866 #ifdef COMPILE_PCRE16
5867 || c == 0x2028 || c == 0x2029
5868 #endif
5869 ) break;
5870 eptr++;
5871 }
5872 break;
5873
5874 case OP_VSPACE:
5875 for (i = min; i < max; i++)
5876 {
5877 if (eptr >= md->end_subject)
5878 {
5879 SCHECK_PARTIAL();
5880 break;
5881 }
5882 c = *eptr;
5883 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
5884 #ifdef COMPILE_PCRE16
5885 && c != 0x2028 && c != 0x2029
5886 #endif
5887 ) break;
5888 eptr++;
5889 }
5890 break;
5891
5892 case OP_NOT_DIGIT:
5893 for (i = min; i < max; i++)
5894 {
5895 if (eptr >= md->end_subject)
5896 {
5897 SCHECK_PARTIAL();
5898 break;
5899 }
5900 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5901 eptr++;
5902 }
5903 break;
5904
5905 case OP_DIGIT:
5906 for (i = min; i < max; i++)
5907 {
5908 if (eptr >= md->end_subject)
5909 {
5910 SCHECK_PARTIAL();
5911 break;
5912 }
5913 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5914 eptr++;
5915 }
5916 break;
5917
5918 case OP_NOT_WHITESPACE:
5919 for (i = min; i < max; i++)
5920 {
5921 if (eptr >= md->end_subject)
5922 {
5923 SCHECK_PARTIAL();
5924 break;
5925 }
5926 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5927 eptr++;
5928 }
5929 break;
5930
5931 case OP_WHITESPACE:
5932 for (i = min; i < max; i++)
5933 {
5934 if (eptr >= md->end_subject)
5935 {
5936 SCHECK_PARTIAL();
5937 break;
5938 }
5939 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5940 eptr++;
5941 }
5942 break;
5943
5944 case OP_NOT_WORDCHAR:
5945 for (i = min; i < max; i++)
5946 {
5947 if (eptr >= md->end_subject)
5948 {
5949 SCHECK_PARTIAL();
5950 break;
5951 }
5952 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5953 eptr++;
5954 }
5955 break;
5956
5957 case OP_WORDCHAR:
5958 for (i = min; i < max; i++)
5959 {
5960 if (eptr >= md->end_subject)
5961 {
5962 SCHECK_PARTIAL();
5963 break;
5964 }
5965 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
5966 eptr++;
5967 }
5968 break;
5969
5970 default:
5971 RRETURN(PCRE_ERROR_INTERNAL);
5972 }
5973
5974 /* eptr is now past the end of the maximum run. If possessive, we are
5975 done (no backing up). Otherwise, match at this position; anything other
5976 than no match is immediately returned. For nomatch, back up one
5977 character (byte), unless we are matching \R and the last thing matched
5978 was \r\n, in which case, back up two bytes. */
5979
5980 if (possessive) continue;
5981 while (eptr >= pp)
5982 {
5983 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5985 eptr--;
5986 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5987 eptr[-1] == '\r') eptr--;
5988 }
5989 }
5990
5991 /* Get here if we can't make it match with any permitted repetitions */
5992
5993 RRETURN(MATCH_NOMATCH);
5994 }
5995 /* Control never gets here */
5996
5997 /* There's been some horrible disaster. Arrival here can only mean there is
5998 something seriously wrong in the code above or the OP_xxx definitions. */
5999
6000 default:
6001 DPRINTF(("Unknown opcode %d\n", *ecode));
6002 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6003 }
6004
6005 /* Do not stick any code in here without much thought; it is assumed
6006 that "continue" in the code above comes out to here to repeat the main
6007 loop. */
6008
6009 } /* End of main loop */
6010 /* Control never reaches here */
6011
6012
6013 /* When compiling to use the heap rather than the stack for recursive calls to
6014 match(), the RRETURN() macro jumps here. The number that is saved in
6015 frame->Xwhere indicates which label we actually want to return to. */
6016
6017 #ifdef NO_RECURSE
6018 #define LBL(val) case val: goto L_RM##val;
6019 HEAP_RETURN:
6020 switch (frame->Xwhere)
6021 {
6022 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6023 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6024 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6025 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6026 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6027 LBL(65) LBL(66)
6028 #ifdef SUPPORT_UTF
6029 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
6030 LBL(32) LBL(34) LBL(42) LBL(46)
6031 #ifdef SUPPORT_UCP
6032 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6033 LBL(59) LBL(60) LBL(61) LBL(62)
6034 #endif /* SUPPORT_UCP */
6035 #endif /* SUPPORT_UTF */
6036 default:
6037 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6038 return PCRE_ERROR_INTERNAL;
6039 }
6040 #undef LBL
6041 #endif /* NO_RECURSE */
6042 }
6043
6044
6045 /***************************************************************************
6046 ****************************************************************************
6047 RECURSION IN THE match() FUNCTION
6048
6049 Undefine all the macros that were defined above to handle this. */
6050
/* With NO_RECURSE defined (the heap rather than the stack is used for recursive
calls to match()), the names below were #defined as macros earlier in this
file. They are removed here so that they do not leak past match(): later code
in this file uses some of them (for example length, flags and pp) as ordinary
parameter or local variable names. */

6051 #ifdef NO_RECURSE
6052 #undef eptr
6053 #undef ecode
6054 #undef mstart
6055 #undef offset_top
6056 #undef eptrb
6057 #undef flags
6058
6059 #undef callpat
6060 #undef charptr
6061 #undef data
6062 #undef next
6063 #undef pp
6064 #undef prev
6065 #undef saved_eptr
6066
6067 #undef new_recursive
6068
6069 #undef cur_is_word
6070 #undef condition
6071 #undef prev_is_word
6072
6073 #undef ctype
6074 #undef length
6075 #undef max
6076 #undef min
6077 #undef number
6078 #undef offset
6079 #undef op
6080 #undef save_capture_last
6081 #undef save_offset1
6082 #undef save_offset2
6083 #undef save_offset3
6084 #undef stacksave
6085
6086 #undef newptrb
6087
6088 #endif /* NO_RECURSE */
6089
6090 /* These two are defined as macros in both cases (with or without NO_RECURSE),
which is why they are undefined here, outside the #ifdef above. */
6091
6092 #undef fc
6093 #undef fi
6094
6095 /***************************************************************************
6096 ***************************************************************************/
6097
6098
6099
6100 /*************************************************
6101 * Execute a Regular Expression *
6102 *************************************************/
6103
6104 /* This function applies a compiled re to a subject string and picks out
6105 portions of the string if it matches. Two elements in the vector are set for
6106 each substring: the offsets to the start and end of the substring.
6107
6108 Arguments:
6109 argument_re points to the compiled expression
6110 extra_data points to extra data or is NULL
6111 subject points to the subject string
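6112 length length of subject string in code units: bytes for pcre_exec, 16-bit units for pcre16_exec (may contain binary zeros)
6113 start_offset where to start in the subject string, counted in the same code units as length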
6114 options option bits
6115 offsets points to a vector of ints to be filled in with offsets
6116 offsetcount the number of elements in the vector
6117
6118 Returns: > 0 => success; value is the number of elements filled in
6119 = 0 => success, but offsets is not big enough
6120 -1 => failed to match
6121 < -1 => some kind of unexpected problem
6122 */
6123
6124 #ifdef COMPILE_PCRE8
6125 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6126 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6127 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6128 int offsetcount)
6129 #else
6130 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6131 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
6132 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6133 int offsetcount)
6134 #endif
6135 {
6136 int rc, ocount, arg_offset_max;
6137 int newline;
6138 BOOL using_temporary_offsets = FALSE;
6139 BOOL anchored;
6140 BOOL startline;
6141 BOOL firstline;
6142 BOOL utf;
6143 BOOL has_first_char = FALSE;
6144 BOOL has_req_char = FALSE;
6145 pcre_uchar first_char = 0;
6146 pcre_uchar first_char2 = 0;
6147 pcre_uchar req_char = 0;
6148 pcre_uchar req_char2 = 0;
6149 match_data match_block;
6150 match_data *md = &match_block;
6151 const pcre_uint8 *tables;
6152 const pcre_uint8 *start_bits = NULL;
6153 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6154 PCRE_PUCHAR end_subject;
6155 PCRE_PUCHAR start_partial = NULL;
6156 PCRE_PUCHAR req_char_ptr = start_match - 1;
6157
6158 const pcre_study_data *study;
6159 const real_pcre *external_re = (const real_pcre *)argument_re;
6160 const real_pcre *re = external_re;
6161
6162 /* Plausibility checks */
6163
6164 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6165 if (re == NULL || subject == NULL ||
6166 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6167 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6168 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6169
6170 /* These two settings are used in the code for checking a UTF-8 string that
6171 follows immediately afterwards. Other values in the md block are used only
6172 during "normal" pcre_exec() processing, not when the JIT support is in use,
6173 so they are set up later. */
6174
6175 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6176 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6177 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6178 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6179
6180 /* Check a UTF-8 (or, in the 16-bit library, UTF-16) string if required. Pass
6181 back the error offset and error code for an invalid string if a results vector is available. */
6182
6183 #ifdef SUPPORT_UTF
6184 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6185 {
6186 int erroroffset;
6187 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6188 if (errorcode != 0)
6189 {
6190 if (offsetcount >= 2)
6191 {
6192 offsets[0] = erroroffset;
6193 offsets[1] = errorcode;
6194 }
6195 #ifdef COMPILE_PCRE16
6196 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6197 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6198 #else
6199 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6200 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6201 #endif
6202 }
6203
6204 /* Check that a start_offset points to the start of a UTF character. */
6205 if (start_offset > 0 && start_offset < length &&
6206 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6207 return PCRE_ERROR_BADUTF8_OFFSET;
6208 }
6209 #endif
6210
6211 /* If the pattern was successfully studied with JIT support, run the JIT
6212 executable instead of the rest of this function. Most options must be set at
6213 compile time for the JIT code to be usable. Fallback to the normal code path if
6214 an unsupported flag is set. In particular, JIT does not support partial
6215 matching. */
6216
6217 #ifdef SUPPORT_JIT
6218 if (extra_data != NULL
6219 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6220 && extra_data->executable_jit != NULL
6221 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6222 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6223 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6224 return PRIV(jit_exec)(re, extra_data->executable_jit,
6225 (const pcre_uchar *)subject, length, start_offset, options,
6226 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6227 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6228 #endif
6229
6230 /* Carry on with non-JIT matching. This information is for finding all the
6231 numbers associated with a given name, for condition testing. */
6232
6233 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6234 md->name_count = re->name_count;
6235 md->name_entry_size = re->name_entry_size;
6236
6237 /* Fish out the optional data from the extra_data structure, first setting
6238 the default values. */
6239
6240 study = NULL;
6241 md->match_limit = MATCH_LIMIT;
6242 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6243 md->callout_data = NULL;
6244
6245 /* The table pointer is always in native byte order. */
6246
6247 tables = external_re->tables;
6248
6249 if (extra_data != NULL)
6250 {
6251 register unsigned int flags = extra_data->flags;
6252 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6253 study = (const pcre_study_data *)extra_data->study_data;
6254 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6255 md->match_limit = extra_data->match_limit;
6256 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6257 md->match_limit_recursion = extra_data->match_limit_recursion;
6258 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6259 md->callout_data = extra_data->callout_data;
6260 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6261 }
6262
6263 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6264 is a feature that makes it possible to save compiled regex and re-use them
6265 in other programs later. */
6266
6267 if (tables == NULL) tables = PRIV(default_tables);
6268
6269 /* Check that the first field in the block is the magic number. If it is not,
6270 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6271 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6272 means that the pattern is likely compiled with different endianness. */
6273
6274 if (re->magic_number != MAGIC_NUMBER)
6275 return re->magic_number == REVERSED_MAGIC_NUMBER?
6276 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6277 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6278
6279 /* Set up other data */
6280
6281 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6282 startline = (re->flags & PCRE_STARTLINE) != 0;
6283 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6284
6285 /* The code starts after the real_pcre block and the capture name table. */
6286
6287 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6288 re->name_count * re->name_entry_size;
6289
6290 md->start_subject = (PCRE_PUCHAR)subject;
6291 md->start_offset = start_offset;
6292 md->end_subject = md->start_subject + length;
6293 end_subject = md->end_subject;
6294
6295 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6296 md->use_ucp = (re->options & PCRE_UCP) != 0;
6297 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6298 md->ignore_skip_arg = FALSE;
6299
6300 /* Some options are unpacked into BOOL variables in the hope that testing
6301 them will be faster than individual option bits. */
6302
6303 md->notbol = (options & PCRE_NOTBOL) != 0;
6304 md->noteol = (options & PCRE_NOTEOL) != 0;
6305 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6306 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6307
6308 md->hitend = FALSE;
6309 md->mark = md->nomatch_mark = NULL; /* In case never set */
6310
6311 md->recursive = NULL; /* No recursion at top level */
6312 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6313
6314 md->lcc = tables + lcc_offset;
6315 md->fcc = tables + fcc_offset;
6316 md->ctypes = tables + ctypes_offset;
6317
6318 /* Handle different \R options. */
6319
6320 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6321 {
6322 case 0:
6323 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6324 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6325 else
6326 #ifdef BSR_ANYCRLF
6327 md->bsr_anycrlf = TRUE;
6328 #else
6329 md->bsr_anycrlf = FALSE;
6330 #endif
6331 break;
6332
6333 case PCRE_BSR_ANYCRLF:
6334 md->bsr_anycrlf = TRUE;
6335 break;
6336
6337 case PCRE_BSR_UNICODE:
6338 md->bsr_anycrlf = FALSE;
6339 break;
6340
6341 default: return PCRE_ERROR_BADNEWLINE;
6342 }
6343
6344 /* Handle different types of newline. The three bits give eight cases. If
6345 nothing is set at run time, whatever was used at compile time applies. */
6346
6347 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6348 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6349 {
6350 case 0: newline = NEWLINE; break; /* Compile-time default */
6351 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6352 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6353 case PCRE_NEWLINE_CR+
6354 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6355 case PCRE_NEWLINE_ANY: newline = -1; break;
6356 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6357 default: return PCRE_ERROR_BADNEWLINE;
6358 }
6359
6360 if (newline == -2)
6361 {
6362 md->nltype = NLTYPE_ANYCRLF;
6363 }
6364 else if (newline < 0)
6365 {
6366 md->nltype = NLTYPE_ANY;
6367 }
6368 else
6369 {
6370 md->nltype = NLTYPE_FIXED;
6371 if (newline > 255)
6372 {
6373 md->nllen = 2;
6374 md->nl[0] = (newline >> 8) & 255;
6375 md->nl[1] = newline & 255;
6376 }
6377 else
6378 {
6379 md->nllen = 1;
6380 md->nl[0] = newline;
6381 }
6382 }
6383
6384 /* Partial matching was originally supported only for a restricted set of
6385 regexes; from release 8.00 there are no restrictions, but the bits are still
6386 defined (though never set). So there's no harm in leaving this code. */
6387
6388 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6389 return PCRE_ERROR_BADPARTIAL;
6390
6391 /* If the expression has got more back references than the offsets supplied can
6392 hold, we get a temporary chunk of working store to use during the matching.
6393 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6394 of 3. */
6395
6396 ocount = offsetcount - (offsetcount % 3);
6397 arg_offset_max = (2*ocount)/3;
6398
6399 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6400 {
6401 ocount = re->top_backref * 3 + 3;
6402 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6403 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6404 using_temporary_offsets = TRUE;
6405 DPRINTF(("Got memory to hold back references\n"));
6406 }
6407 else md->offset_vector = offsets;
6408
6409 md->offset_end = ocount;
6410 md->offset_max = (2*ocount)/3;
6411 md->offset_overflow = FALSE;
6412 md->capture_last = -1;
6413
6414 /* Reset the working variable associated with each extraction. These should
6415 never be used unless previously set, but they get saved and restored, and so we
6416 initialize them to avoid reading uninitialized locations. Also, unset the
6417 offsets for the matched string. This is really just for tidiness with callouts,
6418 in case they inspect these fields. */
6419
6420 if (md->offset_vector != NULL)
6421 {
6422 register int *iptr = md->offset_vector + ocount;
6423 register int *iend = iptr - re->top_bracket;
6424 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6425 while (--iptr >= iend) *iptr = -1;
6426 md->offset_vector[0] = md->offset_vector[1] = -1;
6427 }
6428
6429 /* Set up the first character to match, if available. The first_char value is
6430 never set for an anchored regular expression, but the anchoring may be forced
6431 at run time, so we have to test for anchoring. The first char may be unset for
6432 an unanchored pattern, of course. If there's no first char and the pattern was
6433 studied, there may be a bitmap of possible first characters. */
6434
6435 if (!anchored)
6436 {
6437 if ((re->flags & PCRE_FIRSTSET) != 0)
6438 {
6439 has_first_char = TRUE;
6440 first_char = first_char2 = re->first_char;
6441 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6442 {
6443 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6444 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6445 if (utf && first_char > 127)
6446 first_char2 = UCD_OTHERCASE(first_char);
6447 #endif
6448 }
6449 }
6450 else
6451 if (!startline && study != NULL &&
6452 (study->flags & PCRE_STUDY_MAPPED) != 0)
6453 start_bits = study->start_bits;
6454 }
6455
6456 /* For anchored or unanchored matches, there may be a "last known required
6457 character" set. */
6458
6459 if ((re->flags & PCRE_REQCHSET) != 0)
6460 {
6461 has_req_char = TRUE;
6462 req_char = req_char2 = re->req_char;
6463 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6464 {
6465 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6466 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6467 if (utf && req_char > 127)
6468 req_char2 = UCD_OTHERCASE(req_char);
6469 #endif
6470 }
6471 }
6472
6473
6474 /* ==========================================================================*/
6475
6476 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6477 the loop runs just once. */
6478
6479 for(;;)
6480 {
6481 PCRE_PUCHAR save_end_subject = end_subject;
6482 PCRE_PUCHAR new_start_match;
6483
6484 /* If firstline is TRUE, the start of the match is constrained to the first
6485 line of a multiline string. That is, the match must be before or at the first
6486 newline. Implement this by temporarily adjusting end_subject so that we stop
6487 scanning at a newline. If the match fails at the newline, later code breaks
6488 this loop. */
6489
6490 if (firstline)
6491 {
6492 PCRE_PUCHAR t = start_match;
6493 #ifdef SUPPORT_UTF
6494 if (utf)
6495 {
6496 while (t < md->end_subject && !IS_NEWLINE(t))
6497 {
6498 t++;
6499 ACROSSCHAR(t < end_subject, *t, t++);
6500 }
6501 }
6502 else
6503 #endif
6504 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6505 end_subject = t;
6506 }
6507
6508 /* There are some optimizations that avoid running the match if a known
6509 starting point is not found, or if a known later character is not present.
6510 However, there is an option that disables these, for testing and for ensuring
6511 that all callouts do actually occur. The option can be set in the regex by
6512 (*NO_START_OPT) or passed in match-time options. */
6513
6514 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6515 {
6516 /* Advance to a unique first char if there is one. */
6517
6518 if (has_first_char)
6519 {
6520 if (first_char != first_char2)
6521 while (start_match < end_subject &&
6522 *start_match != first_char && *start_match != first_char2)
6523 start_match++;
6524 else
6525 while (start_match < end_subject && *start_match != first_char)
6526 start_match++;
6527 }
6528
6529 /* Or to just after a linebreak for a multiline match */
6530
6531 else if (startline)
6532 {
6533 if (start_match > md->start_subject + start_offset)
6534 {
6535 #ifdef SUPPORT_UTF
6536 if (utf)
6537 {
6538 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6539 {
6540 start_match++;
6541 ACROSSCHAR(start_match < end_subject, *start_match,
6542 start_match++);
6543 }
6544 }
6545 else
6546 #endif
6547 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6548 start_match++;
6549
6550 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6551 and we are now at a LF, advance the match position by one more character.
6552 */
6553
6554 if (start_match[-1] == CHAR_CR &&
6555 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6556 start_match < end_subject &&
6557 *start_match == CHAR_NL)
6558 start_match++;
6559 }
6560 }
6561
6562 /* Or to a non-unique first byte after study */
6563
6564 else if (start_bits != NULL)
6565 {
6566 while (start_match < end_subject)
6567 {
6568 register unsigned int c = *start_match;
6569 #ifndef COMPILE_PCRE8
6570 if (c > 255) c = 255;
6571 #endif
6572 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6573 {
6574 start_match++;
6575 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6576 /* In non 8-bit mode, the iteration will stop for
6577 characters > 255 at the beginning or not stop at all. */
6578 if (utf)
6579 ACROSSCHAR(start_match < end_subject, *start_match,
6580 start_match++);
6581 #endif
6582 }
6583 else break;
6584 }
6585 }
6586 } /* Starting optimizations */
6587
6588 /* Restore fudged end_subject */
6589
6590 end_subject = save_end_subject;
6591
6592 /* The following two optimizations are disabled for partial matching or if
6593 disabling is explicitly requested. */
6594
6595 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6596 {
6597 /* If the pattern was studied, a minimum subject length may be set. This is
6598 a lower bound; no actual string of that length may actually match the
6599 pattern. Although the value is, strictly, in characters, we treat it as
6600 bytes to avoid spending too much time in this optimization. */
6601
6602 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6603 (pcre_uint32)(end_subject - start_match) < study->minlength)
6604 {
6605 rc = MATCH_NOMATCH;
6606 break;
6607 }
6608
6609 /* If req_char is set, we know that that character must appear in the
6610 subject for the match to succeed. If the first character is set, req_char
6611 must be later in the subject; otherwise the test starts at the match point.
6612 This optimization can save a huge amount of backtracking in patterns with
6613 nested unlimited repeats that aren't going to match. Writing separate code
6614 for cased/caseless versions makes it go faster, as does using an
6615 autoincrement and backing off on a match.
6616
6617 HOWEVER: when the subject string is very, very long, searching to its end
6618 can take a long time, and give bad performance on quite ordinary patterns.
6619 This showed up when somebody was matching something like /^\d+C/ on a
6620 32-megabyte string... so we don't do this when the string is sufficiently
6621 long. */
6622
6623 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6624 {
6625 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6626
6627 /* We don't need to repeat the search if we haven't yet reached the
6628 place we found it at last time. */
6629
6630 if (p > req_char_ptr)
6631 {
6632 if (req_char != req_char2)
6633 {
6634 while (p < end_subject)
6635 {
6636 register int pp = *p++;
6637 if (pp == req_char || pp == req_char2) { p--; break; }
6638 }
6639 }
6640 else
6641 {
6642 while (p < end_subject)
6643 {
6644 if (*p++ == req_char) { p--; break; }
6645 }
6646 }
6647
6648 /* If we can't find the required character, break the matching loop,
6649 forcing a match failure. */
6650
6651 if (p >= end_subject)
6652 {
6653 rc = MATCH_NOMATCH;
6654 break;
6655 }
6656
6657 /* If we have found the required character, save the point where we
6658 found it, so that we don't search again next time round the loop if
6659 the start hasn't passed this character yet. */
6660
6661 req_char_ptr = p;
6662 }
6663 }
6664 }
6665
6666 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6667 printf(">>>> Match against: ");
6668 pchars(start_match, end_subject - start_match, TRUE, md);
6669 printf("\n");
6670 #endif
6671
6672 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6673 first starting point for which a partial match was found. */
6674
6675 md->start_match_ptr = start_match;
6676 md->start_used_ptr = start_match;
6677 md->match_call_count = 0;
6678 md->match_function_type = 0;
6679 md->end_offset_top = 0;
6680 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6681 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6682
6683 switch(rc)
6684 {
6685 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6686 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6687 entirely. The only way we can do that is to re-do the match at the same
6688 point, with a flag to force SKIP with an argument to be ignored. Just
6689 treating this case as NOMATCH does not work because it does not check other
6690 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6691
6692 case MATCH_SKIP_ARG:
6693 new_start_match = start_match;
6694 md->ignore_skip_arg = TRUE;
6695 break;
6696
6697 /* SKIP passes back the next starting point explicitly, but if it is the
6698 same as the match we have just done, treat it as NOMATCH. */
6699
6700 case MATCH_SKIP:
6701 if (md->start_match_ptr != start_match)
6702 {
6703 new_start_match = md->start_match_ptr;
6704 break;
6705 }
6706 /* Fall through */
6707
6708 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6709 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6710
6711 case MATCH_NOMATCH:
6712 case MATCH_PRUNE:
6713 case MATCH_THEN:
6714 md->ignore_skip_arg = FALSE;
6715 new_start_match = start_match + 1;
6716 #ifdef SUPPORT_UTF
6717 if (utf)
6718 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6719 new_start_match++);
6720 #endif
6721 break;
6722
6723 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6724
6725 case MATCH_COMMIT:
6726 rc = MATCH_NOMATCH;
6727 goto ENDLOOP;
6728
6729 /* Any other return is either a match, or some kind of error. */
6730
6731 default:
6732 goto ENDLOOP;
6733 }
6734
6735 /* Control reaches here for the various types of "no match at this point"
6736 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6737
6738 rc = MATCH_NOMATCH;
6739
6740 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6741 newline in the subject (though it may continue over the newline). Therefore,
6742 if we have just failed to match, starting at a newline, do not continue. */
6743
6744 if (firstline && IS_NEWLINE(start_match)) break;