/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 728 - (show annotations)
Mon Oct 10 16:01:03 2011 UTC (3 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 201404 byte(s)
Error occurred while calculating annotation data.
PCRE_NO_START_OPTIMIZE, if given to pcre_compile(), did not suppress the 
subject length check at run time.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
/* Minimum and maximum repeat counts for the six common repeat forms. The two
tables are indexed in parallel; a maximum of zero stands for "unlimited". */

static const char rep_min[] = {
  0,
  0,
  1,
  1,
  0,
  0
};

static const char rep_max[] = {
  0,
  0,
  0,
  0,
  1,
  1
};
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1,
  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17,
  RM18, RM19, RM20, RM21, RM22, RM23, RM24, RM25,
  RM26, RM27, RM28, RM29, RM30, RM31, RM32, RM33,
  RM34, RM35, RM36, RM37, RM38, RM39, RM40, RM41,
  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49,
  RM50, RM51, RM52, RM53, RM54, RM55, RM56, RM57,
  RM58, RM59, RM60, RM61, RM62, RM63, RM64, RM65,
  RM66
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about unitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779 the branch in which it occurs can be determined. Overload the start of
780 match pointer to do this. */
781
782 case OP_THEN:
783 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
784 eptrb, RM54);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 md->start_match_ptr = ecode;
787 MRRETURN(MATCH_THEN);
788
789 case OP_THEN_ARG:
790 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
791 md, eptrb, RM58);
792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 md->start_match_ptr = ecode;
794 md->mark = ecode + 2;
795 RRETURN(MATCH_THEN);
796
797 /* Handle an atomic group that does not contain any capturing parentheses.
798 This can be handled like an assertion. Prior to 8.13, all atomic groups
799 were handled this way. In 8.13, the code was changed as below for ONCE, so
800 that backups pass through the group and thereby reset captured values.
801 However, this uses a lot more stack, so in 8.20, atomic groups that do not
802 contain any captures generate OP_ONCE_NC, which can be handled in the old,
803 less stack intensive way.
804
805 Check the alternative branches in turn - the matching won't pass the KET
806 for this kind of subpattern. If any one branch matches, we carry on as at
807 the end of a normal bracket, leaving the subject pointer, but resetting
808 the start-of-match value in case it was changed by \K. */
809
810 case OP_ONCE_NC:
811 prev = ecode;
812 saved_eptr = eptr;
813 do
814 {
815 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
816 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
817 {
818 mstart = md->start_match_ptr;
819 break;
820 }
821 if (rrc == MATCH_THEN)
822 {
823 next = ecode + GET(ecode,1);
824 if (md->start_match_ptr < next &&
825 (*ecode == OP_ALT || *next == OP_ALT))
826 rrc = MATCH_NOMATCH;
827 }
828
829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
830 ecode += GET(ecode,1);
831 }
832 while (*ecode == OP_ALT);
833
834 /* If hit the end of the group (which could be repeated), fail */
835
836 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
837
838 /* Continue as from after the group, updating the offsets high water
839 mark, since extracts may have been taken. */
840
841 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
842
843 offset_top = md->end_offset_top;
844 eptr = md->end_match_ptr;
845
846 /* For a non-repeating ket, just continue at this level. This also
847 happens for a repeating ket if no characters were matched in the group.
848 This is the forcible breaking of infinite loops as implemented in Perl
849 5.005. */
850
851 if (*ecode == OP_KET || eptr == saved_eptr)
852 {
853 ecode += 1+LINK_SIZE;
854 break;
855 }
856
857 /* The repeating kets try the rest of the pattern or restart from the
858 preceding bracket, in the appropriate order. The second "call" of match()
859 uses tail recursion, to avoid using another stack frame. */
860
861 if (*ecode == OP_KETRMIN)
862 {
863 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
865 ecode = prev;
866 goto TAIL_RECURSE;
867 }
868 else /* OP_KETRMAX */
869 {
870 md->match_function_type = MATCH_CBEGROUP;
871 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873 ecode += 1 + LINK_SIZE;
874 goto TAIL_RECURSE;
875 }
876 /* Control never gets here */
877
878 /* Handle a capturing bracket, other than those that are possessive with an
879 unlimited repeat. If there is space in the offset vector, save the current
880 subject position in the working slot at the top of the vector. We mustn't
881 change the current values of the data slot, because they may be set from a
882 previous iteration of this group, and be referred to by a reference inside
883 the group. A failure to match might occur after the group has succeeded,
884 if something later on doesn't match. For this reason, we need to restore
885 the working value and also the values of the final offsets, in case they
886 were set by a previous iteration of the same bracket.
887
888 If there isn't enough space in the offset vector, treat this as if it were
889 a non-capturing bracket. Don't worry about setting the flag for the error
890 case here; that is handled in the code for KET. */
891
892 case OP_CBRA:
893 case OP_SCBRA:
894 number = GET2(ecode, 1+LINK_SIZE);
895 offset = number << 1;
896
897 #ifdef PCRE_DEBUG
898 printf("start bracket %d\n", number);
899 printf("subject=");
900 pchars(eptr, 16, TRUE, md);
901 printf("\n");
902 #endif
903
904 if (offset < md->offset_max)
905 {
906 save_offset1 = md->offset_vector[offset];
907 save_offset2 = md->offset_vector[offset+1];
908 save_offset3 = md->offset_vector[md->offset_end - number];
909 save_capture_last = md->capture_last;
910
911 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
912 md->offset_vector[md->offset_end - number] =
913 (int)(eptr - md->start_subject);
914
915 for (;;)
916 {
917 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
918 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
919 eptrb, RM1);
920 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
921
922 /* If we backed up to a THEN, check whether it is within the current
923 branch by comparing the address of the THEN that is passed back with
924 the end of the branch. If it is within the current branch, and the
925 branch is one of two or more alternatives (it either starts or ends
926 with OP_ALT), we have reached the limit of THEN's action, so convert
927 the return code to NOMATCH, which will cause normal backtracking to
928 happen from now on. Otherwise, THEN is passed back to an outer
929 alternative. This implements Perl's treatment of parenthesized groups,
930 where a group not containing | does not affect the current alternative,
931 that is, (X) is NOT the same as (X|(*F)). */
932
933 if (rrc == MATCH_THEN)
934 {
935 next = ecode + GET(ecode,1);
936 if (md->start_match_ptr < next &&
937 (*ecode == OP_ALT || *next == OP_ALT))
938 rrc = MATCH_NOMATCH;
939 }
940
941 /* Anything other than NOMATCH is passed back. */
942
943 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
944 md->capture_last = save_capture_last;
945 ecode += GET(ecode, 1);
946 if (*ecode != OP_ALT) break;
947 }
948
949 DPRINTF(("bracket %d failed\n", number));
950 md->offset_vector[offset] = save_offset1;
951 md->offset_vector[offset+1] = save_offset2;
952 md->offset_vector[md->offset_end - number] = save_offset3;
953
954 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
955
956 if (md->mark == NULL) md->mark = markptr;
957 RRETURN(rrc);
958 }
959
960 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
961 as a non-capturing bracket. */
962
963 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
964 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965
966 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
967
968 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
969 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970
971 /* Non-capturing or atomic group, except for possessive with unlimited
972 repeat and ONCE group with no captures. Loop for all the alternatives.
973
974 When we get to the final alternative within the brackets, we used to return
975 the result of a recursive call to match() whatever happened so it was
976 possible to reduce stack usage by turning this into a tail recursion,
977 except in the case of a possibly empty group. However, now that there is
978 the possibility of (*THEN) occurring in the final alternative, this
979 optimization is no longer always possible.
980
981 We can optimize if we know there are no (*THEN)s in the pattern; at present
982 this is the best that can be done.
983
984 MATCH_ONCE is returned when the end of an atomic group is successfully
985 reached, but subsequent matching fails. It passes back up the tree (causing
986 captured values to be reset) until the original atomic group level is
987 reached. This is tested by comparing md->once_target with the start of the
988 group. At this point, the return is converted into MATCH_NOMATCH so that
989 previous backup points can be taken. */
990
991 case OP_ONCE:
992 case OP_BRA:
993 case OP_SBRA:
994 DPRINTF(("start non-capturing bracket\n"));
995
996 for (;;)
997 {
998 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
999
1000 /* If this is not a possibly empty group, and there are no (*THEN)s in
1001 the pattern, and this is the final alternative, optimize as described
1002 above. */
1003
1004 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1005 {
1006 ecode += _pcre_OP_lengths[*ecode];
1007 goto TAIL_RECURSE;
1008 }
1009
1010 /* In all other cases, we have to make another call to match(). */
1011
1012 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
1013 RM2);
1014
1015 /* See comment in the code for capturing groups above about handling
1016 THEN. */
1017
1018 if (rrc == MATCH_THEN)
1019 {
1020 next = ecode + GET(ecode,1);
1021 if (md->start_match_ptr < next &&
1022 (*ecode == OP_ALT || *next == OP_ALT))
1023 rrc = MATCH_NOMATCH;
1024 }
1025
1026 if (rrc != MATCH_NOMATCH)
1027 {
1028 if (rrc == MATCH_ONCE)
1029 {
1030 const uschar *scode = ecode;
1031 if (*scode != OP_ONCE) /* If not at start, find it */
1032 {
1033 while (*scode == OP_ALT) scode += GET(scode, 1);
1034 scode -= GET(scode, 1);
1035 }
1036 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1037 }
1038 RRETURN(rrc);
1039 }
1040 ecode += GET(ecode, 1);
1041 if (*ecode != OP_ALT) break;
1042 }
1043
1044 if (md->mark == NULL) md->mark = markptr;
1045 RRETURN(MATCH_NOMATCH);
1046
1047 /* Handle possessive capturing brackets with an unlimited repeat. We come
1048 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1049 handled similarly to the normal case above. However, the matching is
1050 different. The end of these brackets will always be OP_KETRPOS, which
1051 returns MATCH_KETRPOS without going further in the pattern. By this means
1052 we can handle the group by iteration rather than recursion, thereby
1053 reducing the amount of stack needed. */
1054
1055 case OP_CBRAPOS:
1056 case OP_SCBRAPOS:
1057 allow_zero = FALSE;
1058
1059 POSSESSIVE_CAPTURE:
1060 number = GET2(ecode, 1+LINK_SIZE);
1061 offset = number << 1;
1062
1063 #ifdef PCRE_DEBUG
1064 printf("start possessive bracket %d\n", number);
1065 printf("subject=");
1066 pchars(eptr, 16, TRUE, md);
1067 printf("\n");
1068 #endif
1069
1070 if (offset < md->offset_max)
1071 {
1072 matched_once = FALSE;
1073 code_offset = ecode - md->start_code;
1074
1075 save_offset1 = md->offset_vector[offset];
1076 save_offset2 = md->offset_vector[offset+1];
1077 save_offset3 = md->offset_vector[md->offset_end - number];
1078 save_capture_last = md->capture_last;
1079
1080 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1081
1082 /* Each time round the loop, save the current subject position for use
1083 when the group matches. For MATCH_MATCH, the group has matched, so we
1084 restart it with a new subject starting position, remembering that we had
1085 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1086 usual. If we haven't matched any alternatives in any iteration, check to
1087 see if a previous iteration matched. If so, the group has matched;
1088 continue from afterwards. Otherwise it has failed; restore the previous
1089 capture values before returning NOMATCH. */
1090
1091 for (;;)
1092 {
1093 md->offset_vector[md->offset_end - number] =
1094 (int)(eptr - md->start_subject);
1095 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1096 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1097 eptrb, RM63);
1098 if (rrc == MATCH_KETRPOS)
1099 {
1100 offset_top = md->end_offset_top;
1101 eptr = md->end_match_ptr;
1102 ecode = md->start_code + code_offset;
1103 save_capture_last = md->capture_last;
1104 matched_once = TRUE;
1105 continue;
1106 }
1107
1108 /* See comment in the code for capturing groups above about handling
1109 THEN. */
1110
1111 if (rrc == MATCH_THEN)
1112 {
1113 next = ecode + GET(ecode,1);
1114 if (md->start_match_ptr < next &&
1115 (*ecode == OP_ALT || *next == OP_ALT))
1116 rrc = MATCH_NOMATCH;
1117 }
1118
1119 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120 md->capture_last = save_capture_last;
1121 ecode += GET(ecode, 1);
1122 if (*ecode != OP_ALT) break;
1123 }
1124
1125 if (!matched_once)
1126 {
1127 md->offset_vector[offset] = save_offset1;
1128 md->offset_vector[offset+1] = save_offset2;
1129 md->offset_vector[md->offset_end - number] = save_offset3;
1130 }
1131
1132 if (md->mark == NULL) md->mark = markptr;
1133 if (allow_zero || matched_once)
1134 {
1135 ecode += 1 + LINK_SIZE;
1136 break;
1137 }
1138
1139 RRETURN(MATCH_NOMATCH);
1140 }
1141
1142 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1143 as a non-capturing bracket. */
1144
1145 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1146 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1147
1148 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1149
1150 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152
1153 /* Non-capturing possessive bracket with unlimited repeat. We come here
1154 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1155 without the capturing complication. It is written out separately for speed
1156 and cleanliness. */
1157
1158 case OP_BRAPOS:
1159 case OP_SBRAPOS:
1160 allow_zero = FALSE;
1161
1162 POSSESSIVE_NON_CAPTURE:
1163 matched_once = FALSE;
1164 code_offset = ecode - md->start_code;
1165
1166 for (;;)
1167 {
1168 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1169 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1170 eptrb, RM48);
1171 if (rrc == MATCH_KETRPOS)
1172 {
1173 offset_top = md->end_offset_top;
1174 eptr = md->end_match_ptr;
1175 ecode = md->start_code + code_offset;
1176 matched_once = TRUE;
1177 continue;
1178 }
1179
1180 /* See comment in the code for capturing groups above about handling
1181 THEN. */
1182
1183 if (rrc == MATCH_THEN)
1184 {
1185 next = ecode + GET(ecode,1);
1186 if (md->start_match_ptr < next &&
1187 (*ecode == OP_ALT || *next == OP_ALT))
1188 rrc = MATCH_NOMATCH;
1189 }
1190
1191 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192 ecode += GET(ecode, 1);
1193 if (*ecode != OP_ALT) break;
1194 }
1195
1196 if (matched_once || allow_zero)
1197 {
1198 ecode += 1 + LINK_SIZE;
1199 break;
1200 }
1201 RRETURN(MATCH_NOMATCH);
1202
1203 /* Control never reaches here. */
1204
1205 /* Conditional group: compilation checked that there are no more than
1206 two branches. If the condition is false, skipping the first branch takes us
1207 past the end if there is only one branch, but that's OK because that is
1208 exactly what going to the ket would do. */
1209
1210 case OP_COND:
1211 case OP_SCOND:
1212 codelink = GET(ecode, 1);
1213
1214 /* Because of the way auto-callout works during compile, a callout item is
1215 inserted between OP_COND and an assertion condition. */
1216
1217 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1218 {
1219 if (pcre_callout != NULL)
1220 {
1221 pcre_callout_block cb;
1222 cb.version = 2; /* Version 2 of the callout block */
1223 cb.callout_number = ecode[LINK_SIZE+2];
1224 cb.offset_vector = md->offset_vector;
1225 cb.subject = (PCRE_SPTR)md->start_subject;
1226 cb.subject_length = (int)(md->end_subject - md->start_subject);
1227 cb.start_match = (int)(mstart - md->start_subject);
1228 cb.current_position = (int)(eptr - md->start_subject);
1229 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1230 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1231 cb.capture_top = offset_top/2;
1232 cb.capture_last = md->capture_last;
1233 cb.callout_data = md->callout_data;
1234 cb.mark = markptr;
1235 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1236 if (rrc < 0) RRETURN(rrc);
1237 }
1238 ecode += _pcre_OP_lengths[OP_CALLOUT];
1239 }
1240
1241 condcode = ecode[LINK_SIZE+1];
1242
1243 /* Now see what the actual condition is */
1244
1245 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1246 {
1247 if (md->recursive == NULL) /* Not recursing => FALSE */
1248 {
1249 condition = FALSE;
1250 ecode += GET(ecode, 1);
1251 }
1252 else
1253 {
1254 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1255 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1256
1257 /* If the test is for recursion into a specific subpattern, and it is
1258 false, but the test was set up by name, scan the table to see if the
1259 name refers to any other numbers, and test them. The condition is true
1260 if any one is set. */
1261
1262 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1263 {
1264 uschar *slotA = md->name_table;
1265 for (i = 0; i < md->name_count; i++)
1266 {
1267 if (GET2(slotA, 0) == recno) break;
1268 slotA += md->name_entry_size;
1269 }
1270
1271 /* Found a name for the number - there can be only one; duplicate
1272 names for different numbers are allowed, but not vice versa. First
1273 scan down for duplicates. */
1274
1275 if (i < md->name_count)
1276 {
1277 uschar *slotB = slotA;
1278 while (slotB > md->name_table)
1279 {
1280 slotB -= md->name_entry_size;
1281 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1282 {
1283 condition = GET2(slotB, 0) == md->recursive->group_num;
1284 if (condition) break;
1285 }
1286 else break;
1287 }
1288
1289 /* Scan up for duplicates */
1290
1291 if (!condition)
1292 {
1293 slotB = slotA;
1294 for (i++; i < md->name_count; i++)
1295 {
1296 slotB += md->name_entry_size;
1297 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1298 {
1299 condition = GET2(slotB, 0) == md->recursive->group_num;
1300 if (condition) break;
1301 }
1302 else break;
1303 }
1304 }
1305 }
1306 }
1307
1308 /* Choose branch according to the condition */
1309
1310 ecode += condition? 3 : GET(ecode, 1);
1311 }
1312 }
1313
1314 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1315 {
1316 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1317 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1318
1319 /* If the numbered capture is unset, but the reference was by name,
1320 scan the table to see if the name refers to any other numbers, and test
1321 them. The condition is true if any one is set. This is tediously similar
1322 to the code above, but not close enough to try to amalgamate. */
1323
1324 if (!condition && condcode == OP_NCREF)
1325 {
1326 int refno = offset >> 1;
1327 uschar *slotA = md->name_table;
1328
1329 for (i = 0; i < md->name_count; i++)
1330 {
1331 if (GET2(slotA, 0) == refno) break;
1332 slotA += md->name_entry_size;
1333 }
1334
1335 /* Found a name for the number - there can be only one; duplicate names
1336 for different numbers are allowed, but not vice versa. First scan down
1337 for duplicates. */
1338
1339 if (i < md->name_count)
1340 {
1341 uschar *slotB = slotA;
1342 while (slotB > md->name_table)
1343 {
1344 slotB -= md->name_entry_size;
1345 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1346 {
1347 offset = GET2(slotB, 0) << 1;
1348 condition = offset < offset_top &&
1349 md->offset_vector[offset] >= 0;
1350 if (condition) break;
1351 }
1352 else break;
1353 }
1354
1355 /* Scan up for duplicates */
1356
1357 if (!condition)
1358 {
1359 slotB = slotA;
1360 for (i++; i < md->name_count; i++)
1361 {
1362 slotB += md->name_entry_size;
1363 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1364 {
1365 offset = GET2(slotB, 0) << 1;
1366 condition = offset < offset_top &&
1367 md->offset_vector[offset] >= 0;
1368 if (condition) break;
1369 }
1370 else break;
1371 }
1372 }
1373 }
1374 }
1375
1376 /* Choose branch according to the condition */
1377
1378 ecode += condition? 3 : GET(ecode, 1);
1379 }
1380
1381 else if (condcode == OP_DEF) /* DEFINE - always false */
1382 {
1383 condition = FALSE;
1384 ecode += GET(ecode, 1);
1385 }
1386
1387 /* The condition is an assertion. Call match() to evaluate it - setting
1388 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1389 an assertion. */
1390
1391 else
1392 {
1393 md->match_function_type = MATCH_CONDASSERT;
1394 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1395 if (rrc == MATCH_MATCH)
1396 {
1397 if (md->end_offset_top > offset_top)
1398 offset_top = md->end_offset_top; /* Captures may have happened */
1399 condition = TRUE;
1400 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1401 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1402 }
1403
1404 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1405 assertion; it is therefore treated as NOMATCH. */
1406
1407 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1408 {
1409 RRETURN(rrc); /* Need braces because of following else */
1410 }
1411 else
1412 {
1413 condition = FALSE;
1414 ecode += codelink;
1415 }
1416 }
1417
1418 /* We are now at the branch that is to be obeyed. As there is only one, can
1419 use tail recursion to avoid using another stack frame, except when there is
1420 unlimited repeat of a possibly empty group. In the latter case, a recursive
1421 call to match() is always required, unless the second alternative doesn't
1422 exist, in which case we can just plough on. Note that, for compatibility
1423 with Perl, the | in a conditional group is NOT treated as creating two
1424 alternatives. If a THEN is encountered in the branch, it propagates out to
1425 the enclosing alternative (unless nested in a deeper set of alternatives,
1426 of course). */
1427
1428 if (condition || *ecode == OP_ALT)
1429 {
1430 if (op != OP_SCOND)
1431 {
1432 ecode += 1 + LINK_SIZE;
1433 goto TAIL_RECURSE;
1434 }
1435
1436 md->match_function_type = MATCH_CBEGROUP;
1437 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1438 RRETURN(rrc);
1439 }
1440
1441 /* Condition false & no alternative; continue after the group. */
1442
1443 else
1444 {
1445 ecode += 1 + LINK_SIZE;
1446 }
1447 break;
1448
1449
1450 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1451 to close any currently open capturing brackets. */
1452
1453 case OP_CLOSE:
1454 number = GET2(ecode, 1);
1455 offset = number << 1;
1456
1457 #ifdef PCRE_DEBUG
1458 printf("end bracket %d at *ACCEPT", number);
1459 printf("\n");
1460 #endif
1461
1462 md->capture_last = number;
1463 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1464 {
1465 md->offset_vector[offset] =
1466 md->offset_vector[md->offset_end - number];
1467 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1468 if (offset_top <= offset) offset_top = offset + 2;
1469 }
1470 ecode += 3;
1471 break;
1472
1473
1474 /* End of the pattern, either real or forced. */
1475
1476 case OP_END:
1477 case OP_ACCEPT:
1478 case OP_ASSERT_ACCEPT:
1479
1480 /* If we have matched an empty string, fail if not in an assertion and not
1481 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1482 is set and we have matched at the start of the subject. In both cases,
1483 backtracking will then try other alternatives, if any. */
1484
1485 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1486 md->recursive == NULL &&
1487 (md->notempty ||
1488 (md->notempty_atstart &&
1489 mstart == md->start_subject + md->start_offset)))
1490 MRRETURN(MATCH_NOMATCH);
1491
1492 /* Otherwise, we have a match. */
1493
1494 md->end_match_ptr = eptr; /* Record where we ended */
1495 md->end_offset_top = offset_top; /* and how many extracts were taken */
1496 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1497
1498 /* For some reason, the macros don't work properly if an expression is
1499 given as the argument to MRRETURN when the heap is in use. */
1500
1501 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1502 MRRETURN(rrc);
1503
1504 /* Assertion brackets. Check the alternative branches in turn - the
1505 matching won't pass the KET for an assertion. If any one branch matches,
1506 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1507 start of each branch to move the current point backwards, so the code at
1508 this level is identical to the lookahead case. When the assertion is part
1509 of a condition, we want to return immediately afterwards. The caller of
1510 this incarnation of the match() function will have set MATCH_CONDASSERT in
1511 md->match_function_type, and one of these opcodes will be the first opcode
1512 that is processed. We use a local variable that is preserved over calls to
1513 match() to remember this case. */
1514
1515 case OP_ASSERT:
1516 case OP_ASSERTBACK:
1517 if (md->match_function_type == MATCH_CONDASSERT)
1518 {
1519 condassert = TRUE;
1520 md->match_function_type = 0;
1521 }
1522 else condassert = FALSE;
1523
1524 do
1525 {
1526 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1527 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1528 {
1529 mstart = md->start_match_ptr; /* In case \K reset it */
1530 markptr = md->mark;
1531 break;
1532 }
1533
1534 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1535 as NOMATCH. */
1536
1537 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1538 ecode += GET(ecode, 1);
1539 }
1540 while (*ecode == OP_ALT);
1541
1542 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1543
1544 /* If checking an assertion for a condition, return MATCH_MATCH. */
1545
1546 if (condassert) RRETURN(MATCH_MATCH);
1547
1548 /* Continue from after the assertion, updating the offsets high water
1549 mark, since extracts may have been taken during the assertion. */
1550
1551 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1552 ecode += 1 + LINK_SIZE;
1553 offset_top = md->end_offset_top;
1554 continue;
1555
1556 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1557 PRUNE, or COMMIT means we must assume failure without checking subsequent
1558 branches. */
1559
1560 case OP_ASSERT_NOT:
1561 case OP_ASSERTBACK_NOT:
1562 if (md->match_function_type == MATCH_CONDASSERT)
1563 {
1564 condassert = TRUE;
1565 md->match_function_type = 0;
1566 }
1567 else condassert = FALSE;
1568
1569 do
1570 {
1571 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1572 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1573 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1574 {
1575 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1576 break;
1577 }
1578
1579 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1580 as NOMATCH. */
1581
1582 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1583 ecode += GET(ecode,1);
1584 }
1585 while (*ecode == OP_ALT);
1586
1587 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1588
1589 ecode += 1 + LINK_SIZE;
1590 continue;
1591
1592 /* Move the subject pointer back. This occurs only at the start of
1593 each branch of a lookbehind assertion. If we are too close to the start to
1594 move back, this match function fails. When working with UTF-8 we move
1595 back a number of characters, not bytes. */
1596
1597 case OP_REVERSE:
1598 #ifdef SUPPORT_UTF8
1599 if (utf8)
1600 {
1601 i = GET(ecode, 1);
1602 while (i-- > 0)
1603 {
1604 eptr--;
1605 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1606 BACKCHAR(eptr);
1607 }
1608 }
1609 else
1610 #endif
1611
1612 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1613
1614 {
1615 eptr -= GET(ecode, 1);
1616 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1617 }
1618
1619 /* Save the earliest consulted character, then skip to next op code */
1620
1621 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1622 ecode += 1 + LINK_SIZE;
1623 break;
1624
1625 /* The callout item calls an external function, if one is provided, passing
1626 details of the match so far. This is mainly for debugging, though the
1627 function is able to force a failure. */
1628
1629 case OP_CALLOUT:
1630 if (pcre_callout != NULL)
1631 {
1632 pcre_callout_block cb;
1633 cb.version = 2; /* Version 2 of the callout block */
1634 cb.callout_number = ecode[1];
1635 cb.offset_vector = md->offset_vector;
1636 cb.subject = (PCRE_SPTR)md->start_subject;
1637 cb.subject_length = (int)(md->end_subject - md->start_subject);
1638 cb.start_match = (int)(mstart - md->start_subject);
1639 cb.current_position = (int)(eptr - md->start_subject);
1640 cb.pattern_position = GET(ecode, 2);
1641 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1642 cb.capture_top = offset_top/2;
1643 cb.capture_last = md->capture_last;
1644 cb.callout_data = md->callout_data;
1645 cb.mark = markptr;
1646 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1647 if (rrc < 0) RRETURN(rrc);
1648 }
1649 ecode += 2 + 2*LINK_SIZE;
1650 break;
1651
1652 /* Recursion either matches the current regex, or some subexpression. The
1653 offset data is the offset to the starting bracket from the start of the
1654 whole pattern. (This is so that it works from duplicated subpatterns.)
1655
1656 The state of the capturing groups is preserved over recursion, and
1657 re-instated afterwards. We don't know how many are started and not yet
1658 finished (offset_top records the completed total) so we just have to save
1659 all the potential data. There may be up to 65535 such values, which is too
1660 large to put on the stack, but using malloc for small numbers seems
1661 expensive. As a compromise, the stack is used when there are no more than
1662 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1663
1664 There are also other values that have to be saved. We use a chained
1665 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1666 for the original version of this logic. It has, however, been hacked around
1667 a lot, so he is not to blame for the current way it works. */
1668
1669 case OP_RECURSE:
1670 {
1671 recursion_info *ri;
1672 int recno;
1673
1674 callpat = md->start_code + GET(ecode, 1);
1675 recno = (callpat == md->start_code)? 0 :
1676 GET2(callpat, 1 + LINK_SIZE);
1677
1678 /* Check for repeating a recursion without advancing the subject pointer.
1679 This should catch convoluted mutual recursions. (Some simple cases are
1680 caught at compile time.) */
1681
1682 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1683 if (recno == ri->group_num && eptr == ri->subject_position)
1684 RRETURN(PCRE_ERROR_RECURSELOOP);
1685
1686 /* Add to "recursing stack" */
1687
1688 new_recursive.group_num = recno;
1689 new_recursive.subject_position = eptr;
1690 new_recursive.prevrec = md->recursive;
1691 md->recursive = &new_recursive;
1692
1693 /* Where to continue from afterwards */
1694
1695 ecode += 1 + LINK_SIZE;
1696
1697 /* Now save the offset data */
1698
1699 new_recursive.saved_max = md->offset_end;
1700 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1701 new_recursive.offset_save = stacksave;
1702 else
1703 {
1704 new_recursive.offset_save =
1705 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1706 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1707 }
1708 memcpy(new_recursive.offset_save, md->offset_vector,
1709 new_recursive.saved_max * sizeof(int));
1710
1711 /* OK, now we can do the recursion. After processing each alternative,
1712 restore the offset data. If there were nested recursions, md->recursive
1713 might be changed, so reset it before looping. */
1714
1715 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1716 cbegroup = (*callpat >= OP_SBRA);
1717 do
1718 {
1719 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1720 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1721 md, eptrb, RM6);
1722 memcpy(md->offset_vector, new_recursive.offset_save,
1723 new_recursive.saved_max * sizeof(int));
1724 md->recursive = new_recursive.prevrec;
1725 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1726 {
1727 DPRINTF(("Recursion matched\n"));
1728 if (new_recursive.offset_save != stacksave)
1729 (pcre_free)(new_recursive.offset_save);
1730
1731 /* Set where we got to in the subject, and reset the start in case
1732 it was changed by \K. This *is* propagated back out of a recursion,
1733 for Perl compatibility. */
1734
1735 eptr = md->end_match_ptr;
1736 mstart = md->start_match_ptr;
1737 goto RECURSION_MATCHED; /* Exit loop; end processing */
1738 }
1739
1740 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1741 as NOMATCH. */
1742
1743 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1744 {
1745 DPRINTF(("Recursion gave error %d\n", rrc));
1746 if (new_recursive.offset_save != stacksave)
1747 (pcre_free)(new_recursive.offset_save);
1748 RRETURN(rrc);
1749 }
1750
1751 md->recursive = &new_recursive;
1752 callpat += GET(callpat, 1);
1753 }
1754 while (*callpat == OP_ALT);
1755
1756 DPRINTF(("Recursion didn't match\n"));
1757 md->recursive = new_recursive.prevrec;
1758 if (new_recursive.offset_save != stacksave)
1759 (pcre_free)(new_recursive.offset_save);
1760 MRRETURN(MATCH_NOMATCH);
1761 }
1762
1763 RECURSION_MATCHED:
1764 break;
1765
1766 /* An alternation is the end of a branch; scan along to find the end of the
1767 bracketed group and go to there. */
1768
1769 case OP_ALT:
1770 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1771 break;
1772
1773 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1774 indicating that it may occur zero times. It may repeat infinitely, or not
1775 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1776 with fixed upper repeat limits are compiled as a number of copies, with the
1777 optional ones preceded by BRAZERO or BRAMINZERO. */
1778
1779 case OP_BRAZERO:
1780 next = ecode + 1;
1781 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1783 do next += GET(next, 1); while (*next == OP_ALT);
1784 ecode = next + 1 + LINK_SIZE;
1785 break;
1786
1787 case OP_BRAMINZERO:
1788 next = ecode + 1;
1789 do next += GET(next, 1); while (*next == OP_ALT);
1790 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1792 ecode++;
1793 break;
1794
1795 case OP_SKIPZERO:
1796 next = ecode+1;
1797 do next += GET(next,1); while (*next == OP_ALT);
1798 ecode = next + 1 + LINK_SIZE;
1799 break;
1800
1801 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1802 here; just jump to the group, with allow_zero set TRUE. */
1803
1804 case OP_BRAPOSZERO:
1805 op = *(++ecode);
1806 allow_zero = TRUE;
1807 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1808 goto POSSESSIVE_NON_CAPTURE;
1809
1810 /* End of a group, repeated or non-repeating. */
1811
1812 case OP_KET:
1813 case OP_KETRMIN:
1814 case OP_KETRMAX:
1815 case OP_KETRPOS:
1816 prev = ecode - GET(ecode, 1);
1817
1818 /* If this was a group that remembered the subject start, in order to break
1819 infinite repeats of empty string matches, retrieve the subject start from
1820 the chain. Otherwise, set it NULL. */
1821
1822 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1823 {
1824 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1825 eptrb = eptrb->epb_prev; /* Backup to previous group */
1826 }
1827 else saved_eptr = NULL;
1828
1829 /* If we are at the end of an assertion group or a non-capturing atomic
1830 group, stop matching and return MATCH_MATCH, but record the current high
1831 water mark for use by positive assertions. We also need to record the match
1832 start in case it was changed by \K. */
1833
1834 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1835 *prev == OP_ONCE_NC)
1836 {
1837 md->end_match_ptr = eptr; /* For ONCE_NC */
1838 md->end_offset_top = offset_top;
1839 md->start_match_ptr = mstart;
1840 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1841 }
1842
1843 /* For capturing groups we have to check the group number back at the start
1844 and if necessary complete handling an extraction by setting the offsets and
1845 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1846 into group 0, so it won't be picked up here. Instead, we catch it when the
1847 OP_END is reached. Other recursion is handled here. We just have to record
1848 the current subject position and start match pointer and give a MATCH
1849 return. */
1850
1851 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1852 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1853 {
1854 number = GET2(prev, 1+LINK_SIZE);
1855 offset = number << 1;
1856
1857 #ifdef PCRE_DEBUG
1858 printf("end bracket %d", number);
1859 printf("\n");
1860 #endif
1861
1862 /* Handle a recursively called group. */
1863
1864 if (md->recursive != NULL && md->recursive->group_num == number)
1865 {
1866 md->end_match_ptr = eptr;
1867 md->start_match_ptr = mstart;
1868 RRETURN(MATCH_MATCH);
1869 }
1870
1871 /* Deal with capturing */
1872
1873 md->capture_last = number;
1874 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1875 {
1876 /* If offset is greater than offset_top, it means that we are
1877 "skipping" a capturing group, and that group's offsets must be marked
1878 unset. In earlier versions of PCRE, all the offsets were unset at the
1879 start of matching, but this doesn't work because atomic groups and
1880 assertions can cause a value to be set that should later be unset.
1881 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1882 part of the atomic group, but this is not on the final matching path,
1883 so must be unset when 2 is set. (If there is no group 2, there is no
1884 problem, because offset_top will then be 2, indicating no capture.) */
1885
1886 if (offset > offset_top)
1887 {
1888 register int *iptr = md->offset_vector + offset_top;
1889 register int *iend = md->offset_vector + offset;
1890 while (iptr < iend) *iptr++ = -1;
1891 }
1892
1893 /* Now make the extraction */
1894
1895 md->offset_vector[offset] =
1896 md->offset_vector[md->offset_end - number];
1897 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1898 if (offset_top <= offset) offset_top = offset + 2;
1899 }
1900 }
1901
1902 /* For an ordinary non-repeating ket, just continue at this level. This
1903 also happens for a repeating ket if no characters were matched in the
1904 group. This is the forcible breaking of infinite loops as implemented in
1905 Perl 5.005. For a non-repeating atomic group that includes captures,
1906 establish a backup point by processing the rest of the pattern at a lower
1907 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1908 original OP_ONCE level, thereby bypassing intermediate backup points, but
1909 resetting any captures that happened along the way. */
1910
1911 if (*ecode == OP_KET || eptr == saved_eptr)
1912 {
1913 if (*prev == OP_ONCE)
1914 {
1915 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1917 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1918 RRETURN(MATCH_ONCE);
1919 }
1920 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1921 break;
1922 }
1923
1924 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1925 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1926 at a time from the outer level, thus saving stack. */
1927
1928 if (*ecode == OP_KETRPOS)
1929 {
1930 md->end_match_ptr = eptr;
1931 md->end_offset_top = offset_top;
1932 RRETURN(MATCH_KETRPOS);
1933 }
1934
1935 /* The normal repeating kets try the rest of the pattern or restart from
1936 the preceding bracket, in the appropriate order. In the second case, we can
1937 use tail recursion to avoid using another stack frame, unless we have an
1938 atomic group or an unlimited repeat of a group that can match an empty
1939 string. */
1940
1941 if (*ecode == OP_KETRMIN)
1942 {
1943 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1945 if (*prev == OP_ONCE)
1946 {
1947 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1949 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1950 RRETURN(MATCH_ONCE);
1951 }
1952 if (*prev >= OP_SBRA) /* Could match an empty string */
1953 {
1954 md->match_function_type = MATCH_CBEGROUP;
1955 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1956 RRETURN(rrc);
1957 }
1958 ecode = prev;
1959 goto TAIL_RECURSE;
1960 }
1961 else /* OP_KETRMAX */
1962 {
1963 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1964 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1965 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1967 if (*prev == OP_ONCE)
1968 {
1969 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971 md->once_target = prev;
1972 RRETURN(MATCH_ONCE);
1973 }
1974 ecode += 1 + LINK_SIZE;
1975 goto TAIL_RECURSE;
1976 }
1977 /* Control never gets here */
1978
1979 /* Not multiline mode: start of subject assertion, unless notbol. */
1980
1981 case OP_CIRC:
1982 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1983
1984 /* Start of subject assertion */
1985
1986 case OP_SOD:
1987 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1988 ecode++;
1989 break;
1990
1991 /* Multiline mode: start of subject unless notbol, or after any newline. */
1992
1993 case OP_CIRCM:
1994 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1995 if (eptr != md->start_subject &&
1996 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1997 MRRETURN(MATCH_NOMATCH);
1998 ecode++;
1999 break;
2000
2001 /* Start of match assertion */
2002
2003 case OP_SOM:
2004 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
2005 ecode++;
2006 break;
2007
2008 /* Reset the start of match point */
2009
2010 case OP_SET_SOM:
2011 mstart = eptr;
2012 ecode++;
2013 break;
2014
2015 /* Multiline mode: assert before any newline, or before end of subject
2016 unless noteol is set. */
2017
2018 case OP_DOLLM:
2019 if (eptr < md->end_subject)
2020 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
2021 else
2022 {
2023 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2024 SCHECK_PARTIAL();
2025 }
2026 ecode++;
2027 break;
2028
2029 /* Not multiline mode: assert before a terminating newline or before end of
2030 subject unless noteol is set. */
2031
2032 case OP_DOLL:
2033 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2034 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2035
2036 /* ... else fall through for endonly */
2037
2038 /* End of subject assertion (\z) */
2039
2040 case OP_EOD:
2041 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
2042 SCHECK_PARTIAL();
2043 ecode++;
2044 break;
2045
2046 /* End of subject or ending \n assertion (\Z) */
2047
2048 case OP_EODN:
2049 ASSERT_NL_OR_EOS:
2050 if (eptr < md->end_subject &&
2051 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2052 MRRETURN(MATCH_NOMATCH);
2053
2054 /* Either at end of string or \n before end. */
2055
2056 SCHECK_PARTIAL();
2057 ecode++;
2058 break;
2059
2060 /* Word boundary assertions */
2061
2062 case OP_NOT_WORD_BOUNDARY:
2063 case OP_WORD_BOUNDARY:
2064 {
2065
2066 /* Find out if the previous and current characters are "word" characters.
2067 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2068 be "non-word" characters. Remember the earliest consulted character for
2069 partial matching. */
2070
2071 #ifdef SUPPORT_UTF8
2072 if (utf8)
2073 {
2074 /* Get status of previous character */
2075
2076 if (eptr == md->start_subject) prev_is_word = FALSE; else
2077 {
2078 USPTR lastptr = eptr - 1;
2079 while((*lastptr & 0xc0) == 0x80) lastptr--;
2080 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2081 GETCHAR(c, lastptr);
2082 #ifdef SUPPORT_UCP
2083 if (md->use_ucp)
2084 {
2085 if (c == '_') prev_is_word = TRUE; else
2086 {
2087 int cat = UCD_CATEGORY(c);
2088 prev_is_word = (cat == ucp_L || cat == ucp_N);
2089 }
2090 }
2091 else
2092 #endif
2093 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2094 }
2095
2096 /* Get status of next character */
2097
2098 if (eptr >= md->end_subject)
2099 {
2100 SCHECK_PARTIAL();
2101 cur_is_word = FALSE;
2102 }
2103 else
2104 {
2105 GETCHAR(c, eptr);
2106 #ifdef SUPPORT_UCP
2107 if (md->use_ucp)
2108 {
2109 if (c == '_') cur_is_word = TRUE; else
2110 {
2111 int cat = UCD_CATEGORY(c);
2112 cur_is_word = (cat == ucp_L || cat == ucp_N);
2113 }
2114 }
2115 else
2116 #endif
2117 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2118 }
2119 }
2120 else
2121 #endif
2122
2123 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2124 consistency with the behaviour of \w we do use it in this case. */
2125
2126 {
2127 /* Get status of previous character */
2128
2129 if (eptr == md->start_subject) prev_is_word = FALSE; else
2130 {
2131 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2132 #ifdef SUPPORT_UCP
2133 if (md->use_ucp)
2134 {
2135 c = eptr[-1];
2136 if (c == '_') prev_is_word = TRUE; else
2137 {
2138 int cat = UCD_CATEGORY(c);
2139 prev_is_word = (cat == ucp_L || cat == ucp_N);
2140 }
2141 }
2142 else
2143 #endif
2144 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2145 }
2146
2147 /* Get status of next character */
2148
2149 if (eptr >= md->end_subject)
2150 {
2151 SCHECK_PARTIAL();
2152 cur_is_word = FALSE;
2153 }
2154 else
2155 #ifdef SUPPORT_UCP
2156 if (md->use_ucp)
2157 {
2158 c = *eptr;
2159 if (c == '_') cur_is_word = TRUE; else
2160 {
2161 int cat = UCD_CATEGORY(c);
2162 cur_is_word = (cat == ucp_L || cat == ucp_N);
2163 }
2164 }
2165 else
2166 #endif
2167 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2168 }
2169
2170 /* Now see if the situation is what we want */
2171
2172 if ((*ecode++ == OP_WORD_BOUNDARY)?
2173 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2174 MRRETURN(MATCH_NOMATCH);
2175 }
2176 break;
2177
2178 /* Match a single character type; inline for speed */
2179
2180 case OP_ANY:
2181 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2182 /* Fall through */
2183
2184 case OP_ALLANY:
2185 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2186 { /* not be updated before SCHECK_PARTIAL. */
2187 SCHECK_PARTIAL();
2188 MRRETURN(MATCH_NOMATCH);
2189 }
2190 eptr++;
2191 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2192 ecode++;
2193 break;
2194
2195 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2196 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2197
2198 case OP_ANYBYTE:
2199 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2200 { /* not be updated before SCHECK_PARTIAL. */
2201 SCHECK_PARTIAL();
2202 MRRETURN(MATCH_NOMATCH);
2203 }
2204 eptr++;
2205 ecode++;
2206 break;
2207
2208 case OP_NOT_DIGIT:
2209 if (eptr >= md->end_subject)
2210 {
2211 SCHECK_PARTIAL();
2212 MRRETURN(MATCH_NOMATCH);
2213 }
2214 GETCHARINCTEST(c, eptr);
2215 if (
2216 #ifdef SUPPORT_UTF8
2217 c < 256 &&
2218 #endif
2219 (md->ctypes[c] & ctype_digit) != 0
2220 )
2221 MRRETURN(MATCH_NOMATCH);
2222 ecode++;
2223 break;
2224
2225 case OP_DIGIT:
2226 if (eptr >= md->end_subject)
2227 {
2228 SCHECK_PARTIAL();
2229 MRRETURN(MATCH_NOMATCH);
2230 }
2231 GETCHARINCTEST(c, eptr);
2232 if (
2233 #ifdef SUPPORT_UTF8
2234 c >= 256 ||
2235 #endif
2236 (md->ctypes[c] & ctype_digit) == 0
2237 )
2238 MRRETURN(MATCH_NOMATCH);
2239 ecode++;
2240 break;
2241
2242 case OP_NOT_WHITESPACE:
2243 if (eptr >= md->end_subject)
2244 {
2245 SCHECK_PARTIAL();
2246 MRRETURN(MATCH_NOMATCH);
2247 }
2248 GETCHARINCTEST(c, eptr);
2249 if (
2250 #ifdef SUPPORT_UTF8
2251 c < 256 &&
2252 #endif
2253 (md->ctypes[c] & ctype_space) != 0
2254 )
2255 MRRETURN(MATCH_NOMATCH);
2256 ecode++;
2257 break;
2258
2259 case OP_WHITESPACE:
2260 if (eptr >= md->end_subject)
2261 {
2262 SCHECK_PARTIAL();
2263 MRRETURN(MATCH_NOMATCH);
2264 }
2265 GETCHARINCTEST(c, eptr);
2266 if (
2267 #ifdef SUPPORT_UTF8
2268 c >= 256 ||
2269 #endif
2270 (md->ctypes[c] & ctype_space) == 0
2271 )
2272 MRRETURN(MATCH_NOMATCH);
2273 ecode++;
2274 break;
2275
2276 case OP_NOT_WORDCHAR:
2277 if (eptr >= md->end_subject)
2278 {
2279 SCHECK_PARTIAL();
2280 MRRETURN(MATCH_NOMATCH);
2281 }
2282 GETCHARINCTEST(c, eptr);
2283 if (
2284 #ifdef SUPPORT_UTF8
2285 c < 256 &&
2286 #endif
2287 (md->ctypes[c] & ctype_word) != 0
2288 )
2289 MRRETURN(MATCH_NOMATCH);
2290 ecode++;
2291 break;
2292
2293 case OP_WORDCHAR:
2294 if (eptr >= md->end_subject)
2295 {
2296 SCHECK_PARTIAL();
2297 MRRETURN(MATCH_NOMATCH);
2298 }
2299 GETCHARINCTEST(c, eptr);
2300 if (
2301 #ifdef SUPPORT_UTF8
2302 c >= 256 ||
2303 #endif
2304 (md->ctypes[c] & ctype_word) == 0
2305 )
2306 MRRETURN(MATCH_NOMATCH);
2307 ecode++;
2308 break;
2309
2310 case OP_ANYNL:
2311 if (eptr >= md->end_subject)
2312 {
2313 SCHECK_PARTIAL();
2314 MRRETURN(MATCH_NOMATCH);
2315 }
2316 GETCHARINCTEST(c, eptr);
2317 switch(c)
2318 {
2319 default: MRRETURN(MATCH_NOMATCH);
2320
2321 case 0x000d:
2322 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2323 break;
2324
2325 case 0x000a:
2326 break;
2327
2328 case 0x000b:
2329 case 0x000c:
2330 case 0x0085:
2331 case 0x2028:
2332 case 0x2029:
2333 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2334 break;
2335 }
2336 ecode++;
2337 break;
2338
2339 case OP_NOT_HSPACE:
2340 if (eptr >= md->end_subject)
2341 {
2342 SCHECK_PARTIAL();
2343 MRRETURN(MATCH_NOMATCH);
2344 }
2345 GETCHARINCTEST(c, eptr);
2346 switch(c)
2347 {
2348 default: break;
2349 case 0x09: /* HT */
2350 case 0x20: /* SPACE */
2351 case 0xa0: /* NBSP */
2352 case 0x1680: /* OGHAM SPACE MARK */
2353 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2354 case 0x2000: /* EN QUAD */
2355 case 0x2001: /* EM QUAD */
2356 case 0x2002: /* EN SPACE */
2357 case 0x2003: /* EM SPACE */
2358 case 0x2004: /* THREE-PER-EM SPACE */
2359 case 0x2005: /* FOUR-PER-EM SPACE */
2360 case 0x2006: /* SIX-PER-EM SPACE */
2361 case 0x2007: /* FIGURE SPACE */
2362 case 0x2008: /* PUNCTUATION SPACE */
2363 case 0x2009: /* THIN SPACE */
2364 case 0x200A: /* HAIR SPACE */
2365 case 0x202f: /* NARROW NO-BREAK SPACE */
2366 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2367 case 0x3000: /* IDEOGRAPHIC SPACE */
2368 MRRETURN(MATCH_NOMATCH);
2369 }
2370 ecode++;
2371 break;
2372
2373 case OP_HSPACE:
2374 if (eptr >= md->end_subject)
2375 {
2376 SCHECK_PARTIAL();
2377 MRRETURN(MATCH_NOMATCH);
2378 }
2379 GETCHARINCTEST(c, eptr);
2380 switch(c)
2381 {
2382 default: MRRETURN(MATCH_NOMATCH);
2383 case 0x09: /* HT */
2384 case 0x20: /* SPACE */
2385 case 0xa0: /* NBSP */
2386 case 0x1680: /* OGHAM SPACE MARK */
2387 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2388 case 0x2000: /* EN QUAD */
2389 case 0x2001: /* EM QUAD */
2390 case 0x2002: /* EN SPACE */
2391 case 0x2003: /* EM SPACE */
2392 case 0x2004: /* THREE-PER-EM SPACE */
2393 case 0x2005: /* FOUR-PER-EM SPACE */
2394 case 0x2006: /* SIX-PER-EM SPACE */
2395 case 0x2007: /* FIGURE SPACE */
2396 case 0x2008: /* PUNCTUATION SPACE */
2397 case 0x2009: /* THIN SPACE */
2398 case 0x200A: /* HAIR SPACE */
2399 case 0x202f: /* NARROW NO-BREAK SPACE */
2400 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2401 case 0x3000: /* IDEOGRAPHIC SPACE */
2402 break;
2403 }
2404 ecode++;
2405 break;
2406
2407 case OP_NOT_VSPACE:
2408 if (eptr >= md->end_subject)
2409 {
2410 SCHECK_PARTIAL();
2411 MRRETURN(MATCH_NOMATCH);
2412 }
2413 GETCHARINCTEST(c, eptr);
2414 switch(c)
2415 {
2416 default: break;
2417 case 0x0a: /* LF */
2418 case 0x0b: /* VT */
2419 case 0x0c: /* FF */
2420 case 0x0d: /* CR */
2421 case 0x85: /* NEL */
2422 case 0x2028: /* LINE SEPARATOR */
2423 case 0x2029: /* PARAGRAPH SEPARATOR */
2424 MRRETURN(MATCH_NOMATCH);
2425 }
2426 ecode++;
2427 break;
2428
2429 case OP_VSPACE:
2430 if (eptr >= md->end_subject)
2431 {
2432 SCHECK_PARTIAL();
2433 MRRETURN(MATCH_NOMATCH);
2434 }
2435 GETCHARINCTEST(c, eptr);
2436 switch(c)
2437 {
2438 default: MRRETURN(MATCH_NOMATCH);
2439 case 0x0a: /* LF */
2440 case 0x0b: /* VT */
2441 case 0x0c: /* FF */
2442 case 0x0d: /* CR */
2443 case 0x85: /* NEL */
2444 case 0x2028: /* LINE SEPARATOR */
2445 case 0x2029: /* PARAGRAPH SEPARATOR */
2446 break;
2447 }
2448 ecode++;
2449 break;
2450
2451 #ifdef SUPPORT_UCP
2452 /* Check the next character by Unicode property. We will get here only
2453 if the support is in the binary; otherwise a compile-time error occurs. */
2454
2455 case OP_PROP:
2456 case OP_NOTPROP:
2457 if (eptr >= md->end_subject)
2458 {
2459 SCHECK_PARTIAL();
2460 MRRETURN(MATCH_NOMATCH);
2461 }
2462 GETCHARINCTEST(c, eptr);
2463 {
2464 const ucd_record *prop = GET_UCD(c);
2465
2466 switch(ecode[1])
2467 {
2468 case PT_ANY:
2469 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2470 break;
2471
2472 case PT_LAMP:
2473 if ((prop->chartype == ucp_Lu ||
2474 prop->chartype == ucp_Ll ||
2475 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2476 MRRETURN(MATCH_NOMATCH);
2477 break;
2478
2479 case PT_GC:
2480 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2481 MRRETURN(MATCH_NOMATCH);
2482 break;
2483
2484 case PT_PC:
2485 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2486 MRRETURN(MATCH_NOMATCH);
2487 break;
2488
2489 case PT_SC:
2490 if ((ecode[2] != prop->script) == (op == OP_PROP))
2491 MRRETURN(MATCH_NOMATCH);
2492 break;
2493
2494 /* These are specials */
2495
2496 case PT_ALNUM:
2497 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2498 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2499 MRRETURN(MATCH_NOMATCH);
2500 break;
2501
2502 case PT_SPACE: /* Perl space */
2503 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2504 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2505 == (op == OP_NOTPROP))
2506 MRRETURN(MATCH_NOMATCH);
2507 break;
2508
2509 case PT_PXSPACE: /* POSIX space */
2510 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2511 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2512 c == CHAR_FF || c == CHAR_CR)
2513 == (op == OP_NOTPROP))
2514 MRRETURN(MATCH_NOMATCH);
2515 break;
2516
2517 case PT_WORD:
2518 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2519 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2520 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2521 MRRETURN(MATCH_NOMATCH);
2522 break;
2523
2524 /* This should never occur */
2525
2526 default:
2527 RRETURN(PCRE_ERROR_INTERNAL);
2528 }
2529
2530 ecode += 3;
2531 }
2532 break;
2533
2534 /* Match an extended Unicode sequence. We will get here only if the support
2535 is in the binary; otherwise a compile-time error occurs. */
2536
2537 case OP_EXTUNI:
2538 if (eptr >= md->end_subject)
2539 {
2540 SCHECK_PARTIAL();
2541 MRRETURN(MATCH_NOMATCH);
2542 }
2543 GETCHARINCTEST(c, eptr);
2544 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2545 while (eptr < md->end_subject)
2546 {
2547 int len = 1;
2548 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2549 if (UCD_CATEGORY(c) != ucp_M) break;
2550 eptr += len;
2551 }
2552 ecode++;
2553 break;
2554 #endif
2555
2556
2557 /* Match a back reference, possibly repeatedly. Look past the end of the
2558 item to see if there is repeat information following. The code is similar
2559 to that for character classes, but repeated for efficiency. Then obey
2560 similar code to character type repeats - written out again for speed.
2561 However, if the referenced string is the empty string, always treat
2562 it as matched, any number of times (otherwise there could be infinite
2563 loops). */
2564
2565 case OP_REF:
2566 case OP_REFI:
2567 caseless = op == OP_REFI;
2568 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2569 ecode += 3;
2570
2571 /* If the reference is unset, there are two possibilities:
2572
2573 (a) In the default, Perl-compatible state, set the length negative;
2574 this ensures that every attempt at a match fails. We can't just fail
2575 here, because of the possibility of quantifiers with zero minima.
2576
2577 (b) If the JavaScript compatibility flag is set, set the length to zero
2578 so that the back reference matches an empty string.
2579
2580 Otherwise, set the length to the length of what was matched by the
2581 referenced subpattern. */
2582
2583 if (offset >= offset_top || md->offset_vector[offset] < 0)
2584 length = (md->jscript_compat)? 0 : -1;
2585 else
2586 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2587
2588 /* Set up for repetition, or handle the non-repeated case */
2589
2590 switch (*ecode)
2591 {
2592 case OP_CRSTAR:
2593 case OP_CRMINSTAR:
2594 case OP_CRPLUS:
2595 case OP_CRMINPLUS:
2596 case OP_CRQUERY:
2597 case OP_CRMINQUERY:
2598 c = *ecode++ - OP_CRSTAR;
2599 minimize = (c & 1) != 0;
2600 min = rep_min[c]; /* Pick up values from tables; */
2601 max = rep_max[c]; /* zero for max => infinity */
2602 if (max == 0) max = INT_MAX;
2603 break;
2604
2605 case OP_CRRANGE:
2606 case OP_CRMINRANGE:
2607 minimize = (*ecode == OP_CRMINRANGE);
2608 min = GET2(ecode, 1);
2609 max = GET2(ecode, 3);
2610 if (max == 0) max = INT_MAX;
2611 ecode += 5;
2612 break;
2613
2614 default: /* No repeat follows */
2615 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2616 {
2617 CHECK_PARTIAL();
2618 MRRETURN(MATCH_NOMATCH);
2619 }
2620 eptr += length;
2621 continue; /* With the main loop */
2622 }
2623
2624 /* Handle repeated back references. If the length of the reference is
2625 zero, just continue with the main loop. */
2626
2627 if (length == 0) continue;
2628
2629 /* First, ensure the minimum number of matches are present. We get back
2630 the length of the reference string explicitly rather than passing the
2631 address of eptr, so that eptr can be a register variable. */
2632
2633 for (i = 1; i <= min; i++)
2634 {
2635 int slength;
2636 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2637 {
2638 CHECK_PARTIAL();
2639 MRRETURN(MATCH_NOMATCH);
2640 }
2641 eptr += slength;
2642 }
2643
2644 /* If min = max, continue at the same level without recursion.
2645 They are not both allowed to be zero. */
2646
2647 if (min == max) continue;
2648
2649 /* If minimizing, keep trying and advancing the pointer */
2650
2651 if (minimize)
2652 {
2653 for (fi = min;; fi++)
2654 {
2655 int slength;
2656 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2658 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2659 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2660 {
2661 CHECK_PARTIAL();
2662 MRRETURN(MATCH_NOMATCH);
2663 }
2664 eptr += slength;
2665 }
2666 /* Control never gets here */
2667 }
2668
2669 /* If maximizing, find the longest string and work backwards */
2670
2671 else
2672 {
2673 pp = eptr;
2674 for (i = min; i < max; i++)
2675 {
2676 int slength;
2677 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2678 {
2679 CHECK_PARTIAL();
2680 break;
2681 }
2682 eptr += slength;
2683 }
2684 while (eptr >= pp)
2685 {
2686 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2688 eptr -= length;
2689 }
2690 MRRETURN(MATCH_NOMATCH);
2691 }
2692 /* Control never gets here */
2693
2694 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2695 used when all the characters in the class have values in the range 0-255,
2696 and either the matching is caseful, or the characters are in the range
2697 0-127 when UTF-8 processing is enabled. The only difference between
2698 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2699 encountered.
2700
2701 First, look past the end of the item to see if there is repeat information
2702 following. Then obey similar code to character type repeats - written out
2703 again for speed. */
2704
2705 case OP_NCLASS:
2706 case OP_CLASS:
2707 {
2708 data = ecode + 1; /* Save for matching */
2709 ecode += 33; /* Advance past the item */
2710
2711 switch (*ecode)
2712 {
2713 case OP_CRSTAR:
2714 case OP_CRMINSTAR:
2715 case OP_CRPLUS:
2716 case OP_CRMINPLUS:
2717 case OP_CRQUERY:
2718 case OP_CRMINQUERY:
2719 c = *ecode++ - OP_CRSTAR;
2720 minimize = (c & 1) != 0;
2721 min = rep_min[c]; /* Pick up values from tables; */
2722 max = rep_max[c]; /* zero for max => infinity */
2723 if (max == 0) max = INT_MAX;
2724 break;
2725
2726 case OP_CRRANGE:
2727 case OP_CRMINRANGE:
2728 minimize = (*ecode == OP_CRMINRANGE);
2729 min = GET2(ecode, 1);
2730 max = GET2(ecode, 3);
2731 if (max == 0) max = INT_MAX;
2732 ecode += 5;
2733 break;
2734
2735 default: /* No repeat follows */
2736 min = max = 1;
2737 break;
2738 }
2739
2740 /* First, ensure the minimum number of matches are present. */
2741
2742 #ifdef SUPPORT_UTF8
2743 /* UTF-8 mode */
2744 if (utf8)
2745 {
2746 for (i = 1; i <= min; i++)
2747 {
2748 if (eptr >= md->end_subject)
2749 {
2750 SCHECK_PARTIAL();
2751 MRRETURN(MATCH_NOMATCH);
2752 }
2753 GETCHARINC(c, eptr);
2754 if (c > 255)
2755 {
2756 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2757 }
2758 else
2759 {
2760 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2761 }
2762 }
2763 }
2764 else
2765 #endif
2766 /* Not UTF-8 mode */
2767 {
2768 for (i = 1; i <= min; i++)
2769 {
2770 if (eptr >= md->end_subject)
2771 {
2772 SCHECK_PARTIAL();
2773 MRRETURN(MATCH_NOMATCH);
2774 }
2775 c = *eptr++;
2776 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2777 }
2778 }
2779
2780 /* If max == min we can continue with the main loop without the
2781 need to recurse. */
2782
2783 if (min == max) continue;
2784
2785 /* If minimizing, keep testing the rest of the expression and advancing
2786 the pointer while it matches the class. */
2787
2788 if (minimize)
2789 {
2790 #ifdef SUPPORT_UTF8
2791 /* UTF-8 mode */
2792 if (utf8)
2793 {
2794 for (fi = min;; fi++)
2795 {
2796 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2798 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2799 if (eptr >= md->end_subject)
2800 {
2801 SCHECK_PARTIAL();
2802 MRRETURN(MATCH_NOMATCH);
2803 }
2804 GETCHARINC(c, eptr);
2805 if (c > 255)
2806 {
2807 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2808 }
2809 else
2810 {
2811 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2812 }
2813 }
2814 }
2815 else
2816 #endif
2817 /* Not UTF-8 mode */
2818 {
2819 for (fi = min;; fi++)
2820 {
2821 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2823 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2824 if (eptr >= md->end_subject)
2825 {
2826 SCHECK_PARTIAL();
2827 MRRETURN(MATCH_NOMATCH);
2828 }
2829 c = *eptr++;
2830 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2831 }
2832 }
2833 /* Control never gets here */
2834 }
2835
2836 /* If maximizing, find the longest possible run, then work backwards. */
2837
2838 else
2839 {
2840 pp = eptr;
2841
2842 #ifdef SUPPORT_UTF8
2843 /* UTF-8 mode */
2844 if (utf8)
2845 {
2846 for (i = min; i < max; i++)
2847 {
2848 int len = 1;
2849 if (eptr >= md->end_subject)
2850 {
2851 SCHECK_PARTIAL();
2852 break;
2853 }
2854 GETCHARLEN(c, eptr, len);
2855 if (c > 255)
2856 {
2857 if (op == OP_CLASS) break;
2858 }
2859 else
2860 {
2861 if ((data[c/8] & (1 << (c&7))) == 0) break;
2862 }
2863 eptr += len;
2864 }
2865 for (;;)
2866 {
2867 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2869 if (eptr-- == pp) break; /* Stop if tried at original pos */
2870 BACKCHAR(eptr);
2871 }
2872 }
2873 else
2874 #endif
2875 /* Not UTF-8 mode */
2876 {
2877 for (i = min; i < max; i++)
2878 {
2879 if (eptr >= md->end_subject)
2880 {
2881 SCHECK_PARTIAL();
2882 break;
2883 }
2884 c = *eptr;
2885 if ((data[c/8] & (1 << (c&7))) == 0) break;
2886 eptr++;
2887 }
2888 while (eptr >= pp)
2889 {
2890 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2892 eptr--;
2893 }
2894 }
2895
2896 MRRETURN(MATCH_NOMATCH);
2897 }
2898 }
2899 /* Control never gets here */
2900
2901
2902 /* Match an extended character class. This opcode is encountered only
2903 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2904 mode, because Unicode properties are supported in non-UTF-8 mode. */
2905
2906 #ifdef SUPPORT_UTF8
2907 case OP_XCLASS:
2908 {
2909 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2910 ecode += GET(ecode, 1); /* Advance past the item */
2911
2912 switch (*ecode)
2913 {
2914 case OP_CRSTAR:
2915 case OP_CRMINSTAR:
2916 case OP_CRPLUS:
2917 case OP_CRMINPLUS:
2918 case OP_CRQUERY:
2919 case OP_CRMINQUERY:
2920 c = *ecode++ - OP_CRSTAR;
2921 minimize = (c & 1) != 0;
2922 min = rep_min[c]; /* Pick up values from tables; */
2923 max = rep_max[c]; /* zero for max => infinity */
2924 if (max == 0) max = INT_MAX;
2925 break;
2926
2927 case OP_CRRANGE:
2928 case OP_CRMINRANGE:
2929 minimize = (*ecode == OP_CRMINRANGE);
2930 min = GET2(ecode, 1);
2931 max = GET2(ecode, 3);
2932 if (max == 0) max = INT_MAX;
2933 ecode += 5;
2934 break;
2935
2936 default: /* No repeat follows */
2937 min = max = 1;
2938 break;
2939 }
2940
2941 /* First, ensure the minimum number of matches are present. */
2942
2943 for (i = 1; i <= min; i++)
2944 {
2945 if (eptr >= md->end_subject)
2946 {
2947 SCHECK_PARTIAL();
2948 MRRETURN(MATCH_NOMATCH);
2949 }
2950 GETCHARINCTEST(c, eptr);
2951 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2952 }
2953
2954 /* If max == min we can continue with the main loop without the
2955 need to recurse. */
2956
2957 if (min == max) continue;
2958
2959 /* If minimizing, keep testing the rest of the expression and advancing
2960 the pointer while it matches the class. */
2961
2962 if (minimize)
2963 {
2964 for (fi = min;; fi++)
2965 {
2966 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2968 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2969 if (eptr >= md->end_subject)
2970 {
2971 SCHECK_PARTIAL();
2972 MRRETURN(MATCH_NOMATCH);
2973 }
2974 GETCHARINCTEST(c, eptr);
2975 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2976 }
2977 /* Control never gets here */
2978 }
2979
2980 /* If maximizing, find the longest possible run, then work backwards. */
2981
2982 else
2983 {
2984 pp = eptr;
2985 for (i = min; i < max; i++)
2986 {
2987 int len = 1;
2988 if (eptr >= md->end_subject)
2989 {
2990 SCHECK_PARTIAL();
2991 break;
2992 }
2993 GETCHARLENTEST(c, eptr, len);
2994 if (!_pcre_xclass(c, data)) break;
2995 eptr += len;
2996 }
2997 for(;;)
2998 {
2999 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3001 if (eptr-- == pp) break; /* Stop if tried at original pos */
3002 if (utf8) BACKCHAR(eptr);
3003 }
3004 MRRETURN(MATCH_NOMATCH);
3005 }
3006
3007 /* Control never gets here */
3008 }
3009 #endif /* End of XCLASS */
3010
3011 /* Match a single character, casefully */
3012
3013 case OP_CHAR:
3014 #ifdef SUPPORT_UTF8
3015 if (utf8)
3016 {
3017 length = 1;
3018 ecode++;
3019 GETCHARLEN(fc, ecode, length);
3020 if (length > md->end_subject - eptr)
3021 {
3022 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3023 MRRETURN(MATCH_NOMATCH);
3024 }
3025 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
3026 }
3027 else
3028 #endif
3029
3030 /* Non-UTF-8 mode */
3031 {
3032 if (md->end_subject - eptr < 1)
3033 {
3034 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3035 MRRETURN(MATCH_NOMATCH);
3036 }
3037 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
3038 ecode += 2;
3039 }
3040 break;
3041
3042 /* Match a single character, caselessly */
3043
3044 case OP_CHARI:
3045 #ifdef SUPPORT_UTF8
3046 if (utf8)
3047 {
3048 length = 1;
3049 ecode++;
3050 GETCHARLEN(fc, ecode, length);
3051
3052 if (length > md->end_subject - eptr)
3053 {
3054 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3055 MRRETURN(MATCH_NOMATCH);
3056 }
3057
3058 /* If the pattern character's value is < 128, we have only one byte, and
3059 can use the fast lookup table. */
3060
3061 if (fc < 128)
3062 {
3063 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3064 }
3065
3066 /* Otherwise we must pick up the subject character */
3067
3068 else
3069 {
3070 unsigned int dc;
3071 GETCHARINC(dc, eptr);
3072 ecode += length;
3073
3074 /* If we have Unicode property support, we can use it to test the other
3075 case of the character, if there is one. */
3076
3077 if (fc != dc)
3078 {
3079 #ifdef SUPPORT_UCP
3080 if (dc != UCD_OTHERCASE(fc))
3081 #endif
3082 MRRETURN(MATCH_NOMATCH);
3083 }
3084 }
3085 }
3086 else
3087 #endif /* SUPPORT_UTF8 */
3088
3089 /* Non-UTF-8 mode */
3090 {
3091 if (md->end_subject - eptr < 1)
3092 {
3093 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3094 MRRETURN(MATCH_NOMATCH);
3095 }
3096 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3097 ecode += 2;
3098 }
3099 break;
3100
3101 /* Match a single character repeatedly. */
3102
3103 case OP_EXACT:
3104 case OP_EXACTI:
3105 min = max = GET2(ecode, 1);
3106 ecode += 3;
3107 goto REPEATCHAR;
3108
3109 case OP_POSUPTO:
3110 case OP_POSUPTOI:
3111 possessive = TRUE;
3112 /* Fall through */
3113
3114 case OP_UPTO:
3115 case OP_UPTOI:
3116 case OP_MINUPTO:
3117 case OP_MINUPTOI:
3118 min = 0;
3119 max = GET2(ecode, 1);
3120 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3121 ecode += 3;
3122 goto REPEATCHAR;
3123
3124 case OP_POSSTAR:
3125 case OP_POSSTARI:
3126 possessive = TRUE;
3127 min = 0;
3128 max = INT_MAX;
3129 ecode++;
3130 goto REPEATCHAR;
3131
3132 case OP_POSPLUS:
3133 case OP_POSPLUSI:
3134 possessive = TRUE;
3135 min = 1;
3136 max = INT_MAX;
3137 ecode++;
3138 goto REPEATCHAR;
3139
3140 case OP_POSQUERY:
3141 case OP_POSQUERYI:
3142 possessive = TRUE;
3143 min = 0;
3144 max = 1;
3145 ecode++;
3146 goto REPEATCHAR;
3147
3148 case OP_STAR:
3149 case OP_STARI:
3150 case OP_MINSTAR:
3151 case OP_MINSTARI:
3152 case OP_PLUS:
3153 case OP_PLUSI:
3154 case OP_MINPLUS:
3155 case OP_MINPLUSI:
3156 case OP_QUERY:
3157 case OP_QUERYI:
3158 case OP_MINQUERY:
3159 case OP_MINQUERYI:
3160 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3161 minimize = (c & 1) != 0;
3162 min = rep_min[c]; /* Pick up values from tables; */
3163 max = rep_max[c]; /* zero for max => infinity */
3164 if (max == 0) max = INT_MAX;
3165
3166 /* Common code for all repeated single-character matches. */
3167
3168 REPEATCHAR:
3169 #ifdef SUPPORT_UTF8
3170 if (utf8)
3171 {
3172 length = 1;
3173 charptr = ecode;
3174 GETCHARLEN(fc, ecode, length);
3175 ecode += length;
3176
3177 /* Handle multibyte character matching specially here. There is
3178 support for caseless matching if UCP support is present. */
3179
3180 if (length > 1)
3181 {
3182 #ifdef SUPPORT_UCP
3183 unsigned int othercase;
3184 if (op >= OP_STARI && /* Caseless */
3185 (othercase = UCD_OTHERCASE(fc)) != fc)
3186 oclength = _pcre_ord2utf8(othercase, occhars);
3187 else oclength = 0;
3188 #endif /* SUPPORT_UCP */
3189
3190 for (i = 1; i <= min; i++)
3191 {
3192 if (eptr <= md->end_subject - length &&
3193 memcmp(eptr, charptr, length) == 0) eptr += length;
3194 #ifdef SUPPORT_UCP
3195 else if (oclength > 0 &&
3196 eptr <= md->end_subject - oclength &&
3197 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3198 #endif /* SUPPORT_UCP */
3199 else
3200 {
3201 CHECK_PARTIAL();
3202 MRRETURN(MATCH_NOMATCH);
3203 }
3204 }
3205
3206 if (min == max) continue;
3207
3208 if (minimize)
3209 {
3210 for (fi = min;; fi++)
3211 {
3212 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3214 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3215 if (eptr <= md->end_subject - length &&
3216 memcmp(eptr, charptr, length) == 0) eptr += length;
3217 #ifdef SUPPORT_UCP
3218 else if (oclength > 0 &&
3219 eptr <= md->end_subject - oclength &&
3220 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3221 #endif /* SUPPORT_UCP */
3222 else
3223 {
3224 CHECK_PARTIAL();
3225 MRRETURN(MATCH_NOMATCH);
3226 }
3227 }
3228 /* Control never gets here */
3229 }
3230
3231 else /* Maximize */
3232 {
3233 pp = eptr;
3234 for (i = min; i < max; i++)
3235 {
3236 if (eptr <= md->end_subject - length &&
3237 memcmp(eptr, charptr, length) == 0) eptr += length;
3238 #ifdef SUPPORT_UCP
3239 else if (oclength > 0 &&
3240 eptr <= md->end_subject - oclength &&
3241 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3242 #endif /* SUPPORT_UCP */
3243 else
3244 {
3245 CHECK_PARTIAL();
3246 break;
3247 }
3248 }
3249
3250 if (possessive) continue;
3251
3252 for(;;)
3253 {
3254 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3255 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3256 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3257 #ifdef SUPPORT_UCP
3258 eptr--;
3259 BACKCHAR(eptr);
3260 #else /* without SUPPORT_UCP */
3261 eptr -= length;
3262 #endif /* SUPPORT_UCP */
3263 }
3264 }
3265 /* Control never gets here */
3266 }
3267
3268 /* If the length of a UTF-8 character is 1, we fall through here, and
3269 obey the code as for non-UTF-8 characters below, though in this case the
3270 value of fc will always be < 128. */
3271 }
3272 else
3273 #endif /* SUPPORT_UTF8 */
3274
3275 /* When not in UTF-8 mode, load a single-byte character. */
3276
3277 fc = *ecode++;
3278
3279 /* The value of fc at this point is always less than 256, though we may or
3280 may not be in UTF-8 mode. The code is duplicated for the caseless and
3281 caseful cases, for speed, since matching characters is likely to be quite
3282 common. First, ensure the minimum number of matches are present. If min =
3283 max, continue at the same level without recursing. Otherwise, if
3284 minimizing, keep trying the rest of the expression and advancing one
3285 matching character if failing, up to the maximum. Alternatively, if
3286 maximizing, find the maximum number of characters and work backwards. */
3287
3288 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3289 max, eptr));
3290
3291 if (op >= OP_STARI) /* Caseless */
3292 {
3293 fc = md->lcc[fc];
3294 for (i = 1; i <= min; i++)
3295 {
3296 if (eptr >= md->end_subject)
3297 {
3298 SCHECK_PARTIAL();
3299 MRRETURN(MATCH_NOMATCH);
3300 }
3301 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3302 }
3303 if (min == max) continue;
3304 if (minimize)
3305 {
3306 for (fi = min;; fi++)
3307 {
3308 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3309 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3310 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3311 if (eptr >= md->end_subject)
3312 {
3313 SCHECK_PARTIAL();
3314 MRRETURN(MATCH_NOMATCH);
3315 }
3316 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3317 }
3318 /* Control never gets here */
3319 }
3320 else /* Maximize */
3321 {
3322 pp = eptr;
3323 for (i = min; i < max; i++)
3324 {
3325 if (eptr >= md->end_subject)
3326 {
3327 SCHECK_PARTIAL();
3328 break;
3329 }
3330 if (fc != md->lcc[*eptr]) break;
3331 eptr++;
3332 }
3333
3334 if (possessive) continue;
3335
3336 while (eptr >= pp)
3337 {
3338 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3339 eptr--;
3340 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3341 }
3342 MRRETURN(MATCH_NOMATCH);
3343 }
3344 /* Control never gets here */
3345 }
3346
3347 /* Caseful comparisons (single-byte characters only; multi-byte UTF-8 characters are fully handled above and never reach this point) */
3348
3349 else
3350 {
3351 for (i = 1; i <= min; i++)
3352 {
3353 if (eptr >= md->end_subject)
3354 {
3355 SCHECK_PARTIAL();
3356 MRRETURN(MATCH_NOMATCH);
3357 }
3358 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3359 }
3360
3361 if (min == max) continue;
3362
3363 if (minimize)
3364 {
3365 for (fi = min;; fi++)
3366 {
3367 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3368 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3369 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3370 if (eptr >= md->end_subject)
3371 {
3372 SCHECK_PARTIAL();
3373 MRRETURN(MATCH_NOMATCH);
3374 }
3375 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3376 }
3377 /* Control never gets here */
3378 }
3379 else /* Maximize */
3380 {
3381 pp = eptr;
3382 for (i = min; i < max; i++)
3383 {
3384 if (eptr >= md->end_subject)
3385 {
3386 SCHECK_PARTIAL();
3387 break;
3388 }
3389 if (fc != *eptr) break;
3390 eptr++;
3391 }
3392 if (possessive) continue;
3393
3394 while (eptr >= pp)
3395 {
3396 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3397 eptr--;
3398 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3399 }
3400 MRRETURN(MATCH_NOMATCH);
3401 }
3402 }
3403 /* Control never gets here */
3404
3405 /* Match a negated single one-byte character. The character we are
3406 checking can be multibyte. */
3407
3408 case OP_NOT:
3409 case OP_NOTI:
3410 if (eptr >= md->end_subject)
3411 {
3412 SCHECK_PARTIAL();
3413 MRRETURN(MATCH_NOMATCH);
3414 }
3415 ecode++;
3416 GETCHARINCTEST(c, eptr);
3417 if (op == OP_NOTI) /* The caseless case */
3418 {
3419 #ifdef SUPPORT_UTF8
3420 if (c < 256)
3421 #endif
3422 c = md->lcc[c];
3423 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3424 }
3425 else /* Caseful */
3426 {
3427 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3428 }
3429 break;
3430
3431 /* Match a negated single one-byte character repeatedly. This is almost a
3432 repeat of the code for a repeated single character, but I haven't found a
3433 nice way of commoning these up that doesn't require a test of the
3434 positive/negative option for each character match. Maybe that wouldn't add
3435 very much to the time taken, but character matching *is* what this is all
3436 about... */
3437
3438 case OP_NOTEXACT:
3439 case OP_NOTEXACTI:
3440 min = max = GET2(ecode, 1);
3441 ecode += 3;
3442 goto REPEATNOTCHAR;
3443
3444 case OP_NOTUPTO:
3445 case OP_NOTUPTOI:
3446 case OP_NOTMINUPTO:
3447 case OP_NOTMINUPTOI:
3448 min = 0;
3449 max = GET2(ecode, 1);
3450 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3451 ecode += 3;
3452 goto REPEATNOTCHAR;
3453
3454 case OP_NOTPOSSTAR:
3455 case OP_NOTPOSSTARI:
3456 possessive = TRUE;
3457 min = 0;
3458 max = INT_MAX;
3459 ecode++;
3460 goto REPEATNOTCHAR;
3461
3462 case OP_NOTPOSPLUS:
3463 case OP_NOTPOSPLUSI:
3464 possessive = TRUE;
3465 min = 1;
3466 max = INT_MAX;
3467 ecode++;
3468 goto REPEATNOTCHAR;
3469
3470 case OP_NOTPOSQUERY:
3471 case OP_NOTPOSQUERYI:
3472 possessive = TRUE;
3473 min = 0;
3474 max = 1;
3475 ecode++;
3476 goto REPEATNOTCHAR;
3477
3478 case OP_NOTPOSUPTO:
3479 case OP_NOTPOSUPTOI:
3480 possessive = TRUE;
3481 min = 0;
3482 max = GET2(ecode, 1);
3483 ecode += 3;
3484 goto REPEATNOTCHAR;
3485
3486 case OP_NOTSTAR:
3487 case OP_NOTSTARI:
3488 case OP_NOTMINSTAR:
3489 case OP_NOTMINSTARI:
3490 case OP_NOTPLUS:
3491 case OP_NOTPLUSI:
3492 case OP_NOTMINPLUS:
3493 case OP_NOTMINPLUSI:
3494 case OP_NOTQUERY:
3495 case OP_NOTQUERYI:
3496 case OP_NOTMINQUERY:
3497 case OP_NOTMINQUERYI:
3498 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3499 minimize = (c & 1) != 0;
3500 min = rep_min[c]; /* Pick up values from tables; */
3501 max = rep_max[c]; /* zero for max => infinity */
3502 if (max == 0) max = INT_MAX;
3503
3504 /* Common code for all repeated single-byte matches. */
3505
3506 REPEATNOTCHAR:
3507 fc = *ecode++;
3508
3509 /* The code is duplicated for the caseless and caseful cases, for speed,
3510 since matching characters is likely to be quite common. First, ensure the
3511 minimum number of matches are present. If min = max, continue at the same
3512 level without recursing. Otherwise, if minimizing, keep trying the rest of
3513 the expression and advancing one matching character if failing, up to the
3514 maximum. Alternatively, if maximizing, find the maximum number of
3515 characters and work backwards. */
3516
3517 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3518 max, eptr));
3519
3520 if (op >= OP_NOTSTARI) /* Caseless */
3521 {
3522 fc = md->lcc[fc];
3523
3524 #ifdef SUPPORT_UTF8
3525 /* UTF-8 mode */
3526 if (utf8)
3527 {
3528 register unsigned int d;
3529 for (i = 1; i <= min; i++)
3530 {
3531 if (eptr >= md->end_subject)
3532 {
3533 SCHECK_PARTIAL();
3534 MRRETURN(MATCH_NOMATCH);
3535 }
3536 GETCHARINC(d, eptr);
3537 if (d < 256) d = md->lcc[d];
3538 if (fc == d) MRRETURN(MATCH_NOMATCH);
3539 }
3540 }
3541 else
3542 #endif
3543
3544 /* Not UTF-8 mode */
3545 {
3546 for (i = 1; i <= min; i++)
3547 {
3548 if (eptr >= md->end_subject)
3549 {
3550 SCHECK_PARTIAL();
3551 MRRETURN(MATCH_NOMATCH);
3552 }
3553 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3554 }
3555 }
3556
3557 if (min == max) continue;
3558
3559 if (minimize)
3560 {
3561 #ifdef SUPPORT_UTF8
3562 /* UTF-8 mode */
3563 if (utf8)
3564 {
3565 register unsigned int d;
3566 for (fi = min;; fi++)
3567 {
3568 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3570 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3571 if (eptr >= md->end_subject)
3572 {
3573 SCHECK_PARTIAL();
3574 MRRETURN(MATCH_NOMATCH);
3575 }
3576 GETCHARINC(d, eptr);
3577 if (d < 256) d = md->lcc[d];
3578 if (fc == d) MRRETURN(MATCH_NOMATCH);
3579 }
3580 }
3581 else
3582 #endif
3583 /* Not UTF-8 mode */
3584 {
3585 for (fi = min;; fi++)
3586 {
3587 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3588 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3589 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3590 if (eptr >= md->end_subject)
3591 {
3592 SCHECK_PARTIAL();
3593 MRRETURN(MATCH_NOMATCH);
3594 }
3595 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3596 }
3597 }
3598 /* Control never gets here */
3599 }
3600
3601 /* Maximize case */
3602
3603 else
3604 {
3605 pp = eptr;
3606
3607 #ifdef SUPPORT_UTF8
3608 /* UTF-8 mode */
3609 if (utf8)
3610 {
3611 register unsigned int d;
3612 for (i = min; i < max; i++)
3613 {
3614 int len = 1;
3615 if (eptr >= md->end_subject)
3616 {
3617 SCHECK_PARTIAL();
3618 break;
3619 }
3620 GETCHARLEN(d, eptr, len);
3621 if (d < 256) d = md->lcc[d];
3622 if (fc == d) break;
3623 eptr += len;
3624 }
3625 if (possessive) continue;
3626 for(;;)
3627 {
3628 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3629 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3630 if (eptr-- == pp) break; /* Stop if tried at original pos */
3631 BACKCHAR(eptr);
3632 }
3633 }
3634 else
3635 #endif
3636 /* Not UTF-8 mode */
3637 {
3638 for (i = min; i < max; i++)
3639 {
3640 if (eptr >= md->end_subject)
3641 {
3642 SCHECK_PARTIAL();
3643 break;
3644 }
3645 if (fc == md->lcc[*eptr]) break;
3646 eptr++;
3647 }
3648 if (possessive) continue;
3649 while (eptr >= pp)
3650 {
3651 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3652 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3653 eptr--;
3654 }
3655 }
3656
3657 MRRETURN(MATCH_NOMATCH);
3658 }
3659 /* Control never gets here */
3660 }
3661
3662 /* Caseful comparisons */
3663
3664 else
3665 {
3666 #ifdef SUPPORT_UTF8
3667 /* UTF-8 mode */
3668 if (utf8)
3669 {
3670 register unsigned int d;
3671 for (i = 1; i <= min; i++)
3672 {
3673 if (eptr >= md->end_subject)
3674 {
3675 SCHECK_PARTIAL();
3676 MRRETURN(MATCH_NOMATCH);
3677 }
3678 GETCHARINC(d, eptr);
3679 if (fc == d) MRRETURN(MATCH_NOMATCH);
3680 }
3681 }
3682 else
3683 #endif
3684 /* Not UTF-8 mode */
3685 {
3686 for (i = 1; i <= min; i++)
3687 {
3688 if (eptr >= md->end_subject)
3689 {
3690 SCHECK_PARTIAL();
3691 MRRETURN(MATCH_NOMATCH);
3692 }
3693 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3694 }
3695 }
3696
3697 if (min == max) continue;
3698
3699 if (minimize)
3700 {
3701 #ifdef SUPPORT_UTF8
3702 /* UTF-8 mode */
3703 if (utf8)
3704 {
3705 register unsigned int d;
3706 for (fi = min;; fi++)
3707 {
3708 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3710 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3711 if (eptr >= md->end_subject)
3712 {
3713 SCHECK_PARTIAL();
3714 MRRETURN(MATCH_NOMATCH);
3715 }
3716 GETCHARINC(d, eptr);
3717 if (fc == d) MRRETURN(MATCH_NOMATCH);
3718 }
3719 }
3720 else
3721 #endif
3722 /* Not UTF-8 mode */
3723 {
3724 for (fi = min;; fi++)
3725 {
3726 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3727 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3728 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3729 if (eptr >= md->end_subject)
3730 {
3731 SCHECK_PARTIAL();
3732 MRRETURN(MATCH_NOMATCH);
3733 }
3734 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3735 }
3736 }
3737 /* Control never gets here */
3738 }
3739
3740 /* Maximize case */
3741
3742 else
3743 {
3744 pp = eptr;
3745
3746 #ifdef SUPPORT_UTF8
3747 /* UTF-8 mode */
3748 if (utf8)
3749 {
3750 register unsigned int d;
3751 for (i = min; i < max; i++)
3752 {
3753 int len = 1;
3754 if (eptr >= md->end_subject)
3755 {
3756 SCHECK_PARTIAL();
3757 break;
3758 }
3759 GETCHARLEN(d, eptr, len);
3760 if (fc == d) break;
3761 eptr += len;
3762 }
3763 if (possessive) continue;
3764 for(;;)
3765 {
3766 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3767 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3768 if (eptr-- == pp) break; /* Stop if tried at original pos */
3769 BACKCHAR(eptr);
3770 }
3771 }
3772 else
3773 #endif
3774 /* Not UTF-8 mode */
3775 {
3776 for (i = min; i < max; i++)
3777 {
3778 if (eptr >= md->end_subject)
3779 {
3780 SCHECK_PARTIAL();
3781 break;
3782 }
3783 if (fc == *eptr) break;
3784 eptr++;
3785 }
3786 if (possessive) continue;
3787 while (eptr >= pp)
3788 {
3789 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3791 eptr--;
3792 }
3793 }
3794
3795 MRRETURN(MATCH_NOMATCH);
3796 }
3797 }
3798 /* Control never gets here */
3799
3800 /* Match a single character type repeatedly; several different opcodes
3801 share code. This is very similar to the code for single characters, but we
3802 repeat it in the interests of efficiency. */
3803
3804 case OP_TYPEEXACT:
3805 min = max = GET2(ecode, 1);
3806 minimize = TRUE;
3807 ecode += 3;
3808 goto REPEATTYPE;
3809
3810 case OP_TYPEUPTO:
3811 case OP_TYPEMINUPTO:
3812 min = 0;
3813 max = GET2(ecode, 1);
3814 minimize = *ecode == OP_TYPEMINUPTO;
3815 ecode += 3;
3816 goto REPEATTYPE;
3817
3818 case OP_TYPEPOSSTAR:
3819 possessive = TRUE;
3820 min = 0;
3821 max = INT_MAX;
3822 ecode++;
3823 goto REPEATTYPE;
3824
3825 case OP_TYPEPOSPLUS:
3826 possessive = TRUE;
3827 min = 1;
3828 max = INT_MAX;
3829 ecode++;
3830 goto REPEATTYPE;
3831
3832 case OP_TYPEPOSQUERY:
3833 possessive = TRUE;
3834 min = 0;
3835 max = 1;
3836 ecode++;
3837 goto REPEATTYPE;
3838
3839 case OP_TYPEPOSUPTO:
3840 possessive = TRUE;
3841 min = 0;
3842 max = GET2(ecode, 1);
3843 ecode += 3;
3844 goto REPEATTYPE;
3845
3846 case OP_TYPESTAR:
3847 case OP_TYPEMINSTAR:
3848 case OP_TYPEPLUS:
3849 case OP_TYPEMINPLUS:
3850 case OP_TYPEQUERY:
3851 case OP_TYPEMINQUERY:
3852 c = *ecode++ - OP_TYPESTAR;
3853 minimize = (c & 1) != 0;
3854 min = rep_min[c]; /* Pick up values from tables; */
3855 max = rep_max[c]; /* zero for max => infinity */
3856 if (max == 0) max = INT_MAX;
3857
3858 /* Common code for all repeated single character type matches. Note that
3859 in UTF-8 mode, '.' and some other types (e.g. OP_ANYNL, OP_HSPACE, OP_VSPACE) can match
3860 multi-byte characters, whereas OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR match only one-byte characters. */
3861
3862 REPEATTYPE:
3863 ctype = *ecode++; /* Code for the character type */
3864
3865 #ifdef SUPPORT_UCP
3866 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3867 {
3868 prop_fail_result = ctype == OP_NOTPROP;
3869 prop_type = *ecode++;
3870 prop_value = *ecode++;
3871 }
3872 else prop_type = -1;
3873 #endif
3874
3875 /* First, ensure the minimum number of matches are present. Use inline
3876 code for maximizing the speed, and do the type test once at the start
3877 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3878 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3879 and single-bytes. */
3880
3881 if (min > 0)
3882 {
3883 #ifdef SUPPORT_UCP
3884 if (prop_type >= 0)
3885 {
3886 switch(prop_type)
3887 {
3888 case PT_ANY:
3889 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3890 for (i = 1; i <= min; i++)
3891 {
3892 if (eptr >= md->end_subject)
3893 {
3894 SCHECK_PARTIAL();
3895 MRRETURN(MATCH_NOMATCH);
3896 }
3897 GETCHARINCTEST(c, eptr);
3898 }
3899 break;
3900
3901 case PT_LAMP:
3902 for (i = 1; i <= min; i++)
3903 {
3904 int chartype;
3905 if (eptr >= md->end_subject)
3906 {
3907 SCHECK_PARTIAL();
3908 MRRETURN(MATCH_NOMATCH);
3909 }
3910 GETCHARINCTEST(c, eptr);
3911 chartype = UCD_CHARTYPE(c);
3912 if ((chartype == ucp_Lu ||
3913 chartype == ucp_Ll ||
3914 chartype == ucp_Lt) == prop_fail_result)
3915 MRRETURN(MATCH_NOMATCH);
3916 }
3917 break;
3918
3919 case PT_GC:
3920 for (i = 1; i <= min; i++)
3921 {
3922 if (eptr >= md->end_subject)
3923 {
3924 SCHECK_PARTIAL();
3925 MRRETURN(MATCH_NOMATCH);
3926 }
3927 GETCHARINCTEST(c, eptr);
3928 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3929 MRRETURN(MATCH_NOMATCH);
3930 }
3931 break;
3932
3933 case PT_PC:
3934 for (i = 1; i <= min; i++)
3935 {
3936 if (eptr >= md->end_subject)
3937 {
3938 SCHECK_PARTIAL();
3939 MRRETURN(MATCH_NOMATCH);
3940 }
3941 GETCHARINCTEST(c, eptr);
3942 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3943 MRRETURN(MATCH_NOMATCH);
3944 }
3945 break;
3946
3947 case PT_SC:
3948 for (i = 1; i <= min; i++)
3949 {
3950 if (eptr >= md->end_subject)
3951 {
3952 SCHECK_PARTIAL();
3953 MRRETURN(MATCH_NOMATCH);
3954 }
3955 GETCHARINCTEST(c, eptr);
3956 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3957 MRRETURN(MATCH_NOMATCH);
3958 }
3959 break;
3960
3961 case PT_ALNUM:
3962 for (i = 1; i <= min; i++)
3963 {
3964 int category;
3965 if (eptr >= md->end_subject)
3966 {
3967 SCHECK_PARTIAL();
3968 MRRETURN(MATCH_NOMATCH);
3969 }
3970 GETCHARINCTEST(c, eptr);
3971 category = UCD_CATEGORY(c);
3972 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3973 MRRETURN(MATCH_NOMATCH);
3974 }
3975 break;
3976
3977 case PT_SPACE: /* Perl space */
3978 for (i = 1; i <= min; i++)
3979 {
3980 if (eptr >= md->end_subject)
3981 {
3982 SCHECK_PARTIAL();
3983 MRRETURN(MATCH_NOMATCH);
3984 }
3985 GETCHARINCTEST(c, eptr);
3986 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3987 c == CHAR_FF || c == CHAR_CR)
3988 == prop_fail_result)
3989 MRRETURN(MATCH_NOMATCH);
3990 }
3991 break;
3992
3993 case PT_PXSPACE: /* POSIX space */
3994 for (i = 1; i <= min; i++)
3995 {
3996 if (eptr >= md->end_subject)
3997 {
3998 SCHECK_PARTIAL();
3999 MRRETURN(MATCH_NOMATCH);
4000 }
4001 GETCHARINCTEST(c, eptr);
4002 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4003 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4004 == prop_fail_result)
4005 MRRETURN(MATCH_NOMATCH);
4006 }
4007 break;
4008
4009 case PT_WORD:
4010 for (i = 1; i <= min; i++)
4011 {
4012 int category;
4013 if (eptr >= md->end_subject)
4014 {
4015 SCHECK_PARTIAL();
4016 MRRETURN(MATCH_NOMATCH);
4017 }
4018 GETCHARINCTEST(c, eptr);
4019 category = UCD_CATEGORY(c);
4020 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4021 == prop_fail_result)
4022 MRRETURN(MATCH_NOMATCH);
4023 }
4024 break;
4025
4026 /* This should not occur */
4027
4028 default:
4029 RRETURN(PCRE_ERROR_INTERNAL);
4030 }
4031 }
4032
4033 /* Match extended Unicode sequences. We will get here only if the
4034 support is in the binary; otherwise a compile-time error occurs. */
4035
4036 else if (ctype == OP_EXTUNI)
4037 {
4038 for (i = 1; i <= min; i++)
4039 {
4040 if (eptr >= md->end_subject)
4041 {
4042 SCHECK_PARTIAL();
4043 MRRETURN(MATCH_NOMATCH);
4044 }
4045 GETCHARINCTEST(c, eptr);
4046 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4047 while (eptr < md->end_subject)
4048 {
4049 int len = 1;
4050 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4051 if (UCD_CATEGORY(c) != ucp_M) break;
4052 eptr += len;
4053 }
4054 }
4055 }
4056
4057 else
4058 #endif /* SUPPORT_UCP */
4059
4060 /* Handle all other cases when the coding is UTF-8 */
4061
4062 #ifdef SUPPORT_UTF8
4063 if (utf8) switch(ctype)
4064 {
4065 case OP_ANY:
4066 for (i = 1; i <= min; i++)
4067 {
4068 if (eptr >= md->end_subject)
4069 {
4070 SCHECK_PARTIAL();
4071 MRRETURN(MATCH_NOMATCH);
4072 }
4073 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4074 eptr++;
4075 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4076 }
4077 break;
4078
4079 case OP_ALLANY:
4080 for (i = 1; i <= min; i++)
4081 {
4082 if (eptr >= md->end_subject)
4083 {
4084 SCHECK_PARTIAL();
4085 MRRETURN(MATCH_NOMATCH);
4086 }
4087 eptr++;
4088 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4089 }
4090 break;
4091
4092 case OP_ANYBYTE:
4093 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4094 eptr += min;
4095 break;
4096
4097 case OP_ANYNL:
4098 for (i = 1; i <= min; i++)
4099 {
4100 if (eptr >= md->end_subject)
4101 {
4102 SCHECK_PARTIAL();
4103 MRRETURN(MATCH_NOMATCH);
4104 }
4105 GETCHARINC(c, eptr);
4106 switch(c)
4107 {
4108 default: MRRETURN(MATCH_NOMATCH);
4109
4110 case 0x000d:
4111 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4112 break;
4113
4114 case 0x000a:
4115 break;
4116
4117 case 0x000b:
4118 case 0x000c:
4119 case 0x0085:
4120 case 0x2028:
4121 case 0x2029:
4122 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4123 break;
4124 }
4125 }
4126 break;
4127
4128 case OP_NOT_HSPACE:
4129 for (i = 1; i <= min; i++)
4130 {
4131 if (eptr >= md->end_subject)
4132 {
4133 SCHECK_PARTIAL();
4134 MRRETURN(MATCH_NOMATCH);
4135 }
4136 GETCHARINC(c, eptr);
4137 switch(c)
4138 {
4139 default: break;
4140 case 0x09: /* HT */
4141 case 0x20: /* SPACE */
4142 case 0xa0: /* NBSP */
4143 case 0x1680: /* OGHAM SPACE MARK */
4144 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4145 case 0x2000: /* EN QUAD */
4146 case 0x2001: /* EM QUAD */
4147 case 0x2002: /* EN SPACE */
4148 case 0x2003: /* EM SPACE */
4149 case 0x2004: /* THREE-PER-EM SPACE */
4150 case 0x2005: /* FOUR-PER-EM SPACE */
4151 case 0x2006: /* SIX-PER-EM SPACE */
4152 case 0x2007: /* FIGURE SPACE */
4153 case 0x2008: /* PUNCTUATION SPACE */
4154 case 0x2009: /* THIN SPACE */
4155 case 0x200A: /* HAIR SPACE */
4156 case 0x202f: /* NARROW NO-BREAK SPACE */
4157 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4158 case 0x3000: /* IDEOGRAPHIC SPACE */
4159 MRRETURN(MATCH_NOMATCH);
4160 }
4161 }
4162 break;
4163
4164 case OP_HSPACE:
4165 for (i = 1; i <= min; i++)
4166 {
4167 if (eptr >= md->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 MRRETURN(MATCH_NOMATCH);
4171 }
4172 GETCHARINC(c, eptr);
4173 switch(c)
4174 {
4175 default: MRRETURN(MATCH_NOMATCH);
4176 case 0x09: /* HT */
4177 case 0x20: /* SPACE */
4178 case 0xa0: /* NBSP */
4179 case 0x1680: /* OGHAM SPACE MARK */
4180 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4181 case 0x2000: /* EN QUAD */
4182 case 0x2001: /* EM QUAD */
4183 case 0x2002: /* EN SPACE */
4184 case 0x2003: /* EM SPACE */
4185 case 0x2004: /* THREE-PER-EM SPACE */
4186 case 0x2005: /* FOUR-PER-EM SPACE */
4187 case 0x2006: /* SIX-PER-EM SPACE */
4188 case 0x2007: /* FIGURE SPACE */
4189 case 0x2008: /* PUNCTUATION SPACE */
4190 case 0x2009: /* THIN SPACE */
4191 case 0x200A: /* HAIR SPACE */
4192 case 0x202f: /* NARROW NO-BREAK SPACE */
4193 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4194 case 0x3000: /* IDEOGRAPHIC SPACE */
4195 break;
4196 }
4197 }
4198 break;
4199
4200 case OP_NOT_VSPACE:
4201 for (i = 1; i <= min; i++)
4202 {
4203 if (eptr >= md->end_subject)
4204 {
4205 SCHECK_PARTIAL();
4206 MRRETURN(MATCH_NOMATCH);
4207 }
4208 GETCHARINC(c, eptr);
4209 switch(c)
4210 {
4211 default: break;
4212 case 0x0a: /* LF */
4213 case 0x0b: /* VT */
4214 case 0x0c: /* FF */
4215 case 0x0d: /* CR */
4216 case 0x85: /* NEL */
4217 case 0x2028: /* LINE SEPARATOR */
4218 case 0x2029: /* PARAGRAPH SEPARATOR */
4219 MRRETURN(MATCH_NOMATCH);
4220 }
4221 }
4222 break;
4223
4224 case OP_VSPACE:
4225 for (i = 1; i <= min; i++)
4226 {
4227 if (eptr >= md->end_subject)
4228 {
4229 SCHECK_PARTIAL();
4230 MRRETURN(MATCH_NOMATCH);
4231 }
4232 GETCHARINC(c, eptr);
4233 switch(c)
4234 {
4235 default: MRRETURN(MATCH_NOMATCH);
4236 case 0x0a: /* LF */
4237 case 0x0b: /* VT */
4238 case 0x0c: /* FF */
4239 case 0x0d: /* CR */
4240 case 0x85: /* NEL */
4241 case 0x2028: /* LINE SEPARATOR */
4242 case 0x2029: /* PARAGRAPH SEPARATOR */
4243 break;
4244 }
4245 }
4246 break;
4247
4248 case OP_NOT_DIGIT:
4249 for (i = 1; i <= min; i++)
4250 {
4251 if (eptr >= md->end_subject)
4252 {
4253 SCHECK_PARTIAL();
4254 MRRETURN(MATCH_NOMATCH);
4255 }
4256 GETCHARINC(c, eptr);
4257 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4258 MRRETURN(MATCH_NOMATCH);
4259 }
4260 break;
4261
4262 case OP_DIGIT:
4263 for (i = 1; i <= min; i++)
4264 {
4265 if (eptr >= md->end_subject)
4266 {
4267 SCHECK_PARTIAL();
4268 MRRETURN(MATCH_NOMATCH);
4269 }
4270 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4271 MRRETURN(MATCH_NOMATCH);
4272 /* No need to skip more bytes - we know it's a 1-byte character */
4273 }
4274 break;
4275
4276 case OP_NOT_WHITESPACE:
4277 for (i = 1; i <= min; i++)
4278 {
4279 if (eptr >= md->end_subject)
4280 {
4281 SCHECK_PARTIAL();
4282 MRRETURN(MATCH_NOMATCH);
4283 }
4284 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4285 MRRETURN(MATCH_NOMATCH);
4286 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4287 }
4288 break;
4289
4290 case OP_WHITESPACE:
4291 for (i = 1; i <= min; i++)
4292 {
4293 if (eptr >= md->end_subject)
4294 {
4295 SCHECK_PARTIAL();
4296 MRRETURN(MATCH_NOMATCH);
4297 }
4298 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4299 MRRETURN(MATCH_NOMATCH);
4300 /* No need to skip more bytes - we know it's a 1-byte character */
4301 }
4302 break;
4303
4304 case OP_NOT_WORDCHAR:
4305 for (i = 1; i <= min; i++)
4306 {
4307 if (eptr >= md->end_subject)
4308 {
4309 SCHECK_PARTIAL();
4310 MRRETURN(MATCH_NOMATCH);
4311 }
4312 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4313 MRRETURN(MATCH_NOMATCH);
4314 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4315 }
4316 break;
4317
4318 case OP_WORDCHAR:
4319 for (i = 1; i <= min; i++)
4320 {
4321 if (eptr >= md->end_subject)
4322 {
4323 SCHECK_PARTIAL();
4324 MRRETURN(MATCH_NOMATCH);
4325 }
4326 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4327 MRRETURN(MATCH_NOMATCH);
4328 /* No need to skip more bytes - we know it's a 1-byte character */
4329 }
4330 break;
4331
4332 default:
4333 RRETURN(PCRE_ERROR_INTERNAL);
4334 } /* End switch(ctype) */
4335
4336 else
4337 #endif /* SUPPORT_UTF8 */
4338
4339 /* Code for the non-UTF-8 case for minimum matching of operators other
4340 than OP_PROP and OP_NOTPROP. */
4341
4342 switch(ctype)
4343 {
4344 case OP_ANY:
4345 for (i = 1; i <= min; i++)
4346 {
4347 if (eptr >= md->end_subject)
4348 {
4349 SCHECK_PARTIAL();
4350 MRRETURN(MATCH_NOMATCH);
4351 }
4352 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4353 eptr++;
4354 }
4355 break;
4356
4357 case OP_ALLANY:
4358 if (eptr > md->end_subject - min)
4359 {
4360 SCHECK_PARTIAL();
4361 MRRETURN(MATCH_NOMATCH);
4362 }
4363 eptr += min;
4364 break;
4365
4366 case OP_ANYBYTE:
4367 if (eptr > md->end_subject - min)
4368 {
4369 SCHECK_PARTIAL();
4370 MRRETURN(MATCH_NOMATCH);
4371 }
4372 eptr += min;
4373 break;
4374
4375 case OP_ANYNL:
4376 for (i = 1; i <= min; i++)
4377 {
4378 if (eptr >= md->end_subject)
4379 {
4380 SCHECK_PARTIAL();
4381 MRRETURN(MATCH_NOMATCH);
4382 }
4383 switch(*eptr++)
4384 {
4385 default: MRRETURN(MATCH_NOMATCH);
4386
4387 case 0x000d:
4388 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4389 break;
4390
4391 case 0x000a:
4392 break;
4393
4394 case 0x000b:
4395 case 0x000c:
4396 case 0x0085:
4397 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4398 break;
4399 }
4400 }
4401 break;
4402
4403 case OP_NOT_HSPACE:
4404 for (i = 1; i <= min; i++)
4405 {
4406 if (eptr >= md->end_subject)
4407 {
4408 SCHECK_PARTIAL();
4409 MRRETURN(MATCH_NOMATCH);
4410 }
4411 switch(*eptr++)
4412 {
4413 default: break;
4414 case 0x09: /* HT */
4415 case 0x20: /* SPACE */
4416 case 0xa0: /* NBSP */
4417 MRRETURN(MATCH_NOMATCH);
4418 }
4419 }
4420 break;
4421
4422 case OP_HSPACE:
4423 for (i = 1; i <= min; i++)
4424 {
4425 if (eptr >= md->end_subject)
4426 {
4427 SCHECK_PARTIAL();
4428 MRRETURN(MATCH_NOMATCH);
4429 }
4430 switch(*eptr++)
4431 {
4432 default: MRRETURN(MATCH_NOMATCH);
4433 case 0x09: /* HT */
4434 case 0x20: /* SPACE */
4435 case 0xa0: /* NBSP */
4436 break;
4437 }
4438 }
4439 break;
4440
4441 case OP_NOT_VSPACE:
4442 for (i = 1; i <= min; i++)
4443 {
4444 if (eptr >= md->end_subject)
4445 {
4446 SCHECK_PARTIAL();
4447 MRRETURN(MATCH_NOMATCH);
4448 }
4449 switch(*eptr++)
4450 {
4451 default: break;
4452 case 0x0a: /* LF */
4453 case 0x0b: /* VT */
4454 case 0x0c: /* FF */
4455 case 0x0d: /* CR */
4456 case 0x85: /* NEL */
4457 MRRETURN(MATCH_NOMATCH);
4458 }
4459 }
4460 break;
4461
4462 case OP_VSPACE:
4463 for (i = 1; i <= min; i++)
4464 {
4465 if (eptr >= md->end_subject)
4466 {
4467 SCHECK_PARTIAL();
4468 MRRETURN(MATCH_NOMATCH);
4469 }
4470 switch(*eptr++)
4471 {
4472 default: MRRETURN(MATCH_NOMATCH);
4473 case 0x0a: /* LF */
4474 case 0x0b: /* VT */
4475 case 0x0c: /* FF */
4476 case 0x0d: /* CR */
4477 case 0x85: /* NEL */
4478 break;
4479 }
4480 }
4481 break;
4482
4483 case OP_NOT_DIGIT:
4484 for (i = 1; i <= min; i++)
4485 {
4486 if (eptr >= md->end_subject)
4487 {
4488 SCHECK_PARTIAL();
4489 MRRETURN(MATCH_NOMATCH);
4490 }
4491 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4492 }
4493 break;
4494
4495 case OP_DIGIT:
4496 for (i = 1; i <= min; i++)
4497 {
4498 if (eptr >= md->end_subject)
4499 {
4500 SCHECK_PARTIAL();
4501 MRRETURN(MATCH_NOMATCH);
4502 }
4503 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4504 }
4505 break;
4506
4507 case OP_NOT_WHITESPACE:
4508 for (i = 1; i <= min; i++)
4509 {
4510 if (eptr >= md->end_subject)
4511 {
4512 SCHECK_PARTIAL();
4513 MRRETURN(MATCH_NOMATCH);
4514 }
4515 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4516 }
4517 break;
4518
4519 case OP_WHITESPACE:
4520 for (i = 1; i <= min; i++)
4521 {
4522 if (eptr >= md->end_subject)
4523 {
4524 SCHECK_PARTIAL();
4525 MRRETURN(MATCH_NOMATCH);
4526 }
4527 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4528 }
4529 break;
4530
4531 case OP_NOT_WORDCHAR:
4532 for (i = 1; i <= min; i++)
4533 {
4534 if (eptr >= md->end_subject)
4535 {
4536 SCHECK_PARTIAL();
4537 MRRETURN(MATCH_NOMATCH);
4538 }
4539 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4540 MRRETURN(MATCH_NOMATCH);
4541 }
4542 break;
4543
4544 case OP_WORDCHAR:
4545 for (i = 1; i <= min; i++)
4546 {
4547 if (eptr >= md->end_subject)
4548 {
4549 SCHECK_PARTIAL();
4550 MRRETURN(MATCH_NOMATCH);
4551 }
4552 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4553 MRRETURN(MATCH_NOMATCH);
4554 }
4555 break;
4556
4557 default:
4558 RRETURN(PCRE_ERROR_INTERNAL);
4559 }
4560 }
4561
4562 /* If min = max, continue at the same level without recursing */
4563
4564 if (min == max) continue;
4565
4566 /* If minimizing, we have to test the rest of the pattern before each
4567 subsequent match. Again, separate the UTF-8 case for speed, and also
4568 separate the UCP cases. */
4569
4570 if (minimize)
4571 {
4572 #ifdef SUPPORT_UCP
4573 if (prop_type >= 0)
4574 {
4575 switch(prop_type)
4576 {
4577 case PT_ANY:
4578 for (fi = min;; fi++)
4579 {
4580 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4581 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4582 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4583 if (eptr >= md->end_subject)
4584 {
4585 SCHECK_PARTIAL();
4586 MRRETURN(MATCH_NOMATCH);
4587 }
4588 GETCHARINCTEST(c, eptr);
4589 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4590 }
4591 /* Control never gets here */
4592
4593 case PT_LAMP:
4594 for (fi = min;; fi++)
4595 {
4596 int chartype;
4597 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4598 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4599 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4600 if (eptr >= md->end_subject)
4601 {
4602 SCHECK_PARTIAL();
4603 MRRETURN(MATCH_NOMATCH);
4604 }
4605 GETCHARINCTEST(c, eptr);
4606 chartype = UCD_CHARTYPE(c);
4607 if ((chartype == ucp_Lu ||
4608 chartype == ucp_Ll ||
4609 chartype == ucp_Lt) == prop_fail_result)
4610 MRRETURN(MATCH_NOMATCH);
4611 }
4612 /* Control never gets here */
4613
4614 case PT_GC:
4615 for (fi = min;; fi++)
4616 {
4617 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4619 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4620 if (eptr >= md->end_subject)
4621 {
4622 SCHECK_PARTIAL();
4623 MRRETURN(MATCH_NOMATCH);
4624 }
4625 GETCHARINCTEST(c, eptr);
4626 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4627 MRRETURN(MATCH_NOMATCH);
4628 }
4629 /* Control never gets here */
4630
4631 case PT_PC:
4632 for (fi = min;; fi++)
4633 {
4634 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4636 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4637 if (eptr >= md->end_subject)
4638 {
4639 SCHECK_PARTIAL();
4640 MRRETURN(MATCH_NOMATCH);
4641 }
4642 GETCHARINCTEST(c, eptr);
4643 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4644 MRRETURN(MATCH_NOMATCH);
4645 }
4646 /* Control never gets here */
4647
4648 case PT_SC:
4649 for (fi = min;; fi++)
4650 {
4651 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4652 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4653 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4654 if (eptr >= md->end_subject)
4655 {
4656 SCHECK_PARTIAL();
4657 MRRETURN(MATCH_NOMATCH);
4658 }
4659 GETCHARINCTEST(c, eptr);
4660 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4661 MRRETURN(MATCH_NOMATCH);
4662 }
4663 /* Control never gets here */
4664
4665 case PT_ALNUM:
4666 for (fi = min;; fi++)
4667 {
4668 int category;
4669 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4670 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4671 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4672 if (eptr >= md->end_subject)
4673 {
4674 SCHECK_PARTIAL();
4675 MRRETURN(MATCH_NOMATCH);
4676 }
4677 GETCHARINCTEST(c, eptr);
4678 category = UCD_CATEGORY(c);
4679 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4680 MRRETURN(MATCH_NOMATCH);
4681 }
4682 /* Control never gets here */
4683
4684 case PT_SPACE: /* Perl space */
4685 for (fi = min;; fi++)
4686 {
4687 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4689 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4690 if (eptr >= md->end_subject)
4691 {
4692 SCHECK_PARTIAL();
4693 MRRETURN(MATCH_NOMATCH);
4694 }
4695 GETCHARINCTEST(c, eptr);
4696 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4697 c == CHAR_FF || c == CHAR_CR)
4698 == prop_fail_result)
4699 MRRETURN(MATCH_NOMATCH);
4700 }
4701 /* Control never gets here */
4702
4703 case PT_PXSPACE: /* POSIX space */
4704 for (fi = min;; fi++)
4705 {
4706 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4708 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4709 if (eptr >= md->end_subject)
4710 {
4711 SCHECK_PARTIAL();
4712 MRRETURN(MATCH_NOMATCH);
4713 }
4714 GETCHARINCTEST(c, eptr);
4715 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4716 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4717 == prop_fail_result)
4718 MRRETURN(MATCH_NOMATCH);
4719 }
4720 /* Control never gets here */
4721
4722 case PT_WORD:
4723 for (fi = min;; fi++)
4724 {
4725 int category;
4726 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4727 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4728 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4729 if (eptr >= md->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 MRRETURN(MATCH_NOMATCH);
4733 }
4734 GETCHARINCTEST(c, eptr);
4735 category = UCD_CATEGORY(c);
4736 if ((category == ucp_L ||
4737 category == ucp_N ||
4738 c == CHAR_UNDERSCORE)
4739 == prop_fail_result)
4740 MRRETURN(MATCH_NOMATCH);
4741 }
4742 /* Control never gets here */
4743
4744 /* This should never occur */
4745
4746 default:
4747 RRETURN(PCRE_ERROR_INTERNAL);
4748 }
4749 }
4750
4751 /* Match extended Unicode sequences. We will get here only if the
4752 support is in the binary; otherwise a compile-time error occurs. */
4753
4754 else if (ctype == OP_EXTUNI)
4755 {
4756 for (fi = min;; fi++)
4757 {
4758 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4760 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4761 if (eptr >= md->end_subject)
4762 {
4763 SCHECK_PARTIAL();
4764 MRRETURN(MATCH_NOMATCH);
4765 }
4766 GETCHARINCTEST(c, eptr);
4767 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4768 while (eptr < md->end_subject)
4769 {
4770 int len = 1;
4771 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4772 if (UCD_CATEGORY(c) != ucp_M) break;
4773 eptr += len;
4774 }
4775 }
4776 }
4777 else
4778 #endif /* SUPPORT_UCP */
4779
4780 #ifdef SUPPORT_UTF8
4781 /* UTF-8 mode */
4782 if (utf8)
4783 {
4784 for (fi = min;; fi++)
4785 {
4786 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4788 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4789 if (eptr >= md->end_subject)
4790 {
4791 SCHECK_PARTIAL();
4792 MRRETURN(MATCH_NOMATCH);
4793 }
4794 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4795 MRRETURN(MATCH_NOMATCH);
4796 GETCHARINC(c, eptr);
4797 switch(ctype)
4798 {
4799 case OP_ANY: /* This is the non-NL case */
4800 case OP_ALLANY:
4801 case OP_ANYBYTE:
4802 break;
4803
4804 case OP_ANYNL:
4805 switch(c)
4806 {
4807 default: MRRETURN(MATCH_NOMATCH);
4808 case 0x000d:
4809 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4810 break;
4811 case 0x000a:
4812 break;
4813
4814 case 0x000b:
4815 case 0x000c:
4816 case 0x0085:
4817 case 0x2028:
4818 case 0x2029:
4819 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4820 break;
4821 }
4822 break;
4823
4824 case OP_NOT_HSPACE:
4825 switch(c)
4826 {
4827 default: break;
4828 case 0x09: /* HT */
4829 case 0x20: /* SPACE */
4830 case 0xa0: /* NBSP */
4831 case 0x1680: /* OGHAM SPACE MARK */
4832 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4833 case 0x2000: /* EN QUAD */
4834 case 0x2001: /* EM QUAD */
4835 case 0x2002: /* EN SPACE */
4836 case 0x2003: /* EM SPACE */
4837 case 0x2004: /* THREE-PER-EM SPACE */
4838 case 0x2005: /* FOUR-PER-EM SPACE */
4839 case 0x2006: /* SIX-PER-EM SPACE */
4840 case 0x2007: /* FIGURE SPACE */
4841 case 0x2008: /* PUNCTUATION SPACE */
4842 case 0x2009: /* THIN SPACE */
4843 case 0x200A: /* HAIR SPACE */
4844 case 0x202f: /* NARROW NO-BREAK SPACE */
4845 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4846 case 0x3000: /* IDEOGRAPHIC SPACE */
4847 MRRETURN(MATCH_NOMATCH);
4848 }
4849 break;
4850
4851 case OP_HSPACE:
4852 switch(c)
4853 {
4854 default: MRRETURN(MATCH_NOMATCH);
4855 case 0x09: /* HT */
4856 case 0x20: /* SPACE */
4857 case 0xa0: /* NBSP */
4858 case 0x1680: /* OGHAM SPACE MARK */
4859 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4860 case 0x2000: /* EN QUAD */
4861 case 0x2001: /* EM QUAD */
4862 case 0x2002: /* EN SPACE */
4863 case 0x2003: /* EM SPACE */
4864 case 0x2004: /* THREE-PER-EM SPACE */
4865 case 0x2005: /* FOUR-PER-EM SPACE */
4866 case 0x2006: /* SIX-PER-EM SPACE */
4867 case 0x2007: /* FIGURE SPACE */
4868 case 0x2008: /* PUNCTUATION SPACE */
4869 case 0x2009: /* THIN SPACE */
4870 case 0x200A: /* HAIR SPACE */
4871 case 0x202f: /* NARROW NO-BREAK SPACE */
4872 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4873 case 0x3000: /* IDEOGRAPHIC SPACE */
4874 break;
4875 }
4876 break;
4877
4878 case OP_NOT_VSPACE:
4879 switch(c)
4880 {
4881 default: break;
4882 case 0x0a: /* LF */
4883 case 0x0b: /* VT */
4884 case 0x0c: /* FF */
4885 case 0x0d: /* CR */
4886 case 0x85: /* NEL */
4887 case 0x2028: /* LINE SEPARATOR */
4888 case 0x2029: /* PARAGRAPH SEPARATOR */
4889 MRRETURN(MATCH_NOMATCH);
4890 }
4891 break;
4892
4893 case OP_VSPACE:
4894 switch(c)
4895 {
4896 default: MRRETURN(MATCH_NOMATCH);
4897 case 0x0a: /* LF */
4898 case 0x0b: /* VT */
4899 case 0x0c: /* FF */
4900 case 0x0d: /* CR */
4901 case 0x85: /* NEL */
4902 case 0x2028: /* LINE SEPARATOR */
4903 case 0x2029: /* PARAGRAPH SEPARATOR */
4904 break;
4905 }
4906 break;
4907
4908 case OP_NOT_DIGIT:
4909 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4910 MRRETURN(MATCH_NOMATCH);
4911 break;
4912
4913 case OP_DIGIT:
4914 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4915 MRRETURN(MATCH_NOMATCH);
4916 break;
4917
4918 case OP_NOT_WHITESPACE:
4919 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4920 MRRETURN(MATCH_NOMATCH);
4921 break;
4922
4923 case OP_WHITESPACE:
4924 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4925 MRRETURN(MATCH_NOMATCH);
4926 break;
4927
4928 case OP_NOT_WORDCHAR:
4929 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4930 MRRETURN(MATCH_NOMATCH);
4931 break;
4932
4933 case OP_WORDCHAR:
4934 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4935 MRRETURN(MATCH_NOMATCH);
4936 break;
4937
4938 default:
4939 RRETURN(PCRE_ERROR_INTERNAL);
4940 }
4941 }
4942 }
4943 else
4944 #endif
4945 /* Not UTF-8 mode */
4946 {
4947 for (fi = min;; fi++)
4948 {
4949 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4951 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4952 if (eptr >= md->end_subject)
4953 {
4954 SCHECK_PARTIAL();
4955 MRRETURN(MATCH_NOMATCH);
4956 }
4957 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4958 MRRETURN(MATCH_NOMATCH);
4959 c = *eptr++;
4960 switch(ctype)
4961 {
4962 case OP_ANY: /* This is the non-NL case */
4963 case OP_ALLANY:
4964 case OP_ANYBYTE:
4965 break;
4966
4967 case OP_ANYNL:
4968 switch(c)
4969 {
4970 default: MRRETURN(MATCH_NOMATCH);
4971 case 0x000d:
4972 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4973 break;
4974
4975 case 0x000a:
4976 break;
4977
4978 case 0x000b:
4979 case 0x000c:
4980 case 0x0085:
4981 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4982 break;
4983 }
4984 break;
4985
4986 case OP_NOT_HSPACE:
4987 switch(c)
4988 {
4989 default: break;
4990 case 0x09: /* HT */
4991 case 0x20: /* SPACE */
4992 case 0xa0: /* NBSP */
4993 MRRETURN(MATCH_NOMATCH);
4994 }
4995 break;
4996
4997 case OP_HSPACE:
4998 switch(c)
4999 {
5000 default: MRRETURN(MATCH_NOMATCH);
5001 case 0x09: /* HT */
5002 case 0x20: /* SPACE */
5003 case 0xa0: /* NBSP */
5004 break;
5005 }
5006 break;
5007
5008 case OP_NOT_VSPACE:
5009 switch(c)
5010 {
5011 default: break;
5012 case 0x0a: /* LF */
5013 case 0x0b: /* VT */
5014 case 0x0c: /* FF */
5015 case 0x0d: /* CR */
5016 case 0x85: /* NEL */
5017 MRRETURN(MATCH_NOMATCH);
5018 }
5019 break;
5020
5021 case OP_VSPACE:
5022 switch(c)
5023 {
5024 default: MRRETURN(MATCH_NOMATCH);
5025 case 0x0a: /* LF */
5026 case 0x0b: /* VT */
5027 case 0x0c: /* FF */
5028 case 0x0d: /* CR */
5029 case 0x85: /* NEL */
5030 break;
5031 }
5032 break;
5033
5034 case OP_NOT_DIGIT:
5035 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
5036 break;
5037
5038 case OP_DIGIT:
5039 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
5040 break;
5041
5042 case OP_NOT_WHITESPACE:
5043 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
5044 break;
5045
5046 case OP_WHITESPACE:
5047 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
5048 break;
5049
5050 case OP_NOT_WORDCHAR:
5051 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
5052 break;
5053
5054 case OP_WORDCHAR:
5055 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
5056 break;
5057
5058 default:
5059 RRETURN(PCRE_ERROR_INTERNAL);
5060 }
5061 }
5062 }
5063 /* Control never gets here */
5064 }
5065
5066 /* If maximizing, it is worth using inline code for speed, doing the type
5067 test once at the start (i.e. keep it out of the loop). Again, keep the
5068 UTF-8 and UCP stuff separate. */
5069
5070 else
5071 {
5072 pp = eptr; /* Remember where we started */
5073
5074 #ifdef SUPPORT_UCP
5075 if (prop_type >= 0)
5076 {
5077 switch(prop_type)
5078 {
5079 case PT_ANY:
5080 for (i = min; i < max; i++)
5081 {
5082 int len = 1;
5083 if (eptr >= md->end_subject)
5084 {
5085 SCHECK_PARTIAL();
5086 break;
5087 }
5088 GETCHARLENTEST(c, eptr, len);
5089 if (prop_fail_result) break;
5090 eptr+= len;
5091 }
5092 break;
5093
5094 case PT_LAMP:
5095 for (i = min; i < max; i++)
5096 {
5097 int chartype;
5098 int len = 1;
5099 if (eptr >= md->end_subject)
5100 {
5101 SCHECK_PARTIAL();
5102 break;
5103 }
5104 GETCHARLENTEST(c, eptr, len);
5105 chartype = UCD_CHARTYPE(c);
5106 if ((chartype == ucp_Lu ||
5107 chartype == ucp_Ll ||
5108 chartype == ucp_Lt) == prop_fail_result)
5109 break;
5110 eptr+= len;
5111 }
5112 break;
5113
5114 case PT_GC:
5115 for (i = min; i < max; i++)
5116 {
5117 int len = 1;
5118 if (eptr >= md->end_subject)
5119 {
5120 SCHECK_PARTIAL();
5121 break;
5122 }
5123 GETCHARLENTEST(c, eptr, len);
5124 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5125 eptr+= len;
5126 }
5127 break;
5128
5129 case PT_PC:
5130 for (i = min; i < max; i++)
5131 {
5132 int len = 1;
5133 if (eptr >= md->end_subject)
5134 {
5135 SCHECK_PARTIAL();
5136 break;
5137 }
5138 GETCHARLENTEST(c, eptr, len);
5139 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5140 eptr+= len;
5141 }
5142 break;
5143
5144 case PT_SC:
5145 for (i = min; i < max; i++)
5146 {
5147 int len = 1;
5148 if (eptr >= md->end_subject)
5149 {
5150 SCHECK_PARTIAL();
5151 break;
5152 }
5153 GETCHARLENTEST(c, eptr, len);
5154 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5155 eptr+= len;
5156 }
5157 break;
5158
5159 case PT_ALNUM:
5160 for (i = min; i < max; i++)
5161 {
5162 int category;
5163 int len = 1;
5164 if (eptr >= md->end_subject)
5165 {
5166 SCHECK_PARTIAL();
5167 break;
5168 }
5169 GETCHARLENTEST(c, eptr, len);
5170 category = UCD_CATEGORY(c);
5171 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5172 break;
5173 eptr+= len;
5174 }
5175 break;
5176
5177 case PT_SPACE: /* Perl space */
5178 for (i = min; i < max; i++)
5179 {
5180 int len = 1;
5181 if (eptr >= md->end_subject)
5182 {
5183 SCHECK_PARTIAL();
5184 break;
5185 }
5186 GETCHARLENTEST(c, eptr, len);
5187 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5188 c == CHAR_FF || c == CHAR_CR)
5189 == prop_fail_result)
5190 break;
5191 eptr+= len;
5192 }
5193 break;
5194
5195 case PT_PXSPACE: /* POSIX space */
5196 for (i = min; i < max; i++)
5197 {
5198 int len = 1;
5199 if (eptr >= md->end_subject)
5200 {
5201 SCHECK_PARTIAL();
5202 break;
5203 }
5204 GETCHARLENTEST(c, eptr, len);
5205 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5206 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5207 == prop_fail_result)
5208 break;
5209 eptr+= len;
5210 }
5211 break;
5212
5213 case PT_WORD:
5214 for (i = min; i < max; i++)
5215 {
5216 int category;
5217 int len = 1;
5218 if (eptr >= md->end_subject)
5219 {
5220 SCHECK_PARTIAL();
5221 break;
5222 }
5223 GETCHARLENTEST(c, eptr, len);
5224 category = UCD_CATEGORY(c);
5225 if ((category == ucp_L || category == ucp_N ||
5226 c == CHAR_UNDERSCORE) == prop_fail_result)
5227 break;
5228 eptr+= len;
5229 }
5230 break;
5231
5232 default:
5233 RRETURN(PCRE_ERROR_INTERNAL);
5234 }
5235
5236 /* eptr is now past the end of the maximum run */
5237
5238 if (possessive) continue;
5239 for(;;)
5240 {
5241 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5242 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5243 if (eptr-- == pp) break; /* Stop if tried at original pos */
5244 if (utf8) BACKCHAR(eptr);
5245 }
5246 }
5247
5248 /* Match extended Unicode sequences. We will get here only if the
5249 support is in the binary; otherwise a compile-time error occurs. */
5250
5251 else if (ctype == OP_EXTUNI)
5252 {
5253 for (i = min; i < max; i++)
5254 {
5255 int len = 1;
5256 if (eptr >= md->end_subject)
5257 {
5258 SCHECK_PARTIAL();
5259 break;
5260 }
5261 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5262 if (UCD_CATEGORY(c) == ucp_M) break;
5263 eptr += len;
5264 while (eptr < md->end_subject)
5265 {
5266 len = 1;
5267 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5268 if (UCD_CATEGORY(c) != ucp_M) break;
5269 eptr += len;
5270 }
5271 }
5272
5273 /* eptr is now past the end of the maximum run */
5274
5275 if (possessive) continue;
5276
5277 for(;;)
5278 {
5279 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5281 if (eptr-- == pp) break; /* Stop if tried at original pos */
5282 for (;;) /* Move back over one extended */
5283 {
5284 if (!utf8) c = *eptr; else
5285 {
5286 BACKCHAR(eptr);
5287 GETCHAR(c, eptr);
5288 }
5289 if (UCD_CATEGORY(c) != ucp_M) break;
5290 eptr--;
5291 }
5292 }
5293 }
5294
5295 else
5296 #endif /* SUPPORT_UCP */
5297
5298 #ifdef SUPPORT_UTF8
5299 /* UTF-8 mode */
5300
5301 if (utf8)
5302 {
5303 switch(ctype)
5304 {
5305 case OP_ANY:
5306 if (max < INT_MAX)
5307 {
5308 for (i = min; i < max; i++)
5309 {
5310 if (eptr >= md->end_subject)
5311 {
5312 SCHECK_PARTIAL();
5313 break;
5314 }
5315 if (IS_NEWLINE(eptr)) break;
5316 eptr++;
5317 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5318 }
5319 }
5320
5321 /* Handle unlimited UTF-8 repeat */
5322
5323 else
5324 {
5325 for (i = min; i < max; i++)
5326 {
5327 if (eptr >= md->end_subject)
5328 {
5329 SCHECK_PARTIAL();
5330 break;
5331 }
5332 if (IS_NEWLINE(eptr)) break;
5333 eptr++;
5334 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5335 }
5336 }
5337 break;
5338
5339 case OP_ALLANY:
5340 if (max < INT_MAX)
5341 {
5342 for (i = min; i < max; i++)
5343 {
5344 if (eptr >= md->end_subject)
5345 {
5346 SCHECK_PARTIAL();
5347 break;
5348 }
5349 eptr++;
5350 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5351 }
5352 }
5353 else
5354 {
5355 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5356 SCHECK_PARTIAL();
5357 }
5358 break;
5359
5360 /* The byte case is the same as non-UTF8 */
5361
5362 case OP_ANYBYTE:
5363 c = max - min;
5364 if (c > (unsigned int)(md->end_subject - eptr))
5365 {
5366 eptr = md->end_subject;
5367 SCHECK_PARTIAL();
5368 }
5369 else eptr += c;
5370 break;
5371
5372 case OP_ANYNL:
5373 for (i = min; i < max; i++)
5374 {
5375 int len = 1;
5376 if (eptr >= md->end_subject)
5377 {
5378 SCHECK_PARTIAL();
5379 break;
5380 }
5381 GETCHARLEN(c, eptr, len);
5382 if (c == 0x000d)
5383 {
5384 if (++eptr >= md->end_subject) break;
5385 if (*eptr == 0x000a) eptr++;
5386 }
5387 else
5388 {
5389 if (c != 0x000a &&
5390 (md->bsr_anycrlf ||
5391 (c != 0x000b && c != 0x000c &&
5392 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5393 break;
5394 eptr += len;
5395 }
5396 }
5397 break;
5398
5399 case OP_NOT_HSPACE:
5400 case OP_HSPACE:
5401 for (i = min; i < max; i++)
5402 {
5403 BOOL gotspace;
5404 int len = 1;
5405 if (eptr >= md->end_subject)
5406 {
5407 SCHECK_PARTIAL();
5408 break;
5409 }
5410 GETCHARLEN(c, eptr, len);
5411 switch(c)
5412 {
5413 default: gotspace = FALSE; break;
5414 case 0x09: /* HT */
5415 case 0x20: /* SPACE */
5416 case 0xa0: /* NBSP */
5417 case 0x1680: /* OGHAM SPACE MARK */
5418 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5419 case 0x2000: /* EN QUAD */
5420 case 0x2001: /* EM QUAD */
5421 case 0x2002: /* EN SPACE */
5422 case 0x2003: /* EM SPACE */
5423 case 0x2004: /* THREE-PER-EM SPACE */
5424 case 0x2005: /* FOUR-PER-EM SPACE */
5425 case 0x2006: /* SIX-PER-EM SPACE */
5426 case 0x2007: /* FIGURE SPACE */
5427 case 0x2008: /* PUNCTUATION SPACE */
5428 case 0x2009: /* THIN SPACE */
5429 case 0x200A: /* HAIR SPACE */
5430 case 0x202f: /* NARROW NO-BREAK SPACE */
5431 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5432 case 0x3000: /* IDEOGRAPHIC SPACE */
5433 gotspace = TRUE;
5434 break;
5435 }
5436 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5437 eptr += len;
5438 }
5439 break;
5440
5441 case OP_NOT_VSPACE:
5442 case OP_VSPACE:
5443 for (i = min; i < max; i++)
5444 {
5445 BOOL gotspace;
5446 int len = 1;
5447 if (eptr >= md->end_subject)
5448 {
5449 SCHECK_PARTIAL();
5450 break;
5451 }
5452 GETCHARLEN(c, eptr, len);
5453 switch(c)
5454 {
5455 default: gotspace = FALSE; break;
5456 case 0x0a: /* LF */
5457 case 0x0b: /* VT */
5458 case 0x0c: /* FF */
5459 case 0x0d: /* CR */
5460 case 0x85: /* NEL */
5461 case 0x2028: /* LINE SEPARATOR */
5462 case 0x2029: /* PARAGRAPH SEPARATOR */
5463 gotspace = TRUE;
5464 break;
5465 }
5466 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5467 eptr += len;
5468 }
5469 break;
5470
5471 case OP_NOT_DIGIT:
5472 for (i = min; i < max; i++)
5473 {
5474 int len = 1;
5475 if (eptr >= md->end_subject)
5476 {
5477 SCHECK_PARTIAL();
5478 break;
5479 }
5480 GETCHARLEN(c, eptr, len);
5481 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5482 eptr+= len;
5483 }
5484 break;
5485
5486 case OP_DIGIT:
5487 for (i = min; i < max; i++)
5488 {
5489 int len = 1;
5490 if (eptr >= md->end_subject)
5491 {
5492 SCHECK_PARTIAL();
5493 break;
5494 }
5495 GETCHARLEN(c, eptr, len);
5496 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5497 eptr+= len;
5498 }
5499 break;
5500
5501 case OP_NOT_WHITESPACE:
5502 for (i = min; i < max; i++)
5503 {
5504 int len = 1;
5505 if (eptr >= md->end_subject)
5506 {
5507 SCHECK_PARTIAL();
5508 break;
5509 }
5510 GETCHARLEN(c, eptr, len);
5511 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5512 eptr+= len;
5513 }
5514 break;
5515
5516 case OP_WHITESPACE:
5517 for (i = min; i < max; i++)
5518 {
5519 int len = 1;
5520 if (eptr >= md->end_subject)
5521 {
5522 SCHECK_PARTIAL();
5523 break;
5524 }
5525 GETCHARLEN(c, eptr, len);
5526 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5527 eptr+= len;
5528 }
5529 break;
5530
5531 case OP_NOT_WORDCHAR:
5532 for (i = min; i < max; i++)
5533 {
5534 int len = 1;
5535 if (eptr >= md->end_subject)
5536 {
5537 SCHECK_PARTIAL();
5538 break;
5539 }
5540 GETCHARLEN(c, eptr, len);
5541 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5542 eptr+= len;
5543 }
5544 break;
5545
5546 case OP_WORDCHAR:
5547 for (i = min; i < max; i++)
5548 {
5549 int len = 1;
5550 if (eptr >= md->end_subject)
5551 {
5552 SCHECK_PARTIAL();
5553 break;
5554 }
5555 GETCHARLEN(c, eptr, len);
5556 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5557 eptr+= len;
5558 }
5559 break;
5560
5561 default:
5562 RRETURN(PCRE_ERROR_INTERNAL);
5563 }
5564
5565 /* eptr is now past the end of the maximum run. If possessive, we are
5566 done (no backing up). Otherwise, match at this position; anything other
5567 than no match is immediately returned. For nomatch, back up one
5568 character, unless we are matching \R and the last thing matched was
5569 \r\n, in which case, back up two bytes. */
5570
5571 if (possessive) continue;
5572 for(;;)
5573 {
5574 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5575 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5576 if (eptr-- == pp) break; /* Stop if tried at original pos */
5577 BACKCHAR(eptr);
5578 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5579 eptr[-1] == '\r') eptr--;
5580 }
5581 }
5582 else
5583 #endif /* SUPPORT_UTF8 */
5584
5585 /* Not UTF-8 mode */
5586 {
5587 switch(ctype)
5588 {
5589 case OP_ANY:
5590 for (i = min; i < max; i++)
5591 {
5592 if (eptr >= md->end_subject)
5593 {
5594 SCHECK_PARTIAL();
5595 break;
5596 }
5597 if (IS_NEWLINE(eptr)) break;
5598 eptr++;
5599 }
5600 break;
5601
5602 case OP_ALLANY:
5603 case OP_ANYBYTE:
5604 c = max - min;
5605 if (c > (unsigned int)(md->end_subject - eptr))
5606 {
5607 eptr = md->end_subject;
5608 SCHECK_PARTIAL();
5609 }
5610 else eptr += c;
5611 break;
5612
5613 case OP_ANYNL:
5614 for (i = min; i < max; i++)
5615 {
5616 if (eptr >= md->end_subject)
5617 {
5618 SCHECK_PARTIAL();
5619 break;
5620 }
5621 c = *eptr;
5622 if (c == 0x000d)
5623 {
5624 if (++eptr >= md->end_subject) break;
5625 if (*eptr == 0x000a) eptr++;
5626 }
5627 else
5628 {
5629 if (c != 0x000a &&
5630 (md->bsr_anycrlf ||
5631 (c != 0x000b && c != 0x000c && c != 0x0085)))
5632 break;
5633 eptr++;
5634 }
5635 }
5636 break;
5637
5638 case OP_NOT_HSPACE:
5639 for (i = min; i < max; i++)
5640 {
5641 if (eptr >= md->end_subject)
5642 {
5643 SCHECK_PARTIAL();
5644 break;
5645 }
5646 c = *eptr;
5647 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5648 eptr++;
5649 }
5650 break;
5651
5652 case OP_HSPACE:
5653 for (i = min; i < max; i++)
5654 {
5655 if (eptr >= md->end_subject)
5656 {
5657 SCHECK_PARTIAL();
5658 break;
5659 }
5660 c = *eptr;
5661 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5662 eptr++;
5663 }
5664 break;
5665
5666 case OP_NOT_VSPACE:
5667 for (i = min; i < max; i++)
5668 {
5669 if (eptr >= md->end_subject)
5670 {
5671 SCHECK_PARTIAL();
5672 break;
5673 }
5674 c = *eptr;
5675 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5676 break;
5677 eptr++;
5678 }
5679 break;
5680
5681 case OP_VSPACE:
5682 for (i = min; i < max; i++)
5683 {
5684 if (eptr >= md->end_subject)
5685 {
5686 SCHECK_PARTIAL();
5687 break;
5688 }
5689 c = *eptr;
5690 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5691 break;
5692 eptr++;
5693 }
5694 break;
5695
5696 case OP_NOT_DIGIT:
5697 for (i = min; i < max; i++)
5698 {
5699 if (eptr >= md->end_subject)
5700 {
5701 SCHECK_PARTIAL();
5702 break;
5703 }
5704 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5705 eptr++;
5706 }
5707 break;
5708
5709 case OP_DIGIT:
5710 for (i = min; i < max; i++)
5711 {
5712 if (eptr >= md->end_subject)
5713 {
5714 SCHECK_PARTIAL();
5715 break;
5716 }
5717 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5718 eptr++;
5719 }
5720 break;
5721
5722 case OP_NOT_WHITESPACE:
5723 for (i = min; i < max; i++)
5724 {
5725 if (eptr >= md->end_subject)
5726 {
5727 SCHECK_PARTIAL();
5728 break;
5729 }
5730 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5731 eptr++;
5732 }
5733 break;
5734
5735 case OP_WHITESPACE:
5736 for (i = min; i < max; i++)
5737 {
5738 if (eptr >= md->end_subject)
5739 {
5740 SCHECK_PARTIAL();
5741 break;
5742 }
5743 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5744 eptr++;
5745 }
5746 break;
5747
5748 case OP_NOT_WORDCHAR:
5749 for (i = min; i < max; i++)
5750 {
5751 if (eptr >= md->end_subject)
5752 {
5753 SCHECK_PARTIAL();
5754 break;
5755 }
5756 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5757 eptr++;
5758 }
5759 break;
5760
5761 case OP_WORDCHAR:
5762 for (i = min; i < max; i++)
5763 {
5764 if (eptr >= md->end_subject)
5765 {
5766 SCHECK_PARTIAL();
5767 break;
5768 }
5769 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5770 eptr++;
5771 }
5772 break;
5773
5774 default:
5775 RRETURN(PCRE_ERROR_INTERNAL);
5776 }
5777
5778 /* eptr is now past the end of the maximum run. If possessive, we are
5779 done (no backing up). Otherwise, match at this position; anything other
5780 than no match is immediately returned. For nomatch, back up one
5781 character (byte), unless we are matching \R and the last thing matched
5782 was \r\n, in which case, back up two bytes. */
5783
5784 if (possessive) continue;
5785 while (eptr >= pp)
5786 {
5787 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5788 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5789 eptr--;
5790 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5791 eptr[-1] == '\r') eptr--;
5792 }
5793 }
5794
5795 /* Get here if we can't make it match with any permitted repetitions */
5796
5797 MRRETURN(MATCH_NOMATCH);
5798 }
5799 /* Control never gets here */
5800
5801 /* There's been some horrible disaster. Arrival here can only mean there is
5802 something seriously wrong in the code above or the OP_xxx definitions. */
5803
5804 default:
5805 DPRINTF(("Unknown opcode %d\n", *ecode));
5806 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5807 }
5808
5809 /* Do not stick any code in here without much thought; it is assumed
5810 that "continue" in the code above comes out to here to repeat the main
5811 loop. */
5812
5813 } /* End of main loop */
5814 /* Control never reaches here */
5815
5816
5817 /* When compiling to use the heap rather than the stack for recursive calls to
5818 match(), the RRETURN() macro jumps here. The number that is saved in
5819 frame->Xwhere indicates which label we actually want to return to. */
5820
5821 #ifdef NO_RECURSE
5822 #define LBL(val) case val: goto L_RM##val;
5823 HEAP_RETURN:
5824 switch (frame->Xwhere)
5825 {
5826 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5827 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5828 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5829 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5830 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5831 LBL(65) LBL(66)
5832 #ifdef SUPPORT_UTF8
5833 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5834 LBL(32) LBL(34) LBL(42) LBL(46)
5835 #ifdef SUPPORT_UCP
5836 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5837 LBL(59) LBL(60) LBL(61) LBL(62)
5838 #endif /* SUPPORT_UCP */
5839 #endif /* SUPPORT_UTF8 */
5840 default:
5841 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5842 return PCRE_ERROR_INTERNAL;
5843 }
5844 #undef LBL
5845 #endif /* NO_RECURSE */
5846 }
5847
5848
5849 /***************************************************************************
5850 ****************************************************************************
5851 RECURSION IN THE match() FUNCTION
5852
5853 Undefine all the macros that were defined above to handle this. */
5854
/* Remove the helper macros that were set up for match(). The order in which
names are undefined has no effect, so they are simply listed alphabetically.
The first group exists only in a NO_RECURSE build. */

#if defined(NO_RECURSE)

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* fc and fi are macros whether or not NO_RECURSE is defined, so they are
always removed. */

#undef fi
#undef fc
5898
5899 /***************************************************************************
5900 ***************************************************************************/
5901
5902
5903
5904 /*************************************************
5905 * Execute a Regular Expression *
5906 *************************************************/
5907
5908 /* This function applies a compiled re to a subject string and picks out
5909 portions of the string if it matches. Two elements in the vector are set for
5910 each substring: the offsets to the start and end of the substring.
5911
5912 Arguments:
5913 argument_re points to the compiled expression
5914 extra_data points to extra data or is NULL
5915 subject points to the subject string
5916 length length of subject string (may contain binary zeros)
5917 start_offset where to start in the subject string
5918 options option bits
5919 offsets points to a vector of ints to be filled in with offsets
5920 offsetcount the number of elements in the vector
5921
5922 Returns: > 0 => success; value is the number of elements filled in
5923 = 0 => success, but offsets is not big enough
5924 -1 => failed to match
5925 < -1 => some kind of unexpected problem
5926 */
5927
5928 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5929 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5930 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5931 int offsetcount)
5932 {
5933 int rc, ocount, arg_offset_max;
5934 int first_byte = -1;
5935 int req_byte = -1;
5936 int req_byte2 = -1;
5937 int newline;
5938 BOOL using_temporary_offsets = FALSE;
5939 BOOL anchored;
5940 BOOL startline;
5941 BOOL firstline;
5942 BOOL first_byte_caseless = FALSE;
5943 BOOL req_byte_caseless = FALSE;
5944 BOOL utf8;
5945 match_data match_block;
5946 match_data *md = &match_block;
5947 const uschar *tables;
5948 const uschar *start_bits = NULL;
5949 USPTR start_match = (USPTR)subject + start_offset;
5950 USPTR end_subject;
5951 USPTR start_partial = NULL;
5952 USPTR req_byte_ptr = start_match - 1;
5953
5954 pcre_study_data internal_study;
5955 const pcre_study_data *study;
5956
5957 real_pcre internal_re;
5958 const real_pcre *external_re = (const real_pcre *)argument_re;
5959 const real_pcre *re = external_re;
5960
5961 /* Plausibility checks */
5962
5963 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5964 if (re == NULL || subject == NULL ||
5965 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5966 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5967 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5968
5969 /* These two settings are used in the code for checking a UTF-8 string that
5970 follows immediately afterwards. Other values in the md block are used only
5971 during "normal" pcre_exec() processing, not when the JIT support is in use,
5972 so they are set up later. */
5973
5974 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5975 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5976 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5977
5978 /* Check a UTF-8 string if required. Pass back the character offset and error
5979 code for an invalid string if a results vector is available. */
5980
5981 #ifdef SUPPORT_UTF8
5982 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5983 {
5984 int erroroffset;
5985 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5986 if (errorcode != 0)
5987 {
5988 if (offsetcount >= 2)
5989 {
5990 offsets[0] = erroroffset;
5991 offsets[1] = errorcode;
5992 }
5993 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5994 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5995 }
5996
5997 /* Check that a start_offset points to the start of a UTF-8 character. */
5998 if (start_offset > 0 && start_offset < length &&
5999 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6000 return PCRE_ERROR_BADUTF8_OFFSET;
6001 }
6002 #endif
6003
6004 /* If the pattern was successfully studied with JIT support, run the JIT
6005 executable instead of the rest of this function. Most options must be set at
6006 compile time for the JIT code to be usable. Fallback to the normal code path if
6007 an unsupported flag is set. In particular, JIT does not support partial
6008 matching. */
6009
6010 #ifdef SUPPORT_JIT
6011 if (extra_data != NULL
6012 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6013 && extra_data->executable_jit != NULL
6014 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6015 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6016 return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
6017 start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6018 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6019 #endif
6020
6021 /* Carry on with non-JIT matching. This information is for finding all the
6022 numbers associated with a given name, for condition testing. */
6023
6024 md->name_table = (uschar *)re + re->name_table_offset;
6025 md->name_count = re->name_count;
6026 md->name_entry_size = re->name_entry_size;
6027
6028 /* Fish out the optional data from the extra_data structure, first setting
6029 the default values. */
6030
6031 study = NULL;
6032 md->match_limit = MATCH_LIMIT;
6033 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6034 md->callout_data = NULL;
6035
6036 /* The table pointer is always in native byte order. */
6037
6038 tables = external_re->tables;
6039
6040 if (extra_data != NULL)
6041 {
6042 register unsigned int flags = extra_data->flags;
6043 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6044 study = (const pcre_study_data *)extra_data->study_data;
6045 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6046 md->match_limit = extra_data->match_limit;
6047 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6048 md->match_limit_recursion = extra_data->match_limit_recursion;
6049 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6050 md->callout_data = extra_data->callout_data;
6051 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6052 }
6053
6054 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6055 is a feature that makes it possible to save compiled regex and re-use them
6056 in other programs later. */
6057
6058 if (tables == NULL) tables = _pcre_default_tables;
6059
6060 /* Check that the first field in the block is the magic number. If it is not,
6061 test for a regex that was compiled on a host of opposite endianness. If this is
6062 the case, flipped values are put in internal_re and internal_study if there was
6063 study data too. */
6064
6065 if (re->magic_number != MAGIC_NUMBER)
6066 {
6067 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
6068 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6069 if (study != NULL) study = &internal_study;
6070 }
6071
6072 /* Set up other data */
6073
6074 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6075 startline = (re->flags & PCRE_STARTLINE) != 0;
6076 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6077
6078 /* The code starts after the real_pcre block and the capture name table. */
6079
6080 md->start_code = (const uschar *)external_re + re->name_table_offset +
6081 re->name_count * re->name_entry_size;
6082
6083 md->start_subject = (USPTR)subject;
6084 md->start_offset = start_offset;
6085 md->end_subject = md->start_subject + length;
6086 end_subject = md->end_subject;
6087
6088 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6089 md->use_ucp = (re->options & PCRE_UCP) != 0;
6090 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6091
6092 /* Some options are unpacked into BOOL variables in the hope that testing
6093 them will be faster than individual option bits. */
6094
6095 md->notbol = (options & PCRE_NOTBOL) != 0;
6096 md->noteol = (options & PCRE_NOTEOL) != 0;
6097 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6098 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6099
6100 md->hitend = FALSE;
6101 md->mark = NULL; /* In case never set */
6102
6103 md->recursive = NULL; /* No recursion at top level */
6104 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6105
6106 md->lcc = tables + lcc_offset;
6107 md->ctypes = tables + ctypes_offset;
6108
6109 /* Handle different \R options. */
6110
6111 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6112 {
6113 case 0:
6114 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6115 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6116 else
6117 #ifdef BSR_ANYCRLF
6118 md->bsr_anycrlf = TRUE;
6119 #else
6120 md->bsr_anycrlf = FALSE;
6121 #endif
6122 break;
6123
6124 case PCRE_BSR_ANYCRLF:
6125 md->bsr_anycrlf = TRUE;
6126 break;
6127
6128 case PCRE_BSR_UNICODE:
6129 md->bsr_anycrlf = FALSE;
6130 break;
6131
6132 default: return PCRE_ERROR_BADNEWLINE;
6133 }
6134
6135 /* Handle different types of newline. The three bits give eight cases. If
6136 nothing is set at run time, whatever was used at compile time applies. */
6137
6138 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6139 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6140 {
6141 case 0: newline = NEWLINE; break; /* Compile-time default */
6142 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6143 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6144 case PCRE_NEWLINE_CR+
6145 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6146 case PCRE_NEWLINE_ANY: newline = -1; break;
6147 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6148 default: return PCRE_ERROR_BADNEWLINE;
6149 }
6150
6151 if (newline == -2)
6152 {
6153 md->nltype = NLTYPE_ANYCRLF;
6154 }
6155 else if (newline < 0)
6156 {
6157 md->nltype = NLTYPE_ANY;
6158 }
6159 else
6160 {
6161 md->nltype = NLTYPE_FIXED;
6162 if (newline > 255)
6163 {
6164 md->nllen = 2;
6165 md->nl[0] = (newline >> 8) & 255;
6166 md->nl[1] = newline & 255;
6167 }
6168 else
6169 {
6170 md->nllen = 1;
6171 md->nl[0] = newline;
6172 }
6173 }
6174
6175 /* Partial matching was originally supported only for a restricted set of
6176 regexes; from release 8.00 there are no restrictions, but the bits are still
6177 defined (though never set). So there's no harm in leaving this code. */
6178
6179 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6180 return PCRE_ERROR_BADPARTIAL;
6181
6182 /* If the expression has got more back references than the offsets supplied can
6183 hold, we get a temporary chunk of working store to use during the matching.
6184 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6185 of 3. */
6186
6187 ocount = offsetcount - (offsetcount % 3);
6188 arg_offset_max = (2*ocount)/3;
6189
6190 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6191 {
6192 ocount = re->top_backref * 3 + 3;
6193 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6194 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6195 using_temporary_offsets = TRUE;
6196 DPRINTF(("Got memory to hold back references\n"));
6197 }
6198 else md->offset_vector = offsets;
6199
6200 md->offset_end = ocount;
6201 md->offset_max = (2*ocount)/3;
6202 md->offset_overflow = FALSE;
6203 md->capture_last = -1;
6204
6205 /* Reset the working variable associated with each extraction. These should
6206 never be used unless previously set, but they get saved and restored, and so we
6207 initialize them to avoid reading uninitialized locations. Also, unset the
6208 offsets for the matched string. This is really just for tidiness with callouts,
6209 in case they inspect these fields. */
6210
6211 if (md->offset_vector != NULL)
6212 {
6213 register int *iptr = md->offset_vector + ocount;
6214 register int *iend = iptr - re->top_bracket;
6215 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6216 while (--iptr >= iend) *iptr = -1;
6217 md->offset_vector[0] = md->offset_vector[1] = -1;
6218 }
6219
6220 /* Set up the first character to match, if available. The first_byte value is
6221 never set for an anchored regular expression, but the anchoring may be forced
6222 at run time, so we have to test for anchoring. The first char may be unset for
6223 an unanchored pattern, of course. If there's no first char and the pattern was
6224 studied, there may be a bitmap of possible first characters. */
6225
6226 if (!anchored)
6227 {
6228 if ((re->flags & PCRE_FIRSTSET) != 0)
6229 {
6230 first_byte = re->first_byte & 255;
6231 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6232 first_byte = md->lcc[first_byte];
6233 }
6234 else
6235 if (!startline && study != NULL &&
6236 (study->flags & PCRE_STUDY_MAPPED) != 0)
6237 start_bits = study->start_bits;
6238 }
6239
6240 /* For anchored or unanchored matches, there may be a "last known required
6241 character" set. */
6242
6243 if ((re->flags & PCRE_REQCHSET) != 0)
6244 {
6245 req_byte = re->req_byte & 255;
6246 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6247 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6248 }
6249
6250
6251
6252
6253 /* ==========================================================================*/
6254
6255 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6256 the loop runs just once. */
6257
6258 for(;;)
6259 {
6260 USPTR save_end_subject = end_subject;
6261 USPTR new_start_match;
6262
6263 /* If firstline is TRUE, the start of the match is constrained to the first
6264 line of a multiline string. That is, the match must be before or at the first
6265 newline. Implement this by temporarily adjusting end_subject so that we stop
6266 scanning at a newline. If the match fails at the newline, later code breaks
6267 this loop. */
6268
6269 if (firstline)
6270 {
6271 USPTR t = start_match;
6272 #ifdef SUPPORT_UTF8
6273 if (utf8)
6274 {
6275 while (t < md->end_subject && !IS_NEWLINE(t))
6276 {
6277 t++;
6278 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6279 }
6280 }
6281 else
6282 #endif
6283 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6284 end_subject = t;
6285 }
6286
6287 /* There are some optimizations that avoid running the match if a known
6288 starting point is not found, or if a known later character is not present.
6289 However, there is an option that disables these, for testing and for ensuring
6290 that all callouts do actually occur. The option can be set in the regex by
6291 (*NO_START_OPT) or passed in match-time options. */
6292
6293 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6294 {
6295 /* Advance to a unique first byte if there is one. */
6296
6297 if (first_byte >= 0)
6298 {
6299 if (first_byte_caseless)
6300 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6301 start_match++;
6302 else
6303 while (start_match < end_subject && *start_match != first_byte)
6304 start_match++;
6305 }
6306
6307 /* Or to just after a linebreak for a multiline match */
6308
6309 else if (startline)
6310 {
6311 if (start_match > md->start_subject + start_offset)
6312 {
6313 #ifdef SUPPORT_UTF8
6314 if (utf8)
6315 {
6316 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6317 {
6318 start_match++;
6319 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6320 start_match++;
6321 }
6322 }
6323 else
6324 #endif
6325 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6326 start_match++;
6327
6328 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6329 and we are now at a LF, advance the match position by one more character.
6330 */
6331
6332 if (start_match[-1] == CHAR_CR &&
6333 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6334 start_match < end_subject &&
6335 *start_match == CHAR_NL)
6336 start_match++;
6337 }
6338 }
6339
6340 /* Or to a non-unique first byte after study */
6341
6342 else if (start_bits != NULL)
6343 {
6344 while (start_match < end_subject)
6345 {
6346 register unsigned int c = *start_match;
6347 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6348 {
6349 start_match++;
6350 #ifdef SUPPORT_UTF8
6351 if (utf8)
6352 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6353 start_match++;
6354 #endif
6355 }
6356 else break;
6357 }
6358 }
6359 } /* Starting optimizations */
6360
6361 /* Restore fudged end_subject */
6362
6363 end_subject = save_end_subject;
6364
6365 /* The following two optimizations are disabled for partial matching or if
6366 disabling is explicitly requested. */
6367
6368 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6369 {
6370 /* If the pattern was studied, a minimum subject length may be set. This is
6371 a lower bound; no actual string of that length may actually match the
6372 pattern. Although the value is, strictly, in characters, we treat it as
6373 bytes to avoid spending too much time in this optimization. */
6374
6375 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6376 (pcre_uint32)(end_subject - start_match) < study->minlength)
6377 {
6378 rc = MATCH_NOMATCH;
6379 break;
6380 }
6381
6382 /* If req_byte is set, we know that that character must appear in the
6383 subject for the match to succeed. If the first character is set, req_byte
6384 must be later in the subject; otherwise the test starts at the match point.
6385 This optimization can save a huge amount of backtracking in patterns with
6386 nested unlimited repeats that aren't going to match. Writing separate code
6387 for cased/caseless versions makes it go faster, as does using an
6388 autoincrement and backing off on a match.
6389
6390 HOWEVER: when the subject string is very, very long, searching to its end
6391 can take a long time, and give bad performance on quite ordinary patterns.
6392 This showed up when somebody was matching something like /^\d+C/ on a
6393 32-megabyte string... so we don't do this when the string is sufficiently
6394 long. */
6395
6396 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6397 {
6398 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6399
6400 /* We don't need to repeat the search if we haven't yet reached the
6401 place we found it at last time. */
6402
6403 if (p > req_byte_ptr)
6404 {
6405 if (req_byte_caseless)
6406 {
6407 while (p < end_subject)
6408 {
6409 register int pp = *p++;
6410 if (pp == req_byte || pp == req_byte2) { p--; break; }
6411 }
6412 }
6413 else
6414 {
6415 while (p < end_subject)
6416 {
6417 if (*p++ == req_byte) { p--; break; }
6418 }
6419 }
6420
6421 /* If we can't find the required character, break the matching loop,
6422 forcing a match failure. */
6423
6424 if (p >= end_subject)
6425 {
6426 rc = MATCH_NOMATCH;
6427 break;
6428 }
6429
6430 /* If we have found the required character, save the point where we
6431 found it, so that we don't search again next time round the loop if
6432 the start hasn't passed this character yet. */
6433
6434 req_byte_ptr = p;
6435 }
6436 }
6437 }
6438
6439 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6440 printf(">>>> Match against: ");
6441 pchars(start_match, end_subject - start_match, TRUE, md);
6442 printf("\n");
6443 #endif
6444
6445 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6446 first starting point for which a partial match was found. */
6447
6448 md->start_match_ptr = start_match;
6449 md->start_used_ptr = start_match;
6450 md->match_call_count = 0;
6451 md->match_function_type = 0;
6452 md->end_offset_top = 0;
6453 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6454 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6455
6456 switch(rc)
6457 {
6458 /* SKIP passes back the next starting point explicitly, but if it is the
6459 same as the match we have just done, treat it as NOMATCH. */
6460
6461 case MATCH_SKIP:
6462 if (md->start_match_ptr != start_match)
6463 {
6464 new_start_match = md->start_match_ptr;
6465 break;
6466 }
6467 /* Fall through */
6468
6469 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6470 the SKIP's arg was not found. We also treat this as NOMATCH. */
6471
6472 case MATCH_SKIP_ARG:
6473 /* Fall through */
6474
6475 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6476 exactly like PRUNE. */
6477
6478 case MATCH_NOMATCH:
6479 case MATCH_PRUNE:
6480 case MATCH_THEN:
6481 new_start_match = start_match + 1;
6482 #ifdef SUPPORT_UTF8
6483 if (utf8)
6484 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6485 new_start_match++;
6486 #endif
6487 break;
6488
6489 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6490
6491 case MATCH_COMMIT:
6492 rc = MATCH_NOMATCH;
6493 goto ENDLOOP;
6494
6495 /* Any other return is either a match, or some kind of error. */
6496
6497 default:
6498 goto ENDLOOP;
6499 }
6500
6501 /* Control reaches here for the various types of "no match at this point"
6502 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6503
6504 rc = MATCH_NOMATCH;
6505
6506 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6507 newline in the subject (though it may continue over the newline). Therefore,
6508 if we have just failed to match, starting at a newline, do not continue. */
6509
6510 if (firstline && IS_NEWLINE(start_match)) break;
6511
6512 /* Advance to new matching position */
6513
6514 start_match = new_start_match;
6515
6516 /* Break the loop if the pattern is anchored or if we have passed the end of
6517 the subject. */
6518
6519 if (anchored || start_match > end_subject) break;
6520
6521 /* If we have just passed a CR and we are now at a LF, and the pattern does
6522 not contain any explicit matches for \r or \n, and the newline option is CRLF
6523 or ANY or ANYCRLF, advance the match position by one more character. */
6524
6525 if (start_match[-1] == CHAR_CR &&
6526 start_match < end_subject &&
6527 *start_match == CHAR_NL &&
6528 (re->flags & PCRE_HASCRORLF) == 0 &&
6529 (md->nltype == NLTYPE_ANY ||
6530 md->nltype == NLTYPE_ANYCRLF ||
6531 md->nllen == 2))
6532 start_match++;
6533
6534 md->mark = NULL; /* Reset for start of next match attempt */
6535 } /* End of for(;;) "bumpalong" loop */
6536
6537 /* ==========================================================================*/
6538
6539 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6540 conditions is true:
6541
6542 (1) The pattern is anchored or the match was failed by (*COMMIT);
6543
6544 (2) We are past the end of the subject;
6545
6546 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6547 this option requests that a match occur at or before the first newline in
6548 the subject.
6549
6550 When we have a match and the offset vector is big enough to deal with any
6551 backreferences, captured substring offsets will already be set up. In the case
6552 where we had to get some local store to hold offsets for backreference
6553 processing, copy those that we can. In this case there need not be overflow if
6554 certain parts of the pattern were not used, even though there are more
6555 capturing parentheses than vector slots. */
6556
6557 ENDLOOP:
6558
6559 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6560 {
6561 if (using_temporary_offsets)
6562 {
6563 if (arg_offset_max >= 4)
6564 {
6565 memcpy(offsets + 2, md->offset_vector + 2,
6566 (arg_offset_max - 2) * sizeof(int));
6567 DPRINTF(("Copied offsets from temporary memory\n"));
6568 }
6569 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6570 DPRINTF(("Freeing temporary memory\n"));
6571 (pcre_free)(md->offset_vector);
6572 }
6573
6574 /* Set the return code to the number of captured strings, or 0 if there were
6575 too many to fit into the vector. */
6576
6577 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6578 0 : md->end_offset_top/2;
6579
6580 /* If there is space in the offset vector, set any unused pairs at the end of
6581 the pattern to -1 for backwards compatibility. It is documented that this
6582 happens. In earlier versions, the whole set of potential capturing offsets
6583 was set to -1 each time round the loop, but this is handled differently now.
6584 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6585 those at the end that need unsetting here. We can't just unset them all at
6586 the start of the whole thing because they may get set in one branch that is
6587 not the final matching branch. */
6588
6589 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6590 {
6591 register int *iptr, *iend;
6592 int resetcount = 2 + re->top_bracket * 2;
6593 if (resetcount > offsetcount) resetcount = ocount;
6594 iptr = offsets + md->end_offset_top;
6595 iend = offsets + resetcount;
6596 while (iptr < iend) *iptr++ = -1;
6597 }
6598
6599 /* If there is space, set up the whole thing as substring 0. The value of
6600 md->start_match_ptr might be modified if \K was encountered on the success
6601 matching path. */
6602
6603 if (offsetcount < 2) rc = 0; else
6604 {
6605 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6606 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6607 }
6608
6609 DPRINTF((">>>> returning %d\n", rc));
6610 goto RETURN_MARK;
6611 }
6612
6613 /* Control gets here if there has been an error, or if the overall match
6614 attempt has failed at all permitted starting positions. */
6615
6616 if (using_temporary_offsets)
6617 {
6618 DPRINTF(("Freeing temporary memory\n"));
6619 (pcre_free)(md->offset_vector);
6620 }
6621
6622 /* For anything other than nomatch or partial match, just return the code. */
6623
6624 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6625 {
6626 DPRINTF((">>>> error: returning %d\n", rc));
6627 return rc;
6628 }
6629
6630 /* Handle partial matches - disable any mark data */
6631
6632 if (start_partial != NULL)
6633 {
6634 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6635 md->mark = NULL;
6636 if (offsetcount > 1)
6637 {
6638 offsets[0] = (int)(start_partial - (USPTR)subject);
6639 offsets[1] = (int)(end_subject - (USPTR)subject);
6640 }
6641 rc = PCRE_ERROR_PARTIAL;
6642 }
6643
6644 /* This is the classic nomatch case */
6645
6646 else
6647 {
6648 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6649 rc = PCRE_ERROR_NOMATCH;
6650 }
6651
6652 /* Return the MARK data if it has been requested. */
6653
6654 RETURN_MARK:
6655
6656 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6657 *(extra_data->mark) = (unsigned char *)(md->mark);
6658 return rc;
6659 }
6660
6661 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5