/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 713 - (show annotations)
Tue Sep 27 11:03:15 2011 UTC (8 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 197207 byte(s)
Error occurred while calculating annotation data.
Fix bug with (*THEN) in a subroutine/recursion.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
274 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 RM61, RM62, RM63 };
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 the alt that is at the start of the current branch. This makes it possible
780 to skip back past alternatives that precede the THEN within the current
781 branch. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode - GET(ecode, 1);
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 offset_top, md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode - GET(ecode, 1);
795 md->mark = ecode + LINK_SIZE + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle a capturing bracket, other than those that are possessive with an
799 unlimited repeat. If there is space in the offset vector, save the current
800 subject position in the working slot at the top of the vector. We mustn't
801 change the current values of the data slot, because they may be set from a
802 previous iteration of this group, and be referred to by a reference inside
803 the group. A failure to match might occur after the group has succeeded,
804 if something later on doesn't match. For this reason, we need to restore
805 the working value and also the values of the final offsets, in case they
806 were set by a previous iteration of the same bracket.
807
808 If there isn't enough space in the offset vector, treat this as if it were
809 a non-capturing bracket. Don't worry about setting the flag for the error
810 case here; that is handled in the code for KET. */
811
812 case OP_CBRA:
813 case OP_SCBRA:
814 number = GET2(ecode, 1+LINK_SIZE);
815 offset = number << 1;
816
817 #ifdef PCRE_DEBUG
818 printf("start bracket %d\n", number);
819 printf("subject=");
820 pchars(eptr, 16, TRUE, md);
821 printf("\n");
822 #endif
823
824 if (offset < md->offset_max)
825 {
826 save_offset1 = md->offset_vector[offset];
827 save_offset2 = md->offset_vector[offset+1];
828 save_offset3 = md->offset_vector[md->offset_end - number];
829 save_capture_last = md->capture_last;
830
831 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 md->offset_vector[md->offset_end - number] =
833 (int)(eptr - md->start_subject);
834
835 for (;;)
836 {
837 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 eptrb, RM1);
840 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 if (rrc != MATCH_NOMATCH &&
842 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843 RRETURN(rrc);
844 md->capture_last = save_capture_last;
845 ecode += GET(ecode, 1);
846 if (*ecode != OP_ALT) break;
847 }
848
849 DPRINTF(("bracket %d failed\n", number));
850 md->offset_vector[offset] = save_offset1;
851 md->offset_vector[offset+1] = save_offset2;
852 md->offset_vector[md->offset_end - number] = save_offset3;
853
854 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 MATCH_THEN. */
856
857 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 }
860
861 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862 as a non-capturing bracket. */
863
864 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866
867 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872 /* Non-capturing or atomic group, except for possessive with unlimited
873 repeat. Loop for all the alternatives.
874
875 When we get to the final alternative within the brackets, we used to return
876 the result of a recursive call to match() whatever happened so it was
877 possible to reduce stack usage by turning this into a tail recursion,
878 except in the case of a possibly empty group. However, now that there is
879 the possibility of (*THEN) occurring in the final alternative, this
880 optimization is no longer always possible.
881
882 We can optimize if we know there are no (*THEN)s in the pattern; at present
883 this is the best that can be done.
884
885 MATCH_ONCE is returned when the end of an atomic group is successfully
886 reached, but subsequent matching fails. It passes back up the tree (causing
887 captured values to be reset) until the original atomic group level is
888 reached. This is tested by comparing md->once_target with the start of the
889 group. At this point, the return is converted into MATCH_NOMATCH so that
890 previous backup points can be taken. */
891
892 case OP_ONCE:
893 case OP_BRA:
894 case OP_SBRA:
895 DPRINTF(("start non-capturing bracket\n"));
896
897 for (;;)
898 {
899 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
900
901 /* If this is not a possibly empty group, and there are no (*THEN)s in
902 the pattern, and this is the final alternative, optimize as described
903 above. */
904
905 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
906 {
907 ecode += _pcre_OP_lengths[*ecode];
908 goto TAIL_RECURSE;
909 }
910
911 /* In all other cases, we have to make another call to match(). */
912
913 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
914 RM2);
915 if (rrc != MATCH_NOMATCH &&
916 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
917 {
918 if (rrc == MATCH_ONCE)
919 {
920 const uschar *scode = ecode;
921 if (*scode != OP_ONCE) /* If not at start, find it */
922 {
923 while (*scode == OP_ALT) scode += GET(scode, 1);
924 scode -= GET(scode, 1);
925 }
926 if (md->once_target == scode) rrc = MATCH_NOMATCH;
927 }
928 RRETURN(rrc);
929 }
930 ecode += GET(ecode, 1);
931 if (*ecode != OP_ALT) break;
932 }
933 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
934 RRETURN(MATCH_NOMATCH);
935
936 /* Handle possessive capturing brackets with an unlimited repeat. We come
937 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
938 handled similarly to the normal case above. However, the matching is
939 different. The end of these brackets will always be OP_KETRPOS, which
940 returns MATCH_KETRPOS without going further in the pattern. By this means
941 we can handle the group by iteration rather than recursion, thereby
942 reducing the amount of stack needed. */
943
944 case OP_CBRAPOS:
945 case OP_SCBRAPOS:
946 allow_zero = FALSE;
947
948 POSSESSIVE_CAPTURE:
949 number = GET2(ecode, 1+LINK_SIZE);
950 offset = number << 1;
951
952 #ifdef PCRE_DEBUG
953 printf("start possessive bracket %d\n", number);
954 printf("subject=");
955 pchars(eptr, 16, TRUE, md);
956 printf("\n");
957 #endif
958
959 if (offset < md->offset_max)
960 {
961 matched_once = FALSE;
962 code_offset = ecode - md->start_code;
963
964 save_offset1 = md->offset_vector[offset];
965 save_offset2 = md->offset_vector[offset+1];
966 save_offset3 = md->offset_vector[md->offset_end - number];
967 save_capture_last = md->capture_last;
968
969 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
970
971 /* Each time round the loop, save the current subject position for use
972 when the group matches. For MATCH_MATCH, the group has matched, so we
973 restart it with a new subject starting position, remembering that we had
974 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
975 usual. If we haven't matched any alternatives in any iteration, check to
976 see if a previous iteration matched. If so, the group has matched;
977 continue from afterwards. Otherwise it has failed; restore the previous
978 capture values before returning NOMATCH. */
979
980 for (;;)
981 {
982 md->offset_vector[md->offset_end - number] =
983 (int)(eptr - md->start_subject);
984 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
985 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
986 eptrb, RM63);
987 if (rrc == MATCH_KETRPOS)
988 {
989 offset_top = md->end_offset_top;
990 eptr = md->end_match_ptr;
991 ecode = md->start_code + code_offset;
992 save_capture_last = md->capture_last;
993 matched_once = TRUE;
994 continue;
995 }
996 if (rrc != MATCH_NOMATCH &&
997 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
998 RRETURN(rrc);
999 md->capture_last = save_capture_last;
1000 ecode += GET(ecode, 1);
1001 if (*ecode != OP_ALT) break;
1002 }
1003
1004 if (!matched_once)
1005 {
1006 md->offset_vector[offset] = save_offset1;
1007 md->offset_vector[offset+1] = save_offset2;
1008 md->offset_vector[md->offset_end - number] = save_offset3;
1009 }
1010
1011 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
1012 if (allow_zero || matched_once)
1013 {
1014 ecode += 1 + LINK_SIZE;
1015 break;
1016 }
1017
1018 RRETURN(MATCH_NOMATCH);
1019 }
1020
1021 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1022 as a non-capturing bracket. */
1023
1024 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1025 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1026
1027 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1028
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031
1032 /* Non-capturing possessive bracket with unlimited repeat. We come here
1033 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1034 without the capturing complication. It is written out separately for speed
1035 and cleanliness. */
1036
1037 case OP_BRAPOS:
1038 case OP_SBRAPOS:
1039 allow_zero = FALSE;
1040
1041 POSSESSIVE_NON_CAPTURE:
1042 matched_once = FALSE;
1043 code_offset = ecode - md->start_code;
1044
1045 for (;;)
1046 {
1047 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1048 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1049 eptrb, RM48);
1050 if (rrc == MATCH_KETRPOS)
1051 {
1052 offset_top = md->end_offset_top;
1053 eptr = md->end_match_ptr;
1054 ecode = md->start_code + code_offset;
1055 matched_once = TRUE;
1056 continue;
1057 }
1058 if (rrc != MATCH_NOMATCH &&
1059 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1060 RRETURN(rrc);
1061 ecode += GET(ecode, 1);
1062 if (*ecode != OP_ALT) break;
1063 }
1064
1065 if (matched_once || allow_zero)
1066 {
1067 ecode += 1 + LINK_SIZE;
1068 break;
1069 }
1070 RRETURN(MATCH_NOMATCH);
1071
1072 /* Control never reaches here. */
1073
1074 /* Conditional group: compilation checked that there are no more than
1075 two branches. If the condition is false, skipping the first branch takes us
1076 past the end if there is only one branch, but that's OK because that is
1077 exactly what going to the ket would do. */
1078
1079 case OP_COND:
1080 case OP_SCOND:
1081 codelink = GET(ecode, 1);
1082
1083 /* Because of the way auto-callout works during compile, a callout item is
1084 inserted between OP_COND and an assertion condition. */
1085
1086 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1087 {
1088 if (pcre_callout != NULL)
1089 {
1090 pcre_callout_block cb;
1091 cb.version = 2; /* Version 2 of the callout block */
1092 cb.callout_number = ecode[LINK_SIZE+2];
1093 cb.offset_vector = md->offset_vector;
1094 cb.subject = (PCRE_SPTR)md->start_subject;
1095 cb.subject_length = (int)(md->end_subject - md->start_subject);
1096 cb.start_match = (int)(mstart - md->start_subject);
1097 cb.current_position = (int)(eptr - md->start_subject);
1098 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1099 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1100 cb.capture_top = offset_top/2;
1101 cb.capture_last = md->capture_last;
1102 cb.callout_data = md->callout_data;
1103 cb.mark = markptr;
1104 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1105 if (rrc < 0) RRETURN(rrc);
1106 }
1107 ecode += _pcre_OP_lengths[OP_CALLOUT];
1108 }
1109
1110 condcode = ecode[LINK_SIZE+1];
1111
1112 /* Now see what the actual condition is */
1113
1114 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1115 {
1116 if (md->recursive == NULL) /* Not recursing => FALSE */
1117 {
1118 condition = FALSE;
1119 ecode += GET(ecode, 1);
1120 }
1121 else
1122 {
1123 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1124 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1125
1126 /* If the test is for recursion into a specific subpattern, and it is
1127 false, but the test was set up by name, scan the table to see if the
1128 name refers to any other numbers, and test them. The condition is true
1129 if any one is set. */
1130
1131 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1132 {
1133 uschar *slotA = md->name_table;
1134 for (i = 0; i < md->name_count; i++)
1135 {
1136 if (GET2(slotA, 0) == recno) break;
1137 slotA += md->name_entry_size;
1138 }
1139
1140 /* Found a name for the number - there can be only one; duplicate
1141 names for different numbers are allowed, but not vice versa. First
1142 scan down for duplicates. */
1143
1144 if (i < md->name_count)
1145 {
1146 uschar *slotB = slotA;
1147 while (slotB > md->name_table)
1148 {
1149 slotB -= md->name_entry_size;
1150 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1151 {
1152 condition = GET2(slotB, 0) == md->recursive->group_num;
1153 if (condition) break;
1154 }
1155 else break;
1156 }
1157
1158 /* Scan up for duplicates */
1159
1160 if (!condition)
1161 {
1162 slotB = slotA;
1163 for (i++; i < md->name_count; i++)
1164 {
1165 slotB += md->name_entry_size;
1166 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1167 {
1168 condition = GET2(slotB, 0) == md->recursive->group_num;
1169 if (condition) break;
1170 }
1171 else break;
1172 }
1173 }
1174 }
1175 }
1176
1177 /* Choose branch according to the condition */
1178
1179 ecode += condition? 3 : GET(ecode, 1);
1180 }
1181 }
1182
1183 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1184 {
1185 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1186 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1187
1188 /* If the numbered capture is unset, but the reference was by name,
1189 scan the table to see if the name refers to any other numbers, and test
1190 them. The condition is true if any one is set. This is tediously similar
1191 to the code above, but not close enough to try to amalgamate. */
1192
1193 if (!condition && condcode == OP_NCREF)
1194 {
1195 int refno = offset >> 1;
1196 uschar *slotA = md->name_table;
1197
1198 for (i = 0; i < md->name_count; i++)
1199 {
1200 if (GET2(slotA, 0) == refno) break;
1201 slotA += md->name_entry_size;
1202 }
1203
1204 /* Found a name for the number - there can be only one; duplicate names
1205 for different numbers are allowed, but not vice versa. First scan down
1206 for duplicates. */
1207
1208 if (i < md->name_count)
1209 {
1210 uschar *slotB = slotA;
1211 while (slotB > md->name_table)
1212 {
1213 slotB -= md->name_entry_size;
1214 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1215 {
1216 offset = GET2(slotB, 0) << 1;
1217 condition = offset < offset_top &&
1218 md->offset_vector[offset] >= 0;
1219 if (condition) break;
1220 }
1221 else break;
1222 }
1223
1224 /* Scan up for duplicates */
1225
1226 if (!condition)
1227 {
1228 slotB = slotA;
1229 for (i++; i < md->name_count; i++)
1230 {
1231 slotB += md->name_entry_size;
1232 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1233 {
1234 offset = GET2(slotB, 0) << 1;
1235 condition = offset < offset_top &&
1236 md->offset_vector[offset] >= 0;
1237 if (condition) break;
1238 }
1239 else break;
1240 }
1241 }
1242 }
1243 }
1244
1245 /* Choose branch according to the condition */
1246
1247 ecode += condition? 3 : GET(ecode, 1);
1248 }
1249
1250 else if (condcode == OP_DEF) /* DEFINE - always false */
1251 {
1252 condition = FALSE;
1253 ecode += GET(ecode, 1);
1254 }
1255
1256 /* The condition is an assertion. Call match() to evaluate it - setting
1257 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1258 an assertion. */
1259
1260 else
1261 {
1262 md->match_function_type = MATCH_CONDASSERT;
1263 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1264 if (rrc == MATCH_MATCH)
1265 {
1266 if (md->end_offset_top > offset_top)
1267 offset_top = md->end_offset_top; /* Captures may have happened */
1268 condition = TRUE;
1269 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1270 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1271 }
1272 else if (rrc != MATCH_NOMATCH &&
1273 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1274 {
1275 RRETURN(rrc); /* Need braces because of following else */
1276 }
1277 else
1278 {
1279 condition = FALSE;
1280 ecode += codelink;
1281 }
1282 }
1283
1284 /* We are now at the branch that is to be obeyed. As there is only one,
1285 we used always to use tail recursion to avoid using another stack frame,
1286 except when there was unlimited repeat of a possibly empty group. However,
1287 that strategy no longer works because of the possibility of (*THEN) being
1288 encountered in the branch. However, we can still use tail recursion if
1289 there are no (*THEN)s in the pattern. Otherwise, a recursive call to
1290 match() is always required, unless the second alternative doesn't exist, in
1291 which case we can just plough on. */
1292
1293 if (condition || *ecode == OP_ALT)
1294 {
1295 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1296 else if (!md->hasthen)
1297 {
1298 ecode += 1 + LINK_SIZE;
1299 goto TAIL_RECURSE;
1300 }
1301
1302 /* A call to match() is required. */
1303
1304 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1305
1306 /* If the result is THEN from within the "true" branch of the condition,
1307 md->start_match_ptr will point to the original OP_COND, not to the start
1308 of the branch, so we have do work to see if it matches. If THEN comes
1309 from the "false" branch, md->start_match_ptr does point to OP_ALT. */
1310
1311 if (rrc == MATCH_THEN)
1312 {
1313 if (*ecode != OP_ALT)
1314 {
1315 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1316 ecode -= GET(ecode, 1);
1317 }
1318 if (md->start_match_ptr == ecode) rrc = MATCH_NOMATCH;
1319 }
1320 RRETURN(rrc);
1321 }
1322
1323 /* Condition false & no alternative; continue after the group. */
1324
1325 else
1326 {
1327 ecode += 1 + LINK_SIZE;
1328 }
1329 break;
1330
1331
1332 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1333 to close any currently open capturing brackets. */
1334
1335 case OP_CLOSE:
1336 number = GET2(ecode, 1);
1337 offset = number << 1;
1338
1339 #ifdef PCRE_DEBUG
1340 printf("end bracket %d at *ACCEPT", number);
1341 printf("\n");
1342 #endif
1343
1344 md->capture_last = number;
1345 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1346 {
1347 md->offset_vector[offset] =
1348 md->offset_vector[md->offset_end - number];
1349 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1350 if (offset_top <= offset) offset_top = offset + 2;
1351 }
1352 ecode += 3;
1353 break;
1354
1355
1356 /* End of the pattern, either real or forced. */
1357
1358 case OP_END:
1359 case OP_ACCEPT:
1360 case OP_ASSERT_ACCEPT:
1361
1362 /* If we have matched an empty string, fail if not in an assertion and not
1363 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1364 is set and we have matched at the start of the subject. In both cases,
1365 backtracking will then try other alternatives, if any. */
1366
1367 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1368 md->recursive == NULL &&
1369 (md->notempty ||
1370 (md->notempty_atstart &&
1371 mstart == md->start_subject + md->start_offset)))
1372 MRRETURN(MATCH_NOMATCH);
1373
1374 /* Otherwise, we have a match. */
1375
1376 md->end_match_ptr = eptr; /* Record where we ended */
1377 md->end_offset_top = offset_top; /* and how many extracts were taken */
1378 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1379
1380 /* For some reason, the macros don't work properly if an expression is
1381 given as the argument to MRRETURN when the heap is in use. */
1382
1383 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1384 MRRETURN(rrc);
1385
1386 /* Assertion brackets. Check the alternative branches in turn - the
1387 matching won't pass the KET for an assertion. If any one branch matches,
1388 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1389 start of each branch to move the current point backwards, so the code at
1390 this level is identical to the lookahead case. When the assertion is part
1391 of a condition, we want to return immediately afterwards. The caller of
1392 this incarnation of the match() function will have set MATCH_CONDASSERT in
1393 md->match_function_type, and one of these opcodes will be the first opcode
1394 that is processed. We use a local variable that is preserved over calls to
1395 match() to remember this case. */
1396
1397 case OP_ASSERT:
1398 case OP_ASSERTBACK:
1399 if (md->match_function_type == MATCH_CONDASSERT)
1400 {
1401 condassert = TRUE;
1402 md->match_function_type = 0;
1403 }
1404 else condassert = FALSE;
1405
1406 do
1407 {
1408 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1409 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1410 {
1411 mstart = md->start_match_ptr; /* In case \K reset it */
1412 markptr = md->mark;
1413 break;
1414 }
1415 if (rrc != MATCH_NOMATCH &&
1416 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1417 RRETURN(rrc);
1418 ecode += GET(ecode, 1);
1419 }
1420 while (*ecode == OP_ALT);
1421
1422 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1423
1424 /* If checking an assertion for a condition, return MATCH_MATCH. */
1425
1426 if (condassert) RRETURN(MATCH_MATCH);
1427
1428 /* Continue from after the assertion, updating the offsets high water
1429 mark, since extracts may have been taken during the assertion. */
1430
1431 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1432 ecode += 1 + LINK_SIZE;
1433 offset_top = md->end_offset_top;
1434 continue;
1435
1436 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1437 PRUNE, or COMMIT means we must assume failure without checking subsequent
1438 branches. */
1439
1440 case OP_ASSERT_NOT:
1441 case OP_ASSERTBACK_NOT:
1442 if (md->match_function_type == MATCH_CONDASSERT)
1443 {
1444 condassert = TRUE;
1445 md->match_function_type = 0;
1446 }
1447 else condassert = FALSE;
1448
1449 do
1450 {
1451 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1452 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1453 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1454 {
1455 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1456 break;
1457 }
1458 if (rrc != MATCH_NOMATCH &&
1459 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1460 RRETURN(rrc);
1461 ecode += GET(ecode,1);
1462 }
1463 while (*ecode == OP_ALT);
1464
1465 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1466
1467 ecode += 1 + LINK_SIZE;
1468 continue;
1469
1470 /* Move the subject pointer back. This occurs only at the start of
1471 each branch of a lookbehind assertion. If we are too close to the start to
1472 move back, this match function fails. When working with UTF-8 we move
1473 back a number of characters, not bytes. */
1474
1475 case OP_REVERSE:
1476 #ifdef SUPPORT_UTF8
1477 if (utf8)
1478 {
1479 i = GET(ecode, 1);
1480 while (i-- > 0)
1481 {
1482 eptr--;
1483 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1484 BACKCHAR(eptr);
1485 }
1486 }
1487 else
1488 #endif
1489
1490 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1491
1492 {
1493 eptr -= GET(ecode, 1);
1494 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1495 }
1496
1497 /* Save the earliest consulted character, then skip to next op code */
1498
1499 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1500 ecode += 1 + LINK_SIZE;
1501 break;
1502
1503 /* The callout item calls an external function, if one is provided, passing
1504 details of the match so far. This is mainly for debugging, though the
1505 function is able to force a failure. */
1506
1507 case OP_CALLOUT:
1508 if (pcre_callout != NULL)
1509 {
1510 pcre_callout_block cb;
1511 cb.version = 2; /* Version 2 of the callout block */
1512 cb.callout_number = ecode[1];
1513 cb.offset_vector = md->offset_vector;
1514 cb.subject = (PCRE_SPTR)md->start_subject;
1515 cb.subject_length = (int)(md->end_subject - md->start_subject);
1516 cb.start_match = (int)(mstart - md->start_subject);
1517 cb.current_position = (int)(eptr - md->start_subject);
1518 cb.pattern_position = GET(ecode, 2);
1519 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1520 cb.capture_top = offset_top/2;
1521 cb.capture_last = md->capture_last;
1522 cb.callout_data = md->callout_data;
1523 cb.mark = markptr;
1524 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1525 if (rrc < 0) RRETURN(rrc);
1526 }
1527 ecode += 2 + 2*LINK_SIZE;
1528 break;
1529
1530 /* Recursion either matches the current regex, or some subexpression. The
1531 offset data is the offset to the starting bracket from the start of the
1532 whole pattern. (This is so that it works from duplicated subpatterns.)
1533
1534 The state of the capturing groups is preserved over recursion, and
1535 re-instated afterwards. We don't know how many are started and not yet
1536 finished (offset_top records the completed total) so we just have to save
1537 all the potential data. There may be up to 65535 such values, which is too
1538 large to put on the stack, but using malloc for small numbers seems
1539 expensive. As a compromise, the stack is used when there are no more than
1540 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1541
1542 There are also other values that have to be saved. We use a chained
1543 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1544 for the original version of this logic. It has, however, been hacked around
1545 a lot, so he is not to blame for the current way it works. */
1546
1547 case OP_RECURSE:
1548 {
1549 recursion_info *ri;
1550 int recno;
1551
1552 callpat = md->start_code + GET(ecode, 1);
1553 recno = (callpat == md->start_code)? 0 :
1554 GET2(callpat, 1 + LINK_SIZE);
1555
1556 /* Check for repeating a recursion without advancing the subject pointer.
1557 This should catch convoluted mutual recursions. (Some simple cases are
1558 caught at compile time.) */
1559
1560 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1561 if (recno == ri->group_num && eptr == ri->subject_position)
1562 RRETURN(PCRE_ERROR_RECURSELOOP);
1563
1564 /* Add to "recursing stack" */
1565
1566 new_recursive.group_num = recno;
1567 new_recursive.subject_position = eptr;
1568 new_recursive.prevrec = md->recursive;
1569 md->recursive = &new_recursive;
1570
1571 /* Where to continue from afterwards */
1572
1573 ecode += 1 + LINK_SIZE;
1574
1575 /* Now save the offset data */
1576
1577 new_recursive.saved_max = md->offset_end;
1578 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1579 new_recursive.offset_save = stacksave;
1580 else
1581 {
1582 new_recursive.offset_save =
1583 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1584 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1585 }
1586 memcpy(new_recursive.offset_save, md->offset_vector,
1587 new_recursive.saved_max * sizeof(int));
1588
1589 /* OK, now we can do the recursion. After processing each alternative,
1590 restore the offset data. If there were nested recursions, md->recursive
1591 might be changed, so reset it before looping. */
1592
1593 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1594 cbegroup = (*callpat >= OP_SBRA);
1595 do
1596 {
1597 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1598 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1599 md, eptrb, RM6);
1600 memcpy(md->offset_vector, new_recursive.offset_save,
1601 new_recursive.saved_max * sizeof(int));
1602 md->recursive = new_recursive.prevrec;
1603 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1604 {
1605 DPRINTF(("Recursion matched\n"));
1606 if (new_recursive.offset_save != stacksave)
1607 (pcre_free)(new_recursive.offset_save);
1608
1609 /* Set where we got to in the subject, and reset the start in case
1610 it was changed by \K. This *is* propagated back out of a recursion,
1611 for Perl compatibility. */
1612
1613 eptr = md->end_match_ptr;
1614 mstart = md->start_match_ptr;
1615 goto RECURSION_MATCHED; /* Exit loop; end processing */
1616 }
1617 else if (rrc != MATCH_NOMATCH &&
1618 (rrc != MATCH_THEN || md->start_match_ptr != callpat))
1619 {
1620 DPRINTF(("Recursion gave error %d\n", rrc));
1621 if (new_recursive.offset_save != stacksave)
1622 (pcre_free)(new_recursive.offset_save);
1623 RRETURN(rrc);
1624 }
1625
1626 md->recursive = &new_recursive;
1627 callpat += GET(callpat, 1);
1628 }
1629 while (*callpat == OP_ALT);
1630
1631 DPRINTF(("Recursion didn't match\n"));
1632 md->recursive = new_recursive.prevrec;
1633 if (new_recursive.offset_save != stacksave)
1634 (pcre_free)(new_recursive.offset_save);
1635 MRRETURN(MATCH_NOMATCH);
1636 }
1637
1638 RECURSION_MATCHED:
1639 break;
1640
1641 /* An alternation is the end of a branch; scan along to find the end of the
1642 bracketed group and go to there. */
1643
1644 case OP_ALT:
1645 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1646 break;
1647
1648 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1649 indicating that it may occur zero times. It may repeat infinitely, or not
1650 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1651 with fixed upper repeat limits are compiled as a number of copies, with the
1652 optional ones preceded by BRAZERO or BRAMINZERO. */
1653
1654 case OP_BRAZERO:
1655 next = ecode + 1;
1656 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1658 do next += GET(next, 1); while (*next == OP_ALT);
1659 ecode = next + 1 + LINK_SIZE;
1660 break;
1661
1662 case OP_BRAMINZERO:
1663 next = ecode + 1;
1664 do next += GET(next, 1); while (*next == OP_ALT);
1665 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1666 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1667 ecode++;
1668 break;
1669
1670 case OP_SKIPZERO:
1671 next = ecode+1;
1672 do next += GET(next,1); while (*next == OP_ALT);
1673 ecode = next + 1 + LINK_SIZE;
1674 break;
1675
1676 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1677 here; just jump to the group, with allow_zero set TRUE. */
1678
1679 case OP_BRAPOSZERO:
1680 op = *(++ecode);
1681 allow_zero = TRUE;
1682 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1683 goto POSSESSIVE_NON_CAPTURE;
1684
1685 /* End of a group, repeated or non-repeating. */
1686
1687 case OP_KET:
1688 case OP_KETRMIN:
1689 case OP_KETRMAX:
1690 case OP_KETRPOS:
1691 prev = ecode - GET(ecode, 1);
1692
1693 /* If this was a group that remembered the subject start, in order to break
1694 infinite repeats of empty string matches, retrieve the subject start from
1695 the chain. Otherwise, set it NULL. */
1696
1697 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1698 {
1699 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1700 eptrb = eptrb->epb_prev; /* Backup to previous group */
1701 }
1702 else saved_eptr = NULL;
1703
1704 /* If we are at the end of an assertion group, stop matching and return
1705 MATCH_MATCH, but record the current high water mark for use by positive
1706 assertions. We also need to record the match start in case it was changed
1707 by \K. */
1708
1709 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1710 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1711 {
1712 md->end_match_ptr = eptr; /* For ONCE */
1713 md->end_offset_top = offset_top;
1714 md->start_match_ptr = mstart;
1715 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1716 }
1717
1718 /* For capturing groups we have to check the group number back at the start
1719 and if necessary complete handling an extraction by setting the offsets and
1720 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1721 into group 0, so it won't be picked up here. Instead, we catch it when the
1722 OP_END is reached. Other recursion is handled here. We just have to record
1723 the current subject position and start match pointer and give a MATCH
1724 return. */
1725
1726 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1727 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1728 {
1729 number = GET2(prev, 1+LINK_SIZE);
1730 offset = number << 1;
1731
1732 #ifdef PCRE_DEBUG
1733 printf("end bracket %d", number);
1734 printf("\n");
1735 #endif
1736
1737 /* Handle a recursively called group. */
1738
1739 if (md->recursive != NULL && md->recursive->group_num == number)
1740 {
1741 md->end_match_ptr = eptr;
1742 md->start_match_ptr = mstart;
1743 RRETURN(MATCH_MATCH);
1744 }
1745
1746 /* Deal with capturing */
1747
1748 md->capture_last = number;
1749 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1750 {
1751 /* If offset is greater than offset_top, it means that we are
1752 "skipping" a capturing group, and that group's offsets must be marked
1753 unset. In earlier versions of PCRE, all the offsets were unset at the
1754 start of matching, but this doesn't work because atomic groups and
1755 assertions can cause a value to be set that should later be unset.
1756 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1757 part of the atomic group, but this is not on the final matching path,
1758 so must be unset when 2 is set. (If there is no group 2, there is no
1759 problem, because offset_top will then be 2, indicating no capture.) */
1760
1761 if (offset > offset_top)
1762 {
1763 register int *iptr = md->offset_vector + offset_top;
1764 register int *iend = md->offset_vector + offset;
1765 while (iptr < iend) *iptr++ = -1;
1766 }
1767
1768 /* Now make the extraction */
1769
1770 md->offset_vector[offset] =
1771 md->offset_vector[md->offset_end - number];
1772 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1773 if (offset_top <= offset) offset_top = offset + 2;
1774 }
1775 }
1776
1777 /* For an ordinary non-repeating ket, just continue at this level. This
1778 also happens for a repeating ket if no characters were matched in the
1779 group. This is the forcible breaking of infinite loops as implemented in
1780 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1781 processing the rest of the pattern at a lower level. If this results in a
1782 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1783 bypassing intermediate backup points, but resetting any captures that
1784 happened along the way. */
1785
1786 if (*ecode == OP_KET || eptr == saved_eptr)
1787 {
1788 if (*prev == OP_ONCE)
1789 {
1790 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1792 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1793 RRETURN(MATCH_ONCE);
1794 }
1795 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1796 break;
1797 }
1798
1799 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1800 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1801 at a time from the outer level, thus saving stack. */
1802
1803 if (*ecode == OP_KETRPOS)
1804 {
1805 md->end_match_ptr = eptr;
1806 md->end_offset_top = offset_top;
1807 RRETURN(MATCH_KETRPOS);
1808 }
1809
1810 /* The normal repeating kets try the rest of the pattern or restart from
1811 the preceding bracket, in the appropriate order. In the second case, we can
1812 use tail recursion to avoid using another stack frame, unless we have an
1813 atomic group or an unlimited repeat of a group that can match an empty
1814 string. */
1815
1816 if (*ecode == OP_KETRMIN)
1817 {
1818 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1819 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1820 if (*prev == OP_ONCE)
1821 {
1822 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1824 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1825 RRETURN(MATCH_ONCE);
1826 }
1827 if (*prev >= OP_SBRA) /* Could match an empty string */
1828 {
1829 md->match_function_type = MATCH_CBEGROUP;
1830 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1831 RRETURN(rrc);
1832 }
1833 ecode = prev;
1834 goto TAIL_RECURSE;
1835 }
1836 else /* OP_KETRMAX */
1837 {
1838 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1839 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1840 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1842 if (*prev == OP_ONCE)
1843 {
1844 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1846 md->once_target = prev;
1847 RRETURN(MATCH_ONCE);
1848 }
1849 ecode += 1 + LINK_SIZE;
1850 goto TAIL_RECURSE;
1851 }
1852 /* Control never gets here */
1853
1854 /* Not multiline mode: start of subject assertion, unless notbol. */
1855
1856 case OP_CIRC:
1857 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1858
1859 /* Start of subject assertion */
1860
1861 case OP_SOD:
1862 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1863 ecode++;
1864 break;
1865
1866 /* Multiline mode: start of subject unless notbol, or after any newline. */
1867
1868 case OP_CIRCM:
1869 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1870 if (eptr != md->start_subject &&
1871 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1872 MRRETURN(MATCH_NOMATCH);
1873 ecode++;
1874 break;
1875
1876 /* Start of match assertion */
1877
1878 case OP_SOM:
1879 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1880 ecode++;
1881 break;
1882
1883 /* Reset the start of match point */
1884
1885 case OP_SET_SOM:
1886 mstart = eptr;
1887 ecode++;
1888 break;
1889
1890 /* Multiline mode: assert before any newline, or before end of subject
1891 unless noteol is set. */
1892
1893 case OP_DOLLM:
1894 if (eptr < md->end_subject)
1895 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1896 else
1897 {
1898 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1899 SCHECK_PARTIAL();
1900 }
1901 ecode++;
1902 break;
1903
1904 /* Not multiline mode: assert before a terminating newline or before end of
1905 subject unless noteol is set. */
1906
1907 case OP_DOLL:
1908 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1909 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1910
1911 /* ... else fall through for endonly */
1912
1913 /* End of subject assertion (\z) */
1914
1915 case OP_EOD:
1916 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1917 SCHECK_PARTIAL();
1918 ecode++;
1919 break;
1920
1921 /* End of subject or ending \n assertion (\Z) */
1922
1923 case OP_EODN:
1924 ASSERT_NL_OR_EOS:
1925 if (eptr < md->end_subject &&
1926 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1927 MRRETURN(MATCH_NOMATCH);
1928
1929 /* Either at end of string or \n before end. */
1930
1931 SCHECK_PARTIAL();
1932 ecode++;
1933 break;
1934
1935 /* Word boundary assertions */
1936
1937 case OP_NOT_WORD_BOUNDARY:
1938 case OP_WORD_BOUNDARY:
1939 {
1940
1941 /* Find out if the previous and current characters are "word" characters.
1942 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1943 be "non-word" characters. Remember the earliest consulted character for
1944 partial matching. */
1945
1946 #ifdef SUPPORT_UTF8
1947 if (utf8)
1948 {
1949 /* Get status of previous character */
1950
1951 if (eptr == md->start_subject) prev_is_word = FALSE; else
1952 {
1953 USPTR lastptr = eptr - 1;
1954 while((*lastptr & 0xc0) == 0x80) lastptr--;
1955 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1956 GETCHAR(c, lastptr);
1957 #ifdef SUPPORT_UCP
1958 if (md->use_ucp)
1959 {
1960 if (c == '_') prev_is_word = TRUE; else
1961 {
1962 int cat = UCD_CATEGORY(c);
1963 prev_is_word = (cat == ucp_L || cat == ucp_N);
1964 }
1965 }
1966 else
1967 #endif
1968 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1969 }
1970
1971 /* Get status of next character */
1972
1973 if (eptr >= md->end_subject)
1974 {
1975 SCHECK_PARTIAL();
1976 cur_is_word = FALSE;
1977 }
1978 else
1979 {
1980 GETCHAR(c, eptr);
1981 #ifdef SUPPORT_UCP
1982 if (md->use_ucp)
1983 {
1984 if (c == '_') cur_is_word = TRUE; else
1985 {
1986 int cat = UCD_CATEGORY(c);
1987 cur_is_word = (cat == ucp_L || cat == ucp_N);
1988 }
1989 }
1990 else
1991 #endif
1992 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1993 }
1994 }
1995 else
1996 #endif
1997
1998 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1999 consistency with the behaviour of \w we do use it in this case. */
2000
2001 {
2002 /* Get status of previous character */
2003
2004 if (eptr == md->start_subject) prev_is_word = FALSE; else
2005 {
2006 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2007 #ifdef SUPPORT_UCP
2008 if (md->use_ucp)
2009 {
2010 c = eptr[-1];
2011 if (c == '_') prev_is_word = TRUE; else
2012 {
2013 int cat = UCD_CATEGORY(c);
2014 prev_is_word = (cat == ucp_L || cat == ucp_N);
2015 }
2016 }
2017 else
2018 #endif
2019 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2020 }
2021
2022 /* Get status of next character */
2023
2024 if (eptr >= md->end_subject)
2025 {
2026 SCHECK_PARTIAL();
2027 cur_is_word = FALSE;
2028 }
2029 else
2030 #ifdef SUPPORT_UCP
2031 if (md->use_ucp)
2032 {
2033 c = *eptr;
2034 if (c == '_') cur_is_word = TRUE; else
2035 {
2036 int cat = UCD_CATEGORY(c);
2037 cur_is_word = (cat == ucp_L || cat == ucp_N);
2038 }
2039 }
2040 else
2041 #endif
2042 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2043 }
2044
2045 /* Now see if the situation is what we want */
2046
2047 if ((*ecode++ == OP_WORD_BOUNDARY)?
2048 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2049 MRRETURN(MATCH_NOMATCH);
2050 }
2051 break;
2052
2053 /* Match a single character type; inline for speed */
2054
2055 case OP_ANY:
2056 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2057 /* Fall through */
2058
2059 case OP_ALLANY:
2060 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2061 { /* not be updated before SCHECK_PARTIAL. */
2062 SCHECK_PARTIAL();
2063 MRRETURN(MATCH_NOMATCH);
2064 }
2065 eptr++;
2066 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2067 ecode++;
2068 break;
2069
2070 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2071 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2072
2073 case OP_ANYBYTE:
2074 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2075 { /* not be updated before SCHECK_PARTIAL. */
2076 SCHECK_PARTIAL();
2077 MRRETURN(MATCH_NOMATCH);
2078 }
2079 eptr++;
2080 ecode++;
2081 break;
2082
2083 case OP_NOT_DIGIT:
2084 if (eptr >= md->end_subject)
2085 {
2086 SCHECK_PARTIAL();
2087 MRRETURN(MATCH_NOMATCH);
2088 }
2089 GETCHARINCTEST(c, eptr);
2090 if (
2091 #ifdef SUPPORT_UTF8
2092 c < 256 &&
2093 #endif
2094 (md->ctypes[c] & ctype_digit) != 0
2095 )
2096 MRRETURN(MATCH_NOMATCH);
2097 ecode++;
2098 break;
2099
2100 case OP_DIGIT:
2101 if (eptr >= md->end_subject)
2102 {
2103 SCHECK_PARTIAL();
2104 MRRETURN(MATCH_NOMATCH);
2105 }
2106 GETCHARINCTEST(c, eptr);
2107 if (
2108 #ifdef SUPPORT_UTF8
2109 c >= 256 ||
2110 #endif
2111 (md->ctypes[c] & ctype_digit) == 0
2112 )
2113 MRRETURN(MATCH_NOMATCH);
2114 ecode++;
2115 break;
2116
2117 case OP_NOT_WHITESPACE:
2118 if (eptr >= md->end_subject)
2119 {
2120 SCHECK_PARTIAL();
2121 MRRETURN(MATCH_NOMATCH);
2122 }
2123 GETCHARINCTEST(c, eptr);
2124 if (
2125 #ifdef SUPPORT_UTF8
2126 c < 256 &&
2127 #endif
2128 (md->ctypes[c] & ctype_space) != 0
2129 )
2130 MRRETURN(MATCH_NOMATCH);
2131 ecode++;
2132 break;
2133
2134 case OP_WHITESPACE:
2135 if (eptr >= md->end_subject)
2136 {
2137 SCHECK_PARTIAL();
2138 MRRETURN(MATCH_NOMATCH);
2139 }
2140 GETCHARINCTEST(c, eptr);
2141 if (
2142 #ifdef SUPPORT_UTF8
2143 c >= 256 ||
2144 #endif
2145 (md->ctypes[c] & ctype_space) == 0
2146 )
2147 MRRETURN(MATCH_NOMATCH);
2148 ecode++;
2149 break;
2150
2151 case OP_NOT_WORDCHAR:
2152 if (eptr >= md->end_subject)
2153 {
2154 SCHECK_PARTIAL();
2155 MRRETURN(MATCH_NOMATCH);
2156 }
2157 GETCHARINCTEST(c, eptr);
2158 if (
2159 #ifdef SUPPORT_UTF8
2160 c < 256 &&
2161 #endif
2162 (md->ctypes[c] & ctype_word) != 0
2163 )
2164 MRRETURN(MATCH_NOMATCH);
2165 ecode++;
2166 break;
2167
2168 case OP_WORDCHAR:
2169 if (eptr >= md->end_subject)
2170 {
2171 SCHECK_PARTIAL();
2172 MRRETURN(MATCH_NOMATCH);
2173 }
2174 GETCHARINCTEST(c, eptr);
2175 if (
2176 #ifdef SUPPORT_UTF8
2177 c >= 256 ||
2178 #endif
2179 (md->ctypes[c] & ctype_word) == 0
2180 )
2181 MRRETURN(MATCH_NOMATCH);
2182 ecode++;
2183 break;
2184
2185 case OP_ANYNL:
2186 if (eptr >= md->end_subject)
2187 {
2188 SCHECK_PARTIAL();
2189 MRRETURN(MATCH_NOMATCH);
2190 }
2191 GETCHARINCTEST(c, eptr);
2192 switch(c)
2193 {
2194 default: MRRETURN(MATCH_NOMATCH);
2195
2196 case 0x000d:
2197 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2198 break;
2199
2200 case 0x000a:
2201 break;
2202
2203 case 0x000b:
2204 case 0x000c:
2205 case 0x0085:
2206 case 0x2028:
2207 case 0x2029:
2208 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2209 break;
2210 }
2211 ecode++;
2212 break;
2213
2214 case OP_NOT_HSPACE:
2215 if (eptr >= md->end_subject)
2216 {
2217 SCHECK_PARTIAL();
2218 MRRETURN(MATCH_NOMATCH);
2219 }
2220 GETCHARINCTEST(c, eptr);
2221 switch(c)
2222 {
2223 default: break;
2224 case 0x09: /* HT */
2225 case 0x20: /* SPACE */
2226 case 0xa0: /* NBSP */
2227 case 0x1680: /* OGHAM SPACE MARK */
2228 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2229 case 0x2000: /* EN QUAD */
2230 case 0x2001: /* EM QUAD */
2231 case 0x2002: /* EN SPACE */
2232 case 0x2003: /* EM SPACE */
2233 case 0x2004: /* THREE-PER-EM SPACE */
2234 case 0x2005: /* FOUR-PER-EM SPACE */
2235 case 0x2006: /* SIX-PER-EM SPACE */
2236 case 0x2007: /* FIGURE SPACE */
2237 case 0x2008: /* PUNCTUATION SPACE */
2238 case 0x2009: /* THIN SPACE */
2239 case 0x200A: /* HAIR SPACE */
2240 case 0x202f: /* NARROW NO-BREAK SPACE */
2241 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2242 case 0x3000: /* IDEOGRAPHIC SPACE */
2243 MRRETURN(MATCH_NOMATCH);
2244 }
2245 ecode++;
2246 break;
2247
2248 case OP_HSPACE:
2249 if (eptr >= md->end_subject)
2250 {
2251 SCHECK_PARTIAL();
2252 MRRETURN(MATCH_NOMATCH);
2253 }
2254 GETCHARINCTEST(c, eptr);
2255 switch(c)
2256 {
2257 default: MRRETURN(MATCH_NOMATCH);
2258 case 0x09: /* HT */
2259 case 0x20: /* SPACE */
2260 case 0xa0: /* NBSP */
2261 case 0x1680: /* OGHAM SPACE MARK */
2262 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2263 case 0x2000: /* EN QUAD */
2264 case 0x2001: /* EM QUAD */
2265 case 0x2002: /* EN SPACE */
2266 case 0x2003: /* EM SPACE */
2267 case 0x2004: /* THREE-PER-EM SPACE */
2268 case 0x2005: /* FOUR-PER-EM SPACE */
2269 case 0x2006: /* SIX-PER-EM SPACE */
2270 case 0x2007: /* FIGURE SPACE */
2271 case 0x2008: /* PUNCTUATION SPACE */
2272 case 0x2009: /* THIN SPACE */
2273 case 0x200A: /* HAIR SPACE */
2274 case 0x202f: /* NARROW NO-BREAK SPACE */
2275 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2276 case 0x3000: /* IDEOGRAPHIC SPACE */
2277 break;
2278 }
2279 ecode++;
2280 break;
2281
2282 case OP_NOT_VSPACE:
2283 if (eptr >= md->end_subject)
2284 {
2285 SCHECK_PARTIAL();
2286 MRRETURN(MATCH_NOMATCH);
2287 }
2288 GETCHARINCTEST(c, eptr);
2289 switch(c)
2290 {
2291 default: break;
2292 case 0x0a: /* LF */
2293 case 0x0b: /* VT */
2294 case 0x0c: /* FF */
2295 case 0x0d: /* CR */
2296 case 0x85: /* NEL */
2297 case 0x2028: /* LINE SEPARATOR */
2298 case 0x2029: /* PARAGRAPH SEPARATOR */
2299 MRRETURN(MATCH_NOMATCH);
2300 }
2301 ecode++;
2302 break;
2303
2304 case OP_VSPACE:
2305 if (eptr >= md->end_subject)
2306 {
2307 SCHECK_PARTIAL();
2308 MRRETURN(MATCH_NOMATCH);
2309 }
2310 GETCHARINCTEST(c, eptr);
2311 switch(c)
2312 {
2313 default: MRRETURN(MATCH_NOMATCH);
2314 case 0x0a: /* LF */
2315 case 0x0b: /* VT */
2316 case 0x0c: /* FF */
2317 case 0x0d: /* CR */
2318 case 0x85: /* NEL */
2319 case 0x2028: /* LINE SEPARATOR */
2320 case 0x2029: /* PARAGRAPH SEPARATOR */
2321 break;
2322 }
2323 ecode++;
2324 break;
2325
2326 #ifdef SUPPORT_UCP
2327 /* Check the next character by Unicode property. We will get here only
2328 if the support is in the binary; otherwise a compile-time error occurs. */
2329
2330 case OP_PROP:
2331 case OP_NOTPROP:
2332 if (eptr >= md->end_subject)
2333 {
2334 SCHECK_PARTIAL();
2335 MRRETURN(MATCH_NOMATCH);
2336 }
2337 GETCHARINCTEST(c, eptr);
2338 {
2339 const ucd_record *prop = GET_UCD(c);
2340
2341 switch(ecode[1])
2342 {
2343 case PT_ANY:
2344 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2345 break;
2346
2347 case PT_LAMP:
2348 if ((prop->chartype == ucp_Lu ||
2349 prop->chartype == ucp_Ll ||
2350 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2351 MRRETURN(MATCH_NOMATCH);
2352 break;
2353
2354 case PT_GC:
2355 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2356 MRRETURN(MATCH_NOMATCH);
2357 break;
2358
2359 case PT_PC:
2360 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2361 MRRETURN(MATCH_NOMATCH);
2362 break;
2363
2364 case PT_SC:
2365 if ((ecode[2] != prop->script) == (op == OP_PROP))
2366 MRRETURN(MATCH_NOMATCH);
2367 break;
2368
2369 /* These are specials */
2370
2371 case PT_ALNUM:
2372 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2373 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2374 MRRETURN(MATCH_NOMATCH);
2375 break;
2376
2377 case PT_SPACE: /* Perl space */
2378 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2379 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2380 == (op == OP_NOTPROP))
2381 MRRETURN(MATCH_NOMATCH);
2382 break;
2383
2384 case PT_PXSPACE: /* POSIX space */
2385 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2386 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2387 c == CHAR_FF || c == CHAR_CR)
2388 == (op == OP_NOTPROP))
2389 MRRETURN(MATCH_NOMATCH);
2390 break;
2391
2392 case PT_WORD:
2393 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2394 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2395 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2396 MRRETURN(MATCH_NOMATCH);
2397 break;
2398
2399 /* This should never occur */
2400
2401 default:
2402 RRETURN(PCRE_ERROR_INTERNAL);
2403 }
2404
2405 ecode += 3;
2406 }
2407 break;
2408
2409 /* Match an extended Unicode sequence. We will get here only if the support
2410 is in the binary; otherwise a compile-time error occurs. */
2411
2412 case OP_EXTUNI:
2413 if (eptr >= md->end_subject)
2414 {
2415 SCHECK_PARTIAL();
2416 MRRETURN(MATCH_NOMATCH);
2417 }
2418 GETCHARINCTEST(c, eptr);
2419 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2420 while (eptr < md->end_subject)
2421 {
2422 int len = 1;
2423 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2424 if (UCD_CATEGORY(c) != ucp_M) break;
2425 eptr += len;
2426 }
2427 ecode++;
2428 break;
2429 #endif
2430
2431
2432 /* Match a back reference, possibly repeatedly. Look past the end of the
2433 item to see if there is repeat information following. The code is similar
2434 to that for character classes, but repeated for efficiency. Then obey
2435 similar code to character type repeats - written out again for speed.
2436 However, if the referenced string is the empty string, always treat
2437 it as matched, any number of times (otherwise there could be infinite
2438 loops). */
2439
2440 case OP_REF:
2441 case OP_REFI:
2442 caseless = op == OP_REFI;
2443 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2444 ecode += 3;
2445
2446 /* If the reference is unset, there are two possibilities:
2447
2448 (a) In the default, Perl-compatible state, set the length negative;
2449 this ensures that every attempt at a match fails. We can't just fail
2450 here, because of the possibility of quantifiers with zero minima.
2451
2452 (b) If the JavaScript compatibility flag is set, set the length to zero
2453 so that the back reference matches an empty string.
2454
2455 Otherwise, set the length to the length of what was matched by the
2456 referenced subpattern. */
2457
2458 if (offset >= offset_top || md->offset_vector[offset] < 0)
2459 length = (md->jscript_compat)? 0 : -1;
2460 else
2461 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2462
2463 /* Set up for repetition, or handle the non-repeated case */
2464
2465 switch (*ecode)
2466 {
2467 case OP_CRSTAR:
2468 case OP_CRMINSTAR:
2469 case OP_CRPLUS:
2470 case OP_CRMINPLUS:
2471 case OP_CRQUERY:
2472 case OP_CRMINQUERY:
2473 c = *ecode++ - OP_CRSTAR;
2474 minimize = (c & 1) != 0;
2475 min = rep_min[c]; /* Pick up values from tables; */
2476 max = rep_max[c]; /* zero for max => infinity */
2477 if (max == 0) max = INT_MAX;
2478 break;
2479
2480 case OP_CRRANGE:
2481 case OP_CRMINRANGE:
2482 minimize = (*ecode == OP_CRMINRANGE);
2483 min = GET2(ecode, 1);
2484 max = GET2(ecode, 3);
2485 if (max == 0) max = INT_MAX;
2486 ecode += 5;
2487 break;
2488
2489 default: /* No repeat follows */
2490 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2491 {
2492 CHECK_PARTIAL();
2493 MRRETURN(MATCH_NOMATCH);
2494 }
2495 eptr += length;
2496 continue; /* With the main loop */
2497 }
2498
2499 /* Handle repeated back references. If the length of the reference is
2500 zero, just continue with the main loop. */
2501
2502 if (length == 0) continue;
2503
2504 /* First, ensure the minimum number of matches are present. We get back
2505 the length of the reference string explicitly rather than passing the
2506 address of eptr, so that eptr can be a register variable. */
2507
2508 for (i = 1; i <= min; i++)
2509 {
2510 int slength;
2511 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2512 {
2513 CHECK_PARTIAL();
2514 MRRETURN(MATCH_NOMATCH);
2515 }
2516 eptr += slength;
2517 }
2518
2519 /* If min = max, continue at the same level without recursion.
2520 They are not both allowed to be zero. */
2521
2522 if (min == max) continue;
2523
2524 /* If minimizing, keep trying and advancing the pointer */
2525
2526 if (minimize)
2527 {
2528 for (fi = min;; fi++)
2529 {
2530 int slength;
2531 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2532 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2533 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2534 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2535 {
2536 CHECK_PARTIAL();
2537 MRRETURN(MATCH_NOMATCH);
2538 }
2539 eptr += slength;
2540 }
2541 /* Control never gets here */
2542 }
2543
2544 /* If maximizing, find the longest string and work backwards */
2545
2546 else
2547 {
2548 pp = eptr;
2549 for (i = min; i < max; i++)
2550 {
2551 int slength;
2552 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2553 {
2554 CHECK_PARTIAL();
2555 break;
2556 }
2557 eptr += slength;
2558 }
2559 while (eptr >= pp)
2560 {
2561 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2562 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2563 eptr -= length;
2564 }
2565 MRRETURN(MATCH_NOMATCH);
2566 }
2567 /* Control never gets here */
2568
2569 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2570 used when all the characters in the class have values in the range 0-255,
2571 and either the matching is caseful, or the characters are in the range
2572 0-127 when UTF-8 processing is enabled. The only difference between
2573 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2574 encountered.
2575
2576 First, look past the end of the item to see if there is repeat information
2577 following. Then obey similar code to character type repeats - written out
2578 again for speed. */
2579
2580 case OP_NCLASS:
2581 case OP_CLASS:
2582 {
2583 data = ecode + 1; /* Save for matching */
2584 ecode += 33; /* Advance past the item */
2585
2586 switch (*ecode)
2587 {
2588 case OP_CRSTAR:
2589 case OP_CRMINSTAR:
2590 case OP_CRPLUS:
2591 case OP_CRMINPLUS:
2592 case OP_CRQUERY:
2593 case OP_CRMINQUERY:
2594 c = *ecode++ - OP_CRSTAR;
2595 minimize = (c & 1) != 0;
2596 min = rep_min[c]; /* Pick up values from tables; */
2597 max = rep_max[c]; /* zero for max => infinity */
2598 if (max == 0) max = INT_MAX;
2599 break;
2600
2601 case OP_CRRANGE:
2602 case OP_CRMINRANGE:
2603 minimize = (*ecode == OP_CRMINRANGE);
2604 min = GET2(ecode, 1);
2605 max = GET2(ecode, 3);
2606 if (max == 0) max = INT_MAX;
2607 ecode += 5;
2608 break;
2609
2610 default: /* No repeat follows */
2611 min = max = 1;
2612 break;
2613 }
2614
2615 /* First, ensure the minimum number of matches are present. */
2616
2617 #ifdef SUPPORT_UTF8
2618 /* UTF-8 mode */
2619 if (utf8)
2620 {
2621 for (i = 1; i <= min; i++)
2622 {
2623 if (eptr >= md->end_subject)
2624 {
2625 SCHECK_PARTIAL();
2626 MRRETURN(MATCH_NOMATCH);
2627 }
2628 GETCHARINC(c, eptr);
2629 if (c > 255)
2630 {
2631 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2632 }
2633 else
2634 {
2635 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2636 }
2637 }
2638 }
2639 else
2640 #endif
2641 /* Not UTF-8 mode */
2642 {
2643 for (i = 1; i <= min; i++)
2644 {
2645 if (eptr >= md->end_subject)
2646 {
2647 SCHECK_PARTIAL();
2648 MRRETURN(MATCH_NOMATCH);
2649 }
2650 c = *eptr++;
2651 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2652 }
2653 }
2654
2655 /* If max == min we can continue with the main loop without the
2656 need to recurse. */
2657
2658 if (min == max) continue;
2659
2660 /* If minimizing, keep testing the rest of the expression and advancing
2661 the pointer while it matches the class. */
2662
2663 if (minimize)
2664 {
2665 #ifdef SUPPORT_UTF8
2666 /* UTF-8 mode */
2667 if (utf8)
2668 {
2669 for (fi = min;; fi++)
2670 {
2671 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2672 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2673 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2674 if (eptr >= md->end_subject)
2675 {
2676 SCHECK_PARTIAL();
2677 MRRETURN(MATCH_NOMATCH);
2678 }
2679 GETCHARINC(c, eptr);
2680 if (c > 255)
2681 {
2682 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2683 }
2684 else
2685 {
2686 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2687 }
2688 }
2689 }
2690 else
2691 #endif
2692 /* Not UTF-8 mode */
2693 {
2694 for (fi = min;; fi++)
2695 {
2696 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2697 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2698 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2699 if (eptr >= md->end_subject)
2700 {
2701 SCHECK_PARTIAL();
2702 MRRETURN(MATCH_NOMATCH);
2703 }
2704 c = *eptr++;
2705 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2706 }
2707 }
2708 /* Control never gets here */
2709 }
2710
2711 /* If maximizing, find the longest possible run, then work backwards. */
2712
2713 else
2714 {
2715 pp = eptr;
2716
2717 #ifdef SUPPORT_UTF8
2718 /* UTF-8 mode */
2719 if (utf8)
2720 {
2721 for (i = min; i < max; i++)
2722 {
2723 int len = 1;
2724 if (eptr >= md->end_subject)
2725 {
2726 SCHECK_PARTIAL();
2727 break;
2728 }
2729 GETCHARLEN(c, eptr, len);
2730 if (c > 255)
2731 {
2732 if (op == OP_CLASS) break;
2733 }
2734 else
2735 {
2736 if ((data[c/8] & (1 << (c&7))) == 0) break;
2737 }
2738 eptr += len;
2739 }
2740 for (;;)
2741 {
2742 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2743 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2744 if (eptr-- == pp) break; /* Stop if tried at original pos */
2745 BACKCHAR(eptr);
2746 }
2747 }
2748 else
2749 #endif
2750 /* Not UTF-8 mode */
2751 {
2752 for (i = min; i < max; i++)
2753 {
2754 if (eptr >= md->end_subject)
2755 {
2756 SCHECK_PARTIAL();
2757 break;
2758 }
2759 c = *eptr;
2760 if ((data[c/8] & (1 << (c&7))) == 0) break;
2761 eptr++;
2762 }
2763 while (eptr >= pp)
2764 {
2765 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2766 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2767 eptr--;
2768 }
2769 }
2770
2771 MRRETURN(MATCH_NOMATCH);
2772 }
2773 }
2774 /* Control never gets here */
2775
2776
2777 /* Match an extended character class. This opcode is encountered only
2778 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2779 mode, because Unicode properties are supported in non-UTF-8 mode. */
2780
2781 #ifdef SUPPORT_UTF8
2782 case OP_XCLASS:
2783 {
2784 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2785 ecode += GET(ecode, 1); /* Advance past the item */
2786
2787 switch (*ecode)
2788 {
2789 case OP_CRSTAR:
2790 case OP_CRMINSTAR:
2791 case OP_CRPLUS:
2792 case OP_CRMINPLUS:
2793 case OP_CRQUERY:
2794 case OP_CRMINQUERY:
2795 c = *ecode++ - OP_CRSTAR;
2796 minimize = (c & 1) != 0;
2797 min = rep_min[c]; /* Pick up values from tables; */
2798 max = rep_max[c]; /* zero for max => infinity */
2799 if (max == 0) max = INT_MAX;
2800 break;
2801
2802 case OP_CRRANGE:
2803 case OP_CRMINRANGE:
2804 minimize = (*ecode == OP_CRMINRANGE);
2805 min = GET2(ecode, 1);
2806 max = GET2(ecode, 3);
2807 if (max == 0) max = INT_MAX;
2808 ecode += 5;
2809 break;
2810
2811 default: /* No repeat follows */
2812 min = max = 1;
2813 break;
2814 }
2815
2816 /* First, ensure the minimum number of matches are present. */
2817
2818 for (i = 1; i <= min; i++)
2819 {
2820 if (eptr >= md->end_subject)
2821 {
2822 SCHECK_PARTIAL();
2823 MRRETURN(MATCH_NOMATCH);
2824 }
2825 GETCHARINCTEST(c, eptr);
2826 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2827 }
2828
2829 /* If max == min we can continue with the main loop without the
2830 need to recurse. */
2831
2832 if (min == max) continue;
2833
2834 /* If minimizing, keep testing the rest of the expression and advancing
2835 the pointer while it matches the class. */
2836
2837 if (minimize)
2838 {
2839 for (fi = min;; fi++)
2840 {
2841 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2843 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2844 if (eptr >= md->end_subject)
2845 {
2846 SCHECK_PARTIAL();
2847 MRRETURN(MATCH_NOMATCH);
2848 }
2849 GETCHARINCTEST(c, eptr);
2850 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2851 }
2852 /* Control never gets here */
2853 }
2854
2855 /* If maximizing, find the longest possible run, then work backwards. */
2856
2857 else
2858 {
2859 pp = eptr;
2860 for (i = min; i < max; i++)
2861 {
2862 int len = 1;
2863 if (eptr >= md->end_subject)
2864 {
2865 SCHECK_PARTIAL();
2866 break;
2867 }
2868 GETCHARLENTEST(c, eptr, len);
2869 if (!_pcre_xclass(c, data)) break;
2870 eptr += len;
2871 }
2872 for(;;)
2873 {
2874 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2875 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2876 if (eptr-- == pp) break; /* Stop if tried at original pos */
2877 if (utf8) BACKCHAR(eptr);
2878 }
2879 MRRETURN(MATCH_NOMATCH);
2880 }
2881
2882 /* Control never gets here */
2883 }
2884 #endif /* End of XCLASS */
2885
2886 /* Match a single character, casefully */
2887
2888 case OP_CHAR:
2889 #ifdef SUPPORT_UTF8
2890 if (utf8)
2891 {
2892 length = 1;
2893 ecode++;
2894 GETCHARLEN(fc, ecode, length);
2895 if (length > md->end_subject - eptr)
2896 {
2897 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2898 MRRETURN(MATCH_NOMATCH);
2899 }
2900 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2901 }
2902 else
2903 #endif
2904
2905 /* Non-UTF-8 mode */
2906 {
2907 if (md->end_subject - eptr < 1)
2908 {
2909 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2910 MRRETURN(MATCH_NOMATCH);
2911 }
2912 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2913 ecode += 2;
2914 }
2915 break;
2916
2917 /* Match a single character, caselessly */
2918
2919 case OP_CHARI:
2920 #ifdef SUPPORT_UTF8
2921 if (utf8)
2922 {
2923 length = 1;
2924 ecode++;
2925 GETCHARLEN(fc, ecode, length);
2926
2927 if (length > md->end_subject - eptr)
2928 {
2929 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2930 MRRETURN(MATCH_NOMATCH);
2931 }
2932
2933 /* If the pattern character's value is < 128, we have only one byte, and
2934 can use the fast lookup table. */
2935
2936 if (fc < 128)
2937 {
2938 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2939 }
2940
2941 /* Otherwise we must pick up the subject character */
2942
2943 else
2944 {
2945 unsigned int dc;
2946 GETCHARINC(dc, eptr);
2947 ecode += length;
2948
2949 /* If we have Unicode property support, we can use it to test the other
2950 case of the character, if there is one. */
2951
2952 if (fc != dc)
2953 {
2954 #ifdef SUPPORT_UCP
2955 if (dc != UCD_OTHERCASE(fc))
2956 #endif
2957 MRRETURN(MATCH_NOMATCH);
2958 }
2959 }
2960 }
2961 else
2962 #endif /* SUPPORT_UTF8 */
2963
2964 /* Non-UTF-8 mode */
2965 {
2966 if (md->end_subject - eptr < 1)
2967 {
2968 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2969 MRRETURN(MATCH_NOMATCH);
2970 }
2971 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2972 ecode += 2;
2973 }
2974 break;
2975
2976 /* Match a single character repeatedly. */
2977
2978 case OP_EXACT:
2979 case OP_EXACTI:
2980 min = max = GET2(ecode, 1);
2981 ecode += 3;
2982 goto REPEATCHAR;
2983
2984 case OP_POSUPTO:
2985 case OP_POSUPTOI:
2986 possessive = TRUE;
2987 /* Fall through */
2988
2989 case OP_UPTO:
2990 case OP_UPTOI:
2991 case OP_MINUPTO:
2992 case OP_MINUPTOI:
2993 min = 0;
2994 max = GET2(ecode, 1);
2995 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2996 ecode += 3;
2997 goto REPEATCHAR;
2998
2999 case OP_POSSTAR:
3000 case OP_POSSTARI:
3001 possessive = TRUE;
3002 min = 0;
3003 max = INT_MAX;
3004 ecode++;
3005 goto REPEATCHAR;
3006
3007 case OP_POSPLUS:
3008 case OP_POSPLUSI:
3009 possessive = TRUE;
3010 min = 1;
3011 max = INT_MAX;
3012 ecode++;
3013 goto REPEATCHAR;
3014
3015 case OP_POSQUERY:
3016 case OP_POSQUERYI:
3017 possessive = TRUE;
3018 min = 0;
3019 max = 1;
3020 ecode++;
3021 goto REPEATCHAR;
3022
3023 case OP_STAR:
3024 case OP_STARI:
3025 case OP_MINSTAR:
3026 case OP_MINSTARI:
3027 case OP_PLUS:
3028 case OP_PLUSI:
3029 case OP_MINPLUS:
3030 case OP_MINPLUSI:
3031 case OP_QUERY:
3032 case OP_QUERYI:
3033 case OP_MINQUERY:
3034 case OP_MINQUERYI:
3035 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3036 minimize = (c & 1) != 0;
3037 min = rep_min[c]; /* Pick up values from tables; */
3038 max = rep_max[c]; /* zero for max => infinity */
3039 if (max == 0) max = INT_MAX;
3040
3041 /* Common code for all repeated single-character matches. */
3042
3043 REPEATCHAR:
3044 #ifdef SUPPORT_UTF8
3045 if (utf8)
3046 {
3047 length = 1;
3048 charptr = ecode;
3049 GETCHARLEN(fc, ecode, length);
3050 ecode += length;
3051
3052 /* Handle multibyte character matching specially here. There is
3053 support for caseless matching if UCP support is present. */
3054
3055 if (length > 1)
3056 {
3057 #ifdef SUPPORT_UCP
3058 unsigned int othercase;
3059 if (op >= OP_STARI && /* Caseless */
3060 (othercase = UCD_OTHERCASE(fc)) != fc)
3061 oclength = _pcre_ord2utf8(othercase, occhars);
3062 else oclength = 0;
3063 #endif /* SUPPORT_UCP */
3064
3065 for (i = 1; i <= min; i++)
3066 {
3067 if (eptr <= md->end_subject - length &&
3068 memcmp(eptr, charptr, length) == 0) eptr += length;
3069 #ifdef SUPPORT_UCP
3070 else if (oclength > 0 &&
3071 eptr <= md->end_subject - oclength &&
3072 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3073 #endif /* SUPPORT_UCP */
3074 else
3075 {
3076 CHECK_PARTIAL();
3077 MRRETURN(MATCH_NOMATCH);
3078 }
3079 }
3080
3081 if (min == max) continue;
3082
3083 if (minimize)
3084 {
3085 for (fi = min;; fi++)
3086 {
3087 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3088 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3089 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3090 if (eptr <= md->end_subject - length &&
3091 memcmp(eptr, charptr, length) == 0) eptr += length;
3092 #ifdef SUPPORT_UCP
3093 else if (oclength > 0 &&
3094 eptr <= md->end_subject - oclength &&
3095 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3096 #endif /* SUPPORT_UCP */
3097 else
3098 {
3099 CHECK_PARTIAL();
3100 MRRETURN(MATCH_NOMATCH);
3101 }
3102 }
3103 /* Control never gets here */
3104 }
3105
3106 else /* Maximize */
3107 {
3108 pp = eptr;
3109 for (i = min; i < max; i++)
3110 {
3111 if (eptr <= md->end_subject - length &&
3112 memcmp(eptr, charptr, length) == 0) eptr += length;
3113 #ifdef SUPPORT_UCP
3114 else if (oclength > 0 &&
3115 eptr <= md->end_subject - oclength &&
3116 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3117 #endif /* SUPPORT_UCP */
3118 else
3119 {
3120 CHECK_PARTIAL();
3121 break;
3122 }
3123 }
3124
3125 if (possessive) continue;
3126
3127 for(;;)
3128 {
3129 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3130 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3131 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3132 #ifdef SUPPORT_UCP
3133 eptr--;
3134 BACKCHAR(eptr);
3135 #else /* without SUPPORT_UCP */
3136 eptr -= length;
3137 #endif /* SUPPORT_UCP */
3138 }
3139 }
3140 /* Control never gets here */
3141 }
3142
3143 /* If the length of a UTF-8 character is 1, we fall through here, and
3144 obey the code as for non-UTF-8 characters below, though in this case the
3145 value of fc will always be < 128. */
3146 }
3147 else
3148 #endif /* SUPPORT_UTF8 */
3149
3150 /* When not in UTF-8 mode, load a single-byte character. */
3151
3152 fc = *ecode++;
3153
3154 /* The value of fc at this point is always less than 256, though we may or
3155 may not be in UTF-8 mode. The code is duplicated for the caseless and
3156 caseful cases, for speed, since matching characters is likely to be quite
3157 common. First, ensure the minimum number of matches are present. If min =
3158 max, continue at the same level without recursing. Otherwise, if
3159 minimizing, keep trying the rest of the expression and advancing one
3160 matching character if failing, up to the maximum. Alternatively, if
3161 maximizing, find the maximum number of characters and work backwards. */
3162
3163 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3164 max, eptr));
3165
3166 if (op >= OP_STARI) /* Caseless */
3167 {
3168 fc = md->lcc[fc];
3169 for (i = 1; i <= min; i++)
3170 {
3171 if (eptr >= md->end_subject)
3172 {
3173 SCHECK_PARTIAL();
3174 MRRETURN(MATCH_NOMATCH);
3175 }
3176 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3177 }
3178 if (min == max) continue;
3179 if (minimize)
3180 {
3181 for (fi = min;; fi++)
3182 {
3183 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3184 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3185 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3186 if (eptr >= md->end_subject)
3187 {
3188 SCHECK_PARTIAL();
3189 MRRETURN(MATCH_NOMATCH);
3190 }
3191 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3192 }
3193 /* Control never gets here */
3194 }
3195 else /* Maximize */
3196 {
3197 pp = eptr;
3198 for (i = min; i < max; i++)
3199 {
3200 if (eptr >= md->end_subject)
3201 {
3202 SCHECK_PARTIAL();
3203 break;
3204 }
3205 if (fc != md->lcc[*eptr]) break;
3206 eptr++;
3207 }
3208
3209 if (possessive) continue;
3210
3211 while (eptr >= pp)
3212 {
3213 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3214 eptr--;
3215 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3216 }
3217 MRRETURN(MATCH_NOMATCH);
3218 }
3219 /* Control never gets here */
3220 }
3221
3222 /* Caseful comparisons (single-byte characters only; multibyte characters were fully handled above) */
3223
3224 else
3225 {
3226 for (i = 1; i <= min; i++)
3227 {
3228 if (eptr >= md->end_subject)
3229 {
3230 SCHECK_PARTIAL();
3231 MRRETURN(MATCH_NOMATCH);
3232 }
3233 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3234 }
3235
3236 if (min == max) continue;
3237
3238 if (minimize)
3239 {
3240 for (fi = min;; fi++)
3241 {
3242 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3244 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3245 if (eptr >= md->end_subject)
3246 {
3247 SCHECK_PARTIAL();
3248 MRRETURN(MATCH_NOMATCH);
3249 }
3250 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3251 }
3252 /* Control never gets here */
3253 }
3254 else /* Maximize */
3255 {
3256 pp = eptr;
3257 for (i = min; i < max; i++)
3258 {
3259 if (eptr >= md->end_subject)
3260 {
3261 SCHECK_PARTIAL();
3262 break;
3263 }
3264 if (fc != *eptr) break;
3265 eptr++;
3266 }
3267 if (possessive) continue;
3268
3269 while (eptr >= pp)
3270 {
3271 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3272 eptr--;
3273 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3274 }
3275 MRRETURN(MATCH_NOMATCH);
3276 }
3277 }
3278 /* Control never gets here */
3279
3280 /* Match a negated single one-byte character. The character we are
3281 checking can be multibyte. */
3282
3283 case OP_NOT:
3284 case OP_NOTI:
3285 if (eptr >= md->end_subject)
3286 {
3287 SCHECK_PARTIAL();
3288 MRRETURN(MATCH_NOMATCH);
3289 }
3290 ecode++;
3291 GETCHARINCTEST(c, eptr);
3292 if (op == OP_NOTI) /* The caseless case */
3293 {
3294 #ifdef SUPPORT_UTF8
3295 if (c < 256)
3296 #endif
3297 c = md->lcc[c];
3298 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3299 }
3300 else /* Caseful */
3301 {
3302 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3303 }
3304 break;
3305
3306 /* Match a negated single one-byte character repeatedly. This is almost a
3307 repeat of the code for a repeated single character, but I haven't found a
3308 nice way of commoning these up that doesn't require a test of the
3309 positive/negative option for each character match. Maybe that wouldn't add
3310 very much to the time taken, but character matching *is* what this is all
3311 about... */
3312
3313 case OP_NOTEXACT:
3314 case OP_NOTEXACTI:
3315 min = max = GET2(ecode, 1);
3316 ecode += 3;
3317 goto REPEATNOTCHAR;
3318
3319 case OP_NOTUPTO:
3320 case OP_NOTUPTOI:
3321 case OP_NOTMINUPTO:
3322 case OP_NOTMINUPTOI:
3323 min = 0;
3324 max = GET2(ecode, 1);
3325 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3326 ecode += 3;
3327 goto REPEATNOTCHAR;
3328
3329 case OP_NOTPOSSTAR:
3330 case OP_NOTPOSSTARI:
3331 possessive = TRUE;
3332 min = 0;
3333 max = INT_MAX;
3334 ecode++;
3335 goto REPEATNOTCHAR;
3336
3337 case OP_NOTPOSPLUS:
3338 case OP_NOTPOSPLUSI:
3339 possessive = TRUE;
3340 min = 1;
3341 max = INT_MAX;
3342 ecode++;
3343 goto REPEATNOTCHAR;
3344
3345 case OP_NOTPOSQUERY:
3346 case OP_NOTPOSQUERYI:
3347 possessive = TRUE;
3348 min = 0;
3349 max = 1;
3350 ecode++;
3351 goto REPEATNOTCHAR;
3352
3353 case OP_NOTPOSUPTO:
3354 case OP_NOTPOSUPTOI:
3355 possessive = TRUE;
3356 min = 0;
3357 max = GET2(ecode, 1);
3358 ecode += 3;
3359 goto REPEATNOTCHAR;
3360
3361 case OP_NOTSTAR:
3362 case OP_NOTSTARI:
3363 case OP_NOTMINSTAR:
3364 case OP_NOTMINSTARI:
3365 case OP_NOTPLUS:
3366 case OP_NOTPLUSI:
3367 case OP_NOTMINPLUS:
3368 case OP_NOTMINPLUSI:
3369 case OP_NOTQUERY:
3370 case OP_NOTQUERYI:
3371 case OP_NOTMINQUERY:
3372 case OP_NOTMINQUERYI:
3373 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3374 minimize = (c & 1) != 0;
3375 min = rep_min[c]; /* Pick up values from tables; */
3376 max = rep_max[c]; /* zero for max => infinity */
3377 if (max == 0) max = INT_MAX;
3378
3379 /* Common code for all repeated negated single-byte character matches. */
3380
3381 REPEATNOTCHAR:
3382 fc = *ecode++;
3383
3384 /* The code is duplicated for the caseless and caseful cases, for speed,
3385 since matching characters is likely to be quite common. First, ensure the
3386 minimum number of matches are present. If min = max, continue at the same
3387 level without recursing. Otherwise, if minimizing, keep trying the rest of
3388 the expression and advancing one matching character if failing, up to the
3389 maximum. Alternatively, if maximizing, find the maximum number of
3390 characters and work backwards. */
3391
3392 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3393 max, eptr));
3394
3395 if (op >= OP_NOTSTARI) /* Caseless */
3396 {
3397 fc = md->lcc[fc];
3398
3399 #ifdef SUPPORT_UTF8
3400 /* UTF-8 mode */
3401 if (utf8)
3402 {
3403 register unsigned int d;
3404 for (i = 1; i <= min; i++)
3405 {
3406 if (eptr >= md->end_subject)
3407 {
3408 SCHECK_PARTIAL();
3409 MRRETURN(MATCH_NOMATCH);
3410 }
3411 GETCHARINC(d, eptr);
3412 if (d < 256) d = md->lcc[d];
3413 if (fc == d) MRRETURN(MATCH_NOMATCH);
3414 }
3415 }
3416 else
3417 #endif
3418
3419 /* Not UTF-8 mode */
3420 {
3421 for (i = 1; i <= min; i++)
3422 {
3423 if (eptr >= md->end_subject)
3424 {
3425 SCHECK_PARTIAL();
3426 MRRETURN(MATCH_NOMATCH);
3427 }
3428 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3429 }
3430 }
3431
3432 if (min == max) continue;
3433
3434 if (minimize)
3435 {
3436 #ifdef SUPPORT_UTF8
3437 /* UTF-8 mode */
3438 if (utf8)
3439 {
3440 register unsigned int d;
3441 for (fi = min;; fi++)
3442 {
3443 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3446 if (eptr >= md->end_subject)
3447 {
3448 SCHECK_PARTIAL();
3449 MRRETURN(MATCH_NOMATCH);
3450 }
3451 GETCHARINC(d, eptr);
3452 if (d < 256) d = md->lcc[d];
3453 if (fc == d) MRRETURN(MATCH_NOMATCH);
3454 }
3455 }
3456 else
3457 #endif
3458 /* Not UTF-8 mode */
3459 {
3460 for (fi = min;; fi++)
3461 {
3462 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3463 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3464 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3465 if (eptr >= md->end_subject)
3466 {
3467 SCHECK_PARTIAL();
3468 MRRETURN(MATCH_NOMATCH);
3469 }
3470 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3471 }
3472 }
3473 /* Control never gets here */
3474 }
3475
3476 /* Maximize case */
3477
3478 else
3479 {
3480 pp = eptr;
3481
3482 #ifdef SUPPORT_UTF8
3483 /* UTF-8 mode */
3484 if (utf8)
3485 {
3486 register unsigned int d;
3487 for (i = min; i < max; i++)
3488 {
3489 int len = 1;
3490 if (eptr >= md->end_subject)
3491 {
3492 SCHECK_PARTIAL();
3493 break;
3494 }
3495 GETCHARLEN(d, eptr, len);
3496 if (d < 256) d = md->lcc[d];
3497 if (fc == d) break;
3498 eptr += len;
3499 }
3500 if (possessive) continue;
3501 for(;;)
3502 {
3503 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3504 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3505 if (eptr-- == pp) break; /* Stop if tried at original pos */
3506 BACKCHAR(eptr);
3507 }
3508 }
3509 else
3510 #endif
3511 /* Not UTF-8 mode */
3512 {
3513 for (i = min; i < max; i++)
3514 {
3515 if (eptr >= md->end_subject)
3516 {
3517 SCHECK_PARTIAL();
3518 break;
3519 }
3520 if (fc == md->lcc[*eptr]) break;
3521 eptr++;
3522 }
3523 if (possessive) continue;
3524 while (eptr >= pp)
3525 {
3526 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3527 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3528 eptr--;
3529 }
3530 }
3531
3532 MRRETURN(MATCH_NOMATCH);
3533 }
3534 /* Control never gets here */
3535 }
3536
3537 /* Caseful comparisons */
3538
3539 else
3540 {
3541 #ifdef SUPPORT_UTF8
3542 /* UTF-8 mode */
3543 if (utf8)
3544 {
3545 register unsigned int d;
3546 for (i = 1; i <= min; i++)
3547 {
3548 if (eptr >= md->end_subject)
3549 {
3550 SCHECK_PARTIAL();
3551 MRRETURN(MATCH_NOMATCH);
3552 }
3553 GETCHARINC(d, eptr);
3554 if (fc == d) MRRETURN(MATCH_NOMATCH);
3555 }
3556 }
3557 else
3558 #endif
3559 /* Not UTF-8 mode */
3560 {
3561 for (i = 1; i <= min; i++)
3562 {
3563 if (eptr >= md->end_subject)
3564 {
3565 SCHECK_PARTIAL();
3566 MRRETURN(MATCH_NOMATCH);
3567 }
3568 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3569 }
3570 }
3571
3572 if (min == max) continue;
3573
3574 if (minimize)
3575 {
3576 #ifdef SUPPORT_UTF8
3577 /* UTF-8 mode */
3578 if (utf8)
3579 {
3580 register unsigned int d;
3581 for (fi = min;; fi++)
3582 {
3583 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3584 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3585 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3586 if (eptr >= md->end_subject)
3587 {
3588 SCHECK_PARTIAL();
3589 MRRETURN(MATCH_NOMATCH);
3590 }
3591 GETCHARINC(d, eptr);
3592 if (fc == d) MRRETURN(MATCH_NOMATCH);
3593 }
3594 }
3595 else
3596 #endif
3597 /* Not UTF-8 mode */
3598 {
3599 for (fi = min;; fi++)
3600 {
3601 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3603 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3604 if (eptr >= md->end_subject)
3605 {
3606 SCHECK_PARTIAL();
3607 MRRETURN(MATCH_NOMATCH);
3608 }
3609 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3610 }
3611 }
3612 /* Control never gets here */
3613 }
3614
3615 /* Maximize case */
3616
3617 else
3618 {
3619 pp = eptr;
3620
3621 #ifdef SUPPORT_UTF8
3622 /* UTF-8 mode */
3623 if (utf8)
3624 {
3625 register unsigned int d;
3626 for (i = min; i < max; i++)
3627 {
3628 int len = 1;
3629 if (eptr >= md->end_subject)
3630 {
3631 SCHECK_PARTIAL();
3632 break;
3633 }
3634 GETCHARLEN(d, eptr, len);
3635 if (fc == d) break;
3636 eptr += len;
3637 }
3638 if (possessive) continue;
3639 for(;;)
3640 {
3641 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3642 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3643 if (eptr-- == pp) break; /* Stop if tried at original pos */
3644 BACKCHAR(eptr);
3645 }
3646 }
3647 else
3648 #endif
3649 /* Not UTF-8 mode */
3650 {
3651 for (i = min; i < max; i++)
3652 {
3653 if (eptr >= md->end_subject)
3654 {
3655 SCHECK_PARTIAL();
3656 break;
3657 }
3658 if (fc == *eptr) break;
3659 eptr++;
3660 }
3661 if (possessive) continue;
3662 while (eptr >= pp)
3663 {
3664 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3666 eptr--;
3667 }
3668 }
3669
3670 MRRETURN(MATCH_NOMATCH);
3671 }
3672 }
3673 /* Control never gets here */
3674
3675 /* Match a single character type repeatedly; several different opcodes
3676 share code. This is very similar to the code for single characters, but we
3677 repeat it in the interests of efficiency. */
3678
3679 case OP_TYPEEXACT:
3680 min = max = GET2(ecode, 1);
3681 minimize = TRUE;
3682 ecode += 3;
3683 goto REPEATTYPE;
3684
3685 case OP_TYPEUPTO:
3686 case OP_TYPEMINUPTO:
3687 min = 0;
3688 max = GET2(ecode, 1);
3689 minimize = *ecode == OP_TYPEMINUPTO;
3690 ecode += 3;
3691 goto REPEATTYPE;
3692
3693 case OP_TYPEPOSSTAR:
3694 possessive = TRUE;
3695 min = 0;
3696 max = INT_MAX;
3697 ecode++;
3698 goto REPEATTYPE;
3699
3700 case OP_TYPEPOSPLUS:
3701 possessive = TRUE;
3702 min = 1;
3703 max = INT_MAX;
3704 ecode++;
3705 goto REPEATTYPE;
3706
3707 case OP_TYPEPOSQUERY:
3708 possessive = TRUE;
3709 min = 0;
3710 max = 1;
3711 ecode++;
3712 goto REPEATTYPE;
3713
3714 case OP_TYPEPOSUPTO:
3715 possessive = TRUE;
3716 min = 0;
3717 max = GET2(ecode, 1);
3718 ecode += 3;
3719 goto REPEATTYPE;
3720
3721 case OP_TYPESTAR:
3722 case OP_TYPEMINSTAR:
3723 case OP_TYPEPLUS:
3724 case OP_TYPEMINPLUS:
3725 case OP_TYPEQUERY:
3726 case OP_TYPEMINQUERY:
3727 c = *ecode++ - OP_TYPESTAR;
3728 minimize = (c & 1) != 0;
3729 min = rep_min[c]; /* Pick up values from tables; */
3730 max = rep_max[c]; /* zero for max => infinity */
3731 if (max == 0) max = INT_MAX;
3732
3733 /* Common code for all repeated single character type matches. Note that
3734 in UTF-8 mode, '.' matches a character of any length, but for the other
3735 character types, the valid characters are all one-byte long. */
3736
3737 REPEATTYPE:
3738 ctype = *ecode++; /* Code for the character type */
3739
3740 #ifdef SUPPORT_UCP
3741 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3742 {
3743 prop_fail_result = ctype == OP_NOTPROP;
3744 prop_type = *ecode++;
3745 prop_value = *ecode++;
3746 }
3747 else prop_type = -1;
3748 #endif
3749
3750 /* First, ensure the minimum number of matches are present. Use inline
3751 code for maximizing the speed, and do the type test once at the start
3752 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3753 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3754 and single-bytes. */
3755
3756 if (min > 0)
3757 {
3758 #ifdef SUPPORT_UCP
3759 if (prop_type >= 0)
3760 {
3761 switch(prop_type)
3762 {
3763 case PT_ANY:
3764 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3765 for (i = 1; i <= min; i++)
3766 {
3767 if (eptr >= md->end_subject)
3768 {
3769 SCHECK_PARTIAL();
3770 MRRETURN(MATCH_NOMATCH);
3771 }
3772 GETCHARINCTEST(c, eptr);
3773 }
3774 break;
3775
3776 case PT_LAMP:
3777 for (i = 1; i <= min; i++)
3778 {
3779 int chartype;
3780 if (eptr >= md->end_subject)
3781 {
3782 SCHECK_PARTIAL();
3783 MRRETURN(MATCH_NOMATCH);
3784 }
3785 GETCHARINCTEST(c, eptr);
3786 chartype = UCD_CHARTYPE(c);
3787 if ((chartype == ucp_Lu ||
3788 chartype == ucp_Ll ||
3789 chartype == ucp_Lt) == prop_fail_result)
3790 MRRETURN(MATCH_NOMATCH);
3791 }
3792 break;
3793
3794 case PT_GC:
3795 for (i = 1; i <= min; i++)
3796 {
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 MRRETURN(MATCH_NOMATCH);
3801 }
3802 GETCHARINCTEST(c, eptr);
3803 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3804 MRRETURN(MATCH_NOMATCH);
3805 }
3806 break;
3807
3808 case PT_PC:
3809 for (i = 1; i <= min; i++)
3810 {
3811 if (eptr >= md->end_subject)
3812 {
3813 SCHECK_PARTIAL();
3814 MRRETURN(MATCH_NOMATCH);
3815 }
3816 GETCHARINCTEST(c, eptr);
3817 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3818 MRRETURN(MATCH_NOMATCH);
3819 }
3820 break;
3821
3822 case PT_SC:
3823 for (i = 1; i <= min; i++)
3824 {
3825 if (eptr >= md->end_subject)
3826 {
3827 SCHECK_PARTIAL();
3828 MRRETURN(MATCH_NOMATCH);
3829 }
3830 GETCHARINCTEST(c, eptr);
3831 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3832 MRRETURN(MATCH_NOMATCH);
3833 }
3834 break;
3835
3836 case PT_ALNUM:
3837 for (i = 1; i <= min; i++)
3838 {
3839 int category;
3840 if (eptr >= md->end_subject)
3841 {
3842 SCHECK_PARTIAL();
3843 MRRETURN(MATCH_NOMATCH);
3844 }
3845 GETCHARINCTEST(c, eptr);
3846 category = UCD_CATEGORY(c);
3847 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3848 MRRETURN(MATCH_NOMATCH);
3849 }
3850 break;
3851
3852 case PT_SPACE: /* Perl space */
3853 for (i = 1; i <= min; i++)
3854 {
3855 if (eptr >= md->end_subject)
3856 {
3857 SCHECK_PARTIAL();
3858 MRRETURN(MATCH_NOMATCH);
3859 }
3860 GETCHARINCTEST(c, eptr);
3861 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3862 c == CHAR_FF || c == CHAR_CR)
3863 == prop_fail_result)
3864 MRRETURN(MATCH_NOMATCH);
3865 }
3866 break;
3867
3868 case PT_PXSPACE: /* POSIX space */
3869 for (i = 1; i <= min; i++)
3870 {
3871 if (eptr >= md->end_subject)
3872 {
3873 SCHECK_PARTIAL();
3874 MRRETURN(MATCH_NOMATCH);
3875 }
3876 GETCHARINCTEST(c, eptr);
3877 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3878 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3879 == prop_fail_result)
3880 MRRETURN(MATCH_NOMATCH);
3881 }
3882 break;
3883
3884 case PT_WORD:
3885 for (i = 1; i <= min; i++)
3886 {
3887 int category;
3888 if (eptr >= md->end_subject)
3889 {
3890 SCHECK_PARTIAL();
3891 MRRETURN(MATCH_NOMATCH);
3892 }
3893 GETCHARINCTEST(c, eptr);
3894 category = UCD_CATEGORY(c);
3895 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3896 == prop_fail_result)
3897 MRRETURN(MATCH_NOMATCH);
3898 }
3899 break;
3900
3901 /* This should not occur */
3902
3903 default:
3904 RRETURN(PCRE_ERROR_INTERNAL);
3905 }
3906 }
3907
3908 /* Match extended Unicode sequences. We will get here only if the
3909 support is in the binary; otherwise a compile-time error occurs. */
3910
3911 else if (ctype == OP_EXTUNI)
3912 {
3913 for (i = 1; i <= min; i++)
3914 {
3915 if (eptr >= md->end_subject)
3916 {
3917 SCHECK_PARTIAL();
3918 MRRETURN(MATCH_NOMATCH);
3919 }
3920 GETCHARINCTEST(c, eptr);
3921 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3922 while (eptr < md->end_subject)
3923 {
3924 int len = 1;
3925 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3926 if (UCD_CATEGORY(c) != ucp_M) break;
3927 eptr += len;
3928 }
3929 }
3930 }
3931
3932 else
3933 #endif /* SUPPORT_UCP */
3934
3935 /* Handle all other cases when the coding is UTF-8 */
3936
3937 #ifdef SUPPORT_UTF8
3938 if (utf8) switch(ctype)
3939 {
3940 case OP_ANY:
3941 for (i = 1; i <= min; i++)
3942 {
3943 if (eptr >= md->end_subject)
3944 {
3945 SCHECK_PARTIAL();
3946 MRRETURN(MATCH_NOMATCH);
3947 }
3948 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3949 eptr++;
3950 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3951 }
3952 break;
3953
3954 case OP_ALLANY:
3955 for (i = 1; i <= min; i++)
3956 {
3957 if (eptr >= md->end_subject)
3958 {
3959 SCHECK_PARTIAL();
3960 MRRETURN(MATCH_NOMATCH);
3961 }
3962 eptr++;
3963 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3964 }
3965 break;
3966
3967 case OP_ANYBYTE:
3968 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3969 eptr += min;
3970 break;
3971
3972 case OP_ANYNL:
3973 for (i = 1; i <= min; i++)
3974 {
3975 if (eptr >= md->end_subject)
3976 {
3977 SCHECK_PARTIAL();
3978 MRRETURN(MATCH_NOMATCH);
3979 }
3980 GETCHARINC(c, eptr);
3981 switch(c)
3982 {
3983 default: MRRETURN(MATCH_NOMATCH);
3984
3985 case 0x000d:
3986 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3987 break;
3988
3989 case 0x000a:
3990 break;
3991
3992 case 0x000b:
3993 case 0x000c:
3994 case 0x0085:
3995 case 0x2028:
3996 case 0x2029:
3997 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3998 break;
3999 }
4000 }
4001 break;
4002
4003 case OP_NOT_HSPACE:
4004 for (i = 1; i <= min; i++)
4005 {
4006 if (eptr >= md->end_subject)
4007 {
4008 SCHECK_PARTIAL();
4009 MRRETURN(MATCH_NOMATCH);
4010 }
4011 GETCHARINC(c, eptr);
4012 switch(c)
4013 {
4014 default: break;
4015 case 0x09: /* HT */
4016 case 0x20: /* SPACE */
4017 case 0xa0: /* NBSP */
4018 case 0x1680: /* OGHAM SPACE MARK */
4019 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4020 case 0x2000: /* EN QUAD */
4021 case 0x2001: /* EM QUAD */
4022 case 0x2002: /* EN SPACE */
4023 case 0x2003: /* EM SPACE */
4024 case 0x2004: /* THREE-PER-EM SPACE */
4025 case 0x2005: /* FOUR-PER-EM SPACE */
4026 case 0x2006: /* SIX-PER-EM SPACE */
4027 case 0x2007: /* FIGURE SPACE */
4028 case 0x2008: /* PUNCTUATION SPACE */
4029 case 0x2009: /* THIN SPACE */
4030 case 0x200A: /* HAIR SPACE */
4031 case 0x202f: /* NARROW NO-BREAK SPACE */
4032 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4033 case 0x3000: /* IDEOGRAPHIC SPACE */
4034 MRRETURN(MATCH_NOMATCH);
4035 }
4036 }
4037 break;
4038
4039 case OP_HSPACE:
4040 for (i = 1; i <= min; i++)
4041 {
4042 if (eptr >= md->end_subject)
4043 {
4044 SCHECK_PARTIAL();
4045 MRRETURN(MATCH_NOMATCH);
4046 }
4047 GETCHARINC(c, eptr);
4048 switch(c)
4049 {
4050 default: MRRETURN(MATCH_NOMATCH);
4051 case 0x09: /* HT */
4052 case 0x20: /* SPACE */
4053 case 0xa0: /* NBSP */
4054 case 0x1680: /* OGHAM SPACE MARK */
4055 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4056 case 0x2000: /* EN QUAD */
4057 case 0x2001: /* EM QUAD */
4058 case 0x2002: /* EN SPACE */
4059 case 0x2003: /* EM SPACE */
4060 case 0x2004: /* THREE-PER-EM SPACE */
4061 case 0x2005: /* FOUR-PER-EM SPACE */
4062 case 0x2006: /* SIX-PER-EM SPACE */
4063 case 0x2007: /* FIGURE SPACE */
4064 case 0x2008: /* PUNCTUATION SPACE */
4065 case 0x2009: /* THIN SPACE */
4066 case 0x200A: /* HAIR SPACE */
4067 case 0x202f: /* NARROW NO-BREAK SPACE */
4068 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4069 case 0x3000: /* IDEOGRAPHIC SPACE */
4070 break;
4071 }
4072 }
4073 break;
4074
4075 case OP_NOT_VSPACE:
4076 for (i = 1; i <= min; i++)
4077 {
4078 if (eptr >= md->end_subject)
4079 {
4080 SCHECK_PARTIAL();
4081 MRRETURN(MATCH_NOMATCH);
4082 }
4083 GETCHARINC(c, eptr);
4084 switch(c)
4085 {
4086 default: break;
4087 case 0x0a: /* LF */
4088 case 0x0b: /* VT */
4089 case 0x0c: /* FF */
4090 case 0x0d: /* CR */
4091 case 0x85: /* NEL */
4092 case 0x2028: /* LINE SEPARATOR */
4093 case 0x2029: /* PARAGRAPH SEPARATOR */
4094 MRRETURN(MATCH_NOMATCH);
4095 }
4096 }
4097 break;
4098
4099 case OP_VSPACE:
4100 for (i = 1; i <= min; i++)
4101 {
4102 if (eptr >= md->end_subject)
4103 {
4104 SCHECK_PARTIAL();
4105 MRRETURN(MATCH_NOMATCH);
4106 }
4107 GETCHARINC(c, eptr);
4108 switch(c)
4109 {
4110 default: MRRETURN(MATCH_NOMATCH);
4111 case 0x0a: /* LF */
4112 case 0x0b: /* VT */
4113 case 0x0c: /* FF */
4114 case 0x0d: /* CR */
4115 case 0x85: /* NEL */
4116 case 0x2028: /* LINE SEPARATOR */
4117 case 0x2029: /* PARAGRAPH SEPARATOR */
4118 break;
4119 }
4120 }
4121 break;
4122
4123 case OP_NOT_DIGIT:
4124 for (i = 1; i <= min; i++)
4125 {
4126 if (eptr >= md->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 MRRETURN(MATCH_NOMATCH);
4130 }
4131 GETCHARINC(c, eptr);
4132 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4133 MRRETURN(MATCH_NOMATCH);
4134 }
4135 break;
4136
4137 case OP_DIGIT:
4138 for (i = 1; i <= min; i++)
4139 {
4140 if (eptr >= md->end_subject)
4141 {
4142 SCHECK_PARTIAL();
4143 MRRETURN(MATCH_NOMATCH);
4144 }
4145 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4146 MRRETURN(MATCH_NOMATCH);
4147 /* No need to skip more bytes - we know it's a 1-byte character */
4148 }
4149 break;
4150
4151 case OP_NOT_WHITESPACE:
4152 for (i = 1; i <= min; i++)
4153 {
4154 if (eptr >= md->end_subject)
4155 {
4156 SCHECK_PARTIAL();
4157 MRRETURN(MATCH_NOMATCH);
4158 }
4159 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4160 MRRETURN(MATCH_NOMATCH);
4161 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4162 }
4163 break;
4164
4165 case OP_WHITESPACE:
4166 for (i = 1; i <= min; i++)
4167 {
4168 if (eptr >= md->end_subject)
4169 {
4170 SCHECK_PARTIAL();
4171 MRRETURN(MATCH_NOMATCH);
4172 }
4173 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4174 MRRETURN(MATCH_NOMATCH);
4175 /* No need to skip more bytes - we know it's a 1-byte character */
4176 }
4177 break;
4178
4179 case OP_NOT_WORDCHAR:
4180 for (i = 1; i <= min; i++)
4181 {
4182 if (eptr >= md->end_subject)
4183 {
4184 SCHECK_PARTIAL();
4185 MRRETURN(MATCH_NOMATCH);
4186 }
4187 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4188 MRRETURN(MATCH_NOMATCH);
4189 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4190 }
4191 break;
4192
4193 case OP_WORDCHAR:
4194 for (i = 1; i <= min; i++)
4195 {
4196 if (eptr >= md->end_subject)
4197 {
4198 SCHECK_PARTIAL();
4199 MRRETURN(MATCH_NOMATCH);
4200 }
4201 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4202 MRRETURN(MATCH_NOMATCH);
4203 /* No need to skip more bytes - we know it's a 1-byte character */
4204 }
4205 break;
4206
4207 default:
4208 RRETURN(PCRE_ERROR_INTERNAL);
4209 } /* End switch(ctype) */
4210
4211 else
4212 #endif /* SUPPORT_UTF8 */
4213
4214 /* Code for the non-UTF-8 case for minimum matching of operators other
4215 than OP_PROP and OP_NOTPROP. */
4216
4217 switch(ctype)
4218 {
4219 case OP_ANY:
4220 for (i = 1; i <= min; i++)
4221 {
4222 if (eptr >= md->end_subject)
4223 {
4224 SCHECK_PARTIAL();
4225 MRRETURN(MATCH_NOMATCH);
4226 }
4227 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4228 eptr++;
4229 }
4230 break;
4231
4232 case OP_ALLANY:
4233 if (eptr > md->end_subject - min)
4234 {
4235 SCHECK_PARTIAL();
4236 MRRETURN(MATCH_NOMATCH);
4237 }
4238 eptr += min;
4239 break;
4240
4241 case OP_ANYBYTE:
4242 if (eptr > md->end_subject - min)
4243 {
4244 SCHECK_PARTIAL();
4245 MRRETURN(MATCH_NOMATCH);
4246 }
4247 eptr += min;
4248 break;
4249
4250 case OP_ANYNL:
4251 for (i = 1; i <= min; i++)
4252 {
4253 if (eptr >= md->end_subject)
4254 {
4255 SCHECK_PARTIAL();
4256 MRRETURN(MATCH_NOMATCH);
4257 }
4258 switch(*eptr++)
4259 {
4260 default: MRRETURN(MATCH_NOMATCH);
4261
4262 case 0x000d:
4263 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4264 break;
4265
4266 case 0x000a:
4267 break;
4268
4269 case 0x000b:
4270 case 0x000c:
4271 case 0x0085:
4272 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4273 break;
4274 }
4275 }
4276 break;
4277
4278 case OP_NOT_HSPACE:
4279 for (i = 1; i <= min; i++)
4280 {
4281 if (eptr >= md->end_subject)
4282 {
4283 SCHECK_PARTIAL();
4284 MRRETURN(MATCH_NOMATCH);
4285 }
4286 switch(*eptr++)
4287 {
4288 default: break;
4289 case 0x09: /* HT */
4290 case 0x20: /* SPACE */
4291 case 0xa0: /* NBSP */
4292 MRRETURN(MATCH_NOMATCH);
4293 }
4294 }
4295 break;
4296
4297 case OP_HSPACE:
4298 for (i = 1; i <= min; i++)
4299 {
4300 if (eptr >= md->end_subject)
4301 {
4302 SCHECK_PARTIAL();
4303 MRRETURN(MATCH_NOMATCH);
4304 }
4305 switch(*eptr++)
4306 {
4307 default: MRRETURN(MATCH_NOMATCH);
4308 case 0x09: /* HT */
4309 case 0x20: /* SPACE */
4310 case 0xa0: /* NBSP */
4311 break;
4312 }
4313 }
4314 break;
4315
4316 case OP_NOT_VSPACE:
4317 for (i = 1; i <= min; i++)
4318 {
4319 if (eptr >= md->end_subject)
4320 {
4321 SCHECK_PARTIAL();
4322 MRRETURN(MATCH_NOMATCH);
4323 }
4324 switch(*eptr++)
4325 {
4326 default: break;
4327 case 0x0a: /* LF */
4328 case 0x0b: /* VT */
4329 case 0x0c: /* FF */
4330 case 0x0d: /* CR */
4331 case 0x85: /* NEL */
4332 MRRETURN(MATCH_NOMATCH);
4333 }
4334 }
4335 break;
4336
4337 case OP_VSPACE:
4338 for (i = 1; i <= min; i++)
4339 {
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 MRRETURN(MATCH_NOMATCH);
4344 }
4345 switch(*eptr++)
4346 {
4347 default: MRRETURN(MATCH_NOMATCH);
4348 case 0x0a: /* LF */
4349 case 0x0b: /* VT */
4350 case 0x0c: /* FF */
4351 case 0x0d: /* CR */
4352 case 0x85: /* NEL */
4353 break;
4354 }
4355 }
4356 break;
4357
4358 case OP_NOT_DIGIT:
4359 for (i = 1; i <= min; i++)
4360 {
4361 if (eptr >= md->end_subject)
4362 {
4363 SCHECK_PARTIAL();
4364 MRRETURN(MATCH_NOMATCH);
4365 }
4366 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4367 }
4368 break;
4369
4370 case OP_DIGIT:
4371 for (i = 1; i <= min; i++)
4372 {
4373 if (eptr >= md->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 MRRETURN(MATCH_NOMATCH);
4377 }
4378 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4379 }
4380 break;
4381
4382 case OP_NOT_WHITESPACE:
4383 for (i = 1; i <= min; i++)
4384 {
4385 if (eptr >= md->end_subject)
4386 {
4387 SCHECK_PARTIAL();
4388 MRRETURN(MATCH_NOMATCH);
4389 }
4390 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4391 }
4392 break;
4393
4394 case OP_WHITESPACE:
4395 for (i = 1; i <= min; i++)
4396 {
4397 if (eptr >= md->end_subject)
4398 {
4399 SCHECK_PARTIAL();
4400 MRRETURN(MATCH_NOMATCH);
4401 }
4402 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4403 }
4404 break;
4405
4406 case OP_NOT_WORDCHAR:
4407 for (i = 1; i <= min; i++)
4408 {
4409 if (eptr >= md->end_subject)
4410 {
4411 SCHECK_PARTIAL();
4412 MRRETURN(MATCH_NOMATCH);
4413 }
4414 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4415 MRRETURN(MATCH_NOMATCH);
4416 }
4417 break;
4418
4419 case OP_WORDCHAR:
4420 for (i = 1; i <= min; i++)
4421 {
4422 if (eptr >= md->end_subject)
4423 {
4424 SCHECK_PARTIAL();
4425 MRRETURN(MATCH_NOMATCH);
4426 }
4427 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4428 MRRETURN(MATCH_NOMATCH);
4429 }
4430 break;
4431
4432 default:
4433 RRETURN(PCRE_ERROR_INTERNAL);
4434 }
4435 }
4436
4437 /* If min = max, continue at the same level without recursing */
4438
4439 if (min == max) continue;
4440
4441 /* If minimizing, we have to test the rest of the pattern before each
4442 subsequent match. Again, separate the UTF-8 case for speed, and also
4443 separate the UCP cases. */
4444
4445 if (minimize)
4446 {
4447 #ifdef SUPPORT_UCP
4448 if (prop_type >= 0)
4449 {
4450 switch(prop_type)
4451 {
4452 case PT_ANY:
4453 for (fi = min;; fi++)
4454 {
4455 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4456 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4457 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4458 if (eptr >= md->end_subject)
4459 {
4460 SCHECK_PARTIAL();
4461 MRRETURN(MATCH_NOMATCH);
4462 }
4463 GETCHARINCTEST(c, eptr);
4464 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4465 }
4466 /* Control never gets here */
4467
4468 case PT_LAMP:
4469 for (fi = min;; fi++)
4470 {
4471 int chartype;
4472 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4473 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4474 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4475 if (eptr >= md->end_subject)
4476 {
4477 SCHECK_PARTIAL();
4478 MRRETURN(MATCH_NOMATCH);
4479 }
4480 GETCHARINCTEST(c, eptr);
4481 chartype = UCD_CHARTYPE(c);
4482 if ((chartype == ucp_Lu ||
4483 chartype == ucp_Ll ||
4484 chartype == ucp_Lt) == prop_fail_result)
4485 MRRETURN(MATCH_NOMATCH);
4486 }
4487 /* Control never gets here */
4488
4489 case PT_GC:
4490 for (fi = min;; fi++)
4491 {
4492 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4493 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4494 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4495 if (eptr >= md->end_subject)
4496 {
4497 SCHECK_PARTIAL();
4498 MRRETURN(MATCH_NOMATCH);
4499 }
4500 GETCHARINCTEST(c, eptr);
4501 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4502 MRRETURN(MATCH_NOMATCH);
4503 }
4504 /* Control never gets here */
4505
4506 case PT_PC:
4507 for (fi = min;; fi++)
4508 {
4509 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4511 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4512 if (eptr >= md->end_subject)
4513 {
4514 SCHECK_PARTIAL();
4515 MRRETURN(MATCH_NOMATCH);
4516 }
4517 GETCHARINCTEST(c, eptr);
4518 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4519 MRRETURN(MATCH_NOMATCH);
4520 }
4521 /* Control never gets here */
4522
4523 case PT_SC:
4524 for (fi = min;; fi++)
4525 {
4526 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4527 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4528 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4529 if (eptr >= md->end_subject)
4530 {
4531 SCHECK_PARTIAL();
4532 MRRETURN(MATCH_NOMATCH);
4533 }
4534 GETCHARINCTEST(c, eptr);
4535 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4536 MRRETURN(MATCH_NOMATCH);
4537 }
4538 /* Control never gets here */
4539
4540 case PT_ALNUM:
4541 for (fi = min;; fi++)
4542 {
4543 int category;
4544 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4545 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4546 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4547 if (eptr >= md->end_subject)
4548 {
4549 SCHECK_PARTIAL();
4550 MRRETURN(MATCH_NOMATCH);
4551 }
4552 GETCHARINCTEST(c, eptr);
4553 category = UCD_CATEGORY(c);
4554 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4555 MRRETURN(MATCH_NOMATCH);
4556 }
4557 /* Control never gets here */
4558
4559 case PT_SPACE: /* Perl space */
4560 for (fi = min;; fi++)
4561 {
4562 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4563 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4564 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4565 if (eptr >= md->end_subject)
4566 {
4567 SCHECK_PARTIAL();
4568 MRRETURN(MATCH_NOMATCH);
4569 }
4570 GETCHARINCTEST(c, eptr);
4571 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4572 c == CHAR_FF || c == CHAR_CR)
4573 == prop_fail_result)
4574 MRRETURN(MATCH_NOMATCH);
4575 }
4576 /* Control never gets here */
4577
4578 case PT_PXSPACE: /* POSIX space */
4579 for (fi = min;; fi++)
4580 {
4581 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4582 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4583 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4584 if (eptr >= md->end_subject)
4585 {
4586 SCHECK_PARTIAL();
4587 MRRETURN(MATCH_NOMATCH);
4588 }
4589 GETCHARINCTEST(c, eptr);
4590 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4591 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4592 == prop_fail_result)
4593 MRRETURN(MATCH_NOMATCH);
4594 }
4595 /* Control never gets here */
4596
4597 case PT_WORD:
4598 for (fi = min;; fi++)
4599 {
4600 int category;
4601 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4603 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4604 if (eptr >= md->end_subject)
4605 {
4606 SCHECK_PARTIAL();
4607 MRRETURN(MATCH_NOMATCH);
4608 }
4609 GETCHARINCTEST(c, eptr);
4610 category = UCD_CATEGORY(c);
4611 if ((category == ucp_L ||
4612 category == ucp_N ||
4613 c == CHAR_UNDERSCORE)
4614 == prop_fail_result)
4615 MRRETURN(MATCH_NOMATCH);
4616 }
4617 /* Control never gets here */
4618
4619 /* This should never occur */
4620
4621 default:
4622 RRETURN(PCRE_ERROR_INTERNAL);
4623 }
4624 }
4625
4626 /* Match extended Unicode sequences. We will get here only if the
4627 support is in the binary; otherwise a compile-time error occurs. */
4628
4629 else if (ctype == OP_EXTUNI)
4630 {
4631 for (fi = min;; fi++)
4632 {
4633 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4634 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4635 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4636 if (eptr >= md->end_subject)
4637 {
4638 SCHECK_PARTIAL();
4639 MRRETURN(MATCH_NOMATCH);
4640 }
4641 GETCHARINCTEST(c, eptr);
4642 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4643 while (eptr < md->end_subject)
4644 {
4645 int len = 1;
4646 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4647 if (UCD_CATEGORY(c) != ucp_M) break;
4648 eptr += len;
4649 }
4650 }
4651 }
4652 else
4653 #endif /* SUPPORT_UCP */
4654
4655 #ifdef SUPPORT_UTF8
4656 /* UTF-8 mode */
4657 if (utf8)
4658 {
4659 for (fi = min;; fi++)
4660 {
4661 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4663 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4664 if (eptr >= md->end_subject)
4665 {
4666 SCHECK_PARTIAL();
4667 MRRETURN(MATCH_NOMATCH);
4668 }
4669 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4670 MRRETURN(MATCH_NOMATCH);
4671 GETCHARINC(c, eptr);
4672 switch(ctype)
4673 {
4674 case OP_ANY: /* This is the non-NL case */
4675 case OP_ALLANY:
4676 case OP_ANYBYTE:
4677 break;
4678
4679 case OP_ANYNL:
4680 switch(c)
4681 {
4682 default: MRRETURN(MATCH_NOMATCH);
4683 case 0x000d:
4684 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4685 break;
4686 case 0x000a:
4687 break;
4688
4689 case 0x000b:
4690 case 0x000c:
4691 case 0x0085:
4692 case 0x2028:
4693 case 0x2029:
4694 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4695 break;
4696 }
4697 break;
4698
4699 case OP_NOT_HSPACE:
4700 switch(c)
4701 {
4702 default: break;
4703 case 0x09: /* HT */
4704 case 0x20: /* SPACE */
4705 case 0xa0: /* NBSP */
4706 case 0x1680: /* OGHAM SPACE MARK */
4707 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4708 case 0x2000: /* EN QUAD */
4709 case 0x2001: /* EM QUAD */
4710 case 0x2002: /* EN SPACE */
4711 case 0x2003: /* EM SPACE */
4712 case 0x2004: /* THREE-PER-EM SPACE */
4713 case 0x2005: /* FOUR-PER-EM SPACE */
4714 case 0x2006: /* SIX-PER-EM SPACE */
4715 case 0x2007: /* FIGURE SPACE */
4716 case 0x2008: /* PUNCTUATION SPACE */
4717 case 0x2009: /* THIN SPACE */
4718 case 0x200A: /* HAIR SPACE */
4719 case 0x202f: /* NARROW NO-BREAK SPACE */
4720 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4721 case 0x3000: /* IDEOGRAPHIC SPACE */
4722 MRRETURN(MATCH_NOMATCH);
4723 }
4724 break;
4725
4726 case OP_HSPACE:
4727 switch(c)
4728 {
4729 default: MRRETURN(MATCH_NOMATCH);
4730 case 0x09: /* HT */
4731 case 0x20: /* SPACE */
4732 case 0xa0: /* NBSP */
4733 case 0x1680: /* OGHAM SPACE MARK */
4734 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4735 case 0x2000: /* EN QUAD */
4736 case 0x2001: /* EM QUAD */
4737 case 0x2002: /* EN SPACE */
4738 case 0x2003: /* EM SPACE */
4739 case 0x2004: /* THREE-PER-EM SPACE */
4740 case 0x2005: /* FOUR-PER-EM SPACE */
4741 case 0x2006: /* SIX-PER-EM SPACE */
4742 case 0x2007: /* FIGURE SPACE */
4743 case 0x2008: /* PUNCTUATION SPACE */
4744 case 0x2009: /* THIN SPACE */
4745 case 0x200A: /* HAIR SPACE */
4746 case 0x202f: /* NARROW NO-BREAK SPACE */
4747 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4748 case 0x3000: /* IDEOGRAPHIC SPACE */
4749 break;
4750 }
4751 break;
4752
4753 case OP_NOT_VSPACE:
4754 switch(c)
4755 {
4756 default: break;
4757 case 0x0a: /* LF */
4758 case 0x0b: /* VT */
4759 case 0x0c: /* FF */
4760 case 0x0d: /* CR */
4761 case 0x85: /* NEL */
4762 case 0x2028: /* LINE SEPARATOR */
4763 case 0x2029: /* PARAGRAPH SEPARATOR */
4764 MRRETURN(MATCH_NOMATCH);
4765 }
4766 break;
4767
4768 case OP_VSPACE:
4769 switch(c)
4770 {
4771 default: MRRETURN(MATCH_NOMATCH);
4772 case 0x0a: /* LF */
4773 case 0x0b: /* VT */
4774 case 0x0c: /* FF */
4775 case 0x0d: /* CR */
4776 case 0x85: /* NEL */
4777 case 0x2028: /* LINE SEPARATOR */
4778 case 0x2029: /* PARAGRAPH SEPARATOR */
4779 break;
4780 }
4781 break;
4782
4783 case OP_NOT_DIGIT:
4784 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4785 MRRETURN(MATCH_NOMATCH);
4786 break;
4787
4788 case OP_DIGIT:
4789 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4790 MRRETURN(MATCH_NOMATCH);
4791 break;
4792
4793 case OP_NOT_WHITESPACE:
4794 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4795 MRRETURN(MATCH_NOMATCH);
4796 break;
4797
4798 case OP_WHITESPACE:
4799 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4800 MRRETURN(MATCH_NOMATCH);
4801 break;
4802
4803 case OP_NOT_WORDCHAR:
4804 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4805 MRRETURN(MATCH_NOMATCH);
4806 break;
4807
4808 case OP_WORDCHAR:
4809 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4810 MRRETURN(MATCH_NOMATCH);
4811 break;
4812
4813 default:
4814 RRETURN(PCRE_ERROR_INTERNAL);
4815 }
4816 }
4817 }
4818 else
4819 #endif
4820 /* Not UTF-8 mode */
4821 {
4822 for (fi = min;; fi++)
4823 {
4824 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4825 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4826 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4827 if (eptr >= md->end_subject)
4828 {
4829 SCHECK_PARTIAL();
4830 MRRETURN(MATCH_NOMATCH);
4831 }
4832 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4833 MRRETURN(MATCH_NOMATCH);
4834 c = *eptr++;
4835 switch(ctype)
4836 {
4837 case OP_ANY: /* This is the non-NL case */
4838 case OP_ALLANY:
4839 case OP_ANYBYTE:
4840 break;
4841
4842 case OP_ANYNL:
4843 switch(c)
4844 {
4845 default: MRRETURN(MATCH_NOMATCH);
4846 case 0x000d:
4847 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4848 break;
4849
4850 case 0x000a:
4851 break;
4852
4853 case 0x000b:
4854 case 0x000c:
4855 case 0x0085:
4856 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4857 break;
4858 }
4859 break;
4860
4861 case OP_NOT_HSPACE:
4862 switch(c)
4863 {
4864 default: break;
4865 case 0x09: /* HT */
4866 case 0x20: /* SPACE */
4867 case 0xa0: /* NBSP */
4868 MRRETURN(MATCH_NOMATCH);
4869 }
4870 break;
4871
4872 case OP_HSPACE:
4873 switch(c)
4874 {
4875 default: MRRETURN(MATCH_NOMATCH);
4876 case 0x09: /* HT */
4877 case 0x20: /* SPACE */
4878 case 0xa0: /* NBSP */
4879 break;
4880 }
4881 break;
4882
4883 case OP_NOT_VSPACE:
4884 switch(c)
4885 {
4886 default: break;
4887 case 0x0a: /* LF */
4888 case 0x0b: /* VT */
4889 case 0x0c: /* FF */
4890 case 0x0d: /* CR */
4891 case 0x85: /* NEL */
4892 MRRETURN(MATCH_NOMATCH);
4893 }
4894 break;
4895
4896 case OP_VSPACE:
4897 switch(c)
4898 {
4899 default: MRRETURN(MATCH_NOMATCH);
4900 case 0x0a: /* LF */
4901 case 0x0b: /* VT */
4902 case 0x0c: /* FF */
4903 case 0x0d: /* CR */
4904 case 0x85: /* NEL */
4905 break;
4906 }
4907 break;
4908
4909 case OP_NOT_DIGIT:
4910 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4911 break;
4912
4913 case OP_DIGIT:
4914 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4915 break;
4916
4917 case OP_NOT_WHITESPACE:
4918 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4919 break;
4920
4921 case OP_WHITESPACE:
4922 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4923 break;
4924
4925 case OP_NOT_WORDCHAR:
4926 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4927 break;
4928
4929 case OP_WORDCHAR:
4930 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4931 break;
4932
4933 default:
4934 RRETURN(PCRE_ERROR_INTERNAL);
4935 }
4936 }
4937 }
4938 /* Control never gets here */
4939 }
4940
4941 /* If maximizing, it is worth using inline code for speed, doing the type
4942 test once at the start (i.e. keep it out of the loop). Again, keep the
4943 UTF-8 and UCP stuff separate. */
4944
4945 else
4946 {
4947 pp = eptr; /* Remember where we started */
4948
4949 #ifdef SUPPORT_UCP
4950 if (prop_type >= 0)
4951 {
4952 switch(prop_type)
4953 {
4954 case PT_ANY:
4955 for (i = min; i < max; i++)
4956 {
4957 int len = 1;
4958 if (eptr >= md->end_subject)
4959 {
4960 SCHECK_PARTIAL();
4961 break;
4962 }
4963 GETCHARLENTEST(c, eptr, len);
4964 if (prop_fail_result) break;
4965 eptr+= len;
4966 }
4967 break;
4968
4969 case PT_LAMP:
4970 for (i = min; i < max; i++)
4971 {
4972 int chartype;
4973 int len = 1;
4974 if (eptr >= md->end_subject)
4975 {
4976 SCHECK_PARTIAL();
4977 break;
4978 }
4979 GETCHARLENTEST(c, eptr, len);
4980 chartype = UCD_CHARTYPE(c);
4981 if ((chartype == ucp_Lu ||
4982 chartype == ucp_Ll ||
4983 chartype == ucp_Lt) == prop_fail_result)
4984 break;
4985 eptr+= len;
4986 }
4987 break;
4988
4989 case PT_GC:
4990 for (i = min; i < max; i++)
4991 {
4992 int len = 1;
4993 if (eptr >= md->end_subject)
4994 {
4995 SCHECK_PARTIAL();
4996 break;
4997 }
4998 GETCHARLENTEST(c, eptr, len);
4999 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5000 eptr+= len;
5001 }
5002 break;
5003
5004 case PT_PC:
5005 for (i = min; i < max; i++)
5006 {
5007 int len = 1;
5008 if (eptr >= md->end_subject)
5009 {
5010 SCHECK_PARTIAL();
5011 break;
5012 }
5013 GETCHARLENTEST(c, eptr, len);
5014 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5015 eptr+= len;
5016 }
5017 break;
5018
5019 case PT_SC:
5020 for (i = min; i < max; i++)
5021 {
5022 int len = 1;
5023 if (eptr >= md->end_subject)
5024 {
5025 SCHECK_PARTIAL();
5026 break;
5027 }
5028 GETCHARLENTEST(c, eptr, len);
5029 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5030 eptr+= len;
5031 }
5032 break;
5033
5034 case PT_ALNUM:
5035 for (i = min; i < max; i++)
5036 {
5037 int category;
5038 int len = 1;
5039 if (eptr >= md->end_subject)
5040 {
5041 SCHECK_PARTIAL();
5042 break;
5043 }
5044 GETCHARLENTEST(c, eptr, len);
5045 category = UCD_CATEGORY(c);
5046 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5047 break;
5048 eptr+= len;
5049 }
5050 break;
5051
5052 case PT_SPACE: /* Perl space */
5053 for (i = min; i < max; i++)
5054 {
5055 int len = 1;
5056 if (eptr >= md->end_subject)
5057 {
5058 SCHECK_PARTIAL();
5059 break;
5060 }
5061 GETCHARLENTEST(c, eptr, len);
5062 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5063 c == CHAR_FF || c == CHAR_CR)
5064 == prop_fail_result)
5065 break;
5066 eptr+= len;
5067 }
5068 break;
5069
5070 case PT_PXSPACE: /* POSIX space */
5071 for (i = min; i < max; i++)
5072 {
5073 int len = 1;
5074 if (eptr >= md->end_subject)
5075 {
5076 SCHECK_PARTIAL();
5077 break;
5078 }
5079 GETCHARLENTEST(c, eptr, len);
5080 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5081 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5082 == prop_fail_result)
5083 break;
5084 eptr+= len;
5085 }
5086 break;
5087
5088 case PT_WORD:
5089 for (i = min; i < max; i++)
5090 {
5091 int category;
5092 int len = 1;
5093 if (eptr >= md->end_subject)
5094 {
5095 SCHECK_PARTIAL();
5096 break;
5097 }
5098 GETCHARLENTEST(c, eptr, len);
5099 category = UCD_CATEGORY(c);
5100 if ((category == ucp_L || category == ucp_N ||
5101 c == CHAR_UNDERSCORE) == prop_fail_result)
5102 break;
5103 eptr+= len;
5104 }
5105 break;
5106
5107 default:
5108 RRETURN(PCRE_ERROR_INTERNAL);
5109 }
5110
5111 /* eptr is now past the end of the maximum run */
5112
5113 if (possessive) continue;
5114 for(;;)
5115 {
5116 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5117 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5118 if (eptr-- == pp) break; /* Stop if tried at original pos */
5119 if (utf8) BACKCHAR(eptr);
5120 }
5121 }
5122
5123 /* Match extended Unicode sequences. We will get here only if the
5124 support is in the binary; otherwise a compile-time error occurs. */
5125
5126 else if (ctype == OP_EXTUNI)
5127 {
5128 for (i = min; i < max; i++)
5129 {
5130 int len = 1;
5131 if (eptr >= md->end_subject)
5132 {
5133 SCHECK_PARTIAL();
5134 break;
5135 }
5136 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5137 if (UCD_CATEGORY(c) == ucp_M) break;
5138 eptr += len;
5139 while (eptr < md->end_subject)
5140 {
5141 len = 1;
5142 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5143 if (UCD_CATEGORY(c) != ucp_M) break;
5144 eptr += len;
5145 }
5146 }
5147
5148 /* eptr is now past the end of the maximum run */
5149
5150 if (possessive) continue;
5151
5152 for(;;)
5153 {
5154 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5155 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5156 if (eptr-- == pp) break; /* Stop if tried at original pos */
5157 for (;;) /* Move back over one extended */
5158 {
5159 if (!utf8) c = *eptr; else
5160 {
5161 BACKCHAR(eptr);
5162 GETCHAR(c, eptr);
5163 }
5164 if (UCD_CATEGORY(c) != ucp_M) break;
5165 eptr--;
5166 }
5167 }
5168 }
5169
5170 else
5171 #endif /* SUPPORT_UCP */
5172
5173 #ifdef SUPPORT_UTF8
5174 /* UTF-8 mode */
5175
5176 if (utf8)
5177 {
5178 switch(ctype)
5179 {
5180 case OP_ANY:
5181 if (max < INT_MAX)
5182 {
5183 for (i = min; i < max; i++)
5184 {
5185 if (eptr >= md->end_subject)
5186 {
5187 SCHECK_PARTIAL();
5188 break;
5189 }
5190 if (IS_NEWLINE(eptr)) break;
5191 eptr++;
5192 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5193 }
5194 }
5195
5196 /* Handle unlimited UTF-8 repeat */
5197
5198 else
5199 {
5200 for (i = min; i < max; i++)
5201 {
5202 if (eptr >= md->end_subject)
5203 {
5204 SCHECK_PARTIAL();
5205 break;
5206 }
5207 if (IS_NEWLINE(eptr)) break;
5208 eptr++;
5209 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5210 }
5211 }
5212 break;
5213
5214 case OP_ALLANY:
5215 if (max < INT_MAX)
5216 {
5217 for (i = min; i < max; i++)
5218 {
5219 if (eptr >= md->end_subject)
5220 {
5221 SCHECK_PARTIAL();
5222 break;
5223 }
5224 eptr++;
5225 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5226 }
5227 }
5228 else
5229 {
5230 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5231 SCHECK_PARTIAL();
5232 }
5233 break;
5234
5235 /* The byte case is the same as non-UTF8 */
5236
5237 case OP_ANYBYTE:
5238 c = max - min;
5239 if (c > (unsigned int)(md->end_subject - eptr))
5240 {
5241 eptr = md->end_subject;
5242 SCHECK_PARTIAL();
5243 }
5244 else eptr += c;
5245 break;
5246
5247 case OP_ANYNL:
5248 for (i = min; i < max; i++)
5249 {
5250 int len = 1;
5251 if (eptr >= md->end_subject)
5252 {
5253 SCHECK_PARTIAL();
5254 break;
5255 }
5256 GETCHARLEN(c, eptr, len);
5257 if (c == 0x000d)
5258 {
5259 if (++eptr >= md->end_subject) break;
5260 if (*eptr == 0x000a) eptr++;
5261 }
5262 else
5263 {
5264 if (c != 0x000a &&
5265 (md->bsr_anycrlf ||
5266 (c != 0x000b && c != 0x000c &&
5267 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5268 break;
5269 eptr += len;
5270 }
5271 }
5272 break;
5273
5274 case OP_NOT_HSPACE:
5275 case OP_HSPACE:
5276 for (i = min; i < max; i++)
5277 {
5278 BOOL gotspace;
5279 int len = 1;
5280 if (eptr >= md->end_subject)
5281 {
5282 SCHECK_PARTIAL();
5283 break;
5284 }
5285 GETCHARLEN(c, eptr, len);
5286 switch(c)
5287 {
5288 default: gotspace = FALSE; break;
5289 case 0x09: /* HT */
5290 case 0x20: /* SPACE */
5291 case 0xa0: /* NBSP */
5292 case 0x1680: /* OGHAM SPACE MARK */
5293 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5294 case 0x2000: /* EN QUAD */
5295 case 0x2001: /* EM QUAD */
5296 case 0x2002: /* EN SPACE */
5297 case 0x2003: /* EM SPACE */
5298 case 0x2004: /* THREE-PER-EM SPACE */
5299 case 0x2005: /* FOUR-PER-EM SPACE */
5300 case 0x2006: /* SIX-PER-EM SPACE */
5301 case 0x2007: /* FIGURE SPACE */
5302 case 0x2008: /* PUNCTUATION SPACE */
5303 case 0x2009: /* THIN SPACE */
5304 case 0x200A: /* HAIR SPACE */
5305 case 0x202f: /* NARROW NO-BREAK SPACE */
5306 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5307 case 0x3000: /* IDEOGRAPHIC SPACE */
5308 gotspace = TRUE;
5309 break;
5310 }
5311 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5312 eptr += len;
5313 }
5314 break;
5315
5316 case OP_NOT_VSPACE:
5317 case OP_VSPACE:
5318 for (i = min; i < max; i++)
5319 {
5320 BOOL gotspace;
5321 int len = 1;
5322 if (eptr >= md->end_subject)
5323 {
5324 SCHECK_PARTIAL();
5325 break;
5326 }
5327 GETCHARLEN(c, eptr, len);
5328 switch(c)
5329 {
5330 default: gotspace = FALSE; break;
5331 case 0x0a: /* LF */
5332 case 0x0b: /* VT */
5333 case 0x0c: /* FF */
5334 case 0x0d: /* CR */
5335 case 0x85: /* NEL */
5336 case 0x2028: /* LINE SEPARATOR */
5337 case 0x2029: /* PARAGRAPH SEPARATOR */
5338 gotspace = TRUE;
5339 break;
5340 }
5341 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5342 eptr += len;
5343 }
5344 break;
5345
5346 case OP_NOT_DIGIT:
5347 for (i = min; i < max; i++)
5348 {
5349 int len = 1;
5350 if (eptr >= md->end_subject)
5351 {
5352 SCHECK_PARTIAL();
5353 break;
5354 }
5355 GETCHARLEN(c, eptr, len);
5356 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5357 eptr+= len;
5358 }
5359 break;
5360
5361 case OP_DIGIT:
5362 for (i = min; i < max; i++)
5363 {
5364 int len = 1;
5365 if (eptr >= md->end_subject)
5366 {
5367 SCHECK_PARTIAL();
5368 break;
5369 }
5370 GETCHARLEN(c, eptr, len);
5371 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5372 eptr+= len;
5373 }
5374 break;
5375
5376 case OP_NOT_WHITESPACE:
5377 for (i = min; i < max; i++)
5378 {
5379 int len = 1;
5380 if (eptr >= md->end_subject)
5381 {
5382 SCHECK_PARTIAL();
5383 break;
5384 }
5385 GETCHARLEN(c, eptr, len);
5386 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5387 eptr+= len;
5388 }
5389 break;
5390
5391 case OP_WHITESPACE:
5392 for (i = min; i < max; i++)
5393 {
5394 int len = 1;
5395 if (eptr >= md->end_subject)
5396 {
5397 SCHECK_PARTIAL();
5398 break;
5399 }
5400 GETCHARLEN(c, eptr, len);
5401 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5402 eptr+= len;
5403 }
5404 break;
5405
5406 case OP_NOT_WORDCHAR:
5407 for (i = min; i < max; i++)
5408 {
5409 int len = 1;
5410 if (eptr >= md->end_subject)
5411 {
5412 SCHECK_PARTIAL();
5413 break;
5414 }
5415 GETCHARLEN(c, eptr, len);
5416 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5417 eptr+= len;
5418 }
5419 break;
5420
5421 case OP_WORDCHAR:
5422 for (i = min; i < max; i++)
5423 {
5424 int len = 1;
5425 if (eptr >= md->end_subject)
5426 {
5427 SCHECK_PARTIAL();
5428 break;
5429 }
5430 GETCHARLEN(c, eptr, len);
5431 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5432 eptr+= len;
5433 }
5434 break;
5435
5436 default:
5437 RRETURN(PCRE_ERROR_INTERNAL);
5438 }
5439
5440 /* eptr is now past the end of the maximum run. If possessive, we are
5441 done (no backing up). Otherwise, match at this position; anything other
5442 than no match is immediately returned. For nomatch, back up one
5443 character, unless we are matching \R and the last thing matched was
5444 \r\n, in which case, back up two bytes. */
5445
5446 if (possessive) continue;
5447 for(;;)
5448 {
5449 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5451 if (eptr-- == pp) break; /* Stop if tried at original pos */
5452 BACKCHAR(eptr);
5453 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5454 eptr[-1] == '\r') eptr--;
5455 }
5456 }
5457 else
5458 #endif /* SUPPORT_UTF8 */
5459
5460 /* Not UTF-8 mode */
5461 {
5462 switch(ctype)
5463 {
5464 case OP_ANY:
5465 for (i = min; i < max; i++)
5466 {
5467 if (eptr >= md->end_subject)
5468 {
5469 SCHECK_PARTIAL();
5470 break;
5471 }
5472 if (IS_NEWLINE(eptr)) break;
5473 eptr++;
5474 }
5475 break;
5476
5477 case OP_ALLANY:
5478 case OP_ANYBYTE:
5479 c = max - min;
5480 if (c > (unsigned int)(md->end_subject - eptr))
5481 {
5482 eptr = md->end_subject;
5483 SCHECK_PARTIAL();
5484 }
5485 else eptr += c;
5486 break;
5487
5488 case OP_ANYNL:
5489 for (i = min; i < max; i++)
5490 {
5491 if (eptr >= md->end_subject)
5492 {
5493 SCHECK_PARTIAL();
5494 break;
5495 }
5496 c = *eptr;
5497 if (c == 0x000d)
5498 {
5499 if (++eptr >= md->end_subject) break;
5500 if (*eptr == 0x000a) eptr++;
5501 }
5502 else
5503 {
5504 if (c != 0x000a &&
5505 (md->bsr_anycrlf ||
5506 (c != 0x000b && c != 0x000c && c != 0x0085)))
5507 break;
5508 eptr++;
5509 }
5510 }
5511 break;
5512
5513 case OP_NOT_HSPACE:
5514 for (i = min; i < max; i++)
5515 {
5516 if (eptr >= md->end_subject)
5517 {
5518 SCHECK_PARTIAL();
5519 break;
5520 }
5521 c = *eptr;
5522 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5523 eptr++;
5524 }
5525 break;
5526
5527 case OP_HSPACE:
5528 for (i = min; i < max; i++)
5529 {
5530 if (eptr >= md->end_subject)
5531 {
5532 SCHECK_PARTIAL();
5533 break;
5534 }
5535 c = *eptr;
5536 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5537 eptr++;
5538 }
5539 break;
5540
5541 case OP_NOT_VSPACE:
5542 for (i = min; i < max; i++)
5543 {
5544 if (eptr >= md->end_subject)
5545 {
5546 SCHECK_PARTIAL();
5547 break;
5548 }
5549 c = *eptr;
5550 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5551 break;
5552 eptr++;
5553 }
5554 break;
5555
5556 case OP_VSPACE:
5557 for (i = min; i < max; i++)
5558 {
5559 if (eptr >= md->end_subject)
5560 {
5561 SCHECK_PARTIAL();
5562 break;
5563 }
5564 c = *eptr;
5565 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5566 break;
5567 eptr++;
5568 }
5569 break;
5570
5571 case OP_NOT_DIGIT:
5572 for (i = min; i < max; i++)
5573 {
5574 if (eptr >= md->end_subject)
5575 {
5576 SCHECK_PARTIAL();
5577 break;
5578 }
5579 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5580 eptr++;
5581 }
5582 break;
5583
5584 case OP_DIGIT:
5585 for (i = min; i < max; i++)
5586 {
5587 if (eptr >= md->end_subject)
5588 {
5589 SCHECK_PARTIAL();
5590 break;
5591 }
5592 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5593 eptr++;
5594 }
5595 break;
5596
5597 case OP_NOT_WHITESPACE:
5598 for (i = min; i < max; i++)
5599 {
5600 if (eptr >= md->end_subject)
5601 {
5602 SCHECK_PARTIAL();
5603 break;
5604 }
5605 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5606 eptr++;
5607 }
5608 break;
5609
5610 case OP_WHITESPACE:
5611 for (i = min; i < max; i++)
5612 {
5613 if (eptr >= md->end_subject)
5614 {
5615 SCHECK_PARTIAL();
5616 break;
5617 }
5618 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5619 eptr++;
5620 }
5621 break;
5622
5623 case OP_NOT_WORDCHAR:
5624 for (i = min; i < max; i++)
5625 {
5626 if (eptr >= md->end_subject)
5627 {
5628 SCHECK_PARTIAL();
5629 break;
5630 }
5631 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5632 eptr++;
5633 }
5634 break;
5635
5636 case OP_WORDCHAR:
5637 for (i = min; i < max; i++)
5638 {
5639 if (eptr >= md->end_subject)
5640 {
5641 SCHECK_PARTIAL();
5642 break;
5643 }
5644 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5645 eptr++;
5646 }
5647 break;
5648
5649 default:
5650 RRETURN(PCRE_ERROR_INTERNAL);
5651 }
5652
5653 /* eptr is now past the end of the maximum run. If possessive, we are
5654 done (no backing up). Otherwise, match at this position; anything other
5655 than no match is immediately returned. For nomatch, back up one
5656 character (byte), unless we are matching \R and the last thing matched
5657 was \r\n, in which case, back up two bytes. */
5658
5659 if (possessive) continue;
5660 while (eptr >= pp)
5661 {
5662 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5663 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5664 eptr--;
5665 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5666 eptr[-1] == '\r') eptr--;
5667 }
5668 }
5669
5670 /* Get here if we can't make it match with any permitted repetitions */
5671
5672 MRRETURN(MATCH_NOMATCH);
5673 }
5674 /* Control never gets here */
5675
5676 /* There's been some horrible disaster. Arrival here can only mean there is
5677 something seriously wrong in the code above or the OP_xxx definitions. */
5678
5679 default:
5680 DPRINTF(("Unknown opcode %d\n", *ecode));
5681 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5682 }
5683
5684 /* Do not stick any code in here without much thought; it is assumed
5685 that "continue" in the code above comes out to here to repeat the main
5686 loop. */
5687
5688 } /* End of main loop */
5689 /* Control never reaches here */
5690
5691
5692 /* When compiling to use the heap rather than the stack for recursive calls to
5693 match(), the RRETURN() macro jumps here. The number that is saved in
5694 frame->Xwhere indicates which label we actually want to return to. */
5695
5696 #ifdef NO_RECURSE
5697 #define LBL(val) case val: goto L_RM##val;
5698 HEAP_RETURN:
5699 switch (frame->Xwhere)
5700 {
5701 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5702 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5703 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5704 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5705 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5706 #ifdef SUPPORT_UTF8
5707 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5708 LBL(32) LBL(34) LBL(42) LBL(46)
5709 #ifdef SUPPORT_UCP
5710 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5711 LBL(59) LBL(60) LBL(61) LBL(62)
5712 #endif /* SUPPORT_UCP */
5713 #endif /* SUPPORT_UTF8 */
5714 default:
5715 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5716 return PCRE_ERROR_INTERNAL;
5717 }
5718 #undef LBL
5719 #endif /* NO_RECURSE */
5720 }
5721
5722
5723 /***************************************************************************
5724 ****************************************************************************
5725 RECURSION IN THE match() FUNCTION
5726
5727 Undefine all the macros that were defined above to handle this. */
5728
5729 #ifdef NO_RECURSE /* Heap-frame build only; see the HEAP_RETURN code in match() above */
5730 #undef eptr
5731 #undef ecode
5732 #undef mstart
5733 #undef offset_top
5734 #undef eptrb
5735 #undef flags
5736 /* NOTE(review): the blank-line grouping presumably mirrors the order of the matching #defines earlier in the file -- confirm */
5737 #undef callpat
5738 #undef charptr
5739 #undef data
5740 #undef next
5741 #undef pp
5742 #undef prev
5743 #undef saved_eptr
5744
5745 #undef new_recursive
5746
5747 #undef cur_is_word
5748 #undef condition
5749 #undef prev_is_word
5750
5751 #undef ctype
5752 #undef length
5753 #undef max
5754 #undef min
5755 #undef number
5756 #undef offset
5757 #undef op
5758 #undef save_capture_last
5759 #undef save_offset1
5760 #undef save_offset2
5761 #undef save_offset3
5762 #undef stacksave
5763
5764 #undef newptrb
5765
5766 #endif /* NO_RECURSE */
5767
5768 /* These two are defined as macros in both cases, so they are undefined unconditionally, outside the NO_RECURSE guard */
5769
5770 #undef fc
5771 #undef fi
5772
5773 /***************************************************************************
5774 ***************************************************************************/
5775
5776
5777
5778 /*************************************************
5779 * Execute a Regular Expression *
5780 *************************************************/
5781
5782 /* This function applies a compiled re to a subject string and picks out
5783 portions of the string if it matches. Two elements in the vector are set for
5784 each substring: the offsets to the start and end of the substring.
5785
5786 Arguments:
5787 argument_re points to the compiled expression
5788 extra_data points to extra data or is NULL
5789 subject points to the subject string
5790 length length of subject string (may contain binary zeros)
5791 start_offset where to start in the subject string
5792 options option bits
5793 offsets points to a vector of ints to be filled in with offsets
5794 offsetcount the number of elements in the vector
5795
5796 Returns: > 0 => success; value is the number of elements filled in
5797 = 0 => success, but offsets is not big enough
5798 -1 => failed to match
5799 < -1 => some kind of unexpected problem
5800 */
5801
5802 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5803 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5804 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5805 int offsetcount)
5806 {
5807 int rc, ocount, arg_offset_max;
5808 int first_byte = -1;
5809 int req_byte = -1;
5810 int req_byte2 = -1;
5811 int newline;
5812 BOOL using_temporary_offsets = FALSE;
5813 BOOL anchored;
5814 BOOL startline;
5815 BOOL firstline;
5816 BOOL first_byte_caseless = FALSE;
5817 BOOL req_byte_caseless = FALSE;
5818 BOOL utf8;
5819 match_data match_block;
5820 match_data *md = &match_block;
5821 const uschar *tables;
5822 const uschar *start_bits = NULL;
5823 USPTR start_match = (USPTR)subject + start_offset;
5824 USPTR end_subject;
5825 USPTR start_partial = NULL;
5826 USPTR req_byte_ptr = start_match - 1;
5827
5828 pcre_study_data internal_study;
5829 const pcre_study_data *study;
5830
5831 real_pcre internal_re;
5832 const real_pcre *external_re = (const real_pcre *)argument_re;
5833 const real_pcre *re = external_re;
5834
5835 /* Plausibility checks */
5836
5837 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5838 if (re == NULL || subject == NULL ||
5839 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5840 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5841 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5842
5843 /* These two settings are used in the code for checking a UTF-8 string that
5844 follows immediately afterwards. Other values in the md block are used only
5845 during "normal" pcre_exec() processing, not when the JIT support is in use,
5846 so they are set up later. */
5847
5848 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5849 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5850 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5851
5852 /* Check a UTF-8 string if required. Pass back the character offset and error
5853 code for an invalid string if a results vector is available. */
5854
5855 #ifdef SUPPORT_UTF8
5856 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5857 {
5858 int erroroffset;
5859 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5860 if (errorcode != 0)
5861 {
5862 if (offsetcount >= 2)
5863 {
5864 offsets[0] = erroroffset;
5865 offsets[1] = errorcode;
5866 }
5867 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5868 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5869 }
5870
5871 /* Check that a start_offset points to the start of a UTF-8 character. */
5872 if (start_offset > 0 && start_offset < length &&
5873 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5874 return PCRE_ERROR_BADUTF8_OFFSET;
5875 }
5876 #endif
5877
5878 /* If the pattern was successfully studied with JIT support, run the JIT
5879 executable instead of the rest of this function. Most options must be set at
5880 compile time for the JIT code to be usable. Fallback to the normal code path if
5881 an unsupported flag is set. In particular, JIT does not support partial
5882 matching. */
5883
5884 #ifdef SUPPORT_JIT
5885 if (extra_data != NULL
5886 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
5887 && extra_data->executable_jit != NULL
5888 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
5889 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
5890 return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
5891 start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
5892 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
5893 #endif
5894
5895 /* Carry on with non-JIT matching. This information is for finding all the
5896 numbers associated with a given name, for condition testing. */
5897
5898 md->name_table = (uschar *)re + re->name_table_offset;
5899 md->name_count = re->name_count;
5900 md->name_entry_size = re->name_entry_size;
5901
5902 /* Fish out the optional data from the extra_data structure, first setting
5903 the default values. */
5904
5905 study = NULL;
5906 md->match_limit = MATCH_LIMIT;
5907 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5908 md->callout_data = NULL;
5909
5910 /* The table pointer is always in native byte order. */
5911
5912 tables = external_re->tables;
5913
5914 if (extra_data != NULL)
5915 {
5916 register unsigned int flags = extra_data->flags;
5917 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5918 study = (const pcre_study_data *)extra_data->study_data;
5919 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5920 md->match_limit = extra_data->match_limit;
5921 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5922 md->match_limit_recursion = extra_data->match_limit_recursion;
5923 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5924 md->callout_data = extra_data->callout_data;
5925 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5926 }
5927
5928 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5929 is a feature that makes it possible to save compiled regex and re-use them
5930 in other programs later. */
5931
5932 if (tables == NULL) tables = _pcre_default_tables;
5933
5934 /* Check that the first field in the block is the magic number. If it is not,
5935 test for a regex that was compiled on a host of opposite endianness. If this is
5936 the case, flipped values are put in internal_re and internal_study if there was
5937 study data too. */
5938
5939 if (re->magic_number != MAGIC_NUMBER)
5940 {
5941 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5942 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5943 if (study != NULL) study = &internal_study;
5944 }
5945
5946 /* Set up other data */
5947
5948 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5949 startline = (re->flags & PCRE_STARTLINE) != 0;
5950 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5951
5952 /* The code starts after the real_pcre block and the capture name table. */
5953
5954 md->start_code = (const uschar *)external_re + re->name_table_offset +
5955 re->name_count * re->name_entry_size;
5956
5957 md->start_subject = (USPTR)subject;
5958 md->start_offset = start_offset;
5959 md->end_subject = md->start_subject + length;
5960 end_subject = md->end_subject;
5961
5962 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5963 md->use_ucp = (re->options & PCRE_UCP) != 0;
5964 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5965
5966 /* Some options are unpacked into BOOL variables in the hope that testing
5967 them will be faster than individual option bits. */
5968
5969 md->notbol = (options & PCRE_NOTBOL) != 0;
5970 md->noteol = (options & PCRE_NOTEOL) != 0;
5971 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5972 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5973
5974 md->hitend = FALSE;
5975 md->mark = NULL; /* In case never set */
5976
5977 md->recursive = NULL; /* No recursion at top level */
5978 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
5979
5980 md->lcc = tables + lcc_offset;
5981 md->ctypes = tables + ctypes_offset;
5982
5983 /* Handle different \R options. */
5984
5985 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5986 {
5987 case 0:
5988 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5989 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5990 else
5991 #ifdef BSR_ANYCRLF
5992 md->bsr_anycrlf = TRUE;
5993 #else
5994 md->bsr_anycrlf = FALSE;
5995 #endif
5996 break;
5997
5998 case PCRE_BSR_ANYCRLF:
5999 md->bsr_anycrlf = TRUE;
6000 break;
6001
6002 case PCRE_BSR_UNICODE:
6003 md->bsr_anycrlf = FALSE;
6004 break;
6005
6006 default: return PCRE_ERROR_BADNEWLINE;
6007 }
6008
6009 /* Handle different types of newline. The three bits give eight cases. If
6010 nothing is set at run time, whatever was used at compile time applies. */
6011
6012 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6013 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6014 {
6015 case 0: newline = NEWLINE; break; /* Compile-time default */
6016 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6017 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6018 case PCRE_NEWLINE_CR+
6019 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6020 case PCRE_NEWLINE_ANY: newline = -1; break;
6021 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6022 default: return PCRE_ERROR_BADNEWLINE;
6023 }
6024
6025 if (newline == -2)
6026 {
6027 md->nltype = NLTYPE_ANYCRLF;
6028 }
6029 else if (newline < 0)
6030 {
6031 md->nltype = NLTYPE_ANY;
6032 }
6033 else
6034 {
6035 md->nltype = NLTYPE_FIXED;
6036 if (newline > 255)
6037 {
6038 md->nllen = 2;
6039 md->nl[0] = (newline >> 8) & 255;
6040 md->nl[1] = newline & 255;
6041 }
6042 else
6043 {
6044 md->nllen = 1;
6045 md->nl[0] = newline;
6046 }
6047 }
6048
6049 /* Partial matching was originally supported only for a restricted set of
6050 regexes; from release 8.00 there are no restrictions, but the bits are still
6051 defined (though never set). So there's no harm in leaving this code. */
6052
6053 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6054 return PCRE_ERROR_BADPARTIAL;
6055
6056 /* If the expression has got more back references than the offsets supplied can
6057 hold, we get a temporary chunk of working store to use during the matching.
6058 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6059 of 3. */
6060
6061 ocount = offsetcount - (offsetcount % 3);
6062 arg_offset_max = (2*ocount)/3;
6063
6064 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6065 {
6066 ocount = re->top_backref * 3 + 3;
6067 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6068 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6069 using_temporary_offsets = TRUE;
6070 DPRINTF(("Got memory to hold back references\n"));
6071 }
6072 else md->offset_vector = offsets;
6073
6074 md->offset_end = ocount;
6075 md->offset_max = (2*ocount)/3;
6076 md->offset_overflow = FALSE;
6077 md->capture_last = -1;
6078
6079 /* Reset the working variable associated with each extraction. These should
6080 never be used unless previously set, but they get saved and restored, and so we
6081 initialize them to avoid reading uninitialized locations. Also, unset the
6082 offsets for the matched string. This is really just for tidiness with callouts,
6083 in case they inspect these fields. */
6084
6085 if (md->offset_vector != NULL)
6086 {
6087 register int *iptr = md->offset_vector + ocount;
6088 register int *iend = iptr - re->top_bracket;
6089 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6090 while (--iptr >= iend) *iptr = -1;
6091 md->offset_vector[0] = md->offset_vector[1] = -1;
6092 }
6093
6094 /* Set up the first character to match, if available. The first_byte value is
6095 never set for an anchored regular expression, but the anchoring may be forced
6096 at run time, so we have to test for anchoring. The first char may be unset for
6097 an unanchored pattern, of course. If there's no first char and the pattern was
6098 studied, there may be a bitmap of possible first characters. */
6099
6100 if (!anchored)
6101 {
6102 if ((re->flags & PCRE_FIRSTSET) != 0)
6103 {
6104 first_byte = re->first_byte & 255;
6105 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6106 first_byte = md->lcc[first_byte];
6107 }
6108 else
6109 if (!startline && study != NULL &&
6110 (study->flags & PCRE_STUDY_MAPPED) != 0)
6111 start_bits = study->start_bits;
6112 }
6113
6114 /* For anchored or unanchored matches, there may be a "last known required
6115 character" set. */
6116
6117 if ((re->flags & PCRE_REQCHSET) != 0)
6118 {
6119 req_byte = re->req_byte & 255;
6120 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6121 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6122 }
6123
6124
6125
6126
6127 /* ==========================================================================*/
6128
6129 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6130 the loop runs just once. */
6131
6132 for(;;)
6133 {
6134 USPTR save_end_subject = end_subject;
6135 USPTR new_start_match;
6136
6137 /* If firstline is TRUE, the start of the match is constrained to the first
6138 line of a multiline string. That is, the match must be before or at the first
6139 newline. Implement this by temporarily adjusting end_subject so that we stop
6140 scanning at a newline. If the match fails at the newline, later code breaks
6141 this loop. */
6142
6143 if (firstline)
6144 {
6145 USPTR t = start_match;
6146 #ifdef SUPPORT_UTF8
6147 if (utf8)
6148 {
6149 while (t < md->end_subject && !IS_NEWLINE(t))
6150 {
6151 t++;
6152 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6153 }
6154 }
6155 else
6156 #endif
6157 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6158 end_subject = t;
6159 }
6160
6161 /* There are some optimizations that avoid running the match if a known
6162 starting point is not found, or if a known later character is not present.
6163 However, there is an option that disables these, for testing and for ensuring
6164 that all callouts do actually occur. The option can be set in the regex by
6165 (*NO_START_OPT) or passed in match-time options. */
6166
6167 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6168 {
6169 /* Advance to a unique first byte if there is one. */
6170
6171 if (first_byte >= 0)
6172 {
6173 if (first_byte_caseless)
6174 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6175 start_match++;
6176 else
6177 while (start_match < end_subject && *start_match != first_byte)
6178 start_match++;
6179 }
6180
6181 /* Or to just after a linebreak for a multiline match */
6182
6183 else if (startline)
6184 {
6185 if (start_match > md->start_subject + start_offset)
6186 {
6187 #ifdef SUPPORT_UTF8
6188 if (utf8)
6189 {
6190 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6191 {
6192 start_match++;
6193 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6194 start_match++;
6195 }
6196 }
6197 else
6198 #endif
6199 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6200 start_match++;
6201
6202 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6203 and we are now at a LF, advance the match position by one more character.
6204 */
6205
6206 if (start_match[-1] == CHAR_CR &&
6207 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6208 start_match < end_subject &&
6209 *start_match == CHAR_NL)
6210 start_match++;
6211 }
6212 }
6213
6214 /* Or to a non-unique first byte after study */
6215
6216 else if (start_bits != NULL)
6217 {
6218 while (start_match < end_subject)
6219 {
6220 register unsigned int c = *start_match;
6221 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6222 {
6223 start_match++;
6224 #ifdef SUPPORT_UTF8
6225 if (utf8)
6226 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6227 start_match++;
6228 #endif
6229 }
6230 else break;
6231 }
6232 }
6233 } /* Starting optimizations */
6234
6235 /* Restore fudged end_subject */
6236
6237 end_subject = save_end_subject;
6238
6239 /* The following two optimizations are disabled for partial matching or if
6240 disabling is explicitly requested. */
6241
6242 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6243 {
6244 /* If the pattern was studied, a minimum subject length may be set. This is
6245 a lower bound; no actual string of that length may actually match the
6246 pattern. Although the value is, strictly, in characters, we treat it as
6247 bytes to avoid spending too much time in this optimization. */
6248
6249 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6250 (pcre_uint32)(end_subject - start_match) < study->minlength)
6251 {
6252 rc = MATCH_NOMATCH;
6253 break;
6254 }
6255
6256 /* If req_byte is set, we know that that character must appear in the
6257 subject for the match to succeed. If the first character is set, req_byte
6258 must be later in the subject; otherwise the test starts at the match point.
6259 This optimization can save a huge amount of backtracking in patterns with
6260 nested unlimited repeats that aren't going to match. Writing separate code
6261 for cased/caseless versions makes it go faster, as does using an
6262 autoincrement and backing off on a match.
6263
6264 HOWEVER: when the subject string is very, very long, searching to its end
6265 can take a long time, and give bad performance on quite ordinary patterns.
6266 This showed up when somebody was matching something like /^\d+C/ on a
6267 32-megabyte string... so we don't do this when the string is sufficiently
6268 long. */
6269
6270 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6271 {
6272 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6273
6274 /* We don't need to repeat the search if we haven't yet reached the
6275 place we found it at last time. */
6276
6277 if (p > req_byte_ptr)
6278 {
6279 if (req_byte_caseless)
6280 {
6281 while (p < end_subject)
6282 {
6283 register int pp = *p++;
6284 if (pp == req_byte || pp == req_byte2) { p--; break; }
6285 }
6286 }
6287 else
6288 {
6289 while (p < end_subject)
6290 {
6291 if (*p++ == req_byte) { p--; break; }
6292 }
6293 }
6294
6295 /* If we can't find the required character, break the matching loop,
6296 forcing a match failure. */
6297
6298 if (p >= end_subject)
6299 {
6300 rc = MATCH_NOMATCH;
6301 break;
6302 }
6303
6304 /* If we have found the required character, save the point where we
6305 found it, so that we don't search again next time round the loop if
6306 the start hasn't passed this character yet. */
6307
6308 req_byte_ptr = p;
6309 }
6310 }
6311 }
6312
6313 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6314 printf(">>>> Match against: ");
6315 pchars(start_match, end_subject - start_match, TRUE, md);
6316 printf("\n");
6317 #endif
6318
6319 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6320 first starting point for which a partial match was found. */
6321
6322 md->start_match_ptr = start_match;
6323 md->start_used_ptr = start_match;
6324 md->match_call_count = 0;
6325 md->match_function_type = 0;
6326 md->end_offset_top = 0;
6327 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6328 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6329
6330 switch(rc)
6331 {
6332 /* SKIP passes back the next starting point explicitly, but if it is the
6333 same as the match we have just done, treat it as NOMATCH. */
6334
6335 case MATCH_SKIP:
6336 if (md->start_match_ptr != start_match)
6337 {
6338 new_start_match = md->start_match_ptr;
6339 break;
6340 }
6341 /* Fall through */
6342
6343 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6344 the SKIP's arg was not found. We also treat this as NOMATCH. */
6345
6346 case MATCH_SKIP_ARG:
6347 /* Fall through */
6348
6349 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6350 exactly like PRUNE. */
6351
6352 case MATCH_NOMATCH:
6353 case MATCH_PRUNE:
6354 case MATCH_THEN:
6355 new_start_match = start_match + 1;
6356 #ifdef SUPPORT_UTF8
6357 if (utf8)
6358 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6359 new_start_match++;
6360 #endif
6361 break;
6362
6363 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6364
6365 case MATCH_COMMIT:
6366 rc = MATCH_NOMATCH;
6367 goto ENDLOOP;
6368
6369 /* Any other return is either a match, or some kind of error. */
6370
6371 default:
6372 goto ENDLOOP;
6373 }
6374
6375 /* Control reaches here for the various types of "no match at this point"
6376 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6377
6378 rc = MATCH_NOMATCH;
6379
6380 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6381 newline in the subject (though it may continue over the newline). Therefore,
6382 if we have just failed to match, starting at a newline, do not continue. */
6383
6384 if (firstline && IS_NEWLINE(start_match)) break;
6385
6386 /* Advance to new matching position */
6387
6388 start_match = new_start_match;
6389
6390 /* Break the loop if the pattern is anchored or if we have passed the end of
6391 the subject. */
6392
6393 if (anchored || start_match > end_subject) break;
6394
6395 /* If we have just passed a CR and we are now at a LF, and the pattern does
6396 not contain any explicit matches for \r or \n, and the newline option is CRLF
6397 or ANY or ANYCRLF, advance the match position by one more character. */
6398
6399 if (start_match[-1] == CHAR_CR &&
6400 start_match < end_subject &&
6401 *start_match == CHAR_NL &&
6402 (re->flags & PCRE_HASCRORLF) == 0 &&
6403 (md->nltype == NLTYPE_ANY ||
6404 md->nltype == NLTYPE_ANYCRLF ||
6405 md->nllen == 2))
6406 start_match++;
6407
6408 md->mark = NULL; /* Reset for start of next match attempt */
6409 } /* End of for(;;) "bumpalong" loop */
6410
6411 /* ==========================================================================*/
6412
6413 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6414 conditions is true:
6415
6416 (1) The pattern is anchored or the match was failed by (*COMMIT);
6417
6418 (2) We are past the end of the subject;
6419
6420 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6421 this option requests that a match occur at or before the first newline in
6422 the subject.
6423
6424 When we have a match and the offset vector is big enough to deal with any
6425 backreferences, captured substring offsets will already be set up. In the case
6426 where we had to get some local store to hold offsets for backreference
6427 processing, copy those that we can. In this case there need not be overflow if
6428 certain parts of the pattern were not used, even though there are more
6429 capturing parentheses than vector slots. */
6430
6431 ENDLOOP:
6432
6433 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6434 {
6435 if (using_temporary_offsets)
6436 {
6437 if (arg_offset_max >= 4)
6438 {
6439 memcpy(offsets + 2, md->offset_vector + 2,
6440 (arg_offset_max - 2) * sizeof(int));
6441 DPRINTF(("Copied offsets from temporary memory\n"));
6442 }
6443 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6444 DPRINTF(("Freeing temporary memory\n"));
6445 (pcre_free)(md->offset_vector);
6446 }
6447
6448 /* Set the return code to the number of captured strings, or 0 if there were
6449 too many to fit into the vector. */
6450
6451 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6452 0 : md->end_offset_top/2;
6453
6454 /* If there is space in the offset vector, set any unused pairs at the end of
6455 the pattern to -1 for backwards compatibility. It is documented that this
6456 happens. In earlier versions, the whole set of potential capturing offsets
6457 was set to -1 each time round the loop, but this is handled differently now.
6458 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6459 those at the end that need unsetting here. We can't just unset them all at
6460 the start of the whole thing because they may get set in one branch that is
6461 not the final matching branch. */
6462
6463 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6464 {
6465 register int *iptr, *iend;
6466 int resetcount = 2 + re->top_bracket * 2;
6467 if (resetcount > offsetcount) resetcount = ocount;
6468 iptr = offsets + md->end_offset_top;
6469 iend = offsets + resetcount;
6470 while (iptr < iend) *iptr++ = -1;
6471 }
6472
6473 /* If there is space, set up the whole thing as substring 0. The value of
6474 md->start_match_ptr might be modified if \K was encountered on the success
6475 matching path. */
6476
6477 if (offsetcount < 2) rc = 0; else
6478 {
6479 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6480 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6481 }
6482
6483 DPRINTF((">>>> returning %d\n", rc));
6484 goto RETURN_MARK;
6485 }
6486
6487 /* Control gets here if there has been an error, or if the overall match
6488 attempt has failed at all permitted starting positions. */
6489
6490 if (using_temporary_offsets)
6491 {
6492 DPRINTF(("Freeing temporary memory\n"));
6493 (pcre_free)(md->offset_vector);
6494 }
6495
6496 /* For anything other than nomatch or partial match, just return the code. */
6497
6498 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6499 {
6500 DPRINTF((">>>> error: returning %d\n", rc));
6501 return rc;
6502 }
6503
6504 /* Handle partial matches - disable any mark data */
6505
6506 if (start_partial != NULL)
6507 {
6508 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6509 md->mark = NULL;
6510 if (offsetcount > 1)
6511 {
6512 offsets[0] = (int)(start_partial - (USPTR)subject);
6513 offsets[1] = (int)(end_subject - (USPTR)subject);
6514 }
6515 rc = PCRE_ERROR_PARTIAL;
6516 }
6517
6518 /* This is the classic nomatch case */
6519
6520 else
6521 {
6522 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6523 rc = PCRE_ERROR_NOMATCH;
6524 }
6525
6526 /* Return the MARK data if it has been requested. */
6527
6528 RETURN_MARK:
6529
6530 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6531 *(extra_data->mark) = (unsigned char *)(md->mark);
6532 return rc;
6533 }
6534
6535 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5