/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 699 - (show annotations)
Tue Sep 20 10:46:54 2011 UTC (8 years ago) by ph10
File MIME type: text/plain
File size: 196376 byte(s)
Fix *THEN in condition issue.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
/* The two values below are stored in md->match_function_type by a caller
immediately before it invokes match(); match() clears the field again once it
has acted on MATCH_CBEGROUP. */
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1 /* The pattern matched */
71 #define MATCH_NOMATCH 0 /* The pattern failed to match */
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999) /* presumably raised by (*ACCEPT); handler not in view - confirm */
77 #define MATCH_COMMIT (-998) /* Returned by OP_COMMIT when what follows it fails */
78 #define MATCH_KETRPOS (-997) /* Returned by OP_KETRPOS at the end of a possessive group */
79 #define MATCH_ONCE (-996) /* Backing up through an atomic group that had matched */
80 #define MATCH_PRUNE (-995) /* Returned by OP_PRUNE/OP_PRUNE_ARG when what follows fails */
81 #define MATCH_SKIP (-994) /* Returned by OP_SKIP; position is in md->start_match_ptr */
82 #define MATCH_SKIP_ARG (-993) /* Returned by OP_SKIP_ARG; name is in md->start_match_ptr */
83 #define MATCH_THEN (-992) /* Returned by OP_THEN/OP_THEN_ARG; branch start is in md->start_match_ptr */
84
85 /* This is a convenience macro for code that occurs many times. */
86
/* MRRETURN(ra): copy the local markptr into md->mark and then leave via
RRETURN(ra). It expands to a brace-enclosed block and refers to md and markptr
by name, so both must be in scope wherever it is used. */
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
/* REC_STACK_SAVE_MAX is the element count of the stacksave[] array that is
declared both as a local of match() and as a member of the heap frame. */
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
/* Both tables have six entries and are read with the same index, giving the
(min,max) pairs (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1).
NOTE(review): presumably these are the greedy/minimizing forms of *, + and ?
in opcode order - the indexing code is not in view here, so confirm. */
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,
  RM8,     RM9,  RM10, RM11, RM12, RM13, RM14,
  RM15,    RM16, RM17, RM18, RM19, RM20, RM21,
  RM22,    RM23, RM24, RM25, RM26, RM27, RM28,
  RM29,    RM30, RM31, RM32, RM33, RM34, RM35,
  RM36,    RM37, RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48, RM49,
  RM50,    RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
/* Stack-recursion build. REGISTER expands to the "register" keyword; it is
applied to the eptr and ecode parameters of match(). */
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
/* Debugging forms. RMATCH calls match() one level deeper (rdepth+1), passing
the caller's mstart and markptr through unchanged and leaving the result in
rrc; a trace line is printed before and after the call. RRETURN prints the
value and line number and then returns. Both expand to brace blocks. */
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
/* Production forms: a plain recursive call whose result is stored in rrc, and
a plain return statement. Neither supplies its own terminating semicolon. */
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
/* Heap-frame build: REGISTER expands to nothing. */
314 #define REGISTER
315
/* RMATCH simulates a recursive call. It obtains a new heapframe with
pcre_stack_malloc (giving PCRE_ERROR_NOMEMORY via RRETURN if that fails),
records rw in the current frame's Xwhere, fills the new frame's argument slots
from ra, rb, rc and re plus the current mstart and markptr, sets its depth to
one more than the current depth, links it to the current frame, makes it the
current frame, and jumps to HEAP_RECURSE. The label L_<rw> that follows is the
point at which execution continues afterwards.
NOTE(review): the code at HEAP_RETURN is not in view here; presumably it
dispatches on Xwhere to reach L_<rw> - confirm. */
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
/* RRETURN simulates a return. It makes the previous frame current and releases
the old one with pcre_stack_free. If a previous frame exists, the value is
placed in rrc and control goes to HEAP_RETURN; otherwise this was the top-level
frame and the function really returns.
NOTE(review): ra is expanded only after frame has been switched (and the old
frame freed), so an argument that names a frame-backed variable would be read
from the wrong frame, or through a NULL frame at the top level. */
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0), eptr is
at or beyond md->end_subject, and eptr is past md->start_used_ptr, set
md->hitend. If md->partial is greater than 1, also return PCRE_ERROR_PARTIAL at
once through MRRETURN.
NOTE(review): the macro expands to an unbraced "if" statement, so an "else"
written directly after an invocation would attach to it. */
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
/* SCHECK_PARTIAL: the same actions without the end-of-subject test, for call
sites that have already established that the end has been reached. */
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 the alt that is at the start of the current branch. This makes it possible
780 to skip back past alternatives that precede the THEN within the current
781 branch. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode - GET(ecode, 1);
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 offset_top, md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode - GET(ecode, 1);
795 md->mark = ecode + LINK_SIZE + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle a capturing bracket, other than those that are possessive with an
799 unlimited repeat. If there is space in the offset vector, save the current
800 subject position in the working slot at the top of the vector. We mustn't
801 change the current values of the data slot, because they may be set from a
802 previous iteration of this group, and be referred to by a reference inside
803 the group. A failure to match might occur after the group has succeeded,
804 if something later on doesn't match. For this reason, we need to restore
805 the working value and also the values of the final offsets, in case they
806 were set by a previous iteration of the same bracket.
807
808 If there isn't enough space in the offset vector, treat this as if it were
809 a non-capturing bracket. Don't worry about setting the flag for the error
810 case here; that is handled in the code for KET. */
811
812 case OP_CBRA:
813 case OP_SCBRA:
814 number = GET2(ecode, 1+LINK_SIZE);
815 offset = number << 1;
816
817 #ifdef PCRE_DEBUG
818 printf("start bracket %d\n", number);
819 printf("subject=");
820 pchars(eptr, 16, TRUE, md);
821 printf("\n");
822 #endif
823
824 if (offset < md->offset_max)
825 {
826 save_offset1 = md->offset_vector[offset];
827 save_offset2 = md->offset_vector[offset+1];
828 save_offset3 = md->offset_vector[md->offset_end - number];
829 save_capture_last = md->capture_last;
830
831 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 md->offset_vector[md->offset_end - number] =
833 (int)(eptr - md->start_subject);
834
835 for (;;)
836 {
837 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 eptrb, RM1);
840 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 if (rrc != MATCH_NOMATCH &&
842 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843 RRETURN(rrc);
844 md->capture_last = save_capture_last;
845 ecode += GET(ecode, 1);
846 if (*ecode != OP_ALT) break;
847 }
848
849 DPRINTF(("bracket %d failed\n", number));
850 md->offset_vector[offset] = save_offset1;
851 md->offset_vector[offset+1] = save_offset2;
852 md->offset_vector[md->offset_end - number] = save_offset3;
853
854 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 MATCH_THEN. */
856
857 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 }
860
861 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862 as a non-capturing bracket. */
863
864 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866
867 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872 /* Non-capturing or atomic group, except for possessive with unlimited
873 repeat. Loop for all the alternatives. When we get to the final alternative
874 within the brackets, we used to return the result of a recursive call to
875 match() whatever happened so it was possible to reduce stack usage by
876 turning this into a tail recursion, except in the case of a possibly empty
877 group. However, now that there is the possibility of (*THEN) occurring in
878 the final alternative, this optimization is no longer possible.
879
880 MATCH_ONCE is returned when the end of an atomic group is successfully
881 reached, but subsequent matching fails. It passes back up the tree (causing
882 captured values to be reset) until the original atomic group level is
883 reached. This is tested by comparing md->once_target with the start of the
884 group. At this point, the return is converted into MATCH_NOMATCH so that
885 previous backup points can be taken. */
886
887 case OP_ONCE:
888 case OP_BRA:
889 case OP_SBRA:
890 DPRINTF(("start non-capturing bracket\n"));
891
892 for (;;)
893 {
894 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 RM2);
897 if (rrc != MATCH_NOMATCH &&
898 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 {
900 if (rrc == MATCH_ONCE)
901 {
902 const uschar *scode = ecode;
903 if (*scode != OP_ONCE) /* If not at start, find it */
904 {
905 while (*scode == OP_ALT) scode += GET(scode, 1);
906 scode -= GET(scode, 1);
907 }
908 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 }
910 RRETURN(rrc);
911 }
912 ecode += GET(ecode, 1);
913 if (*ecode != OP_ALT) break;
914 }
915 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916 RRETURN(MATCH_NOMATCH);
917
918 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
920 handled similarly to the normal case above. However, the matching is
921 different. The end of these brackets will always be OP_KETRPOS, which
922 returns MATCH_KETRPOS without going further in the pattern. By this means
923 we can handle the group by iteration rather than recursion, thereby
924 reducing the amount of stack needed. */
925
926 case OP_CBRAPOS:
927 case OP_SCBRAPOS:
928 allow_zero = FALSE;
929
930 POSSESSIVE_CAPTURE:
931 number = GET2(ecode, 1+LINK_SIZE);
932 offset = number << 1;
933
934 #ifdef PCRE_DEBUG
935 printf("start possessive bracket %d\n", number);
936 printf("subject=");
937 pchars(eptr, 16, TRUE, md);
938 printf("\n");
939 #endif
940
941 if (offset < md->offset_max)
942 {
943 matched_once = FALSE;
944 code_offset = ecode - md->start_code;
945
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950
951 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952
953 /* Each time round the loop, save the current subject position for use
954 when the group matches. For MATCH_MATCH, the group has matched, so we
955 restart it with a new subject starting position, remembering that we had
956 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957 usual. If we haven't matched any alternatives in any iteration, check to
958 see if a previous iteration matched. If so, the group has matched;
959 continue from afterwards. Otherwise it has failed; restore the previous
960 capture values before returning NOMATCH. */
961
962 for (;;)
963 {
964 md->offset_vector[md->offset_end - number] =
965 (int)(eptr - md->start_subject);
966 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968 eptrb, RM63);
969 if (rrc == MATCH_KETRPOS)
970 {
971 offset_top = md->end_offset_top;
972 eptr = md->end_match_ptr;
973 ecode = md->start_code + code_offset;
974 save_capture_last = md->capture_last;
975 matched_once = TRUE;
976 continue;
977 }
978 if (rrc != MATCH_NOMATCH &&
979 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980 RRETURN(rrc);
981 md->capture_last = save_capture_last;
982 ecode += GET(ecode, 1);
983 if (*ecode != OP_ALT) break;
984 }
985
986 if (!matched_once)
987 {
988 md->offset_vector[offset] = save_offset1;
989 md->offset_vector[offset+1] = save_offset2;
990 md->offset_vector[md->offset_end - number] = save_offset3;
991 }
992
993 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 if (allow_zero || matched_once)
995 {
996 ecode += 1 + LINK_SIZE;
997 break;
998 }
999
1000 RRETURN(MATCH_NOMATCH);
1001 }
1002
1003 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004 as a non-capturing bracket. */
1005
1006 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008
1009 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010
1011 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013
1014 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1016 without the capturing complication. It is written out separately for speed
1017 and cleanliness. */
1018
1019 case OP_BRAPOS:
1020 case OP_SBRAPOS:
1021 allow_zero = FALSE;
1022
1023 POSSESSIVE_NON_CAPTURE:
1024 matched_once = FALSE;
1025 code_offset = ecode - md->start_code;
1026
1027 for (;;)
1028 {
1029 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 eptrb, RM48);
1032 if (rrc == MATCH_KETRPOS)
1033 {
1034 offset_top = md->end_offset_top;
1035 eptr = md->end_match_ptr;
1036 ecode = md->start_code + code_offset;
1037 matched_once = TRUE;
1038 continue;
1039 }
1040 if (rrc != MATCH_NOMATCH &&
1041 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042 RRETURN(rrc);
1043 ecode += GET(ecode, 1);
1044 if (*ecode != OP_ALT) break;
1045 }
1046
1047 if (matched_once || allow_zero)
1048 {
1049 ecode += 1 + LINK_SIZE;
1050 break;
1051 }
1052 RRETURN(MATCH_NOMATCH);
1053
1054 /* Control never reaches here. */
1055
1056 /* Conditional group: compilation checked that there are no more than
1057 two branches. If the condition is false, skipping the first branch takes us
1058 past the end if there is only one branch, but that's OK because that is
1059 exactly what going to the ket would do. */
1060
1061 case OP_COND:
1062 case OP_SCOND:
1063 codelink = GET(ecode, 1);
1064
1065 /* Because of the way auto-callout works during compile, a callout item is
1066 inserted between OP_COND and an assertion condition. */
1067
1068 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069 {
1070 if (pcre_callout != NULL)
1071 {
1072 pcre_callout_block cb;
1073 cb.version = 2; /* Version 2 of the callout block */
1074 cb.callout_number = ecode[LINK_SIZE+2];
1075 cb.offset_vector = md->offset_vector;
1076 cb.subject = (PCRE_SPTR)md->start_subject;
1077 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078 cb.start_match = (int)(mstart - md->start_subject);
1079 cb.current_position = (int)(eptr - md->start_subject);
1080 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082 cb.capture_top = offset_top/2;
1083 cb.capture_last = md->capture_last;
1084 cb.callout_data = md->callout_data;
1085 cb.mark = markptr;
1086 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1087 if (rrc < 0) RRETURN(rrc);
1088 }
1089 ecode += _pcre_OP_lengths[OP_CALLOUT];
1090 }
1091
1092 condcode = ecode[LINK_SIZE+1];
1093
1094 /* Now see what the actual condition is */
1095
1096 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1097 {
1098 if (md->recursive == NULL) /* Not recursing => FALSE */
1099 {
1100 condition = FALSE;
1101 ecode += GET(ecode, 1);
1102 }
1103 else
1104 {
1105 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1106 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1107
1108 /* If the test is for recursion into a specific subpattern, and it is
1109 false, but the test was set up by name, scan the table to see if the
1110 name refers to any other numbers, and test them. The condition is true
1111 if any one is set. */
1112
1113 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1114 {
1115 uschar *slotA = md->name_table;
1116 for (i = 0; i < md->name_count; i++)
1117 {
1118 if (GET2(slotA, 0) == recno) break;
1119 slotA += md->name_entry_size;
1120 }
1121
1122 /* Found a name for the number - there can be only one; duplicate
1123 names for different numbers are allowed, but not vice versa. First
1124 scan down for duplicates. */
1125
1126 if (i < md->name_count)
1127 {
1128 uschar *slotB = slotA;
1129 while (slotB > md->name_table)
1130 {
1131 slotB -= md->name_entry_size;
1132 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1133 {
1134 condition = GET2(slotB, 0) == md->recursive->group_num;
1135 if (condition) break;
1136 }
1137 else break;
1138 }
1139
1140 /* Scan up for duplicates */
1141
1142 if (!condition)
1143 {
1144 slotB = slotA;
1145 for (i++; i < md->name_count; i++)
1146 {
1147 slotB += md->name_entry_size;
1148 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1149 {
1150 condition = GET2(slotB, 0) == md->recursive->group_num;
1151 if (condition) break;
1152 }
1153 else break;
1154 }
1155 }
1156 }
1157 }
1158
1159 /* Choose branch according to the condition */
1160
1161 ecode += condition? 3 : GET(ecode, 1);
1162 }
1163 }
1164
1165 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1166 {
1167 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1168 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1169
1170 /* If the numbered capture is unset, but the reference was by name,
1171 scan the table to see if the name refers to any other numbers, and test
1172 them. The condition is true if any one is set. This is tediously similar
1173 to the code above, but not close enough to try to amalgamate. */
1174
1175 if (!condition && condcode == OP_NCREF)
1176 {
1177 int refno = offset >> 1;
1178 uschar *slotA = md->name_table;
1179
1180 for (i = 0; i < md->name_count; i++)
1181 {
1182 if (GET2(slotA, 0) == refno) break;
1183 slotA += md->name_entry_size;
1184 }
1185
1186 /* Found a name for the number - there can be only one; duplicate names
1187 for different numbers are allowed, but not vice versa. First scan down
1188 for duplicates. */
1189
1190 if (i < md->name_count)
1191 {
1192 uschar *slotB = slotA;
1193 while (slotB > md->name_table)
1194 {
1195 slotB -= md->name_entry_size;
1196 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1197 {
1198 offset = GET2(slotB, 0) << 1;
1199 condition = offset < offset_top &&
1200 md->offset_vector[offset] >= 0;
1201 if (condition) break;
1202 }
1203 else break;
1204 }
1205
1206 /* Scan up for duplicates */
1207
1208 if (!condition)
1209 {
1210 slotB = slotA;
1211 for (i++; i < md->name_count; i++)
1212 {
1213 slotB += md->name_entry_size;
1214 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1215 {
1216 offset = GET2(slotB, 0) << 1;
1217 condition = offset < offset_top &&
1218 md->offset_vector[offset] >= 0;
1219 if (condition) break;
1220 }
1221 else break;
1222 }
1223 }
1224 }
1225 }
1226
1227 /* Choose branch according to the condition */
1228
1229 ecode += condition? 3 : GET(ecode, 1);
1230 }
1231
1232 else if (condcode == OP_DEF) /* DEFINE - always false */
1233 {
1234 condition = FALSE;
1235 ecode += GET(ecode, 1);
1236 }
1237
1238 /* The condition is an assertion. Call match() to evaluate it - setting
1239 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1240 an assertion. */
1241
1242 else
1243 {
1244 md->match_function_type = MATCH_CONDASSERT;
1245 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1246 if (rrc == MATCH_MATCH)
1247 {
1248 if (md->end_offset_top > offset_top)
1249 offset_top = md->end_offset_top; /* Captures may have happened */
1250 condition = TRUE;
1251 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1252 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1253 }
1254 else if (rrc != MATCH_NOMATCH &&
1255 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1256 {
1257 RRETURN(rrc); /* Need braces because of following else */
1258 }
1259 else
1260 {
1261 condition = FALSE;
1262 ecode += codelink;
1263 }
1264 }
1265
1266 /* We are now at the branch that is to be obeyed. As there is only one,
1267 we used to use tail recursion to avoid using another stack frame, except
1268 when there was unlimited repeat of a possibly empty group. However, that
1269 strategy no longer works because of the possibility of (*THEN) being
1270 encountered in the branch. A recursive call to match() is always required,
1271 unless the second alternative doesn't exist, in which case we can just
1272 plough on. */
1273
1274 if (condition || *ecode == OP_ALT)
1275 {
1276 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1277 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1278
1279 /* If the result is THEN from within the "true" branch of the condition,
1280 md->start_match_ptr will point to the original OP_COND, not to the start
1281 of the branch, so we have to do work to see if it matches. If THEN comes
1282 from the "false" branch, md->start_match_ptr does point to OP_ALT. */
1283
1284 if (rrc == MATCH_THEN)
1285 {
1286 if (*ecode != OP_ALT)
1287 {
1288 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1289 ecode -= GET(ecode, 1);
1290 }
1291 if (md->start_match_ptr == ecode) rrc = MATCH_NOMATCH;
1292 }
1293 RRETURN(rrc);
1294 }
1295 else /* Condition false & no alternative */
1296 {
1297 ecode += 1 + LINK_SIZE;
1298 }
1299 break;
1300
1301
1302 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1303 to close any currently open capturing brackets. */
1304
1305 case OP_CLOSE:
1306 number = GET2(ecode, 1);
1307 offset = number << 1;
1308
1309 #ifdef PCRE_DEBUG
1310 printf("end bracket %d at *ACCEPT", number);
1311 printf("\n");
1312 #endif
1313
1314 md->capture_last = number;
1315 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1316 {
1317 md->offset_vector[offset] =
1318 md->offset_vector[md->offset_end - number];
1319 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1320 if (offset_top <= offset) offset_top = offset + 2;
1321 }
1322 ecode += 3;
1323 break;
1324
1325
1326 /* End of the pattern, either real or forced. */
1327
1328 case OP_END:
1329 case OP_ACCEPT:
1330 case OP_ASSERT_ACCEPT:
1331
1332 /* If we have matched an empty string, fail if not in an assertion and not
1333 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1334 is set and we have matched at the start of the subject. In both cases,
1335 backtracking will then try other alternatives, if any. */
1336
1337 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1338 md->recursive == NULL &&
1339 (md->notempty ||
1340 (md->notempty_atstart &&
1341 mstart == md->start_subject + md->start_offset)))
1342 MRRETURN(MATCH_NOMATCH);
1343
1344 /* Otherwise, we have a match. */
1345
1346 md->end_match_ptr = eptr; /* Record where we ended */
1347 md->end_offset_top = offset_top; /* and how many extracts were taken */
1348 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1349
1350 /* For some reason, the macros don't work properly if an expression is
1351 given as the argument to MRRETURN when the heap is in use. */
1352
1353 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1354 MRRETURN(rrc);
1355
1356 /* Assertion brackets. Check the alternative branches in turn - the
1357 matching won't pass the KET for an assertion. If any one branch matches,
1358 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1359 start of each branch to move the current point backwards, so the code at
1360 this level is identical to the lookahead case. When the assertion is part
1361 of a condition, we want to return immediately afterwards. The caller of
1362 this incarnation of the match() function will have set MATCH_CONDASSERT in
1363 md->match_function_type, and one of these opcodes will be the first opcode
1364 that is processed. We use a local variable that is preserved over calls to
1365 match() to remember this case. */
1366
1367 case OP_ASSERT:
1368 case OP_ASSERTBACK:
1369 if (md->match_function_type == MATCH_CONDASSERT)
1370 {
1371 condassert = TRUE;
1372 md->match_function_type = 0;
1373 }
1374 else condassert = FALSE;
1375
1376 do
1377 {
1378 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1379 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1380 {
1381 mstart = md->start_match_ptr; /* In case \K reset it */
1382 markptr = md->mark;
1383 break;
1384 }
1385 if (rrc != MATCH_NOMATCH &&
1386 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1387 RRETURN(rrc);
1388 ecode += GET(ecode, 1);
1389 }
1390 while (*ecode == OP_ALT);
1391
1392 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1393
1394 /* If checking an assertion for a condition, return MATCH_MATCH. */
1395
1396 if (condassert) RRETURN(MATCH_MATCH);
1397
1398 /* Continue from after the assertion, updating the offsets high water
1399 mark, since extracts may have been taken during the assertion. */
1400
1401 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1402 ecode += 1 + LINK_SIZE;
1403 offset_top = md->end_offset_top;
1404 continue;
1405
1406 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1407 PRUNE, or COMMIT means we must assume failure without checking subsequent
1408 branches. */
1409
1410 case OP_ASSERT_NOT:
1411 case OP_ASSERTBACK_NOT:
1412 if (md->match_function_type == MATCH_CONDASSERT)
1413 {
1414 condassert = TRUE;
1415 md->match_function_type = 0;
1416 }
1417 else condassert = FALSE;
1418
1419 do
1420 {
1421 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1422 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1423 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1424 {
1425 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1426 break;
1427 }
1428 if (rrc != MATCH_NOMATCH &&
1429 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1430 RRETURN(rrc);
1431 ecode += GET(ecode,1);
1432 }
1433 while (*ecode == OP_ALT);
1434
1435 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1436
1437 ecode += 1 + LINK_SIZE;
1438 continue;
1439
1440 /* Move the subject pointer back. This occurs only at the start of
1441 each branch of a lookbehind assertion. If we are too close to the start to
1442 move back, this match function fails. When working with UTF-8 we move
1443 back a number of characters, not bytes. */
1444
1445 case OP_REVERSE:
1446 #ifdef SUPPORT_UTF8
1447 if (utf8)
1448 {
1449 i = GET(ecode, 1);
1450 while (i-- > 0)
1451 {
1452 eptr--;
1453 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1454 BACKCHAR(eptr);
1455 }
1456 }
1457 else
1458 #endif
1459
1460 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1461
1462 {
1463 eptr -= GET(ecode, 1);
1464 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1465 }
1466
1467 /* Save the earliest consulted character, then skip to next op code */
1468
1469 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1470 ecode += 1 + LINK_SIZE;
1471 break;
1472
1473 /* The callout item calls an external function, if one is provided, passing
1474 details of the match so far. This is mainly for debugging, though the
1475 function is able to force a failure. */
1476
1477 case OP_CALLOUT:
1478 if (pcre_callout != NULL)
1479 {
1480 pcre_callout_block cb;
1481 cb.version = 2; /* Version 2 of the callout block */
1482 cb.callout_number = ecode[1];
1483 cb.offset_vector = md->offset_vector;
1484 cb.subject = (PCRE_SPTR)md->start_subject;
1485 cb.subject_length = (int)(md->end_subject - md->start_subject);
1486 cb.start_match = (int)(mstart - md->start_subject);
1487 cb.current_position = (int)(eptr - md->start_subject);
1488 cb.pattern_position = GET(ecode, 2);
1489 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1490 cb.capture_top = offset_top/2;
1491 cb.capture_last = md->capture_last;
1492 cb.callout_data = md->callout_data;
1493 cb.mark = markptr;
1494 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1495 if (rrc < 0) RRETURN(rrc);
1496 }
1497 ecode += 2 + 2*LINK_SIZE;
1498 break;
1499
1500 /* Recursion either matches the current regex, or some subexpression. The
1501 offset data is the offset to the starting bracket from the start of the
1502 whole pattern. (This is so that it works from duplicated subpatterns.)
1503
1504 The state of the capturing groups is preserved over recursion, and
1505 re-instated afterwards. We don't know how many are started and not yet
1506 finished (offset_top records the completed total) so we just have to save
1507 all the potential data. There may be up to 65535 such values, which is too
1508 large to put on the stack, but using malloc for small numbers seems
1509 expensive. As a compromise, the stack is used when there are no more than
1510 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1511
1512 There are also other values that have to be saved. We use a chained
1513 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1514 for the original version of this logic. It has, however, been hacked around
1515 a lot, so he is not to blame for the current way it works. */
1516
1517 case OP_RECURSE:
1518 {
1519 recursion_info *ri;
1520 int recno;
1521
1522 callpat = md->start_code + GET(ecode, 1);
1523 recno = (callpat == md->start_code)? 0 :
1524 GET2(callpat, 1 + LINK_SIZE);
1525
1526 /* Check for repeating a recursion without advancing the subject pointer.
1527 This should catch convoluted mutual recursions. (Some simple cases are
1528 caught at compile time.) */
1529
1530 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1531 if (recno == ri->group_num && eptr == ri->subject_position)
1532 RRETURN(PCRE_ERROR_RECURSELOOP);
1533
1534 /* Add to "recursing stack" */
1535
1536 new_recursive.group_num = recno;
1537 new_recursive.subject_position = eptr;
1538 new_recursive.prevrec = md->recursive;
1539 md->recursive = &new_recursive;
1540
1541 /* Where to continue from afterwards */
1542
1543 ecode += 1 + LINK_SIZE;
1544
1545 /* Now save the offset data */
1546
1547 new_recursive.saved_max = md->offset_end;
1548 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1549 new_recursive.offset_save = stacksave;
1550 else
1551 {
1552 new_recursive.offset_save =
1553 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1554 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1555 }
1556 memcpy(new_recursive.offset_save, md->offset_vector,
1557 new_recursive.saved_max * sizeof(int));
1558
1559 /* OK, now we can do the recursion. After processing each alternative,
1560 restore the offset data. If there were nested recursions, md->recursive
1561 might be changed, so reset it before looping. */
1562
1563 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1564 cbegroup = (*callpat >= OP_SBRA);
1565 do
1566 {
1567 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1568 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1569 md, eptrb, RM6);
1570 memcpy(md->offset_vector, new_recursive.offset_save,
1571 new_recursive.saved_max * sizeof(int));
1572 md->recursive = new_recursive.prevrec;
1573 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1574 {
1575 DPRINTF(("Recursion matched\n"));
1576 if (new_recursive.offset_save != stacksave)
1577 (pcre_free)(new_recursive.offset_save);
1578
1579 /* Set where we got to in the subject, and reset the start in case
1580 it was changed by \K. This *is* propagated back out of a recursion,
1581 for Perl compatibility. */
1582
1583 eptr = md->end_match_ptr;
1584 mstart = md->start_match_ptr;
1585 goto RECURSION_MATCHED; /* Exit loop; end processing */
1586 }
1587 else if (rrc != MATCH_NOMATCH &&
1588 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1589 {
1590 DPRINTF(("Recursion gave error %d\n", rrc));
1591 if (new_recursive.offset_save != stacksave)
1592 (pcre_free)(new_recursive.offset_save);
1593 RRETURN(rrc);
1594 }
1595
1596 md->recursive = &new_recursive;
1597 callpat += GET(callpat, 1);
1598 }
1599 while (*callpat == OP_ALT);
1600
1601 DPRINTF(("Recursion didn't match\n"));
1602 md->recursive = new_recursive.prevrec;
1603 if (new_recursive.offset_save != stacksave)
1604 (pcre_free)(new_recursive.offset_save);
1605 MRRETURN(MATCH_NOMATCH);
1606 }
1607
1608 RECURSION_MATCHED:
1609 break;
1610
1611 /* An alternation is the end of a branch; scan along to find the end of the
1612 bracketed group and go to there. */
1613
1614 case OP_ALT:
1615 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1616 break;
1617
1618 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1619 indicating that it may occur zero times. It may repeat infinitely, or not
1620 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1621 with fixed upper repeat limits are compiled as a number of copies, with the
1622 optional ones preceded by BRAZERO or BRAMINZERO. */
1623
1624 case OP_BRAZERO:
1625 next = ecode + 1;
1626 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1628 do next += GET(next, 1); while (*next == OP_ALT);
1629 ecode = next + 1 + LINK_SIZE;
1630 break;
1631
1632 case OP_BRAMINZERO:
1633 next = ecode + 1;
1634 do next += GET(next, 1); while (*next == OP_ALT);
1635 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1637 ecode++;
1638 break;
1639
1640 case OP_SKIPZERO:
1641 next = ecode+1;
1642 do next += GET(next,1); while (*next == OP_ALT);
1643 ecode = next + 1 + LINK_SIZE;
1644 break;
1645
1646 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1647 here; just jump to the group, with allow_zero set TRUE. */
1648
1649 case OP_BRAPOSZERO:
1650 op = *(++ecode);
1651 allow_zero = TRUE;
1652 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1653 goto POSSESSIVE_NON_CAPTURE;
1654
1655 /* End of a group, repeated or non-repeating. */
1656
1657 case OP_KET:
1658 case OP_KETRMIN:
1659 case OP_KETRMAX:
1660 case OP_KETRPOS:
1661 prev = ecode - GET(ecode, 1);
1662
1663 /* If this was a group that remembered the subject start, in order to break
1664 infinite repeats of empty string matches, retrieve the subject start from
1665 the chain. Otherwise, set it NULL. */
1666
1667 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1668 {
1669 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1670 eptrb = eptrb->epb_prev; /* Backup to previous group */
1671 }
1672 else saved_eptr = NULL;
1673
1674 /* If we are at the end of an assertion group, stop matching and return
1675 MATCH_MATCH, but record the current high water mark for use by positive
1676 assertions. We also need to record the match start in case it was changed
1677 by \K. */
1678
1679 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1680 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1681 {
1682 md->end_match_ptr = eptr; /* For ONCE */
1683 md->end_offset_top = offset_top;
1684 md->start_match_ptr = mstart;
1685 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1686 }
1687
1688 /* For capturing groups we have to check the group number back at the start
1689 and if necessary complete handling an extraction by setting the offsets and
1690 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1691 into group 0, so it won't be picked up here. Instead, we catch it when the
1692 OP_END is reached. Other recursion is handled here. We just have to record
1693 the current subject position and start match pointer and give a MATCH
1694 return. */
1695
1696 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1697 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1698 {
1699 number = GET2(prev, 1+LINK_SIZE);
1700 offset = number << 1;
1701
1702 #ifdef PCRE_DEBUG
1703 printf("end bracket %d", number);
1704 printf("\n");
1705 #endif
1706
1707 /* Handle a recursively called group. */
1708
1709 if (md->recursive != NULL && md->recursive->group_num == number)
1710 {
1711 md->end_match_ptr = eptr;
1712 md->start_match_ptr = mstart;
1713 RRETURN(MATCH_MATCH);
1714 }
1715
1716 /* Deal with capturing */
1717
1718 md->capture_last = number;
1719 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1720 {
1721 /* If offset is greater than offset_top, it means that we are
1722 "skipping" a capturing group, and that group's offsets must be marked
1723 unset. In earlier versions of PCRE, all the offsets were unset at the
1724 start of matching, but this doesn't work because atomic groups and
1725 assertions can cause a value to be set that should later be unset.
1726 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1727 part of the atomic group, but this is not on the final matching path,
1728 so must be unset when 2 is set. (If there is no group 2, there is no
1729 problem, because offset_top will then be 2, indicating no capture.) */
1730
1731 if (offset > offset_top)
1732 {
1733 register int *iptr = md->offset_vector + offset_top;
1734 register int *iend = md->offset_vector + offset;
1735 while (iptr < iend) *iptr++ = -1;
1736 }
1737
1738 /* Now make the extraction */
1739
1740 md->offset_vector[offset] =
1741 md->offset_vector[md->offset_end - number];
1742 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1743 if (offset_top <= offset) offset_top = offset + 2;
1744 }
1745 }
1746
1747 /* For an ordinary non-repeating ket, just continue at this level. This
1748 also happens for a repeating ket if no characters were matched in the
1749 group. This is the forcible breaking of infinite loops as implemented in
1750 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1751 processing the rest of the pattern at a lower level. If this results in a
1752 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1753 bypassing intermediate backup points, but resetting any captures that
1754 happened along the way. */
1755
1756 if (*ecode == OP_KET || eptr == saved_eptr)
1757 {
1758 if (*prev == OP_ONCE)
1759 {
1760 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1761 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1762 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1763 RRETURN(MATCH_ONCE);
1764 }
1765 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1766 break;
1767 }
1768
1769 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1770 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1771 at a time from the outer level, thus saving stack. */
1772
1773 if (*ecode == OP_KETRPOS)
1774 {
1775 md->end_match_ptr = eptr;
1776 md->end_offset_top = offset_top;
1777 RRETURN(MATCH_KETRPOS);
1778 }
1779
1780 /* The normal repeating kets try the rest of the pattern or restart from
1781 the preceding bracket, in the appropriate order. In the second case, we can
1782 use tail recursion to avoid using another stack frame, unless we have an
1783 atomic group or an unlimited repeat of a group that can match an empty
1784 string. */
1785
1786 if (*ecode == OP_KETRMIN)
1787 {
1788 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1789 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1790 if (*prev == OP_ONCE)
1791 {
1792 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1794 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1795 RRETURN(MATCH_ONCE);
1796 }
1797 if (*prev >= OP_SBRA) /* Could match an empty string */
1798 {
1799 md->match_function_type = MATCH_CBEGROUP;
1800 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1801 RRETURN(rrc);
1802 }
1803 ecode = prev;
1804 goto TAIL_RECURSE;
1805 }
1806 else /* OP_KETRMAX */
1807 {
1808 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1809 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1810 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1812 if (*prev == OP_ONCE)
1813 {
1814 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1816 md->once_target = prev;
1817 RRETURN(MATCH_ONCE);
1818 }
1819 ecode += 1 + LINK_SIZE;
1820 goto TAIL_RECURSE;
1821 }
1822 /* Control never gets here */
1823
1824 /* Not multiline mode: start of subject assertion, unless notbol. */
1825
1826 case OP_CIRC:
1827 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1828
1829 /* Start of subject assertion */
1830
1831 case OP_SOD:
1832 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1833 ecode++;
1834 break;
1835
1836 /* Multiline mode: start of subject unless notbol, or after any newline. */
1837
1838 case OP_CIRCM:
1839 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1840 if (eptr != md->start_subject &&
1841 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1842 MRRETURN(MATCH_NOMATCH);
1843 ecode++;
1844 break;
1845
1846 /* Start of match assertion */
1847
1848 case OP_SOM:
1849 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1850 ecode++;
1851 break;
1852
1853 /* Reset the start of match point */
1854
1855 case OP_SET_SOM:
1856 mstart = eptr;
1857 ecode++;
1858 break;
1859
1860 /* Multiline mode: assert before any newline, or before end of subject
1861 unless noteol is set. */
1862
1863 case OP_DOLLM:
1864 if (eptr < md->end_subject)
1865 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1866 else
1867 {
1868 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1869 SCHECK_PARTIAL();
1870 }
1871 ecode++;
1872 break;
1873
1874 /* Not multiline mode: assert before a terminating newline or before end of
1875 subject unless noteol is set. */
1876
1877 case OP_DOLL:
1878 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1879 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1880
1881 /* ... else fall through for endonly */
1882
1883 /* End of subject assertion (\z) */
1884
1885 case OP_EOD:
1886 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1887 SCHECK_PARTIAL();
1888 ecode++;
1889 break;
1890
1891 /* End of subject or ending \n assertion (\Z) */
1892
1893 case OP_EODN:
1894 ASSERT_NL_OR_EOS:
1895 if (eptr < md->end_subject &&
1896 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1897 MRRETURN(MATCH_NOMATCH);
1898
1899 /* Either at end of string or \n before end. */
1900
1901 SCHECK_PARTIAL();
1902 ecode++;
1903 break;
1904
1905 /* Word boundary assertions */
1906
1907 case OP_NOT_WORD_BOUNDARY:
1908 case OP_WORD_BOUNDARY:
1909 {
1910
1911 /* Find out if the previous and current characters are "word" characters.
1912 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1913 be "non-word" characters. Remember the earliest consulted character for
1914 partial matching. */
1915
1916 #ifdef SUPPORT_UTF8
1917 if (utf8)
1918 {
1919 /* Get status of previous character */
1920
1921 if (eptr == md->start_subject) prev_is_word = FALSE; else
1922 {
1923 USPTR lastptr = eptr - 1;
1924 while((*lastptr & 0xc0) == 0x80) lastptr--;
1925 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1926 GETCHAR(c, lastptr);
1927 #ifdef SUPPORT_UCP
1928 if (md->use_ucp)
1929 {
1930 if (c == '_') prev_is_word = TRUE; else
1931 {
1932 int cat = UCD_CATEGORY(c);
1933 prev_is_word = (cat == ucp_L || cat == ucp_N);
1934 }
1935 }
1936 else
1937 #endif
1938 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1939 }
1940
1941 /* Get status of next character */
1942
1943 if (eptr >= md->end_subject)
1944 {
1945 SCHECK_PARTIAL();
1946 cur_is_word = FALSE;
1947 }
1948 else
1949 {
1950 GETCHAR(c, eptr);
1951 #ifdef SUPPORT_UCP
1952 if (md->use_ucp)
1953 {
1954 if (c == '_') cur_is_word = TRUE; else
1955 {
1956 int cat = UCD_CATEGORY(c);
1957 cur_is_word = (cat == ucp_L || cat == ucp_N);
1958 }
1959 }
1960 else
1961 #endif
1962 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1963 }
1964 }
1965 else
1966 #endif
1967
1968 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1969 consistency with the behaviour of \w we do use it in this case. */
1970
1971 {
1972 /* Get status of previous character */
1973
1974 if (eptr == md->start_subject) prev_is_word = FALSE; else
1975 {
1976 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1977 #ifdef SUPPORT_UCP
1978 if (md->use_ucp)
1979 {
1980 c = eptr[-1];
1981 if (c == '_') prev_is_word = TRUE; else
1982 {
1983 int cat = UCD_CATEGORY(c);
1984 prev_is_word = (cat == ucp_L || cat == ucp_N);
1985 }
1986 }
1987 else
1988 #endif
1989 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1990 }
1991
1992 /* Get status of next character */
1993
1994 if (eptr >= md->end_subject)
1995 {
1996 SCHECK_PARTIAL();
1997 cur_is_word = FALSE;
1998 }
1999 else
2000 #ifdef SUPPORT_UCP
2001 if (md->use_ucp)
2002 {
2003 c = *eptr;
2004 if (c == '_') cur_is_word = TRUE; else
2005 {
2006 int cat = UCD_CATEGORY(c);
2007 cur_is_word = (cat == ucp_L || cat == ucp_N);
2008 }
2009 }
2010 else
2011 #endif
2012 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2013 }
2014
2015 /* Now see if the situation is what we want */
2016
2017 if ((*ecode++ == OP_WORD_BOUNDARY)?
2018 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2019 MRRETURN(MATCH_NOMATCH);
2020 }
2021 break;
2022
2023 /* Match a single character type; inline for speed */
2024
2025 case OP_ANY:
2026 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2027 /* Fall through */
2028
2029 case OP_ALLANY:
2030 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2031 { /* not be updated before SCHECK_PARTIAL. */
2032 SCHECK_PARTIAL();
2033 MRRETURN(MATCH_NOMATCH);
2034 }
2035 eptr++;
2036 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2037 ecode++;
2038 break;
2039
2040 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2041 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2042
2043 case OP_ANYBYTE:
2044 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2045 { /* not be updated before SCHECK_PARTIAL. */
2046 SCHECK_PARTIAL();
2047 MRRETURN(MATCH_NOMATCH);
2048 }
2049 eptr++;
2050 ecode++;
2051 break;
2052
2053 case OP_NOT_DIGIT:
2054 if (eptr >= md->end_subject)
2055 {
2056 SCHECK_PARTIAL();
2057 MRRETURN(MATCH_NOMATCH);
2058 }
2059 GETCHARINCTEST(c, eptr);
2060 if (
2061 #ifdef SUPPORT_UTF8
2062 c < 256 &&
2063 #endif
2064 (md->ctypes[c] & ctype_digit) != 0
2065 )
2066 MRRETURN(MATCH_NOMATCH);
2067 ecode++;
2068 break;
2069
2070 case OP_DIGIT:
2071 if (eptr >= md->end_subject)
2072 {
2073 SCHECK_PARTIAL();
2074 MRRETURN(MATCH_NOMATCH);
2075 }
2076 GETCHARINCTEST(c, eptr);
2077 if (
2078 #ifdef SUPPORT_UTF8
2079 c >= 256 ||
2080 #endif
2081 (md->ctypes[c] & ctype_digit) == 0
2082 )
2083 MRRETURN(MATCH_NOMATCH);
2084 ecode++;
2085 break;
2086
2087 case OP_NOT_WHITESPACE:
2088 if (eptr >= md->end_subject)
2089 {
2090 SCHECK_PARTIAL();
2091 MRRETURN(MATCH_NOMATCH);
2092 }
2093 GETCHARINCTEST(c, eptr);
2094 if (
2095 #ifdef SUPPORT_UTF8
2096 c < 256 &&
2097 #endif
2098 (md->ctypes[c] & ctype_space) != 0
2099 )
2100 MRRETURN(MATCH_NOMATCH);
2101 ecode++;
2102 break;
2103
2104 case OP_WHITESPACE:
2105 if (eptr >= md->end_subject)
2106 {
2107 SCHECK_PARTIAL();
2108 MRRETURN(MATCH_NOMATCH);
2109 }
2110 GETCHARINCTEST(c, eptr);
2111 if (
2112 #ifdef SUPPORT_UTF8
2113 c >= 256 ||
2114 #endif
2115 (md->ctypes[c] & ctype_space) == 0
2116 )
2117 MRRETURN(MATCH_NOMATCH);
2118 ecode++;
2119 break;
2120
2121 case OP_NOT_WORDCHAR:
2122 if (eptr >= md->end_subject)
2123 {
2124 SCHECK_PARTIAL();
2125 MRRETURN(MATCH_NOMATCH);
2126 }
2127 GETCHARINCTEST(c, eptr);
2128 if (
2129 #ifdef SUPPORT_UTF8
2130 c < 256 &&
2131 #endif
2132 (md->ctypes[c] & ctype_word) != 0
2133 )
2134 MRRETURN(MATCH_NOMATCH);
2135 ecode++;
2136 break;
2137
2138 case OP_WORDCHAR:
2139 if (eptr >= md->end_subject)
2140 {
2141 SCHECK_PARTIAL();
2142 MRRETURN(MATCH_NOMATCH);
2143 }
2144 GETCHARINCTEST(c, eptr);
2145 if (
2146 #ifdef SUPPORT_UTF8
2147 c >= 256 ||
2148 #endif
2149 (md->ctypes[c] & ctype_word) == 0
2150 )
2151 MRRETURN(MATCH_NOMATCH);
2152 ecode++;
2153 break;
2154
2155 case OP_ANYNL:
2156 if (eptr >= md->end_subject)
2157 {
2158 SCHECK_PARTIAL();
2159 MRRETURN(MATCH_NOMATCH);
2160 }
2161 GETCHARINCTEST(c, eptr);
2162 switch(c)
2163 {
2164 default: MRRETURN(MATCH_NOMATCH);
2165
2166 case 0x000d:
2167 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2168 break;
2169
2170 case 0x000a:
2171 break;
2172
2173 case 0x000b:
2174 case 0x000c:
2175 case 0x0085:
2176 case 0x2028:
2177 case 0x2029:
2178 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2179 break;
2180 }
2181 ecode++;
2182 break;
2183
2184 case OP_NOT_HSPACE:
2185 if (eptr >= md->end_subject)
2186 {
2187 SCHECK_PARTIAL();
2188 MRRETURN(MATCH_NOMATCH);
2189 }
2190 GETCHARINCTEST(c, eptr);
2191 switch(c)
2192 {
2193 default: break;
2194 case 0x09: /* HT */
2195 case 0x20: /* SPACE */
2196 case 0xa0: /* NBSP */
2197 case 0x1680: /* OGHAM SPACE MARK */
2198 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2199 case 0x2000: /* EN QUAD */
2200 case 0x2001: /* EM QUAD */
2201 case 0x2002: /* EN SPACE */
2202 case 0x2003: /* EM SPACE */
2203 case 0x2004: /* THREE-PER-EM SPACE */
2204 case 0x2005: /* FOUR-PER-EM SPACE */
2205 case 0x2006: /* SIX-PER-EM SPACE */
2206 case 0x2007: /* FIGURE SPACE */
2207 case 0x2008: /* PUNCTUATION SPACE */
2208 case 0x2009: /* THIN SPACE */
2209 case 0x200A: /* HAIR SPACE */
2210 case 0x202f: /* NARROW NO-BREAK SPACE */
2211 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2212 case 0x3000: /* IDEOGRAPHIC SPACE */
2213 MRRETURN(MATCH_NOMATCH);
2214 }
2215 ecode++;
2216 break;
2217
2218 case OP_HSPACE:
2219 if (eptr >= md->end_subject)
2220 {
2221 SCHECK_PARTIAL();
2222 MRRETURN(MATCH_NOMATCH);
2223 }
2224 GETCHARINCTEST(c, eptr);
2225 switch(c)
2226 {
2227 default: MRRETURN(MATCH_NOMATCH);
2228 case 0x09: /* HT */
2229 case 0x20: /* SPACE */
2230 case 0xa0: /* NBSP */
2231 case 0x1680: /* OGHAM SPACE MARK */
2232 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2233 case 0x2000: /* EN QUAD */
2234 case 0x2001: /* EM QUAD */
2235 case 0x2002: /* EN SPACE */
2236 case 0x2003: /* EM SPACE */
2237 case 0x2004: /* THREE-PER-EM SPACE */
2238 case 0x2005: /* FOUR-PER-EM SPACE */
2239 case 0x2006: /* SIX-PER-EM SPACE */
2240 case 0x2007: /* FIGURE SPACE */
2241 case 0x2008: /* PUNCTUATION SPACE */
2242 case 0x2009: /* THIN SPACE */
2243 case 0x200A: /* HAIR SPACE */
2244 case 0x202f: /* NARROW NO-BREAK SPACE */
2245 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2246 case 0x3000: /* IDEOGRAPHIC SPACE */
2247 break;
2248 }
2249 ecode++;
2250 break;
2251
2252 case OP_NOT_VSPACE:
2253 if (eptr >= md->end_subject)
2254 {
2255 SCHECK_PARTIAL();
2256 MRRETURN(MATCH_NOMATCH);
2257 }
2258 GETCHARINCTEST(c, eptr);
2259 switch(c)
2260 {
2261 default: break;
2262 case 0x0a: /* LF */
2263 case 0x0b: /* VT */
2264 case 0x0c: /* FF */
2265 case 0x0d: /* CR */
2266 case 0x85: /* NEL */
2267 case 0x2028: /* LINE SEPARATOR */
2268 case 0x2029: /* PARAGRAPH SEPARATOR */
2269 MRRETURN(MATCH_NOMATCH);
2270 }
2271 ecode++;
2272 break;
2273
2274 case OP_VSPACE:
2275 if (eptr >= md->end_subject)
2276 {
2277 SCHECK_PARTIAL();
2278 MRRETURN(MATCH_NOMATCH);
2279 }
2280 GETCHARINCTEST(c, eptr);
2281 switch(c)
2282 {
2283 default: MRRETURN(MATCH_NOMATCH);
2284 case 0x0a: /* LF */
2285 case 0x0b: /* VT */
2286 case 0x0c: /* FF */
2287 case 0x0d: /* CR */
2288 case 0x85: /* NEL */
2289 case 0x2028: /* LINE SEPARATOR */
2290 case 0x2029: /* PARAGRAPH SEPARATOR */
2291 break;
2292 }
2293 ecode++;
2294 break;
2295
2296 #ifdef SUPPORT_UCP
2297 /* Check the next character by Unicode property. We will get here only
2298 if the support is in the binary; otherwise a compile-time error occurs. */
2299
2300 case OP_PROP:
2301 case OP_NOTPROP:
2302 if (eptr >= md->end_subject)
2303 {
2304 SCHECK_PARTIAL();
2305 MRRETURN(MATCH_NOMATCH);
2306 }
2307 GETCHARINCTEST(c, eptr);
2308 {
2309 const ucd_record *prop = GET_UCD(c);
2310
2311 switch(ecode[1])
2312 {
2313 case PT_ANY:
2314 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2315 break;
2316
2317 case PT_LAMP:
2318 if ((prop->chartype == ucp_Lu ||
2319 prop->chartype == ucp_Ll ||
2320 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2321 MRRETURN(MATCH_NOMATCH);
2322 break;
2323
2324 case PT_GC:
2325 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2326 MRRETURN(MATCH_NOMATCH);
2327 break;
2328
2329 case PT_PC:
2330 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2331 MRRETURN(MATCH_NOMATCH);
2332 break;
2333
2334 case PT_SC:
2335 if ((ecode[2] != prop->script) == (op == OP_PROP))
2336 MRRETURN(MATCH_NOMATCH);
2337 break;
2338
2339 /* These are specials */
2340
2341 case PT_ALNUM:
2342 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2343 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2344 MRRETURN(MATCH_NOMATCH);
2345 break;
2346
2347 case PT_SPACE: /* Perl space */
2348 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2349 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2350 == (op == OP_NOTPROP))
2351 MRRETURN(MATCH_NOMATCH);
2352 break;
2353
2354 case PT_PXSPACE: /* POSIX space */
2355 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2356 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2357 c == CHAR_FF || c == CHAR_CR)
2358 == (op == OP_NOTPROP))
2359 MRRETURN(MATCH_NOMATCH);
2360 break;
2361
2362 case PT_WORD:
2363 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2364 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2365 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2366 MRRETURN(MATCH_NOMATCH);
2367 break;
2368
2369 /* This should never occur */
2370
2371 default:
2372 RRETURN(PCRE_ERROR_INTERNAL);
2373 }
2374
2375 ecode += 3;
2376 }
2377 break;
2378
2379 /* Match an extended Unicode sequence. We will get here only if the support
2380 is in the binary; otherwise a compile-time error occurs. */
2381
2382 case OP_EXTUNI:
2383 if (eptr >= md->end_subject)
2384 {
2385 SCHECK_PARTIAL();
2386 MRRETURN(MATCH_NOMATCH);
2387 }
2388 GETCHARINCTEST(c, eptr);
2389 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2390 while (eptr < md->end_subject)
2391 {
2392 int len = 1;
2393 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2394 if (UCD_CATEGORY(c) != ucp_M) break;
2395 eptr += len;
2396 }
2397 ecode++;
2398 break;
2399 #endif
2400
2401
2402 /* Match a back reference, possibly repeatedly. Look past the end of the
2403 item to see if there is repeat information following. The code is similar
2404 to that for character classes, but repeated for efficiency. Then obey
2405 similar code to character type repeats - written out again for speed.
2406 However, if the referenced string is the empty string, always treat
2407 it as matched, any number of times (otherwise there could be infinite
2408 loops). */
2409
2410 case OP_REF:
2411 case OP_REFI:
2412 caseless = op == OP_REFI;
2413 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2414 ecode += 3;
2415
2416 /* If the reference is unset, there are two possibilities:
2417
2418 (a) In the default, Perl-compatible state, set the length negative;
2419 this ensures that every attempt at a match fails. We can't just fail
2420 here, because of the possibility of quantifiers with zero minima.
2421
2422 (b) If the JavaScript compatibility flag is set, set the length to zero
2423 so that the back reference matches an empty string.
2424
2425 Otherwise, set the length to the length of what was matched by the
2426 referenced subpattern. */
2427
2428 if (offset >= offset_top || md->offset_vector[offset] < 0)
2429 length = (md->jscript_compat)? 0 : -1;
2430 else
2431 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2432
2433 /* Set up for repetition, or handle the non-repeated case */
2434
2435 switch (*ecode)
2436 {
2437 case OP_CRSTAR:
2438 case OP_CRMINSTAR:
2439 case OP_CRPLUS:
2440 case OP_CRMINPLUS:
2441 case OP_CRQUERY:
2442 case OP_CRMINQUERY:
2443 c = *ecode++ - OP_CRSTAR;
2444 minimize = (c & 1) != 0;
2445 min = rep_min[c]; /* Pick up values from tables; */
2446 max = rep_max[c]; /* zero for max => infinity */
2447 if (max == 0) max = INT_MAX;
2448 break;
2449
2450 case OP_CRRANGE:
2451 case OP_CRMINRANGE:
2452 minimize = (*ecode == OP_CRMINRANGE);
2453 min = GET2(ecode, 1);
2454 max = GET2(ecode, 3);
2455 if (max == 0) max = INT_MAX;
2456 ecode += 5;
2457 break;
2458
2459 default: /* No repeat follows */
2460 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2461 {
2462 CHECK_PARTIAL();
2463 MRRETURN(MATCH_NOMATCH);
2464 }
2465 eptr += length;
2466 continue; /* With the main loop */
2467 }
2468
2469 /* Handle repeated back references. If the length of the reference is
2470 zero, just continue with the main loop. */
2471
2472 if (length == 0) continue;
2473
2474 /* First, ensure the minimum number of matches are present. We get back
2475 the length of the reference string explicitly rather than passing the
2476 address of eptr, so that eptr can be a register variable. */
2477
2478 for (i = 1; i <= min; i++)
2479 {
2480 int slength;
2481 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2482 {
2483 CHECK_PARTIAL();
2484 MRRETURN(MATCH_NOMATCH);
2485 }
2486 eptr += slength;
2487 }
2488
2489 /* If min = max, continue at the same level without recursion.
2490 They are not both allowed to be zero. */
2491
2492 if (min == max) continue;
2493
2494 /* If minimizing, keep trying and advancing the pointer */
2495
2496 if (minimize)
2497 {
2498 for (fi = min;; fi++)
2499 {
2500 int slength;
2501 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2502 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2503 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2504 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2505 {
2506 CHECK_PARTIAL();
2507 MRRETURN(MATCH_NOMATCH);
2508 }
2509 eptr += slength;
2510 }
2511 /* Control never gets here */
2512 }
2513
2514 /* If maximizing, find the longest string and work backwards */
2515
2516 else
2517 {
2518 pp = eptr;
2519 for (i = min; i < max; i++)
2520 {
2521 int slength;
2522 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2523 {
2524 CHECK_PARTIAL();
2525 break;
2526 }
2527 eptr += slength;
2528 }
2529 while (eptr >= pp)
2530 {
2531 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2532 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2533 eptr -= length;
2534 }
2535 MRRETURN(MATCH_NOMATCH);
2536 }
2537 /* Control never gets here */
2538
2539 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2540 used when all the characters in the class have values in the range 0-255,
2541 and either the matching is caseful, or the characters are in the range
2542 0-127 when UTF-8 processing is enabled. The only difference between
2543 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2544 encountered.
2545
2546 First, look past the end of the item to see if there is repeat information
2547 following. Then obey similar code to character type repeats - written out
2548 again for speed. */
2549
2550 case OP_NCLASS:
2551 case OP_CLASS:
2552 {
2553 data = ecode + 1; /* Save for matching */
2554 ecode += 33; /* Advance past the item */
2555
2556 switch (*ecode)
2557 {
2558 case OP_CRSTAR:
2559 case OP_CRMINSTAR:
2560 case OP_CRPLUS:
2561 case OP_CRMINPLUS:
2562 case OP_CRQUERY:
2563 case OP_CRMINQUERY:
2564 c = *ecode++ - OP_CRSTAR;
2565 minimize = (c & 1) != 0;
2566 min = rep_min[c]; /* Pick up values from tables; */
2567 max = rep_max[c]; /* zero for max => infinity */
2568 if (max == 0) max = INT_MAX;
2569 break;
2570
2571 case OP_CRRANGE:
2572 case OP_CRMINRANGE:
2573 minimize = (*ecode == OP_CRMINRANGE);
2574 min = GET2(ecode, 1);
2575 max = GET2(ecode, 3);
2576 if (max == 0) max = INT_MAX;
2577 ecode += 5;
2578 break;
2579
2580 default: /* No repeat follows */
2581 min = max = 1;
2582 break;
2583 }
2584
2585 /* First, ensure the minimum number of matches are present. */
2586
2587 #ifdef SUPPORT_UTF8
2588 /* UTF-8 mode */
2589 if (utf8)
2590 {
2591 for (i = 1; i <= min; i++)
2592 {
2593 if (eptr >= md->end_subject)
2594 {
2595 SCHECK_PARTIAL();
2596 MRRETURN(MATCH_NOMATCH);
2597 }
2598 GETCHARINC(c, eptr);
2599 if (c > 255)
2600 {
2601 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2602 }
2603 else
2604 {
2605 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2606 }
2607 }
2608 }
2609 else
2610 #endif
2611 /* Not UTF-8 mode */
2612 {
2613 for (i = 1; i <= min; i++)
2614 {
2615 if (eptr >= md->end_subject)
2616 {
2617 SCHECK_PARTIAL();
2618 MRRETURN(MATCH_NOMATCH);
2619 }
2620 c = *eptr++;
2621 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2622 }
2623 }
2624
2625 /* If max == min we can continue with the main loop without the
2626 need to recurse. */
2627
2628 if (min == max) continue;
2629
2630 /* If minimizing, keep testing the rest of the expression and advancing
2631 the pointer while it matches the class. */
2632
2633 if (minimize)
2634 {
2635 #ifdef SUPPORT_UTF8
2636 /* UTF-8 mode */
2637 if (utf8)
2638 {
2639 for (fi = min;; fi++)
2640 {
2641 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2642 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2643 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2644 if (eptr >= md->end_subject)
2645 {
2646 SCHECK_PARTIAL();
2647 MRRETURN(MATCH_NOMATCH);
2648 }
2649 GETCHARINC(c, eptr);
2650 if (c > 255)
2651 {
2652 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2653 }
2654 else
2655 {
2656 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2657 }
2658 }
2659 }
2660 else
2661 #endif
2662 /* Not UTF-8 mode */
2663 {
2664 for (fi = min;; fi++)
2665 {
2666 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2668 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2669 if (eptr >= md->end_subject)
2670 {
2671 SCHECK_PARTIAL();
2672 MRRETURN(MATCH_NOMATCH);
2673 }
2674 c = *eptr++;
2675 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2676 }
2677 }
2678 /* Control never gets here */
2679 }
2680
2681 /* If maximizing, find the longest possible run, then work backwards. */
2682
2683 else
2684 {
2685 pp = eptr;
2686
2687 #ifdef SUPPORT_UTF8
2688 /* UTF-8 mode */
2689 if (utf8)
2690 {
2691 for (i = min; i < max; i++)
2692 {
2693 int len = 1;
2694 if (eptr >= md->end_subject)
2695 {
2696 SCHECK_PARTIAL();
2697 break;
2698 }
2699 GETCHARLEN(c, eptr, len);
2700 if (c > 255)
2701 {
2702 if (op == OP_CLASS) break;
2703 }
2704 else
2705 {
2706 if ((data[c/8] & (1 << (c&7))) == 0) break;
2707 }
2708 eptr += len;
2709 }
2710 for (;;)
2711 {
2712 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2714 if (eptr-- == pp) break; /* Stop if tried at original pos */
2715 BACKCHAR(eptr);
2716 }
2717 }
2718 else
2719 #endif
2720 /* Not UTF-8 mode */
2721 {
2722 for (i = min; i < max; i++)
2723 {
2724 if (eptr >= md->end_subject)
2725 {
2726 SCHECK_PARTIAL();
2727 break;
2728 }
2729 c = *eptr;
2730 if ((data[c/8] & (1 << (c&7))) == 0) break;
2731 eptr++;
2732 }
2733 while (eptr >= pp)
2734 {
2735 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2736 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2737 eptr--;
2738 }
2739 }
2740
2741 MRRETURN(MATCH_NOMATCH);
2742 }
2743 }
2744 /* Control never gets here */
2745
2746
2747 /* Match an extended character class. This opcode is encountered only
2748 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2749 mode, because Unicode properties are supported in non-UTF-8 mode. */
2750
2751 #ifdef SUPPORT_UTF8
2752 case OP_XCLASS:
2753 {
2754 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2755 ecode += GET(ecode, 1); /* Advance past the item */
2756
2757 switch (*ecode)
2758 {
2759 case OP_CRSTAR:
2760 case OP_CRMINSTAR:
2761 case OP_CRPLUS:
2762 case OP_CRMINPLUS:
2763 case OP_CRQUERY:
2764 case OP_CRMINQUERY:
2765 c = *ecode++ - OP_CRSTAR;
2766 minimize = (c & 1) != 0;
2767 min = rep_min[c]; /* Pick up values from tables; */
2768 max = rep_max[c]; /* zero for max => infinity */
2769 if (max == 0) max = INT_MAX;
2770 break;
2771
2772 case OP_CRRANGE:
2773 case OP_CRMINRANGE:
2774 minimize = (*ecode == OP_CRMINRANGE);
2775 min = GET2(ecode, 1);
2776 max = GET2(ecode, 3);
2777 if (max == 0) max = INT_MAX;
2778 ecode += 5;
2779 break;
2780
2781 default: /* No repeat follows */
2782 min = max = 1;
2783 break;
2784 }
2785
2786 /* First, ensure the minimum number of matches are present. */
2787
2788 for (i = 1; i <= min; i++)
2789 {
2790 if (eptr >= md->end_subject)
2791 {
2792 SCHECK_PARTIAL();
2793 MRRETURN(MATCH_NOMATCH);
2794 }
2795 GETCHARINCTEST(c, eptr);
2796 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2797 }
2798
2799 /* If max == min we can continue with the main loop without the
2800 need to recurse. */
2801
2802 if (min == max) continue;
2803
2804 /* If minimizing, keep testing the rest of the expression and advancing
2805 the pointer while it matches the class. */
2806
2807 if (minimize)
2808 {
2809 for (fi = min;; fi++)
2810 {
2811 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2813 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2814 if (eptr >= md->end_subject)
2815 {
2816 SCHECK_PARTIAL();
2817 MRRETURN(MATCH_NOMATCH);
2818 }
2819 GETCHARINCTEST(c, eptr);
2820 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2821 }
2822 /* Control never gets here */
2823 }
2824
2825 /* If maximizing, find the longest possible run, then work backwards. */
2826
2827 else
2828 {
2829 pp = eptr;
2830 for (i = min; i < max; i++)
2831 {
2832 int len = 1;
2833 if (eptr >= md->end_subject)
2834 {
2835 SCHECK_PARTIAL();
2836 break;
2837 }
2838 GETCHARLENTEST(c, eptr, len);
2839 if (!_pcre_xclass(c, data)) break;
2840 eptr += len;
2841 }
2842 for(;;)
2843 {
2844 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2846 if (eptr-- == pp) break; /* Stop if tried at original pos */
2847 if (utf8) BACKCHAR(eptr);
2848 }
2849 MRRETURN(MATCH_NOMATCH);
2850 }
2851
2852 /* Control never gets here */
2853 }
2854 #endif /* End of XCLASS */
2855
2856 /* Match a single character, casefully */
2857
2858 case OP_CHAR:
2859 #ifdef SUPPORT_UTF8
2860 if (utf8)
2861 {
2862 length = 1;
2863 ecode++;
2864 GETCHARLEN(fc, ecode, length);
2865 if (length > md->end_subject - eptr)
2866 {
2867 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2868 MRRETURN(MATCH_NOMATCH);
2869 }
2870 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2871 }
2872 else
2873 #endif
2874
2875 /* Non-UTF-8 mode */
2876 {
2877 if (md->end_subject - eptr < 1)
2878 {
2879 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2880 MRRETURN(MATCH_NOMATCH);
2881 }
2882 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2883 ecode += 2;
2884 }
2885 break;
2886
2887 /* Match a single character, caselessly */
2888
2889 case OP_CHARI:
2890 #ifdef SUPPORT_UTF8
2891 if (utf8)
2892 {
2893 length = 1;
2894 ecode++;
2895 GETCHARLEN(fc, ecode, length);
2896
2897 if (length > md->end_subject - eptr)
2898 {
2899 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2900 MRRETURN(MATCH_NOMATCH);
2901 }
2902
2903 /* If the pattern character's value is < 128, we have only one byte, and
2904 can use the fast lookup table. */
2905
2906 if (fc < 128)
2907 {
2908 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2909 }
2910
2911 /* Otherwise we must pick up the subject character */
2912
2913 else
2914 {
2915 unsigned int dc;
2916 GETCHARINC(dc, eptr);
2917 ecode += length;
2918
2919 /* If we have Unicode property support, we can use it to test the other
2920 case of the character, if there is one. */
2921
2922 if (fc != dc)
2923 {
2924 #ifdef SUPPORT_UCP
2925 if (dc != UCD_OTHERCASE(fc))
2926 #endif
2927 MRRETURN(MATCH_NOMATCH);
2928 }
2929 }
2930 }
2931 else
2932 #endif /* SUPPORT_UTF8 */
2933
2934 /* Non-UTF-8 mode */
2935 {
2936 if (md->end_subject - eptr < 1)
2937 {
2938 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2939 MRRETURN(MATCH_NOMATCH);
2940 }
2941 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2942 ecode += 2;
2943 }
2944 break;
2945
2946 /* Match a single character repeatedly. */
2947
2948 case OP_EXACT:
2949 case OP_EXACTI:
2950 min = max = GET2(ecode, 1);
2951 ecode += 3;
2952 goto REPEATCHAR;
2953
2954 case OP_POSUPTO:
2955 case OP_POSUPTOI:
2956 possessive = TRUE;
2957 /* Fall through */
2958
2959 case OP_UPTO:
2960 case OP_UPTOI:
2961 case OP_MINUPTO:
2962 case OP_MINUPTOI:
2963 min = 0;
2964 max = GET2(ecode, 1);
2965 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2966 ecode += 3;
2967 goto REPEATCHAR;
2968
2969 case OP_POSSTAR:
2970 case OP_POSSTARI:
2971 possessive = TRUE;
2972 min = 0;
2973 max = INT_MAX;
2974 ecode++;
2975 goto REPEATCHAR;
2976
2977 case OP_POSPLUS:
2978 case OP_POSPLUSI:
2979 possessive = TRUE;
2980 min = 1;
2981 max = INT_MAX;
2982 ecode++;
2983 goto REPEATCHAR;
2984
2985 case OP_POSQUERY:
2986 case OP_POSQUERYI:
2987 possessive = TRUE;
2988 min = 0;
2989 max = 1;
2990 ecode++;
2991 goto REPEATCHAR;
2992
2993 case OP_STAR:
2994 case OP_STARI:
2995 case OP_MINSTAR:
2996 case OP_MINSTARI:
2997 case OP_PLUS:
2998 case OP_PLUSI:
2999 case OP_MINPLUS:
3000 case OP_MINPLUSI:
3001 case OP_QUERY:
3002 case OP_QUERYI:
3003 case OP_MINQUERY:
3004 case OP_MINQUERYI:
3005 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3006 minimize = (c & 1) != 0;
3007 min = rep_min[c]; /* Pick up values from tables; */
3008 max = rep_max[c]; /* zero for max => infinity */
3009 if (max == 0) max = INT_MAX;
3010
3011 /* Common code for all repeated single-character matches. */
3012
3013 REPEATCHAR:
3014 #ifdef SUPPORT_UTF8
3015 if (utf8)
3016 {
3017 length = 1;
3018 charptr = ecode;
3019 GETCHARLEN(fc, ecode, length);
3020 ecode += length;
3021
3022 /* Handle multibyte character matching specially here. There is
3023 support for caseless matching if UCP support is present. */
3024
3025 if (length > 1)
3026 {
3027 #ifdef SUPPORT_UCP
3028 unsigned int othercase;
3029 if (op >= OP_STARI && /* Caseless */
3030 (othercase = UCD_OTHERCASE(fc)) != fc)
3031 oclength = _pcre_ord2utf8(othercase, occhars);
3032 else oclength = 0;
3033 #endif /* SUPPORT_UCP */
3034
3035 for (i = 1; i <= min; i++)
3036 {
3037 if (eptr <= md->end_subject - length &&
3038 memcmp(eptr, charptr, length) == 0) eptr += length;
3039 #ifdef SUPPORT_UCP
3040 else if (oclength > 0 &&
3041 eptr <= md->end_subject - oclength &&
3042 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3043 #endif /* SUPPORT_UCP */
3044 else
3045 {
3046 CHECK_PARTIAL();
3047 MRRETURN(MATCH_NOMATCH);
3048 }
3049 }
3050
3051 if (min == max) continue;
3052
3053 if (minimize)
3054 {
3055 for (fi = min;; fi++)
3056 {
3057 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3059 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3060 if (eptr <= md->end_subject - length &&
3061 memcmp(eptr, charptr, length) == 0) eptr += length;
3062 #ifdef SUPPORT_UCP
3063 else if (oclength > 0 &&
3064 eptr <= md->end_subject - oclength &&
3065 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3066 #endif /* SUPPORT_UCP */
3067 else
3068 {
3069 CHECK_PARTIAL();
3070 MRRETURN(MATCH_NOMATCH);
3071 }
3072 }
3073 /* Control never gets here */
3074 }
3075
3076 else /* Maximize */
3077 {
3078 pp = eptr;
3079 for (i = min; i < max; i++)
3080 {
3081 if (eptr <= md->end_subject - length &&
3082 memcmp(eptr, charptr, length) == 0) eptr += length;
3083 #ifdef SUPPORT_UCP
3084 else if (oclength > 0 &&
3085 eptr <= md->end_subject - oclength &&
3086 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3087 #endif /* SUPPORT_UCP */
3088 else
3089 {
3090 CHECK_PARTIAL();
3091 break;
3092 }
3093 }
3094
3095 if (possessive) continue;
3096
3097 for(;;)
3098 {
3099 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3100 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3101 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3102 #ifdef SUPPORT_UCP
3103 eptr--;
3104 BACKCHAR(eptr);
3105 #else /* without SUPPORT_UCP */
3106 eptr -= length;
3107 #endif /* SUPPORT_UCP */
3108 }
3109 }
3110 /* Control never gets here */
3111 }
3112
3113 /* If the length of a UTF-8 character is 1, we fall through here, and
3114 obey the code as for non-UTF-8 characters below, though in this case the
3115 value of fc will always be < 128. */
3116 }
3117 else
3118 #endif /* SUPPORT_UTF8 */
3119
3120 /* When not in UTF-8 mode, load a single-byte character. */
3121
3122 fc = *ecode++;
3123
3124 /* The value of fc at this point is always less than 256, though we may or
3125 may not be in UTF-8 mode. The code is duplicated for the caseless and
3126 caseful cases, for speed, since matching characters is likely to be quite
3127 common. First, ensure the minimum number of matches are present. If min =
3128 max, continue at the same level without recursing. Otherwise, if
3129 minimizing, keep trying the rest of the expression and advancing one
3130 matching character if failing, up to the maximum. Alternatively, if
3131 maximizing, find the maximum number of characters and work backwards. */
3132
3133 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3134 max, eptr));
3135
3136 if (op >= OP_STARI) /* Caseless */
3137 {
3138 fc = md->lcc[fc];
3139 for (i = 1; i <= min; i++)
3140 {
3141 if (eptr >= md->end_subject)
3142 {
3143 SCHECK_PARTIAL();
3144 MRRETURN(MATCH_NOMATCH);
3145 }
3146 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3147 }
3148 if (min == max) continue;
3149 if (minimize)
3150 {
3151 for (fi = min;; fi++)
3152 {
3153 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3154 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3155 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3156 if (eptr >= md->end_subject)
3157 {
3158 SCHECK_PARTIAL();
3159 MRRETURN(MATCH_NOMATCH);
3160 }
3161 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3162 }
3163 /* Control never gets here */
3164 }
3165 else /* Maximize */
3166 {
3167 pp = eptr;
3168 for (i = min; i < max; i++)
3169 {
3170 if (eptr >= md->end_subject)
3171 {
3172 SCHECK_PARTIAL();
3173 break;
3174 }
3175 if (fc != md->lcc[*eptr]) break;
3176 eptr++;
3177 }
3178
3179 if (possessive) continue;
3180
3181 while (eptr >= pp)
3182 {
3183 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3184 eptr--;
3185 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3186 }
3187 MRRETURN(MATCH_NOMATCH);
3188 }
3189 /* Control never gets here */
3190 }
3191
3192 /* Caseful comparisons (includes all multi-byte characters) */
3193
3194 else
3195 {
3196 for (i = 1; i <= min; i++)
3197 {
3198 if (eptr >= md->end_subject)
3199 {
3200 SCHECK_PARTIAL();
3201 MRRETURN(MATCH_NOMATCH);
3202 }
3203 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3204 }
3205
3206 if (min == max) continue;
3207
3208 if (minimize)
3209 {
3210 for (fi = min;; fi++)
3211 {
3212 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3214 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3215 if (eptr >= md->end_subject)
3216 {
3217 SCHECK_PARTIAL();
3218 MRRETURN(MATCH_NOMATCH);
3219 }
3220 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3221 }
3222 /* Control never gets here */
3223 }
3224 else /* Maximize */
3225 {
3226 pp = eptr;
3227 for (i = min; i < max; i++)
3228 {
3229 if (eptr >= md->end_subject)
3230 {
3231 SCHECK_PARTIAL();
3232 break;
3233 }
3234 if (fc != *eptr) break;
3235 eptr++;
3236 }
3237 if (possessive) continue;
3238
3239 while (eptr >= pp)
3240 {
3241 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3242 eptr--;
3243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3244 }
3245 MRRETURN(MATCH_NOMATCH);
3246 }
3247 }
3248 /* Control never gets here */
3249
3250 /* Match a negated single one-byte character. The subject character that
3251 we are checking against it can be multibyte. */
3252
3253 case OP_NOT:
3254 case OP_NOTI:
3255 if (eptr >= md->end_subject)
3256 {
3257 SCHECK_PARTIAL();
3258 MRRETURN(MATCH_NOMATCH);
3259 }
3260 ecode++;
3261 GETCHARINCTEST(c, eptr);
3262 if (op == OP_NOTI) /* The caseless case */
3263 {
3264 #ifdef SUPPORT_UTF8
3265 if (c < 256)
3266 #endif
3267 c = md->lcc[c];
3268 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3269 }
3270 else /* Caseful */
3271 {
3272 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3273 }
3274 break;
3275
3276 /* Match a negated single one-byte character repeatedly. This is almost a
3277 repeat of the code for a repeated single character, but I haven't found a
3278 nice way of commoning these up that doesn't require a test of the
3279 positive/negative option for each character match. Maybe that wouldn't add
3280 very much to the time taken, but character matching *is* what this is all
3281 about... */
3282
3283 case OP_NOTEXACT:
3284 case OP_NOTEXACTI:
3285 min = max = GET2(ecode, 1);
3286 ecode += 3;
3287 goto REPEATNOTCHAR;
3288
3289 case OP_NOTUPTO:
3290 case OP_NOTUPTOI:
3291 case OP_NOTMINUPTO:
3292 case OP_NOTMINUPTOI:
3293 min = 0;
3294 max = GET2(ecode, 1);
3295 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3296 ecode += 3;
3297 goto REPEATNOTCHAR;
3298
3299 case OP_NOTPOSSTAR:
3300 case OP_NOTPOSSTARI:
3301 possessive = TRUE;
3302 min = 0;
3303 max = INT_MAX;
3304 ecode++;
3305 goto REPEATNOTCHAR;
3306
3307 case OP_NOTPOSPLUS:
3308 case OP_NOTPOSPLUSI:
3309 possessive = TRUE;
3310 min = 1;
3311 max = INT_MAX;
3312 ecode++;
3313 goto REPEATNOTCHAR;
3314
3315 case OP_NOTPOSQUERY:
3316 case OP_NOTPOSQUERYI:
3317 possessive = TRUE;
3318 min = 0;
3319 max = 1;
3320 ecode++;
3321 goto REPEATNOTCHAR;
3322
3323 case OP_NOTPOSUPTO:
3324 case OP_NOTPOSUPTOI:
3325 possessive = TRUE;
3326 min = 0;
3327 max = GET2(ecode, 1);
3328 ecode += 3;
3329 goto REPEATNOTCHAR;
3330
3331 case OP_NOTSTAR:
3332 case OP_NOTSTARI:
3333 case OP_NOTMINSTAR:
3334 case OP_NOTMINSTARI:
3335 case OP_NOTPLUS:
3336 case OP_NOTPLUSI:
3337 case OP_NOTMINPLUS:
3338 case OP_NOTMINPLUSI:
3339 case OP_NOTQUERY:
3340 case OP_NOTQUERYI:
3341 case OP_NOTMINQUERY:
3342 case OP_NOTMINQUERYI:
3343 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3344 minimize = (c & 1) != 0;
3345 min = rep_min[c]; /* Pick up values from tables; */
3346 max = rep_max[c]; /* zero for max => infinity */
3347 if (max == 0) max = INT_MAX;
3348
3349 /* Common code for all repeated single-byte matches. */
3350
3351 REPEATNOTCHAR:
3352 fc = *ecode++;
3353
3354 /* The code is duplicated for the caseless and caseful cases, for speed,
3355 since matching characters is likely to be quite common. First, ensure the
3356 minimum number of matches are present. If min = max, continue at the same
3357 level without recursing. Otherwise, if minimizing, keep trying the rest of
3358 the expression and advancing one matching character if failing, up to the
3359 maximum. Alternatively, if maximizing, find the maximum number of
3360 characters and work backwards. */
3361
3362 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3363 max, eptr));
3364
3365 if (op >= OP_NOTSTARI) /* Caseless */
3366 {
3367 fc = md->lcc[fc];
3368
3369 #ifdef SUPPORT_UTF8
3370 /* UTF-8 mode */
3371 if (utf8)
3372 {
3373 register unsigned int d;
3374 for (i = 1; i <= min; i++)
3375 {
3376 if (eptr >= md->end_subject)
3377 {
3378 SCHECK_PARTIAL();
3379 MRRETURN(MATCH_NOMATCH);
3380 }
3381 GETCHARINC(d, eptr);
3382 if (d < 256) d = md->lcc[d];
3383 if (fc == d) MRRETURN(MATCH_NOMATCH);
3384 }
3385 }
3386 else
3387 #endif
3388
3389 /* Not UTF-8 mode */
3390 {
3391 for (i = 1; i <= min; i++)
3392 {
3393 if (eptr >= md->end_subject)
3394 {
3395 SCHECK_PARTIAL();
3396 MRRETURN(MATCH_NOMATCH);
3397 }
3398 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3399 }
3400 }
3401
3402 if (min == max) continue;
3403
3404 if (minimize)
3405 {
3406 #ifdef SUPPORT_UTF8
3407 /* UTF-8 mode */
3408 if (utf8)
3409 {
3410 register unsigned int d;
3411 for (fi = min;; fi++)
3412 {
3413 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3414 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3415 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3416 if (eptr >= md->end_subject)
3417 {
3418 SCHECK_PARTIAL();
3419 MRRETURN(MATCH_NOMATCH);
3420 }
3421 GETCHARINC(d, eptr);
3422 if (d < 256) d = md->lcc[d];
3423 if (fc == d) MRRETURN(MATCH_NOMATCH);
3424 }
3425 }
3426 else
3427 #endif
3428 /* Not UTF-8 mode */
3429 {
3430 for (fi = min;; fi++)
3431 {
3432 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3433 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3434 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3435 if (eptr >= md->end_subject)
3436 {
3437 SCHECK_PARTIAL();
3438 MRRETURN(MATCH_NOMATCH);
3439 }
3440 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3441 }
3442 }
3443 /* Control never gets here */
3444 }
3445
3446 /* Maximize case */
3447
3448 else
3449 {
3450 pp = eptr;
3451
3452 #ifdef SUPPORT_UTF8
3453 /* UTF-8 mode */
3454 if (utf8)
3455 {
3456 register unsigned int d;
3457 for (i = min; i < max; i++)
3458 {
3459 int len = 1;
3460 if (eptr >= md->end_subject)
3461 {
3462 SCHECK_PARTIAL();
3463 break;
3464 }
3465 GETCHARLEN(d, eptr, len);
3466 if (d < 256) d = md->lcc[d];
3467 if (fc == d) break;
3468 eptr += len;
3469 }
3470 if (possessive) continue;
3471 for(;;)
3472 {
3473 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3474 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3475 if (eptr-- == pp) break; /* Stop if tried at original pos */
3476 BACKCHAR(eptr);
3477 }
3478 }
3479 else
3480 #endif
3481 /* Not UTF-8 mode */
3482 {
3483 for (i = min; i < max; i++)
3484 {
3485 if (eptr >= md->end_subject)
3486 {
3487 SCHECK_PARTIAL();
3488 break;
3489 }
3490 if (fc == md->lcc[*eptr]) break;
3491 eptr++;
3492 }
3493 if (possessive) continue;
3494 while (eptr >= pp)
3495 {
3496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3498 eptr--;
3499 }
3500 }
3501
3502 MRRETURN(MATCH_NOMATCH);
3503 }
3504 /* Control never gets here */
3505 }
3506
3507 /* Caseful comparisons */
3508
3509 else
3510 {
3511 #ifdef SUPPORT_UTF8
3512 /* UTF-8 mode */
3513 if (utf8)
3514 {
3515 register unsigned int d;
3516 for (i = 1; i <= min; i++)
3517 {
3518 if (eptr >= md->end_subject)
3519 {
3520 SCHECK_PARTIAL();
3521 MRRETURN(MATCH_NOMATCH);
3522 }
3523 GETCHARINC(d, eptr);
3524 if (fc == d) MRRETURN(MATCH_NOMATCH);
3525 }
3526 }
3527 else
3528 #endif
3529 /* Not UTF-8 mode */
3530 {
3531 for (i = 1; i <= min; i++)
3532 {
3533 if (eptr >= md->end_subject)
3534 {
3535 SCHECK_PARTIAL();
3536 MRRETURN(MATCH_NOMATCH);
3537 }
3538 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3539 }
3540 }
3541
3542 if (min == max) continue;
3543
3544 if (minimize)
3545 {
3546 #ifdef SUPPORT_UTF8
3547 /* UTF-8 mode */
3548 if (utf8)
3549 {
3550 register unsigned int d;
3551 for (fi = min;; fi++)
3552 {
3553 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3554 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3555 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3556 if (eptr >= md->end_subject)
3557 {
3558 SCHECK_PARTIAL();
3559 MRRETURN(MATCH_NOMATCH);
3560 }
3561 GETCHARINC(d, eptr);
3562 if (fc == d) MRRETURN(MATCH_NOMATCH);
3563 }
3564 }
3565 else
3566 #endif
3567 /* Not UTF-8 mode */
3568 {
3569 for (fi = min;; fi++)
3570 {
3571 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3572 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3573 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3574 if (eptr >= md->end_subject)
3575 {
3576 SCHECK_PARTIAL();
3577 MRRETURN(MATCH_NOMATCH);
3578 }
3579 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3580 }
3581 }
3582 /* Control never gets here */
3583 }
3584
3585 /* Maximize case */
3586
3587 else
3588 {
3589 pp = eptr;
3590
3591 #ifdef SUPPORT_UTF8
3592 /* UTF-8 mode */
3593 if (utf8)
3594 {
3595 register unsigned int d;
3596 for (i = min; i < max; i++)
3597 {
3598 int len = 1;
3599 if (eptr >= md->end_subject)
3600 {
3601 SCHECK_PARTIAL();
3602 break;
3603 }
3604 GETCHARLEN(d, eptr, len);
3605 if (fc == d) break;
3606 eptr += len;
3607 }
3608 if (possessive) continue;
3609 for(;;)
3610 {
3611 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3613 if (eptr-- == pp) break; /* Stop if tried at original pos */
3614 BACKCHAR(eptr);
3615 }
3616 }
3617 else
3618 #endif
3619 /* Not UTF-8 mode */
3620 {
3621 for (i = min; i < max; i++)
3622 {
3623 if (eptr >= md->end_subject)
3624 {
3625 SCHECK_PARTIAL();
3626 break;
3627 }
3628 if (fc == *eptr) break;
3629 eptr++;
3630 }
3631 if (possessive) continue;
3632 while (eptr >= pp)
3633 {
3634 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3635 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3636 eptr--;
3637 }
3638 }
3639
3640 MRRETURN(MATCH_NOMATCH);
3641 }
3642 }
3643 /* Control never gets here */
3644
3645 /* Match a single character type repeatedly; several different opcodes
3646 share code. This is very similar to the code for single characters, but we
3647 repeat it in the interests of efficiency. */
3648
3649 case OP_TYPEEXACT:
3650 min = max = GET2(ecode, 1);
3651 minimize = TRUE;
3652 ecode += 3;
3653 goto REPEATTYPE;
3654
3655 case OP_TYPEUPTO:
3656 case OP_TYPEMINUPTO:
3657 min = 0;
3658 max = GET2(ecode, 1);
3659 minimize = *ecode == OP_TYPEMINUPTO;
3660 ecode += 3;
3661 goto REPEATTYPE;
3662
3663 case OP_TYPEPOSSTAR:
3664 possessive = TRUE;
3665 min = 0;
3666 max = INT_MAX;
3667 ecode++;
3668 goto REPEATTYPE;
3669
3670 case OP_TYPEPOSPLUS:
3671 possessive = TRUE;
3672 min = 1;
3673 max = INT_MAX;
3674 ecode++;
3675 goto REPEATTYPE;
3676
3677 case OP_TYPEPOSQUERY:
3678 possessive = TRUE;
3679 min = 0;
3680 max = 1;
3681 ecode++;
3682 goto REPEATTYPE;
3683
3684 case OP_TYPEPOSUPTO:
3685 possessive = TRUE;
3686 min = 0;
3687 max = GET2(ecode, 1);
3688 ecode += 3;
3689 goto REPEATTYPE;
3690
3691 case OP_TYPESTAR:
3692 case OP_TYPEMINSTAR:
3693 case OP_TYPEPLUS:
3694 case OP_TYPEMINPLUS:
3695 case OP_TYPEQUERY:
3696 case OP_TYPEMINQUERY:
3697 c = *ecode++ - OP_TYPESTAR;
3698 minimize = (c & 1) != 0;
3699 min = rep_min[c]; /* Pick up values from tables; */
3700 max = rep_max[c]; /* zero for max => infinity */
3701 if (max == 0) max = INT_MAX;
3702
3703 /* Common code for all repeated single character type matches. Note that
3704 in UTF-8 mode only OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR are restricted
3705 to one-byte characters; most other types can match multi-byte characters. */
3706
3707 REPEATTYPE:
3708 ctype = *ecode++; /* Code for the character type */
3709
3710 #ifdef SUPPORT_UCP
3711 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3712 {
3713 prop_fail_result = ctype == OP_NOTPROP;
3714 prop_type = *ecode++;
3715 prop_value = *ecode++;
3716 }
3717 else prop_type = -1;
3718 #endif
3719
3720 /* First, ensure the minimum number of matches are present. Use inline
3721 code for maximizing the speed, and do the type test once at the start
3722 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3723 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3724 and single-bytes. */
3725
3726 if (min > 0)
3727 {
3728 #ifdef SUPPORT_UCP
3729 if (prop_type >= 0)
3730 {
3731 switch(prop_type)
3732 {
3733 case PT_ANY:
3734 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3735 for (i = 1; i <= min; i++)
3736 {
3737 if (eptr >= md->end_subject)
3738 {
3739 SCHECK_PARTIAL();
3740 MRRETURN(MATCH_NOMATCH);
3741 }
3742 GETCHARINCTEST(c, eptr);
3743 }
3744 break;
3745
3746 case PT_LAMP:
3747 for (i = 1; i <= min; i++)
3748 {
3749 int chartype;
3750 if (eptr >= md->end_subject)
3751 {
3752 SCHECK_PARTIAL();
3753 MRRETURN(MATCH_NOMATCH);
3754 }
3755 GETCHARINCTEST(c, eptr);
3756 chartype = UCD_CHARTYPE(c);
3757 if ((chartype == ucp_Lu ||
3758 chartype == ucp_Ll ||
3759 chartype == ucp_Lt) == prop_fail_result)
3760 MRRETURN(MATCH_NOMATCH);
3761 }
3762 break;
3763
3764 case PT_GC:
3765 for (i = 1; i <= min; i++)
3766 {
3767 if (eptr >= md->end_subject)
3768 {
3769 SCHECK_PARTIAL();
3770 MRRETURN(MATCH_NOMATCH);
3771 }
3772 GETCHARINCTEST(c, eptr);
3773 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3774 MRRETURN(MATCH_NOMATCH);
3775 }
3776 break;
3777
3778 case PT_PC:
3779 for (i = 1; i <= min; i++)
3780 {
3781 if (eptr >= md->end_subject)
3782 {
3783 SCHECK_PARTIAL();
3784 MRRETURN(MATCH_NOMATCH);
3785 }
3786 GETCHARINCTEST(c, eptr);
3787 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3788 MRRETURN(MATCH_NOMATCH);
3789 }
3790 break;
3791
3792 case PT_SC:
3793 for (i = 1; i <= min; i++)
3794 {
3795 if (eptr >= md->end_subject)
3796 {
3797 SCHECK_PARTIAL();
3798 MRRETURN(MATCH_NOMATCH);
3799 }
3800 GETCHARINCTEST(c, eptr);
3801 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3802 MRRETURN(MATCH_NOMATCH);
3803 }
3804 break;
3805
3806 case PT_ALNUM:
3807 for (i = 1; i <= min; i++)
3808 {
3809 int category;
3810 if (eptr >= md->end_subject)
3811 {
3812 SCHECK_PARTIAL();
3813 MRRETURN(MATCH_NOMATCH);
3814 }
3815 GETCHARINCTEST(c, eptr);
3816 category = UCD_CATEGORY(c);
3817 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3818 MRRETURN(MATCH_NOMATCH);
3819 }
3820 break;
3821
3822 case PT_SPACE: /* Perl space */
3823 for (i = 1; i <= min; i++)
3824 {
3825 if (eptr >= md->end_subject)
3826 {
3827 SCHECK_PARTIAL();
3828 MRRETURN(MATCH_NOMATCH);
3829 }
3830 GETCHARINCTEST(c, eptr);
3831 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3832 c == CHAR_FF || c == CHAR_CR)
3833 == prop_fail_result)
3834 MRRETURN(MATCH_NOMATCH);
3835 }
3836 break;
3837
3838 case PT_PXSPACE: /* POSIX space */
3839 for (i = 1; i <= min; i++)
3840 {
3841 if (eptr >= md->end_subject)
3842 {
3843 SCHECK_PARTIAL();
3844 MRRETURN(MATCH_NOMATCH);
3845 }
3846 GETCHARINCTEST(c, eptr);
3847 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3848 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3849 == prop_fail_result)
3850 MRRETURN(MATCH_NOMATCH);
3851 }
3852 break;
3853
3854 case PT_WORD:
3855 for (i = 1; i <= min; i++)
3856 {
3857 int category;
3858 if (eptr >= md->end_subject)
3859 {
3860 SCHECK_PARTIAL();
3861 MRRETURN(MATCH_NOMATCH);
3862 }
3863 GETCHARINCTEST(c, eptr);
3864 category = UCD_CATEGORY(c);
3865 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3866 == prop_fail_result)
3867 MRRETURN(MATCH_NOMATCH);
3868 }
3869 break;
3870
3871 /* This should not occur */
3872
3873 default:
3874 RRETURN(PCRE_ERROR_INTERNAL);
3875 }
3876 }
3877
3878 /* Match extended Unicode sequences. We will get here only if the
3879 support is in the binary; otherwise a compile-time error occurs. */
3880
3881 else if (ctype == OP_EXTUNI)
3882 {
3883 for (i = 1; i <= min; i++)
3884 {
3885 if (eptr >= md->end_subject)
3886 {
3887 SCHECK_PARTIAL();
3888 MRRETURN(MATCH_NOMATCH);
3889 }
3890 GETCHARINCTEST(c, eptr);
3891 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3892 while (eptr < md->end_subject)
3893 {
3894 int len = 1;
3895 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3896 if (UCD_CATEGORY(c) != ucp_M) break;
3897 eptr += len;
3898 }
3899 }
3900 }
3901
3902 else
3903 #endif /* SUPPORT_UCP */
3904
3905 /* Handle all other cases when the coding is UTF-8 */
3906
3907 #ifdef SUPPORT_UTF8
3908 if (utf8) switch(ctype)
3909 {
3910 case OP_ANY:
3911 for (i = 1; i <= min; i++)
3912 {
3913 if (eptr >= md->end_subject)
3914 {
3915 SCHECK_PARTIAL();
3916 MRRETURN(MATCH_NOMATCH);
3917 }
3918 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3919 eptr++;
3920 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3921 }
3922 break;
3923
3924 case OP_ALLANY:
3925 for (i = 1; i <= min; i++)
3926 {
3927 if (eptr >= md->end_subject)
3928 {
3929 SCHECK_PARTIAL();
3930 MRRETURN(MATCH_NOMATCH);
3931 }
3932 eptr++;
3933 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3934 }
3935 break;
3936
3937 case OP_ANYBYTE:
3938 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3939 eptr += min;
3940 break;
3941
3942 case OP_ANYNL:
3943 for (i = 1; i <= min; i++)
3944 {
3945 if (eptr >= md->end_subject)
3946 {
3947 SCHECK_PARTIAL();
3948 MRRETURN(MATCH_NOMATCH);
3949 }
3950 GETCHARINC(c, eptr);
3951 switch(c)
3952 {
3953 default: MRRETURN(MATCH_NOMATCH);
3954
3955 case 0x000d:
3956 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3957 break;
3958
3959 case 0x000a:
3960 break;
3961
3962 case 0x000b:
3963 case 0x000c:
3964 case 0x0085:
3965 case 0x2028:
3966 case 0x2029:
3967 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3968 break;
3969 }
3970 }
3971 break;
3972
3973 case OP_NOT_HSPACE:
3974 for (i = 1; i <= min; i++)
3975 {
3976 if (eptr >= md->end_subject)
3977 {
3978 SCHECK_PARTIAL();
3979 MRRETURN(MATCH_NOMATCH);
3980 }
3981 GETCHARINC(c, eptr);
3982 switch(c)
3983 {
3984 default: break;
3985 case 0x09: /* HT */
3986 case 0x20: /* SPACE */
3987 case 0xa0: /* NBSP */
3988 case 0x1680: /* OGHAM SPACE MARK */
3989 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3990 case 0x2000: /* EN QUAD */
3991 case 0x2001: /* EM QUAD */
3992 case 0x2002: /* EN SPACE */
3993 case 0x2003: /* EM SPACE */
3994 case 0x2004: /* THREE-PER-EM SPACE */
3995 case 0x2005: /* FOUR-PER-EM SPACE */
3996 case 0x2006: /* SIX-PER-EM SPACE */
3997 case 0x2007: /* FIGURE SPACE */
3998 case 0x2008: /* PUNCTUATION SPACE */
3999 case 0x2009: /* THIN SPACE */
4000 case 0x200A: /* HAIR SPACE */
4001 case 0x202f: /* NARROW NO-BREAK SPACE */
4002 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4003 case 0x3000: /* IDEOGRAPHIC SPACE */
4004 MRRETURN(MATCH_NOMATCH);
4005 }
4006 }
4007 break;
4008
4009 case OP_HSPACE:
4010 for (i = 1; i <= min; i++)
4011 {
4012 if (eptr >= md->end_subject)
4013 {
4014 SCHECK_PARTIAL();
4015 MRRETURN(MATCH_NOMATCH);
4016 }
4017 GETCHARINC(c, eptr);
4018 switch(c)
4019 {
4020 default: MRRETURN(MATCH_NOMATCH);
4021 case 0x09: /* HT */
4022 case 0x20: /* SPACE */
4023 case 0xa0: /* NBSP */
4024 case 0x1680: /* OGHAM SPACE MARK */
4025 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4026 case 0x2000: /* EN QUAD */
4027 case 0x2001: /* EM QUAD */
4028 case 0x2002: /* EN SPACE */
4029 case 0x2003: /* EM SPACE */
4030 case 0x2004: /* THREE-PER-EM SPACE */
4031 case 0x2005: /* FOUR-PER-EM SPACE */
4032 case 0x2006: /* SIX-PER-EM SPACE */
4033 case 0x2007: /* FIGURE SPACE */
4034 case 0x2008: /* PUNCTUATION SPACE */
4035 case 0x2009: /* THIN SPACE */
4036 case 0x200A: /* HAIR SPACE */
4037 case 0x202f: /* NARROW NO-BREAK SPACE */
4038 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4039 case 0x3000: /* IDEOGRAPHIC SPACE */
4040 break;
4041 }
4042 }
4043 break;
4044
4045 case OP_NOT_VSPACE:
4046 for (i = 1; i <= min; i++)
4047 {
4048 if (eptr >= md->end_subject)
4049 {
4050 SCHECK_PARTIAL();
4051 MRRETURN(MATCH_NOMATCH);
4052 }
4053 GETCHARINC(c, eptr);
4054 switch(c)
4055 {
4056 default: break;
4057 case 0x0a: /* LF */
4058 case 0x0b: /* VT */
4059 case 0x0c: /* FF */
4060 case 0x0d: /* CR */
4061 case 0x85: /* NEL */
4062 case 0x2028: /* LINE SEPARATOR */
4063 case 0x2029: /* PARAGRAPH SEPARATOR */
4064 MRRETURN(MATCH_NOMATCH);
4065 }
4066 }
4067 break;
4068
4069 case OP_VSPACE:
4070 for (i = 1; i <= min; i++)
4071 {
4072 if (eptr >= md->end_subject)
4073 {
4074 SCHECK_PARTIAL();
4075 MRRETURN(MATCH_NOMATCH);
4076 }
4077 GETCHARINC(c, eptr);
4078 switch(c)
4079 {
4080 default: MRRETURN(MATCH_NOMATCH);
4081 case 0x0a: /* LF */
4082 case 0x0b: /* VT */
4083 case 0x0c: /* FF */
4084 case 0x0d: /* CR */
4085 case 0x85: /* NEL */
4086 case 0x2028: /* LINE SEPARATOR */
4087 case 0x2029: /* PARAGRAPH SEPARATOR */
4088 break;
4089 }
4090 }
4091 break;
4092
4093 case OP_NOT_DIGIT:
4094 for (i = 1; i <= min; i++)
4095 {
4096 if (eptr >= md->end_subject)
4097 {
4098 SCHECK_PARTIAL();
4099 MRRETURN(MATCH_NOMATCH);
4100 }
4101 GETCHARINC(c, eptr);
4102 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4103 MRRETURN(MATCH_NOMATCH);
4104 }
4105 break;
4106
4107 case OP_DIGIT:
4108 for (i = 1; i <= min; i++)
4109 {
4110 if (eptr >= md->end_subject)
4111 {
4112 SCHECK_PARTIAL();
4113 MRRETURN(MATCH_NOMATCH);
4114 }
4115 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4116 MRRETURN(MATCH_NOMATCH);
4117 /* No need to skip more bytes - we know it's a 1-byte character */
4118 }
4119 break;
4120
4121 case OP_NOT_WHITESPACE:
4122 for (i = 1; i <= min; i++)
4123 {
4124 if (eptr >= md->end_subject)
4125 {
4126 SCHECK_PARTIAL();
4127 MRRETURN(MATCH_NOMATCH);
4128 }
4129 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4130 MRRETURN(MATCH_NOMATCH);
4131 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4132 }
4133 break;
4134
4135 case OP_WHITESPACE:
4136 for (i = 1; i <= min; i++)
4137 {
4138 if (eptr >= md->end_subject)
4139 {
4140 SCHECK_PARTIAL();
4141 MRRETURN(MATCH_NOMATCH);
4142 }
4143 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4144 MRRETURN(MATCH_NOMATCH);
4145 /* No need to skip more bytes - we know it's a 1-byte character */
4146 }
4147 break;
4148
4149 case OP_NOT_WORDCHAR:
4150 for (i = 1; i <= min; i++)
4151 {
4152 if (eptr >= md->end_subject)
4153 {
4154 SCHECK_PARTIAL();
4155 MRRETURN(MATCH_NOMATCH);
4156 }
4157 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4158 MRRETURN(MATCH_NOMATCH);
4159 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4160 }
4161 break;
4162
4163 case OP_WORDCHAR:
4164 for (i = 1; i <= min; i++)
4165 {
4166 if (eptr >= md->end_subject)
4167 {
4168 SCHECK_PARTIAL();
4169 MRRETURN(MATCH_NOMATCH);
4170 }
4171 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4172 MRRETURN(MATCH_NOMATCH);
4173 /* No need to skip more bytes - we know it's a 1-byte character */
4174 }
4175 break;
4176
4177 default:
4178 RRETURN(PCRE_ERROR_INTERNAL);
4179 } /* End switch(ctype) */
4180
4181 else
4182 #endif /* SUPPORT_UTF8 */
4183
4184 /* Code for the non-UTF-8 case for minimum matching of operators other
4185 than OP_PROP and OP_NOTPROP. */
4186
4187 switch(ctype)
4188 {
4189 case OP_ANY:
4190 for (i = 1; i <= min; i++)
4191 {
4192 if (eptr >= md->end_subject)
4193 {
4194 SCHECK_PARTIAL();
4195 MRRETURN(MATCH_NOMATCH);
4196 }
4197 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4198 eptr++;
4199 }
4200 break;
4201
4202 case OP_ALLANY:
4203 if (eptr > md->end_subject - min)
4204 {
4205 SCHECK_PARTIAL();
4206 MRRETURN(MATCH_NOMATCH);
4207 }
4208 eptr += min;
4209 break;
4210
4211 case OP_ANYBYTE:
4212 if (eptr > md->end_subject - min)
4213 {
4214 SCHECK_PARTIAL();
4215 MRRETURN(MATCH_NOMATCH);
4216 }
4217 eptr += min;
4218 break;
4219
4220 case OP_ANYNL:
4221 for (i = 1; i <= min; i++)
4222 {
4223 if (eptr >= md->end_subject)
4224 {
4225 SCHECK_PARTIAL();
4226 MRRETURN(MATCH_NOMATCH);
4227 }
4228 switch(*eptr++)
4229 {
4230 default: MRRETURN(MATCH_NOMATCH);
4231
4232 case 0x000d:
4233 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4234 break;
4235
4236 case 0x000a:
4237 break;
4238
4239 case 0x000b:
4240 case 0x000c:
4241 case 0x0085:
4242 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4243 break;
4244 }
4245 }
4246 break;
4247
4248 case OP_NOT_HSPACE:
4249 for (i = 1; i <= min; i++)
4250 {
4251 if (eptr >= md->end_subject)
4252 {
4253 SCHECK_PARTIAL();
4254 MRRETURN(MATCH_NOMATCH);
4255 }
4256 switch(*eptr++)
4257 {
4258 default: break;
4259 case 0x09: /* HT */
4260 case 0x20: /* SPACE */
4261 case 0xa0: /* NBSP */
4262 MRRETURN(MATCH_NOMATCH);
4263 }
4264 }
4265 break;
4266
4267 case OP_HSPACE:
4268 for (i = 1; i <= min; i++)
4269 {
4270 if (eptr >= md->end_subject)
4271 {
4272 SCHECK_PARTIAL();
4273 MRRETURN(MATCH_NOMATCH);
4274 }
4275 switch(*eptr++)
4276 {
4277 default: MRRETURN(MATCH_NOMATCH);
4278 case 0x09: /* HT */
4279 case 0x20: /* SPACE */
4280 case 0xa0: /* NBSP */
4281 break;
4282 }
4283 }
4284 break;
4285
4286 case OP_NOT_VSPACE:
4287 for (i = 1; i <= min; i++)
4288 {
4289 if (eptr >= md->end_subject)
4290 {
4291 SCHECK_PARTIAL();
4292 MRRETURN(MATCH_NOMATCH);
4293 }
4294 switch(*eptr++)
4295 {
4296 default: break;
4297 case 0x0a: /* LF */
4298 case 0x0b: /* VT */
4299 case 0x0c: /* FF */
4300 case 0x0d: /* CR */
4301 case 0x85: /* NEL */
4302 MRRETURN(MATCH_NOMATCH);
4303 }
4304 }
4305 break;
4306
4307 case OP_VSPACE:
4308 for (i = 1; i <= min; i++)
4309 {
4310 if (eptr >= md->end_subject)
4311 {
4312 SCHECK_PARTIAL();
4313 MRRETURN(MATCH_NOMATCH);
4314 }
4315 switch(*eptr++)
4316 {
4317 default: MRRETURN(MATCH_NOMATCH);
4318 case 0x0a: /* LF */
4319 case 0x0b: /* VT */
4320 case 0x0c: /* FF */
4321 case 0x0d: /* CR */
4322 case 0x85: /* NEL */
4323 break;
4324 }
4325 }
4326 break;
4327
4328 case OP_NOT_DIGIT:
4329 for (i = 1; i <= min; i++)
4330 {
4331 if (eptr >= md->end_subject)
4332 {
4333 SCHECK_PARTIAL();
4334 MRRETURN(MATCH_NOMATCH);
4335 }
4336 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4337 }
4338 break;
4339
4340 case OP_DIGIT:
4341 for (i = 1; i <= min; i++)
4342 {
4343 if (eptr >= md->end_subject)
4344 {
4345 SCHECK_PARTIAL();
4346 MRRETURN(MATCH_NOMATCH);
4347 }
4348 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4349 }
4350 break;
4351
4352 case OP_NOT_WHITESPACE:
4353 for (i = 1; i <= min; i++)
4354 {
4355 if (eptr >= md->end_subject)
4356 {
4357 SCHECK_PARTIAL();
4358 MRRETURN(MATCH_NOMATCH);
4359 }
4360 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4361 }
4362 break;
4363
4364 case OP_WHITESPACE:
4365 for (i = 1; i <= min; i++)
4366 {
4367 if (eptr >= md->end_subject)
4368 {
4369 SCHECK_PARTIAL();
4370 MRRETURN(MATCH_NOMATCH);
4371 }
4372 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4373 }
4374 break;
4375
4376 case OP_NOT_WORDCHAR:
4377 for (i = 1; i <= min; i++)
4378 {
4379 if (eptr >= md->end_subject)
4380 {
4381 SCHECK_PARTIAL();
4382 MRRETURN(MATCH_NOMATCH);
4383 }
4384 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4385 MRRETURN(MATCH_NOMATCH);
4386 }
4387 break;
4388
4389 case OP_WORDCHAR:
4390 for (i = 1; i <= min; i++)
4391 {
4392 if (eptr >= md->end_subject)
4393 {
4394 SCHECK_PARTIAL();
4395 MRRETURN(MATCH_NOMATCH);
4396 }
4397 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4398 MRRETURN(MATCH_NOMATCH);
4399 }
4400 break;
4401
4402 default:
4403 RRETURN(PCRE_ERROR_INTERNAL);
4404 }
4405 }
4406
4407 /* If min = max, continue at the same level without recursing */
4408
4409 if (min == max) continue;
4410
4411 /* If minimizing, we have to test the rest of the pattern before each
4412 subsequent match. Again, separate the UTF-8 case for speed, and also
4413 separate the UCP cases. */
4414
4415 if (minimize)
4416 {
4417 #ifdef SUPPORT_UCP
4418 if (prop_type >= 0)
4419 {
4420 switch(prop_type)
4421 {
4422 case PT_ANY:
4423 for (fi = min;; fi++)
4424 {
4425 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4427 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4428 if (eptr >= md->end_subject)
4429 {
4430 SCHECK_PARTIAL();
4431 MRRETURN(MATCH_NOMATCH);
4432 }
4433 GETCHARINCTEST(c, eptr);
4434 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4435 }
4436 /* Control never gets here */
4437
4438 case PT_LAMP:
4439 for (fi = min;; fi++)
4440 {
4441 int chartype;
4442 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4443 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4444 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4445 if (eptr >= md->end_subject)
4446 {
4447 SCHECK_PARTIAL();
4448 MRRETURN(MATCH_NOMATCH);
4449 }
4450 GETCHARINCTEST(c, eptr);
4451 chartype = UCD_CHARTYPE(c);
4452 if ((chartype == ucp_Lu ||
4453 chartype == ucp_Ll ||
4454 chartype == ucp_Lt) == prop_fail_result)
4455 MRRETURN(MATCH_NOMATCH);
4456 }
4457 /* Control never gets here */
4458
4459 case PT_GC:
4460 for (fi = min;; fi++)
4461 {
4462 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4463 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4464 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4465 if (eptr >= md->end_subject)
4466 {
4467 SCHECK_PARTIAL();
4468 MRRETURN(MATCH_NOMATCH);
4469 }
4470 GETCHARINCTEST(c, eptr);
4471 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4472 MRRETURN(MATCH_NOMATCH);
4473 }
4474 /* Control never gets here */
4475
4476 case PT_PC:
4477 for (fi = min;; fi++)
4478 {
4479 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4480 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4481 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4482 if (eptr >= md->end_subject)
4483 {
4484 SCHECK_PARTIAL();
4485 MRRETURN(MATCH_NOMATCH);
4486 }
4487 GETCHARINCTEST(c, eptr);
4488 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4489 MRRETURN(MATCH_NOMATCH);
4490 }
4491 /* Control never gets here */
4492
4493 case PT_SC:
4494 for (fi = min;; fi++)
4495 {
4496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4498 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4499 if (eptr >= md->end_subject)
4500 {
4501 SCHECK_PARTIAL();
4502 MRRETURN(MATCH_NOMATCH);
4503 }
4504 GETCHARINCTEST(c, eptr);
4505 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4506 MRRETURN(MATCH_NOMATCH);
4507 }
4508 /* Control never gets here */
4509
4510 case PT_ALNUM:
4511 for (fi = min;; fi++)
4512 {
4513 int category;
4514 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4515 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4516 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4517 if (eptr >= md->end_subject)
4518 {
4519 SCHECK_PARTIAL();
4520 MRRETURN(MATCH_NOMATCH);
4521 }
4522 GETCHARINCTEST(c, eptr);
4523 category = UCD_CATEGORY(c);
4524 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4525 MRRETURN(MATCH_NOMATCH);
4526 }
4527 /* Control never gets here */
4528
4529 case PT_SPACE: /* Perl space */
4530 for (fi = min;; fi++)
4531 {
4532 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4533 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4534 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4535 if (eptr >= md->end_subject)
4536 {
4537 SCHECK_PARTIAL();
4538 MRRETURN(MATCH_NOMATCH);
4539 }
4540 GETCHARINCTEST(c, eptr);
4541 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4542 c == CHAR_FF || c == CHAR_CR)
4543 == prop_fail_result)
4544 MRRETURN(MATCH_NOMATCH);
4545 }
4546 /* Control never gets here */
4547
4548 case PT_PXSPACE: /* POSIX space */
4549 for (fi = min;; fi++)
4550 {
4551 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4552 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4553 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4554 if (eptr >= md->end_subject)
4555 {
4556 SCHECK_PARTIAL();
4557 MRRETURN(MATCH_NOMATCH);
4558 }
4559 GETCHARINCTEST(c, eptr);
4560 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4561 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4562 == prop_fail_result)
4563 MRRETURN(MATCH_NOMATCH);
4564 }
4565 /* Control never gets here */
4566
4567 case PT_WORD:
4568 for (fi = min;; fi++)
4569 {
4570 int category;
4571 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4572 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4573 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4574 if (eptr >= md->end_subject)
4575 {
4576 SCHECK_PARTIAL();
4577 MRRETURN(MATCH_NOMATCH);
4578 }
4579 GETCHARINCTEST(c, eptr);
4580 category = UCD_CATEGORY(c);
4581 if ((category == ucp_L ||
4582 category == ucp_N ||
4583 c == CHAR_UNDERSCORE)
4584 == prop_fail_result)
4585 MRRETURN(MATCH_NOMATCH);
4586 }
4587 /* Control never gets here */
4588
4589 /* This should never occur */
4590
4591 default:
4592 RRETURN(PCRE_ERROR_INTERNAL);
4593 }
4594 }
4595
4596 /* Match extended Unicode sequences. We will get here only if the
4597 support is in the binary; otherwise a compile-time error occurs. */
4598
4599 else if (ctype == OP_EXTUNI)
4600 {
4601 for (fi = min;; fi++)
4602 {
4603 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4604 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4605 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4606 if (eptr >= md->end_subject)
4607 {
4608 SCHECK_PARTIAL();
4609 MRRETURN(MATCH_NOMATCH);
4610 }
4611 GETCHARINCTEST(c, eptr);
4612 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4613 while (eptr < md->end_subject)
4614 {
4615 int len = 1;
4616 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4617 if (UCD_CATEGORY(c) != ucp_M) break;
4618 eptr += len;
4619 }
4620 }
4621 }
4622 else
4623 #endif /* SUPPORT_UCP */
4624
4625 #ifdef SUPPORT_UTF8
4626 /* UTF-8 mode */
4627 if (utf8)
4628 {
4629 for (fi = min;; fi++)
4630 {
4631 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4633 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4634 if (eptr >= md->end_subject)
4635 {
4636 SCHECK_PARTIAL();
4637 MRRETURN(MATCH_NOMATCH);
4638 }
4639 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4640 MRRETURN(MATCH_NOMATCH);
4641 GETCHARINC(c, eptr);
4642 switch(ctype)
4643 {
4644 case OP_ANY: /* This is the non-NL case */
4645 case OP_ALLANY:
4646 case OP_ANYBYTE:
4647 break;
4648
4649 case OP_ANYNL:
4650 switch(c)
4651 {
4652 default: MRRETURN(MATCH_NOMATCH);
4653 case 0x000d:
4654 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4655 break;
4656 case 0x000a:
4657 break;
4658
4659 case 0x000b:
4660 case 0x000c:
4661 case 0x0085:
4662 case 0x2028:
4663 case 0x2029:
4664 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4665 break;
4666 }
4667 break;
4668
4669 case OP_NOT_HSPACE:
4670 switch(c)
4671 {
4672 default: break;
4673 case 0x09: /* HT */
4674 case 0x20: /* SPACE */
4675 case 0xa0: /* NBSP */
4676 case 0x1680: /* OGHAM SPACE MARK */
4677 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4678 case 0x2000: /* EN QUAD */
4679 case 0x2001: /* EM QUAD */
4680 case 0x2002: /* EN SPACE */
4681 case 0x2003: /* EM SPACE */
4682 case 0x2004: /* THREE-PER-EM SPACE */
4683 case 0x2005: /* FOUR-PER-EM SPACE */
4684 case 0x2006: /* SIX-PER-EM SPACE */
4685 case 0x2007: /* FIGURE SPACE */
4686 case 0x2008: /* PUNCTUATION SPACE */
4687 case 0x2009: /* THIN SPACE */
4688 case 0x200A: /* HAIR SPACE */
4689 case 0x202f: /* NARROW NO-BREAK SPACE */
4690 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4691 case 0x3000: /* IDEOGRAPHIC SPACE */
4692 MRRETURN(MATCH_NOMATCH);
4693 }
4694 break;
4695
4696 case OP_HSPACE:
4697 switch(c)
4698 {
4699 default: MRRETURN(MATCH_NOMATCH);
4700 case 0x09: /* HT */
4701 case 0x20: /* SPACE */
4702 case 0xa0: /* NBSP */
4703 case 0x1680: /* OGHAM SPACE MARK */
4704 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4705 case 0x2000: /* EN QUAD */
4706 case 0x2001: /* EM QUAD */
4707 case 0x2002: /* EN SPACE */
4708 case 0x2003: /* EM SPACE */
4709 case 0x2004: /* THREE-PER-EM SPACE */
4710 case 0x2005: /* FOUR-PER-EM SPACE */
4711 case 0x2006: /* SIX-PER-EM SPACE */
4712 case 0x2007: /* FIGURE SPACE */
4713 case 0x2008: /* PUNCTUATION SPACE */
4714 case 0x2009: /* THIN SPACE */
4715 case 0x200A: /* HAIR SPACE */
4716 case 0x202f: /* NARROW NO-BREAK SPACE */
4717 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4718 case 0x3000: /* IDEOGRAPHIC SPACE */
4719 break;
4720 }
4721 break;
4722
4723 case OP_NOT_VSPACE:
4724 switch(c)
4725 {
4726 default: break;
4727 case 0x0a: /* LF */
4728 case 0x0b: /* VT */
4729 case 0x0c: /* FF */
4730 case 0x0d: /* CR */
4731 case 0x85: /* NEL */
4732 case 0x2028: /* LINE SEPARATOR */
4733 case 0x2029: /* PARAGRAPH SEPARATOR */
4734 MRRETURN(MATCH_NOMATCH);
4735 }
4736 break;
4737
4738 case OP_VSPACE:
4739 switch(c)
4740 {
4741 default: MRRETURN(MATCH_NOMATCH);
4742 case 0x0a: /* LF */
4743 case 0x0b: /* VT */
4744 case 0x0c: /* FF */
4745 case 0x0d: /* CR */
4746 case 0x85: /* NEL */
4747 case 0x2028: /* LINE SEPARATOR */
4748 case 0x2029: /* PARAGRAPH SEPARATOR */
4749 break;
4750 }
4751 break;
4752
4753 case OP_NOT_DIGIT:
4754 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4755 MRRETURN(MATCH_NOMATCH);
4756 break;
4757
4758 case OP_DIGIT:
4759 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4760 MRRETURN(MATCH_NOMATCH);
4761 break;
4762
4763 case OP_NOT_WHITESPACE:
4764 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4765 MRRETURN(MATCH_NOMATCH);
4766 break;
4767
4768 case OP_WHITESPACE:
4769 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4770 MRRETURN(MATCH_NOMATCH);
4771 break;
4772
4773 case OP_NOT_WORDCHAR:
4774 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4775 MRRETURN(MATCH_NOMATCH);
4776 break;
4777
4778 case OP_WORDCHAR:
4779 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4780 MRRETURN(MATCH_NOMATCH);
4781 break;
4782
4783 default:
4784 RRETURN(PCRE_ERROR_INTERNAL);
4785 }
4786 }
4787 }
4788 else
4789 #endif
4790 /* Not UTF-8 mode */
4791 {
4792 for (fi = min;; fi++)
4793 {
4794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4796 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4797 if (eptr >= md->end_subject)
4798 {
4799 SCHECK_PARTIAL();
4800 MRRETURN(MATCH_NOMATCH);
4801 }
4802 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4803 MRRETURN(MATCH_NOMATCH);
4804 c = *eptr++;
4805 switch(ctype)
4806 {
4807 case OP_ANY: /* This is the non-NL case */
4808 case OP_ALLANY:
4809 case OP_ANYBYTE:
4810 break;
4811
4812 case OP_ANYNL:
4813 switch(c)
4814 {
4815 default: MRRETURN(MATCH_NOMATCH);
4816 case 0x000d:
4817 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4818 break;
4819
4820 case 0x000a:
4821 break;
4822
4823 case 0x000b:
4824 case 0x000c:
4825 case 0x0085:
4826 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4827 break;
4828 }
4829 break;
4830
4831 case OP_NOT_HSPACE:
4832 switch(c)
4833 {
4834 default: break;
4835 case 0x09: /* HT */
4836 case 0x20: /* SPACE */
4837 case 0xa0: /* NBSP */
4838 MRRETURN(MATCH_NOMATCH);
4839 }
4840 break;
4841
4842 case OP_HSPACE:
4843 switch(c)
4844 {
4845 default: MRRETURN(MATCH_NOMATCH);
4846 case 0x09: /* HT */
4847 case 0x20: /* SPACE */
4848 case 0xa0: /* NBSP */
4849 break;
4850 }
4851 break;
4852
4853 case OP_NOT_VSPACE:
4854 switch(c)
4855 {
4856 default: break;
4857 case 0x0a: /* LF */
4858 case 0x0b: /* VT */
4859 case 0x0c: /* FF */
4860 case 0x0d: /* CR */
4861 case 0x85: /* NEL */
4862 MRRETURN(MATCH_NOMATCH);
4863 }
4864 break;
4865
4866 case OP_VSPACE:
4867 switch(c)
4868 {
4869 default: MRRETURN(MATCH_NOMATCH);
4870 case 0x0a: /* LF */
4871 case 0x0b: /* VT */
4872 case 0x0c: /* FF */
4873 case 0x0d: /* CR */
4874 case 0x85: /* NEL */
4875 break;
4876 }
4877 break;
4878
4879 case OP_NOT_DIGIT:
4880 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4881 break;
4882
4883 case OP_DIGIT:
4884 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4885 break;
4886
4887 case OP_NOT_WHITESPACE:
4888 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4889 break;
4890
4891 case OP_WHITESPACE:
4892 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4893 break;
4894
4895 case OP_NOT_WORDCHAR:
4896 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4897 break;
4898
4899 case OP_WORDCHAR:
4900 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4901 break;
4902
4903 default:
4904 RRETURN(PCRE_ERROR_INTERNAL);
4905 }
4906 }
4907 }
4908 /* Control never gets here */
4909 }
4910
4911 /* If maximizing, it is worth using inline code for speed, doing the type
4912 test once at the start (i.e. keep it out of the loop). Again, keep the
4913 UTF-8 and UCP stuff separate. */
4914
4915 else
4916 {
4917 pp = eptr; /* Remember where we started */
4918
4919 #ifdef SUPPORT_UCP
4920 if (prop_type >= 0)
4921 {
4922 switch(prop_type)
4923 {
4924 case PT_ANY:
4925 for (i = min; i < max; i++)
4926 {
4927 int len = 1;
4928 if (eptr >= md->end_subject)
4929 {
4930 SCHECK_PARTIAL();
4931 break;
4932 }
4933 GETCHARLENTEST(c, eptr, len);
4934 if (prop_fail_result) break;
4935 eptr+= len;
4936 }
4937 break;
4938
4939 case PT_LAMP:
4940 for (i = min; i < max; i++)
4941 {
4942 int chartype;
4943 int len = 1;
4944 if (eptr >= md->end_subject)
4945 {
4946 SCHECK_PARTIAL();
4947 break;
4948 }
4949 GETCHARLENTEST(c, eptr, len);
4950 chartype = UCD_CHARTYPE(c);
4951 if ((chartype == ucp_Lu ||
4952 chartype == ucp_Ll ||
4953 chartype == ucp_Lt) == prop_fail_result)
4954 break;
4955 eptr+= len;
4956 }
4957 break;
4958
4959 case PT_GC:
4960 for (i = min; i < max; i++)
4961 {
4962 int len = 1;
4963 if (eptr >= md->end_subject)
4964 {
4965 SCHECK_PARTIAL();
4966 break;
4967 }
4968 GETCHARLENTEST(c, eptr, len);
4969 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
4970 eptr+= len;
4971 }
4972 break;
4973
4974 case PT_PC:
4975 for (i = min; i < max; i++)
4976 {
4977 int len = 1;
4978 if (eptr >= md->end_subject)
4979 {
4980 SCHECK_PARTIAL();
4981 break;
4982 }
4983 GETCHARLENTEST(c, eptr, len);
4984 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
4985 eptr+= len;
4986 }
4987 break;
4988
4989 case PT_SC:
4990 for (i = min; i < max; i++)
4991 {
4992 int len = 1;
4993 if (eptr >= md->end_subject)
4994 {
4995 SCHECK_PARTIAL();
4996 break;
4997 }
4998 GETCHARLENTEST(c, eptr, len);
4999 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5000 eptr+= len;
5001 }
5002 break;
5003
5004 case PT_ALNUM:
5005 for (i = min; i < max; i++)
5006 {
5007 int category;
5008 int len = 1;
5009 if (eptr >= md->end_subject)
5010 {
5011 SCHECK_PARTIAL();
5012 break;
5013 }
5014 GETCHARLENTEST(c, eptr, len);
5015 category = UCD_CATEGORY(c);
5016 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5017 break;
5018 eptr+= len;
5019 }
5020 break;
5021
5022 case PT_SPACE: /* Perl space */
5023 for (i = min; i < max; i++)
5024 {
5025 int len = 1;
5026 if (eptr >= md->end_subject)
5027 {
5028 SCHECK_PARTIAL();
5029 break;
5030 }
5031 GETCHARLENTEST(c, eptr, len);
5032 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5033 c == CHAR_FF || c == CHAR_CR)
5034 == prop_fail_result)
5035 break;
5036 eptr+= len;
5037 }
5038 break;
5039
5040 case PT_PXSPACE: /* POSIX space */
5041 for (i = min; i < max; i++)
5042 {
5043 int len = 1;
5044 if (eptr >= md->end_subject)
5045 {
5046 SCHECK_PARTIAL();
5047 break;
5048 }
5049 GETCHARLENTEST(c, eptr, len);
5050 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5051 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5052 == prop_fail_result)
5053 break;
5054 eptr+= len;
5055 }
5056 break;
5057
5058 case PT_WORD:
5059 for (i = min; i < max; i++)
5060 {
5061 int category;
5062 int len = 1;
5063 if (eptr >= md->end_subject)
5064 {
5065 SCHECK_PARTIAL();
5066 break;
5067 }
5068 GETCHARLENTEST(c, eptr, len);
5069 category = UCD_CATEGORY(c);
5070 if ((category == ucp_L || category == ucp_N ||
5071 c == CHAR_UNDERSCORE) == prop_fail_result)
5072 break;
5073 eptr+= len;
5074 }
5075 break;
5076
5077 default:
5078 RRETURN(PCRE_ERROR_INTERNAL);
5079 }
5080
5081 /* eptr is now past the end of the maximum run */
5082
5083 if (possessive) continue;
5084 for(;;)
5085 {
5086 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5088 if (eptr-- == pp) break; /* Stop if tried at original pos */
5089 if (utf8) BACKCHAR(eptr);
5090 }
5091 }
5092
5093 /* Match extended Unicode sequences. We will get here only if the
5094 support is in the binary; otherwise a compile-time error occurs. */
5095
5096 else if (ctype == OP_EXTUNI)
5097 {
5098 for (i = min; i < max; i++)
5099 {
5100 int len = 1;
5101 if (eptr >= md->end_subject)
5102 {
5103 SCHECK_PARTIAL();
5104 break;
5105 }
5106 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5107 if (UCD_CATEGORY(c) == ucp_M) break;
5108 eptr += len;
5109 while (eptr < md->end_subject)
5110 {
5111 len = 1;
5112 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5113 if (UCD_CATEGORY(c) != ucp_M) break;
5114 eptr += len;
5115 }
5116 }
5117
5118 /* eptr is now past the end of the maximum run */
5119
5120 if (possessive) continue;
5121
5122 for(;;)
5123 {
5124 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5126 if (eptr-- == pp) break; /* Stop if tried at original pos */
5127 for (;;) /* Move back over one extended */
5128 {
5129 if (!utf8) c = *eptr; else
5130 {
5131 BACKCHAR(eptr);
5132 GETCHAR(c, eptr);
5133 }
5134 if (UCD_CATEGORY(c) != ucp_M) break;
5135 eptr--;
5136 }
5137 }
5138 }
5139
5140 else
5141 #endif /* SUPPORT_UCP */
5142
5143 #ifdef SUPPORT_UTF8
5144 /* UTF-8 mode */
5145
5146 if (utf8)
5147 {
5148 switch(ctype)
5149 {
5150 case OP_ANY:
5151 if (max < INT_MAX)
5152 {
5153 for (i = min; i < max; i++)
5154 {
5155 if (eptr >= md->end_subject)
5156 {
5157 SCHECK_PARTIAL();
5158 break;
5159 }
5160 if (IS_NEWLINE(eptr)) break;
5161 eptr++;
5162 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5163 }
5164 }
5165
5166 /* Handle unlimited UTF-8 repeat */
5167
5168 else
5169 {
5170 for (i = min; i < max; i++)
5171 {
5172 if (eptr >= md->end_subject)
5173 {
5174 SCHECK_PARTIAL();
5175 break;
5176 }
5177 if (IS_NEWLINE(eptr)) break;
5178 eptr++;
5179 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5180 }
5181 }
5182 break;
5183
5184 case OP_ALLANY:
5185 if (max < INT_MAX)
5186 {
5187 for (i = min; i < max; i++)
5188 {
5189 if (eptr >= md->end_subject)
5190 {
5191 SCHECK_PARTIAL();
5192 break;
5193 }
5194 eptr++;
5195 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5196 }
5197 }
5198 else
5199 {
5200 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5201 SCHECK_PARTIAL();
5202 }
5203 break;
5204
5205 /* The byte case is the same as non-UTF8 */
5206
5207 case OP_ANYBYTE:
5208 c = max - min;
5209 if (c > (unsigned int)(md->end_subject - eptr))
5210 {
5211 eptr = md->end_subject;
5212 SCHECK_PARTIAL();
5213 }
5214 else eptr += c;
5215 break;
5216
5217 case OP_ANYNL:
5218 for (i = min; i < max; i++)
5219 {
5220 int len = 1;
5221 if (eptr >= md->end_subject)
5222 {
5223 SCHECK_PARTIAL();
5224 break;
5225 }
5226 GETCHARLEN(c, eptr, len);
5227 if (c == 0x000d)
5228 {
5229 if (++eptr >= md->end_subject) break;
5230 if (*eptr == 0x000a) eptr++;
5231 }
5232 else
5233 {
5234 if (c != 0x000a &&
5235 (md->bsr_anycrlf ||
5236 (c != 0x000b && c != 0x000c &&
5237 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5238 break;
5239 eptr += len;
5240 }
5241 }
5242 break;
5243
5244 case OP_NOT_HSPACE:
5245 case OP_HSPACE:
5246 for (i = min; i < max; i++)
5247 {
5248 BOOL gotspace;
5249 int len = 1;
5250 if (eptr >= md->end_subject)
5251 {
5252 SCHECK_PARTIAL();
5253 break;
5254 }
5255 GETCHARLEN(c, eptr, len);
5256 switch(c)
5257 {
5258 default: gotspace = FALSE; break;
5259 case 0x09: /* HT */
5260 case 0x20: /* SPACE */
5261 case 0xa0: /* NBSP */
5262 case 0x1680: /* OGHAM SPACE MARK */
5263 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5264 case 0x2000: /* EN QUAD */
5265 case 0x2001: /* EM QUAD */
5266 case 0x2002: /* EN SPACE */
5267 case 0x2003: /* EM SPACE */
5268 case 0x2004: /* THREE-PER-EM SPACE */
5269 case 0x2005: /* FOUR-PER-EM SPACE */
5270 case 0x2006: /* SIX-PER-EM SPACE */
5271 case 0x2007: /* FIGURE SPACE */
5272 case 0x2008: /* PUNCTUATION SPACE */
5273 case 0x2009: /* THIN SPACE */
5274 case 0x200A: /* HAIR SPACE */
5275 case 0x202f: /* NARROW NO-BREAK SPACE */
5276 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5277 case 0x3000: /* IDEOGRAPHIC SPACE */
5278 gotspace = TRUE;
5279 break;
5280 }
5281 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5282 eptr += len;
5283 }
5284 break;
5285
5286 case OP_NOT_VSPACE:
5287 case OP_VSPACE:
5288 for (i = min; i < max; i++)
5289 {
5290 BOOL gotspace;
5291 int len = 1;
5292 if (eptr >= md->end_subject)
5293 {
5294 SCHECK_PARTIAL();
5295 break;
5296 }
5297 GETCHARLEN(c, eptr, len);
5298 switch(c)
5299 {
5300 default: gotspace = FALSE; break;
5301 case 0x0a: /* LF */
5302 case 0x0b: /* VT */
5303 case 0x0c: /* FF */
5304 case 0x0d: /* CR */
5305 case 0x85: /* NEL */
5306 case 0x2028: /* LINE SEPARATOR */
5307 case 0x2029: /* PARAGRAPH SEPARATOR */
5308 gotspace = TRUE;
5309 break;
5310 }
5311 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5312 eptr += len;
5313 }
5314 break;
5315
5316 case OP_NOT_DIGIT:
5317 for (i = min; i < max; i++)
5318 {
5319 int len = 1;
5320 if (eptr >= md->end_subject)
5321 {
5322 SCHECK_PARTIAL();
5323 break;
5324 }
5325 GETCHARLEN(c, eptr, len);
5326 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5327 eptr+= len;
5328 }
5329 break;
5330
5331 case OP_DIGIT:
5332 for (i = min; i < max; i++)
5333 {
5334 int len = 1;
5335 if (eptr >= md->end_subject)
5336 {
5337 SCHECK_PARTIAL();
5338 break;
5339 }
5340 GETCHARLEN(c, eptr, len);
5341 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5342 eptr+= len;
5343 }
5344 break;
5345
5346 case OP_NOT_WHITESPACE:
5347 for (i = min; i < max; i++)
5348 {
5349 int len = 1;
5350 if (eptr >= md->end_subject)
5351 {
5352 SCHECK_PARTIAL();
5353 break;
5354 }
5355 GETCHARLEN(c, eptr, len);
5356 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5357 eptr+= len;
5358 }
5359 break;
5360
5361 case OP_WHITESPACE:
5362 for (i = min; i < max; i++)
5363 {
5364 int len = 1;
5365 if (eptr >= md->end_subject)
5366 {
5367 SCHECK_PARTIAL();
5368 break;
5369 }
5370 GETCHARLEN(c, eptr, len);
5371 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5372 eptr+= len;
5373 }
5374 break;
5375
5376 case OP_NOT_WORDCHAR:
5377 for (i = min; i < max; i++)
5378 {
5379 int len = 1;
5380 if (eptr >= md->end_subject)
5381 {
5382 SCHECK_PARTIAL();
5383 break;
5384 }
5385 GETCHARLEN(c, eptr, len);
5386 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5387 eptr+= len;
5388 }
5389 break;
5390
5391 case OP_WORDCHAR:
5392 for (i = min; i < max; i++)
5393 {
5394 int len = 1;
5395 if (eptr >= md->end_subject)
5396 {
5397 SCHECK_PARTIAL();
5398 break;
5399 }
5400 GETCHARLEN(c, eptr, len);
5401 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5402 eptr+= len;
5403 }
5404 break;
5405
5406 default:
5407 RRETURN(PCRE_ERROR_INTERNAL);
5408 }
5409
5410 /* eptr is now past the end of the maximum run. If possessive, we are
5411 done (no backing up). Otherwise, match at this position; anything other
5412 than no match is immediately returned. For nomatch, back up one
5413 character, unless we are matching \R and the last thing matched was
5414 \r\n, in which case, back up two bytes. */
5415
5416 if (possessive) continue;
5417 for(;;)
5418 {
5419 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5420 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5421 if (eptr-- == pp) break; /* Stop if tried at original pos */
5422 BACKCHAR(eptr);
5423 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5424 eptr[-1] == '\r') eptr--;
5425 }
5426 }
5427 else
5428 #endif /* SUPPORT_UTF8 */
5429
5430 /* Not UTF-8 mode */
5431 {
5432 switch(ctype)
5433 {
5434 case OP_ANY:
5435 for (i = min; i < max; i++)
5436 {
5437 if (eptr >= md->end_subject)
5438 {
5439 SCHECK_PARTIAL();
5440 break;
5441 }
5442 if (IS_NEWLINE(eptr)) break;
5443 eptr++;
5444 }
5445 break;
5446
5447 case OP_ALLANY:
5448 case OP_ANYBYTE:
5449 c = max - min;
5450 if (c > (unsigned int)(md->end_subject - eptr))
5451 {
5452 eptr = md->end_subject;
5453 SCHECK_PARTIAL();
5454 }
5455 else eptr += c;
5456 break;
5457
5458 case OP_ANYNL:
5459 for (i = min; i < max; i++)
5460 {
5461 if (eptr >= md->end_subject)
5462 {
5463 SCHECK_PARTIAL();
5464 break;
5465 }
5466 c = *eptr;
5467 if (c == 0x000d)
5468 {
5469 if (++eptr >= md->end_subject) break;
5470 if (*eptr == 0x000a) eptr++;
5471 }
5472 else
5473 {
5474 if (c != 0x000a &&
5475 (md->bsr_anycrlf ||
5476 (c != 0x000b && c != 0x000c && c != 0x0085)))
5477 break;
5478 eptr++;
5479 }
5480 }
5481 break;
5482
5483 case OP_NOT_HSPACE:
5484 for (i = min; i < max; i++)
5485 {
5486 if (eptr >= md->end_subject)
5487 {
5488 SCHECK_PARTIAL();
5489 break;
5490 }
5491 c = *eptr;
5492 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5493 eptr++;
5494 }
5495 break;
5496
5497 case OP_HSPACE:
5498 for (i = min; i < max; i++)
5499 {
5500 if (eptr >= md->end_subject)
5501 {
5502 SCHECK_PARTIAL();
5503 break;
5504 }
5505 c = *eptr;
5506 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5507 eptr++;
5508 }
5509 break;
5510
5511 case OP_NOT_VSPACE:
5512 for (i = min; i < max; i++)
5513 {
5514 if (eptr >= md->end_subject)
5515 {
5516 SCHECK_PARTIAL();
5517 break;
5518 }
5519 c = *eptr;
5520 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5521 break;
5522 eptr++;
5523 }
5524 break;
5525
5526 case OP_VSPACE:
5527 for (i = min; i < max; i++)
5528 {
5529 if (eptr >= md->end_subject)
5530 {
5531 SCHECK_PARTIAL();
5532 break;
5533 }
5534 c = *eptr;
5535 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5536 break;
5537 eptr++;
5538 }
5539 break;
5540
5541 case OP_NOT_DIGIT:
5542 for (i = min; i < max; i++)
5543 {
5544 if (eptr >= md->end_subject)
5545 {
5546 SCHECK_PARTIAL();
5547 break;
5548 }
5549 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5550 eptr++;
5551 }
5552 break;
5553
5554 case OP_DIGIT:
5555 for (i = min; i < max; i++)
5556 {
5557 if (eptr >= md->end_subject)
5558 {
5559 SCHECK_PARTIAL();
5560 break;
5561 }
5562 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5563 eptr++;
5564 }
5565 break;
5566
5567 case OP_NOT_WHITESPACE:
5568 for (i = min; i < max; i++)
5569 {
5570 if (eptr >= md->end_subject)
5571 {
5572 SCHECK_PARTIAL();
5573 break;
5574 }
5575 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5576 eptr++;
5577 }
5578 break;
5579
5580 case OP_WHITESPACE:
5581 for (i = min; i < max; i++)
5582 {
5583 if (eptr >= md->end_subject)
5584 {
5585 SCHECK_PARTIAL();
5586 break;
5587 }
5588 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5589 eptr++;
5590 }
5591 break;
5592
5593 case OP_NOT_WORDCHAR:
5594 for (i = min; i < max; i++)
5595 {
5596 if (eptr >= md->end_subject)
5597 {
5598 SCHECK_PARTIAL();
5599 break;
5600 }
5601 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5602 eptr++;
5603 }
5604 break;
5605
5606 case OP_WORDCHAR:
5607 for (i = min; i < max; i++)
5608 {
5609 if (eptr >= md->end_subject)
5610 {
5611 SCHECK_PARTIAL();
5612 break;
5613 }
5614 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5615 eptr++;
5616 }
5617 break;
5618
5619 default:
5620 RRETURN(PCRE_ERROR_INTERNAL);
5621 }
5622
5623 /* eptr is now past the end of the maximum run. If possessive, we are
5624 done (no backing up). Otherwise, match at this position; anything other
5625 than no match is immediately returned. For nomatch, back up one
5626 character (byte), unless we are matching \R and the last thing matched
5627 was \r\n, in which case, back up two bytes. */
5628
5629 if (possessive) continue;
5630 while (eptr >= pp)
5631 {
5632 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5634 eptr--;
5635 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5636 eptr[-1] == '\r') eptr--;
5637 }
5638 }
5639
5640 /* Get here if we can't make it match with any permitted repetitions */
5641
5642 MRRETURN(MATCH_NOMATCH);
5643 }
5644 /* Control never gets here */
5645
5646 /* There's been some horrible disaster. Arrival here can only mean there is
5647 something seriously wrong in the code above or the OP_xxx definitions. */
5648
5649 default:
5650 DPRINTF(("Unknown opcode %d\n", *ecode));
5651 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5652 }
5653
5654 /* Do not stick any code in here without much thought; it is assumed
5655 that "continue" in the code above comes out to here to repeat the main
5656 loop. */
5657
5658 } /* End of main loop */
5659 /* Control never reaches here */
5660
5661
5662 /* When compiling to use the heap rather than the stack for recursive calls to
5663 match(), the RRETURN() macro jumps here. The number that is saved in
5664 frame->Xwhere indicates which label we actually want to return to. */
5665
5666 #ifdef NO_RECURSE
5667 #define LBL(val) case val: goto L_RM##val;
5668 HEAP_RETURN:
5669 switch (frame->Xwhere)
5670 {
5671 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5672 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5673 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5674 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5675 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5676 #ifdef SUPPORT_UTF8
5677 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5678 LBL(32) LBL(34) LBL(42) LBL(46)
5679 #ifdef SUPPORT_UCP
5680 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5681 LBL(59) LBL(60) LBL(61) LBL(62)
5682 #endif /* SUPPORT_UCP */
5683 #endif /* SUPPORT_UTF8 */
5684 default:
5685 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5686 return PCRE_ERROR_INTERNAL;
5687 }
5688 #undef LBL
5689 #endif /* NO_RECURSE */
5690 }
5691
5692
5693 /***************************************************************************
5694 ****************************************************************************
5695 RECURSION IN THE match() FUNCTION
5696
5697 Undefine all the macros that were defined above to handle this. */
5698
5699 #ifdef NO_RECURSE /* Only in NO_RECURSE builds: drop the macro names that were set up for match(), so code after this point sees them as plain identifiers */
5700 #undef eptr
5701 #undef ecode
5702 #undef mstart
5703 #undef offset_top
5704 #undef eptrb
5705 #undef flags /* pcre_exec() below declares its own local variable named flags */
5706
5707 #undef callpat
5708 #undef charptr
5709 #undef data
5710 #undef next
5711 #undef pp
5712 #undef prev
5713 #undef saved_eptr
5714
5715 #undef new_recursive
5716
5717 #undef cur_is_word
5718 #undef condition
5719 #undef prev_is_word
5720
5721 #undef ctype
5722 #undef length /* pcre_exec() below has a parameter named length */
5723 #undef max
5724 #undef min
5725 #undef number
5726 #undef offset
5727 #undef op
5728 #undef save_capture_last
5729 #undef save_offset1
5730 #undef save_offset2
5731 #undef save_offset3
5732 #undef stacksave
5733
5734 #undef newptrb
5735
5736 #endif /* NO_RECURSE */
5737
5738 /* These two are defined as macros in both cases (presumably with and without NO_RECURSE - confirm against the definitions earlier in the file), so they are undefined unconditionally, outside the #ifdef above */
5739
5740 #undef fc
5741 #undef fi
5742
5743 /***************************************************************************
5744 ***************************************************************************/
5745
5746
5747
5748 /*************************************************
5749 * Execute a Regular Expression *
5750 *************************************************/
5751
5752 /* This function applies a compiled re to a subject string and picks out
5753 portions of the string if it matches. Two elements in the vector are set for
5754 each substring: the offsets to the start and end of the substring.
5755
5756 Arguments:
5757 argument_re points to the compiled expression
5758 extra_data points to extra data or is NULL
5759 subject points to the subject string
5760 length length of subject string (may contain binary zeros)
5761 start_offset where to start in the subject string
5762 options option bits
5763 offsets points to a vector of ints to be filled in with offsets
5764 offsetcount the number of elements in the vector
5765
5766 Returns: > 0 => success; value is the number of elements filled in
5767 = 0 => success, but offsets is not big enough
5768 -1 => failed to match
5769 < -1 => some kind of unexpected problem
5770 */
5771
5772 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5773 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5774 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5775 int offsetcount)
5776 {
5777 int rc, ocount, arg_offset_max;
5778 int first_byte = -1;
5779 int req_byte = -1;
5780 int req_byte2 = -1;
5781 int newline;
5782 BOOL using_temporary_offsets = FALSE;
5783 BOOL anchored;
5784 BOOL startline;
5785 BOOL firstline;
5786 BOOL first_byte_caseless = FALSE;
5787 BOOL req_byte_caseless = FALSE;
5788 BOOL utf8;
5789 match_data match_block;
5790 match_data *md = &match_block;
5791 const uschar *tables;
5792 const uschar *start_bits = NULL;
5793 USPTR start_match = (USPTR)subject + start_offset;
5794 USPTR end_subject;
5795 USPTR start_partial = NULL;
5796 USPTR req_byte_ptr = start_match - 1;
5797
5798 pcre_study_data internal_study;
5799 const pcre_study_data *study;
5800
5801 real_pcre internal_re;
5802 const real_pcre *external_re = (const real_pcre *)argument_re;
5803 const real_pcre *re = external_re;
5804
5805 /* Plausibility checks */
5806
5807 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5808 if (re == NULL || subject == NULL ||
5809 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5810 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5811 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5812
5813 /* These two settings are used in the code for checking a UTF-8 string that
5814 follows immediately afterwards. Other values in the md block are used only
5815 during "normal" pcre_exec() processing, not when the JIT support is in use,
5816 so they are set up later. */
5817
5818 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5819 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5820 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5821
5822 /* Check a UTF-8 string if required. Pass back the character offset and error
5823 code for an invalid string if a results vector is available. */
5824
5825 #ifdef SUPPORT_UTF8
5826 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5827 {
5828 int erroroffset;
5829 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5830 if (errorcode != 0)
5831 {
5832 if (offsetcount >= 2)
5833 {
5834 offsets[0] = erroroffset;
5835 offsets[1] = errorcode;
5836 }
5837 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5838 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5839 }
5840
5841 /* Check that a start_offset points to the start of a UTF-8 character. */
5842 if (start_offset > 0 && start_offset < length &&
5843 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5844 return PCRE_ERROR_BADUTF8_OFFSET;
5845 }
5846 #endif
5847
5848 /* If the pattern was successfully studied with JIT support, run the JIT
5849 executable instead of the rest of this function. Most options must be set at
5850 compile time for the JIT code to be usable. Fallback to the normal code path if
5851 an unsupported flag is set. In particular, JIT does not support partial
5852 matching. */
5853
5854 #ifdef SUPPORT_JIT
5855 if (extra_data != NULL
5856 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
5857 && extra_data->executable_jit != NULL
5858 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
5859 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
5860 return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
5861 start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
5862 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
5863 #endif
5864
5865 /* Carry on with non-JIT matching. This information is for finding all the
5866 numbers associated with a given name, for condition testing. */
5867
5868 md->name_table = (uschar *)re + re->name_table_offset;
5869 md->name_count = re->name_count;
5870 md->name_entry_size = re->name_entry_size;
5871
5872 /* Fish out the optional data from the extra_data structure, first setting
5873 the default values. */
5874
5875 study = NULL;
5876 md->match_limit = MATCH_LIMIT;
5877 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5878 md->callout_data = NULL;
5879
5880 /* The table pointer is always in native byte order. */
5881
5882 tables = external_re->tables;
5883
5884 if (extra_data != NULL)
5885 {
5886 register unsigned int flags = extra_data->flags;
5887 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5888 study = (const pcre_study_data *)extra_data->study_data;
5889 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5890 md->match_limit = extra_data->match_limit;
5891 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5892 md->match_limit_recursion = extra_data->match_limit_recursion;
5893 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5894 md->callout_data = extra_data->callout_data;
5895 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5896 }
5897
5898 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5899 is a feature that makes it possible to save compiled regexes and re-use them
5900 in other programs later. */
5901
5902 if (tables == NULL) tables = _pcre_default_tables;
5903
5904 /* Check that the first field in the block is the magic number. If it is not,
5905 test for a regex that was compiled on a host of opposite endianness. If this is
5906 the case, flipped values are put in internal_re and internal_study if there was
5907 study data too. */
5908
5909 if (re->magic_number != MAGIC_NUMBER)
5910 {
5911 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5912 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5913 if (study != NULL) study = &internal_study;
5914 }
5915
5916 /* Set up other data */
5917
5918 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5919 startline = (re->flags & PCRE_STARTLINE) != 0;
5920 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5921
5922 /* The code starts after the real_pcre block and the capture name table. */
5923
5924 md->start_code = (const uschar *)external_re + re->name_table_offset +
5925 re->name_count * re->name_entry_size;
5926
5927 md->start_subject = (USPTR)subject;
5928 md->start_offset = start_offset;
5929 md->end_subject = md->start_subject + length;
5930 end_subject = md->end_subject;
5931
5932 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5933 md->use_ucp = (re->options & PCRE_UCP) != 0;
5934 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5935
5936 /* Some options are unpacked into BOOL variables in the hope that testing
5937 them will be faster than individual option bits. */
5938
5939 md->notbol = (options & PCRE_NOTBOL) != 0;
5940 md->noteol = (options & PCRE_NOTEOL) != 0;
5941 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5942 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5943
5944 md->hitend = FALSE;
5945 md->mark = NULL; /* In case never set */
5946
5947 md->recursive = NULL; /* No recursion at top level */
5948
5949 md->lcc = tables + lcc_offset;
5950 md->ctypes = tables + ctypes_offset;
5951
5952 /* Handle different \R options. */
5953
5954 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5955 {
5956 case 0:
5957 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5958 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5959 else
5960 #ifdef BSR_ANYCRLF
5961 md->bsr_anycrlf = TRUE;
5962 #else
5963 md->bsr_anycrlf = FALSE;
5964 #endif
5965 break;
5966
5967 case PCRE_BSR_ANYCRLF:
5968 md->bsr_anycrlf = TRUE;
5969 break;
5970
5971 case PCRE_BSR_UNICODE:
5972 md->bsr_anycrlf = FALSE;
5973 break;
5974
5975 default: return PCRE_ERROR_BADNEWLINE;
5976 }
5977
5978 /* Handle different types of newline. The three bits give eight cases. If
5979 nothing is set at run time, whatever was used at compile time applies. */
5980
5981 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5982 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5983 {
5984 case 0: newline = NEWLINE; break; /* Compile-time default */
5985 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5986 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5987 case PCRE_NEWLINE_CR+
5988 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5989 case PCRE_NEWLINE_ANY: newline = -1; break;
5990 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5991 default: return PCRE_ERROR_BADNEWLINE;
5992 }
5993
5994 if (newline == -2)
5995 {
5996 md->nltype = NLTYPE_ANYCRLF;
5997 }
5998 else if (newline < 0)
5999 {
6000 md->nltype = NLTYPE_ANY;
6001 }
6002 else
6003 {
6004 md->nltype = NLTYPE_FIXED;
6005 if (newline > 255)
6006 {
6007 md->nllen = 2;
6008 md->nl[0] = (newline >> 8) & 255;
6009 md->nl[1] = newline & 255;
6010 }
6011 else
6012 {
6013 md->nllen = 1;
6014 md->nl[0] = newline;
6015 }
6016 }
6017
6018 /* Partial matching was originally supported only for a restricted set of
6019 regexes; from release 8.00 there are no restrictions, but the bits are still
6020 defined (though never set). So there's no harm in leaving this code. */
6021
6022 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6023 return PCRE_ERROR_BADPARTIAL;
6024
6025 /* If the expression has got more back references than the offsets supplied can
6026 hold, we get a temporary chunk of working store to use during the matching.
6027 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6028 of 3. */
6029
6030 ocount = offsetcount - (offsetcount % 3);
6031 arg_offset_max = (2*ocount)/3;
6032
6033 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6034 {
6035 ocount = re->top_backref * 3 + 3;
6036 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6037 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6038 using_temporary_offsets = TRUE;
6039 DPRINTF(("Got memory to hold back references\n"));
6040 }
6041 else md->offset_vector = offsets;
6042
6043 md->offset_end = ocount;
6044 md->offset_max = (2*ocount)/3;
6045 md->offset_overflow = FALSE;
6046 md->capture_last = -1;
6047
6048 /* Reset the working variable associated with each extraction. These should
6049 never be used unless previously set, but they get saved and restored, and so we
6050 initialize them to avoid reading uninitialized locations. Also, unset the
6051 offsets for the matched string. This is really just for tidiness with callouts,
6052 in case they inspect these fields. */
6053
6054 if (md->offset_vector != NULL)
6055 {
6056 register int *iptr = md->offset_vector + ocount;
6057 register int *iend = iptr - re->top_bracket;
6058 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6059 while (--iptr >= iend) *iptr = -1;
6060 md->offset_vector[0] = md->offset_vector[1] = -1;
6061 }
6062
6063 /* Set up the first character to match, if available. The first_byte value is
6064 never set for an anchored regular expression, but the anchoring may be forced
6065 at run time, so we have to test for anchoring. The first char may be unset for
6066 an unanchored pattern, of course. If there's no first char and the pattern was
6067 studied, there may be a bitmap of possible first characters. */
6068
6069 if (!anchored)
6070 {
6071 if ((re->flags & PCRE_FIRSTSET) != 0)
6072 {
6073 first_byte = re->first_byte & 255;
6074 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6075 first_byte = md->lcc[first_byte];
6076 }
6077 else
6078 if (!startline && study != NULL &&
6079 (study->flags & PCRE_STUDY_MAPPED) != 0)
6080 start_bits = study->start_bits;
6081 }
6082
6083 /* For anchored or unanchored matches, there may be a "last known required
6084 character" set. */
6085
6086 if ((re->flags & PCRE_REQCHSET) != 0)
6087 {
6088 req_byte = re->req_byte & 255;
6089 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6090 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6091 }
6092
6093
6094
6095
6096 /* ==========================================================================*/
6097
6098 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6099 the loop runs just once. */
6100
6101 for(;;)
6102 {
6103 USPTR save_end_subject = end_subject;
6104 USPTR new_start_match;
6105
6106 /* If firstline is TRUE, the start of the match is constrained to the first
6107 line of a multiline string. That is, the match must be before or at the first
6108 newline. Implement this by temporarily adjusting end_subject so that we stop
6109 scanning at a newline. If the match fails at the newline, later code breaks
6110 this loop. */
6111
6112 if (firstline)
6113 {
6114 USPTR t = start_match;
6115 #ifdef SUPPORT_UTF8
6116 if (utf8)
6117 {
6118 while (t < md->end_subject && !IS_NEWLINE(t))
6119 {
6120 t++;
6121 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6122 }
6123 }
6124 else
6125 #endif
6126 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6127 end_subject = t;
6128 }
6129
6130 /* There are some optimizations that avoid running the match if a known
6131 starting point is not found, or if a known later character is not present.
6132 However, there is an option that disables these, for testing and for ensuring
6133 that all callouts do actually occur. The option can be set in the regex by
6134 (*NO_START_OPT) or passed in match-time options. */
6135
6136 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6137 {
6138 /* Advance to a unique first byte if there is one. */
6139
6140 if (first_byte >= 0)
6141 {
6142 if (first_byte_caseless)
6143 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6144 start_match++;
6145 else
6146 while (start_match < end_subject && *start_match != first_byte)
6147 start_match++;
6148 }
6149
6150 /* Or to just after a linebreak for a multiline match */
6151
6152 else if (startline)
6153 {
6154 if (start_match > md->start_subject + start_offset)
6155 {
6156 #ifdef SUPPORT_UTF8
6157 if (utf8)
6158 {
6159 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6160 {
6161 start_match++;
6162 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6163 start_match++;
6164 }
6165 }
6166 else
6167 #endif
6168 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6169 start_match++;
6170
6171 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6172 and we are now at a LF, advance the match position by one more character.
6173 */
6174
6175 if (start_match[-1] == CHAR_CR &&
6176 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6177 start_match < end_subject &&
6178 *start_match == CHAR_NL)
6179 start_match++;
6180 }
6181 }
6182
6183 /* Or to a non-unique first byte after study */
6184
6185 else if (start_bits != NULL)
6186 {
6187 while (start_match < end_subject)
6188 {
6189 register unsigned int c = *start_match;
6190 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6191 {
6192 start_match++;
6193 #ifdef SUPPORT_UTF8
6194 if (utf8)
6195 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6196 start_match++;
6197 #endif
6198 }
6199 else break;
6200 }
6201 }
6202 } /* Starting optimizations */
6203
6204 /* Restore fudged end_subject */
6205
6206 end_subject = save_end_subject;
6207
6208 /* The following two optimizations are disabled for partial matching or if
6209 disabling is explicitly requested. */
6210
6211 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6212 {
6213 /* If the pattern was studied, a minimum subject length may be set. This is
6214 a lower bound; no actual string of that length may actually match the
6215 pattern. Although the value is, strictly, in characters, we treat it as
6216 bytes to avoid spending too much time in this optimization. */
6217
6218 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6219 (pcre_uint32)(end_subject - start_match) < study->minlength)
6220 {
6221 rc = MATCH_NOMATCH;
6222 break;
6223 }
6224
6225 /* If req_byte is set, we know that that character must appear in the
6226 subject for the match to succeed. If the first character is set, req_byte
6227 must be later in the subject; otherwise the test starts at the match point.
6228 This optimization can save a huge amount of backtracking in patterns with
6229 nested unlimited repeats that aren't going to match. Writing separate code
6230 for cased/caseless versions makes it go faster, as does using an
6231 autoincrement and backing off on a match.
6232
6233 HOWEVER: when the subject string is very, very long, searching to its end
6234 can take a long time, and give bad performance on quite ordinary patterns.
6235 This showed up when somebody was matching something like /^\d+C/ on a
6236 32-megabyte string... so we don't do this when the string is sufficiently
6237 long. */
6238
6239 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6240 {
6241 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6242
6243 /* We don't need to repeat the search if we haven't yet reached the
6244 place we found it at last time. */
6245
6246 if (p > req_byte_ptr)
6247 {
6248 if (req_byte_caseless)
6249 {
6250 while (p < end_subject)
6251 {
6252 register int pp = *p++;
6253 if (pp == req_byte || pp == req_byte2) { p--; break; }
6254 }
6255 }
6256 else
6257 {
6258 while (p < end_subject)
6259 {
6260 if (*p++ == req_byte) { p--; break; }
6261 }
6262 }
6263
6264 /* If we can't find the required character, break the matching loop,
6265 forcing a match failure. */
6266
6267 if (p >= end_subject)
6268 {
6269 rc = MATCH_NOMATCH;
6270 break;
6271 }
6272
6273 /* If we have found the required character, save the point where we
6274 found it, so that we don't search again next time round the loop if
6275 the start hasn't passed this character yet. */
6276
6277 req_byte_ptr = p;
6278 }
6279 }
6280 }
6281
6282 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6283 printf(">>>> Match against: ");
6284 pchars(start_match, end_subject - start_match, TRUE, md);
6285 printf("\n");
6286 #endif
6287
6288 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6289 first starting point for which a partial match was found. */
6290
6291 md->start_match_ptr = start_match;
6292 md->start_used_ptr = start_match;
6293 md->match_call_count = 0;
6294 md->match_function_type = 0;
6295 md->end_offset_top = 0;
6296 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6297 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6298
6299 switch(rc)
6300 {
6301 /* SKIP passes back the next starting point explicitly, but if it is the
6302 same as the match we have just done, treat it as NOMATCH. */
6303
6304 case MATCH_SKIP:
6305 if (md->start_match_ptr != start_match)
6306 {
6307 new_start_match = md->start_match_ptr;
6308 break;
6309 }
6310 /* Fall through */
6311
6312 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6313 the SKIP's arg was not found. We also treat this as NOMATCH. */
6314
6315 case MATCH_SKIP_ARG:
6316 /* Fall through */
6317
6318 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6319 exactly like PRUNE. */
6320
6321 case MATCH_NOMATCH:
6322 case MATCH_PRUNE:
6323 case MATCH_THEN:
6324 new_start_match = start_match + 1;
6325 #ifdef SUPPORT_UTF8
6326 if (utf8)
6327 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6328 new_start_match++;
6329 #endif
6330 break;
6331
6332 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6333
6334 case MATCH_COMMIT:
6335 rc = MATCH_NOMATCH;
6336 goto ENDLOOP;
6337
6338 /* Any other return is either a match, or some kind of error. */
6339
6340 default:
6341 goto ENDLOOP;
6342 }
6343
6344 /* Control reaches here for the various types of "no match at this point"
6345 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6346
6347 rc = MATCH_NOMATCH;
6348
6349 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6350 newline in the subject (though it may continue over the newline). Therefore,
6351 if we have just failed to match, starting at a newline, do not continue. */
6352
6353 if (firstline && IS_NEWLINE(start_match)) break;
6354
6355 /* Advance to new matching position */
6356
6357 start_match = new_start_match;
6358
6359 /* Break the loop if the pattern is anchored or if we have passed the end of
6360 the subject. */
6361
6362 if (anchored || start_match > end_subject) break;
6363
6364 /* If we have just passed a CR and we are now at a LF, and the pattern does
6365 not contain any explicit matches for \r or \n, and the newline option is CRLF
6366 or ANY or ANYCRLF, advance the match position by one more character. */
6367
6368 if (start_match[-1] == CHAR_CR &&
6369 start_match < end_subject &&
6370 *start_match == CHAR_NL &&
6371 (re->flags & PCRE_HASCRORLF) == 0 &&
6372 (md->nltype == NLTYPE_ANY ||
6373 md->nltype == NLTYPE_ANYCRLF ||
6374 md->nllen == 2))
6375 start_match++;
6376
6377 md->mark = NULL; /* Reset for start of next match attempt */
6378 } /* End of for(;;) "bumpalong" loop */
6379
6380 /* ==========================================================================*/
6381
6382 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6383 conditions is true:
6384
6385 (1) The pattern is anchored or the match was failed by (*COMMIT);
6386
6387 (2) We are past the end of the subject;
6388
6389 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6390 this option requests that a match occur at or before the first newline in
6391 the subject.
6392
6393 When we have a match and the offset vector is big enough to deal with any
6394 backreferences, captured substring offsets will already be set up. In the case
6395 where we had to get some local store to hold offsets for backreference
6396 processing, copy those that we can. In this case there need not be overflow if
6397 certain parts of the pattern were not used, even though there are more
6398 capturing parentheses than vector slots. */
6399
6400 ENDLOOP:
6401
6402 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6403 {
6404 if (using_temporary_offsets)
6405 {
6406 if (arg_offset_max >= 4)
6407 {
6408 memcpy(offsets + 2, md->offset_vector + 2,
6409 (arg_offset_max - 2) * sizeof(int));
6410 DPRINTF(("Copied offsets from temporary memory\n"));
6411 }
6412 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6413 DPRINTF(("Freeing temporary memory\n"));
6414 (pcre_free)(md->offset_vector);
6415 }
6416
6417 /* Set the return code to the number of captured strings, or 0 if there were
6418 too many to fit into the vector. */
6419
6420 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6421 0 : md->end_offset_top/2;
6422
6423 /* If there is space in the offset vector, set any unused pairs at the end of
6424 the pattern to -1 for backwards compatibility. It is documented that this
6425 happens. In earlier versions, the whole set of potential capturing offsets
6426 was set to -1 each time round the loop, but this is handled differently now.
6427 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6428 those at the end that need unsetting here. We can't just unset them all at
6429 the start of the whole thing because they may get set in one branch that is
6430 not the final matching branch. */
6431
6432 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6433 {
6434 register int *iptr, *iend;
6435 int resetcount = 2 + re->top_bracket * 2;
6436 if (resetcount > offsetcount) resetcount = ocount;
6437 iptr = offsets + md->end_offset_top;
6438 iend = offsets + resetcount;
6439 while (iptr < iend) *iptr++ = -1;
6440 }
6441
6442 /* If there is space, set up the whole thing as substring 0. The value of
6443 md->start_match_ptr might be modified if \K was encountered on the successful
6444 matching path. */
6445
6446 if (offsetcount < 2) rc = 0; else
6447 {
6448 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6449 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6450 }
6451
6452 DPRINTF((">>>> returning %d\n", rc));
6453 goto RETURN_MARK;
6454 }
6455
6456 /* Control gets here if there has been an error, or if the overall match
6457 attempt has failed at all permitted starting positions. */
6458
6459 if (using_temporary_offsets)
6460 {
6461 DPRINTF(("Freeing temporary memory\n"));
6462 (pcre_free)(md->offset_vector);
6463 }
6464
6465 /* For anything other than nomatch or partial match, just return the code. */
6466
6467 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6468 {
6469 DPRINTF((">>>> error: returning %d\n", rc));
6470 return rc;
6471 }
6472
6473 /* Handle partial matches - disable any mark data */
6474
6475 if (start_partial != NULL)
6476 {
6477 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6478 md->mark = NULL;
6479 if (offsetcount > 1)
6480 {
6481 offsets[0] = (int)(start_partial - (USPTR)subject);
6482 offsets[1] = (int)(end_subject - (USPTR)subject);
6483 }
6484 rc = PCRE_ERROR_PARTIAL;
6485 }
6486
6487 /* This is the classic nomatch case */
6488
6489 else
6490 {
6491 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6492 rc = PCRE_ERROR_NOMATCH;
6493 }
6494
6495 /* Return the MARK data if it has been requested. */
6496
6497 RETURN_MARK:
6498
6499 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6500 *(extra_data->mark) = (unsigned char *)(md->mark);
6501 return rc;
6502 }
6503
6504 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5