/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 716 - (show annotations)
Tue Oct 4 16:38:05 2011 UTC (8 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 198414 byte(s)
Make (*THEN) work as in Perl in subpatterns that do not contain | alternatives.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Special kinds of call to match() are signalled by storing one of these
values in md->match_function_type before the call, rather than by passing an
extra argument, because that would use another stack variable and stack usage
is to be discouraged. */

#define MATCH_CONDASSERT  1   /* Called to check a condition assertion */
#define MATCH_CBEGROUP    2   /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH       1
#define MATCH_NOMATCH     0

/* Internal-only results of match(). They are made sufficiently negative to
avoid the external error codes. */

#define MATCH_ACCEPT    (-999)
#define MATCH_COMMIT    (-998)   /* Passed back from OP_COMMIT */
#define MATCH_KETRPOS   (-997)
#define MATCH_ONCE      (-996)   /* Backing up through an atomic group */
#define MATCH_PRUNE     (-995)   /* Passed back from OP_PRUNE[_ARG] */
#define MATCH_SKIP      (-994)   /* Passed back from OP_SKIP */
#define MATCH_SKIP_ARG  (-993)   /* Passed back from OP_SKIP_ARG */
#define MATCH_THEN      (-992)   /* Passed back from OP_THEN[_ARG] */
84
/* Convenience macro for a sequence that occurs many times: copy the current
mark pointer into the match data block, then leave via RRETURN(). It can be
expanded only where "md" and "markptr" are in scope. */

#define MRRETURN(ra) \
  { md->mark = markptr; RRETURN(ra); }
92
/* Largest number of offset-vector ints that are saved on the stack across a
recursive call; this is the size of the stacksave[] array. When the offset
vector is bigger, malloc is used. The value should be a multiple of 3, because
the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* Minimum and maximum counts for the common repeats. In rep_max, 0 stands for
"no upper limit". The two tables are indexed in parallel, and the entries come
in pairs with the same limits.
NOTE(review): the pairs presumably correspond to the greedy and minimizing
forms of each repeat - confirm at the point of use. */

static const char rep_min[] =
  {
  0, 0,
  1, 1,
  0, 0
  };

static const char rep_max[] =
  {
  0, 0,
  0, 0,
  1, 1
  };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if the requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
/* Numbers for RMATCH calls: each use of RMATCH is given its own identifier,
counting up from 1. The heap version of RMATCH stores the identifier in the
frame and defines a label L_<identifier> to come back to. When this list is
changed, the code at HEAP_RETURN below must be updated in sync. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,
  RM8,     RM9,  RM10, RM11, RM12, RM13, RM14,
  RM15,    RM16, RM17, RM18, RM19, RM20, RM21,
  RM22,    RM23, RM24, RM25, RM26, RM27, RM28,
  RM29,    RM30, RM31, RM32, RM33, RM34, RM35,
  RM36,    RM37, RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48, RM49,
  RM50,    RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame, records in the current frame where to resume (rw),
copies the arguments of the "call" into the new frame, makes it the current
frame and jumps to HEAP_RECURSE. The label L_<rw> that it defines is the place
to come back to afterwards. If no memory can be obtained, it leaves via
RRETURN(PCRE_ERROR_NOMEMORY).

RRETURN frees the current frame and goes back to the previous one. If there is
a previous frame, the result is put in rrc and control jumps to HEAP_RETURN;
otherwise match() itself returns the result. RRETURN may also be invoked when
"frame" is NULL, which is the case when the allocation of the top-level frame
in match() has failed: there is then nothing to pop or free, and the value is
simply returned. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. The "hit end" flag is set if partial matching
is enabled and the pointer is at the end of the subject and also past
md->start_used_ptr (i.e. something has been matched). For hard partial matching
(md->partial > 1), we then return PCRE_ERROR_PARTIAL immediately.
SCHECK_PARTIAL omits the end-of-subject test; it is used when we already know
we are past the end of the subject.

Each macro expands to a bare "if" statement, and relies on md and eptr being in
scope and on MRRETURN being usable at the point of expansion. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779 the branch in which it occurs can be determined. Overload the start of
780 match pointer to do this. */
781
782 case OP_THEN:
783 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
784 eptrb, RM54);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 md->start_match_ptr = ecode;
787 MRRETURN(MATCH_THEN);
788
789 case OP_THEN_ARG:
790 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
791 md, eptrb, RM58);
792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 md->start_match_ptr = ecode;
794 md->mark = ecode + 2;
795 RRETURN(MATCH_THEN);
796
797 /* Handle a capturing bracket, other than those that are possessive with an
798 unlimited repeat. If there is space in the offset vector, save the current
799 subject position in the working slot at the top of the vector. We mustn't
800 change the current values of the data slot, because they may be set from a
801 previous iteration of this group, and be referred to by a reference inside
802 the group. A failure to match might occur after the group has succeeded,
803 if something later on doesn't match. For this reason, we need to restore
804 the working value and also the values of the final offsets, in case they
805 were set by a previous iteration of the same bracket.
806
807 If there isn't enough space in the offset vector, treat this as if it were
808 a non-capturing bracket. Don't worry about setting the flag for the error
809 case here; that is handled in the code for KET. */
810
811 case OP_CBRA:
812 case OP_SCBRA:
813 number = GET2(ecode, 1+LINK_SIZE);
814 offset = number << 1;
815
816 #ifdef PCRE_DEBUG
817 printf("start bracket %d\n", number);
818 printf("subject=");
819 pchars(eptr, 16, TRUE, md);
820 printf("\n");
821 #endif
822
823 if (offset < md->offset_max)
824 {
825 save_offset1 = md->offset_vector[offset];
826 save_offset2 = md->offset_vector[offset+1];
827 save_offset3 = md->offset_vector[md->offset_end - number];
828 save_capture_last = md->capture_last;
829
830 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
831 md->offset_vector[md->offset_end - number] =
832 (int)(eptr - md->start_subject);
833
834 for (;;)
835 {
836 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
837 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
838 eptrb, RM1);
839 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
840
841 /* If we backed up to a THEN, check whether it is within the current
842 branch by comparing the address of the THEN that is passed back with
843 the end of the branch. If it is within the current branch, and the
844 branch is one of two or more alternatives (it either starts or ends
845 with OP_ALT), we have reached the limit of THEN's action, so convert
846 the return code to NOMATCH, which will cause normal backtracking to
847 happen from now on. Otherwise, THEN is passed back to an outer
848 alternative. This implements Perl's treatment of parenthesized groups,
849 where a group not containing | does not affect the current alternative,
850 that is, (X) is NOT the same as (X|(*F)). */
851
852 if (rrc == MATCH_THEN)
853 {
854 next = ecode + GET(ecode,1);
855 if (md->start_match_ptr < next &&
856 (*ecode == OP_ALT || *next == OP_ALT))
857 rrc = MATCH_NOMATCH;
858 }
859
860 /* Anything other than NOMATCH is passed back. */
861
862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
863 md->capture_last = save_capture_last;
864 ecode += GET(ecode, 1);
865 if (*ecode != OP_ALT) break;
866 }
867
868 DPRINTF(("bracket %d failed\n", number));
869 md->offset_vector[offset] = save_offset1;
870 md->offset_vector[offset+1] = save_offset2;
871 md->offset_vector[md->offset_end - number] = save_offset3;
872
873 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
874
875 if (md->mark == NULL) md->mark = markptr;
876 RRETURN(rrc);
877 }
878
879 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
880 as a non-capturing bracket. */
881
882 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
883 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
884
885 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
886
887 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
888 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
889
890 /* Non-capturing or atomic group, except for possessive with unlimited
891 repeat. Loop for all the alternatives.
892
893 When we get to the final alternative within the brackets, we used to return
894 the result of a recursive call to match() whatever happened so it was
895 possible to reduce stack usage by turning this into a tail recursion,
896 except in the case of a possibly empty group. However, now that there is
897 the possibility of (*THEN) occurring in the final alternative, this
898 optimization is no longer always possible.
899
900 We can optimize if we know there are no (*THEN)s in the pattern; at present
901 this is the best that can be done.
902
903 MATCH_ONCE is returned when the end of an atomic group is successfully
904 reached, but subsequent matching fails. It passes back up the tree (causing
905 captured values to be reset) until the original atomic group level is
906 reached. This is tested by comparing md->once_target with the start of the
907 group. At this point, the return is converted into MATCH_NOMATCH so that
908 previous backup points can be taken. */
909
910 case OP_ONCE:
911 case OP_BRA:
912 case OP_SBRA:
913 DPRINTF(("start non-capturing bracket\n"));
914
915 for (;;)
916 {
917 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
918
919 /* If this is not a possibly empty group, and there are no (*THEN)s in
920 the pattern, and this is the final alternative, optimize as described
921 above. */
922
923 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
924 {
925 ecode += _pcre_OP_lengths[*ecode];
926 goto TAIL_RECURSE;
927 }
928
929 /* In all other cases, we have to make another call to match(). */
930
931 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
932 RM2);
933
934 /* See comment in the code for capturing groups above about handling
935 THEN. */
936
937 if (rrc == MATCH_THEN)
938 {
939 next = ecode + GET(ecode,1);
940 if (md->start_match_ptr < next &&
941 (*ecode == OP_ALT || *next == OP_ALT))
942 rrc = MATCH_NOMATCH;
943 }
944
945 if (rrc != MATCH_NOMATCH)
946 {
947 if (rrc == MATCH_ONCE)
948 {
949 const uschar *scode = ecode;
950 if (*scode != OP_ONCE) /* If not at start, find it */
951 {
952 while (*scode == OP_ALT) scode += GET(scode, 1);
953 scode -= GET(scode, 1);
954 }
955 if (md->once_target == scode) rrc = MATCH_NOMATCH;
956 }
957 RRETURN(rrc);
958 }
959 ecode += GET(ecode, 1);
960 if (*ecode != OP_ALT) break;
961 }
962
963 if (md->mark == NULL) md->mark = markptr;
964 RRETURN(MATCH_NOMATCH);
965
966 /* Handle possessive capturing brackets with an unlimited repeat. We come
967 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
968 handled similarly to the normal case above. However, the matching is
969 different. The end of these brackets will always be OP_KETRPOS, which
970 returns MATCH_KETRPOS without going further in the pattern. By this means
971 we can handle the group by iteration rather than recursion, thereby
972 reducing the amount of stack needed. */
973
974 case OP_CBRAPOS:
975 case OP_SCBRAPOS:
976 allow_zero = FALSE;
977
978 POSSESSIVE_CAPTURE:
979 number = GET2(ecode, 1+LINK_SIZE);
980 offset = number << 1;
981
982 #ifdef PCRE_DEBUG
983 printf("start possessive bracket %d\n", number);
984 printf("subject=");
985 pchars(eptr, 16, TRUE, md);
986 printf("\n");
987 #endif
988
989 if (offset < md->offset_max)
990 {
991 matched_once = FALSE;
992 code_offset = ecode - md->start_code;
993
994 save_offset1 = md->offset_vector[offset];
995 save_offset2 = md->offset_vector[offset+1];
996 save_offset3 = md->offset_vector[md->offset_end - number];
997 save_capture_last = md->capture_last;
998
999 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1000
1001 /* Each time round the loop, save the current subject position for use
1002 when the group matches. For MATCH_MATCH, the group has matched, so we
1003 restart it with a new subject starting position, remembering that we had
1004 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1005 usual. If we haven't matched any alternatives in any iteration, check to
1006 see if a previous iteration matched. If so, the group has matched;
1007 continue from afterwards. Otherwise it has failed; restore the previous
1008 capture values before returning NOMATCH. */
1009
1010 for (;;)
1011 {
1012 md->offset_vector[md->offset_end - number] =
1013 (int)(eptr - md->start_subject);
1014 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1015 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1016 eptrb, RM63);
1017 if (rrc == MATCH_KETRPOS)
1018 {
1019 offset_top = md->end_offset_top;
1020 eptr = md->end_match_ptr;
1021 ecode = md->start_code + code_offset;
1022 save_capture_last = md->capture_last;
1023 matched_once = TRUE;
1024 continue;
1025 }
1026
1027 /* See comment in the code for capturing groups above about handling
1028 THEN. */
1029
1030 if (rrc == MATCH_THEN)
1031 {
1032 next = ecode + GET(ecode,1);
1033 if (md->start_match_ptr < next &&
1034 (*ecode == OP_ALT || *next == OP_ALT))
1035 rrc = MATCH_NOMATCH;
1036 }
1037
1038 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1039 md->capture_last = save_capture_last;
1040 ecode += GET(ecode, 1);
1041 if (*ecode != OP_ALT) break;
1042 }
1043
1044 if (!matched_once)
1045 {
1046 md->offset_vector[offset] = save_offset1;
1047 md->offset_vector[offset+1] = save_offset2;
1048 md->offset_vector[md->offset_end - number] = save_offset3;
1049 }
1050
1051 if (md->mark == NULL) md->mark = markptr;
1052 if (allow_zero || matched_once)
1053 {
1054 ecode += 1 + LINK_SIZE;
1055 break;
1056 }
1057
1058 RRETURN(MATCH_NOMATCH);
1059 }
1060
1061 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1062 as a non-capturing bracket. */
1063
1064 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1065 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1066
1067 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1068
1069 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1070 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1071
1072 /* Non-capturing possessive bracket with unlimited repeat. We come here
1073 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1074 without the capturing complication. It is written out separately for speed
1075 and cleanliness. */
1076
1077 case OP_BRAPOS:
1078 case OP_SBRAPOS:
1079 allow_zero = FALSE;
1080
1081 POSSESSIVE_NON_CAPTURE:
1082 matched_once = FALSE;
1083 code_offset = ecode - md->start_code;
1084
1085 for (;;)
1086 {
1087 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1088 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1089 eptrb, RM48);
1090 if (rrc == MATCH_KETRPOS)
1091 {
1092 offset_top = md->end_offset_top;
1093 eptr = md->end_match_ptr;
1094 ecode = md->start_code + code_offset;
1095 matched_once = TRUE;
1096 continue;
1097 }
1098
1099 /* See comment in the code for capturing groups above about handling
1100 THEN. */
1101
1102 if (rrc == MATCH_THEN)
1103 {
1104 next = ecode + GET(ecode,1);
1105 if (md->start_match_ptr < next &&
1106 (*ecode == OP_ALT || *next == OP_ALT))
1107 rrc = MATCH_NOMATCH;
1108 }
1109
1110 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1111 ecode += GET(ecode, 1);
1112 if (*ecode != OP_ALT) break;
1113 }
1114
1115 if (matched_once || allow_zero)
1116 {
1117 ecode += 1 + LINK_SIZE;
1118 break;
1119 }
1120 RRETURN(MATCH_NOMATCH);
1121
1122 /* Control never reaches here. */
1123
1124 /* Conditional group: compilation checked that there are no more than
1125 two branches. If the condition is false, skipping the first branch takes us
1126 past the end if there is only one branch, but that's OK because that is
1127 exactly what going to the ket would do. */
1128
1129 case OP_COND:
1130 case OP_SCOND:
1131 codelink = GET(ecode, 1);
1132
1133 /* Because of the way auto-callout works during compile, a callout item is
1134 inserted between OP_COND and an assertion condition. */
1135
1136 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1137 {
1138 if (pcre_callout != NULL)
1139 {
1140 pcre_callout_block cb;
1141 cb.version = 2; /* Version 2 of the callout block */
1142 cb.callout_number = ecode[LINK_SIZE+2];
1143 cb.offset_vector = md->offset_vector;
1144 cb.subject = (PCRE_SPTR)md->start_subject;
1145 cb.subject_length = (int)(md->end_subject - md->start_subject);
1146 cb.start_match = (int)(mstart - md->start_subject);
1147 cb.current_position = (int)(eptr - md->start_subject);
1148 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1149 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1150 cb.capture_top = offset_top/2;
1151 cb.capture_last = md->capture_last;
1152 cb.callout_data = md->callout_data;
1153 cb.mark = markptr;
1154 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1155 if (rrc < 0) RRETURN(rrc);
1156 }
1157 ecode += _pcre_OP_lengths[OP_CALLOUT];
1158 }
1159
1160 condcode = ecode[LINK_SIZE+1];
1161
1162 /* Now see what the actual condition is */
1163
1164 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1165 {
1166 if (md->recursive == NULL) /* Not recursing => FALSE */
1167 {
1168 condition = FALSE;
1169 ecode += GET(ecode, 1);
1170 }
1171 else
1172 {
1173 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1174 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1175
1176 /* If the test is for recursion into a specific subpattern, and it is
1177 false, but the test was set up by name, scan the table to see if the
1178 name refers to any other numbers, and test them. The condition is true
1179 if any one is set. */
1180
1181 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1182 {
1183 uschar *slotA = md->name_table;
1184 for (i = 0; i < md->name_count; i++)
1185 {
1186 if (GET2(slotA, 0) == recno) break;
1187 slotA += md->name_entry_size;
1188 }
1189
1190 /* Found a name for the number - there can be only one; duplicate
1191 names for different numbers are allowed, but not vice versa. First
1192 scan down for duplicates. */
1193
1194 if (i < md->name_count)
1195 {
1196 uschar *slotB = slotA;
1197 while (slotB > md->name_table)
1198 {
1199 slotB -= md->name_entry_size;
1200 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1201 {
1202 condition = GET2(slotB, 0) == md->recursive->group_num;
1203 if (condition) break;
1204 }
1205 else break;
1206 }
1207
1208 /* Scan up for duplicates */
1209
1210 if (!condition)
1211 {
1212 slotB = slotA;
1213 for (i++; i < md->name_count; i++)
1214 {
1215 slotB += md->name_entry_size;
1216 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1217 {
1218 condition = GET2(slotB, 0) == md->recursive->group_num;
1219 if (condition) break;
1220 }
1221 else break;
1222 }
1223 }
1224 }
1225 }
1226
1227 /* Choose branch according to the condition */
1228
1229 ecode += condition? 3 : GET(ecode, 1);
1230 }
1231 }
1232
1233 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1234 {
1235 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1236 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1237
1238 /* If the numbered capture is unset, but the reference was by name,
1239 scan the table to see if the name refers to any other numbers, and test
1240 them. The condition is true if any one is set. This is tediously similar
1241 to the code above, but not close enough to try to amalgamate. */
1242
1243 if (!condition && condcode == OP_NCREF)
1244 {
1245 int refno = offset >> 1;
1246 uschar *slotA = md->name_table;
1247
1248 for (i = 0; i < md->name_count; i++)
1249 {
1250 if (GET2(slotA, 0) == refno) break;
1251 slotA += md->name_entry_size;
1252 }
1253
1254 /* Found a name for the number - there can be only one; duplicate names
1255 for different numbers are allowed, but not vice versa. First scan down
1256 for duplicates. */
1257
1258 if (i < md->name_count)
1259 {
1260 uschar *slotB = slotA;
1261 while (slotB > md->name_table)
1262 {
1263 slotB -= md->name_entry_size;
1264 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1265 {
1266 offset = GET2(slotB, 0) << 1;
1267 condition = offset < offset_top &&
1268 md->offset_vector[offset] >= 0;
1269 if (condition) break;
1270 }
1271 else break;
1272 }
1273
1274 /* Scan up for duplicates */
1275
1276 if (!condition)
1277 {
1278 slotB = slotA;
1279 for (i++; i < md->name_count; i++)
1280 {
1281 slotB += md->name_entry_size;
1282 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1283 {
1284 offset = GET2(slotB, 0) << 1;
1285 condition = offset < offset_top &&
1286 md->offset_vector[offset] >= 0;
1287 if (condition) break;
1288 }
1289 else break;
1290 }
1291 }
1292 }
1293 }
1294
1295 /* Choose branch according to the condition */
1296
1297 ecode += condition? 3 : GET(ecode, 1);
1298 }
1299
1300 else if (condcode == OP_DEF) /* DEFINE - always false */
1301 {
1302 condition = FALSE;
1303 ecode += GET(ecode, 1);
1304 }
1305
1306 /* The condition is an assertion. Call match() to evaluate it - setting
1307 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1308 an assertion. */
1309
1310 else
1311 {
1312 md->match_function_type = MATCH_CONDASSERT;
1313 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1314 if (rrc == MATCH_MATCH)
1315 {
1316 if (md->end_offset_top > offset_top)
1317 offset_top = md->end_offset_top; /* Captures may have happened */
1318 condition = TRUE;
1319 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1320 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1321 }
1322
1323 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1324 assertion; it is therefore treated as NOMATCH. */
1325
1326 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1327 {
1328 RRETURN(rrc); /* Need braces because of following else */
1329 }
1330 else
1331 {
1332 condition = FALSE;
1333 ecode += codelink;
1334 }
1335 }
1336
1337 /* We are now at the branch that is to be obeyed. As there is only one, can
1338 use tail recursion to avoid using another stack frame, except when there is
1339 unlimited repeat of a possibly empty group. In the latter case, a recursive
1340 call to match() is always required, unless the second alternative doesn't
1341 exist, in which case we can just plough on. Note that, for compatibility
1342 with Perl, the | in a conditional group is NOT treated as creating two
1343 alternatives. If a THEN is encountered in the branch, it propagates out to
1344 the enclosing alternative (unless nested in a deeper set of alternatives,
1345 of course). */
1346
1347 if (condition || *ecode == OP_ALT)
1348 {
1349 if (op != OP_SCOND)
1350 {
1351 ecode += 1 + LINK_SIZE;
1352 goto TAIL_RECURSE;
1353 }
1354
1355 md->match_function_type = MATCH_CBEGROUP;
1356 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1357 RRETURN(rrc);
1358 }
1359
1360 /* Condition false & no alternative; continue after the group. */
1361
1362 else
1363 {
1364 ecode += 1 + LINK_SIZE;
1365 }
1366 break;
1367
1368
1369 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1370 to close any currently open capturing brackets. */
1371
1372 case OP_CLOSE:
1373 number = GET2(ecode, 1);
1374 offset = number << 1;
1375
1376 #ifdef PCRE_DEBUG
1377 printf("end bracket %d at *ACCEPT", number);
1378 printf("\n");
1379 #endif
1380
1381 md->capture_last = number;
1382 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1383 {
1384 md->offset_vector[offset] =
1385 md->offset_vector[md->offset_end - number];
1386 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1387 if (offset_top <= offset) offset_top = offset + 2;
1388 }
1389 ecode += 3;
1390 break;
1391
1392
1393 /* End of the pattern, either real or forced. */
1394
1395 case OP_END:
1396 case OP_ACCEPT:
1397 case OP_ASSERT_ACCEPT:
1398
1399 /* If we have matched an empty string, fail if not in an assertion and not
1400 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1401 is set and we have matched at the start of the subject. In both cases,
1402 backtracking will then try other alternatives, if any. */
1403
1404 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1405 md->recursive == NULL &&
1406 (md->notempty ||
1407 (md->notempty_atstart &&
1408 mstart == md->start_subject + md->start_offset)))
1409 MRRETURN(MATCH_NOMATCH);
1410
1411 /* Otherwise, we have a match. */
1412
1413 md->end_match_ptr = eptr; /* Record where we ended */
1414 md->end_offset_top = offset_top; /* and how many extracts were taken */
1415 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1416
1417 /* For some reason, the macros don't work properly if an expression is
1418 given as the argument to MRRETURN when the heap is in use. */
1419
1420 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1421 MRRETURN(rrc);
1422
1423 /* Assertion brackets. Check the alternative branches in turn - the
1424 matching won't pass the KET for an assertion. If any one branch matches,
1425 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1426 start of each branch to move the current point backwards, so the code at
1427 this level is identical to the lookahead case. When the assertion is part
1428 of a condition, we want to return immediately afterwards. The caller of
1429 this incarnation of the match() function will have set MATCH_CONDASSERT in
1430 md->match_function_type, and one of these opcodes will be the first opcode
1431 that is processed. We use a local variable that is preserved over calls to
1432 match() to remember this case. */
1433
1434 case OP_ASSERT:
1435 case OP_ASSERTBACK:
1436 if (md->match_function_type == MATCH_CONDASSERT)
1437 {
1438 condassert = TRUE;
1439 md->match_function_type = 0;
1440 }
1441 else condassert = FALSE;
1442
1443 do
1444 {
1445 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1446 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1447 {
1448 mstart = md->start_match_ptr; /* In case \K reset it */
1449 markptr = md->mark;
1450 break;
1451 }
1452
1453 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1454 as NOMATCH. */
1455
1456 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1457 ecode += GET(ecode, 1);
1458 }
1459 while (*ecode == OP_ALT);
1460
1461 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1462
1463 /* If checking an assertion for a condition, return MATCH_MATCH. */
1464
1465 if (condassert) RRETURN(MATCH_MATCH);
1466
1467 /* Continue from after the assertion, updating the offsets high water
1468 mark, since extracts may have been taken during the assertion. */
1469
1470 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1471 ecode += 1 + LINK_SIZE;
1472 offset_top = md->end_offset_top;
1473 continue;
1474
1475 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1476 PRUNE, or COMMIT means we must assume failure without checking subsequent
1477 branches. */
1478
1479 case OP_ASSERT_NOT:
1480 case OP_ASSERTBACK_NOT:
1481 if (md->match_function_type == MATCH_CONDASSERT)
1482 {
1483 condassert = TRUE;
1484 md->match_function_type = 0;
1485 }
1486 else condassert = FALSE;
1487
1488 do
1489 {
1490 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1491 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1492 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1493 {
1494 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1495 break;
1496 }
1497
1498 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1499 as NOMATCH. */
1500
1501 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1502 ecode += GET(ecode,1);
1503 }
1504 while (*ecode == OP_ALT);
1505
1506 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1507
1508 ecode += 1 + LINK_SIZE;
1509 continue;
1510
1511 /* Move the subject pointer back. This occurs only at the start of
1512 each branch of a lookbehind assertion. If we are too close to the start to
1513 move back, this match function fails. When working with UTF-8 we move
1514 back a number of characters, not bytes. */
1515
1516 case OP_REVERSE:
1517 #ifdef SUPPORT_UTF8
1518 if (utf8)
1519 {
1520 i = GET(ecode, 1);
1521 while (i-- > 0)
1522 {
1523 eptr--;
1524 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1525 BACKCHAR(eptr);
1526 }
1527 }
1528 else
1529 #endif
1530
1531 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1532
1533 {
1534 eptr -= GET(ecode, 1);
1535 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1536 }
1537
1538 /* Save the earliest consulted character, then skip to next op code */
1539
1540 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1541 ecode += 1 + LINK_SIZE;
1542 break;
1543
1544 /* The callout item calls an external function, if one is provided, passing
1545 details of the match so far. This is mainly for debugging, though the
1546 function is able to force a failure. */
1547
1548 case OP_CALLOUT:
1549 if (pcre_callout != NULL)
1550 {
1551 pcre_callout_block cb;
1552 cb.version = 2; /* Version 2 of the callout block */
1553 cb.callout_number = ecode[1];
1554 cb.offset_vector = md->offset_vector;
1555 cb.subject = (PCRE_SPTR)md->start_subject;
1556 cb.subject_length = (int)(md->end_subject - md->start_subject);
1557 cb.start_match = (int)(mstart - md->start_subject);
1558 cb.current_position = (int)(eptr - md->start_subject);
1559 cb.pattern_position = GET(ecode, 2);
1560 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1561 cb.capture_top = offset_top/2;
1562 cb.capture_last = md->capture_last;
1563 cb.callout_data = md->callout_data;
1564 cb.mark = markptr;
1565 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1566 if (rrc < 0) RRETURN(rrc);
1567 }
1568 ecode += 2 + 2*LINK_SIZE;
1569 break;
1570
1571 /* Recursion either matches the current regex, or some subexpression. The
1572 offset data is the offset to the starting bracket from the start of the
1573 whole pattern. (This is so that it works from duplicated subpatterns.)
1574
1575 The state of the capturing groups is preserved over recursion, and
1576 re-instated afterwards. We don't know how many are started and not yet
1577 finished (offset_top records the completed total) so we just have to save
1578 all the potential data. There may be up to 65535 such values, which is too
1579 large to put on the stack, but using malloc for small numbers seems
1580 expensive. As a compromise, the stack is used when there are no more than
1581 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1582
1583 There are also other values that have to be saved. We use a chained
1584 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1585 for the original version of this logic. It has, however, been hacked around
1586 a lot, so he is not to blame for the current way it works. */
1587
1588 case OP_RECURSE:
1589 {
1590 recursion_info *ri;
1591 int recno;
1592
1593 callpat = md->start_code + GET(ecode, 1);
1594 recno = (callpat == md->start_code)? 0 :
1595 GET2(callpat, 1 + LINK_SIZE);
1596
1597 /* Check for repeating a recursion without advancing the subject pointer.
1598 This should catch convoluted mutual recursions. (Some simple cases are
1599 caught at compile time.) */
1600
1601 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1602 if (recno == ri->group_num && eptr == ri->subject_position)
1603 RRETURN(PCRE_ERROR_RECURSELOOP);
1604
1605 /* Add to "recursing stack" */
1606
1607 new_recursive.group_num = recno;
1608 new_recursive.subject_position = eptr;
1609 new_recursive.prevrec = md->recursive;
1610 md->recursive = &new_recursive;
1611
1612 /* Where to continue from afterwards */
1613
1614 ecode += 1 + LINK_SIZE;
1615
1616 /* Now save the offset data */
1617
1618 new_recursive.saved_max = md->offset_end;
1619 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1620 new_recursive.offset_save = stacksave;
1621 else
1622 {
1623 new_recursive.offset_save =
1624 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1625 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1626 }
1627 memcpy(new_recursive.offset_save, md->offset_vector,
1628 new_recursive.saved_max * sizeof(int));
1629
1630 /* OK, now we can do the recursion. After processing each alternative,
1631 restore the offset data. If there were nested recursions, md->recursive
1632 might be changed, so reset it before looping. */
1633
1634 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1635 cbegroup = (*callpat >= OP_SBRA);
1636 do
1637 {
1638 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1639 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1640 md, eptrb, RM6);
1641 memcpy(md->offset_vector, new_recursive.offset_save,
1642 new_recursive.saved_max * sizeof(int));
1643 md->recursive = new_recursive.prevrec;
1644 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1645 {
1646 DPRINTF(("Recursion matched\n"));
1647 if (new_recursive.offset_save != stacksave)
1648 (pcre_free)(new_recursive.offset_save);
1649
1650 /* Set where we got to in the subject, and reset the start in case
1651 it was changed by \K. This *is* propagated back out of a recursion,
1652 for Perl compatibility. */
1653
1654 eptr = md->end_match_ptr;
1655 mstart = md->start_match_ptr;
1656 goto RECURSION_MATCHED; /* Exit loop; end processing */
1657 }
1658
1659 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1660 as NOMATCH. */
1661
1662 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1663 {
1664 DPRINTF(("Recursion gave error %d\n", rrc));
1665 if (new_recursive.offset_save != stacksave)
1666 (pcre_free)(new_recursive.offset_save);
1667 RRETURN(rrc);
1668 }
1669
1670 md->recursive = &new_recursive;
1671 callpat += GET(callpat, 1);
1672 }
1673 while (*callpat == OP_ALT);
1674
1675 DPRINTF(("Recursion didn't match\n"));
1676 md->recursive = new_recursive.prevrec;
1677 if (new_recursive.offset_save != stacksave)
1678 (pcre_free)(new_recursive.offset_save);
1679 MRRETURN(MATCH_NOMATCH);
1680 }
1681
1682 RECURSION_MATCHED:
1683 break;
1684
1685 /* An alternation is the end of a branch; scan along to find the end of the
1686 bracketed group and go to there. */
1687
1688 case OP_ALT:
1689 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1690 break;
1691
1692 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1693 indicating that it may occur zero times. It may repeat infinitely, or not
1694 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1695 with fixed upper repeat limits are compiled as a number of copies, with the
1696 optional ones preceded by BRAZERO or BRAMINZERO. */
1697
1698 case OP_BRAZERO:
1699 next = ecode + 1;
1700 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1701 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1702 do next += GET(next, 1); while (*next == OP_ALT);
1703 ecode = next + 1 + LINK_SIZE;
1704 break;
1705
1706 case OP_BRAMINZERO:
1707 next = ecode + 1;
1708 do next += GET(next, 1); while (*next == OP_ALT);
1709 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1711 ecode++;
1712 break;
1713
1714 case OP_SKIPZERO:
1715 next = ecode+1;
1716 do next += GET(next,1); while (*next == OP_ALT);
1717 ecode = next + 1 + LINK_SIZE;
1718 break;
1719
1720 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1721 here; just jump to the group, with allow_zero set TRUE. */
1722
1723 case OP_BRAPOSZERO:
1724 op = *(++ecode);
1725 allow_zero = TRUE;
1726 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1727 goto POSSESSIVE_NON_CAPTURE;
1728
1729 /* End of a group, repeated or non-repeating. */
1730
1731 case OP_KET:
1732 case OP_KETRMIN:
1733 case OP_KETRMAX:
1734 case OP_KETRPOS:
1735 prev = ecode - GET(ecode, 1);
1736
1737 /* If this was a group that remembered the subject start, in order to break
1738 infinite repeats of empty string matches, retrieve the subject start from
1739 the chain. Otherwise, set it NULL. */
1740
1741 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1742 {
1743 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1744 eptrb = eptrb->epb_prev; /* Backup to previous group */
1745 }
1746 else saved_eptr = NULL;
1747
1748 /* If we are at the end of an assertion group, stop matching and return
1749 MATCH_MATCH, but record the current high water mark for use by positive
1750 assertions. We also need to record the match start in case it was changed
1751 by \K. */
1752
1753 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1754 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1755 {
1756 md->end_match_ptr = eptr; /* For ONCE */
1757 md->end_offset_top = offset_top;
1758 md->start_match_ptr = mstart;
1759 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1760 }
1761
1762 /* For capturing groups we have to check the group number back at the start
1763 and if necessary complete handling an extraction by setting the offsets and
1764 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1765 into group 0, so it won't be picked up here. Instead, we catch it when the
1766 OP_END is reached. Other recursion is handled here. We just have to record
1767 the current subject position and start match pointer and give a MATCH
1768 return. */
1769
1770 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1771 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1772 {
1773 number = GET2(prev, 1+LINK_SIZE);
1774 offset = number << 1;
1775
1776 #ifdef PCRE_DEBUG
1777 printf("end bracket %d", number);
1778 printf("\n");
1779 #endif
1780
1781 /* Handle a recursively called group. */
1782
1783 if (md->recursive != NULL && md->recursive->group_num == number)
1784 {
1785 md->end_match_ptr = eptr;
1786 md->start_match_ptr = mstart;
1787 RRETURN(MATCH_MATCH);
1788 }
1789
1790 /* Deal with capturing */
1791
1792 md->capture_last = number;
1793 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1794 {
1795 /* If offset is greater than offset_top, it means that we are
1796 "skipping" a capturing group, and that group's offsets must be marked
1797 unset. In earlier versions of PCRE, all the offsets were unset at the
1798 start of matching, but this doesn't work because atomic groups and
1799 assertions can cause a value to be set that should later be unset.
1800 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1801 part of the atomic group, but this is not on the final matching path,
1802 so must be unset when 2 is set. (If there is no group 2, there is no
1803 problem, because offset_top will then be 2, indicating no capture.) */
1804
1805 if (offset > offset_top)
1806 {
1807 register int *iptr = md->offset_vector + offset_top;
1808 register int *iend = md->offset_vector + offset;
1809 while (iptr < iend) *iptr++ = -1;
1810 }
1811
1812 /* Now make the extraction */
1813
1814 md->offset_vector[offset] =
1815 md->offset_vector[md->offset_end - number];
1816 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1817 if (offset_top <= offset) offset_top = offset + 2;
1818 }
1819 }
1820
1821 /* For an ordinary non-repeating ket, just continue at this level. This
1822 also happens for a repeating ket if no characters were matched in the
1823 group. This is the forcible breaking of infinite loops as implemented in
1824 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1825 processing the rest of the pattern at a lower level. If this results in a
1826 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1827 bypassing intermediate backup points, but resetting any captures that
1828 happened along the way. */
1829
1830 if (*ecode == OP_KET || eptr == saved_eptr)
1831 {
1832 if (*prev == OP_ONCE)
1833 {
1834 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1836 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1837 RRETURN(MATCH_ONCE);
1838 }
1839 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1840 break;
1841 }
1842
1843 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1844 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1845 at a time from the outer level, thus saving stack. */
1846
1847 if (*ecode == OP_KETRPOS)
1848 {
1849 md->end_match_ptr = eptr;
1850 md->end_offset_top = offset_top;
1851 RRETURN(MATCH_KETRPOS);
1852 }
1853
1854 /* The normal repeating kets try the rest of the pattern or restart from
1855 the preceding bracket, in the appropriate order. In the second case, we can
1856 use tail recursion to avoid using another stack frame, unless we have an
1857 atomic group or an unlimited repeat of a group that can match an empty
1858 string. */
1859
1860 if (*ecode == OP_KETRMIN)
1861 {
1862 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1864 if (*prev == OP_ONCE)
1865 {
1866 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1867 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1868 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1869 RRETURN(MATCH_ONCE);
1870 }
1871 if (*prev >= OP_SBRA) /* Could match an empty string */
1872 {
1873 md->match_function_type = MATCH_CBEGROUP;
1874 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1875 RRETURN(rrc);
1876 }
1877 ecode = prev;
1878 goto TAIL_RECURSE;
1879 }
1880 else /* OP_KETRMAX */
1881 {
1882 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1883 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1884 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886 if (*prev == OP_ONCE)
1887 {
1888 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1889 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1890 md->once_target = prev;
1891 RRETURN(MATCH_ONCE);
1892 }
1893 ecode += 1 + LINK_SIZE;
1894 goto TAIL_RECURSE;
1895 }
1896 /* Control never gets here */
1897
1898 /* Not multiline mode: start of subject assertion, unless notbol. */
1899
1900 case OP_CIRC:
1901 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1902
1903 /* Start of subject assertion */
1904
1905 case OP_SOD:
1906 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1907 ecode++;
1908 break;
1909
1910 /* Multiline mode: start of subject unless notbol, or after any newline. */
1911
1912 case OP_CIRCM:
1913 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1914 if (eptr != md->start_subject &&
1915 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1916 MRRETURN(MATCH_NOMATCH);
1917 ecode++;
1918 break;
1919
1920 /* Start of match assertion */
1921
1922 case OP_SOM:
1923 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1924 ecode++;
1925 break;
1926
1927 /* Reset the start of match point */
1928
1929 case OP_SET_SOM:
1930 mstart = eptr;
1931 ecode++;
1932 break;
1933
1934 /* Multiline mode: assert before any newline, or before end of subject
1935 unless noteol is set. */
1936
1937 case OP_DOLLM:
1938 if (eptr < md->end_subject)
1939 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1940 else
1941 {
1942 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1943 SCHECK_PARTIAL();
1944 }
1945 ecode++;
1946 break;
1947
1948 /* Not multiline mode: assert before a terminating newline or before end of
1949 subject unless noteol is set. */
1950
1951 case OP_DOLL:
1952 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1953 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1954
1955 /* ... else fall through for endonly */
1956
1957 /* End of subject assertion (\z) */
1958
1959 case OP_EOD:
1960 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1961 SCHECK_PARTIAL();
1962 ecode++;
1963 break;
1964
1965 /* End of subject or ending \n assertion (\Z) */
1966
1967 case OP_EODN:
1968 ASSERT_NL_OR_EOS:
1969 if (eptr < md->end_subject &&
1970 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1971 MRRETURN(MATCH_NOMATCH);
1972
1973 /* Either at end of string or \n before end. */
1974
1975 SCHECK_PARTIAL();
1976 ecode++;
1977 break;
1978
1979 /* Word boundary assertions */
1980
1981 case OP_NOT_WORD_BOUNDARY:
1982 case OP_WORD_BOUNDARY:
1983 {
1984
1985 /* Find out if the previous and current characters are "word" characters.
1986 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1987 be "non-word" characters. Remember the earliest consulted character for
1988 partial matching. */
1989
1990 #ifdef SUPPORT_UTF8
1991 if (utf8)
1992 {
1993 /* Get status of previous character */
1994
1995 if (eptr == md->start_subject) prev_is_word = FALSE; else
1996 {
1997 USPTR lastptr = eptr - 1;
1998 while((*lastptr & 0xc0) == 0x80) lastptr--;
1999 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2000 GETCHAR(c, lastptr);
2001 #ifdef SUPPORT_UCP
2002 if (md->use_ucp)
2003 {
2004 if (c == '_') prev_is_word = TRUE; else
2005 {
2006 int cat = UCD_CATEGORY(c);
2007 prev_is_word = (cat == ucp_L || cat == ucp_N);
2008 }
2009 }
2010 else
2011 #endif
2012 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2013 }
2014
2015 /* Get status of next character */
2016
2017 if (eptr >= md->end_subject)
2018 {
2019 SCHECK_PARTIAL();
2020 cur_is_word = FALSE;
2021 }
2022 else
2023 {
2024 GETCHAR(c, eptr);
2025 #ifdef SUPPORT_UCP
2026 if (md->use_ucp)
2027 {
2028 if (c == '_') cur_is_word = TRUE; else
2029 {
2030 int cat = UCD_CATEGORY(c);
2031 cur_is_word = (cat == ucp_L || cat == ucp_N);
2032 }
2033 }
2034 else
2035 #endif
2036 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2037 }
2038 }
2039 else
2040 #endif
2041
2042 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2043 consistency with the behaviour of \w we do use it in this case. */
2044
2045 {
2046 /* Get status of previous character */
2047
2048 if (eptr == md->start_subject) prev_is_word = FALSE; else
2049 {
2050 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2051 #ifdef SUPPORT_UCP
2052 if (md->use_ucp)
2053 {
2054 c = eptr[-1];
2055 if (c == '_') prev_is_word = TRUE; else
2056 {
2057 int cat = UCD_CATEGORY(c);
2058 prev_is_word = (cat == ucp_L || cat == ucp_N);
2059 }
2060 }
2061 else
2062 #endif
2063 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2064 }
2065
2066 /* Get status of next character */
2067
2068 if (eptr >= md->end_subject)
2069 {
2070 SCHECK_PARTIAL();
2071 cur_is_word = FALSE;
2072 }
2073 else
2074 #ifdef SUPPORT_UCP
2075 if (md->use_ucp)
2076 {
2077 c = *eptr;
2078 if (c == '_') cur_is_word = TRUE; else
2079 {
2080 int cat = UCD_CATEGORY(c);
2081 cur_is_word = (cat == ucp_L || cat == ucp_N);
2082 }
2083 }
2084 else
2085 #endif
2086 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2087 }
2088
2089 /* Now see if the situation is what we want */
2090
2091 if ((*ecode++ == OP_WORD_BOUNDARY)?
2092 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2093 MRRETURN(MATCH_NOMATCH);
2094 }
2095 break;
2096
2097 /* Match a single character type; inline for speed */
2098
2099 case OP_ANY:
2100 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2101 /* Fall through */
2102
2103 case OP_ALLANY:
2104 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2105 { /* not be updated before SCHECK_PARTIAL. */
2106 SCHECK_PARTIAL();
2107 MRRETURN(MATCH_NOMATCH);
2108 }
2109 eptr++;
2110 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2111 ecode++;
2112 break;
2113
2114 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2115 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2116
2117 case OP_ANYBYTE:
2118 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2119 { /* not be updated before SCHECK_PARTIAL. */
2120 SCHECK_PARTIAL();
2121 MRRETURN(MATCH_NOMATCH);
2122 }
2123 eptr++;
2124 ecode++;
2125 break;
2126
2127 case OP_NOT_DIGIT:
2128 if (eptr >= md->end_subject)
2129 {
2130 SCHECK_PARTIAL();
2131 MRRETURN(MATCH_NOMATCH);
2132 }
2133 GETCHARINCTEST(c, eptr);
2134 if (
2135 #ifdef SUPPORT_UTF8
2136 c < 256 &&
2137 #endif
2138 (md->ctypes[c] & ctype_digit) != 0
2139 )
2140 MRRETURN(MATCH_NOMATCH);
2141 ecode++;
2142 break;
2143
2144 case OP_DIGIT:
2145 if (eptr >= md->end_subject)
2146 {
2147 SCHECK_PARTIAL();
2148 MRRETURN(MATCH_NOMATCH);
2149 }
2150 GETCHARINCTEST(c, eptr);
2151 if (
2152 #ifdef SUPPORT_UTF8
2153 c >= 256 ||
2154 #endif
2155 (md->ctypes[c] & ctype_digit) == 0
2156 )
2157 MRRETURN(MATCH_NOMATCH);
2158 ecode++;
2159 break;
2160
2161 case OP_NOT_WHITESPACE:
2162 if (eptr >= md->end_subject)
2163 {
2164 SCHECK_PARTIAL();
2165 MRRETURN(MATCH_NOMATCH);
2166 }
2167 GETCHARINCTEST(c, eptr);
2168 if (
2169 #ifdef SUPPORT_UTF8
2170 c < 256 &&
2171 #endif
2172 (md->ctypes[c] & ctype_space) != 0
2173 )
2174 MRRETURN(MATCH_NOMATCH);
2175 ecode++;
2176 break;
2177
2178 case OP_WHITESPACE:
2179 if (eptr >= md->end_subject)
2180 {
2181 SCHECK_PARTIAL();
2182 MRRETURN(MATCH_NOMATCH);
2183 }
2184 GETCHARINCTEST(c, eptr);
2185 if (
2186 #ifdef SUPPORT_UTF8
2187 c >= 256 ||
2188 #endif
2189 (md->ctypes[c] & ctype_space) == 0
2190 )
2191 MRRETURN(MATCH_NOMATCH);
2192 ecode++;
2193 break;
2194
2195 case OP_NOT_WORDCHAR:
2196 if (eptr >= md->end_subject)
2197 {
2198 SCHECK_PARTIAL();
2199 MRRETURN(MATCH_NOMATCH);
2200 }
2201 GETCHARINCTEST(c, eptr);
2202 if (
2203 #ifdef SUPPORT_UTF8
2204 c < 256 &&
2205 #endif
2206 (md->ctypes[c] & ctype_word) != 0
2207 )
2208 MRRETURN(MATCH_NOMATCH);
2209 ecode++;
2210 break;
2211
2212 case OP_WORDCHAR:
2213 if (eptr >= md->end_subject)
2214 {
2215 SCHECK_PARTIAL();
2216 MRRETURN(MATCH_NOMATCH);
2217 }
2218 GETCHARINCTEST(c, eptr);
2219 if (
2220 #ifdef SUPPORT_UTF8
2221 c >= 256 ||
2222 #endif
2223 (md->ctypes[c] & ctype_word) == 0
2224 )
2225 MRRETURN(MATCH_NOMATCH);
2226 ecode++;
2227 break;
2228
2229 case OP_ANYNL:
2230 if (eptr >= md->end_subject)
2231 {
2232 SCHECK_PARTIAL();
2233 MRRETURN(MATCH_NOMATCH);
2234 }
2235 GETCHARINCTEST(c, eptr);
2236 switch(c)
2237 {
2238 default: MRRETURN(MATCH_NOMATCH);
2239
2240 case 0x000d:
2241 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2242 break;
2243
2244 case 0x000a:
2245 break;
2246
2247 case 0x000b:
2248 case 0x000c:
2249 case 0x0085:
2250 case 0x2028:
2251 case 0x2029:
2252 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2253 break;
2254 }
2255 ecode++;
2256 break;
2257
2258 case OP_NOT_HSPACE:
2259 if (eptr >= md->end_subject)
2260 {
2261 SCHECK_PARTIAL();
2262 MRRETURN(MATCH_NOMATCH);
2263 }
2264 GETCHARINCTEST(c, eptr);
2265 switch(c)
2266 {
2267 default: break;
2268 case 0x09: /* HT */
2269 case 0x20: /* SPACE */
2270 case 0xa0: /* NBSP */
2271 case 0x1680: /* OGHAM SPACE MARK */
2272 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2273 case 0x2000: /* EN QUAD */
2274 case 0x2001: /* EM QUAD */
2275 case 0x2002: /* EN SPACE */
2276 case 0x2003: /* EM SPACE */
2277 case 0x2004: /* THREE-PER-EM SPACE */
2278 case 0x2005: /* FOUR-PER-EM SPACE */
2279 case 0x2006: /* SIX-PER-EM SPACE */
2280 case 0x2007: /* FIGURE SPACE */
2281 case 0x2008: /* PUNCTUATION SPACE */
2282 case 0x2009: /* THIN SPACE */
2283 case 0x200A: /* HAIR SPACE */
2284 case 0x202f: /* NARROW NO-BREAK SPACE */
2285 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2286 case 0x3000: /* IDEOGRAPHIC SPACE */
2287 MRRETURN(MATCH_NOMATCH);
2288 }
2289 ecode++;
2290 break;
2291
2292 case OP_HSPACE:
2293 if (eptr >= md->end_subject)
2294 {
2295 SCHECK_PARTIAL();
2296 MRRETURN(MATCH_NOMATCH);
2297 }
2298 GETCHARINCTEST(c, eptr);
2299 switch(c)
2300 {
2301 default: MRRETURN(MATCH_NOMATCH);
2302 case 0x09: /* HT */
2303 case 0x20: /* SPACE */
2304 case 0xa0: /* NBSP */
2305 case 0x1680: /* OGHAM SPACE MARK */
2306 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2307 case 0x2000: /* EN QUAD */
2308 case 0x2001: /* EM QUAD */
2309 case 0x2002: /* EN SPACE */
2310 case 0x2003: /* EM SPACE */
2311 case 0x2004: /* THREE-PER-EM SPACE */
2312 case 0x2005: /* FOUR-PER-EM SPACE */
2313 case 0x2006: /* SIX-PER-EM SPACE */
2314 case 0x2007: /* FIGURE SPACE */
2315 case 0x2008: /* PUNCTUATION SPACE */
2316 case 0x2009: /* THIN SPACE */
2317 case 0x200A: /* HAIR SPACE */
2318 case 0x202f: /* NARROW NO-BREAK SPACE */
2319 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2320 case 0x3000: /* IDEOGRAPHIC SPACE */
2321 break;
2322 }
2323 ecode++;
2324 break;
2325
2326 case OP_NOT_VSPACE:
2327 if (eptr >= md->end_subject)
2328 {
2329 SCHECK_PARTIAL();
2330 MRRETURN(MATCH_NOMATCH);
2331 }
2332 GETCHARINCTEST(c, eptr);
2333 switch(c)
2334 {
2335 default: break;
2336 case 0x0a: /* LF */
2337 case 0x0b: /* VT */
2338 case 0x0c: /* FF */
2339 case 0x0d: /* CR */
2340 case 0x85: /* NEL */
2341 case 0x2028: /* LINE SEPARATOR */
2342 case 0x2029: /* PARAGRAPH SEPARATOR */
2343 MRRETURN(MATCH_NOMATCH);
2344 }
2345 ecode++;
2346 break;
2347
2348 case OP_VSPACE:
2349 if (eptr >= md->end_subject)
2350 {
2351 SCHECK_PARTIAL();
2352 MRRETURN(MATCH_NOMATCH);
2353 }
2354 GETCHARINCTEST(c, eptr);
2355 switch(c)
2356 {
2357 default: MRRETURN(MATCH_NOMATCH);
2358 case 0x0a: /* LF */
2359 case 0x0b: /* VT */
2360 case 0x0c: /* FF */
2361 case 0x0d: /* CR */
2362 case 0x85: /* NEL */
2363 case 0x2028: /* LINE SEPARATOR */
2364 case 0x2029: /* PARAGRAPH SEPARATOR */
2365 break;
2366 }
2367 ecode++;
2368 break;
2369
2370 #ifdef SUPPORT_UCP
2371 /* Check the next character by Unicode property. We will get here only
2372 if the support is in the binary; otherwise a compile-time error occurs. */
2373
2374 case OP_PROP:
2375 case OP_NOTPROP:
2376 if (eptr >= md->end_subject)
2377 {
2378 SCHECK_PARTIAL();
2379 MRRETURN(MATCH_NOMATCH);
2380 }
2381 GETCHARINCTEST(c, eptr);
2382 {
2383 const ucd_record *prop = GET_UCD(c);
2384
2385 switch(ecode[1])
2386 {
2387 case PT_ANY:
2388 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2389 break;
2390
2391 case PT_LAMP:
2392 if ((prop->chartype == ucp_Lu ||
2393 prop->chartype == ucp_Ll ||
2394 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2395 MRRETURN(MATCH_NOMATCH);
2396 break;
2397
2398 case PT_GC:
2399 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2400 MRRETURN(MATCH_NOMATCH);
2401 break;
2402
2403 case PT_PC:
2404 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2405 MRRETURN(MATCH_NOMATCH);
2406 break;
2407
2408 case PT_SC:
2409 if ((ecode[2] != prop->script) == (op == OP_PROP))
2410 MRRETURN(MATCH_NOMATCH);
2411 break;
2412
2413 /* These are specials */
2414
2415 case PT_ALNUM:
2416 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2417 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2418 MRRETURN(MATCH_NOMATCH);
2419 break;
2420
2421 case PT_SPACE: /* Perl space */
2422 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2423 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2424 == (op == OP_NOTPROP))
2425 MRRETURN(MATCH_NOMATCH);
2426 break;
2427
2428 case PT_PXSPACE: /* POSIX space */
2429 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2430 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2431 c == CHAR_FF || c == CHAR_CR)
2432 == (op == OP_NOTPROP))
2433 MRRETURN(MATCH_NOMATCH);
2434 break;
2435
2436 case PT_WORD:
2437 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2438 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2439 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2440 MRRETURN(MATCH_NOMATCH);
2441 break;
2442
2443 /* This should never occur */
2444
2445 default:
2446 RRETURN(PCRE_ERROR_INTERNAL);
2447 }
2448
2449 ecode += 3;
2450 }
2451 break;
2452
2453 /* Match an extended Unicode sequence. We will get here only if the support
2454 is in the binary; otherwise a compile-time error occurs. */
2455
2456 case OP_EXTUNI:
2457 if (eptr >= md->end_subject)
2458 {
2459 SCHECK_PARTIAL();
2460 MRRETURN(MATCH_NOMATCH);
2461 }
2462 GETCHARINCTEST(c, eptr);
2463 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2464 while (eptr < md->end_subject)
2465 {
2466 int len = 1;
2467 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2468 if (UCD_CATEGORY(c) != ucp_M) break;
2469 eptr += len;
2470 }
2471 ecode++;
2472 break;
2473 #endif
2474
2475
2476 /* Match a back reference, possibly repeatedly. Look past the end of the
2477 item to see if there is repeat information following. The code is similar
2478 to that for character classes, but repeated for efficiency. Then obey
2479 similar code to character type repeats - written out again for speed.
2480 However, if the referenced string is the empty string, always treat
2481 it as matched, any number of times (otherwise there could be infinite
2482 loops). */
2483
2484 case OP_REF:
2485 case OP_REFI:
2486 caseless = op == OP_REFI;
2487 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2488 ecode += 3;
2489
2490 /* If the reference is unset, there are two possibilities:
2491
2492 (a) In the default, Perl-compatible state, set the length negative;
2493 this ensures that every attempt at a match fails. We can't just fail
2494 here, because of the possibility of quantifiers with zero minima.
2495
2496 (b) If the JavaScript compatibility flag is set, set the length to zero
2497 so that the back reference matches an empty string.
2498
2499 Otherwise, set the length to the length of what was matched by the
2500 referenced subpattern. */
2501
2502 if (offset >= offset_top || md->offset_vector[offset] < 0)
2503 length = (md->jscript_compat)? 0 : -1;
2504 else
2505 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2506
2507 /* Set up for repetition, or handle the non-repeated case */
2508
2509 switch (*ecode)
2510 {
2511 case OP_CRSTAR:
2512 case OP_CRMINSTAR:
2513 case OP_CRPLUS:
2514 case OP_CRMINPLUS:
2515 case OP_CRQUERY:
2516 case OP_CRMINQUERY:
2517 c = *ecode++ - OP_CRSTAR;
2518 minimize = (c & 1) != 0;
2519 min = rep_min[c]; /* Pick up values from tables; */
2520 max = rep_max[c]; /* zero for max => infinity */
2521 if (max == 0) max = INT_MAX;
2522 break;
2523
2524 case OP_CRRANGE:
2525 case OP_CRMINRANGE:
2526 minimize = (*ecode == OP_CRMINRANGE);
2527 min = GET2(ecode, 1);
2528 max = GET2(ecode, 3);
2529 if (max == 0) max = INT_MAX;
2530 ecode += 5;
2531 break;
2532
2533 default: /* No repeat follows */
2534 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2535 {
2536 CHECK_PARTIAL();
2537 MRRETURN(MATCH_NOMATCH);
2538 }
2539 eptr += length;
2540 continue; /* With the main loop */
2541 }
2542
2543 /* Handle repeated back references. If the length of the reference is
2544 zero, just continue with the main loop. */
2545
2546 if (length == 0) continue;
2547
2548 /* First, ensure the minimum number of matches are present. We get back
2549 the length of the reference string explicitly rather than passing the
2550 address of eptr, so that eptr can be a register variable. */
2551
2552 for (i = 1; i <= min; i++)
2553 {
2554 int slength;
2555 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2556 {
2557 CHECK_PARTIAL();
2558 MRRETURN(MATCH_NOMATCH);
2559 }
2560 eptr += slength;
2561 }
2562
2563 /* If min = max, continue at the same level without recursion.
2564 They are not both allowed to be zero. */
2565
2566 if (min == max) continue;
2567
2568 /* If minimizing, keep trying and advancing the pointer */
2569
2570 if (minimize)
2571 {
2572 for (fi = min;; fi++)
2573 {
2574 int slength;
2575 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2576 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2577 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2578 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2579 {
2580 CHECK_PARTIAL();
2581 MRRETURN(MATCH_NOMATCH);
2582 }
2583 eptr += slength;
2584 }
2585 /* Control never gets here */
2586 }
2587
2588 /* If maximizing, find the longest string and work backwards */
2589
2590 else
2591 {
2592 pp = eptr;
2593 for (i = min; i < max; i++)
2594 {
2595 int slength;
2596 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2597 {
2598 CHECK_PARTIAL();
2599 break;
2600 }
2601 eptr += slength;
2602 }
2603 while (eptr >= pp)
2604 {
2605 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2606 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2607 eptr -= length;
2608 }
2609 MRRETURN(MATCH_NOMATCH);
2610 }
2611 /* Control never gets here */
2612
2613 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2614 used when all the characters in the class have values in the range 0-255,
2615 and either the matching is caseful, or the characters are in the range
2616 0-127 when UTF-8 processing is enabled. The only difference between
2617 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2618 encountered.
2619
2620 First, look past the end of the item to see if there is repeat information
2621 following. Then obey similar code to character type repeats - written out
2622 again for speed. */
2623
2624 case OP_NCLASS:
2625 case OP_CLASS:
2626 {
2627 data = ecode + 1; /* Save for matching */
2628 ecode += 33; /* Advance past the item */
2629
2630 switch (*ecode)
2631 {
2632 case OP_CRSTAR:
2633 case OP_CRMINSTAR:
2634 case OP_CRPLUS:
2635 case OP_CRMINPLUS:
2636 case OP_CRQUERY:
2637 case OP_CRMINQUERY:
2638 c = *ecode++ - OP_CRSTAR;
2639 minimize = (c & 1) != 0;
2640 min = rep_min[c]; /* Pick up values from tables; */
2641 max = rep_max[c]; /* zero for max => infinity */
2642 if (max == 0) max = INT_MAX;
2643 break;
2644
2645 case OP_CRRANGE:
2646 case OP_CRMINRANGE:
2647 minimize = (*ecode == OP_CRMINRANGE);
2648 min = GET2(ecode, 1);
2649 max = GET2(ecode, 3);
2650 if (max == 0) max = INT_MAX;
2651 ecode += 5;
2652 break;
2653
2654 default: /* No repeat follows */
2655 min = max = 1;
2656 break;
2657 }
2658
2659 /* First, ensure the minimum number of matches are present. */
2660
2661 #ifdef SUPPORT_UTF8
2662 /* UTF-8 mode */
2663 if (utf8)
2664 {
2665 for (i = 1; i <= min; i++)
2666 {
2667 if (eptr >= md->end_subject)
2668 {
2669 SCHECK_PARTIAL();
2670 MRRETURN(MATCH_NOMATCH);
2671 }
2672 GETCHARINC(c, eptr);
2673 if (c > 255)
2674 {
2675 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2676 }
2677 else
2678 {
2679 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2680 }
2681 }
2682 }
2683 else
2684 #endif
2685 /* Not UTF-8 mode */
2686 {
2687 for (i = 1; i <= min; i++)
2688 {
2689 if (eptr >= md->end_subject)
2690 {
2691 SCHECK_PARTIAL();
2692 MRRETURN(MATCH_NOMATCH);
2693 }
2694 c = *eptr++;
2695 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2696 }
2697 }
2698
2699 /* If max == min we can continue with the main loop without the
2700 need to recurse. */
2701
2702 if (min == max) continue;
2703
2704 /* If minimizing, keep testing the rest of the expression and advancing
2705 the pointer while it matches the class. */
2706
2707 if (minimize)
2708 {
2709 #ifdef SUPPORT_UTF8
2710 /* UTF-8 mode */
2711 if (utf8)
2712 {
2713 for (fi = min;; fi++)
2714 {
2715 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2716 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2717 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2718 if (eptr >= md->end_subject)
2719 {
2720 SCHECK_PARTIAL();
2721 MRRETURN(MATCH_NOMATCH);
2722 }
2723 GETCHARINC(c, eptr);
2724 if (c > 255)
2725 {
2726 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2727 }
2728 else
2729 {
2730 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2731 }
2732 }
2733 }
2734 else
2735 #endif
2736 /* Not UTF-8 mode */
2737 {
2738 for (fi = min;; fi++)
2739 {
2740 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2741 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2742 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2743 if (eptr >= md->end_subject)
2744 {
2745 SCHECK_PARTIAL();
2746 MRRETURN(MATCH_NOMATCH);
2747 }
2748 c = *eptr++;
2749 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2750 }
2751 }
2752 /* Control never gets here */
2753 }
2754
2755 /* If maximizing, find the longest possible run, then work backwards. */
2756
2757 else
2758 {
2759 pp = eptr;
2760
2761 #ifdef SUPPORT_UTF8
2762 /* UTF-8 mode */
2763 if (utf8)
2764 {
2765 for (i = min; i < max; i++)
2766 {
2767 int len = 1;
2768 if (eptr >= md->end_subject)
2769 {
2770 SCHECK_PARTIAL();
2771 break;
2772 }
2773 GETCHARLEN(c, eptr, len);
2774 if (c > 255)
2775 {
2776 if (op == OP_CLASS) break;
2777 }
2778 else
2779 {
2780 if ((data[c/8] & (1 << (c&7))) == 0) break;
2781 }
2782 eptr += len;
2783 }
2784 for (;;)
2785 {
2786 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2788 if (eptr-- == pp) break; /* Stop if tried at original pos */
2789 BACKCHAR(eptr);
2790 }
2791 }
2792 else
2793 #endif
2794 /* Not UTF-8 mode */
2795 {
2796 for (i = min; i < max; i++)
2797 {
2798 if (eptr >= md->end_subject)
2799 {
2800 SCHECK_PARTIAL();
2801 break;
2802 }
2803 c = *eptr;
2804 if ((data[c/8] & (1 << (c&7))) == 0) break;
2805 eptr++;
2806 }
2807 while (eptr >= pp)
2808 {
2809 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2811 eptr--;
2812 }
2813 }
2814
2815 MRRETURN(MATCH_NOMATCH);
2816 }
2817 }
2818 /* Control never gets here */
2819
2820
2821 /* Match an extended character class. This opcode is encountered only
2822 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2823 mode, because Unicode properties are supported in non-UTF-8 mode. */
2824
2825 #ifdef SUPPORT_UTF8
2826 case OP_XCLASS:
2827 {
2828 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2829 ecode += GET(ecode, 1); /* Advance past the item */
2830
2831 switch (*ecode)
2832 {
2833 case OP_CRSTAR:
2834 case OP_CRMINSTAR:
2835 case OP_CRPLUS:
2836 case OP_CRMINPLUS:
2837 case OP_CRQUERY:
2838 case OP_CRMINQUERY:
2839 c = *ecode++ - OP_CRSTAR;
2840 minimize = (c & 1) != 0;
2841 min = rep_min[c]; /* Pick up values from tables; */
2842 max = rep_max[c]; /* zero for max => infinity */
2843 if (max == 0) max = INT_MAX;
2844 break;
2845
2846 case OP_CRRANGE:
2847 case OP_CRMINRANGE:
2848 minimize = (*ecode == OP_CRMINRANGE);
2849 min = GET2(ecode, 1);
2850 max = GET2(ecode, 3);
2851 if (max == 0) max = INT_MAX;
2852 ecode += 5;
2853 break;
2854
2855 default: /* No repeat follows */
2856 min = max = 1;
2857 break;
2858 }
2859
2860 /* First, ensure the minimum number of matches are present. */
2861
2862 for (i = 1; i <= min; i++)
2863 {
2864 if (eptr >= md->end_subject)
2865 {
2866 SCHECK_PARTIAL();
2867 MRRETURN(MATCH_NOMATCH);
2868 }
2869 GETCHARINCTEST(c, eptr);
2870 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2871 }
2872
2873 /* If max == min we can continue with the main loop without the
2874 need to recurse. */
2875
2876 if (min == max) continue;
2877
2878 /* If minimizing, keep testing the rest of the expression and advancing
2879 the pointer while it matches the class. */
2880
2881 if (minimize)
2882 {
2883 for (fi = min;; fi++)
2884 {
2885 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2887 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2888 if (eptr >= md->end_subject)
2889 {
2890 SCHECK_PARTIAL();
2891 MRRETURN(MATCH_NOMATCH);
2892 }
2893 GETCHARINCTEST(c, eptr);
2894 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2895 }
2896 /* Control never gets here */
2897 }
2898
2899 /* If maximizing, find the longest possible run, then work backwards. */
2900
2901 else
2902 {
2903 pp = eptr;
2904 for (i = min; i < max; i++)
2905 {
2906 int len = 1;
2907 if (eptr >= md->end_subject)
2908 {
2909 SCHECK_PARTIAL();
2910 break;
2911 }
2912 GETCHARLENTEST(c, eptr, len);
2913 if (!_pcre_xclass(c, data)) break;
2914 eptr += len;
2915 }
2916 for(;;)
2917 {
2918 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2919 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2920 if (eptr-- == pp) break; /* Stop if tried at original pos */
2921 if (utf8) BACKCHAR(eptr);
2922 }
2923 MRRETURN(MATCH_NOMATCH);
2924 }
2925
2926 /* Control never gets here */
2927 }
2928 #endif /* End of XCLASS */
2929
2930 /* Match a single character, casefully */
2931
2932 case OP_CHAR:
2933 #ifdef SUPPORT_UTF8
2934 if (utf8)
2935 {
2936 length = 1;
2937 ecode++;
2938 GETCHARLEN(fc, ecode, length);
2939 if (length > md->end_subject - eptr)
2940 {
2941 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2942 MRRETURN(MATCH_NOMATCH);
2943 }
2944 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2945 }
2946 else
2947 #endif
2948
2949 /* Non-UTF-8 mode */
2950 {
2951 if (md->end_subject - eptr < 1)
2952 {
2953 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2954 MRRETURN(MATCH_NOMATCH);
2955 }
2956 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2957 ecode += 2;
2958 }
2959 break;
2960
2961 /* Match a single character, caselessly */
2962
2963 case OP_CHARI:
2964 #ifdef SUPPORT_UTF8
2965 if (utf8)
2966 {
2967 length = 1;
2968 ecode++;
2969 GETCHARLEN(fc, ecode, length);
2970
2971 if (length > md->end_subject - eptr)
2972 {
2973 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2974 MRRETURN(MATCH_NOMATCH);
2975 }
2976
2977 /* If the pattern character's value is < 128, we have only one byte, and
2978 can use the fast lookup table. */
2979
2980 if (fc < 128)
2981 {
2982 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2983 }
2984
2985 /* Otherwise we must pick up the subject character */
2986
2987 else
2988 {
2989 unsigned int dc;
2990 GETCHARINC(dc, eptr);
2991 ecode += length;
2992
2993 /* If we have Unicode property support, we can use it to test the other
2994 case of the character, if there is one. */
2995
2996 if (fc != dc)
2997 {
2998 #ifdef SUPPORT_UCP
2999 if (dc != UCD_OTHERCASE(fc))
3000 #endif
3001 MRRETURN(MATCH_NOMATCH);
3002 }
3003 }
3004 }
3005 else
3006 #endif /* SUPPORT_UTF8 */
3007
3008 /* Non-UTF-8 mode */
3009 {
3010 if (md->end_subject - eptr < 1)
3011 {
3012 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3013 MRRETURN(MATCH_NOMATCH);
3014 }
3015 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3016 ecode += 2;
3017 }
3018 break;
3019
3020 /* Match a single character repeatedly. */
3021
3022 case OP_EXACT:
3023 case OP_EXACTI:
3024 min = max = GET2(ecode, 1);
3025 ecode += 3;
3026 goto REPEATCHAR;
3027
3028 case OP_POSUPTO:
3029 case OP_POSUPTOI:
3030 possessive = TRUE;
3031 /* Fall through */
3032
3033 case OP_UPTO:
3034 case OP_UPTOI:
3035 case OP_MINUPTO:
3036 case OP_MINUPTOI:
3037 min = 0;
3038 max = GET2(ecode, 1);
3039 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3040 ecode += 3;
3041 goto REPEATCHAR;
3042
3043 case OP_POSSTAR:
3044 case OP_POSSTARI:
3045 possessive = TRUE;
3046 min = 0;
3047 max = INT_MAX;
3048 ecode++;
3049 goto REPEATCHAR;
3050
3051 case OP_POSPLUS:
3052 case OP_POSPLUSI:
3053 possessive = TRUE;
3054 min = 1;
3055 max = INT_MAX;
3056 ecode++;
3057 goto REPEATCHAR;
3058
3059 case OP_POSQUERY:
3060 case OP_POSQUERYI:
3061 possessive = TRUE;
3062 min = 0;
3063 max = 1;
3064 ecode++;
3065 goto REPEATCHAR;
3066
3067 case OP_STAR:
3068 case OP_STARI:
3069 case OP_MINSTAR:
3070 case OP_MINSTARI:
3071 case OP_PLUS:
3072 case OP_PLUSI:
3073 case OP_MINPLUS:
3074 case OP_MINPLUSI:
3075 case OP_QUERY:
3076 case OP_QUERYI:
3077 case OP_MINQUERY:
3078 case OP_MINQUERYI:
3079 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3080 minimize = (c & 1) != 0;
3081 min = rep_min[c]; /* Pick up values from tables; */
3082 max = rep_max[c]; /* zero for max => infinity */
3083 if (max == 0) max = INT_MAX;
3084
3085 /* Common code for all repeated single-character matches. */
3086
3087 REPEATCHAR:
3088 #ifdef SUPPORT_UTF8
3089 if (utf8)
3090 {
3091 length = 1;
3092 charptr = ecode;
3093 GETCHARLEN(fc, ecode, length);
3094 ecode += length;
3095
3096 /* Handle multibyte character matching specially here. There is
3097 support for caseless matching if UCP support is present. */
3098
3099 if (length > 1)
3100 {
3101 #ifdef SUPPORT_UCP
3102 unsigned int othercase;
3103 if (op >= OP_STARI && /* Caseless */
3104 (othercase = UCD_OTHERCASE(fc)) != fc)
3105 oclength = _pcre_ord2utf8(othercase, occhars);
3106 else oclength = 0;
3107 #endif /* SUPPORT_UCP */
3108
3109 for (i = 1; i <= min; i++)
3110 {
3111 if (eptr <= md->end_subject - length &&
3112 memcmp(eptr, charptr, length) == 0) eptr += length;
3113 #ifdef SUPPORT_UCP
3114 else if (oclength > 0 &&
3115 eptr <= md->end_subject - oclength &&
3116 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3117 #endif /* SUPPORT_UCP */
3118 else
3119 {
3120 CHECK_PARTIAL();
3121 MRRETURN(MATCH_NOMATCH);
3122 }
3123 }
3124
3125 if (min == max) continue;
3126
3127 if (minimize)
3128 {
3129 for (fi = min;; fi++)
3130 {
3131 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3132 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3133 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3134 if (eptr <= md->end_subject - length &&
3135 memcmp(eptr, charptr, length) == 0) eptr += length;
3136 #ifdef SUPPORT_UCP
3137 else if (oclength > 0 &&
3138 eptr <= md->end_subject - oclength &&
3139 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3140 #endif /* SUPPORT_UCP */
3141 else
3142 {
3143 CHECK_PARTIAL();
3144 MRRETURN(MATCH_NOMATCH);
3145 }
3146 }
3147 /* Control never gets here */
3148 }
3149
3150 else /* Maximize */
3151 {
3152 pp = eptr;
3153 for (i = min; i < max; i++)
3154 {
3155 if (eptr <= md->end_subject - length &&
3156 memcmp(eptr, charptr, length) == 0) eptr += length;
3157 #ifdef SUPPORT_UCP
3158 else if (oclength > 0 &&
3159 eptr <= md->end_subject - oclength &&
3160 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3161 #endif /* SUPPORT_UCP */
3162 else
3163 {
3164 CHECK_PARTIAL();
3165 break;
3166 }
3167 }
3168
3169 if (possessive) continue;
3170
3171 for(;;)
3172 {
3173 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3175 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3176 #ifdef SUPPORT_UCP
3177 eptr--;
3178 BACKCHAR(eptr);
3179 #else /* without SUPPORT_UCP */
3180 eptr -= length;
3181 #endif /* SUPPORT_UCP */
3182 }
3183 }
3184 /* Control never gets here */
3185 }
3186
3187 /* If the length of a UTF-8 character is 1, we fall through here, and
3188 obey the code as for non-UTF-8 characters below, though in this case the
3189 value of fc will always be < 128. */
3190 }
3191 else
3192 #endif /* SUPPORT_UTF8 */
3193
3194 /* When not in UTF-8 mode, load a single-byte character. */
3195
3196 fc = *ecode++;
3197
3198 /* The value of fc at this point is always less than 256, though we may or
3199 may not be in UTF-8 mode. The code is duplicated for the caseless and
3200 caseful cases, for speed, since matching characters is likely to be quite
3201 common. First, ensure the minimum number of matches are present. If min =
3202 max, continue at the same level without recursing. Otherwise, if
3203 minimizing, keep trying the rest of the expression and advancing one
3204 matching character if failing, up to the maximum. Alternatively, if
3205 maximizing, find the maximum number of characters and work backwards. */
3206
3207 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3208 max, eptr));
3209
3210 if (op >= OP_STARI) /* Caseless */
3211 {
3212 fc = md->lcc[fc];
3213 for (i = 1; i <= min; i++)
3214 {
3215 if (eptr >= md->end_subject)
3216 {
3217 SCHECK_PARTIAL();
3218 MRRETURN(MATCH_NOMATCH);
3219 }
3220 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3221 }
3222 if (min == max) continue;
3223 if (minimize)
3224 {
3225 for (fi = min;; fi++)
3226 {
3227 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3229 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3230 if (eptr >= md->end_subject)
3231 {
3232 SCHECK_PARTIAL();
3233 MRRETURN(MATCH_NOMATCH);
3234 }
3235 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3236 }
3237 /* Control never gets here */
3238 }
3239 else /* Maximize */
3240 {
3241 pp = eptr;
3242 for (i = min; i < max; i++)
3243 {
3244 if (eptr >= md->end_subject)
3245 {
3246 SCHECK_PARTIAL();
3247 break;
3248 }
3249 if (fc != md->lcc[*eptr]) break;
3250 eptr++;
3251 }
3252
3253 if (possessive) continue;
3254
3255 while (eptr >= pp)
3256 {
3257 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3258 eptr--;
3259 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260 }
3261 MRRETURN(MATCH_NOMATCH);
3262 }
3263 /* Control never gets here */
3264 }
3265
3266 /* Caseful comparisons (single-byte characters only; multibyte characters are fully handled in the UTF-8 block above) */
3267
3268 else
3269 {
3270 for (i = 1; i <= min; i++)
3271 {
3272 if (eptr >= md->end_subject)
3273 {
3274 SCHECK_PARTIAL();
3275 MRRETURN(MATCH_NOMATCH);
3276 }
3277 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3278 }
3279
3280 if (min == max) continue;
3281
3282 if (minimize)
3283 {
3284 for (fi = min;; fi++)
3285 {
3286 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3287 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3288 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3289 if (eptr >= md->end_subject)
3290 {
3291 SCHECK_PARTIAL();
3292 MRRETURN(MATCH_NOMATCH);
3293 }
3294 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3295 }
3296 /* Control never gets here */
3297 }
3298 else /* Maximize */
3299 {
3300 pp = eptr;
3301 for (i = min; i < max; i++)
3302 {
3303 if (eptr >= md->end_subject)
3304 {
3305 SCHECK_PARTIAL();
3306 break;
3307 }
3308 if (fc != *eptr) break;
3309 eptr++;
3310 }
3311 if (possessive) continue;
3312
3313 while (eptr >= pp)
3314 {
3315 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3316 eptr--;
3317 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3318 }
3319 MRRETURN(MATCH_NOMATCH);
3320 }
3321 }
3322 /* Control never gets here */
3323
3324 /* Match a negated single one-byte character. The character we are
3325 checking can be multibyte. */
3326
3327 case OP_NOT:
3328 case OP_NOTI:
3329 if (eptr >= md->end_subject)
3330 {
3331 SCHECK_PARTIAL();
3332 MRRETURN(MATCH_NOMATCH);
3333 }
3334 ecode++;
3335 GETCHARINCTEST(c, eptr);
3336 if (op == OP_NOTI) /* The caseless case */
3337 {
3338 #ifdef SUPPORT_UTF8
3339 if (c < 256)
3340 #endif
3341 c = md->lcc[c];
3342 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3343 }
3344 else /* Caseful */
3345 {
3346 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3347 }
3348 break;
3349
3350 /* Match a negated single one-byte character repeatedly. This is almost a
3351 repeat of the code for a repeated single character, but I haven't found a
3352 nice way of commoning these up that doesn't require a test of the
3353 positive/negative option for each character match. Maybe that wouldn't add
3354 very much to the time taken, but character matching *is* what this is all
3355 about... */
3356
3357 case OP_NOTEXACT:
3358 case OP_NOTEXACTI:
3359 min = max = GET2(ecode, 1);
3360 ecode += 3;
3361 goto REPEATNOTCHAR;
3362
3363 case OP_NOTUPTO:
3364 case OP_NOTUPTOI:
3365 case OP_NOTMINUPTO:
3366 case OP_NOTMINUPTOI:
3367 min = 0;
3368 max = GET2(ecode, 1);
3369 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3370 ecode += 3;
3371 goto REPEATNOTCHAR;
3372
3373 case OP_NOTPOSSTAR:
3374 case OP_NOTPOSSTARI:
3375 possessive = TRUE;
3376 min = 0;
3377 max = INT_MAX;
3378 ecode++;
3379 goto REPEATNOTCHAR;
3380
3381 case OP_NOTPOSPLUS:
3382 case OP_NOTPOSPLUSI:
3383 possessive = TRUE;
3384 min = 1;
3385 max = INT_MAX;
3386 ecode++;
3387 goto REPEATNOTCHAR;
3388
3389 case OP_NOTPOSQUERY:
3390 case OP_NOTPOSQUERYI:
3391 possessive = TRUE;
3392 min = 0;
3393 max = 1;
3394 ecode++;
3395 goto REPEATNOTCHAR;
3396
3397 case OP_NOTPOSUPTO:
3398 case OP_NOTPOSUPTOI:
3399 possessive = TRUE;
3400 min = 0;
3401 max = GET2(ecode, 1);
3402 ecode += 3;
3403 goto REPEATNOTCHAR;
3404
3405 case OP_NOTSTAR:
3406 case OP_NOTSTARI:
3407 case OP_NOTMINSTAR:
3408 case OP_NOTMINSTARI:
3409 case OP_NOTPLUS:
3410 case OP_NOTPLUSI:
3411 case OP_NOTMINPLUS:
3412 case OP_NOTMINPLUSI:
3413 case OP_NOTQUERY:
3414 case OP_NOTQUERYI:
3415 case OP_NOTMINQUERY:
3416 case OP_NOTMINQUERYI:
3417 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3418 minimize = (c & 1) != 0;
3419 min = rep_min[c]; /* Pick up values from tables; */
3420 max = rep_max[c]; /* zero for max => infinity */
3421 if (max == 0) max = INT_MAX;
3422
3423 /* Common code for all repeated single-byte matches. */
3424
3425 REPEATNOTCHAR:
3426 fc = *ecode++;
3427
3428 /* The code is duplicated for the caseless and caseful cases, for speed,
3429 since matching characters is likely to be quite common. First, ensure the
3430 minimum number of matches are present. If min = max, continue at the same
3431 level without recursing. Otherwise, if minimizing, keep trying the rest of
3432 the expression and advancing one matching character if failing, up to the
3433 maximum. Alternatively, if maximizing, find the maximum number of
3434 characters and work backwards. */
3435
3436 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3437 max, eptr));
3438
3439 if (op >= OP_NOTSTARI) /* Caseless */
3440 {
3441 fc = md->lcc[fc];
3442
3443 #ifdef SUPPORT_UTF8
3444 /* UTF-8 mode */
3445 if (utf8)
3446 {
3447 register unsigned int d;
3448 for (i = 1; i <= min; i++)
3449 {
3450 if (eptr >= md->end_subject)
3451 {
3452 SCHECK_PARTIAL();
3453 MRRETURN(MATCH_NOMATCH);
3454 }
3455 GETCHARINC(d, eptr);
3456 if (d < 256) d = md->lcc[d];
3457 if (fc == d) MRRETURN(MATCH_NOMATCH);
3458 }
3459 }
3460 else
3461 #endif
3462
3463 /* Not UTF-8 mode */
3464 {
3465 for (i = 1; i <= min; i++)
3466 {
3467 if (eptr >= md->end_subject)
3468 {
3469 SCHECK_PARTIAL();
3470 MRRETURN(MATCH_NOMATCH);
3471 }
3472 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3473 }
3474 }
3475
3476 if (min == max) continue;
3477
3478 if (minimize)
3479 {
3480 #ifdef SUPPORT_UTF8
3481 /* UTF-8 mode */
3482 if (utf8)
3483 {
3484 register unsigned int d;
3485 for (fi = min;; fi++)
3486 {
3487 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3488 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3489 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3490 if (eptr >= md->end_subject)
3491 {
3492 SCHECK_PARTIAL();
3493 MRRETURN(MATCH_NOMATCH);
3494 }
3495 GETCHARINC(d, eptr);
3496 if (d < 256) d = md->lcc[d];
3497 if (fc == d) MRRETURN(MATCH_NOMATCH);
3498 }
3499 }
3500 else
3501 #endif
3502 /* Not UTF-8 mode */
3503 {
3504 for (fi = min;; fi++)
3505 {
3506 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3507 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3509 if (eptr >= md->end_subject)
3510 {
3511 SCHECK_PARTIAL();
3512 MRRETURN(MATCH_NOMATCH);
3513 }
3514 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3515 }
3516 }
3517 /* Control never gets here */
3518 }
3519
3520 /* Maximize case */
3521
3522 else
3523 {
3524 pp = eptr;
3525
3526 #ifdef SUPPORT_UTF8
3527 /* UTF-8 mode */
3528 if (utf8)
3529 {
3530 register unsigned int d;
3531 for (i = min; i < max; i++)
3532 {
3533 int len = 1;
3534 if (eptr >= md->end_subject)
3535 {
3536 SCHECK_PARTIAL();
3537 break;
3538 }
3539 GETCHARLEN(d, eptr, len);
3540 if (d < 256) d = md->lcc[d];
3541 if (fc == d) break;
3542 eptr += len;
3543 }
3544 if (possessive) continue;
3545 for(;;)
3546 {
3547 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3548 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3549 if (eptr-- == pp) break; /* Stop if tried at original pos */
3550 BACKCHAR(eptr);
3551 }
3552 }
3553 else
3554 #endif
3555 /* Not UTF-8 mode */
3556 {
3557 for (i = min; i < max; i++)
3558 {
3559 if (eptr >= md->end_subject)
3560 {
3561 SCHECK_PARTIAL();
3562 break;
3563 }
3564 if (fc == md->lcc[*eptr]) break;
3565 eptr++;
3566 }
3567 if (possessive) continue;
3568 while (eptr >= pp)
3569 {
3570 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3571 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3572 eptr--;
3573 }
3574 }
3575
3576 MRRETURN(MATCH_NOMATCH);
3577 }
3578 /* Control never gets here */
3579 }
3580
3581 /* Caseful comparisons */
3582
3583 else
3584 {
3585 #ifdef SUPPORT_UTF8
3586 /* UTF-8 mode */
3587 if (utf8)
3588 {
3589 register unsigned int d;
3590 for (i = 1; i <= min; i++)
3591 {
3592 if (eptr >= md->end_subject)
3593 {
3594 SCHECK_PARTIAL();
3595 MRRETURN(MATCH_NOMATCH);
3596 }
3597 GETCHARINC(d, eptr);
3598 if (fc == d) MRRETURN(MATCH_NOMATCH);
3599 }
3600 }
3601 else
3602 #endif
3603 /* Not UTF-8 mode */
3604 {
3605 for (i = 1; i <= min; i++)
3606 {
3607 if (eptr >= md->end_subject)
3608 {
3609 SCHECK_PARTIAL();
3610 MRRETURN(MATCH_NOMATCH);
3611 }
3612 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3613 }
3614 }
3615
3616 if (min == max) continue;
3617
3618 if (minimize)
3619 {
3620 #ifdef SUPPORT_UTF8
3621 /* UTF-8 mode */
3622 if (utf8)
3623 {
3624 register unsigned int d;
3625 for (fi = min;; fi++)
3626 {
3627 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3628 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3629 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3630 if (eptr >= md->end_subject)
3631 {
3632 SCHECK_PARTIAL();
3633 MRRETURN(MATCH_NOMATCH);
3634 }
3635 GETCHARINC(d, eptr);
3636 if (fc == d) MRRETURN(MATCH_NOMATCH);
3637 }
3638 }
3639 else
3640 #endif
3641 /* Not UTF-8 mode */
3642 {
3643 for (fi = min;; fi++)
3644 {
3645 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3646 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3647 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3648 if (eptr >= md->end_subject)
3649 {
3650 SCHECK_PARTIAL();
3651 MRRETURN(MATCH_NOMATCH);
3652 }
3653 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3654 }
3655 }
3656 /* Control never gets here */
3657 }
3658
3659 /* Maximize case */
3660
3661 else
3662 {
3663 pp = eptr;
3664
3665 #ifdef SUPPORT_UTF8
3666 /* UTF-8 mode */
3667 if (utf8)
3668 {
3669 register unsigned int d;
3670 for (i = min; i < max; i++)
3671 {
3672 int len = 1;
3673 if (eptr >= md->end_subject)
3674 {
3675 SCHECK_PARTIAL();
3676 break;
3677 }
3678 GETCHARLEN(d, eptr, len);
3679 if (fc == d) break;
3680 eptr += len;
3681 }
3682 if (possessive) continue;
3683 for(;;)
3684 {
3685 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3686 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3687 if (eptr-- == pp) break; /* Stop if tried at original pos */
3688 BACKCHAR(eptr);
3689 }
3690 }
3691 else
3692 #endif
3693 /* Not UTF-8 mode */
3694 {
3695 for (i = min; i < max; i++)
3696 {
3697 if (eptr >= md->end_subject)
3698 {
3699 SCHECK_PARTIAL();
3700 break;
3701 }
3702 if (fc == *eptr) break;
3703 eptr++;
3704 }
3705 if (possessive) continue;
3706 while (eptr >= pp)
3707 {
3708 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3710 eptr--;
3711 }
3712 }
3713
3714 MRRETURN(MATCH_NOMATCH);
3715 }
3716 }
3717 /* Control never gets here */
3718
3719 /* Match a single character type repeatedly; several different opcodes
3720 share code. This is very similar to the code for single characters, but we
3721 repeat it in the interests of efficiency. */
3722
3723 case OP_TYPEEXACT:
3724 min = max = GET2(ecode, 1);
3725 minimize = TRUE;
3726 ecode += 3;
3727 goto REPEATTYPE;
3728
3729 case OP_TYPEUPTO:
3730 case OP_TYPEMINUPTO:
3731 min = 0;
3732 max = GET2(ecode, 1);
3733 minimize = *ecode == OP_TYPEMINUPTO;
3734 ecode += 3;
3735 goto REPEATTYPE;
3736
3737 case OP_TYPEPOSSTAR:
3738 possessive = TRUE;
3739 min = 0;
3740 max = INT_MAX;
3741 ecode++;
3742 goto REPEATTYPE;
3743
3744 case OP_TYPEPOSPLUS:
3745 possessive = TRUE;
3746 min = 1;
3747 max = INT_MAX;
3748 ecode++;
3749 goto REPEATTYPE;
3750
3751 case OP_TYPEPOSQUERY:
3752 possessive = TRUE;
3753 min = 0;
3754 max = 1;
3755 ecode++;
3756 goto REPEATTYPE;
3757
3758 case OP_TYPEPOSUPTO:
3759 possessive = TRUE;
3760 min = 0;
3761 max = GET2(ecode, 1);
3762 ecode += 3;
3763 goto REPEATTYPE;
3764
3765 case OP_TYPESTAR:
3766 case OP_TYPEMINSTAR:
3767 case OP_TYPEPLUS:
3768 case OP_TYPEMINPLUS:
3769 case OP_TYPEQUERY:
3770 case OP_TYPEMINQUERY:
3771 c = *ecode++ - OP_TYPESTAR;
3772 minimize = (c & 1) != 0;
3773 min = rep_min[c]; /* Pick up values from tables; */
3774 max = rep_max[c]; /* zero for max => infinity */
3775 if (max == 0) max = INT_MAX;
3776
3777 /* Common code for all repeated single character type matches. Note that
3778 in UTF-8 mode, '.' matches a character of any length, but for the other
3779 character types, the valid characters are all one-byte long. */
3780
3781 REPEATTYPE:
3782 ctype = *ecode++; /* Code for the character type */
3783
3784 #ifdef SUPPORT_UCP
3785 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3786 {
3787 prop_fail_result = ctype == OP_NOTPROP;
3788 prop_type = *ecode++;
3789 prop_value = *ecode++;
3790 }
3791 else prop_type = -1;
3792 #endif
3793
3794 /* First, ensure the minimum number of matches are present. Use inline
3795 code for maximizing the speed, and do the type test once at the start
3796 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3797 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3798 and single-bytes. */
3799
3800 if (min > 0)
3801 {
3802 #ifdef SUPPORT_UCP
3803 if (prop_type >= 0)
3804 {
3805 switch(prop_type)
3806 {
3807 case PT_ANY:
3808 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3809 for (i = 1; i <= min; i++)
3810 {
3811 if (eptr >= md->end_subject)
3812 {
3813 SCHECK_PARTIAL();
3814 MRRETURN(MATCH_NOMATCH);
3815 }
3816 GETCHARINCTEST(c, eptr);
3817 }
3818 break;
3819
3820 case PT_LAMP:
3821 for (i = 1; i <= min; i++)
3822 {
3823 int chartype;
3824 if (eptr >= md->end_subject)
3825 {
3826 SCHECK_PARTIAL();
3827 MRRETURN(MATCH_NOMATCH);
3828 }
3829 GETCHARINCTEST(c, eptr);
3830 chartype = UCD_CHARTYPE(c);
3831 if ((chartype == ucp_Lu ||
3832 chartype == ucp_Ll ||
3833 chartype == ucp_Lt) == prop_fail_result)
3834 MRRETURN(MATCH_NOMATCH);
3835 }
3836 break;
3837
3838 case PT_GC:
3839 for (i = 1; i <= min; i++)
3840 {
3841 if (eptr >= md->end_subject)
3842 {
3843 SCHECK_PARTIAL();
3844 MRRETURN(MATCH_NOMATCH);
3845 }
3846 GETCHARINCTEST(c, eptr);
3847 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3848 MRRETURN(MATCH_NOMATCH);
3849 }
3850 break;
3851
3852 case PT_PC:
3853 for (i = 1; i <= min; i++)
3854 {
3855 if (eptr >= md->end_subject)
3856 {
3857 SCHECK_PARTIAL();
3858 MRRETURN(MATCH_NOMATCH);
3859 }
3860 GETCHARINCTEST(c, eptr);
3861 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3862 MRRETURN(MATCH_NOMATCH);
3863 }
3864 break;
3865
3866 case PT_SC:
3867 for (i = 1; i <= min; i++)
3868 {
3869 if (eptr >= md->end_subject)
3870 {
3871 SCHECK_PARTIAL();
3872 MRRETURN(MATCH_NOMATCH);
3873 }
3874 GETCHARINCTEST(c, eptr);
3875 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3876 MRRETURN(MATCH_NOMATCH);
3877 }
3878 break;
3879
3880 case PT_ALNUM:
3881 for (i = 1; i <= min; i++)
3882 {
3883 int category;
3884 if (eptr >= md->end_subject)
3885 {
3886 SCHECK_PARTIAL();
3887 MRRETURN(MATCH_NOMATCH);
3888 }
3889 GETCHARINCTEST(c, eptr);
3890 category = UCD_CATEGORY(c);
3891 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3892 MRRETURN(MATCH_NOMATCH);
3893 }
3894 break;
3895
3896 case PT_SPACE: /* Perl space */
3897 for (i = 1; i <= min; i++)
3898 {
3899 if (eptr >= md->end_subject)
3900 {
3901 SCHECK_PARTIAL();
3902 MRRETURN(MATCH_NOMATCH);
3903 }
3904 GETCHARINCTEST(c, eptr);
3905 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3906 c == CHAR_FF || c == CHAR_CR)
3907 == prop_fail_result)
3908 MRRETURN(MATCH_NOMATCH);
3909 }
3910 break;
3911
3912 case PT_PXSPACE: /* POSIX space */
3913 for (i = 1; i <= min; i++)
3914 {
3915 if (eptr >= md->end_subject)
3916 {
3917 SCHECK_PARTIAL();
3918 MRRETURN(MATCH_NOMATCH);
3919 }
3920 GETCHARINCTEST(c, eptr);
3921 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3922 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3923 == prop_fail_result)
3924 MRRETURN(MATCH_NOMATCH);
3925 }
3926 break;
3927
3928 case PT_WORD:
3929 for (i = 1; i <= min; i++)
3930 {
3931 int category;
3932 if (eptr >= md->end_subject)
3933 {
3934 SCHECK_PARTIAL();
3935 MRRETURN(MATCH_NOMATCH);
3936 }
3937 GETCHARINCTEST(c, eptr);
3938 category = UCD_CATEGORY(c);
3939 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3940 == prop_fail_result)
3941 MRRETURN(MATCH_NOMATCH);
3942 }
3943 break;
3944
3945 /* This should not occur */
3946
3947 default:
3948 RRETURN(PCRE_ERROR_INTERNAL);
3949 }
3950 }
3951
3952 /* Match extended Unicode sequences. We will get here only if the
3953 support is in the binary; otherwise a compile-time error occurs. */
3954
3955 else if (ctype == OP_EXTUNI)
3956 {
3957 for (i = 1; i <= min; i++)
3958 {
3959 if (eptr >= md->end_subject)
3960 {
3961 SCHECK_PARTIAL();
3962 MRRETURN(MATCH_NOMATCH);
3963 }
3964 GETCHARINCTEST(c, eptr);
3965 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3966 while (eptr < md->end_subject)
3967 {
3968 int len = 1;
3969 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3970 if (UCD_CATEGORY(c) != ucp_M) break;
3971 eptr += len;
3972 }
3973 }
3974 }
3975
3976 else
3977 #endif /* SUPPORT_UCP */
3978
3979 /* Handle all other cases when the coding is UTF-8 */
3980
3981 #ifdef SUPPORT_UTF8
3982 if (utf8) switch(ctype)
3983 {
3984 case OP_ANY:
3985 for (i = 1; i <= min; i++)
3986 {
3987 if (eptr >= md->end_subject)
3988 {
3989 SCHECK_PARTIAL();
3990 MRRETURN(MATCH_NOMATCH);
3991 }
3992 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3993 eptr++;
3994 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3995 }
3996 break;
3997
3998 case OP_ALLANY:
3999 for (i = 1; i <= min; i++)
4000 {
4001 if (eptr >= md->end_subject)
4002 {
4003 SCHECK_PARTIAL();
4004 MRRETURN(MATCH_NOMATCH);
4005 }
4006 eptr++;
4007 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4008 }
4009 break;
4010
4011 case OP_ANYBYTE:
4012 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4013 eptr += min;
4014 break;
4015
4016 case OP_ANYNL:
4017 for (i = 1; i <= min; i++)
4018 {
4019 if (eptr >= md->end_subject)
4020 {
4021 SCHECK_PARTIAL();
4022 MRRETURN(MATCH_NOMATCH);
4023 }
4024 GETCHARINC(c, eptr);
4025 switch(c)
4026 {
4027 default: MRRETURN(MATCH_NOMATCH);
4028
4029 case 0x000d:
4030 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4031 break;
4032
4033 case 0x000a:
4034 break;
4035
4036 case 0x000b:
4037 case 0x000c:
4038 case 0x0085:
4039 case 0x2028:
4040 case 0x2029:
4041 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4042 break;
4043 }
4044 }
4045 break;
4046
4047 case OP_NOT_HSPACE:
4048 for (i = 1; i <= min; i++)
4049 {
4050 if (eptr >= md->end_subject)
4051 {
4052 SCHECK_PARTIAL();
4053 MRRETURN(MATCH_NOMATCH);
4054 }
4055 GETCHARINC(c, eptr);
4056 switch(c)
4057 {
4058 default: break;
4059 case 0x09: /* HT */
4060 case 0x20: /* SPACE */
4061 case 0xa0: /* NBSP */
4062 case 0x1680: /* OGHAM SPACE MARK */
4063 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4064 case 0x2000: /* EN QUAD */
4065 case 0x2001: /* EM QUAD */
4066 case 0x2002: /* EN SPACE */
4067 case 0x2003: /* EM SPACE */
4068 case 0x2004: /* THREE-PER-EM SPACE */
4069 case 0x2005: /* FOUR-PER-EM SPACE */
4070 case 0x2006: /* SIX-PER-EM SPACE */
4071 case 0x2007: /* FIGURE SPACE */
4072 case 0x2008: /* PUNCTUATION SPACE */
4073 case 0x2009: /* THIN SPACE */
4074 case 0x200A: /* HAIR SPACE */
4075 case 0x202f: /* NARROW NO-BREAK SPACE */
4076 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4077 case 0x3000: /* IDEOGRAPHIC SPACE */
4078 MRRETURN(MATCH_NOMATCH);
4079 }
4080 }
4081 break;
4082
4083 case OP_HSPACE:
4084 for (i = 1; i <= min; i++)
4085 {
4086 if (eptr >= md->end_subject)
4087 {
4088 SCHECK_PARTIAL();
4089 MRRETURN(MATCH_NOMATCH);
4090 }
4091 GETCHARINC(c, eptr);
4092 switch(c)
4093 {
4094 default: MRRETURN(MATCH_NOMATCH);
4095 case 0x09: /* HT */
4096 case 0x20: /* SPACE */
4097 case 0xa0: /* NBSP */
4098 case 0x1680: /* OGHAM SPACE MARK */
4099 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4100 case 0x2000: /* EN QUAD */
4101 case 0x2001: /* EM QUAD */
4102 case 0x2002: /* EN SPACE */
4103 case 0x2003: /* EM SPACE */
4104 case 0x2004: /* THREE-PER-EM SPACE */
4105 case 0x2005: /* FOUR-PER-EM SPACE */
4106 case 0x2006: /* SIX-PER-EM SPACE */
4107 case 0x2007: /* FIGURE SPACE */
4108 case 0x2008: /* PUNCTUATION SPACE */
4109 case 0x2009: /* THIN SPACE */
4110 case 0x200A: /* HAIR SPACE */
4111 case 0x202f: /* NARROW NO-BREAK SPACE */
4112 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4113 case 0x3000: /* IDEOGRAPHIC SPACE */
4114 break;
4115 }
4116 }
4117 break;
4118
4119 case OP_NOT_VSPACE:
4120 for (i = 1; i <= min; i++)
4121 {
4122 if (eptr >= md->end_subject)
4123 {
4124 SCHECK_PARTIAL();
4125 MRRETURN(MATCH_NOMATCH);
4126 }
4127 GETCHARINC(c, eptr);
4128 switch(c)
4129 {
4130 default: break;
4131 case 0x0a: /* LF */
4132 case 0x0b: /* VT */
4133 case 0x0c: /* FF */
4134 case 0x0d: /* CR */
4135 case 0x85: /* NEL */
4136 case 0x2028: /* LINE SEPARATOR */
4137 case 0x2029: /* PARAGRAPH SEPARATOR */
4138 MRRETURN(MATCH_NOMATCH);
4139 }
4140 }
4141 break;
4142
4143 case OP_VSPACE:
4144 for (i = 1; i <= min; i++)
4145 {
4146 if (eptr >= md->end_subject)
4147 {
4148 SCHECK_PARTIAL();
4149 MRRETURN(MATCH_NOMATCH);
4150 }
4151 GETCHARINC(c, eptr);
4152 switch(c)
4153 {
4154 default: MRRETURN(MATCH_NOMATCH);
4155 case 0x0a: /* LF */
4156 case 0x0b: /* VT */
4157 case 0x0c: /* FF */
4158 case 0x0d: /* CR */
4159 case 0x85: /* NEL */
4160 case 0x2028: /* LINE SEPARATOR */
4161 case 0x2029: /* PARAGRAPH SEPARATOR */
4162 break;
4163 }
4164 }
4165 break;
4166
4167 case OP_NOT_DIGIT:
4168 for (i = 1; i <= min; i++)
4169 {
4170 if (eptr >= md->end_subject)
4171 {
4172 SCHECK_PARTIAL();
4173 MRRETURN(MATCH_NOMATCH);
4174 }
4175 GETCHARINC(c, eptr);
4176 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4177 MRRETURN(MATCH_NOMATCH);
4178 }
4179 break;
4180
4181 case OP_DIGIT:
4182 for (i = 1; i <= min; i++)
4183 {
4184 if (eptr >= md->end_subject)
4185 {
4186 SCHECK_PARTIAL();
4187 MRRETURN(MATCH_NOMATCH);
4188 }
4189 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4190 MRRETURN(MATCH_NOMATCH);
4191 /* No need to skip more bytes - we know it's a 1-byte character */
4192 }
4193 break;
4194
4195 case OP_NOT_WHITESPACE:
4196 for (i = 1; i <= min; i++)
4197 {
4198 if (eptr >= md->end_subject)
4199 {
4200 SCHECK_PARTIAL();
4201 MRRETURN(MATCH_NOMATCH);
4202 }
4203 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4204 MRRETURN(MATCH_NOMATCH);
4205 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4206 }
4207 break;
4208
4209 case OP_WHITESPACE:
4210 for (i = 1; i <= min; i++)
4211 {
4212 if (eptr >= md->end_subject)
4213 {
4214 SCHECK_PARTIAL();
4215 MRRETURN(MATCH_NOMATCH);
4216 }
4217 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4218 MRRETURN(MATCH_NOMATCH);
4219 /* No need to skip more bytes - we know it's a 1-byte character */
4220 }
4221 break;
4222
4223 case OP_NOT_WORDCHAR:
4224 for (i = 1; i <= min; i++)
4225 {
4226 if (eptr >= md->end_subject)
4227 {
4228 SCHECK_PARTIAL();
4229 MRRETURN(MATCH_NOMATCH);
4230 }
4231 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4232 MRRETURN(MATCH_NOMATCH);
4233 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4234 }
4235 break;
4236
4237 case OP_WORDCHAR:
4238 for (i = 1; i <= min; i++)
4239 {
4240 if (eptr >= md->end_subject)
4241 {
4242 SCHECK_PARTIAL();
4243 MRRETURN(MATCH_NOMATCH);
4244 }
4245 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4246 MRRETURN(MATCH_NOMATCH);
4247 /* No need to skip more bytes - we know it's a 1-byte character */
4248 }
4249 break;
4250
4251 default:
4252 RRETURN(PCRE_ERROR_INTERNAL);
4253 } /* End switch(ctype) */
4254
4255 else
4256 #endif /* SUPPORT_UTF8 */
4257
4258 /* Code for the non-UTF-8 case for minimum matching of operators other
4259 than OP_PROP and OP_NOTPROP. */
4260
4261 switch(ctype)
4262 {
4263 case OP_ANY:
4264 for (i = 1; i <= min; i++)
4265 {
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 MRRETURN(MATCH_NOMATCH);
4270 }
4271 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4272 eptr++;
4273 }
4274 break;
4275
4276 case OP_ALLANY:
4277 if (eptr > md->end_subject - min)
4278 {
4279 SCHECK_PARTIAL();
4280 MRRETURN(MATCH_NOMATCH);
4281 }
4282 eptr += min;
4283 break;
4284
4285 case OP_ANYBYTE:
4286 if (eptr > md->end_subject - min)
4287 {
4288 SCHECK_PARTIAL();
4289 MRRETURN(MATCH_NOMATCH);
4290 }
4291 eptr += min;
4292 break;
4293
4294 case OP_ANYNL:
4295 for (i = 1; i <= min; i++)
4296 {
4297 if (eptr >= md->end_subject)
4298 {
4299 SCHECK_PARTIAL();
4300 MRRETURN(MATCH_NOMATCH);
4301 }
4302 switch(*eptr++)
4303 {
4304 default: MRRETURN(MATCH_NOMATCH);
4305
4306 case 0x000d:
4307 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4308 break;
4309
4310 case 0x000a:
4311 break;
4312
4313 case 0x000b:
4314 case 0x000c:
4315 case 0x0085:
4316 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4317 break;
4318 }
4319 }
4320 break;
4321
4322 case OP_NOT_HSPACE:
4323 for (i = 1; i <= min; i++)
4324 {
4325 if (eptr >= md->end_subject)
4326 {
4327 SCHECK_PARTIAL();
4328 MRRETURN(MATCH_NOMATCH);
4329 }
4330 switch(*eptr++)
4331 {
4332 default: break;
4333 case 0x09: /* HT */
4334 case 0x20: /* SPACE */
4335 case 0xa0: /* NBSP */
4336 MRRETURN(MATCH_NOMATCH);
4337 }
4338 }
4339 break;
4340
4341 case OP_HSPACE:
4342 for (i = 1; i <= min; i++)
4343 {
4344 if (eptr >= md->end_subject)
4345 {
4346 SCHECK_PARTIAL();
4347 MRRETURN(MATCH_NOMATCH);
4348 }
4349 switch(*eptr++)
4350 {
4351 default: MRRETURN(MATCH_NOMATCH);
4352 case 0x09: /* HT */
4353 case 0x20: /* SPACE */
4354 case 0xa0: /* NBSP */
4355 break;
4356 }
4357 }
4358 break;
4359
4360 case OP_NOT_VSPACE:
4361 for (i = 1; i <= min; i++)
4362 {
4363 if (eptr >= md->end_subject)
4364 {
4365 SCHECK_PARTIAL();
4366 MRRETURN(MATCH_NOMATCH);
4367 }
4368 switch(*eptr++)
4369 {
4370 default: break;
4371 case 0x0a: /* LF */
4372 case 0x0b: /* VT */
4373 case 0x0c: /* FF */
4374 case 0x0d: /* CR */
4375 case 0x85: /* NEL */
4376 MRRETURN(MATCH_NOMATCH);
4377 }
4378 }
4379 break;
4380
4381 case OP_VSPACE:
4382 for (i = 1; i <= min; i++)
4383 {
4384 if (eptr >= md->end_subject)
4385 {
4386 SCHECK_PARTIAL();
4387 MRRETURN(MATCH_NOMATCH);
4388 }
4389 switch(*eptr++)
4390 {
4391 default: MRRETURN(MATCH_NOMATCH);
4392 case 0x0a: /* LF */
4393 case 0x0b: /* VT */
4394 case 0x0c: /* FF */
4395 case 0x0d: /* CR */
4396 case 0x85: /* NEL */
4397 break;
4398 }
4399 }
4400 break;
4401
4402 case OP_NOT_DIGIT:
4403 for (i = 1; i <= min; i++)
4404 {
4405 if (eptr >= md->end_subject)
4406 {
4407 SCHECK_PARTIAL();
4408 MRRETURN(MATCH_NOMATCH);
4409 }
4410 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4411 }
4412 break;
4413
4414 case OP_DIGIT:
4415 for (i = 1; i <= min; i++)
4416 {
4417 if (eptr >= md->end_subject)
4418 {
4419 SCHECK_PARTIAL();
4420 MRRETURN(MATCH_NOMATCH);
4421 }
4422 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4423 }
4424 break;
4425
4426 case OP_NOT_WHITESPACE:
4427 for (i = 1; i <= min; i++)
4428 {
4429 if (eptr >= md->end_subject)
4430 {
4431 SCHECK_PARTIAL();
4432 MRRETURN(MATCH_NOMATCH);
4433 }
4434 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4435 }
4436 break;
4437
4438 case OP_WHITESPACE:
4439 for (i = 1; i <= min; i++)
4440 {
4441 if (eptr >= md->end_subject)
4442 {
4443 SCHECK_PARTIAL();
4444 MRRETURN(MATCH_NOMATCH);
4445 }
4446 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4447 }
4448 break;
4449
4450 case OP_NOT_WORDCHAR:
4451 for (i = 1; i <= min; i++)
4452 {
4453 if (eptr >= md->end_subject)
4454 {
4455 SCHECK_PARTIAL();
4456 MRRETURN(MATCH_NOMATCH);
4457 }
4458 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4459 MRRETURN(MATCH_NOMATCH);
4460 }
4461 break;
4462
4463 case OP_WORDCHAR:
4464 for (i = 1; i <= min; i++)
4465 {
4466 if (eptr >= md->end_subject)
4467 {
4468 SCHECK_PARTIAL();
4469 MRRETURN(MATCH_NOMATCH);
4470 }
4471 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4472 MRRETURN(MATCH_NOMATCH);
4473 }
4474 break;
4475
4476 default:
4477 RRETURN(PCRE_ERROR_INTERNAL);
4478 }
4479 }
4480
4481 /* If min = max, continue at the same level without recursing */
4482
4483 if (min == max) continue;
4484
4485 /* If minimizing, we have to test the rest of the pattern before each
4486 subsequent match. Again, separate the UTF-8 case for speed, and also
4487 separate the UCP cases. */
4488
4489 if (minimize)
4490 {
4491 #ifdef SUPPORT_UCP
4492 if (prop_type >= 0)
4493 {
4494 switch(prop_type)
4495 {
4496 case PT_ANY:
4497 for (fi = min;; fi++)
4498 {
4499 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4500 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4501 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4502 if (eptr >= md->end_subject)
4503 {
4504 SCHECK_PARTIAL();
4505 MRRETURN(MATCH_NOMATCH);
4506 }
4507 GETCHARINCTEST(c, eptr);
4508 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4509 }
4510 /* Control never gets here */
4511
4512 case PT_LAMP:
4513 for (fi = min;; fi++)
4514 {
4515 int chartype;
4516 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4517 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4518 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4519 if (eptr >= md->end_subject)
4520 {
4521 SCHECK_PARTIAL();
4522 MRRETURN(MATCH_NOMATCH);
4523 }
4524 GETCHARINCTEST(c, eptr);
4525 chartype = UCD_CHARTYPE(c);
4526 if ((chartype == ucp_Lu ||
4527 chartype == ucp_Ll ||
4528 chartype == ucp_Lt) == prop_fail_result)
4529 MRRETURN(MATCH_NOMATCH);
4530 }
4531 /* Control never gets here */
4532
4533 case PT_GC:
4534 for (fi = min;; fi++)
4535 {
4536 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4538 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4539 if (eptr >= md->end_subject)
4540 {
4541 SCHECK_PARTIAL();
4542 MRRETURN(MATCH_NOMATCH);
4543 }
4544 GETCHARINCTEST(c, eptr);
4545 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4546 MRRETURN(MATCH_NOMATCH);
4547 }
4548 /* Control never gets here */
4549
4550 case PT_PC:
4551 for (fi = min;; fi++)
4552 {
4553 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4554 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4555 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4556 if (eptr >= md->end_subject)
4557 {
4558 SCHECK_PARTIAL();
4559 MRRETURN(MATCH_NOMATCH);
4560 }
4561 GETCHARINCTEST(c, eptr);
4562 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4563 MRRETURN(MATCH_NOMATCH);
4564 }
4565 /* Control never gets here */
4566
4567 case PT_SC:
4568 for (fi = min;; fi++)
4569 {
4570 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4571 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4572 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4573 if (eptr >= md->end_subject)
4574 {
4575 SCHECK_PARTIAL();
4576 MRRETURN(MATCH_NOMATCH);
4577 }
4578 GETCHARINCTEST(c, eptr);
4579 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4580 MRRETURN(MATCH_NOMATCH);
4581 }
4582 /* Control never gets here */
4583
4584 case PT_ALNUM:
4585 for (fi = min;; fi++)
4586 {
4587 int category;
4588 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4590 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4591 if (eptr >= md->end_subject)
4592 {
4593 SCHECK_PARTIAL();
4594 MRRETURN(MATCH_NOMATCH);
4595 }
4596 GETCHARINCTEST(c, eptr);
4597 category = UCD_CATEGORY(c);
4598 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4599 MRRETURN(MATCH_NOMATCH);
4600 }
4601 /* Control never gets here */
4602
4603 case PT_SPACE: /* Perl space */
4604 for (fi = min;; fi++)
4605 {
4606 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4608 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4609 if (eptr >= md->end_subject)
4610 {
4611 SCHECK_PARTIAL();
4612 MRRETURN(MATCH_NOMATCH);
4613 }
4614 GETCHARINCTEST(c, eptr);
4615 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4616 c == CHAR_FF || c == CHAR_CR)
4617 == prop_fail_result)
4618 MRRETURN(MATCH_NOMATCH);
4619 }
4620 /* Control never gets here */
4621
4622 case PT_PXSPACE: /* POSIX space */
4623 for (fi = min;; fi++)
4624 {
4625 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4627 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4628 if (eptr >= md->end_subject)
4629 {
4630 SCHECK_PARTIAL();
4631 MRRETURN(MATCH_NOMATCH);
4632 }
4633 GETCHARINCTEST(c, eptr);
4634 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4635 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4636 == prop_fail_result)
4637 MRRETURN(MATCH_NOMATCH);
4638 }
4639 /* Control never gets here */
4640
4641 case PT_WORD:
4642 for (fi = min;; fi++)
4643 {
4644 int category;
4645 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4646 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4647 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4648 if (eptr >= md->end_subject)
4649 {
4650 SCHECK_PARTIAL();
4651 MRRETURN(MATCH_NOMATCH);
4652 }
4653 GETCHARINCTEST(c, eptr);
4654 category = UCD_CATEGORY(c);
4655 if ((category == ucp_L ||
4656 category == ucp_N ||
4657 c == CHAR_UNDERSCORE)
4658 == prop_fail_result)
4659 MRRETURN(MATCH_NOMATCH);
4660 }
4661 /* Control never gets here */
4662
4663 /* This should never occur */
4664
4665 default:
4666 RRETURN(PCRE_ERROR_INTERNAL);
4667 }
4668 }
4669
4670 /* Match extended Unicode sequences. We will get here only if the
4671 support is in the binary; otherwise a compile-time error occurs. */
4672
4673 else if (ctype == OP_EXTUNI)
4674 {
4675 for (fi = min;; fi++)
4676 {
4677 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4679 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4680 if (eptr >= md->end_subject)
4681 {
4682 SCHECK_PARTIAL();
4683 MRRETURN(MATCH_NOMATCH);
4684 }
4685 GETCHARINCTEST(c, eptr);
4686 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4687 while (eptr < md->end_subject)
4688 {
4689 int len = 1;
4690 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4691 if (UCD_CATEGORY(c) != ucp_M) break;
4692 eptr += len;
4693 }
4694 }
4695 }
4696 else
4697 #endif /* SUPPORT_UCP */
4698
4699 #ifdef SUPPORT_UTF8
4700 /* UTF-8 mode */
4701 if (utf8)
4702 {
4703 for (fi = min;; fi++)
4704 {
4705 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4706 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4707 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4708 if (eptr >= md->end_subject)
4709 {
4710 SCHECK_PARTIAL();
4711 MRRETURN(MATCH_NOMATCH);
4712 }
4713 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4714 MRRETURN(MATCH_NOMATCH);
4715 GETCHARINC(c, eptr);
4716 switch(ctype)
4717 {
4718 case OP_ANY: /* This is the non-NL case */
4719 case OP_ALLANY:
4720 case OP_ANYBYTE:
4721 break;
4722
4723 case OP_ANYNL:
4724 switch(c)
4725 {
4726 default: MRRETURN(MATCH_NOMATCH);
4727 case 0x000d:
4728 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4729 break;
4730 case 0x000a:
4731 break;
4732
4733 case 0x000b:
4734 case 0x000c:
4735 case 0x0085:
4736 case 0x2028:
4737 case 0x2029:
4738 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4739 break;
4740 }
4741 break;
4742
4743 case OP_NOT_HSPACE:
4744 switch(c)
4745 {
4746 default: break;
4747 case 0x09: /* HT */
4748 case 0x20: /* SPACE */
4749 case 0xa0: /* NBSP */
4750 case 0x1680: /* OGHAM SPACE MARK */
4751 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4752 case 0x2000: /* EN QUAD */
4753 case 0x2001: /* EM QUAD */
4754 case 0x2002: /* EN SPACE */
4755 case 0x2003: /* EM SPACE */
4756 case 0x2004: /* THREE-PER-EM SPACE */
4757 case 0x2005: /* FOUR-PER-EM SPACE */
4758 case 0x2006: /* SIX-PER-EM SPACE */
4759 case 0x2007: /* FIGURE SPACE */
4760 case 0x2008: /* PUNCTUATION SPACE */
4761 case 0x2009: /* THIN SPACE */
4762 case 0x200A: /* HAIR SPACE */
4763 case 0x202f: /* NARROW NO-BREAK SPACE */
4764 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4765 case 0x3000: /* IDEOGRAPHIC SPACE */
4766 MRRETURN(MATCH_NOMATCH);
4767 }
4768 break;
4769
4770 case OP_HSPACE:
4771 switch(c)
4772 {
4773 default: MRRETURN(MATCH_NOMATCH);
4774 case 0x09: /* HT */
4775 case 0x20: /* SPACE */
4776 case 0xa0: /* NBSP */
4777 case 0x1680: /* OGHAM SPACE MARK */
4778 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4779 case 0x2000: /* EN QUAD */
4780 case 0x2001: /* EM QUAD */
4781 case 0x2002: /* EN SPACE */
4782 case 0x2003: /* EM SPACE */
4783 case 0x2004: /* THREE-PER-EM SPACE */
4784 case 0x2005: /* FOUR-PER-EM SPACE */
4785 case 0x2006: /* SIX-PER-EM SPACE */
4786 case 0x2007: /* FIGURE SPACE */
4787 case 0x2008: /* PUNCTUATION SPACE */
4788 case 0x2009: /* THIN SPACE */
4789 case 0x200A: /* HAIR SPACE */
4790 case 0x202f: /* NARROW NO-BREAK SPACE */
4791 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4792 case 0x3000: /* IDEOGRAPHIC SPACE */
4793 break;
4794 }
4795 break;
4796
4797 case OP_NOT_VSPACE:
4798 switch(c)
4799 {
4800 default: break;
4801 case 0x0a: /* LF */
4802 case 0x0b: /* VT */
4803 case 0x0c: /* FF */
4804 case 0x0d: /* CR */
4805 case 0x85: /* NEL */
4806 case 0x2028: /* LINE SEPARATOR */
4807 case 0x2029: /* PARAGRAPH SEPARATOR */
4808 MRRETURN(MATCH_NOMATCH);
4809 }
4810 break;
4811
4812 case OP_VSPACE:
4813 switch(c)
4814 {
4815 default: MRRETURN(MATCH_NOMATCH);
4816 case 0x0a: /* LF */
4817 case 0x0b: /* VT */
4818 case 0x0c: /* FF */
4819 case 0x0d: /* CR */
4820 case 0x85: /* NEL */
4821 case 0x2028: /* LINE SEPARATOR */
4822 case 0x2029: /* PARAGRAPH SEPARATOR */
4823 break;
4824 }
4825 break;
4826
4827 case OP_NOT_DIGIT:
4828 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4829 MRRETURN(MATCH_NOMATCH);
4830 break;
4831
4832 case OP_DIGIT:
4833 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4834 MRRETURN(MATCH_NOMATCH);
4835 break;
4836
4837 case OP_NOT_WHITESPACE:
4838 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4839 MRRETURN(MATCH_NOMATCH);
4840 break;
4841
4842 case OP_WHITESPACE:
4843 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4844 MRRETURN(MATCH_NOMATCH);
4845 break;
4846
4847 case OP_NOT_WORDCHAR:
4848 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4849 MRRETURN(MATCH_NOMATCH);
4850 break;
4851
4852 case OP_WORDCHAR:
4853 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4854 MRRETURN(MATCH_NOMATCH);
4855 break;
4856
4857 default:
4858 RRETURN(PCRE_ERROR_INTERNAL);
4859 }
4860 }
4861 }
4862 else
4863 #endif
4864 /* Not UTF-8 mode */
4865 {
4866 for (fi = min;; fi++)
4867 {
4868 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4869 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4870 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4871 if (eptr >= md->end_subject)
4872 {
4873 SCHECK_PARTIAL();
4874 MRRETURN(MATCH_NOMATCH);
4875 }
4876 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4877 MRRETURN(MATCH_NOMATCH);
4878 c = *eptr++;
4879 switch(ctype)
4880 {
4881 case OP_ANY: /* This is the non-NL case */
4882 case OP_ALLANY:
4883 case OP_ANYBYTE:
4884 break;
4885
4886 case OP_ANYNL:
4887 switch(c)
4888 {
4889 default: MRRETURN(MATCH_NOMATCH);
4890 case 0x000d:
4891 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4892 break;
4893
4894 case 0x000a:
4895 break;
4896
4897 case 0x000b:
4898 case 0x000c:
4899 case 0x0085:
4900 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4901 break;
4902 }
4903 break;
4904
4905 case OP_NOT_HSPACE:
4906 switch(c)
4907 {
4908 default: break;
4909 case 0x09: /* HT */
4910 case 0x20: /* SPACE */
4911 case 0xa0: /* NBSP */
4912 MRRETURN(MATCH_NOMATCH);
4913 }
4914 break;
4915
4916 case OP_HSPACE:
4917 switch(c)
4918 {
4919 default: MRRETURN(MATCH_NOMATCH);
4920 case 0x09: /* HT */
4921 case 0x20: /* SPACE */
4922 case 0xa0: /* NBSP */
4923 break;
4924 }
4925 break;
4926
4927 case OP_NOT_VSPACE:
4928 switch(c)
4929 {
4930 default: break;
4931 case 0x0a: /* LF */
4932 case 0x0b: /* VT */
4933 case 0x0c: /* FF */
4934 case 0x0d: /* CR */
4935 case 0x85: /* NEL */
4936 MRRETURN(MATCH_NOMATCH);
4937 }
4938 break;
4939
4940 case OP_VSPACE:
4941 switch(c)
4942 {
4943 default: MRRETURN(MATCH_NOMATCH);
4944 case 0x0a: /* LF */
4945 case 0x0b: /* VT */
4946 case 0x0c: /* FF */
4947 case 0x0d: /* CR */
4948 case 0x85: /* NEL */
4949 break;
4950 }
4951 break;
4952
4953 case OP_NOT_DIGIT:
4954 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4955 break;
4956
4957 case OP_DIGIT:
4958 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4959 break;
4960
4961 case OP_NOT_WHITESPACE:
4962 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4963 break;
4964
4965 case OP_WHITESPACE:
4966 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4967 break;
4968
4969 case OP_NOT_WORDCHAR:
4970 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4971 break;
4972
4973 case OP_WORDCHAR:
4974 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4975 break;
4976
4977 default:
4978 RRETURN(PCRE_ERROR_INTERNAL);
4979 }
4980 }
4981 }
4982 /* Control never gets here */
4983 }
4984
4985 /* If maximizing, it is worth using inline code for speed, doing the type
4986 test once at the start (i.e. keep it out of the loop). Again, keep the
4987 UTF-8 and UCP stuff separate. */
4988
4989 else
4990 {
4991 pp = eptr; /* Remember where we started */
4992
4993 #ifdef SUPPORT_UCP
4994 if (prop_type >= 0)
4995 {
4996 switch(prop_type)
4997 {
4998 case PT_ANY:
4999 for (i = min; i < max; i++)
5000 {
5001 int len = 1;
5002 if (eptr >= md->end_subject)
5003 {
5004 SCHECK_PARTIAL();
5005 break;
5006 }
5007 GETCHARLENTEST(c, eptr, len);
5008 if (prop_fail_result) break;
5009 eptr+= len;
5010 }
5011 break;
5012
5013 case PT_LAMP:
5014 for (i = min; i < max; i++)
5015 {
5016 int chartype;
5017 int len = 1;
5018 if (eptr >= md->end_subject)
5019 {
5020 SCHECK_PARTIAL();
5021 break;
5022 }
5023 GETCHARLENTEST(c, eptr, len);
5024 chartype = UCD_CHARTYPE(c);
5025 if ((chartype == ucp_Lu ||
5026 chartype == ucp_Ll ||
5027 chartype == ucp_Lt) == prop_fail_result)
5028 break;
5029 eptr+= len;
5030 }
5031 break;
5032
5033 case PT_GC:
5034 for (i = min; i < max; i++)
5035 {
5036 int len = 1;
5037 if (eptr >= md->end_subject)
5038 {
5039 SCHECK_PARTIAL();
5040 break;
5041 }
5042 GETCHARLENTEST(c, eptr, len);
5043 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5044 eptr+= len;
5045 }
5046 break;
5047
5048 case PT_PC:
5049 for (i = min; i < max; i++)
5050 {
5051 int len = 1;
5052 if (eptr >= md->end_subject)
5053 {
5054 SCHECK_PARTIAL();
5055 break;
5056 }
5057 GETCHARLENTEST(c, eptr, len);
5058 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5059 eptr+= len;
5060 }
5061 break;
5062
5063 case PT_SC:
5064 for (i = min; i < max; i++)
5065 {
5066 int len = 1;
5067 if (eptr >= md->end_subject)
5068 {
5069 SCHECK_PARTIAL();
5070 break;
5071 }
5072 GETCHARLENTEST(c, eptr, len);
5073 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5074 eptr+= len;
5075 }
5076 break;
5077
5078 case PT_ALNUM:
5079 for (i = min; i < max; i++)
5080 {
5081 int category;
5082 int len = 1;
5083 if (eptr >= md->end_subject)
5084 {
5085 SCHECK_PARTIAL();
5086 break;
5087 }
5088 GETCHARLENTEST(c, eptr, len);
5089 category = UCD_CATEGORY(c);
5090 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5091 break;
5092 eptr+= len;
5093 }
5094 break;
5095
5096 case PT_SPACE: /* Perl space */
5097 for (i = min; i < max; i++)
5098 {
5099 int len = 1;
5100 if (eptr >= md->end_subject)
5101 {
5102 SCHECK_PARTIAL();
5103 break;
5104 }
5105 GETCHARLENTEST(c, eptr, len);
5106 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5107 c == CHAR_FF || c == CHAR_CR)
5108 == prop_fail_result)
5109 break;
5110 eptr+= len;
5111 }
5112 break;
5113
5114 case PT_PXSPACE: /* POSIX space */
5115 for (i = min; i < max; i++)
5116 {
5117 int len = 1;
5118 if (eptr >= md->end_subject)
5119 {
5120 SCHECK_PARTIAL();
5121 break;
5122 }
5123 GETCHARLENTEST(c, eptr, len);
5124 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5125 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5126 == prop_fail_result)
5127 break;
5128 eptr+= len;
5129 }
5130 break;
5131
5132 case PT_WORD:
5133 for (i = min; i < max; i++)
5134 {
5135 int category;
5136 int len = 1;
5137 if (eptr >= md->end_subject)
5138 {
5139 SCHECK_PARTIAL();
5140 break;
5141 }
5142 GETCHARLENTEST(c, eptr, len);
5143 category = UCD_CATEGORY(c);
5144 if ((category == ucp_L || category == ucp_N ||
5145 c == CHAR_UNDERSCORE) == prop_fail_result)
5146 break;
5147 eptr+= len;
5148 }
5149 break;
5150
5151 default:
5152 RRETURN(PCRE_ERROR_INTERNAL);
5153 }
5154
5155 /* eptr is now past the end of the maximum run */
5156
5157 if (possessive) continue;
5158 for(;;)
5159 {
5160 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5162 if (eptr-- == pp) break; /* Stop if tried at original pos */
5163 if (utf8) BACKCHAR(eptr);
5164 }
5165 }
5166
5167 /* Match extended Unicode sequences. We will get here only if the
5168 support is in the binary; otherwise a compile-time error occurs. */
5169
5170 else if (ctype == OP_EXTUNI)
5171 {
5172 for (i = min; i < max; i++)
5173 {
5174 int len = 1;
5175 if (eptr >= md->end_subject)
5176 {
5177 SCHECK_PARTIAL();
5178 break;
5179 }
5180 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5181 if (UCD_CATEGORY(c) == ucp_M) break;
5182 eptr += len;
5183 while (eptr < md->end_subject)
5184 {
5185 len = 1;
5186 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5187 if (UCD_CATEGORY(c) != ucp_M) break;
5188 eptr += len;
5189 }
5190 }
5191
5192 /* eptr is now past the end of the maximum run */
5193
5194 if (possessive) continue;
5195
5196 for(;;)
5197 {
5198 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5199 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5200 if (eptr-- == pp) break; /* Stop if tried at original pos */
5201 for (;;) /* Move back over one extended */
5202 {
5203 if (!utf8) c = *eptr; else
5204 {
5205 BACKCHAR(eptr);
5206 GETCHAR(c, eptr);
5207 }
5208 if (UCD_CATEGORY(c) != ucp_M) break;
5209 eptr--;
5210 }
5211 }
5212 }
5213
5214 else
5215 #endif /* SUPPORT_UCP */
5216
5217 #ifdef SUPPORT_UTF8
5218 /* UTF-8 mode */
5219
5220 if (utf8)
5221 {
5222 switch(ctype)
5223 {
5224 case OP_ANY:
5225 if (max < INT_MAX)
5226 {
5227 for (i = min; i < max; i++)
5228 {
5229 if (eptr >= md->end_subject)
5230 {
5231 SCHECK_PARTIAL();
5232 break;
5233 }
5234 if (IS_NEWLINE(eptr)) break;
5235 eptr++;
5236 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5237 }
5238 }
5239
5240 /* Handle unlimited UTF-8 repeat */
5241
5242 else
5243 {
5244 for (i = min; i < max; i++)
5245 {
5246 if (eptr >= md->end_subject)
5247 {
5248 SCHECK_PARTIAL();
5249 break;
5250 }
5251 if (IS_NEWLINE(eptr)) break;
5252 eptr++;
5253 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5254 }
5255 }
5256 break;
5257
5258 case OP_ALLANY:
5259 if (max < INT_MAX)
5260 {
5261 for (i = min; i < max; i++)
5262 {
5263 if (eptr >= md->end_subject)
5264 {
5265 SCHECK_PARTIAL();
5266 break;
5267 }
5268 eptr++;
5269 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5270 }
5271 }
5272 else
5273 {
5274 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5275 SCHECK_PARTIAL();
5276 }
5277 break;
5278
5279 /* The byte case is the same as non-UTF8 */
5280
5281 case OP_ANYBYTE:
5282 c = max - min;
5283 if (c > (unsigned int)(md->end_subject - eptr))
5284 {
5285 eptr = md->end_subject;
5286 SCHECK_PARTIAL();
5287 }
5288 else eptr += c;
5289 break;
5290
5291 case OP_ANYNL:
5292 for (i = min; i < max; i++)
5293 {
5294 int len = 1;
5295 if (eptr >= md->end_subject)
5296 {
5297 SCHECK_PARTIAL();
5298 break;
5299 }
5300 GETCHARLEN(c, eptr, len);
5301 if (c == 0x000d)
5302 {
5303 if (++eptr >= md->end_subject) break;
5304 if (*eptr == 0x000a) eptr++;
5305 }
5306 else
5307 {
5308 if (c != 0x000a &&
5309 (md->bsr_anycrlf ||
5310 (c != 0x000b && c != 0x000c &&
5311 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5312 break;
5313 eptr += len;
5314 }
5315 }
5316 break;
5317
5318 case OP_NOT_HSPACE:
5319 case OP_HSPACE:
5320 for (i = min; i < max; i++)
5321 {
5322 BOOL gotspace;
5323 int len = 1;
5324 if (eptr >= md->end_subject)
5325 {
5326 SCHECK_PARTIAL();
5327 break;
5328 }
5329 GETCHARLEN(c, eptr, len);
5330 switch(c)
5331 {
5332 default: gotspace = FALSE; break;
5333 case 0x09: /* HT */
5334 case 0x20: /* SPACE */
5335 case 0xa0: /* NBSP */
5336 case 0x1680: /* OGHAM SPACE MARK */
5337 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5338 case 0x2000: /* EN QUAD */
5339 case 0x2001: /* EM QUAD */
5340 case 0x2002: /* EN SPACE */
5341 case 0x2003: /* EM SPACE */
5342 case 0x2004: /* THREE-PER-EM SPACE */
5343 case 0x2005: /* FOUR-PER-EM SPACE */
5344 case 0x2006: /* SIX-PER-EM SPACE */
5345 case 0x2007: /* FIGURE SPACE */
5346 case 0x2008: /* PUNCTUATION SPACE */
5347 case 0x2009: /* THIN SPACE */
5348 case 0x200A: /* HAIR SPACE */
5349 case 0x202f: /* NARROW NO-BREAK SPACE */
5350 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5351 case 0x3000: /* IDEOGRAPHIC SPACE */
5352 gotspace = TRUE;
5353 break;
5354 }
5355 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5356 eptr += len;
5357 }
5358 break;
5359
5360 case OP_NOT_VSPACE:
5361 case OP_VSPACE:
5362 for (i = min; i < max; i++)
5363 {
5364 BOOL gotspace;
5365 int len = 1;
5366 if (eptr >= md->end_subject)
5367 {
5368 SCHECK_PARTIAL();
5369 break;
5370 }
5371 GETCHARLEN(c, eptr, len);
5372 switch(c)
5373 {
5374 default: gotspace = FALSE; break;
5375 case 0x0a: /* LF */
5376 case 0x0b: /* VT */
5377 case 0x0c: /* FF */
5378 case 0x0d: /* CR */
5379 case 0x85: /* NEL */
5380 case 0x2028: /* LINE SEPARATOR */
5381 case 0x2029: /* PARAGRAPH SEPARATOR */
5382 gotspace = TRUE;
5383 break;
5384 }
5385 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5386 eptr += len;
5387 }
5388 break;
5389
5390 case OP_NOT_DIGIT:
5391 for (i = min; i < max; i++)
5392 {
5393 int len = 1;
5394 if (eptr >= md->end_subject)
5395 {
5396 SCHECK_PARTIAL();
5397 break;
5398 }
5399 GETCHARLEN(c, eptr, len);
5400 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5401 eptr+= len;
5402 }
5403 break;
5404
5405 case OP_DIGIT:
5406 for (i = min; i < max; i++)
5407 {
5408 int len = 1;
5409 if (eptr >= md->end_subject)
5410 {
5411 SCHECK_PARTIAL();
5412 break;
5413 }
5414 GETCHARLEN(c, eptr, len);
5415 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5416 eptr+= len;
5417 }
5418 break;
5419
5420 case OP_NOT_WHITESPACE:
5421 for (i = min; i < max; i++)
5422 {
5423 int len = 1;
5424 if (eptr >= md->end_subject)
5425 {
5426 SCHECK_PARTIAL();
5427 break;
5428 }
5429 GETCHARLEN(c, eptr, len);
5430 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5431 eptr+= len;
5432 }
5433 break;
5434
5435 case OP_WHITESPACE:
5436 for (i = min; i < max; i++)
5437 {
5438 int len = 1;
5439 if (eptr >= md->end_subject)
5440 {
5441 SCHECK_PARTIAL();
5442 break;
5443 }
5444 GETCHARLEN(c, eptr, len);
5445 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5446 eptr+= len;
5447 }
5448 break;
5449
5450 case OP_NOT_WORDCHAR:
5451 for (i = min; i < max; i++)
5452 {
5453 int len = 1;
5454 if (eptr >= md->end_subject)
5455 {
5456 SCHECK_PARTIAL();
5457 break;
5458 }
5459 GETCHARLEN(c, eptr, len);
5460 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5461 eptr+= len;
5462 }
5463 break;
5464
5465 case OP_WORDCHAR:
5466 for (i = min; i < max; i++)
5467 {
5468 int len = 1;
5469 if (eptr >= md->end_subject)
5470 {
5471 SCHECK_PARTIAL();
5472 break;
5473 }
5474 GETCHARLEN(c, eptr, len);
5475 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5476 eptr+= len;
5477 }
5478 break;
5479
5480 default:
5481 RRETURN(PCRE_ERROR_INTERNAL);
5482 }
5483
5484 /* eptr is now past the end of the maximum run. If possessive, we are
5485 done (no backing up). Otherwise, match at this position; anything other
5486 than no match is immediately returned. For nomatch, back up one
5487 character, unless we are matching \R and the last thing matched was
5488 \r\n, in which case, back up two bytes. */
5489
5490 if (possessive) continue;
5491 for(;;)
5492 {
5493 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5495 if (eptr-- == pp) break; /* Stop if tried at original pos */
5496 BACKCHAR(eptr);
5497 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5498 eptr[-1] == '\r') eptr--;
5499 }
5500 }
5501 else
5502 #endif /* SUPPORT_UTF8 */
5503
5504 /* Not UTF-8 mode */
5505 {
5506 switch(ctype)
5507 {
5508 case OP_ANY:
5509 for (i = min; i < max; i++)
5510 {
5511 if (eptr >= md->end_subject)
5512 {
5513 SCHECK_PARTIAL();
5514 break;
5515 }
5516 if (IS_NEWLINE(eptr)) break;
5517 eptr++;
5518 }
5519 break;
5520
5521 case OP_ALLANY:
5522 case OP_ANYBYTE:
5523 c = max - min;
5524 if (c > (unsigned int)(md->end_subject - eptr))
5525 {
5526 eptr = md->end_subject;
5527 SCHECK_PARTIAL();
5528 }
5529 else eptr += c;
5530 break;
5531
5532 case OP_ANYNL:
5533 for (i = min; i < max; i++)
5534 {
5535 if (eptr >= md->end_subject)
5536 {
5537 SCHECK_PARTIAL();
5538 break;
5539 }
5540 c = *eptr;
5541 if (c == 0x000d)
5542 {
5543 if (++eptr >= md->end_subject) break;
5544 if (*eptr == 0x000a) eptr++;
5545 }
5546 else
5547 {
5548 if (c != 0x000a &&
5549 (md->bsr_anycrlf ||
5550 (c != 0x000b && c != 0x000c && c != 0x0085)))
5551 break;
5552 eptr++;
5553 }
5554 }
5555 break;
5556
5557 case OP_NOT_HSPACE:
5558 for (i = min; i < max; i++)
5559 {
5560 if (eptr >= md->end_subject)
5561 {
5562 SCHECK_PARTIAL();
5563 break;
5564 }
5565 c = *eptr;
5566 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5567 eptr++;
5568 }
5569 break;
5570
5571 case OP_HSPACE:
5572 for (i = min; i < max; i++)
5573 {
5574 if (eptr >= md->end_subject)
5575 {
5576 SCHECK_PARTIAL();
5577 break;
5578 }
5579 c = *eptr;
5580 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5581 eptr++;
5582 }
5583 break;
5584
5585 case OP_NOT_VSPACE:
5586 for (i = min; i < max; i++)
5587 {
5588 if (eptr >= md->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 c = *eptr;
5594 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5595 break;
5596 eptr++;
5597 }
5598 break;
5599
5600 case OP_VSPACE:
5601 for (i = min; i < max; i++)
5602 {
5603 if (eptr >= md->end_subject)
5604 {
5605 SCHECK_PARTIAL();
5606 break;
5607 }
5608 c = *eptr;
5609 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5610 break;
5611 eptr++;
5612 }
5613 break;
5614
5615 case OP_NOT_DIGIT:
5616 for (i = min; i < max; i++)
5617 {
5618 if (eptr >= md->end_subject)
5619 {
5620 SCHECK_PARTIAL();
5621 break;
5622 }
5623 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5624 eptr++;
5625 }
5626 break;
5627
5628 case OP_DIGIT:
5629 for (i = min; i < max; i++)
5630 {
5631 if (eptr >= md->end_subject)
5632 {
5633 SCHECK_PARTIAL();
5634 break;
5635 }
5636 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5637 eptr++;
5638 }
5639 break;
5640
5641 case OP_NOT_WHITESPACE:
5642 for (i = min; i < max; i++)
5643 {
5644 if (eptr >= md->end_subject)
5645 {
5646 SCHECK_PARTIAL();
5647 break;
5648 }
5649 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5650 eptr++;
5651 }
5652 break;
5653
5654 case OP_WHITESPACE:
5655 for (i = min; i < max; i++)
5656 {
5657 if (eptr >= md->end_subject)
5658 {
5659 SCHECK_PARTIAL();
5660 break;
5661 }
5662 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5663 eptr++;
5664 }
5665 break;
5666
5667 case OP_NOT_WORDCHAR:
5668 for (i = min; i < max; i++)
5669 {
5670 if (eptr >= md->end_subject)
5671 {
5672 SCHECK_PARTIAL();
5673 break;
5674 }
5675 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5676 eptr++;
5677 }
5678 break;
5679
5680 case OP_WORDCHAR:
5681 for (i = min; i < max; i++)
5682 {
5683 if (eptr >= md->end_subject)
5684 {
5685 SCHECK_PARTIAL();
5686 break;
5687 }
5688 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5689 eptr++;
5690 }
5691 break;
5692
5693 default:
5694 RRETURN(PCRE_ERROR_INTERNAL);
5695 }
5696
5697 /* eptr is now past the end of the maximum run. If possessive, we are
5698 done (no backing up). Otherwise, match at this position; anything other
5699 than no match is immediately returned. For nomatch, back up one
5700 character (byte), unless we are matching \R and the last thing matched
5701 was \r\n, in which case, back up two bytes. */
5702
5703 if (possessive) continue;
5704 while (eptr >= pp)
5705 {
5706 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5708 eptr--;
5709 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5710 eptr[-1] == '\r') eptr--;
5711 }
5712 }
5713
5714 /* Get here if we can't make it match with any permitted repetitions */
5715
5716 MRRETURN(MATCH_NOMATCH);
5717 }
5718 /* Control never gets here */
5719
5720 /* There's been some horrible disaster. Arrival here can only mean there is
5721 something seriously wrong in the code above or the OP_xxx definitions. */
5722
5723 default:
5724 DPRINTF(("Unknown opcode %d\n", *ecode));
5725 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5726 }
5727
5728 /* Do not stick any code in here without much thought; it is assumed
5729 that "continue" in the code above comes out to here to repeat the main
5730 loop. */
5731
5732 } /* End of main loop */
5733 /* Control never reaches here */
5734
5735
5736 /* When compiling to use the heap rather than the stack for recursive calls to
5737 match(), the RRETURN() macro jumps here. The number that is saved in
5738 frame->Xwhere indicates which label we actually want to return to. */
5739
5740 #ifdef NO_RECURSE
5741 #define LBL(val) case val: goto L_RM##val;
5742 HEAP_RETURN:
5743 switch (frame->Xwhere)
5744 {
5745 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5746 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5747 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5748 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5749 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5750 #ifdef SUPPORT_UTF8
5751 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5752 LBL(32) LBL(34) LBL(42) LBL(46)
5753 #ifdef SUPPORT_UCP
5754 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5755 LBL(59) LBL(60) LBL(61) LBL(62)
5756 #endif /* SUPPORT_UCP */
5757 #endif /* SUPPORT_UTF8 */
5758 default:
5759 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5760 return PCRE_ERROR_INTERNAL;
5761 }
5762 #undef LBL
5763 #endif /* NO_RECURSE */
5764 }
5765
5766
5767 /***************************************************************************
5768 ****************************************************************************
5769 RECURSION IN THE match() FUNCTION
5770
5771 Undefine all the macros that were defined above to handle this. */
5772
5773 #ifdef NO_RECURSE /* these names are macros only in the heap-frame build */
5774 #undef eptr
5775 #undef ecode
5776 #undef mstart
5777 #undef offset_top
5778 #undef eptrb
5779 #undef flags
5780
5781 #undef callpat
5782 #undef charptr
5783 #undef data
5784 #undef next
5785 #undef pp
5786 #undef prev
5787 #undef saved_eptr
5788
5789 #undef new_recursive
5790
5791 #undef cur_is_word
5792 #undef condition
5793 #undef prev_is_word
5794
5795 #undef ctype
5796 #undef length
5797 #undef max
5798 #undef min
5799 #undef number
5800 #undef offset
5801 #undef op
5802 #undef save_capture_last
5803 #undef save_offset1
5804 #undef save_offset2
5805 #undef save_offset3
5806 #undef stacksave
5807
5808 #undef newptrb
5809
5810 #endif /* NO_RECURSE */
5811
5812 /* These two are defined as macros in both cases */
5813
5814 #undef fc
5815 #undef fi
5816
5817 /***************************************************************************
5818 ***************************************************************************/
5819
5820
5821
5822 /*************************************************
5823 * Execute a Regular Expression *
5824 *************************************************/
5825
5826 /* This function applies a compiled re to a subject string and picks out
5827 portions of the string if it matches. Two elements in the vector are set for
5828 each substring: the offsets to the start and end of the substring.
5829
5830 Arguments:
5831 argument_re points to the compiled expression
5832 extra_data points to extra data or is NULL
5833 subject points to the subject string
5834 length length of subject string (may contain binary zeros)
5835 start_offset where to start in the subject string
5836 options option bits
5837 offsets points to a vector of ints to be filled in with offsets
5838 offsetcount the number of elements in the vector
5839
5840 Returns: > 0 => success; value is one more than the highest-numbered offset pair set
5841 = 0 => success, but offsets is not big enough
5842 -1 => failed to match
5843 < -1 => some kind of unexpected problem
5844 */
5845
5846 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5847 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5848 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5849 int offsetcount)
5850 {
5851 int rc, ocount, arg_offset_max;
5852 int first_byte = -1;
5853 int req_byte = -1;
5854 int req_byte2 = -1;
5855 int newline;
5856 BOOL using_temporary_offsets = FALSE;
5857 BOOL anchored;
5858 BOOL startline;
5859 BOOL firstline;
5860 BOOL first_byte_caseless = FALSE;
5861 BOOL req_byte_caseless = FALSE;
5862 BOOL utf8;
5863 match_data match_block;
5864 match_data *md = &match_block;
5865 const uschar *tables;
5866 const uschar *start_bits = NULL;
5867 USPTR start_match = (USPTR)subject + start_offset;
5868 USPTR end_subject;
5869 USPTR start_partial = NULL;
5870 USPTR req_byte_ptr = start_match - 1;
5871
5872 pcre_study_data internal_study;
5873 const pcre_study_data *study;
5874
5875 real_pcre internal_re;
5876 const real_pcre *external_re = (const real_pcre *)argument_re;
5877 const real_pcre *re = external_re;
5878
5879 /* Plausibility checks */
5880
5881 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5882 if (re == NULL || subject == NULL ||
5883 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5884 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5885 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5886
5887 /* These two settings are used in the code for checking a UTF-8 string that
5888 follows immediately afterwards. Other values in the md block are used only
5889 during "normal" pcre_exec() processing, not when the JIT support is in use,
5890 so they are set up later. */
5891
5892 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5893 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5894 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5895
5896 /* Check a UTF-8 string if required. Pass back the character offset and error
5897 code for an invalid string if a results vector is available. */
5898
5899 #ifdef SUPPORT_UTF8
5900 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5901 {
5902 int erroroffset;
5903 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5904 if (errorcode != 0)
5905 {
5906 if (offsetcount >= 2)
5907 {
5908 offsets[0] = erroroffset;
5909 offsets[1] = errorcode;
5910 }
5911 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5912 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5913 }
5914
5915 /* Check that a start_offset points to the start of a UTF-8 character. */
5916 if (start_offset > 0 && start_offset < length &&
5917 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5918 return PCRE_ERROR_BADUTF8_OFFSET;
5919 }
5920 #endif
5921
5922 /* If the pattern was successfully studied with JIT support, run the JIT
5923 executable instead of the rest of this function. Most options must be set at
5924 compile time for the JIT code to be usable. Fallback to the normal code path if
5925 an unsupported flag is set. In particular, JIT does not support partial
5926 matching. */
5927
5928 #ifdef SUPPORT_JIT
5929 if (extra_data != NULL
5930 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
5931 && extra_data->executable_jit != NULL
5932 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
5933 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
5934 return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
5935 start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
5936 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
5937 #endif
5938
5939 /* Carry on with non-JIT matching. This information is for finding all the
5940 numbers associated with a given name, for condition testing. */
5941
5942 md->name_table = (uschar *)re + re->name_table_offset;
5943 md->name_count = re->name_count;
5944 md->name_entry_size = re->name_entry_size;
5945
5946 /* Fish out the optional data from the extra_data structure, first setting
5947 the default values. */
5948
5949 study = NULL;
5950 md->match_limit = MATCH_LIMIT;
5951 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5952 md->callout_data = NULL;
5953
5954 /* The table pointer is always in native byte order. */
5955
5956 tables = external_re->tables;
5957
5958 if (extra_data != NULL)
5959 {
5960 register unsigned int flags = extra_data->flags;
5961 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5962 study = (const pcre_study_data *)extra_data->study_data;
5963 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5964 md->match_limit = extra_data->match_limit;
5965 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5966 md->match_limit_recursion = extra_data->match_limit_recursion;
5967 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5968 md->callout_data = extra_data->callout_data;
5969 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5970 }
5971
5972 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5973 is a feature that makes it possible to save compiled regex and re-use them
5974 in other programs later. */
5975
5976 if (tables == NULL) tables = _pcre_default_tables;
5977
5978 /* Check that the first field in the block is the magic number. If it is not,
5979 test for a regex that was compiled on a host of opposite endianness. If this is
5980 the case, flipped values are put in internal_re and internal_study if there was
5981 study data too. */
5982
5983 if (re->magic_number != MAGIC_NUMBER)
5984 {
5985 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5986 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5987 if (study != NULL) study = &internal_study;
5988 }
5989
5990 /* Set up other data */
5991
5992 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5993 startline = (re->flags & PCRE_STARTLINE) != 0;
5994 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5995
5996 /* The code starts after the real_pcre block and the capture name table. */
5997
5998 md->start_code = (const uschar *)external_re + re->name_table_offset +
5999 re->name_count * re->name_entry_size;
6000
6001 md->start_subject = (USPTR)subject;
6002 md->start_offset = start_offset;
6003 md->end_subject = md->start_subject + length;
6004 end_subject = md->end_subject;
6005
6006 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6007 md->use_ucp = (re->options & PCRE_UCP) != 0;
6008 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6009
6010 /* Some options are unpacked into BOOL variables in the hope that testing
6011 them will be faster than individual option bits. */
6012
6013 md->notbol = (options & PCRE_NOTBOL) != 0;
6014 md->noteol = (options & PCRE_NOTEOL) != 0;
6015 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6016 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6017
6018 md->hitend = FALSE;
6019 md->mark = NULL; /* In case never set */
6020
6021 md->recursive = NULL; /* No recursion at top level */
6022 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6023
6024 md->lcc = tables + lcc_offset;
6025 md->ctypes = tables + ctypes_offset;
6026
6027 /* Handle different \R options. */
6028
6029 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6030 {
6031 case 0:
6032 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6033 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6034 else
6035 #ifdef BSR_ANYCRLF
6036 md->bsr_anycrlf = TRUE;
6037 #else
6038 md->bsr_anycrlf = FALSE;
6039 #endif
6040 break;
6041
6042 case PCRE_BSR_ANYCRLF:
6043 md->bsr_anycrlf = TRUE;
6044 break;
6045
6046 case PCRE_BSR_UNICODE:
6047 md->bsr_anycrlf = FALSE;
6048 break;
6049
6050 default: return PCRE_ERROR_BADNEWLINE;
6051 }
6052
6053 /* Handle different types of newline. The three bits give eight cases. If
6054 nothing is set at run time, whatever was used at compile time applies. */
6055
6056 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6057 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6058 {
6059 case 0: newline = NEWLINE; break; /* Compile-time default */
6060 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6061 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6062 case PCRE_NEWLINE_CR+
6063 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6064 case PCRE_NEWLINE_ANY: newline = -1; break;
6065 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6066 default: return PCRE_ERROR_BADNEWLINE;
6067 }
6068
6069 if (newline == -2)
6070 {
6071 md->nltype = NLTYPE_ANYCRLF;
6072 }
6073 else if (newline < 0)
6074 {
6075 md->nltype = NLTYPE_ANY;
6076 }
6077 else
6078 {
6079 md->nltype = NLTYPE_FIXED;
6080 if (newline > 255)
6081 {
6082 md->nllen = 2;
6083 md->nl[0] = (newline >> 8) & 255;
6084 md->nl[1] = newline & 255;
6085 }
6086 else
6087 {
6088 md->nllen = 1;
6089 md->nl[0] = newline;
6090 }
6091 }
6092
6093 /* Partial matching was originally supported only for a restricted set of
6094 regexes; from release 8.00 there are no restrictions, but the bits are still
6095 defined (though never set). So there's no harm in leaving this code. */
6096
6097 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6098 return PCRE_ERROR_BADPARTIAL;
6099
6100 /* If the expression has got more back references than the offsets supplied can
6101 hold, we get a temporary chunk of working store to use during the matching.
6102 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6103 of 3. */
6104
6105 ocount = offsetcount - (offsetcount % 3);
6106 arg_offset_max = (2*ocount)/3;
6107
6108 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6109 {
6110 ocount = re->top_backref * 3 + 3;
6111 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6112 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6113 using_temporary_offsets = TRUE;
6114 DPRINTF(("Got memory to hold back references\n"));
6115 }
6116 else md->offset_vector = offsets;
6117
6118 md->offset_end = ocount;
6119 md->offset_max = (2*ocount)/3;
6120 md->offset_overflow = FALSE;
6121 md->capture_last = -1;
6122
6123 /* Reset the working variable associated with each extraction. These should
6124 never be used unless previously set, but they get saved and restored, and so we
6125 initialize them to avoid reading uninitialized locations. Also, unset the
6126 offsets for the matched string. This is really just for tidiness with callouts,
6127 in case they inspect these fields. */
6128
6129 if (md->offset_vector != NULL)
6130 {
6131 register int *iptr = md->offset_vector + ocount;
6132 register int *iend = iptr - re->top_bracket;
6133 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6134 while (--iptr >= iend) *iptr = -1;
6135 md->offset_vector[0] = md->offset_vector[1] = -1;
6136 }
6137
6138 /* Set up the first character to match, if available. The first_byte value is
6139 never set for an anchored regular expression, but the anchoring may be forced
6140 at run time, so we have to test for anchoring. The first char may be unset for
6141 an unanchored pattern, of course. If there's no first char and the pattern was
6142 studied, there may be a bitmap of possible first characters. */
6143
6144 if (!anchored)
6145 {
6146 if ((re->flags & PCRE_FIRSTSET) != 0)
6147 {
6148 first_byte = re->first_byte & 255;
6149 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6150 first_byte = md->lcc[first_byte];
6151 }
6152 else
6153 if (!startline && study != NULL &&
6154 (study->flags & PCRE_STUDY_MAPPED) != 0)
6155 start_bits = study->start_bits;
6156 }
6157
6158 /* For anchored or unanchored matches, there may be a "last known required
6159 character" set. */
6160
6161 if ((re->flags & PCRE_REQCHSET) != 0)
6162 {
6163 req_byte = re->req_byte & 255;
6164 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6165 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6166 }
6167
6168
6169
6170
6171 /* ==========================================================================*/
6172
6173 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6174 the loop runs just once. */
6175
6176 for(;;)
6177 {
6178 USPTR save_end_subject = end_subject;
6179 USPTR new_start_match;
6180
6181 /* If firstline is TRUE, the start of the match is constrained to the first
6182 line of a multiline string. That is, the match must be before or at the first
6183 newline. Implement this by temporarily adjusting end_subject so that we stop
6184 scanning at a newline. If the match fails at the newline, later code breaks
6185 this loop. */
6186
6187 if (firstline)
6188 {
6189 USPTR t = start_match;
6190 #ifdef SUPPORT_UTF8
6191 if (utf8)
6192 {
6193 while (t < md->end_subject && !IS_NEWLINE(t))
6194 {
6195 t++;
6196 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6197 }
6198 }
6199 else
6200 #endif
6201 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6202 end_subject = t;
6203 }
6204
6205 /* There are some optimizations that avoid running the match if a known
6206 starting point is not found, or if a known later character is not present.
6207 However, there is an option that disables these, for testing and for ensuring
6208 that all callouts do actually occur. The option can be set in the regex by
6209 (*NO_START_OPT) or passed in match-time options. */
6210
6211 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6212 {
6213 /* Advance to a unique first byte if there is one. */
6214
6215 if (first_byte >= 0)
6216 {
6217 if (first_byte_caseless)
6218 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6219 start_match++;
6220 else
6221 while (start_match < end_subject && *start_match != first_byte)
6222 start_match++;
6223 }
6224
6225 /* Or to just after a linebreak for a multiline match */
6226
6227 else if (startline)
6228 {
6229 if (start_match > md->start_subject + start_offset)
6230 {
6231 #ifdef SUPPORT_UTF8
6232 if (utf8)
6233 {
6234 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6235 {
6236 start_match++;
6237 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6238 start_match++;
6239 }
6240 }
6241 else
6242 #endif
6243 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6244 start_match++;
6245
6246 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6247 and we are now at a LF, advance the match position by one more character.
6248 */
6249
6250 if (start_match[-1] == CHAR_CR &&
6251 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6252 start_match < end_subject &&
6253 *start_match == CHAR_NL)
6254 start_match++;
6255 }
6256 }
6257
6258 /* Or to a non-unique first byte after study */
6259
6260 else if (start_bits != NULL)
6261 {
6262 while (start_match < end_subject)
6263 {
6264 register unsigned int c = *start_match;
6265 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6266 {
6267 start_match++;
6268 #ifdef SUPPORT_UTF8
6269 if (utf8)
6270 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6271 start_match++;
6272 #endif
6273 }
6274 else break;
6275 }
6276 }
6277 } /* Starting optimizations */
6278
6279 /* Restore fudged end_subject */
6280
6281 end_subject = save_end_subject;
6282
6283 /* The following two optimizations are disabled for partial matching or if
6284 disabling is explicitly requested. */
6285
6286 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6287 {
6288 /* If the pattern was studied, a minimum subject length may be set. This is
6289 a lower bound; no actual string of that length may actually match the
6290 pattern. Although the value is, strictly, in characters, we treat it as
6291 bytes to avoid spending too much time in this optimization. */
6292
6293 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6294 (pcre_uint32)(end_subject - start_match) < study->minlength)
6295 {
6296 rc = MATCH_NOMATCH;
6297 break;
6298 }
6299
6300 /* If req_byte is set, we know that that character must appear in the
6301 subject for the match to succeed. If the first character is set, req_byte
6302 must be later in the subject; otherwise the test starts at the match point.
6303 This optimization can save a huge amount of backtracking in patterns with
6304 nested unlimited repeats that aren't going to match. Writing separate code
6305 for cased/caseless versions makes it go faster, as does using an
6306 autoincrement and backing off on a match.
6307
6308 HOWEVER: when the subject string is very, very long, searching to its end
6309 can take a long time, and give bad performance on quite ordinary patterns.
6310 This showed up when somebody was matching something like /^\d+C/ on a
6311 32-megabyte string... so we don't do this when the string is sufficiently
6312 long. */
6313
6314 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6315 {
6316 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6317
6318 /* We don't need to repeat the search if we haven't yet reached the
6319 place we found it at last time. */
6320
6321 if (p > req_byte_ptr)
6322 {
6323 if (req_byte_caseless)
6324 {
6325 while (p < end_subject)
6326 {
6327 register int pp = *p++;
6328 if (pp == req_byte || pp == req_byte2) { p--; break; }
6329 }
6330 }
6331 else
6332 {
6333 while (p < end_subject)
6334 {
6335 if (*p++ == req_byte) { p--; break; }
6336 }
6337 }
6338
6339 /* If we can't find the required character, break the matching loop,
6340 forcing a match failure. */
6341
6342 if (p >= end_subject)
6343 {
6344 rc = MATCH_NOMATCH;
6345 break;
6346 }
6347
6348 /* If we have found the required character, save the point where we
6349 found it, so that we don't search again next time round the loop if
6350 the start hasn't passed this character yet. */
6351
6352 req_byte_ptr = p;
6353 }
6354 }
6355 }
6356
6357 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6358 printf(">>>> Match against: ");
6359 pchars(start_match, end_subject - start_match, TRUE, md);
6360 printf("\n");
6361 #endif
6362
6363 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6364 first starting point for which a partial match was found. */
6365
6366 md->start_match_ptr = start_match;
6367 md->start_used_ptr = start_match;
6368 md->match_call_count = 0;
6369 md->match_function_type = 0;
6370 md->end_offset_top = 0;
6371 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6372 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6373
6374 switch(rc)
6375 {
6376 /* SKIP passes back the next starting point explicitly, but if it is the
6377 same as the match we have just done, treat it as NOMATCH. */
6378
6379 case MATCH_SKIP:
6380 if (md->start_match_ptr != start_match)
6381 {
6382 new_start_match = md->start_match_ptr;
6383 break;
6384 }
6385 /* Fall through */
6386
6387 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6388 the SKIP's arg was not found. We also treat this as NOMATCH. */
6389
6390 case MATCH_SKIP_ARG:
6391 /* Fall through */
6392
6393 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6394 exactly like PRUNE. */
6395
6396 case MATCH_NOMATCH:
6397 case MATCH_PRUNE:
6398 case MATCH_THEN:
6399 new_start_match = start_match + 1;
6400 #ifdef SUPPORT_UTF8
6401 if (utf8)
6402 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6403 new_start_match++;
6404 #endif
6405 break;
6406
6407 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6408
6409 case MATCH_COMMIT:
6410 rc = MATCH_NOMATCH;
6411 goto ENDLOOP;
6412
6413 /* Any other return is either a match, or some kind of error. */
6414
6415 default:
6416 goto ENDLOOP;
6417 }
6418
6419 /* Control reaches here for the various types of "no match at this point"
6420 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6421
6422 rc = MATCH_NOMATCH;
6423
6424 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6425 newline in the subject (though it may continue over the newline). Therefore,
6426 if we have just failed to match, starting at a newline, do not continue. */
6427
6428 if (firstline && IS_NEWLINE(start_match)) break;
6429
6430 /* Advance to new matching position */
6431
6432 start_match = new_start_match;
6433
6434 /* Break the loop if the pattern is anchored or if we have passed the end of
6435 the subject. */
6436
6437 if (anchored || start_match > end_subject) break;
6438
6439 /* If we have just passed a CR and we are now at a LF, and the pattern does
6440 not contain any explicit matches for \r or \n, and the newline option is CRLF
6441 or ANY or ANYCRLF, advance the match position by one more character. */
6442
6443 if (start_match[-1] == CHAR_CR &&
6444 start_match < end_subject &&
6445 *start_match == CHAR_NL &&
6446 (re->flags & PCRE_HASCRORLF) == 0 &&
6447 (md->nltype == NLTYPE_ANY ||
6448 md->nltype == NLTYPE_ANYCRLF ||
6449 md->nllen == 2))
6450 start_match++;
6451
6452 md->mark = NULL; /* Reset for start of next match attempt */
6453 } /* End of for(;;) "bumpalong" loop */
6454
6455 /* ==========================================================================*/
6456
6457 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6458 conditions is true:
6459
6460 (1) The pattern is anchored or the match was failed by (*COMMIT);
6461
6462 (2) We are past the end of the subject;
6463
6464 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6465 this option requests that a match occur at or before the first newline in
6466 the subject.
6467
6468 When we have a match and the offset vector is big enough to deal with any
6469 backreferences, captured substring offsets will already be set up. In the case
6470 where we had to get some local store to hold offsets for backreference
6471 processing, copy those that we can. In this case there need not be overflow if
6472 certain parts of the pattern were not used, even though there are more
6473 capturing parentheses than vector slots. */
6474
6475 ENDLOOP:
6476
6477 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6478 {
6479 if (using_temporary_offsets)
6480 {
6481 if (arg_offset_max >= 4)
6482 {
6483 memcpy(offsets + 2, md->offset_vector + 2,
6484 (arg_offset_max - 2) * sizeof(int));
6485 DPRINTF(("Copied offsets from temporary memory\n"));
6486 }
6487 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6488 DPRINTF(("Freeing temporary memory\n"));
6489 (pcre_free)(md->offset_vector);
6490 }
6491
6492 /* Set the return code to the number of captured strings, or 0 if there were
6493 too many to fit into the vector. */
6494
6495 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6496 0 : md->end_offset_top/2;
6497
6498 /* If there is space in the offset vector, set any unused pairs at the end of
6499 the pattern to -1 for backwards compatibility. It is documented that this
6500 happens. In earlier versions, the whole set of potential capturing offsets
6501 was set to -1 each time round the loop, but this is handled differently now.
6502 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6503 those at the end that need unsetting here. We can't just unset them all at
6504 the start of the whole thing because they may get set in one branch that is
6505 not the final matching branch. */
6506
6507 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6508 {
6509 register int *iptr, *iend;
6510 int resetcount = 2 + re->top_bracket * 2;
6511 if (resetcount > offsetcount) resetcount = ocount;
6512 iptr = offsets + md->end_offset_top;
6513 iend = offsets + resetcount;
6514 while (iptr < iend) *iptr++ = -1;
6515 }
6516
6517 /* If there is space, set up the whole thing as substring 0. The value of
6518 md->start_match_ptr might be modified if \K was encountered on the success
6519 matching path. */
6520
6521 if (offsetcount < 2) rc = 0; else
6522 {
6523 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6524 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6525 }
6526
6527 DPRINTF((">>>> returning %d\n", rc));
6528 goto RETURN_MARK;
6529 }
6530
6531 /* Control gets here if there has been an error, or if the overall match
6532 attempt has failed at all permitted starting positions. */
6533
6534 if (using_temporary_offsets)
6535 {
6536 DPRINTF(("Freeing temporary memory\n"));
6537 (pcre_free)(md->offset_vector);
6538 }
6539
6540 /* For anything other than nomatch or partial match, just return the code. */
6541
6542 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6543 {
6544 DPRINTF((">>>> error: returning %d\n", rc));
6545 return rc;
6546 }
6547
6548 /* Handle partial matches - disable any mark data */
6549
6550 if (start_partial != NULL)
6551 {
6552 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6553 md->mark = NULL;
6554 if (offsetcount > 1)
6555 {
6556 offsets[0] = (int)(start_partial - (USPTR)subject);
6557 offsets[1] = (int)(end_subject - (USPTR)subject);
6558 }
6559 rc = PCRE_ERROR_PARTIAL;
6560 }
6561
6562 /* This is the classic nomatch case */
6563
6564 else
6565 {
6566 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6567 rc = PCRE_ERROR_NOMATCH;
6568 }
6569
6570 /* Return the MARK data if it has been requested. */
6571
6572 RETURN_MARK:
6573
6574 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6575 *(extra_data->mark) = (unsigned char *)(md->mark);
6576 return rc;
6577 }
6578
6579 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5