/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 721 - (show annotations)
Fri Oct 7 15:51:39 2011 UTC (3 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 198354 byte(s)
Error occurred while calculating annotation data.
Comment correction and minor code improvement.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
/* Lower and upper repeat limits for the common repeats, kept as two parallel
six-entry tables. A zero in rep_max means that there is no upper limit. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if the requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
274 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 RM61, RM62, RM63 };
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779 the branch in which it occurs can be determined. Overload the start of
780 match pointer to do this. */
781
782 case OP_THEN:
783 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
784 eptrb, RM54);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 md->start_match_ptr = ecode;
787 MRRETURN(MATCH_THEN);
788
789 case OP_THEN_ARG:
790 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top,
791 md, eptrb, RM58);
792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 md->start_match_ptr = ecode;
794 md->mark = ecode + 2;
795 RRETURN(MATCH_THEN);
796
797 /* Handle a capturing bracket, other than those that are possessive with an
798 unlimited repeat. If there is space in the offset vector, save the current
799 subject position in the working slot at the top of the vector. We mustn't
800 change the current values of the data slot, because they may be set from a
801 previous iteration of this group, and be referred to by a reference inside
802 the group. A failure to match might occur after the group has succeeded,
803 if something later on doesn't match. For this reason, we need to restore
804 the working value and also the values of the final offsets, in case they
805 were set by a previous iteration of the same bracket.
806
807 If there isn't enough space in the offset vector, treat this as if it were
808 a non-capturing bracket. Don't worry about setting the flag for the error
809 case here; that is handled in the code for KET. */
810
811 case OP_CBRA:
812 case OP_SCBRA:
813 number = GET2(ecode, 1+LINK_SIZE);
814 offset = number << 1;
815
816 #ifdef PCRE_DEBUG
817 printf("start bracket %d\n", number);
818 printf("subject=");
819 pchars(eptr, 16, TRUE, md);
820 printf("\n");
821 #endif
822
823 if (offset < md->offset_max)
824 {
825 save_offset1 = md->offset_vector[offset];
826 save_offset2 = md->offset_vector[offset+1];
827 save_offset3 = md->offset_vector[md->offset_end - number];
828 save_capture_last = md->capture_last;
829
830 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
831 md->offset_vector[md->offset_end - number] =
832 (int)(eptr - md->start_subject);
833
834 for (;;)
835 {
836 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
837 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
838 eptrb, RM1);
839 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
840
841 /* If we backed up to a THEN, check whether it is within the current
842 branch by comparing the address of the THEN that is passed back with
843 the end of the branch. If it is within the current branch, and the
844 branch is one of two or more alternatives (it either starts or ends
845 with OP_ALT), we have reached the limit of THEN's action, so convert
846 the return code to NOMATCH, which will cause normal backtracking to
847 happen from now on. Otherwise, THEN is passed back to an outer
848 alternative. This implements Perl's treatment of parenthesized groups,
849 where a group not containing | does not affect the current alternative,
850 that is, (X) is NOT the same as (X|(*F)). */
851
852 if (rrc == MATCH_THEN)
853 {
854 next = ecode + GET(ecode,1);
855 if (md->start_match_ptr < next &&
856 (*ecode == OP_ALT || *next == OP_ALT))
857 rrc = MATCH_NOMATCH;
858 }
859
860 /* Anything other than NOMATCH is passed back. */
861
862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
863 md->capture_last = save_capture_last;
864 ecode += GET(ecode, 1);
865 if (*ecode != OP_ALT) break;
866 }
867
868 DPRINTF(("bracket %d failed\n", number));
869 md->offset_vector[offset] = save_offset1;
870 md->offset_vector[offset+1] = save_offset2;
871 md->offset_vector[md->offset_end - number] = save_offset3;
872
873 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
874
875 if (md->mark == NULL) md->mark = markptr;
876 RRETURN(rrc);
877 }
878
879 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
880 as a non-capturing bracket. */
881
882 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
883 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
884
885 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
886
887 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
888 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
889
890 /* Non-capturing or atomic group, except for possessive with unlimited
891 repeat. Loop for all the alternatives.
892
893 When we get to the final alternative within the brackets, we used to return
894 the result of a recursive call to match() whatever happened so it was
895 possible to reduce stack usage by turning this into a tail recursion,
896 except in the case of a possibly empty group. However, now that there is
897 the possibility of (*THEN) occurring in the final alternative, this
898 optimization is no longer always possible.
899
900 We can optimize if we know there are no (*THEN)s in the pattern; at present
901 this is the best that can be done.
902
903 MATCH_ONCE is returned when the end of an atomic group is successfully
904 reached, but subsequent matching fails. It passes back up the tree (causing
905 captured values to be reset) until the original atomic group level is
906 reached. This is tested by comparing md->once_target with the start of the
907 group. At this point, the return is converted into MATCH_NOMATCH so that
908 previous backup points can be taken. */
909
910 case OP_ONCE:
911 case OP_BRA:
912 case OP_SBRA:
913 DPRINTF(("start non-capturing bracket\n"));
914
915 for (;;)
916 {
917 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
918
919 /* If this is not a possibly empty group, and there are no (*THEN)s in
920 the pattern, and this is the final alternative, optimize as described
921 above. */
922
923 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
924 {
925 ecode += _pcre_OP_lengths[*ecode];
926 goto TAIL_RECURSE;
927 }
928
929 /* In all other cases, we have to make another call to match(). */
930
931 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
932 RM2);
933
934 /* See comment in the code for capturing groups above about handling
935 THEN. */
936
937 if (rrc == MATCH_THEN)
938 {
939 next = ecode + GET(ecode,1);
940 if (md->start_match_ptr < next &&
941 (*ecode == OP_ALT || *next == OP_ALT))
942 rrc = MATCH_NOMATCH;
943 }
944
945 if (rrc != MATCH_NOMATCH)
946 {
947 if (rrc == MATCH_ONCE)
948 {
949 const uschar *scode = ecode;
950 if (*scode != OP_ONCE) /* If not at start, find it */
951 {
952 while (*scode == OP_ALT) scode += GET(scode, 1);
953 scode -= GET(scode, 1);
954 }
955 if (md->once_target == scode) rrc = MATCH_NOMATCH;
956 }
957 RRETURN(rrc);
958 }
959 ecode += GET(ecode, 1);
960 if (*ecode != OP_ALT) break;
961 }
962
963 if (md->mark == NULL) md->mark = markptr;
964 RRETURN(MATCH_NOMATCH);
965
966 /* Handle possessive capturing brackets with an unlimited repeat. We come
967 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
968 handled similarly to the normal case above. However, the matching is
969 different. The end of these brackets will always be OP_KETRPOS, which
970 returns MATCH_KETRPOS without going further in the pattern. By this means
971 we can handle the group by iteration rather than recursion, thereby
972 reducing the amount of stack needed. */
973
974 case OP_CBRAPOS:
975 case OP_SCBRAPOS:
976 allow_zero = FALSE;
977
978 POSSESSIVE_CAPTURE:
979 number = GET2(ecode, 1+LINK_SIZE);
980 offset = number << 1;
981
982 #ifdef PCRE_DEBUG
983 printf("start possessive bracket %d\n", number);
984 printf("subject=");
985 pchars(eptr, 16, TRUE, md);
986 printf("\n");
987 #endif
988
989 if (offset < md->offset_max)
990 {
991 matched_once = FALSE;
992 code_offset = ecode - md->start_code;
993
994 save_offset1 = md->offset_vector[offset];
995 save_offset2 = md->offset_vector[offset+1];
996 save_offset3 = md->offset_vector[md->offset_end - number];
997 save_capture_last = md->capture_last;
998
999 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1000
1001 /* Each time round the loop, save the current subject position for use
1002 when the group matches. For MATCH_MATCH, the group has matched, so we
1003 restart it with a new subject starting position, remembering that we had
1004 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1005 usual. If we haven't matched any alternatives in any iteration, check to
1006 see if a previous iteration matched. If so, the group has matched;
1007 continue from afterwards. Otherwise it has failed; restore the previous
1008 capture values before returning NOMATCH. */
1009
1010 for (;;)
1011 {
1012 md->offset_vector[md->offset_end - number] =
1013 (int)(eptr - md->start_subject);
1014 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1015 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1016 eptrb, RM63);
1017 if (rrc == MATCH_KETRPOS)
1018 {
1019 offset_top = md->end_offset_top;
1020 eptr = md->end_match_ptr;
1021 ecode = md->start_code + code_offset;
1022 save_capture_last = md->capture_last;
1023 matched_once = TRUE;
1024 continue;
1025 }
1026
1027 /* See comment in the code for capturing groups above about handling
1028 THEN. */
1029
1030 if (rrc == MATCH_THEN)
1031 {
1032 next = ecode + GET(ecode,1);
1033 if (md->start_match_ptr < next &&
1034 (*ecode == OP_ALT || *next == OP_ALT))
1035 rrc = MATCH_NOMATCH;
1036 }
1037
1038 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1039 md->capture_last = save_capture_last;
1040 ecode += GET(ecode, 1);
1041 if (*ecode != OP_ALT) break;
1042 }
1043
1044 if (!matched_once)
1045 {
1046 md->offset_vector[offset] = save_offset1;
1047 md->offset_vector[offset+1] = save_offset2;
1048 md->offset_vector[md->offset_end - number] = save_offset3;
1049 }
1050
1051 if (md->mark == NULL) md->mark = markptr;
1052 if (allow_zero || matched_once)
1053 {
1054 ecode += 1 + LINK_SIZE;
1055 break;
1056 }
1057
1058 RRETURN(MATCH_NOMATCH);
1059 }
1060
1061 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1062 as a non-capturing bracket. */
1063
1064 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1065 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1066
1067 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1068
1069 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1070 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1071
1072 /* Non-capturing possessive bracket with unlimited repeat. We come here
1073 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1074 without the capturing complication. It is written out separately for speed
1075 and cleanliness. */
1076
1077 case OP_BRAPOS:
1078 case OP_SBRAPOS:
1079 allow_zero = FALSE;
1080
1081 POSSESSIVE_NON_CAPTURE:
1082 matched_once = FALSE;
1083 code_offset = ecode - md->start_code;
1084
1085 for (;;)
1086 {
1087 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1088 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1089 eptrb, RM48);
1090 if (rrc == MATCH_KETRPOS)
1091 {
1092 offset_top = md->end_offset_top;
1093 eptr = md->end_match_ptr;
1094 ecode = md->start_code + code_offset;
1095 matched_once = TRUE;
1096 continue;
1097 }
1098
1099 /* See comment in the code for capturing groups above about handling
1100 THEN. */
1101
1102 if (rrc == MATCH_THEN)
1103 {
1104 next = ecode + GET(ecode,1);
1105 if (md->start_match_ptr < next &&
1106 (*ecode == OP_ALT || *next == OP_ALT))
1107 rrc = MATCH_NOMATCH;
1108 }
1109
1110 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1111 ecode += GET(ecode, 1);
1112 if (*ecode != OP_ALT) break;
1113 }
1114
1115 if (matched_once || allow_zero)
1116 {
1117 ecode += 1 + LINK_SIZE;
1118 break;
1119 }
1120 RRETURN(MATCH_NOMATCH);
1121
1122 /* Control never reaches here. */
1123
1124 /* Conditional group: compilation checked that there are no more than
1125 two branches. If the condition is false, skipping the first branch takes us
1126 past the end if there is only one branch, but that's OK because that is
1127 exactly what going to the ket would do. */
1128
1129 case OP_COND:
1130 case OP_SCOND:
1131 codelink = GET(ecode, 1);
1132
1133 /* Because of the way auto-callout works during compile, a callout item is
1134 inserted between OP_COND and an assertion condition. */
1135
1136 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1137 {
1138 if (pcre_callout != NULL)
1139 {
1140 pcre_callout_block cb;
1141 cb.version = 2; /* Version 2 of the callout block */
1142 cb.callout_number = ecode[LINK_SIZE+2];
1143 cb.offset_vector = md->offset_vector;
1144 cb.subject = (PCRE_SPTR)md->start_subject;
1145 cb.subject_length = (int)(md->end_subject - md->start_subject);
1146 cb.start_match = (int)(mstart - md->start_subject);
1147 cb.current_position = (int)(eptr - md->start_subject);
1148 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1149 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1150 cb.capture_top = offset_top/2;
1151 cb.capture_last = md->capture_last;
1152 cb.callout_data = md->callout_data;
1153 cb.mark = markptr;
1154 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1155 if (rrc < 0) RRETURN(rrc);
1156 }
1157 ecode += _pcre_OP_lengths[OP_CALLOUT];
1158 }
1159
1160 condcode = ecode[LINK_SIZE+1];
1161
1162 /* Now see what the actual condition is */
1163
1164 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1165 {
1166 if (md->recursive == NULL) /* Not recursing => FALSE */
1167 {
1168 condition = FALSE;
1169 ecode += GET(ecode, 1);
1170 }
1171 else
1172 {
1173 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1174 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1175
1176 /* If the test is for recursion into a specific subpattern, and it is
1177 false, but the test was set up by name, scan the table to see if the
1178 name refers to any other numbers, and test them. The condition is true
1179 if any one is set. */
1180
1181 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1182 {
1183 uschar *slotA = md->name_table;
1184 for (i = 0; i < md->name_count; i++)
1185 {
1186 if (GET2(slotA, 0) == recno) break;
1187 slotA += md->name_entry_size;
1188 }
1189
1190 /* Found a name for the number - there can be only one; duplicate
1191 names for different numbers are allowed, but not vice versa. First
1192 scan down for duplicates. */
1193
1194 if (i < md->name_count)
1195 {
1196 uschar *slotB = slotA;
1197 while (slotB > md->name_table)
1198 {
1199 slotB -= md->name_entry_size;
1200 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1201 {
1202 condition = GET2(slotB, 0) == md->recursive->group_num;
1203 if (condition) break;
1204 }
1205 else break;
1206 }
1207
1208 /* Scan up for duplicates */
1209
1210 if (!condition)
1211 {
1212 slotB = slotA;
1213 for (i++; i < md->name_count; i++)
1214 {
1215 slotB += md->name_entry_size;
1216 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1217 {
1218 condition = GET2(slotB, 0) == md->recursive->group_num;
1219 if (condition) break;
1220 }
1221 else break;
1222 }
1223 }
1224 }
1225 }
1226
1227 /* Choose branch according to the condition */
1228
1229 ecode += condition? 3 : GET(ecode, 1);
1230 }
1231 }
1232
1233 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1234 {
1235 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1236 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1237
1238 /* If the numbered capture is unset, but the reference was by name,
1239 scan the table to see if the name refers to any other numbers, and test
1240 them. The condition is true if any one is set. This is tediously similar
1241 to the code above, but not close enough to try to amalgamate. */
1242
1243 if (!condition && condcode == OP_NCREF)
1244 {
1245 int refno = offset >> 1;
1246 uschar *slotA = md->name_table;
1247
1248 for (i = 0; i < md->name_count; i++)
1249 {
1250 if (GET2(slotA, 0) == refno) break;
1251 slotA += md->name_entry_size;
1252 }
1253
1254 /* Found a name for the number - there can be only one; duplicate names
1255 for different numbers are allowed, but not vice versa. First scan down
1256 for duplicates. */
1257
1258 if (i < md->name_count)
1259 {
1260 uschar *slotB = slotA;
1261 while (slotB > md->name_table)
1262 {
1263 slotB -= md->name_entry_size;
1264 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1265 {
1266 offset = GET2(slotB, 0) << 1;
1267 condition = offset < offset_top &&
1268 md->offset_vector[offset] >= 0;
1269 if (condition) break;
1270 }
1271 else break;
1272 }
1273
1274 /* Scan up for duplicates */
1275
1276 if (!condition)
1277 {
1278 slotB = slotA;
1279 for (i++; i < md->name_count; i++)
1280 {
1281 slotB += md->name_entry_size;
1282 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1283 {
1284 offset = GET2(slotB, 0) << 1;
1285 condition = offset < offset_top &&
1286 md->offset_vector[offset] >= 0;
1287 if (condition) break;
1288 }
1289 else break;
1290 }
1291 }
1292 }
1293 }
1294
1295 /* Choose branch according to the condition */
1296
1297 ecode += condition? 3 : GET(ecode, 1);
1298 }
1299
1300 else if (condcode == OP_DEF) /* DEFINE - always false */
1301 {
1302 condition = FALSE;
1303 ecode += GET(ecode, 1);
1304 }
1305
1306 /* The condition is an assertion. Call match() to evaluate it - setting
1307 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1308 an assertion. */
1309
1310 else
1311 {
1312 md->match_function_type = MATCH_CONDASSERT;
1313 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1314 if (rrc == MATCH_MATCH)
1315 {
1316 if (md->end_offset_top > offset_top)
1317 offset_top = md->end_offset_top; /* Captures may have happened */
1318 condition = TRUE;
1319 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1320 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1321 }
1322
1323 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1324 assertion; it is therefore treated as NOMATCH. */
1325
1326 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1327 {
1328 RRETURN(rrc); /* Need braces because of following else */
1329 }
1330 else
1331 {
1332 condition = FALSE;
1333 ecode += codelink;
1334 }
1335 }
1336
1337 /* We are now at the branch that is to be obeyed. As there is only one, can
1338 use tail recursion to avoid using another stack frame, except when there is
1339 unlimited repeat of a possibly empty group. In the latter case, a recursive
1340 call to match() is always required, unless the second alternative doesn't
1341 exist, in which case we can just plough on. Note that, for compatibility
1342 with Perl, the | in a conditional group is NOT treated as creating two
1343 alternatives. If a THEN is encountered in the branch, it propagates out to
1344 the enclosing alternative (unless nested in a deeper set of alternatives,
1345 of course). */
1346
1347 if (condition || *ecode == OP_ALT)
1348 {
1349 if (op != OP_SCOND)
1350 {
1351 ecode += 1 + LINK_SIZE;
1352 goto TAIL_RECURSE;
1353 }
1354
1355 md->match_function_type = MATCH_CBEGROUP;
1356 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1357 RRETURN(rrc);
1358 }
1359
1360 /* Condition false & no alternative; continue after the group. */
1361
1362 else
1363 {
1364 ecode += 1 + LINK_SIZE;
1365 }
1366 break;
1367
1368
1369 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1370 to close any currently open capturing brackets. */
1371
1372 case OP_CLOSE:
1373 number = GET2(ecode, 1);
1374 offset = number << 1;
1375
1376 #ifdef PCRE_DEBUG
1377 printf("end bracket %d at *ACCEPT", number);
1378 printf("\n");
1379 #endif
1380
1381 md->capture_last = number;
1382 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1383 {
1384 md->offset_vector[offset] =
1385 md->offset_vector[md->offset_end - number];
1386 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1387 if (offset_top <= offset) offset_top = offset + 2;
1388 }
1389 ecode += 3;
1390 break;
1391
1392
1393 /* End of the pattern, either real or forced. */
1394
1395 case OP_END:
1396 case OP_ACCEPT:
1397 case OP_ASSERT_ACCEPT:
1398
1399 /* If we have matched an empty string, fail if not in an assertion and not
1400 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1401 is set and we have matched at the start of the subject. In both cases,
1402 backtracking will then try other alternatives, if any. */
1403
1404 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1405 md->recursive == NULL &&
1406 (md->notempty ||
1407 (md->notempty_atstart &&
1408 mstart == md->start_subject + md->start_offset)))
1409 MRRETURN(MATCH_NOMATCH);
1410
1411 /* Otherwise, we have a match. */
1412
1413 md->end_match_ptr = eptr; /* Record where we ended */
1414 md->end_offset_top = offset_top; /* and how many extracts were taken */
1415 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1416
1417 /* For some reason, the macros don't work properly if an expression is
1418 given as the argument to MRRETURN when the heap is in use. */
1419
1420 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1421 MRRETURN(rrc);
1422
1423 /* Assertion brackets. Check the alternative branches in turn - the
1424 matching won't pass the KET for an assertion. If any one branch matches,
1425 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1426 start of each branch to move the current point backwards, so the code at
1427 this level is identical to the lookahead case. When the assertion is part
1428 of a condition, we want to return immediately afterwards. The caller of
1429 this incarnation of the match() function will have set MATCH_CONDASSERT in
1430 md->match_function_type, and one of these opcodes will be the first opcode
1431 that is processed. We use a local variable that is preserved over calls to
1432 match() to remember this case. */
1433
1434 case OP_ASSERT:
1435 case OP_ASSERTBACK:
1436 if (md->match_function_type == MATCH_CONDASSERT)
1437 {
1438 condassert = TRUE;
1439 md->match_function_type = 0;
1440 }
1441 else condassert = FALSE;
1442
1443 do
1444 {
1445 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1446 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1447 {
1448 mstart = md->start_match_ptr; /* In case \K reset it */
1449 markptr = md->mark;
1450 break;
1451 }
1452
1453 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1454 as NOMATCH. */
1455
1456 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1457 ecode += GET(ecode, 1);
1458 }
1459 while (*ecode == OP_ALT);
1460
1461 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1462
1463 /* If checking an assertion for a condition, return MATCH_MATCH. */
1464
1465 if (condassert) RRETURN(MATCH_MATCH);
1466
1467 /* Continue from after the assertion, updating the offsets high water
1468 mark, since extracts may have been taken during the assertion. */
1469
1470 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1471 ecode += 1 + LINK_SIZE;
1472 offset_top = md->end_offset_top;
1473 continue;
1474
1475 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1476 PRUNE, or COMMIT means we must assume failure without checking subsequent
1477 branches. */
1478
1479 case OP_ASSERT_NOT:
1480 case OP_ASSERTBACK_NOT:
1481 if (md->match_function_type == MATCH_CONDASSERT)
1482 {
1483 condassert = TRUE;
1484 md->match_function_type = 0;
1485 }
1486 else condassert = FALSE;
1487
1488 do
1489 {
1490 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1491 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1492 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1493 {
1494 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1495 break;
1496 }
1497
1498 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1499 as NOMATCH. */
1500
1501 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1502 ecode += GET(ecode,1);
1503 }
1504 while (*ecode == OP_ALT);
1505
1506 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1507
1508 ecode += 1 + LINK_SIZE;
1509 continue;
1510
1511 /* Move the subject pointer back. This occurs only at the start of
1512 each branch of a lookbehind assertion. If we are too close to the start to
1513 move back, this match function fails. When working with UTF-8 we move
1514 back a number of characters, not bytes. */
1515
1516 case OP_REVERSE:
1517 #ifdef SUPPORT_UTF8
1518 if (utf8)
1519 {
1520 i = GET(ecode, 1);
1521 while (i-- > 0)
1522 {
1523 eptr--;
1524 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1525 BACKCHAR(eptr);
1526 }
1527 }
1528 else
1529 #endif
1530
1531 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1532
1533 {
1534 eptr -= GET(ecode, 1);
1535 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1536 }
1537
1538 /* Save the earliest consulted character, then skip to next op code */
1539
1540 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1541 ecode += 1 + LINK_SIZE;
1542 break;
1543
1544 /* The callout item calls an external function, if one is provided, passing
1545 details of the match so far. This is mainly for debugging, though the
1546 function is able to force a failure. */
1547
1548 case OP_CALLOUT:
1549 if (pcre_callout != NULL)
1550 {
1551 pcre_callout_block cb;
1552 cb.version = 2; /* Version 2 of the callout block */
1553 cb.callout_number = ecode[1];
1554 cb.offset_vector = md->offset_vector;
1555 cb.subject = (PCRE_SPTR)md->start_subject;
1556 cb.subject_length = (int)(md->end_subject - md->start_subject);
1557 cb.start_match = (int)(mstart - md->start_subject);
1558 cb.current_position = (int)(eptr - md->start_subject);
1559 cb.pattern_position = GET(ecode, 2);
1560 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1561 cb.capture_top = offset_top/2;
1562 cb.capture_last = md->capture_last;
1563 cb.callout_data = md->callout_data;
1564 cb.mark = markptr;
1565 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1566 if (rrc < 0) RRETURN(rrc);
1567 }
1568 ecode += 2 + 2*LINK_SIZE;
1569 break;
1570
1571 /* Recursion either matches the current regex, or some subexpression. The
1572 offset data is the offset to the starting bracket from the start of the
1573 whole pattern. (This is so that it works from duplicated subpatterns.)
1574
1575 The state of the capturing groups is preserved over recursion, and
1576 re-instated afterwards. We don't know how many are started and not yet
1577 finished (offset_top records the completed total) so we just have to save
1578 all the potential data. There may be up to 65535 such values, which is too
1579 large to put on the stack, but using malloc for small numbers seems
1580 expensive. As a compromise, the stack is used when there are no more than
1581 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1582
1583 There are also other values that have to be saved. We use a chained
1584 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1585 for the original version of this logic. It has, however, been hacked around
1586 a lot, so he is not to blame for the current way it works. */
1587
1588 case OP_RECURSE:
1589 {
1590 recursion_info *ri;
1591 int recno;
1592
1593 callpat = md->start_code + GET(ecode, 1);
1594 recno = (callpat == md->start_code)? 0 :
1595 GET2(callpat, 1 + LINK_SIZE);
1596
1597 /* Check for repeating a recursion without advancing the subject pointer.
1598 This should catch convoluted mutual recursions. (Some simple cases are
1599 caught at compile time.) */
1600
1601 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1602 if (recno == ri->group_num && eptr == ri->subject_position)
1603 RRETURN(PCRE_ERROR_RECURSELOOP);
1604
1605 /* Add to "recursing stack" */
1606
1607 new_recursive.group_num = recno;
1608 new_recursive.subject_position = eptr;
1609 new_recursive.prevrec = md->recursive;
1610 md->recursive = &new_recursive;
1611
1612 /* Where to continue from afterwards */
1613
1614 ecode += 1 + LINK_SIZE;
1615
1616 /* Now save the offset data */
1617
1618 new_recursive.saved_max = md->offset_end;
1619 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1620 new_recursive.offset_save = stacksave;
1621 else
1622 {
1623 new_recursive.offset_save =
1624 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1625 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1626 }
1627 memcpy(new_recursive.offset_save, md->offset_vector,
1628 new_recursive.saved_max * sizeof(int));
1629
1630 /* OK, now we can do the recursion. After processing each alternative,
1631 restore the offset data. If there were nested recursions, md->recursive
1632 might be changed, so reset it before looping. */
1633
1634 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1635 cbegroup = (*callpat >= OP_SBRA);
1636 do
1637 {
1638 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1639 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1640 md, eptrb, RM6);
1641 memcpy(md->offset_vector, new_recursive.offset_save,
1642 new_recursive.saved_max * sizeof(int));
1643 md->recursive = new_recursive.prevrec;
1644 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1645 {
1646 DPRINTF(("Recursion matched\n"));
1647 if (new_recursive.offset_save != stacksave)
1648 (pcre_free)(new_recursive.offset_save);
1649
1650 /* Set where we got to in the subject, and reset the start in case
1651 it was changed by \K. This *is* propagated back out of a recursion,
1652 for Perl compatibility. */
1653
1654 eptr = md->end_match_ptr;
1655 mstart = md->start_match_ptr;
1656 goto RECURSION_MATCHED; /* Exit loop; end processing */
1657 }
1658
1659 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1660 as NOMATCH. */
1661
1662 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1663 {
1664 DPRINTF(("Recursion gave error %d\n", rrc));
1665 if (new_recursive.offset_save != stacksave)
1666 (pcre_free)(new_recursive.offset_save);
1667 RRETURN(rrc);
1668 }
1669
1670 md->recursive = &new_recursive;
1671 callpat += GET(callpat, 1);
1672 }
1673 while (*callpat == OP_ALT);
1674
1675 DPRINTF(("Recursion didn't match\n"));
1676 md->recursive = new_recursive.prevrec;
1677 if (new_recursive.offset_save != stacksave)
1678 (pcre_free)(new_recursive.offset_save);
1679 MRRETURN(MATCH_NOMATCH);
1680 }
1681
1682 RECURSION_MATCHED:
1683 break;
1684
1685 /* An alternation is the end of a branch; scan along to find the end of the
1686 bracketed group and go to there. */
1687
1688 case OP_ALT:
1689 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1690 break;
1691
1692 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1693 indicating that it may occur zero times. It may repeat infinitely, or not
1694 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1695 with fixed upper repeat limits are compiled as a number of copies, with the
1696 optional ones preceded by BRAZERO or BRAMINZERO. */
1697
1698 case OP_BRAZERO:
1699 next = ecode + 1;
1700 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1701 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1702 do next += GET(next, 1); while (*next == OP_ALT);
1703 ecode = next + 1 + LINK_SIZE;
1704 break;
1705
1706 case OP_BRAMINZERO:
1707 next = ecode + 1;
1708 do next += GET(next, 1); while (*next == OP_ALT);
1709 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1711 ecode++;
1712 break;
1713
1714 case OP_SKIPZERO:
1715 next = ecode+1;
1716 do next += GET(next,1); while (*next == OP_ALT);
1717 ecode = next + 1 + LINK_SIZE;
1718 break;
1719
1720 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1721 here; just jump to the group, with allow_zero set TRUE. */
1722
1723 case OP_BRAPOSZERO:
1724 op = *(++ecode);
1725 allow_zero = TRUE;
1726 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1727 goto POSSESSIVE_NON_CAPTURE;
1728
1729 /* End of a group, repeated or non-repeating. */
1730
1731 case OP_KET:
1732 case OP_KETRMIN:
1733 case OP_KETRMAX:
1734 case OP_KETRPOS:
1735 prev = ecode - GET(ecode, 1);
1736
1737 /* If this was a group that remembered the subject start, in order to break
1738 infinite repeats of empty string matches, retrieve the subject start from
1739 the chain. Otherwise, set it NULL. */
1740
1741 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1742 {
1743 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1744 eptrb = eptrb->epb_prev; /* Backup to previous group */
1745 }
1746 else saved_eptr = NULL;
1747
1748 /* If we are at the end of an assertion group, stop matching and return
1749 MATCH_MATCH, but record the current high water mark for use by positive
1750 assertions. We also need to record the match start in case it was changed
1751 by \K. */
1752
1753 if (*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT)
1754 {
1755 md->end_match_ptr = eptr; /* For ONCE */
1756 md->end_offset_top = offset_top;
1757 md->start_match_ptr = mstart;
1758 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1759 }
1760
1761 /* For capturing groups we have to check the group number back at the start
1762 and if necessary complete handling an extraction by setting the offsets and
1763 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1764 into group 0, so it won't be picked up here. Instead, we catch it when the
1765 OP_END is reached. Other recursion is handled here. We just have to record
1766 the current subject position and start match pointer and give a MATCH
1767 return. */
1768
1769 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1770 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1771 {
1772 number = GET2(prev, 1+LINK_SIZE);
1773 offset = number << 1;
1774
1775 #ifdef PCRE_DEBUG
1776 printf("end bracket %d", number);
1777 printf("\n");
1778 #endif
1779
1780 /* Handle a recursively called group. */
1781
1782 if (md->recursive != NULL && md->recursive->group_num == number)
1783 {
1784 md->end_match_ptr = eptr;
1785 md->start_match_ptr = mstart;
1786 RRETURN(MATCH_MATCH);
1787 }
1788
1789 /* Deal with capturing */
1790
1791 md->capture_last = number;
1792 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1793 {
1794 /* If offset is greater than offset_top, it means that we are
1795 "skipping" a capturing group, and that group's offsets must be marked
1796 unset. In earlier versions of PCRE, all the offsets were unset at the
1797 start of matching, but this doesn't work because atomic groups and
1798 assertions can cause a value to be set that should later be unset.
1799 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1800 part of the atomic group, but this is not on the final matching path,
1801 so must be unset when 2 is set. (If there is no group 2, there is no
1802 problem, because offset_top will then be 2, indicating no capture.) */
1803
1804 if (offset > offset_top)
1805 {
1806 register int *iptr = md->offset_vector + offset_top;
1807 register int *iend = md->offset_vector + offset;
1808 while (iptr < iend) *iptr++ = -1;
1809 }
1810
1811 /* Now make the extraction */
1812
1813 md->offset_vector[offset] =
1814 md->offset_vector[md->offset_end - number];
1815 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1816 if (offset_top <= offset) offset_top = offset + 2;
1817 }
1818 }
1819
1820 /* For an ordinary non-repeating ket, just continue at this level. This
1821 also happens for a repeating ket if no characters were matched in the
1822 group. This is the forcible breaking of infinite loops as implemented in
1823 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1824 processing the rest of the pattern at a lower level. If this results in a
1825 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1826 bypassing intermediate backup points, but resetting any captures that
1827 happened along the way. */
1828
1829 if (*ecode == OP_KET || eptr == saved_eptr)
1830 {
1831 if (*prev == OP_ONCE)
1832 {
1833 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1835 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1836 RRETURN(MATCH_ONCE);
1837 }
1838 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1839 break;
1840 }
1841
1842 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1843 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1844 at a time from the outer level, thus saving stack. */
1845
1846 if (*ecode == OP_KETRPOS)
1847 {
1848 md->end_match_ptr = eptr;
1849 md->end_offset_top = offset_top;
1850 RRETURN(MATCH_KETRPOS);
1851 }
1852
1853 /* The normal repeating kets try the rest of the pattern or restart from
1854 the preceding bracket, in the appropriate order. In the second case, we can
1855 use tail recursion to avoid using another stack frame, unless we have an
1856 atomic group or an unlimited repeat of a group that can match an empty
1857 string. */
1858
1859 if (*ecode == OP_KETRMIN)
1860 {
1861 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1863 if (*prev == OP_ONCE)
1864 {
1865 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1867 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1868 RRETURN(MATCH_ONCE);
1869 }
1870 if (*prev >= OP_SBRA) /* Could match an empty string */
1871 {
1872 md->match_function_type = MATCH_CBEGROUP;
1873 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1874 RRETURN(rrc);
1875 }
1876 ecode = prev;
1877 goto TAIL_RECURSE;
1878 }
1879 else /* OP_KETRMAX */
1880 {
1881 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1882 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1883 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1884 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1885 if (*prev == OP_ONCE)
1886 {
1887 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1888 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1889 md->once_target = prev;
1890 RRETURN(MATCH_ONCE);
1891 }
1892 ecode += 1 + LINK_SIZE;
1893 goto TAIL_RECURSE;
1894 }
1895 /* Control never gets here */
1896
1897 /* Not multiline mode: start of subject assertion, unless notbol. */
1898
1899 case OP_CIRC:
1900 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1901
1902 /* Start of subject assertion */
1903
1904 case OP_SOD:
1905 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1906 ecode++;
1907 break;
1908
1909 /* Multiline mode: start of subject unless notbol, or after any newline. */
1910
1911 case OP_CIRCM:
1912 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1913 if (eptr != md->start_subject &&
1914 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1915 MRRETURN(MATCH_NOMATCH);
1916 ecode++;
1917 break;
1918
1919 /* Start of match assertion */
1920
1921 case OP_SOM:
1922 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1923 ecode++;
1924 break;
1925
1926 /* Reset the start of match point */
1927
1928 case OP_SET_SOM:
1929 mstart = eptr;
1930 ecode++;
1931 break;
1932
1933 /* Multiline mode: assert before any newline, or before end of subject
1934 unless noteol is set. */
1935
1936 case OP_DOLLM:
1937 if (eptr < md->end_subject)
1938 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1939 else
1940 {
1941 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1942 SCHECK_PARTIAL();
1943 }
1944 ecode++;
1945 break;
1946
1947 /* Not multiline mode: assert before a terminating newline or before end of
1948 subject unless noteol is set. */
1949
1950 case OP_DOLL:
1951 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1952 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1953
1954 /* ... else fall through for endonly */
1955
1956 /* End of subject assertion (\z) */
1957
1958 case OP_EOD:
1959 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1960 SCHECK_PARTIAL();
1961 ecode++;
1962 break;
1963
1964 /* End of subject or ending \n assertion (\Z) */
1965
1966 case OP_EODN:
1967 ASSERT_NL_OR_EOS:
1968 if (eptr < md->end_subject &&
1969 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1970 MRRETURN(MATCH_NOMATCH);
1971
1972 /* Either at end of string or \n before end. */
1973
1974 SCHECK_PARTIAL();
1975 ecode++;
1976 break;
1977
1978 /* Word boundary assertions */
1979
1980 case OP_NOT_WORD_BOUNDARY:
1981 case OP_WORD_BOUNDARY:
1982 {
1983
1984 /* Find out if the previous and current characters are "word" characters.
1985 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1986 be "non-word" characters. Remember the earliest consulted character for
1987 partial matching. */
1988
1989 #ifdef SUPPORT_UTF8
1990 if (utf8)
1991 {
1992 /* Get status of previous character */
1993
1994 if (eptr == md->start_subject) prev_is_word = FALSE; else
1995 {
1996 USPTR lastptr = eptr - 1;
1997 while((*lastptr & 0xc0) == 0x80) lastptr--;
1998 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1999 GETCHAR(c, lastptr);
2000 #ifdef SUPPORT_UCP
2001 if (md->use_ucp)
2002 {
2003 if (c == '_') prev_is_word = TRUE; else
2004 {
2005 int cat = UCD_CATEGORY(c);
2006 prev_is_word = (cat == ucp_L || cat == ucp_N);
2007 }
2008 }
2009 else
2010 #endif
2011 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2012 }
2013
2014 /* Get status of next character */
2015
2016 if (eptr >= md->end_subject)
2017 {
2018 SCHECK_PARTIAL();
2019 cur_is_word = FALSE;
2020 }
2021 else
2022 {
2023 GETCHAR(c, eptr);
2024 #ifdef SUPPORT_UCP
2025 if (md->use_ucp)
2026 {
2027 if (c == '_') cur_is_word = TRUE; else
2028 {
2029 int cat = UCD_CATEGORY(c);
2030 cur_is_word = (cat == ucp_L || cat == ucp_N);
2031 }
2032 }
2033 else
2034 #endif
2035 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2036 }
2037 }
2038 else
2039 #endif
2040
2041 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2042 consistency with the behaviour of \w we do use it in this case. */
2043
2044 {
2045 /* Get status of previous character */
2046
2047 if (eptr == md->start_subject) prev_is_word = FALSE; else
2048 {
2049 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2050 #ifdef SUPPORT_UCP
2051 if (md->use_ucp)
2052 {
2053 c = eptr[-1];
2054 if (c == '_') prev_is_word = TRUE; else
2055 {
2056 int cat = UCD_CATEGORY(c);
2057 prev_is_word = (cat == ucp_L || cat == ucp_N);
2058 }
2059 }
2060 else
2061 #endif
2062 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2063 }
2064
2065 /* Get status of next character */
2066
2067 if (eptr >= md->end_subject)
2068 {
2069 SCHECK_PARTIAL();
2070 cur_is_word = FALSE;
2071 }
2072 else
2073 #ifdef SUPPORT_UCP
2074 if (md->use_ucp)
2075 {
2076 c = *eptr;
2077 if (c == '_') cur_is_word = TRUE; else
2078 {
2079 int cat = UCD_CATEGORY(c);
2080 cur_is_word = (cat == ucp_L || cat == ucp_N);
2081 }
2082 }
2083 else
2084 #endif
2085 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2086 }
2087
2088 /* Now see if the situation is what we want */
2089
2090 if ((*ecode++ == OP_WORD_BOUNDARY)?
2091 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2092 MRRETURN(MATCH_NOMATCH);
2093 }
2094 break;
2095
2096 /* Match a single character type; inline for speed */
2097
2098 case OP_ANY:
2099 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2100 /* Fall through */
2101
2102 case OP_ALLANY:
2103 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2104 { /* not be updated before SCHECK_PARTIAL. */
2105 SCHECK_PARTIAL();
2106 MRRETURN(MATCH_NOMATCH);
2107 }
2108 eptr++;
2109 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2110 ecode++;
2111 break;
2112
2113 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2114 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2115
2116 case OP_ANYBYTE:
2117 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2118 { /* not be updated before SCHECK_PARTIAL. */
2119 SCHECK_PARTIAL();
2120 MRRETURN(MATCH_NOMATCH);
2121 }
2122 eptr++;
2123 ecode++;
2124 break;
2125
2126 case OP_NOT_DIGIT:
2127 if (eptr >= md->end_subject)
2128 {
2129 SCHECK_PARTIAL();
2130 MRRETURN(MATCH_NOMATCH);
2131 }
2132 GETCHARINCTEST(c, eptr);
2133 if (
2134 #ifdef SUPPORT_UTF8
2135 c < 256 &&
2136 #endif
2137 (md->ctypes[c] & ctype_digit) != 0
2138 )
2139 MRRETURN(MATCH_NOMATCH);
2140 ecode++;
2141 break;
2142
2143 case OP_DIGIT:
2144 if (eptr >= md->end_subject)
2145 {
2146 SCHECK_PARTIAL();
2147 MRRETURN(MATCH_NOMATCH);
2148 }
2149 GETCHARINCTEST(c, eptr);
2150 if (
2151 #ifdef SUPPORT_UTF8
2152 c >= 256 ||
2153 #endif
2154 (md->ctypes[c] & ctype_digit) == 0
2155 )
2156 MRRETURN(MATCH_NOMATCH);
2157 ecode++;
2158 break;
2159
2160 case OP_NOT_WHITESPACE:
2161 if (eptr >= md->end_subject)
2162 {
2163 SCHECK_PARTIAL();
2164 MRRETURN(MATCH_NOMATCH);
2165 }
2166 GETCHARINCTEST(c, eptr);
2167 if (
2168 #ifdef SUPPORT_UTF8
2169 c < 256 &&
2170 #endif
2171 (md->ctypes[c] & ctype_space) != 0
2172 )
2173 MRRETURN(MATCH_NOMATCH);
2174 ecode++;
2175 break;
2176
2177 case OP_WHITESPACE:
2178 if (eptr >= md->end_subject)
2179 {
2180 SCHECK_PARTIAL();
2181 MRRETURN(MATCH_NOMATCH);
2182 }
2183 GETCHARINCTEST(c, eptr);
2184 if (
2185 #ifdef SUPPORT_UTF8
2186 c >= 256 ||
2187 #endif
2188 (md->ctypes[c] & ctype_space) == 0
2189 )
2190 MRRETURN(MATCH_NOMATCH);
2191 ecode++;
2192 break;
2193
2194 case OP_NOT_WORDCHAR:
2195 if (eptr >= md->end_subject)
2196 {
2197 SCHECK_PARTIAL();
2198 MRRETURN(MATCH_NOMATCH);
2199 }
2200 GETCHARINCTEST(c, eptr);
2201 if (
2202 #ifdef SUPPORT_UTF8
2203 c < 256 &&
2204 #endif
2205 (md->ctypes[c] & ctype_word) != 0
2206 )
2207 MRRETURN(MATCH_NOMATCH);
2208 ecode++;
2209 break;
2210
2211 case OP_WORDCHAR:
2212 if (eptr >= md->end_subject)
2213 {
2214 SCHECK_PARTIAL();
2215 MRRETURN(MATCH_NOMATCH);
2216 }
2217 GETCHARINCTEST(c, eptr);
2218 if (
2219 #ifdef SUPPORT_UTF8
2220 c >= 256 ||
2221 #endif
2222 (md->ctypes[c] & ctype_word) == 0
2223 )
2224 MRRETURN(MATCH_NOMATCH);
2225 ecode++;
2226 break;
2227
2228 case OP_ANYNL:
2229 if (eptr >= md->end_subject)
2230 {
2231 SCHECK_PARTIAL();
2232 MRRETURN(MATCH_NOMATCH);
2233 }
2234 GETCHARINCTEST(c, eptr);
2235 switch(c)
2236 {
2237 default: MRRETURN(MATCH_NOMATCH);
2238
2239 case 0x000d:
2240 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2241 break;
2242
2243 case 0x000a:
2244 break;
2245
2246 case 0x000b:
2247 case 0x000c:
2248 case 0x0085:
2249 case 0x2028:
2250 case 0x2029:
2251 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2252 break;
2253 }
2254 ecode++;
2255 break;
2256
2257 case OP_NOT_HSPACE:
2258 if (eptr >= md->end_subject)
2259 {
2260 SCHECK_PARTIAL();
2261 MRRETURN(MATCH_NOMATCH);
2262 }
2263 GETCHARINCTEST(c, eptr);
2264 switch(c)
2265 {
2266 default: break;
2267 case 0x09: /* HT */
2268 case 0x20: /* SPACE */
2269 case 0xa0: /* NBSP */
2270 case 0x1680: /* OGHAM SPACE MARK */
2271 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2272 case 0x2000: /* EN QUAD */
2273 case 0x2001: /* EM QUAD */
2274 case 0x2002: /* EN SPACE */
2275 case 0x2003: /* EM SPACE */
2276 case 0x2004: /* THREE-PER-EM SPACE */
2277 case 0x2005: /* FOUR-PER-EM SPACE */
2278 case 0x2006: /* SIX-PER-EM SPACE */
2279 case 0x2007: /* FIGURE SPACE */
2280 case 0x2008: /* PUNCTUATION SPACE */
2281 case 0x2009: /* THIN SPACE */
2282 case 0x200A: /* HAIR SPACE */
2283 case 0x202f: /* NARROW NO-BREAK SPACE */
2284 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2285 case 0x3000: /* IDEOGRAPHIC SPACE */
2286 MRRETURN(MATCH_NOMATCH);
2287 }
2288 ecode++;
2289 break;
2290
2291 case OP_HSPACE:
2292 if (eptr >= md->end_subject)
2293 {
2294 SCHECK_PARTIAL();
2295 MRRETURN(MATCH_NOMATCH);
2296 }
2297 GETCHARINCTEST(c, eptr);
2298 switch(c)
2299 {
2300 default: MRRETURN(MATCH_NOMATCH);
2301 case 0x09: /* HT */
2302 case 0x20: /* SPACE */
2303 case 0xa0: /* NBSP */
2304 case 0x1680: /* OGHAM SPACE MARK */
2305 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2306 case 0x2000: /* EN QUAD */
2307 case 0x2001: /* EM QUAD */
2308 case 0x2002: /* EN SPACE */
2309 case 0x2003: /* EM SPACE */
2310 case 0x2004: /* THREE-PER-EM SPACE */
2311 case 0x2005: /* FOUR-PER-EM SPACE */
2312 case 0x2006: /* SIX-PER-EM SPACE */
2313 case 0x2007: /* FIGURE SPACE */
2314 case 0x2008: /* PUNCTUATION SPACE */
2315 case 0x2009: /* THIN SPACE */
2316 case 0x200A: /* HAIR SPACE */
2317 case 0x202f: /* NARROW NO-BREAK SPACE */
2318 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2319 case 0x3000: /* IDEOGRAPHIC SPACE */
2320 break;
2321 }
2322 ecode++;
2323 break;
2324
2325 case OP_NOT_VSPACE:
2326 if (eptr >= md->end_subject)
2327 {
2328 SCHECK_PARTIAL();
2329 MRRETURN(MATCH_NOMATCH);
2330 }
2331 GETCHARINCTEST(c, eptr);
2332 switch(c)
2333 {
2334 default: break;
2335 case 0x0a: /* LF */
2336 case 0x0b: /* VT */
2337 case 0x0c: /* FF */
2338 case 0x0d: /* CR */
2339 case 0x85: /* NEL */
2340 case 0x2028: /* LINE SEPARATOR */
2341 case 0x2029: /* PARAGRAPH SEPARATOR */
2342 MRRETURN(MATCH_NOMATCH);
2343 }
2344 ecode++;
2345 break;
2346
2347 case OP_VSPACE:
2348 if (eptr >= md->end_subject)
2349 {
2350 SCHECK_PARTIAL();
2351 MRRETURN(MATCH_NOMATCH);
2352 }
2353 GETCHARINCTEST(c, eptr);
2354 switch(c)
2355 {
2356 default: MRRETURN(MATCH_NOMATCH);
2357 case 0x0a: /* LF */
2358 case 0x0b: /* VT */
2359 case 0x0c: /* FF */
2360 case 0x0d: /* CR */
2361 case 0x85: /* NEL */
2362 case 0x2028: /* LINE SEPARATOR */
2363 case 0x2029: /* PARAGRAPH SEPARATOR */
2364 break;
2365 }
2366 ecode++;
2367 break;
2368
2369 #ifdef SUPPORT_UCP
2370 /* Check the next character by Unicode property. We will get here only
2371 if the support is in the binary; otherwise a compile-time error occurs. */
2372
2373 case OP_PROP:
2374 case OP_NOTPROP:
2375 if (eptr >= md->end_subject)
2376 {
2377 SCHECK_PARTIAL();
2378 MRRETURN(MATCH_NOMATCH);
2379 }
2380 GETCHARINCTEST(c, eptr);
2381 {
2382 const ucd_record *prop = GET_UCD(c);
2383
2384 switch(ecode[1])
2385 {
2386 case PT_ANY:
2387 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2388 break;
2389
2390 case PT_LAMP:
2391 if ((prop->chartype == ucp_Lu ||
2392 prop->chartype == ucp_Ll ||
2393 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2394 MRRETURN(MATCH_NOMATCH);
2395 break;
2396
2397 case PT_GC:
2398 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2399 MRRETURN(MATCH_NOMATCH);
2400 break;
2401
2402 case PT_PC:
2403 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2404 MRRETURN(MATCH_NOMATCH);
2405 break;
2406
2407 case PT_SC:
2408 if ((ecode[2] != prop->script) == (op == OP_PROP))
2409 MRRETURN(MATCH_NOMATCH);
2410 break;
2411
2412 /* These are specials */
2413
2414 case PT_ALNUM:
2415 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2416 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2417 MRRETURN(MATCH_NOMATCH);
2418 break;
2419
2420 case PT_SPACE: /* Perl space */
2421 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2422 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2423 == (op == OP_NOTPROP))
2424 MRRETURN(MATCH_NOMATCH);
2425 break;
2426
2427 case PT_PXSPACE: /* POSIX space */
2428 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2429 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2430 c == CHAR_FF || c == CHAR_CR)
2431 == (op == OP_NOTPROP))
2432 MRRETURN(MATCH_NOMATCH);
2433 break;
2434
2435 case PT_WORD:
2436 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2437 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2438 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2439 MRRETURN(MATCH_NOMATCH);
2440 break;
2441
2442 /* This should never occur */
2443
2444 default:
2445 RRETURN(PCRE_ERROR_INTERNAL);
2446 }
2447
2448 ecode += 3;
2449 }
2450 break;
2451
2452 /* Match an extended Unicode sequence. We will get here only if the support
2453 is in the binary; otherwise a compile-time error occurs. */
2454
2455 case OP_EXTUNI:
2456 if (eptr >= md->end_subject)
2457 {
2458 SCHECK_PARTIAL();
2459 MRRETURN(MATCH_NOMATCH);
2460 }
2461 GETCHARINCTEST(c, eptr);
2462 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2463 while (eptr < md->end_subject)
2464 {
2465 int len = 1;
2466 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2467 if (UCD_CATEGORY(c) != ucp_M) break;
2468 eptr += len;
2469 }
2470 ecode++;
2471 break;
2472 #endif
2473
2474
2475 /* Match a back reference, possibly repeatedly. Look past the end of the
2476 item to see if there is repeat information following. The code is similar
2477 to that for character classes, but repeated for efficiency. Then obey
2478 similar code to character type repeats - written out again for speed.
2479 However, if the referenced string is the empty string, always treat
2480 it as matched, any number of times (otherwise there could be infinite
2481 loops). */
2482
2483 case OP_REF:
2484 case OP_REFI:
2485 caseless = op == OP_REFI;
2486 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2487 ecode += 3;
2488
2489 /* If the reference is unset, there are two possibilities:
2490
2491 (a) In the default, Perl-compatible state, set the length negative;
2492 this ensures that every attempt at a match fails. We can't just fail
2493 here, because of the possibility of quantifiers with zero minima.
2494
2495 (b) If the JavaScript compatibility flag is set, set the length to zero
2496 so that the back reference matches an empty string.
2497
2498 Otherwise, set the length to the length of what was matched by the
2499 referenced subpattern. */
2500
2501 if (offset >= offset_top || md->offset_vector[offset] < 0)
2502 length = (md->jscript_compat)? 0 : -1;
2503 else
2504 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2505
2506 /* Set up for repetition, or handle the non-repeated case */
2507
2508 switch (*ecode)
2509 {
2510 case OP_CRSTAR:
2511 case OP_CRMINSTAR:
2512 case OP_CRPLUS:
2513 case OP_CRMINPLUS:
2514 case OP_CRQUERY:
2515 case OP_CRMINQUERY:
2516 c = *ecode++ - OP_CRSTAR;
2517 minimize = (c & 1) != 0;
2518 min = rep_min[c]; /* Pick up values from tables; */
2519 max = rep_max[c]; /* zero for max => infinity */
2520 if (max == 0) max = INT_MAX;
2521 break;
2522
2523 case OP_CRRANGE:
2524 case OP_CRMINRANGE:
2525 minimize = (*ecode == OP_CRMINRANGE);
2526 min = GET2(ecode, 1);
2527 max = GET2(ecode, 3);
2528 if (max == 0) max = INT_MAX;
2529 ecode += 5;
2530 break;
2531
2532 default: /* No repeat follows */
2533 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2534 {
2535 CHECK_PARTIAL();
2536 MRRETURN(MATCH_NOMATCH);
2537 }
2538 eptr += length;
2539 continue; /* With the main loop */
2540 }
2541
2542 /* Handle repeated back references. If the length of the reference is
2543 zero, just continue with the main loop. */
2544
2545 if (length == 0) continue;
2546
2547 /* First, ensure the minimum number of matches are present. We get back
2548 the length of the reference string explicitly rather than passing the
2549 address of eptr, so that eptr can be a register variable. */
2550
2551 for (i = 1; i <= min; i++)
2552 {
2553 int slength;
2554 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2555 {
2556 CHECK_PARTIAL();
2557 MRRETURN(MATCH_NOMATCH);
2558 }
2559 eptr += slength;
2560 }
2561
2562 /* If min = max, continue at the same level without recursion.
2563 They are not both allowed to be zero. */
2564
2565 if (min == max) continue;
2566
2567 /* If minimizing, keep trying and advancing the pointer */
2568
2569 if (minimize)
2570 {
2571 for (fi = min;; fi++)
2572 {
2573 int slength;
2574 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2575 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2576 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2577 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2578 {
2579 CHECK_PARTIAL();
2580 MRRETURN(MATCH_NOMATCH);
2581 }
2582 eptr += slength;
2583 }
2584 /* Control never gets here */
2585 }
2586
2587 /* If maximizing, find the longest string and work backwards */
2588
2589 else
2590 {
2591 pp = eptr;
2592 for (i = min; i < max; i++)
2593 {
2594 int slength;
2595 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2596 {
2597 CHECK_PARTIAL();
2598 break;
2599 }
2600 eptr += slength;
2601 }
2602 while (eptr >= pp)
2603 {
2604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2605 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2606 eptr -= length;
2607 }
2608 MRRETURN(MATCH_NOMATCH);
2609 }
2610 /* Control never gets here */
2611
2612 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2613 used when all the characters in the class have values in the range 0-255,
2614 and either the matching is caseful, or the characters are in the range
2615 0-127 when UTF-8 processing is enabled. The only difference between
2616 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2617 encountered.
2618
2619 First, look past the end of the item to see if there is repeat information
2620 following. Then obey similar code to character type repeats - written out
2621 again for speed. */
2622
2623 case OP_NCLASS:
2624 case OP_CLASS:
2625 {
2626 data = ecode + 1; /* Save for matching */
2627 ecode += 33; /* Advance past the item */
2628
2629 switch (*ecode)
2630 {
2631 case OP_CRSTAR:
2632 case OP_CRMINSTAR:
2633 case OP_CRPLUS:
2634 case OP_CRMINPLUS:
2635 case OP_CRQUERY:
2636 case OP_CRMINQUERY:
2637 c = *ecode++ - OP_CRSTAR;
2638 minimize = (c & 1) != 0;
2639 min = rep_min[c]; /* Pick up values from tables; */
2640 max = rep_max[c]; /* zero for max => infinity */
2641 if (max == 0) max = INT_MAX;
2642 break;
2643
2644 case OP_CRRANGE:
2645 case OP_CRMINRANGE:
2646 minimize = (*ecode == OP_CRMINRANGE);
2647 min = GET2(ecode, 1);
2648 max = GET2(ecode, 3);
2649 if (max == 0) max = INT_MAX;
2650 ecode += 5;
2651 break;
2652
2653 default: /* No repeat follows */
2654 min = max = 1;
2655 break;
2656 }
2657
2658 /* First, ensure the minimum number of matches are present. */
2659
2660 #ifdef SUPPORT_UTF8
2661 /* UTF-8 mode */
2662 if (utf8)
2663 {
2664 for (i = 1; i <= min; i++)
2665 {
2666 if (eptr >= md->end_subject)
2667 {
2668 SCHECK_PARTIAL();
2669 MRRETURN(MATCH_NOMATCH);
2670 }
2671 GETCHARINC(c, eptr);
2672 if (c > 255)
2673 {
2674 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2675 }
2676 else
2677 {
2678 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2679 }
2680 }
2681 }
2682 else
2683 #endif
2684 /* Not UTF-8 mode */
2685 {
2686 for (i = 1; i <= min; i++)
2687 {
2688 if (eptr >= md->end_subject)
2689 {
2690 SCHECK_PARTIAL();
2691 MRRETURN(MATCH_NOMATCH);
2692 }
2693 c = *eptr++;
2694 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2695 }
2696 }
2697
2698 /* If max == min we can continue with the main loop without the
2699 need to recurse. */
2700
2701 if (min == max) continue;
2702
2703 /* If minimizing, keep testing the rest of the expression and advancing
2704 the pointer while it matches the class. */
2705
2706 if (minimize)
2707 {
2708 #ifdef SUPPORT_UTF8
2709 /* UTF-8 mode */
2710 if (utf8)
2711 {
2712 for (fi = min;; fi++)
2713 {
2714 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2715 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2716 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2717 if (eptr >= md->end_subject)
2718 {
2719 SCHECK_PARTIAL();
2720 MRRETURN(MATCH_NOMATCH);
2721 }
2722 GETCHARINC(c, eptr);
2723 if (c > 255)
2724 {
2725 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2726 }
2727 else
2728 {
2729 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2730 }
2731 }
2732 }
2733 else
2734 #endif
2735 /* Not UTF-8 mode */
2736 {
2737 for (fi = min;; fi++)
2738 {
2739 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2740 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2741 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2742 if (eptr >= md->end_subject)
2743 {
2744 SCHECK_PARTIAL();
2745 MRRETURN(MATCH_NOMATCH);
2746 }
2747 c = *eptr++;
2748 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2749 }
2750 }
2751 /* Control never gets here */
2752 }
2753
2754 /* If maximizing, find the longest possible run, then work backwards. */
2755
2756 else
2757 {
2758 pp = eptr;
2759
2760 #ifdef SUPPORT_UTF8
2761 /* UTF-8 mode */
2762 if (utf8)
2763 {
2764 for (i = min; i < max; i++)
2765 {
2766 int len = 1;
2767 if (eptr >= md->end_subject)
2768 {
2769 SCHECK_PARTIAL();
2770 break;
2771 }
2772 GETCHARLEN(c, eptr, len);
2773 if (c > 255)
2774 {
2775 if (op == OP_CLASS) break;
2776 }
2777 else
2778 {
2779 if ((data[c/8] & (1 << (c&7))) == 0) break;
2780 }
2781 eptr += len;
2782 }
2783 for (;;)
2784 {
2785 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2787 if (eptr-- == pp) break; /* Stop if tried at original pos */
2788 BACKCHAR(eptr);
2789 }
2790 }
2791 else
2792 #endif
2793 /* Not UTF-8 mode */
2794 {
2795 for (i = min; i < max; i++)
2796 {
2797 if (eptr >= md->end_subject)
2798 {
2799 SCHECK_PARTIAL();
2800 break;
2801 }
2802 c = *eptr;
2803 if ((data[c/8] & (1 << (c&7))) == 0) break;
2804 eptr++;
2805 }
2806 while (eptr >= pp)
2807 {
2808 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2810 eptr--;
2811 }
2812 }
2813
2814 MRRETURN(MATCH_NOMATCH);
2815 }
2816 }
2817 /* Control never gets here */
2818
2819
2820 /* Match an extended character class. This opcode is encountered only
2821 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2822 mode, because Unicode properties are supported in non-UTF-8 mode. */
2823
2824 #ifdef SUPPORT_UTF8
2825 case OP_XCLASS:
2826 {
2827 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2828 ecode += GET(ecode, 1); /* Advance past the item */
2829
2830 switch (*ecode)
2831 {
2832 case OP_CRSTAR:
2833 case OP_CRMINSTAR:
2834 case OP_CRPLUS:
2835 case OP_CRMINPLUS:
2836 case OP_CRQUERY:
2837 case OP_CRMINQUERY:
2838 c = *ecode++ - OP_CRSTAR;
2839 minimize = (c & 1) != 0;
2840 min = rep_min[c]; /* Pick up values from tables; */
2841 max = rep_max[c]; /* zero for max => infinity */
2842 if (max == 0) max = INT_MAX;
2843 break;
2844
2845 case OP_CRRANGE:
2846 case OP_CRMINRANGE:
2847 minimize = (*ecode == OP_CRMINRANGE);
2848 min = GET2(ecode, 1);
2849 max = GET2(ecode, 3);
2850 if (max == 0) max = INT_MAX;
2851 ecode += 5;
2852 break;
2853
2854 default: /* No repeat follows */
2855 min = max = 1;
2856 break;
2857 }
2858
2859 /* First, ensure the minimum number of matches are present. */
2860
2861 for (i = 1; i <= min; i++)
2862 {
2863 if (eptr >= md->end_subject)
2864 {
2865 SCHECK_PARTIAL();
2866 MRRETURN(MATCH_NOMATCH);
2867 }
2868 GETCHARINCTEST(c, eptr);
2869 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2870 }
2871
2872 /* If max == min we can continue with the main loop without the
2873 need to recurse. */
2874
2875 if (min == max) continue;
2876
2877 /* If minimizing, keep testing the rest of the expression and advancing
2878 the pointer while it matches the class. */
2879
2880 if (minimize)
2881 {
2882 for (fi = min;; fi++)
2883 {
2884 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2885 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2886 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2887 if (eptr >= md->end_subject)
2888 {
2889 SCHECK_PARTIAL();
2890 MRRETURN(MATCH_NOMATCH);
2891 }
2892 GETCHARINCTEST(c, eptr);
2893 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2894 }
2895 /* Control never gets here */
2896 }
2897
2898 /* If maximizing, find the longest possible run, then work backwards. */
2899
2900 else
2901 {
2902 pp = eptr;
2903 for (i = min; i < max; i++)
2904 {
2905 int len = 1;
2906 if (eptr >= md->end_subject)
2907 {
2908 SCHECK_PARTIAL();
2909 break;
2910 }
2911 GETCHARLENTEST(c, eptr, len);
2912 if (!_pcre_xclass(c, data)) break;
2913 eptr += len;
2914 }
2915 for(;;)
2916 {
2917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2919 if (eptr-- == pp) break; /* Stop if tried at original pos */
2920 if (utf8) BACKCHAR(eptr);
2921 }
2922 MRRETURN(MATCH_NOMATCH);
2923 }
2924
2925 /* Control never gets here */
2926 }
2927 #endif /* End of XCLASS */
2928
2929 /* Match a single character, casefully */
2930
2931 case OP_CHAR:
2932 #ifdef SUPPORT_UTF8
2933 if (utf8)
2934 {
2935 length = 1;
2936 ecode++;
2937 GETCHARLEN(fc, ecode, length);
2938 if (length > md->end_subject - eptr)
2939 {
2940 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2941 MRRETURN(MATCH_NOMATCH);
2942 }
2943 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2944 }
2945 else
2946 #endif
2947
2948 /* Non-UTF-8 mode */
2949 {
2950 if (md->end_subject - eptr < 1)
2951 {
2952 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2953 MRRETURN(MATCH_NOMATCH);
2954 }
2955 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2956 ecode += 2;
2957 }
2958 break;
2959
2960 /* Match a single character, caselessly */
2961
2962 case OP_CHARI:
2963 #ifdef SUPPORT_UTF8
2964 if (utf8)
2965 {
2966 length = 1;
2967 ecode++;
2968 GETCHARLEN(fc, ecode, length);
2969
2970 if (length > md->end_subject - eptr)
2971 {
2972 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2973 MRRETURN(MATCH_NOMATCH);
2974 }
2975
2976 /* If the pattern character's value is < 128, we have only one byte, and
2977 can use the fast lookup table. */
2978
2979 if (fc < 128)
2980 {
2981 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2982 }
2983
2984 /* Otherwise we must pick up the subject character */
2985
2986 else
2987 {
2988 unsigned int dc;
2989 GETCHARINC(dc, eptr);
2990 ecode += length;
2991
2992 /* If we have Unicode property support, we can use it to test the other
2993 case of the character, if there is one. */
2994
2995 if (fc != dc)
2996 {
2997 #ifdef SUPPORT_UCP
2998 if (dc != UCD_OTHERCASE(fc))
2999 #endif
3000 MRRETURN(MATCH_NOMATCH);
3001 }
3002 }
3003 }
3004 else
3005 #endif /* SUPPORT_UTF8 */
3006
3007 /* Non-UTF-8 mode */
3008 {
3009 if (md->end_subject - eptr < 1)
3010 {
3011 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3012 MRRETURN(MATCH_NOMATCH);
3013 }
3014 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3015 ecode += 2;
3016 }
3017 break;
3018
3019 /* Match a single character repeatedly. */
3020
3021 case OP_EXACT:
3022 case OP_EXACTI:
3023 min = max = GET2(ecode, 1);
3024 ecode += 3;
3025 goto REPEATCHAR;
3026
3027 case OP_POSUPTO:
3028 case OP_POSUPTOI:
3029 possessive = TRUE;
3030 /* Fall through */
3031
3032 case OP_UPTO:
3033 case OP_UPTOI:
3034 case OP_MINUPTO:
3035 case OP_MINUPTOI:
3036 min = 0;
3037 max = GET2(ecode, 1);
3038 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3039 ecode += 3;
3040 goto REPEATCHAR;
3041
3042 case OP_POSSTAR:
3043 case OP_POSSTARI:
3044 possessive = TRUE;
3045 min = 0;
3046 max = INT_MAX;
3047 ecode++;
3048 goto REPEATCHAR;
3049
3050 case OP_POSPLUS:
3051 case OP_POSPLUSI:
3052 possessive = TRUE;
3053 min = 1;
3054 max = INT_MAX;
3055 ecode++;
3056 goto REPEATCHAR;
3057
3058 case OP_POSQUERY:
3059 case OP_POSQUERYI:
3060 possessive = TRUE;
3061 min = 0;
3062 max = 1;
3063 ecode++;
3064 goto REPEATCHAR;
3065
3066 case OP_STAR:
3067 case OP_STARI:
3068 case OP_MINSTAR:
3069 case OP_MINSTARI:
3070 case OP_PLUS:
3071 case OP_PLUSI:
3072 case OP_MINPLUS:
3073 case OP_MINPLUSI:
3074 case OP_QUERY:
3075 case OP_QUERYI:
3076 case OP_MINQUERY:
3077 case OP_MINQUERYI:
3078 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3079 minimize = (c & 1) != 0;
3080 min = rep_min[c]; /* Pick up values from tables; */
3081 max = rep_max[c]; /* zero for max => infinity */
3082 if (max == 0) max = INT_MAX;
3083
3084 /* Common code for all repeated single-character matches. */
3085
3086 REPEATCHAR:
3087 #ifdef SUPPORT_UTF8
3088 if (utf8)
3089 {
3090 length = 1;
3091 charptr = ecode;
3092 GETCHARLEN(fc, ecode, length);
3093 ecode += length;
3094
3095 /* Handle multibyte character matching specially here. There is
3096 support for caseless matching if UCP support is present. */
3097
3098 if (length > 1)
3099 {
3100 #ifdef SUPPORT_UCP
3101 unsigned int othercase;
3102 if (op >= OP_STARI && /* Caseless */
3103 (othercase = UCD_OTHERCASE(fc)) != fc)
3104 oclength = _pcre_ord2utf8(othercase, occhars);
3105 else oclength = 0;
3106 #endif /* SUPPORT_UCP */
3107
3108 for (i = 1; i <= min; i++)
3109 {
3110 if (eptr <= md->end_subject - length &&
3111 memcmp(eptr, charptr, length) == 0) eptr += length;
3112 #ifdef SUPPORT_UCP
3113 else if (oclength > 0 &&
3114 eptr <= md->end_subject - oclength &&
3115 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3116 #endif /* SUPPORT_UCP */
3117 else
3118 {
3119 CHECK_PARTIAL();
3120 MRRETURN(MATCH_NOMATCH);
3121 }
3122 }
3123
3124 if (min == max) continue;
3125
3126 if (minimize)
3127 {
3128 for (fi = min;; fi++)
3129 {
3130 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3131 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3132 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3133 if (eptr <= md->end_subject - length &&
3134 memcmp(eptr, charptr, length) == 0) eptr += length;
3135 #ifdef SUPPORT_UCP
3136 else if (oclength > 0 &&
3137 eptr <= md->end_subject - oclength &&
3138 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3139 #endif /* SUPPORT_UCP */
3140 else
3141 {
3142 CHECK_PARTIAL();
3143 MRRETURN(MATCH_NOMATCH);
3144 }
3145 }
3146 /* Control never gets here */
3147 }
3148
3149 else /* Maximize */
3150 {
3151 pp = eptr;
3152 for (i = min; i < max; i++)
3153 {
3154 if (eptr <= md->end_subject - length &&
3155 memcmp(eptr, charptr, length) == 0) eptr += length;
3156 #ifdef SUPPORT_UCP
3157 else if (oclength > 0 &&
3158 eptr <= md->end_subject - oclength &&
3159 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3160 #endif /* SUPPORT_UCP */
3161 else
3162 {
3163 CHECK_PARTIAL();
3164 break;
3165 }
3166 }
3167
3168 if (possessive) continue;
3169
3170 for(;;)
3171 {
3172 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3173 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3174 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3175 #ifdef SUPPORT_UCP
3176 eptr--;
3177 BACKCHAR(eptr);
3178 #else /* without SUPPORT_UCP */
3179 eptr -= length;
3180 #endif /* SUPPORT_UCP */
3181 }
3182 }
3183 /* Control never gets here */
3184 }
3185
3186 /* If the length of a UTF-8 character is 1, we fall through here, and
3187 obey the code as for non-UTF-8 characters below, though in this case the
3188 value of fc will always be < 128. */
3189 }
3190 else
3191 #endif /* SUPPORT_UTF8 */
3192
3193 /* When not in UTF-8 mode, load a single-byte character. */
3194
3195 fc = *ecode++;
3196
3197 /* The value of fc at this point is always less than 256, though we may or
3198 may not be in UTF-8 mode. The code is duplicated for the caseless and
3199 caseful cases, for speed, since matching characters is likely to be quite
3200 common. First, ensure the minimum number of matches are present. If min =
3201 max, continue at the same level without recursing. Otherwise, if
3202 minimizing, keep trying the rest of the expression and advancing one
3203 matching character if failing, up to the maximum. Alternatively, if
3204 maximizing, find the maximum number of characters and work backwards. */
3205
3206 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3207 max, eptr));
3208
3209 if (op >= OP_STARI) /* Caseless */
3210 {
3211 fc = md->lcc[fc];
3212 for (i = 1; i <= min; i++)
3213 {
3214 if (eptr >= md->end_subject)
3215 {
3216 SCHECK_PARTIAL();
3217 MRRETURN(MATCH_NOMATCH);
3218 }
3219 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3220 }
3221 if (min == max) continue;
3222 if (minimize)
3223 {
3224 for (fi = min;; fi++)
3225 {
3226 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3227 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3228 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3229 if (eptr >= md->end_subject)
3230 {
3231 SCHECK_PARTIAL();
3232 MRRETURN(MATCH_NOMATCH);
3233 }
3234 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3235 }
3236 /* Control never gets here */
3237 }
3238 else /* Maximize */
3239 {
3240 pp = eptr;
3241 for (i = min; i < max; i++)
3242 {
3243 if (eptr >= md->end_subject)
3244 {
3245 SCHECK_PARTIAL();
3246 break;
3247 }
3248 if (fc != md->lcc[*eptr]) break;
3249 eptr++;
3250 }
3251
3252 if (possessive) continue;
3253
3254 while (eptr >= pp)
3255 {
3256 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3257 eptr--;
3258 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3259 }
3260 MRRETURN(MATCH_NOMATCH);
3261 }
3262 /* Control never gets here */
3263 }
3264
3265 /* Caseful comparisons. fc is a single byte here; multibyte repeats were fully handled in the UTF-8 block above. */
3266
3267 else
3268 {
3269 for (i = 1; i <= min; i++)
3270 {
3271 if (eptr >= md->end_subject)
3272 {
3273 SCHECK_PARTIAL();
3274 MRRETURN(MATCH_NOMATCH);
3275 }
3276 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3277 }
3278
3279 if (min == max) continue;
3280
3281 if (minimize)
3282 {
3283 for (fi = min;; fi++)
3284 {
3285 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3286 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3287 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3288 if (eptr >= md->end_subject)
3289 {
3290 SCHECK_PARTIAL();
3291 MRRETURN(MATCH_NOMATCH);
3292 }
3293 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3294 }
3295 /* Control never gets here */
3296 }
3297 else /* Maximize */
3298 {
3299 pp = eptr;
3300 for (i = min; i < max; i++)
3301 {
3302 if (eptr >= md->end_subject)
3303 {
3304 SCHECK_PARTIAL();
3305 break;
3306 }
3307 if (fc != *eptr) break;
3308 eptr++;
3309 }
3310 if (possessive) continue;
3311
3312 while (eptr >= pp)
3313 {
3314 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3315 eptr--;
3316 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3317 }
3318 MRRETURN(MATCH_NOMATCH);
3319 }
3320 }
3321 /* Control never gets here */
3322
3323 /* Match a negated single one-byte character. The pattern character is one
3324 byte, but the subject character we are checking can be multibyte. */
3325
3326 case OP_NOT:
3327 case OP_NOTI:
3328 if (eptr >= md->end_subject)
3329 {
3330 SCHECK_PARTIAL();
3331 MRRETURN(MATCH_NOMATCH);
3332 }
3333 ecode++;
3334 GETCHARINCTEST(c, eptr);
3335 if (op == OP_NOTI) /* The caseless case */
3336 {
3337 #ifdef SUPPORT_UTF8
3338 if (c < 256)
3339 #endif
3340 c = md->lcc[c];
3341 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3342 }
3343 else /* Caseful */
3344 {
3345 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3346 }
3347 break;
3348
3349 /* Match a negated single one-byte character repeatedly. This is almost a
3350 repeat of the code for a repeated single character, but I haven't found a
3351 nice way of commoning these up that doesn't require a test of the
3352 positive/negative option for each character match. Maybe that wouldn't add
3353 very much to the time taken, but character matching *is* what this is all
3354 about... */
3355
3356 case OP_NOTEXACT:
3357 case OP_NOTEXACTI:
3358 min = max = GET2(ecode, 1);
3359 ecode += 3;
3360 goto REPEATNOTCHAR;
3361
3362 case OP_NOTUPTO:
3363 case OP_NOTUPTOI:
3364 case OP_NOTMINUPTO:
3365 case OP_NOTMINUPTOI:
3366 min = 0;
3367 max = GET2(ecode, 1);
3368 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3369 ecode += 3;
3370 goto REPEATNOTCHAR;
3371
3372 case OP_NOTPOSSTAR:
3373 case OP_NOTPOSSTARI:
3374 possessive = TRUE;
3375 min = 0;
3376 max = INT_MAX;
3377 ecode++;
3378 goto REPEATNOTCHAR;
3379
3380 case OP_NOTPOSPLUS:
3381 case OP_NOTPOSPLUSI:
3382 possessive = TRUE;
3383 min = 1;
3384 max = INT_MAX;
3385 ecode++;
3386 goto REPEATNOTCHAR;
3387
3388 case OP_NOTPOSQUERY:
3389 case OP_NOTPOSQUERYI:
3390 possessive = TRUE;
3391 min = 0;
3392 max = 1;
3393 ecode++;
3394 goto REPEATNOTCHAR;
3395
3396 case OP_NOTPOSUPTO:
3397 case OP_NOTPOSUPTOI:
3398 possessive = TRUE;
3399 min = 0;
3400 max = GET2(ecode, 1);
3401 ecode += 3;
3402 goto REPEATNOTCHAR;
3403
3404 case OP_NOTSTAR:
3405 case OP_NOTSTARI:
3406 case OP_NOTMINSTAR:
3407 case OP_NOTMINSTARI:
3408 case OP_NOTPLUS:
3409 case OP_NOTPLUSI:
3410 case OP_NOTMINPLUS:
3411 case OP_NOTMINPLUSI:
3412 case OP_NOTQUERY:
3413 case OP_NOTQUERYI:
3414 case OP_NOTMINQUERY:
3415 case OP_NOTMINQUERYI:
3416 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3417 minimize = (c & 1) != 0;
3418 min = rep_min[c]; /* Pick up values from tables; */
3419 max = rep_max[c]; /* zero for max => infinity */
3420 if (max == 0) max = INT_MAX;
3421
3422 /* Common code for all repeated negated single-byte matches. */
3423
3424 REPEATNOTCHAR:
3425 fc = *ecode++;
3426
3427 /* The code is duplicated for the caseless and caseful cases, for speed,
3428 since matching characters is likely to be quite common. First, ensure the
3429 minimum number of matches are present. If min = max, continue at the same
3430 level without recursing. Otherwise, if minimizing, keep trying the rest of
3431 the expression and advancing one matching character if failing, up to the
3432 maximum. Alternatively, if maximizing, find the maximum number of
3433 characters and work backwards. */
3434
3435 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3436 max, eptr));
3437
3438 if (op >= OP_NOTSTARI) /* Caseless */
3439 {
3440 fc = md->lcc[fc];
3441
3442 #ifdef SUPPORT_UTF8
3443 /* UTF-8 mode */
3444 if (utf8)
3445 {
3446 register unsigned int d;
3447 for (i = 1; i <= min; i++)
3448 {
3449 if (eptr >= md->end_subject)
3450 {
3451 SCHECK_PARTIAL();
3452 MRRETURN(MATCH_NOMATCH);
3453 }
3454 GETCHARINC(d, eptr);
3455 if (d < 256) d = md->lcc[d];
3456 if (fc == d) MRRETURN(MATCH_NOMATCH);
3457 }
3458 }
3459 else
3460 #endif
3461
3462 /* Not UTF-8 mode */
3463 {
3464 for (i = 1; i <= min; i++)
3465 {
3466 if (eptr >= md->end_subject)
3467 {
3468 SCHECK_PARTIAL();
3469 MRRETURN(MATCH_NOMATCH);
3470 }
3471 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3472 }
3473 }
3474
3475 if (min == max) continue;
3476
3477 if (minimize)
3478 {
3479 #ifdef SUPPORT_UTF8
3480 /* UTF-8 mode */
3481 if (utf8)
3482 {
3483 register unsigned int d;
3484 for (fi = min;; fi++)
3485 {
3486 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3487 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3488 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3489 if (eptr >= md->end_subject)
3490 {
3491 SCHECK_PARTIAL();
3492 MRRETURN(MATCH_NOMATCH);
3493 }
3494 GETCHARINC(d, eptr);
3495 if (d < 256) d = md->lcc[d];
3496 if (fc == d) MRRETURN(MATCH_NOMATCH);
3497 }
3498 }
3499 else
3500 #endif
3501 /* Not UTF-8 mode */
3502 {
3503 for (fi = min;; fi++)
3504 {
3505 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3506 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3507 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3508 if (eptr >= md->end_subject)
3509 {
3510 SCHECK_PARTIAL();
3511 MRRETURN(MATCH_NOMATCH);
3512 }
3513 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3514 }
3515 }
3516 /* Control never gets here */
3517 }
3518
3519 /* Maximize case */
3520
3521 else
3522 {
3523 pp = eptr;
3524
3525 #ifdef SUPPORT_UTF8
3526 /* UTF-8 mode */
3527 if (utf8)
3528 {
3529 register unsigned int d;
3530 for (i = min; i < max; i++)
3531 {
3532 int len = 1;
3533 if (eptr >= md->end_subject)
3534 {
3535 SCHECK_PARTIAL();
3536 break;
3537 }
3538 GETCHARLEN(d, eptr, len);
3539 if (d < 256) d = md->lcc[d];
3540 if (fc == d) break;
3541 eptr += len;
3542 }
3543 if (possessive) continue;
3544 for(;;)
3545 {
3546 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3547 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3548 if (eptr-- == pp) break; /* Stop if tried at original pos */
3549 BACKCHAR(eptr);
3550 }
3551 }
3552 else
3553 #endif
3554 /* Not UTF-8 mode */
3555 {
3556 for (i = min; i < max; i++)
3557 {
3558 if (eptr >= md->end_subject)
3559 {
3560 SCHECK_PARTIAL();
3561 break;
3562 }
3563 if (fc == md->lcc[*eptr]) break;
3564 eptr++;
3565 }
3566 if (possessive) continue;
3567 while (eptr >= pp)
3568 {
3569 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3571 eptr--;
3572 }
3573 }
3574
3575 MRRETURN(MATCH_NOMATCH);
3576 }
3577 /* Control never gets here */
3578 }
3579
3580 /* Caseful comparisons */
3581
3582 else
3583 {
3584 #ifdef SUPPORT_UTF8
3585 /* UTF-8 mode */
3586 if (utf8)
3587 {
3588 register unsigned int d;
3589 for (i = 1; i <= min; i++)
3590 {
3591 if (eptr >= md->end_subject)
3592 {
3593 SCHECK_PARTIAL();
3594 MRRETURN(MATCH_NOMATCH);
3595 }
3596 GETCHARINC(d, eptr);
3597 if (fc == d) MRRETURN(MATCH_NOMATCH);
3598 }
3599 }
3600 else
3601 #endif
3602 /* Not UTF-8 mode */
3603 {
3604 for (i = 1; i <= min; i++)
3605 {
3606 if (eptr >= md->end_subject)
3607 {
3608 SCHECK_PARTIAL();
3609 MRRETURN(MATCH_NOMATCH);
3610 }
3611 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3612 }
3613 }
3614
3615 if (min == max) continue;
3616
3617 if (minimize)
3618 {
3619 #ifdef SUPPORT_UTF8
3620 /* UTF-8 mode */
3621 if (utf8)
3622 {
3623 register unsigned int d;
3624 for (fi = min;; fi++)
3625 {
3626 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3628 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3629 if (eptr >= md->end_subject)
3630 {
3631 SCHECK_PARTIAL();
3632 MRRETURN(MATCH_NOMATCH);
3633 }
3634 GETCHARINC(d, eptr);
3635 if (fc == d) MRRETURN(MATCH_NOMATCH);
3636 }
3637 }
3638 else
3639 #endif
3640 /* Not UTF-8 mode */
3641 {
3642 for (fi = min;; fi++)
3643 {
3644 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3645 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3646 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3647 if (eptr >= md->end_subject)
3648 {
3649 SCHECK_PARTIAL();
3650 MRRETURN(MATCH_NOMATCH);
3651 }
3652 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3653 }
3654 }
3655 /* Control never gets here */
3656 }
3657
3658 /* Maximize case */
3659
3660 else
3661 {
3662 pp = eptr;
3663
3664 #ifdef SUPPORT_UTF8
3665 /* UTF-8 mode */
3666 if (utf8)
3667 {
3668 register unsigned int d;
3669 for (i = min; i < max; i++)
3670 {
3671 int len = 1;
3672 if (eptr >= md->end_subject)
3673 {
3674 SCHECK_PARTIAL();
3675 break;
3676 }
3677 GETCHARLEN(d, eptr, len);
3678 if (fc == d) break;
3679 eptr += len;
3680 }
3681 if (possessive) continue;
3682 for(;;)
3683 {
3684 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3685 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3686 if (eptr-- == pp) break; /* Stop if tried at original pos */
3687 BACKCHAR(eptr);
3688 }
3689 }
3690 else
3691 #endif
3692 /* Not UTF-8 mode */
3693 {
3694 for (i = min; i < max; i++)
3695 {
3696 if (eptr >= md->end_subject)
3697 {
3698 SCHECK_PARTIAL();
3699 break;
3700 }
3701 if (fc == *eptr) break;
3702 eptr++;
3703 }
3704 if (possessive) continue;
3705 while (eptr >= pp)
3706 {
3707 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3709 eptr--;
3710 }
3711 }
3712
3713 MRRETURN(MATCH_NOMATCH);
3714 }
3715 }
3716 /* Control never gets here */
3717
3718 /* Match a single character type repeatedly; several different opcodes
3719 share code. This is very similar to the code for single characters, but we
3720 repeat it in the interests of efficiency. */
3721
3722 case OP_TYPEEXACT:
3723 min = max = GET2(ecode, 1);
3724 minimize = TRUE;
3725 ecode += 3;
3726 goto REPEATTYPE;
3727
3728 case OP_TYPEUPTO:
3729 case OP_TYPEMINUPTO:
3730 min = 0;
3731 max = GET2(ecode, 1);
3732 minimize = *ecode == OP_TYPEMINUPTO;
3733 ecode += 3;
3734 goto REPEATTYPE;
3735
3736 case OP_TYPEPOSSTAR:
3737 possessive = TRUE;
3738 min = 0;
3739 max = INT_MAX;
3740 ecode++;
3741 goto REPEATTYPE;
3742
3743 case OP_TYPEPOSPLUS:
3744 possessive = TRUE;
3745 min = 1;
3746 max = INT_MAX;
3747 ecode++;
3748 goto REPEATTYPE;
3749
3750 case OP_TYPEPOSQUERY:
3751 possessive = TRUE;
3752 min = 0;
3753 max = 1;
3754 ecode++;
3755 goto REPEATTYPE;
3756
3757 case OP_TYPEPOSUPTO:
3758 possessive = TRUE;
3759 min = 0;
3760 max = GET2(ecode, 1);
3761 ecode += 3;
3762 goto REPEATTYPE;
3763
3764 case OP_TYPESTAR:
3765 case OP_TYPEMINSTAR:
3766 case OP_TYPEPLUS:
3767 case OP_TYPEMINPLUS:
3768 case OP_TYPEQUERY:
3769 case OP_TYPEMINQUERY:
3770 c = *ecode++ - OP_TYPESTAR;
3771 minimize = (c & 1) != 0;
3772 min = rep_min[c]; /* Pick up values from tables; */
3773 max = rep_max[c]; /* zero for max => infinity */
3774 if (max == 0) max = INT_MAX;
3775
3776 /* Common code for all repeated single character type matches. Note that
3777 in UTF-8 mode, '.' matches a character of any length. OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR match only
3778 one-byte characters, but OP_HSPACE, OP_VSPACE, OP_ANYNL and the OP_NOT_xxx types can match longer ones. */
3779
3780 REPEATTYPE:
3781 ctype = *ecode++; /* Code for the character type */
3782
3783 #ifdef SUPPORT_UCP
3784 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3785 {
3786 prop_fail_result = ctype == OP_NOTPROP;
3787 prop_type = *ecode++;
3788 prop_value = *ecode++;
3789 }
3790 else prop_type = -1;
3791 #endif
3792
3793 /* First, ensure the minimum number of matches are present. Use inline
3794 code for maximizing the speed, and do the type test once at the start
3795 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3796 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3797 and single-bytes. */
3798
3799 if (min > 0)
3800 {
3801 #ifdef SUPPORT_UCP
3802 if (prop_type >= 0)
3803 {
3804 switch(prop_type)
3805 {
3806 case PT_ANY:
3807 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3808 for (i = 1; i <= min; i++)
3809 {
3810 if (eptr >= md->end_subject)
3811 {
3812 SCHECK_PARTIAL();
3813 MRRETURN(MATCH_NOMATCH);
3814 }
3815 GETCHARINCTEST(c, eptr);
3816 }
3817 break;
3818
3819 case PT_LAMP:
3820 for (i = 1; i <= min; i++)
3821 {
3822 int chartype;
3823 if (eptr >= md->end_subject)
3824 {
3825 SCHECK_PARTIAL();
3826 MRRETURN(MATCH_NOMATCH);
3827 }
3828 GETCHARINCTEST(c, eptr);
3829 chartype = UCD_CHARTYPE(c);
3830 if ((chartype == ucp_Lu ||
3831 chartype == ucp_Ll ||
3832 chartype == ucp_Lt) == prop_fail_result)
3833 MRRETURN(MATCH_NOMATCH);
3834 }
3835 break;
3836
3837 case PT_GC:
3838 for (i = 1; i <= min; i++)
3839 {
3840 if (eptr >= md->end_subject)
3841 {
3842 SCHECK_PARTIAL();
3843 MRRETURN(MATCH_NOMATCH);
3844 }
3845 GETCHARINCTEST(c, eptr);
3846 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3847 MRRETURN(MATCH_NOMATCH);
3848 }
3849 break;
3850
3851 case PT_PC:
3852 for (i = 1; i <= min; i++)
3853 {
3854 if (eptr >= md->end_subject)
3855 {
3856 SCHECK_PARTIAL();
3857 MRRETURN(MATCH_NOMATCH);
3858 }
3859 GETCHARINCTEST(c, eptr);
3860 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3861 MRRETURN(MATCH_NOMATCH);
3862 }
3863 break;
3864
3865 case PT_SC:
3866 for (i = 1; i <= min; i++)
3867 {
3868 if (eptr >= md->end_subject)
3869 {
3870 SCHECK_PARTIAL();
3871 MRRETURN(MATCH_NOMATCH);
3872 }
3873 GETCHARINCTEST(c, eptr);
3874 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3875 MRRETURN(MATCH_NOMATCH);
3876 }
3877 break;
3878
3879 case PT_ALNUM:
3880 for (i = 1; i <= min; i++)
3881 {
3882 int category;
3883 if (eptr >= md->end_subject)
3884 {
3885 SCHECK_PARTIAL();
3886 MRRETURN(MATCH_NOMATCH);
3887 }
3888 GETCHARINCTEST(c, eptr);
3889 category = UCD_CATEGORY(c);
3890 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3891 MRRETURN(MATCH_NOMATCH);
3892 }
3893 break;
3894
3895 case PT_SPACE: /* Perl space */
3896 for (i = 1; i <= min; i++)
3897 {
3898 if (eptr >= md->end_subject)
3899 {
3900 SCHECK_PARTIAL();
3901 MRRETURN(MATCH_NOMATCH);
3902 }
3903 GETCHARINCTEST(c, eptr);
3904 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3905 c == CHAR_FF || c == CHAR_CR)
3906 == prop_fail_result)
3907 MRRETURN(MATCH_NOMATCH);
3908 }
3909 break;
3910
3911 case PT_PXSPACE: /* POSIX space */
3912 for (i = 1; i <= min; i++)
3913 {
3914 if (eptr >= md->end_subject)
3915 {
3916 SCHECK_PARTIAL();
3917 MRRETURN(MATCH_NOMATCH);
3918 }
3919 GETCHARINCTEST(c, eptr);
3920 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3921 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3922 == prop_fail_result)
3923 MRRETURN(MATCH_NOMATCH);
3924 }
3925 break;
3926
3927 case PT_WORD:
3928 for (i = 1; i <= min; i++)
3929 {
3930 int category;
3931 if (eptr >= md->end_subject)
3932 {
3933 SCHECK_PARTIAL();
3934 MRRETURN(MATCH_NOMATCH);
3935 }
3936 GETCHARINCTEST(c, eptr);
3937 category = UCD_CATEGORY(c);
3938 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3939 == prop_fail_result)
3940 MRRETURN(MATCH_NOMATCH);
3941 }
3942 break;
3943
3944 /* This should not occur */
3945
3946 default:
3947 RRETURN(PCRE_ERROR_INTERNAL);
3948 }
3949 }
3950
3951 /* Match extended Unicode sequences. We will get here only if the
3952 support is in the binary; otherwise a compile-time error occurs. */
3953
3954 else if (ctype == OP_EXTUNI)
3955 {
3956 for (i = 1; i <= min; i++)
3957 {
3958 if (eptr >= md->end_subject)
3959 {
3960 SCHECK_PARTIAL();
3961 MRRETURN(MATCH_NOMATCH);
3962 }
3963 GETCHARINCTEST(c, eptr);
3964 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3965 while (eptr < md->end_subject)
3966 {
3967 int len = 1;
3968 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3969 if (UCD_CATEGORY(c) != ucp_M) break;
3970 eptr += len;
3971 }
3972 }
3973 }
3974
3975 else
3976 #endif /* SUPPORT_UCP */
3977
3978 /* Handle all other cases when the coding is UTF-8 */
3979
3980 #ifdef SUPPORT_UTF8
3981 if (utf8) switch(ctype)
3982 {
3983 case OP_ANY:
3984 for (i = 1; i <= min; i++)
3985 {
3986 if (eptr >= md->end_subject)
3987 {
3988 SCHECK_PARTIAL();
3989 MRRETURN(MATCH_NOMATCH);
3990 }
3991 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3992 eptr++;
3993 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3994 }
3995 break;
3996
3997 case OP_ALLANY:
3998 for (i = 1; i <= min; i++)
3999 {
4000 if (eptr >= md->end_subject)
4001 {
4002 SCHECK_PARTIAL();
4003 MRRETURN(MATCH_NOMATCH);
4004 }
4005 eptr++;
4006 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4007 }
4008 break;
4009
4010 case OP_ANYBYTE:
4011 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4012 eptr += min;
4013 break;
4014
4015 case OP_ANYNL:
4016 for (i = 1; i <= min; i++)
4017 {
4018 if (eptr >= md->end_subject)
4019 {
4020 SCHECK_PARTIAL();
4021 MRRETURN(MATCH_NOMATCH);
4022 }
4023 GETCHARINC(c, eptr);
4024 switch(c)
4025 {
4026 default: MRRETURN(MATCH_NOMATCH);
4027
4028 case 0x000d:
4029 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4030 break;
4031
4032 case 0x000a:
4033 break;
4034
4035 case 0x000b:
4036 case 0x000c:
4037 case 0x0085:
4038 case 0x2028:
4039 case 0x2029:
4040 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4041 break;
4042 }
4043 }
4044 break;
4045
4046 case OP_NOT_HSPACE:
4047 for (i = 1; i <= min; i++)
4048 {
4049 if (eptr >= md->end_subject)
4050 {
4051 SCHECK_PARTIAL();
4052 MRRETURN(MATCH_NOMATCH);
4053 }
4054 GETCHARINC(c, eptr);
4055 switch(c)
4056 {
4057 default: break;
4058 case 0x09: /* HT */
4059 case 0x20: /* SPACE */
4060 case 0xa0: /* NBSP */
4061 case 0x1680: /* OGHAM SPACE MARK */
4062 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4063 case 0x2000: /* EN QUAD */
4064 case 0x2001: /* EM QUAD */
4065 case 0x2002: /* EN SPACE */
4066 case 0x2003: /* EM SPACE */
4067 case 0x2004: /* THREE-PER-EM SPACE */
4068 case 0x2005: /* FOUR-PER-EM SPACE */
4069 case 0x2006: /* SIX-PER-EM SPACE */
4070 case 0x2007: /* FIGURE SPACE */
4071 case 0x2008: /* PUNCTUATION SPACE */
4072 case 0x2009: /* THIN SPACE */
4073 case 0x200A: /* HAIR SPACE */
4074 case 0x202f: /* NARROW NO-BREAK SPACE */
4075 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4076 case 0x3000: /* IDEOGRAPHIC SPACE */
4077 MRRETURN(MATCH_NOMATCH);
4078 }
4079 }
4080 break;
4081
4082 case OP_HSPACE:
4083 for (i = 1; i <= min; i++)
4084 {
4085 if (eptr >= md->end_subject)
4086 {
4087 SCHECK_PARTIAL();
4088 MRRETURN(MATCH_NOMATCH);
4089 }
4090 GETCHARINC(c, eptr);
4091 switch(c)
4092 {
4093 default: MRRETURN(MATCH_NOMATCH);
4094 case 0x09: /* HT */
4095 case 0x20: /* SPACE */
4096 case 0xa0: /* NBSP */
4097 case 0x1680: /* OGHAM SPACE MARK */
4098 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4099 case 0x2000: /* EN QUAD */
4100 case 0x2001: /* EM QUAD */
4101 case 0x2002: /* EN SPACE */
4102 case 0x2003: /* EM SPACE */
4103 case 0x2004: /* THREE-PER-EM SPACE */
4104 case 0x2005: /* FOUR-PER-EM SPACE */
4105 case 0x2006: /* SIX-PER-EM SPACE */
4106 case 0x2007: /* FIGURE SPACE */
4107 case 0x2008: /* PUNCTUATION SPACE */
4108 case 0x2009: /* THIN SPACE */
4109 case 0x200A: /* HAIR SPACE */
4110 case 0x202f: /* NARROW NO-BREAK SPACE */
4111 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4112 case 0x3000: /* IDEOGRAPHIC SPACE */
4113 break;
4114 }
4115 }
4116 break;
4117
4118 case OP_NOT_VSPACE:
4119 for (i = 1; i <= min; i++)
4120 {
4121 if (eptr >= md->end_subject)
4122 {
4123 SCHECK_PARTIAL();
4124 MRRETURN(MATCH_NOMATCH);
4125 }
4126 GETCHARINC(c, eptr);
4127 switch(c)
4128 {
4129 default: break;
4130 case 0x0a: /* LF */
4131 case 0x0b: /* VT */
4132 case 0x0c: /* FF */
4133 case 0x0d: /* CR */
4134 case 0x85: /* NEL */
4135 case 0x2028: /* LINE SEPARATOR */
4136 case 0x2029: /* PARAGRAPH SEPARATOR */
4137 MRRETURN(MATCH_NOMATCH);
4138 }
4139 }
4140 break;
4141
4142 case OP_VSPACE:
4143 for (i = 1; i <= min; i++)
4144 {
4145 if (eptr >= md->end_subject)
4146 {
4147 SCHECK_PARTIAL();
4148 MRRETURN(MATCH_NOMATCH);
4149 }
4150 GETCHARINC(c, eptr);
4151 switch(c)
4152 {
4153 default: MRRETURN(MATCH_NOMATCH);
4154 case 0x0a: /* LF */
4155 case 0x0b: /* VT */
4156 case 0x0c: /* FF */
4157 case 0x0d: /* CR */
4158 case 0x85: /* NEL */
4159 case 0x2028: /* LINE SEPARATOR */
4160 case 0x2029: /* PARAGRAPH SEPARATOR */
4161 break;
4162 }
4163 }
4164 break;
4165
4166 case OP_NOT_DIGIT:
4167 for (i = 1; i <= min; i++)
4168 {
4169 if (eptr >= md->end_subject)
4170 {
4171 SCHECK_PARTIAL();
4172 MRRETURN(MATCH_NOMATCH);
4173 }
4174 GETCHARINC(c, eptr);
4175 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4176 MRRETURN(MATCH_NOMATCH);
4177 }
4178 break;
4179
4180 case OP_DIGIT:
4181 for (i = 1; i <= min; i++)
4182 {
4183 if (eptr >= md->end_subject)
4184 {
4185 SCHECK_PARTIAL();
4186 MRRETURN(MATCH_NOMATCH);
4187 }
4188 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4189 MRRETURN(MATCH_NOMATCH);
4190 /* No need to skip more bytes - we know it's a 1-byte character */
4191 }
4192 break;
4193
4194 case OP_NOT_WHITESPACE:
4195 for (i = 1; i <= min; i++)
4196 {
4197 if (eptr >= md->end_subject)
4198 {
4199 SCHECK_PARTIAL();
4200 MRRETURN(MATCH_NOMATCH);
4201 }
4202 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4203 MRRETURN(MATCH_NOMATCH);
4204 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4205 }
4206 break;
4207
4208 case OP_WHITESPACE:
4209 for (i = 1; i <= min; i++)
4210 {
4211 if (eptr >= md->end_subject)
4212 {
4213 SCHECK_PARTIAL();
4214 MRRETURN(MATCH_NOMATCH);
4215 }
4216 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4217 MRRETURN(MATCH_NOMATCH);
4218 /* No need to skip more bytes - we know it's a 1-byte character */
4219 }
4220 break;
4221
4222 case OP_NOT_WORDCHAR:
4223 for (i = 1; i <= min; i++)
4224 {
4225 if (eptr >= md->end_subject)
4226 {
4227 SCHECK_PARTIAL();
4228 MRRETURN(MATCH_NOMATCH);
4229 }
4230 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4231 MRRETURN(MATCH_NOMATCH);
4232 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4233 }
4234 break;
4235
4236 case OP_WORDCHAR:
4237 for (i = 1; i <= min; i++)
4238 {
4239 if (eptr >= md->end_subject)
4240 {
4241 SCHECK_PARTIAL();
4242 MRRETURN(MATCH_NOMATCH);
4243 }
4244 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4245 MRRETURN(MATCH_NOMATCH);
4246 /* No need to skip more bytes - we know it's a 1-byte character */
4247 }
4248 break;
4249
4250 default:
4251 RRETURN(PCRE_ERROR_INTERNAL);
4252 } /* End switch(ctype) */
4253
4254 else
4255 #endif /* SUPPORT_UTF8 */
4256
4257 /* Code for the non-UTF-8 case for minimum matching of operators other
4258 than OP_PROP and OP_NOTPROP. */
4259
4260 switch(ctype)
4261 {
4262 case OP_ANY:
4263 for (i = 1; i <= min; i++)
4264 {
4265 if (eptr >= md->end_subject)
4266 {
4267 SCHECK_PARTIAL();
4268 MRRETURN(MATCH_NOMATCH);
4269 }
4270 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4271 eptr++;
4272 }
4273 break;
4274
4275 case OP_ALLANY:
4276 if (eptr > md->end_subject - min)
4277 {
4278 SCHECK_PARTIAL();
4279 MRRETURN(MATCH_NOMATCH);
4280 }
4281 eptr += min;
4282 break;
4283
4284 case OP_ANYBYTE:
4285 if (eptr > md->end_subject - min)
4286 {
4287 SCHECK_PARTIAL();
4288 MRRETURN(MATCH_NOMATCH);
4289 }
4290 eptr += min;
4291 break;
4292
4293 case OP_ANYNL:
4294 for (i = 1; i <= min; i++)
4295 {
4296 if (eptr >= md->end_subject)
4297 {
4298 SCHECK_PARTIAL();
4299 MRRETURN(MATCH_NOMATCH);
4300 }
4301 switch(*eptr++)
4302 {
4303 default: MRRETURN(MATCH_NOMATCH);
4304
4305 case 0x000d:
4306 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4307 break;
4308
4309 case 0x000a:
4310 break;
4311
4312 case 0x000b:
4313 case 0x000c:
4314 case 0x0085:
4315 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4316 break;
4317 }
4318 }
4319 break;
4320
4321 case OP_NOT_HSPACE:
4322 for (i = 1; i <= min; i++)
4323 {
4324 if (eptr >= md->end_subject)
4325 {
4326 SCHECK_PARTIAL();
4327 MRRETURN(MATCH_NOMATCH);
4328 }
4329 switch(*eptr++)
4330 {
4331 default: break;
4332 case 0x09: /* HT */
4333 case 0x20: /* SPACE */
4334 case 0xa0: /* NBSP */
4335 MRRETURN(MATCH_NOMATCH);
4336 }
4337 }
4338 break;
4339
4340 case OP_HSPACE:
4341 for (i = 1; i <= min; i++)
4342 {
4343 if (eptr >= md->end_subject)
4344 {
4345 SCHECK_PARTIAL();
4346 MRRETURN(MATCH_NOMATCH);
4347 }
4348 switch(*eptr++)
4349 {
4350 default: MRRETURN(MATCH_NOMATCH);
4351 case 0x09: /* HT */
4352 case 0x20: /* SPACE */
4353 case 0xa0: /* NBSP */
4354 break;
4355 }
4356 }
4357 break;
4358
4359 case OP_NOT_VSPACE:
4360 for (i = 1; i <= min; i++)
4361 {
4362 if (eptr >= md->end_subject)
4363 {
4364 SCHECK_PARTIAL();
4365 MRRETURN(MATCH_NOMATCH);
4366 }
4367 switch(*eptr++)
4368 {
4369 default: break;
4370 case 0x0a: /* LF */
4371 case 0x0b: /* VT */
4372 case 0x0c: /* FF */
4373 case 0x0d: /* CR */
4374 case 0x85: /* NEL */
4375 MRRETURN(MATCH_NOMATCH);
4376 }
4377 }
4378 break;
4379
4380 case OP_VSPACE:
4381 for (i = 1; i <= min; i++)
4382 {
4383 if (eptr >= md->end_subject)
4384 {
4385 SCHECK_PARTIAL();
4386 MRRETURN(MATCH_NOMATCH);
4387 }
4388 switch(*eptr++)
4389 {
4390 default: MRRETURN(MATCH_NOMATCH);
4391 case 0x0a: /* LF */
4392 case 0x0b: /* VT */
4393 case 0x0c: /* FF */
4394 case 0x0d: /* CR */
4395 case 0x85: /* NEL */
4396 break;
4397 }
4398 }
4399 break;
4400
4401 case OP_NOT_DIGIT:
4402 for (i = 1; i <= min; i++)
4403 {
4404 if (eptr >= md->end_subject)
4405 {
4406 SCHECK_PARTIAL();
4407 MRRETURN(MATCH_NOMATCH);
4408 }
4409 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4410 }
4411 break;
4412
4413 case OP_DIGIT:
4414 for (i = 1; i <= min; i++)
4415 {
4416 if (eptr >= md->end_subject)
4417 {
4418 SCHECK_PARTIAL();
4419 MRRETURN(MATCH_NOMATCH);
4420 }
4421 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4422 }
4423 break;
4424
4425 case OP_NOT_WHITESPACE:
4426 for (i = 1; i <= min; i++)
4427 {
4428 if (eptr >= md->end_subject)
4429 {
4430 SCHECK_PARTIAL();
4431 MRRETURN(MATCH_NOMATCH);
4432 }
4433 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4434 }
4435 break;
4436
4437 case OP_WHITESPACE:
4438 for (i = 1; i <= min; i++)
4439 {
4440 if (eptr >= md->end_subject)
4441 {
4442 SCHECK_PARTIAL();
4443 MRRETURN(MATCH_NOMATCH);
4444 }
4445 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4446 }
4447 break;
4448
4449 case OP_NOT_WORDCHAR:
4450 for (i = 1; i <= min; i++)
4451 {
4452 if (eptr >= md->end_subject)
4453 {
4454 SCHECK_PARTIAL();
4455 MRRETURN(MATCH_NOMATCH);
4456 }
4457 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4458 MRRETURN(MATCH_NOMATCH);
4459 }
4460 break;
4461
4462 case OP_WORDCHAR:
4463 for (i = 1; i <= min; i++)
4464 {
4465 if (eptr >= md->end_subject)
4466 {
4467 SCHECK_PARTIAL();
4468 MRRETURN(MATCH_NOMATCH);
4469 }
4470 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4471 MRRETURN(MATCH_NOMATCH);
4472 }
4473 break;
4474
4475 default:
4476 RRETURN(PCRE_ERROR_INTERNAL);
4477 }
4478 }
4479
4480 /* If min = max, continue at the same level without recursing */
4481
4482 if (min == max) continue;
4483
4484 /* If minimizing, we have to test the rest of the pattern before each
4485 subsequent match. Again, separate the UTF-8 case for speed, and also
4486 separate the UCP cases. */
4487
4488 if (minimize)
4489 {
4490 #ifdef SUPPORT_UCP
4491 if (prop_type >= 0)
4492 {
4493 switch(prop_type)
4494 {
4495 case PT_ANY:
4496 for (fi = min;; fi++)
4497 {
4498 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4499 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4500 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4501 if (eptr >= md->end_subject)
4502 {
4503 SCHECK_PARTIAL();
4504 MRRETURN(MATCH_NOMATCH);
4505 }
4506 GETCHARINCTEST(c, eptr);
4507 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4508 }
4509 /* Control never gets here */
4510
4511 case PT_LAMP:
4512 for (fi = min;; fi++)
4513 {
4514 int chartype;
4515 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4516 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4517 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4518 if (eptr >= md->end_subject)
4519 {
4520 SCHECK_PARTIAL();
4521 MRRETURN(MATCH_NOMATCH);
4522 }
4523 GETCHARINCTEST(c, eptr);
4524 chartype = UCD_CHARTYPE(c);
4525 if ((chartype == ucp_Lu ||
4526 chartype == ucp_Ll ||
4527 chartype == ucp_Lt) == prop_fail_result)
4528 MRRETURN(MATCH_NOMATCH);
4529 }
4530 /* Control never gets here */
4531
4532 case PT_GC:
4533 for (fi = min;; fi++)
4534 {
4535 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4536 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4537 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4538 if (eptr >= md->end_subject)
4539 {
4540 SCHECK_PARTIAL();
4541 MRRETURN(MATCH_NOMATCH);
4542 }
4543 GETCHARINCTEST(c, eptr);
4544 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4545 MRRETURN(MATCH_NOMATCH);
4546 }
4547 /* Control never gets here */
4548
4549 case PT_PC:
4550 for (fi = min;; fi++)
4551 {
4552 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4553 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4554 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4555 if (eptr >= md->end_subject)
4556 {
4557 SCHECK_PARTIAL();
4558 MRRETURN(MATCH_NOMATCH);
4559 }
4560 GETCHARINCTEST(c, eptr);
4561 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4562 MRRETURN(MATCH_NOMATCH);
4563 }
4564 /* Control never gets here */
4565
4566 case PT_SC:
4567 for (fi = min;; fi++)
4568 {
4569 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4571 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4572 if (eptr >= md->end_subject)
4573 {
4574 SCHECK_PARTIAL();
4575 MRRETURN(MATCH_NOMATCH);
4576 }
4577 GETCHARINCTEST(c, eptr);
4578 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4579 MRRETURN(MATCH_NOMATCH);
4580 }
4581 /* Control never gets here */
4582
4583 case PT_ALNUM:
4584 for (fi = min;; fi++)
4585 {
4586 int category;
4587 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4588 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4589 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4590 if (eptr >= md->end_subject)
4591 {
4592 SCHECK_PARTIAL();
4593 MRRETURN(MATCH_NOMATCH);
4594 }
4595 GETCHARINCTEST(c, eptr);
4596 category = UCD_CATEGORY(c);
4597 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4598 MRRETURN(MATCH_NOMATCH);
4599 }
4600 /* Control never gets here */
4601
4602 case PT_SPACE: /* Perl space */
4603 for (fi = min;; fi++)
4604 {
4605 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4606 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4607 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4608 if (eptr >= md->end_subject)
4609 {
4610 SCHECK_PARTIAL();
4611 MRRETURN(MATCH_NOMATCH);
4612 }
4613 GETCHARINCTEST(c, eptr);
4614 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4615 c == CHAR_FF || c == CHAR_CR)
4616 == prop_fail_result)
4617 MRRETURN(MATCH_NOMATCH);
4618 }
4619 /* Control never gets here */
4620
4621 case PT_PXSPACE: /* POSIX space */
4622 for (fi = min;; fi++)
4623 {
4624 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4625 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4626 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4627 if (eptr >= md->end_subject)
4628 {
4629 SCHECK_PARTIAL();
4630 MRRETURN(MATCH_NOMATCH);
4631 }
4632 GETCHARINCTEST(c, eptr);
4633 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4634 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4635 == prop_fail_result)
4636 MRRETURN(MATCH_NOMATCH);
4637 }
4638 /* Control never gets here */
4639
4640 case PT_WORD:
4641 for (fi = min;; fi++)
4642 {
4643 int category;
4644 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4645 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4646 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4647 if (eptr >= md->end_subject)
4648 {
4649 SCHECK_PARTIAL();
4650 MRRETURN(MATCH_NOMATCH);
4651 }
4652 GETCHARINCTEST(c, eptr);
4653 category = UCD_CATEGORY(c);
4654 if ((category == ucp_L ||
4655 category == ucp_N ||
4656 c == CHAR_UNDERSCORE)
4657 == prop_fail_result)
4658 MRRETURN(MATCH_NOMATCH);
4659 }
4660 /* Control never gets here */
4661
4662 /* This should never occur */
4663
4664 default:
4665 RRETURN(PCRE_ERROR_INTERNAL);
4666 }
4667 }
4668
4669 /* Match extended Unicode sequences. We will get here only if the
4670 support is in the binary; otherwise a compile-time error occurs. */
4671
4672 else if (ctype == OP_EXTUNI)
4673 {
4674 for (fi = min;; fi++)
4675 {
4676 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4678 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4679 if (eptr >= md->end_subject)
4680 {
4681 SCHECK_PARTIAL();
4682 MRRETURN(MATCH_NOMATCH);
4683 }
4684 GETCHARINCTEST(c, eptr);
4685 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4686 while (eptr < md->end_subject)
4687 {
4688 int len = 1;
4689 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4690 if (UCD_CATEGORY(c) != ucp_M) break;
4691 eptr += len;
4692 }
4693 }
4694 }
4695 else
4696 #endif /* SUPPORT_UCP */
4697
4698 #ifdef SUPPORT_UTF8
4699 /* UTF-8 mode */
4700 if (utf8)
4701 {
4702 for (fi = min;; fi++)
4703 {
4704 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4705 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4706 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4707 if (eptr >= md->end_subject)
4708 {
4709 SCHECK_PARTIAL();
4710 MRRETURN(MATCH_NOMATCH);
4711 }
4712 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4713 MRRETURN(MATCH_NOMATCH);
4714 GETCHARINC(c, eptr);
4715 switch(ctype)
4716 {
4717 case OP_ANY: /* This is the non-NL case */
4718 case OP_ALLANY:
4719 case OP_ANYBYTE:
4720 break;
4721
4722 case OP_ANYNL:
4723 switch(c)
4724 {
4725 default: MRRETURN(MATCH_NOMATCH);
4726 case 0x000d:
4727 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4728 break;
4729 case 0x000a:
4730 break;
4731
4732 case 0x000b:
4733 case 0x000c:
4734 case 0x0085:
4735 case 0x2028:
4736 case 0x2029:
4737 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4738 break;
4739 }
4740 break;
4741
4742 case OP_NOT_HSPACE:
4743 switch(c)
4744 {
4745 default: break;
4746 case 0x09: /* HT */
4747 case 0x20: /* SPACE */
4748 case 0xa0: /* NBSP */
4749 case 0x1680: /* OGHAM SPACE MARK */
4750 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4751 case 0x2000: /* EN QUAD */
4752 case 0x2001: /* EM QUAD */
4753 case 0x2002: /* EN SPACE */
4754 case 0x2003: /* EM SPACE */
4755 case 0x2004: /* THREE-PER-EM SPACE */
4756 case 0x2005: /* FOUR-PER-EM SPACE */
4757 case 0x2006: /* SIX-PER-EM SPACE */
4758 case 0x2007: /* FIGURE SPACE */
4759 case 0x2008: /* PUNCTUATION SPACE */
4760 case 0x2009: /* THIN SPACE */
4761 case 0x200A: /* HAIR SPACE */
4762 case 0x202f: /* NARROW NO-BREAK SPACE */
4763 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4764 case 0x3000: /* IDEOGRAPHIC SPACE */
4765 MRRETURN(MATCH_NOMATCH);
4766 }
4767 break;
4768
4769 case OP_HSPACE:
4770 switch(c)
4771 {
4772 default: MRRETURN(MATCH_NOMATCH);
4773 case 0x09: /* HT */
4774 case 0x20: /* SPACE */
4775 case 0xa0: /* NBSP */
4776 case 0x1680: /* OGHAM SPACE MARK */
4777 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4778 case 0x2000: /* EN QUAD */
4779 case 0x2001: /* EM QUAD */
4780 case 0x2002: /* EN SPACE */
4781 case 0x2003: /* EM SPACE */
4782 case 0x2004: /* THREE-PER-EM SPACE */
4783 case 0x2005: /* FOUR-PER-EM SPACE */
4784 case 0x2006: /* SIX-PER-EM SPACE */
4785 case 0x2007: /* FIGURE SPACE */
4786 case 0x2008: /* PUNCTUATION SPACE */
4787 case 0x2009: /* THIN SPACE */
4788 case 0x200A: /* HAIR SPACE */
4789 case 0x202f: /* NARROW NO-BREAK SPACE */
4790 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4791 case 0x3000: /* IDEOGRAPHIC SPACE */
4792 break;
4793 }
4794 break;
4795
4796 case OP_NOT_VSPACE:
4797 switch(c)
4798 {
4799 default: break;
4800 case 0x0a: /* LF */
4801 case 0x0b: /* VT */
4802 case 0x0c: /* FF */
4803 case 0x0d: /* CR */
4804 case 0x85: /* NEL */
4805 case 0x2028: /* LINE SEPARATOR */
4806 case 0x2029: /* PARAGRAPH SEPARATOR */
4807 MRRETURN(MATCH_NOMATCH);
4808 }
4809 break;
4810
4811 case OP_VSPACE:
4812 switch(c)
4813 {
4814 default: MRRETURN(MATCH_NOMATCH);
4815 case 0x0a: /* LF */
4816 case 0x0b: /* VT */
4817 case 0x0c: /* FF */
4818 case 0x0d: /* CR */
4819 case 0x85: /* NEL */
4820 case 0x2028: /* LINE SEPARATOR */
4821 case 0x2029: /* PARAGRAPH SEPARATOR */
4822 break;
4823 }
4824 break;
4825
4826 case OP_NOT_DIGIT:
4827 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4828 MRRETURN(MATCH_NOMATCH);
4829 break;
4830
4831 case OP_DIGIT:
4832 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4833 MRRETURN(MATCH_NOMATCH);
4834 break;
4835
4836 case OP_NOT_WHITESPACE:
4837 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4838 MRRETURN(MATCH_NOMATCH);
4839 break;
4840
4841 case OP_WHITESPACE:
4842 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4843 MRRETURN(MATCH_NOMATCH);
4844 break;
4845
4846 case OP_NOT_WORDCHAR:
4847 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4848 MRRETURN(MATCH_NOMATCH);
4849 break;
4850
4851 case OP_WORDCHAR:
4852 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4853 MRRETURN(MATCH_NOMATCH);
4854 break;
4855
4856 default:
4857 RRETURN(PCRE_ERROR_INTERNAL);
4858 }
4859 }
4860 }
4861 else
4862 #endif
4863 /* Not UTF-8 mode */
4864 {
4865 for (fi = min;; fi++)
4866 {
4867 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4869 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4870 if (eptr >= md->end_subject)
4871 {
4872 SCHECK_PARTIAL();
4873 MRRETURN(MATCH_NOMATCH);
4874 }
4875 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4876 MRRETURN(MATCH_NOMATCH);
4877 c = *eptr++;
4878 switch(ctype)
4879 {
4880 case OP_ANY: /* This is the non-NL case */
4881 case OP_ALLANY:
4882 case OP_ANYBYTE:
4883 break;
4884
4885 case OP_ANYNL:
4886 switch(c)
4887 {
4888 default: MRRETURN(MATCH_NOMATCH);
4889 case 0x000d:
4890 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4891 break;
4892
4893 case 0x000a:
4894 break;
4895
4896 case 0x000b:
4897 case 0x000c:
4898 case 0x0085:
4899 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4900 break;
4901 }
4902 break;
4903
4904 case OP_NOT_HSPACE:
4905 switch(c)
4906 {
4907 default: break;
4908 case 0x09: /* HT */
4909 case 0x20: /* SPACE */
4910 case 0xa0: /* NBSP */
4911 MRRETURN(MATCH_NOMATCH);
4912 }
4913 break;
4914
4915 case OP_HSPACE:
4916 switch(c)
4917 {
4918 default: MRRETURN(MATCH_NOMATCH);
4919 case 0x09: /* HT */
4920 case 0x20: /* SPACE */
4921 case 0xa0: /* NBSP */
4922 break;
4923 }
4924 break;
4925
4926 case OP_NOT_VSPACE:
4927 switch(c)
4928 {
4929 default: break;
4930 case 0x0a: /* LF */
4931 case 0x0b: /* VT */
4932 case 0x0c: /* FF */
4933 case 0x0d: /* CR */
4934 case 0x85: /* NEL */
4935 MRRETURN(MATCH_NOMATCH);
4936 }
4937 break;
4938
4939 case OP_VSPACE:
4940 switch(c)
4941 {
4942 default: MRRETURN(MATCH_NOMATCH);
4943 case 0x0a: /* LF */
4944 case 0x0b: /* VT */
4945 case 0x0c: /* FF */
4946 case 0x0d: /* CR */
4947 case 0x85: /* NEL */
4948 break;
4949 }
4950 break;
4951
4952 case OP_NOT_DIGIT:
4953 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4954 break;
4955
4956 case OP_DIGIT:
4957 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4958 break;
4959
4960 case OP_NOT_WHITESPACE:
4961 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4962 break;
4963
4964 case OP_WHITESPACE:
4965 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4966 break;
4967
4968 case OP_NOT_WORDCHAR:
4969 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4970 break;
4971
4972 case OP_WORDCHAR:
4973 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4974 break;
4975
4976 default:
4977 RRETURN(PCRE_ERROR_INTERNAL);
4978 }
4979 }
4980 }
4981 /* Control never gets here */
4982 }
4983
4984 /* If maximizing, it is worth using inline code for speed, doing the type
4985 test once at the start (i.e. keep it out of the loop). Again, keep the
4986 UTF-8 and UCP stuff separate. */
4987
4988 else
4989 {
4990 pp = eptr; /* Remember where we started */
4991
4992 #ifdef SUPPORT_UCP
4993 if (prop_type >= 0)
4994 {
4995 switch(prop_type)
4996 {
4997 case PT_ANY:
4998 for (i = min; i < max; i++)
4999 {
5000 int len = 1;
5001 if (eptr >= md->end_subject)
5002 {
5003 SCHECK_PARTIAL();
5004 break;
5005 }
5006 GETCHARLENTEST(c, eptr, len);
5007 if (prop_fail_result) break;
5008 eptr+= len;
5009 }
5010 break;
5011
5012 case PT_LAMP:
5013 for (i = min; i < max; i++)
5014 {
5015 int chartype;
5016 int len = 1;
5017 if (eptr >= md->end_subject)
5018 {
5019 SCHECK_PARTIAL();
5020 break;
5021 }
5022 GETCHARLENTEST(c, eptr, len);
5023 chartype = UCD_CHARTYPE(c);
5024 if ((chartype == ucp_Lu ||
5025 chartype == ucp_Ll ||
5026 chartype == ucp_Lt) == prop_fail_result)
5027 break;
5028 eptr+= len;
5029 }
5030 break;
5031
5032 case PT_GC:
5033 for (i = min; i < max; i++)
5034 {
5035 int len = 1;
5036 if (eptr >= md->end_subject)
5037 {
5038 SCHECK_PARTIAL();
5039 break;
5040 }
5041 GETCHARLENTEST(c, eptr, len);
5042 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5043 eptr+= len;
5044 }
5045 break;
5046
5047 case PT_PC:
5048 for (i = min; i < max; i++)
5049 {
5050 int len = 1;
5051 if (eptr >= md->end_subject)
5052 {
5053 SCHECK_PARTIAL();
5054 break;
5055 }
5056 GETCHARLENTEST(c, eptr, len);
5057 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5058 eptr+= len;
5059 }
5060 break;
5061
5062 case PT_SC:
5063 for (i = min; i < max; i++)
5064 {
5065 int len = 1;
5066 if (eptr >= md->end_subject)
5067 {
5068 SCHECK_PARTIAL();
5069 break;
5070 }
5071 GETCHARLENTEST(c, eptr, len);
5072 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5073 eptr+= len;
5074 }
5075 break;
5076
5077 case PT_ALNUM:
5078 for (i = min; i < max; i++)
5079 {
5080 int category;
5081 int len = 1;
5082 if (eptr >= md->end_subject)
5083 {
5084 SCHECK_PARTIAL();
5085 break;
5086 }
5087 GETCHARLENTEST(c, eptr, len);
5088 category = UCD_CATEGORY(c);
5089 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5090 break;
5091 eptr+= len;
5092 }
5093 break;
5094
5095 case PT_SPACE: /* Perl space */
5096 for (i = min; i < max; i++)
5097 {
5098 int len = 1;
5099 if (eptr >= md->end_subject)
5100 {
5101 SCHECK_PARTIAL();
5102 break;
5103 }
5104 GETCHARLENTEST(c, eptr, len);
5105 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5106 c == CHAR_FF || c == CHAR_CR)
5107 == prop_fail_result)
5108 break;
5109 eptr+= len;
5110 }
5111 break;
5112
5113 case PT_PXSPACE: /* POSIX space */
5114 for (i = min; i < max; i++)
5115 {
5116 int len = 1;
5117 if (eptr >= md->end_subject)
5118 {
5119 SCHECK_PARTIAL();
5120 break;
5121 }
5122 GETCHARLENTEST(c, eptr, len);
5123 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5124 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5125 == prop_fail_result)
5126 break;
5127 eptr+= len;
5128 }
5129 break;
5130
5131 case PT_WORD:
5132 for (i = min; i < max; i++)
5133 {
5134 int category;
5135 int len = 1;
5136 if (eptr >= md->end_subject)
5137 {
5138 SCHECK_PARTIAL();
5139 break;
5140 }
5141 GETCHARLENTEST(c, eptr, len);
5142 category = UCD_CATEGORY(c);
5143 if ((category == ucp_L || category == ucp_N ||
5144 c == CHAR_UNDERSCORE) == prop_fail_result)
5145 break;
5146 eptr+= len;
5147 }
5148 break;
5149
5150 default:
5151 RRETURN(PCRE_ERROR_INTERNAL);
5152 }
5153
5154 /* eptr is now past the end of the maximum run */
5155
5156 if (possessive) continue;
5157 for(;;)
5158 {
5159 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5160 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5161 if (eptr-- == pp) break; /* Stop if tried at original pos */
5162 if (utf8) BACKCHAR(eptr);
5163 }
5164 }
5165
5166 /* Match extended Unicode sequences. We will get here only if the
5167 support is in the binary; otherwise a compile-time error occurs. */
5168
5169 else if (ctype == OP_EXTUNI)
5170 {
5171 for (i = min; i < max; i++)
5172 {
5173 int len = 1;
5174 if (eptr >= md->end_subject)
5175 {
5176 SCHECK_PARTIAL();
5177 break;
5178 }
5179 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5180 if (UCD_CATEGORY(c) == ucp_M) break;
5181 eptr += len;
5182 while (eptr < md->end_subject)
5183 {
5184 len = 1;
5185 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5186 if (UCD_CATEGORY(c) != ucp_M) break;
5187 eptr += len;
5188 }
5189 }
5190
5191 /* eptr is now past the end of the maximum run */
5192
5193 if (possessive) continue;
5194
5195 for(;;)
5196 {
5197 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5198 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5199 if (eptr-- == pp) break; /* Stop if tried at original pos */
5200 for (;;) /* Move back over one extended */
5201 {
5202 if (!utf8) c = *eptr; else
5203 {
5204 BACKCHAR(eptr);
5205 GETCHAR(c, eptr);
5206 }
5207 if (UCD_CATEGORY(c) != ucp_M) break;
5208 eptr--;
5209 }
5210 }
5211 }
5212
5213 else
5214 #endif /* SUPPORT_UCP */
5215
5216 #ifdef SUPPORT_UTF8
5217 /* UTF-8 mode */
5218
5219 if (utf8)
5220 {
5221 switch(ctype)
5222 {
5223 case OP_ANY:
5224 if (max < INT_MAX)
5225 {
5226 for (i = min; i < max; i++)
5227 {
5228 if (eptr >= md->end_subject)
5229 {
5230 SCHECK_PARTIAL();
5231 break;
5232 }
5233 if (IS_NEWLINE(eptr)) break;
5234 eptr++;
5235 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5236 }
5237 }
5238
5239 /* Handle unlimited UTF-8 repeat */
5240
5241 else
5242 {
5243 for (i = min; i < max; i++)
5244 {
5245 if (eptr >= md->end_subject)
5246 {
5247 SCHECK_PARTIAL();
5248 break;
5249 }
5250 if (IS_NEWLINE(eptr)) break;
5251 eptr++;
5252 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5253 }
5254 }
5255 break;
5256
5257 case OP_ALLANY:
5258 if (max < INT_MAX)
5259 {
5260 for (i = min; i < max; i++)
5261 {
5262 if (eptr >= md->end_subject)
5263 {
5264 SCHECK_PARTIAL();
5265 break;
5266 }
5267 eptr++;
5268 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5269 }
5270 }
5271 else
5272 {
5273 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5274 SCHECK_PARTIAL();
5275 }
5276 break;
5277
5278 /* The byte case is the same as non-UTF8 */
5279
5280 case OP_ANYBYTE:
5281 c = max - min;
5282 if (c > (unsigned int)(md->end_subject - eptr))
5283 {
5284 eptr = md->end_subject;
5285 SCHECK_PARTIAL();
5286 }
5287 else eptr += c;
5288 break;
5289
5290 case OP_ANYNL:
5291 for (i = min; i < max; i++)
5292 {
5293 int len = 1;
5294 if (eptr >= md->end_subject)
5295 {
5296 SCHECK_PARTIAL();
5297 break;
5298 }
5299 GETCHARLEN(c, eptr, len);
5300 if (c == 0x000d)
5301 {
5302 if (++eptr >= md->end_subject) break;
5303 if (*eptr == 0x000a) eptr++;
5304 }
5305 else
5306 {
5307 if (c != 0x000a &&
5308 (md->bsr_anycrlf ||
5309 (c != 0x000b && c != 0x000c &&
5310 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5311 break;
5312 eptr += len;
5313 }
5314 }
5315 break;
5316
5317 case OP_NOT_HSPACE:
5318 case OP_HSPACE:
5319 for (i = min; i < max; i++)
5320 {
5321 BOOL gotspace;
5322 int len = 1;
5323 if (eptr >= md->end_subject)
5324 {
5325 SCHECK_PARTIAL();
5326 break;
5327 }
5328 GETCHARLEN(c, eptr, len);
5329 switch(c)
5330 {
5331 default: gotspace = FALSE; break;
5332 case 0x09: /* HT */
5333 case 0x20: /* SPACE */
5334 case 0xa0: /* NBSP */
5335 case 0x1680: /* OGHAM SPACE MARK */
5336 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5337 case 0x2000: /* EN QUAD */
5338 case 0x2001: /* EM QUAD */
5339 case 0x2002: /* EN SPACE */
5340 case 0x2003: /* EM SPACE */
5341 case 0x2004: /* THREE-PER-EM SPACE */
5342 case 0x2005: /* FOUR-PER-EM SPACE */
5343 case 0x2006: /* SIX-PER-EM SPACE */
5344 case 0x2007: /* FIGURE SPACE */
5345 case 0x2008: /* PUNCTUATION SPACE */
5346 case 0x2009: /* THIN SPACE */
5347 case 0x200A: /* HAIR SPACE */
5348 case 0x202f: /* NARROW NO-BREAK SPACE */
5349 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5350 case 0x3000: /* IDEOGRAPHIC SPACE */
5351 gotspace = TRUE;
5352 break;
5353 }
5354 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5355 eptr += len;
5356 }
5357 break;
5358
5359 case OP_NOT_VSPACE:
5360 case OP_VSPACE:
5361 for (i = min; i < max; i++)
5362 {
5363 BOOL gotspace;
5364 int len = 1;
5365 if (eptr >= md->end_subject)
5366 {
5367 SCHECK_PARTIAL();
5368 break;
5369 }
5370 GETCHARLEN(c, eptr, len);
5371 switch(c)
5372 {
5373 default: gotspace = FALSE; break;
5374 case 0x0a: /* LF */
5375 case 0x0b: /* VT */
5376 case 0x0c: /* FF */
5377 case 0x0d: /* CR */
5378 case 0x85: /* NEL */
5379 case 0x2028: /* LINE SEPARATOR */
5380 case 0x2029: /* PARAGRAPH SEPARATOR */
5381 gotspace = TRUE;
5382 break;
5383 }
5384 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5385 eptr += len;
5386 }
5387 break;
5388
5389 case OP_NOT_DIGIT:
5390 for (i = min; i < max; i++)
5391 {
5392 int len = 1;
5393 if (eptr >= md->end_subject)
5394 {
5395 SCHECK_PARTIAL();
5396 break;
5397 }
5398 GETCHARLEN(c, eptr, len);
5399 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5400 eptr+= len;
5401 }
5402 break;
5403
5404 case OP_DIGIT:
5405 for (i = min; i < max; i++)
5406 {
5407 int len = 1;
5408 if (eptr >= md->end_subject)
5409 {
5410 SCHECK_PARTIAL();
5411 break;
5412 }
5413 GETCHARLEN(c, eptr, len);
5414 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5415 eptr+= len;
5416 }
5417 break;
5418
5419 case OP_NOT_WHITESPACE:
5420 for (i = min; i < max; i++)
5421 {
5422 int len = 1;
5423 if (eptr >= md->end_subject)
5424 {
5425 SCHECK_PARTIAL();
5426 break;
5427 }
5428 GETCHARLEN(c, eptr, len);
5429 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5430 eptr+= len;
5431 }
5432 break;
5433
5434 case OP_WHITESPACE:
5435 for (i = min; i < max; i++)
5436 {
5437 int len = 1;
5438 if (eptr >= md->end_subject)
5439 {
5440 SCHECK_PARTIAL();
5441 break;
5442 }
5443 GETCHARLEN(c, eptr, len);
5444 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5445 eptr+= len;
5446 }
5447 break;
5448
5449 case OP_NOT_WORDCHAR:
5450 for (i = min; i < max; i++)
5451 {
5452 int len = 1;
5453 if (eptr >= md->end_subject)
5454 {
5455 SCHECK_PARTIAL();
5456 break;
5457 }
5458 GETCHARLEN(c, eptr, len);
5459 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5460 eptr+= len;
5461 }
5462 break;
5463
5464 case OP_WORDCHAR:
5465 for (i = min; i < max; i++)
5466 {
5467 int len = 1;
5468 if (eptr >= md->end_subject)
5469 {
5470 SCHECK_PARTIAL();
5471 break;
5472 }
5473 GETCHARLEN(c, eptr, len);
5474 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5475 eptr+= len;
5476 }
5477 break;
5478
5479 default:
5480 RRETURN(PCRE_ERROR_INTERNAL);
5481 }
5482
5483 /* eptr is now past the end of the maximum run. If possessive, we are
5484 done (no backing up). Otherwise, match at this position; anything other
5485 than no match is immediately returned. For nomatch, back up one
5486 character, unless we are matching \R and the last thing matched was
5487 \r\n, in which case, back up two bytes. */
5488
5489 if (possessive) continue;
5490 for(;;)
5491 {
5492 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5493 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5494 if (eptr-- == pp) break; /* Stop if tried at original pos */
5495 BACKCHAR(eptr);
5496 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5497 eptr[-1] == '\r') eptr--;
5498 }
5499 }
5500 else
5501 #endif /* SUPPORT_UTF8 */
5502
5503 /* Not UTF-8 mode */
5504 {
5505 switch(ctype)
5506 {
5507 case OP_ANY:
5508 for (i = min; i < max; i++)
5509 {
5510 if (eptr >= md->end_subject)
5511 {
5512 SCHECK_PARTIAL();
5513 break;
5514 }
5515 if (IS_NEWLINE(eptr)) break;
5516 eptr++;
5517 }
5518 break;
5519
5520 case OP_ALLANY:
5521 case OP_ANYBYTE:
5522 c = max - min;
5523 if (c > (unsigned int)(md->end_subject - eptr))
5524 {
5525 eptr = md->end_subject;
5526 SCHECK_PARTIAL();
5527 }
5528 else eptr += c;
5529 break;
5530
5531 case OP_ANYNL:
5532 for (i = min; i < max; i++)
5533 {
5534 if (eptr >= md->end_subject)
5535 {
5536 SCHECK_PARTIAL();
5537 break;
5538 }
5539 c = *eptr;
5540 if (c == 0x000d)
5541 {
5542 if (++eptr >= md->end_subject) break;
5543 if (*eptr == 0x000a) eptr++;
5544 }
5545 else
5546 {
5547 if (c != 0x000a &&
5548 (md->bsr_anycrlf ||
5549 (c != 0x000b && c != 0x000c && c != 0x0085)))
5550 break;
5551 eptr++;
5552 }
5553 }
5554 break;
5555
5556 case OP_NOT_HSPACE:
5557 for (i = min; i < max; i++)
5558 {
5559 if (eptr >= md->end_subject)
5560 {
5561 SCHECK_PARTIAL();
5562 break;
5563 }
5564 c = *eptr;
5565 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5566 eptr++;
5567 }
5568 break;
5569
5570 case OP_HSPACE:
5571 for (i = min; i < max; i++)
5572 {
5573 if (eptr >= md->end_subject)
5574 {
5575 SCHECK_PARTIAL();
5576 break;
5577 }
5578 c = *eptr;
5579 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5580 eptr++;
5581 }
5582 break;
5583
5584 case OP_NOT_VSPACE:
5585 for (i = min; i < max; i++)
5586 {
5587 if (eptr >= md->end_subject)
5588 {
5589 SCHECK_PARTIAL();
5590 break;
5591 }
5592 c = *eptr;
5593 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5594 break;
5595 eptr++;
5596 }
5597 break;
5598
5599 case OP_VSPACE:
5600 for (i = min; i < max; i++)
5601 {
5602 if (eptr >= md->end_subject)
5603 {
5604 SCHECK_PARTIAL();
5605 break;
5606 }
5607 c = *eptr;
5608 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5609 break;
5610 eptr++;
5611 }
5612 break;
5613
5614 case OP_NOT_DIGIT:
5615 for (i = min; i < max; i++)
5616 {
5617 if (eptr >= md->end_subject)
5618 {
5619 SCHECK_PARTIAL();
5620 break;
5621 }
5622 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5623 eptr++;
5624 }
5625 break;
5626
5627 case OP_DIGIT:
5628 for (i = min; i < max; i++)
5629 {
5630 if (eptr >= md->end_subject)
5631 {
5632 SCHECK_PARTIAL();
5633 break;
5634 }
5635 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5636 eptr++;
5637 }
5638 break;
5639
5640 case OP_NOT_WHITESPACE:
5641 for (i = min; i < max; i++)
5642 {
5643 if (eptr >= md->end_subject)
5644 {
5645 SCHECK_PARTIAL();
5646 break;
5647 }
5648 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5649 eptr++;
5650 }
5651 break;
5652
5653 case OP_WHITESPACE:
5654 for (i = min; i < max; i++)
5655 {
5656 if (eptr >= md->end_subject)
5657 {
5658 SCHECK_PARTIAL();
5659 break;
5660 }
5661 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5662 eptr++;
5663 }
5664 break;
5665
5666 case OP_NOT_WORDCHAR:
5667 for (i = min; i < max; i++)
5668 {
5669 if (eptr >= md->end_subject)
5670 {
5671 SCHECK_PARTIAL();
5672 break;
5673 }
5674 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5675 eptr++;
5676 }
5677 break;
5678
5679 case OP_WORDCHAR:
5680 for (i = min; i < max; i++)
5681 {
5682 if (eptr >= md->end_subject)
5683 {
5684 SCHECK_PARTIAL();
5685 break;
5686 }
5687 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5688 eptr++;
5689 }
5690 break;
5691
5692 default:
5693 RRETURN(PCRE_ERROR_INTERNAL);
5694 }
5695
5696 /* eptr is now past the end of the maximum run. If possessive, we are
5697 done (no backing up). Otherwise, match at this position; anything other
5698 than no match is immediately returned. For nomatch, back up one
5699 character (byte), unless we are matching \R and the last thing matched
5700 was \r\n, in which case, back up two bytes. */
5701
5702 if (possessive) continue;
5703 while (eptr >= pp)
5704 {
5705 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5706 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5707 eptr--;
5708 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5709 eptr[-1] == '\r') eptr--;
5710 }
5711 }
5712
5713 /* Get here if we can't make it match with any permitted repetitions */
5714
5715 MRRETURN(MATCH_NOMATCH);
5716 }
5717 /* Control never gets here */
5718
5719 /* There's been some horrible disaster. Arrival here can only mean there is
5720 something seriously wrong in the code above or the OP_xxx definitions. */
5721
5722 default:
5723 DPRINTF(("Unknown opcode %d\n", *ecode));
5724 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5725 }
5726
5727 /* Do not stick any code in here without much thought; it is assumed
5728 that "continue" in the code above comes out to here to repeat the main
5729 loop. */
5730
5731 } /* End of main loop */
5732 /* Control never reaches here */
5733
5734
5735 /* When compiling to use the heap rather than the stack for recursive calls to
5736 match(), the RRETURN() macro jumps here. The number that is saved in
5737 frame->Xwhere indicates which label we actually want to return to. */
5738
5739 #ifdef NO_RECURSE
5740 #define LBL(val) case val: goto L_RM##val;
5741 HEAP_RETURN:
5742 switch (frame->Xwhere)
5743 {
5744 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5745 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5746 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5747 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5748 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5749 #ifdef SUPPORT_UTF8
5750 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5751 LBL(32) LBL(34) LBL(42) LBL(46)
5752 #ifdef SUPPORT_UCP
5753 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5754 LBL(59) LBL(60) LBL(61) LBL(62)
5755 #endif /* SUPPORT_UCP */
5756 #endif /* SUPPORT_UTF8 */
5757 default:
5758 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5759 return PCRE_ERROR_INTERNAL;
5760 }
5761 #undef LBL
5762 #endif /* NO_RECURSE */
5763 }
5764
5765
5766 /***************************************************************************
5767 ****************************************************************************
5768 RECURSION IN THE match() FUNCTION
5769
5770 Undefine all the macros that were defined above to handle this. */
5771
5772 #ifdef NO_RECURSE /* heap (not stack) recursion build: release the macro names set up for match() */
5773 #undef eptr /* first group: presumably match()'s argument names - TODO confirm against the #defines */
5774 #undef ecode
5775 #undef mstart
5776 #undef offset_top
5777 #undef eptrb
5778 #undef flags
5779
5780 #undef callpat /* remaining groups: presumably match()'s working variables - TODO confirm */
5781 #undef charptr
5782 #undef data
5783 #undef next
5784 #undef pp
5785 #undef prev
5786 #undef saved_eptr
5787
5788 #undef new_recursive
5789
5790 #undef cur_is_word
5791 #undef condition
5792 #undef prev_is_word
5793
5794 #undef ctype
5795 #undef length /* while defined, this would macro-expand the "length" parameter of pcre_exec() below */
5796 #undef max
5797 #undef min
5798 #undef number
5799 #undef offset
5800 #undef op
5801 #undef save_capture_last
5802 #undef save_offset1
5803 #undef save_offset2
5804 #undef save_offset3
5805 #undef stacksave
5806
5807 #undef newptrb
5808
5809 #endif /* NO_RECURSE */
5810
5811 /* These two are defined as macros in both cases (with or without NO_RECURSE) */
5812
5813 #undef fc
5814 #undef fi
5815
5816 /***************************************************************************
5817 ***************************************************************************/
5818
5819
5820
5821 /*************************************************
5822 * Execute a Regular Expression *
5823 *************************************************/
5824
5825 /* This function applies a compiled re to a subject string and picks out
5826 portions of the string if it matches. Two elements in the vector are set for
5827 each substring: the offsets to the start and end of the substring.
5828
5829 Arguments:
5830 argument_re points to the compiled expression
5831 extra_data points to extra data or is NULL
5832 subject points to the subject string
5833 length length of subject string (may contain binary zeros)
5834 start_offset where to start in the subject string
5835 options option bits
5836 offsets points to a vector of ints to be filled in with offsets
5837 offsetcount the number of elements in the vector
5838
5839 Returns: > 0 => success; value is the number of elements filled in
5840 = 0 => success, but offsets is not big enough
5841 -1 => failed to match
5842 < -1 => some kind of unexpected problem
5843 */
5844
5845 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5846 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5847 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5848 int offsetcount)
5849 {
5850 int rc, ocount, arg_offset_max;
5851 int first_byte = -1;
5852 int req_byte = -1;
5853 int req_byte2 = -1;
5854 int newline;
5855 BOOL using_temporary_offsets = FALSE;
5856 BOOL anchored;
5857 BOOL startline;
5858 BOOL firstline;
5859 BOOL first_byte_caseless = FALSE;
5860 BOOL req_byte_caseless = FALSE;
5861 BOOL utf8;
5862 match_data match_block;
5863 match_data *md = &match_block;
5864 const uschar *tables;
5865 const uschar *start_bits = NULL;
5866 USPTR start_match = (USPTR)subject + start_offset;
5867 USPTR end_subject;
5868 USPTR start_partial = NULL;
5869 USPTR req_byte_ptr = start_match - 1;
5870
5871 pcre_study_data internal_study;
5872 const pcre_study_data *study;
5873
5874 real_pcre internal_re;
5875 const real_pcre *external_re = (const real_pcre *)argument_re;
5876 const real_pcre *re = external_re;
5877
5878 /* Plausibility checks */
5879
5880 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5881 if (re == NULL || subject == NULL ||
5882 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5883 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5884 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5885
5886 /* These two settings are used in the code for checking a UTF-8 string that
5887 follows immediately afterwards. Other values in the md block are used only
5888 during "normal" pcre_exec() processing, not when the JIT support is in use,
5889 so they are set up later. */
5890
5891 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5892 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5893 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5894
5895 /* Check a UTF-8 string if required. Pass back the character offset and error
5896 code for an invalid string if a results vector is available. */
5897
5898 #ifdef SUPPORT_UTF8
5899 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5900 {
5901 int erroroffset;
5902 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5903 if (errorcode != 0)
5904 {
5905 if (offsetcount >= 2)
5906 {
5907 offsets[0] = erroroffset;
5908 offsets[1] = errorcode;
5909 }
5910 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5911 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5912 }
5913
5914 /* Check that a start_offset points to the start of a UTF-8 character. */
5915 if (start_offset > 0 && start_offset < length &&
5916 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5917 return PCRE_ERROR_BADUTF8_OFFSET;
5918 }
5919 #endif
5920
5921 /* If the pattern was successfully studied with JIT support, run the JIT
5922 executable instead of the rest of this function. Most options must be set at
5923 compile time for the JIT code to be usable. Fallback to the normal code path if
5924 an unsupported flag is set. In particular, JIT does not support partial
5925 matching. */
5926
5927 #ifdef SUPPORT_JIT
5928 if (extra_data != NULL
5929 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
5930 && extra_data->executable_jit != NULL
5931 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
5932 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
5933 return _pcre_jit_exec(re, extra_data->executable_jit, subject, length,
5934 start_offset, options, ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
5935 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
5936 #endif
5937
5938 /* Carry on with non-JIT matching. This information is for finding all the
5939 numbers associated with a given name, for condition testing. */
5940
5941 md->name_table = (uschar *)re + re->name_table_offset;
5942 md->name_count = re->name_count;
5943 md->name_entry_size = re->name_entry_size;
5944
5945 /* Fish out the optional data from the extra_data structure, first setting
5946 the default values. */
5947
5948 study = NULL;
5949 md->match_limit = MATCH_LIMIT;
5950 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5951 md->callout_data = NULL;
5952
5953 /* The table pointer is always in native byte order. */
5954
5955 tables = external_re->tables;
5956
5957 if (extra_data != NULL)
5958 {
5959 register unsigned int flags = extra_data->flags;
5960 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5961 study = (const pcre_study_data *)extra_data->study_data;
5962 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5963 md->match_limit = extra_data->match_limit;
5964 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5965 md->match_limit_recursion = extra_data->match_limit_recursion;
5966 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5967 md->callout_data = extra_data->callout_data;
5968 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5969 }
5970
5971 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5972 is a feature that makes it possible to save compiled regex and re-use them
5973 in other programs later. */
5974
5975 if (tables == NULL) tables = _pcre_default_tables;
5976
5977 /* Check that the first field in the block is the magic number. If it is not,
5978 test for a regex that was compiled on a host of opposite endianness. If this is
5979 the case, flipped values are put in internal_re and internal_study if there was
5980 study data too. */
5981
5982 if (re->magic_number != MAGIC_NUMBER)
5983 {
5984 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5985 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5986 if (study != NULL) study = &internal_study;
5987 }
5988
5989 /* Set up other data */
5990
5991 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5992 startline = (re->flags & PCRE_STARTLINE) != 0;
5993 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5994
5995 /* The code starts after the real_pcre block and the capture name table. */
5996
5997 md->start_code = (const uschar *)external_re + re->name_table_offset +
5998 re->name_count * re->name_entry_size;
5999
6000 md->start_subject = (USPTR)subject;
6001 md->start_offset = start_offset;
6002 md->end_subject = md->start_subject + length;
6003 end_subject = md->end_subject;
6004
6005 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6006 md->use_ucp = (re->options & PCRE_UCP) != 0;
6007 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6008
6009 /* Some options are unpacked into BOOL variables in the hope that testing
6010 them will be faster than individual option bits. */
6011
6012 md->notbol = (options & PCRE_NOTBOL) != 0;
6013 md->noteol = (options & PCRE_NOTEOL) != 0;
6014 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6015 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6016
6017 md->hitend = FALSE;
6018 md->mark = NULL; /* In case never set */
6019
6020 md->recursive = NULL; /* No recursion at top level */
6021 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6022
6023 md->lcc = tables + lcc_offset;
6024 md->ctypes = tables + ctypes_offset;
6025
6026 /* Handle different \R options. */
6027
6028 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6029 {
6030 case 0:
6031 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6032 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6033 else
6034 #ifdef BSR_ANYCRLF
6035 md->bsr_anycrlf = TRUE;
6036 #else
6037 md->bsr_anycrlf = FALSE;
6038 #endif
6039 break;
6040
6041 case PCRE_BSR_ANYCRLF:
6042 md->bsr_anycrlf = TRUE;
6043 break;
6044
6045 case PCRE_BSR_UNICODE:
6046 md->bsr_anycrlf = FALSE;
6047 break;
6048
6049 default: return PCRE_ERROR_BADNEWLINE;
6050 }
6051
6052 /* Handle different types of newline. The three bits give eight cases. If
6053 nothing is set at run time, whatever was used at compile time applies. */
6054
6055 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6056 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6057 {
6058 case 0: newline = NEWLINE; break; /* Compile-time default */
6059 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6060 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6061 case PCRE_NEWLINE_CR+
6062 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6063 case PCRE_NEWLINE_ANY: newline = -1; break;
6064 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6065 default: return PCRE_ERROR_BADNEWLINE;
6066 }
6067
6068 if (newline == -2)
6069 {
6070 md->nltype = NLTYPE_ANYCRLF;
6071 }
6072 else if (newline < 0)
6073 {
6074 md->nltype = NLTYPE_ANY;
6075 }
6076 else
6077 {
6078 md->nltype = NLTYPE_FIXED;
6079 if (newline > 255)
6080 {
6081 md->nllen = 2;
6082 md->nl[0] = (newline >> 8) & 255;
6083 md->nl[1] = newline & 255;
6084 }
6085 else
6086 {
6087 md->nllen = 1;
6088 md->nl[0] = newline;
6089 }
6090 }
6091
6092 /* Partial matching was originally supported only for a restricted set of
6093 regexes; from release 8.00 there are no restrictions, but the bits are still
6094 defined (though never set). So there's no harm in leaving this code. */
6095
6096 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6097 return PCRE_ERROR_BADPARTIAL;
6098
6099 /* If the expression has got more back references than the offsets supplied can
6100 hold, we get a temporary chunk of working store to use during the matching.
6101 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6102 of 3. */
6103
6104 ocount = offsetcount - (offsetcount % 3);
6105 arg_offset_max = (2*ocount)/3;
6106
6107 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6108 {
6109 ocount = re->top_backref * 3 + 3;
6110 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6111 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6112 using_temporary_offsets = TRUE;
6113 DPRINTF(("Got memory to hold back references\n"));
6114 }
6115 else md->offset_vector = offsets;
6116
6117 md->offset_end = ocount;
6118 md->offset_max = (2*ocount)/3;
6119 md->offset_overflow = FALSE;
6120 md->capture_last = -1;
6121
6122 /* Reset the working variable associated with each extraction. These should
6123 never be used unless previously set, but they get saved and restored, and so we
6124 initialize them to avoid reading uninitialized locations. Also, unset the
6125 offsets for the matched string. This is really just for tidiness with callouts,
6126 in case they inspect these fields. */
6127
6128 if (md->offset_vector != NULL)
6129 {
6130 register int *iptr = md->offset_vector + ocount;
6131 register int *iend = iptr - re->top_bracket;
6132 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6133 while (--iptr >= iend) *iptr = -1;
6134 md->offset_vector[0] = md->offset_vector[1] = -1;
6135 }
6136
6137 /* Set up the first character to match, if available. The first_byte value is
6138 never set for an anchored regular expression, but the anchoring may be forced
6139 at run time, so we have to test for anchoring. The first char may be unset for
6140 an unanchored pattern, of course. If there's no first char and the pattern was
6141 studied, there may be a bitmap of possible first characters. */
6142
6143 if (!anchored)
6144 {
6145 if ((re->flags & PCRE_FIRSTSET) != 0)
6146 {
6147 first_byte = re->first_byte & 255;
6148 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6149 first_byte = md->lcc[first_byte];
6150 }
6151 else
6152 if (!startline && study != NULL &&
6153 (study->flags & PCRE_STUDY_MAPPED) != 0)
6154 start_bits = study->start_bits;
6155 }
6156
6157 /* For anchored or unanchored matches, there may be a "last known required
6158 character" set. */
6159
6160 if ((re->flags & PCRE_REQCHSET) != 0)
6161 {
6162 req_byte = re->req_byte & 255;
6163 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6164 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6165 }
6166
6167
6168
6169
6170 /* ==========================================================================*/
6171
6172 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6173 the loop runs just once. */
6174
6175 for(;;)
6176 {
6177 USPTR save_end_subject = end_subject;
6178 USPTR new_start_match;
6179
6180 /* If firstline is TRUE, the start of the match is constrained to the first
6181 line of a multiline string. That is, the match must be before or at the first
6182 newline. Implement this by temporarily adjusting end_subject so that we stop
6183 scanning at a newline. If the match fails at the newline, later code breaks
6184 this loop. */
6185
6186 if (firstline)
6187 {
6188 USPTR t = start_match;
6189 #ifdef SUPPORT_UTF8
6190 if (utf8)
6191 {
6192 while (t < md->end_subject && !IS_NEWLINE(t))
6193 {
6194 t++;
6195 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6196 }
6197 }
6198 else
6199 #endif
6200 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6201 end_subject = t;
6202 }
6203
6204 /* There are some optimizations that avoid running the match if a known
6205 starting point is not found, or if a known later character is not present.
6206 However, there is an option that disables these, for testing and for ensuring
6207 that all callouts do actually occur. The option can be set in the regex by
6208 (*NO_START_OPT) or passed in match-time options. */
6209
6210 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6211 {
6212 /* Advance to a unique first byte if there is one. */
6213
6214 if (first_byte >= 0)
6215 {
6216 if (first_byte_caseless)
6217 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6218 start_match++;
6219 else
6220 while (start_match < end_subject && *start_match != first_byte)
6221 start_match++;
6222 }
6223
6224 /* Or to just after a linebreak for a multiline match */
6225
6226 else if (startline)
6227 {
6228 if (start_match > md->start_subject + start_offset)
6229 {
6230 #ifdef SUPPORT_UTF8
6231 if (utf8)
6232 {
6233 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6234 {
6235 start_match++;
6236 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6237 start_match++;
6238 }
6239 }
6240 else
6241 #endif
6242 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6243 start_match++;
6244
6245 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6246 and we are now at a LF, advance the match position by one more character.
6247 */
6248
6249 if (start_match[-1] == CHAR_CR &&
6250 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6251 start_match < end_subject &&
6252 *start_match == CHAR_NL)
6253 start_match++;
6254 }
6255 }
6256
6257 /* Or to a non-unique first byte after study */
6258
6259 else if (start_bits != NULL)
6260 {
6261 while (start_match < end_subject)
6262 {
6263 register unsigned int c = *start_match;
6264 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6265 {
6266 start_match++;
6267 #ifdef SUPPORT_UTF8
6268 if (utf8)
6269 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6270 start_match++;
6271 #endif
6272 }
6273 else break;
6274 }
6275 }
6276 } /* Starting optimizations */
6277
6278 /* Restore fudged end_subject */
6279
6280 end_subject = save_end_subject;
6281
6282 /* The following two optimizations are disabled for partial matching or if
6283 disabling is explicitly requested. */
6284
6285 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6286 {
6287 /* If the pattern was studied, a minimum subject length may be set. This is
6288 a lower bound; no actual string of that length may actually match the
6289 pattern. Although the value is, strictly, in characters, we treat it as
6290 bytes to avoid spending too much time in this optimization. */
6291
6292 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6293 (pcre_uint32)(end_subject - start_match) < study->minlength)
6294 {
6295 rc = MATCH_NOMATCH;
6296 break;
6297 }
6298
6299 /* If req_byte is set, we know that that character must appear in the
6300 subject for the match to succeed. If the first character is set, req_byte
6301 must be later in the subject; otherwise the test starts at the match point.
6302 This optimization can save a huge amount of backtracking in patterns with
6303 nested unlimited repeats that aren't going to match. Writing separate code
6304 for cased/caseless versions makes it go faster, as does using an
6305 autoincrement and backing off on a match.
6306
6307 HOWEVER: when the subject string is very, very long, searching to its end
6308 can take a long time, and give bad performance on quite ordinary patterns.
6309 This showed up when somebody was matching something like /^\d+C/ on a
6310 32-megabyte string... so we don't do this when the string is sufficiently
6311 long. */
6312
6313 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6314 {
6315 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6316
6317 /* We don't need to repeat the search if we haven't yet reached the
6318 place we found it at last time. */
6319
6320 if (p > req_byte_ptr)
6321 {
6322 if (req_byte_caseless)
6323 {
6324 while (p < end_subject)
6325 {
6326 register int pp = *p++;
6327 if (pp == req_byte || pp == req_byte2) { p--; break; }
6328 }
6329 }
6330 else
6331 {
6332 while (p < end_subject)
6333 {
6334 if (*p++ == req_byte) { p--; break; }
6335 }
6336 }
6337
6338 /* If we can't find the required character, break the matching loop,
6339 forcing a match failure. */
6340
6341 if (p >= end_subject)
6342 {
6343 rc = MATCH_NOMATCH;
6344 break;
6345 }
6346
6347 /* If we have found the required character, save the point where we
6348 found it, so that we don't search again next time round the loop if
6349 the start hasn't passed this character yet. */
6350
6351 req_byte_ptr = p;
6352 }
6353 }
6354 }
6355
6356 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6357 printf(">>>> Match against: ");
6358 pchars(start_match, end_subject - start_match, TRUE, md);
6359 printf("\n");
6360 #endif
6361
6362 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6363 first starting point for which a partial match was found. */
6364
6365 md->start_match_ptr = start_match;
6366 md->start_used_ptr = start_match;
6367 md->match_call_count = 0;
6368 md->match_function_type = 0;
6369 md->end_offset_top = 0;
6370 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6371 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6372
6373 switch(rc)
6374 {
6375 /* SKIP passes back the next starting point explicitly, but if it is the
6376 same as the match we have just done, treat it as NOMATCH. */
6377
6378 case MATCH_SKIP:
6379 if (md->start_match_ptr != start_match)
6380 {
6381 new_start_match = md->start_match_ptr;
6382 break;
6383 }
6384 /* Fall through */
6385
6386 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6387 the SKIP's arg was not found. We also treat this as NOMATCH. */
6388
6389 case MATCH_SKIP_ARG:
6390 /* Fall through */
6391
6392 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6393 exactly like PRUNE. */
6394
6395 case MATCH_NOMATCH:
6396 case MATCH_PRUNE:
6397 case MATCH_THEN:
6398 new_start_match = start_match + 1;
6399 #ifdef SUPPORT_UTF8
6400 if (utf8)
6401 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6402 new_start_match++;
6403 #endif
6404 break;
6405
6406 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6407
6408 case MATCH_COMMIT:
6409 rc = MATCH_NOMATCH;
6410 goto ENDLOOP;
6411
6412 /* Any other return is either a match, or some kind of error. */
6413
6414 default:
6415 goto ENDLOOP;
6416 }
6417
6418 /* Control reaches here for the various types of "no match at this point"
6419 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6420
6421 rc = MATCH_NOMATCH;
6422
6423 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6424 newline in the subject (though it may continue over the newline). Therefore,
6425 if we have just failed to match, starting at a newline, do not continue. */
6426
6427 if (firstline && IS_NEWLINE(start_match)) break;
6428
6429 /* Advance to new matching position */
6430
6431 start_match = new_start_match;
6432
6433 /* Break the loop if the pattern is anchored or if we have passed the end of
6434 the subject. */
6435
6436 if (anchored || start_match > end_subject) break;
6437
6438 /* If we have just passed a CR and we are now at a LF, and the pattern does
6439 not contain any explicit matches for \r or \n, and the newline option is CRLF
6440 or ANY or ANYCRLF, advance the match position by one more character. */
6441
6442 if (start_match[-1] == CHAR_CR &&
6443 start_match < end_subject &&
6444 *start_match == CHAR_NL &&
6445 (re->flags & PCRE_HASCRORLF) == 0 &&
6446 (md->nltype == NLTYPE_ANY ||
6447 md->nltype == NLTYPE_ANYCRLF ||
6448 md->nllen == 2))
6449 start_match++;
6450
6451 md->mark = NULL; /* Reset for start of next match attempt */
6452 } /* End of for(;;) "bumpalong" loop */
6453
6454 /* ==========================================================================*/
6455
6456 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6457 conditions is true:
6458
6459 (1) The pattern is anchored or the match was failed by (*COMMIT);
6460
6461 (2) We are past the end of the subject;
6462
6463 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6464 this option requests that a match occur at or before the first newline in
6465 the subject.
6466
6467 When we have a match and the offset vector is big enough to deal with any
6468 backreferences, captured substring offsets will already be set up. In the case
6469 where we had to get some local store to hold offsets for backreference
6470 processing, copy those that we can. In this case there need not be overflow if
6471 certain parts of the pattern were not used, even though there are more
6472 capturing parentheses than vector slots. */
6473
6474 ENDLOOP:
6475
6476 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6477 {
6478 if (using_temporary_offsets)
6479 {
6480 if (arg_offset_max >= 4)
6481 {
6482 memcpy(offsets + 2, md->offset_vector + 2,
6483 (arg_offset_max - 2) * sizeof(int));
6484 DPRINTF(("Copied offsets from temporary memory\n"));
6485 }
6486 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6487 DPRINTF(("Freeing temporary memory\n"));
6488 (pcre_free)(md->offset_vector);
6489 }
6490
6491 /* Set the return code to the number of captured strings, or 0 if there were
6492 too many to fit into the vector. */
6493
6494 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6495 0 : md->end_offset_top/2;
6496
6497 /* If there is space in the offset vector, set any unused pairs at the end of
6498 the pattern to -1 for backwards compatibility. It is documented that this
6499 happens. In earlier versions, the whole set of potential capturing offsets
6500 was set to -1 each time round the loop, but this is handled differently now.
6501 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6502 those at the end that need unsetting here. We can't just unset them all at
6503 the start of the whole thing because they may get set in one branch that is
6504 not the final matching branch. */
6505
6506 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6507 {
6508 register int *iptr, *iend;
6509 int resetcount = 2 + re->top_bracket * 2;
6510 if (resetcount > offsetcount) resetcount = ocount;
6511 iptr = offsets + md->end_offset_top;
6512 iend = offsets + resetcount;
6513 while (iptr < iend) *iptr++ = -1;
6514 }
6515
6516 /* If there is space, set up the whole thing as substring 0. The value of
6517 md->start_match_ptr might be modified if \K was encountered on the success
6518 matching path. */
6519
6520 if (offsetcount < 2) rc = 0; else
6521 {
6522 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6523 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6524 }
6525
6526 DPRINTF((">>>> returning %d\n", rc));
6527 goto RETURN_MARK;
6528 }
6529
6530 /* Control gets here if there has been an error, or if the overall match
6531 attempt has failed at all permitted starting positions. */
6532
6533 if (using_temporary_offsets)
6534 {
6535 DPRINTF(("Freeing temporary memory\n"));
6536 (pcre_free)(md->offset_vector);
6537 }
6538
6539 /* For anything other than nomatch or partial match, just return the code. */
6540
6541 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6542 {
6543 DPRINTF((">>>> error: returning %d\n", rc));
6544 return rc;
6545 }
6546
6547 /* Handle partial matches - disable any mark data */
6548
6549 if (start_partial != NULL)
6550 {
6551 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6552 md->mark = NULL;
6553 if (offsetcount > 1)
6554 {
6555 offsets[0] = (int)(start_partial - (USPTR)subject);
6556 offsets[1] = (int)(end_subject - (USPTR)subject);
6557 }
6558 rc = PCRE_ERROR_PARTIAL;
6559 }
6560
6561 /* This is the classic nomatch case */
6562
6563 else
6564 {
6565 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6566 rc = PCRE_ERROR_NOMATCH;
6567 }
6568
6569 /* Return the MARK data if it has been requested. */
6570
6571 RETURN_MARK:
6572
6573 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6574 *(extra_data->mark) = (unsigned char *)(md->mark);
6575 return rc;
6576 }
6577
6578 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5