/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 630 - (show annotations)
Fri Jul 22 10:00:10 2011 UTC (4 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 193789 byte(s)
Error occurred while calculating annotation data.
Make (*MARK) work in positive assertions.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
/* Lower and upper bounds for the six common repeat forms. Both tables share
the same index. In rep_max a value of 0 stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,    /* forms 0 and 1: minimum of zero */
  1, 1,    /* forms 2 and 3: minimum of one  */
  0, 0     /* forms 4 and 5: minimum of zero */
};

static const char rep_max[] = {
  0, 0,    /* forms 0 and 1: unlimited */
  0, 0,    /* forms 2 and 3: unlimited */
  1, 1     /* forms 4 and 5: at most one */
};
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,     RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,    RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,    RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,    RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 the alt that is at the start of the current branch. This makes it possible
780 to skip back past alternatives that precede the THEN within the current
781 branch. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode - GET(ecode, 1);
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 offset_top, md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode - GET(ecode, 1);
795 md->mark = ecode + LINK_SIZE + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle a capturing bracket, other than those that are possessive with an
799 unlimited repeat. If there is space in the offset vector, save the current
800 subject position in the working slot at the top of the vector. We mustn't
801 change the current values of the data slot, because they may be set from a
802 previous iteration of this group, and be referred to by a reference inside
803 the group. A failure to match might occur after the group has succeeded,
804 if something later on doesn't match. For this reason, we need to restore
805 the working value and also the values of the final offsets, in case they
806 were set by a previous iteration of the same bracket.
807
808 If there isn't enough space in the offset vector, treat this as if it were
809 a non-capturing bracket. Don't worry about setting the flag for the error
810 case here; that is handled in the code for KET. */
811
812 case OP_CBRA:
813 case OP_SCBRA:
814 number = GET2(ecode, 1+LINK_SIZE);
815 offset = number << 1;
816
817 #ifdef PCRE_DEBUG
818 printf("start bracket %d\n", number);
819 printf("subject=");
820 pchars(eptr, 16, TRUE, md);
821 printf("\n");
822 #endif
823
824 if (offset < md->offset_max)
825 {
826 save_offset1 = md->offset_vector[offset];
827 save_offset2 = md->offset_vector[offset+1];
828 save_offset3 = md->offset_vector[md->offset_end - number];
829 save_capture_last = md->capture_last;
830
831 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 md->offset_vector[md->offset_end - number] =
833 (int)(eptr - md->start_subject);
834
835 for (;;)
836 {
837 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 eptrb, RM1);
840 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 if (rrc != MATCH_NOMATCH &&
842 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843 RRETURN(rrc);
844 md->capture_last = save_capture_last;
845 ecode += GET(ecode, 1);
846 if (*ecode != OP_ALT) break;
847 }
848
849 DPRINTF(("bracket %d failed\n", number));
850 md->offset_vector[offset] = save_offset1;
851 md->offset_vector[offset+1] = save_offset2;
852 md->offset_vector[md->offset_end - number] = save_offset3;
853
854 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 MATCH_THEN. */
856
857 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 }
860
861 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862 as a non-capturing bracket. */
863
864 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866
867 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872 /* Non-capturing or atomic group, except for possessive with unlimited
873 repeat. Loop for all the alternatives. When we get to the final alternative
874 within the brackets, we used to return the result of a recursive call to
875 match() whatever happened so it was possible to reduce stack usage by
876 turning this into a tail recursion, except in the case of a possibly empty
877 group. However, now that there is the possibility of (*THEN) occurring in
878 the final alternative, this optimization is no longer possible.
879
880 MATCH_ONCE is returned when the end of an atomic group is successfully
881 reached, but subsequent matching fails. It passes back up the tree (causing
882 captured values to be reset) until the original atomic group level is
883 reached. This is tested by comparing md->once_target with the start of the
884 group. At this point, the return is converted into MATCH_NOMATCH so that
885 previous backup points can be taken. */
886
887 case OP_ONCE:
888 case OP_BRA:
889 case OP_SBRA:
890 DPRINTF(("start non-capturing bracket\n"));
891
892 for (;;)
893 {
894 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 RM2);
897 if (rrc != MATCH_NOMATCH &&
898 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 {
900 if (rrc == MATCH_ONCE)
901 {
902 const uschar *scode = ecode;
903 if (*scode != OP_ONCE) /* If not at start, find it */
904 {
905 while (*scode == OP_ALT) scode += GET(scode, 1);
906 scode -= GET(scode, 1);
907 }
908 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 }
910 RRETURN(rrc);
911 }
912 ecode += GET(ecode, 1);
913 if (*ecode != OP_ALT) break;
914 }
915 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916 RRETURN(MATCH_NOMATCH);
917
918 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
920 handled similarly to the normal case above. However, the matching is
921 different. The end of these brackets will always be OP_KETRPOS, which
922 returns MATCH_KETRPOS without going further in the pattern. By this means
923 we can handle the group by iteration rather than recursion, thereby
924 reducing the amount of stack needed. */
925
926 case OP_CBRAPOS:
927 case OP_SCBRAPOS:
928 allow_zero = FALSE;
929
930 POSSESSIVE_CAPTURE:
931 number = GET2(ecode, 1+LINK_SIZE);
932 offset = number << 1;
933
934 #ifdef PCRE_DEBUG
935 printf("start possessive bracket %d\n", number);
936 printf("subject=");
937 pchars(eptr, 16, TRUE, md);
938 printf("\n");
939 #endif
940
941 if (offset < md->offset_max)
942 {
943 matched_once = FALSE;
944 code_offset = ecode - md->start_code;
945
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950
951 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952
953 /* Each time round the loop, save the current subject position for use
954 when the group matches. For MATCH_MATCH, the group has matched, so we
955 restart it with a new subject starting position, remembering that we had
956 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957 usual. If we haven't matched any alternatives in any iteration, check to
958 see if a previous iteration matched. If so, the group has matched;
959 continue from afterwards. Otherwise it has failed; restore the previous
960 capture values before returning NOMATCH. */
961
962 for (;;)
963 {
964 md->offset_vector[md->offset_end - number] =
965 (int)(eptr - md->start_subject);
966 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968 eptrb, RM63);
969 if (rrc == MATCH_KETRPOS)
970 {
971 offset_top = md->end_offset_top;
972 eptr = md->end_match_ptr;
973 ecode = md->start_code + code_offset;
974 save_capture_last = md->capture_last;
975 matched_once = TRUE;
976 continue;
977 }
978 if (rrc != MATCH_NOMATCH &&
979 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980 RRETURN(rrc);
981 md->capture_last = save_capture_last;
982 ecode += GET(ecode, 1);
983 if (*ecode != OP_ALT) break;
984 }
985
986 if (!matched_once)
987 {
988 md->offset_vector[offset] = save_offset1;
989 md->offset_vector[offset+1] = save_offset2;
990 md->offset_vector[md->offset_end - number] = save_offset3;
991 }
992
993 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 if (allow_zero || matched_once)
995 {
996 ecode += 1 + LINK_SIZE;
997 break;
998 }
999
1000 RRETURN(MATCH_NOMATCH);
1001 }
1002
1003 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004 as a non-capturing bracket. */
1005
1006 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008
1009 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010
1011 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013
1014 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1016 without the capturing complication. It is written out separately for speed
1017 and cleanliness. */
1018
1019 case OP_BRAPOS:
1020 case OP_SBRAPOS:
1021 allow_zero = FALSE;
1022
1023 POSSESSIVE_NON_CAPTURE:
1024 matched_once = FALSE;
1025 code_offset = ecode - md->start_code;
1026
1027 for (;;)
1028 {
1029 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 eptrb, RM48);
1032 if (rrc == MATCH_KETRPOS)
1033 {
1034 offset_top = md->end_offset_top;
1035 eptr = md->end_match_ptr;
1036 ecode = md->start_code + code_offset;
1037 matched_once = TRUE;
1038 continue;
1039 }
1040 if (rrc != MATCH_NOMATCH &&
1041 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042 RRETURN(rrc);
1043 ecode += GET(ecode, 1);
1044 if (*ecode != OP_ALT) break;
1045 }
1046
1047 if (matched_once || allow_zero)
1048 {
1049 ecode += 1 + LINK_SIZE;
1050 break;
1051 }
1052 RRETURN(MATCH_NOMATCH);
1053
1054 /* Control never reaches here. */
1055
1056 /* Conditional group: compilation checked that there are no more than
1057 two branches. If the condition is false, skipping the first branch takes us
1058 past the end if there is only one branch, but that's OK because that is
1059 exactly what going to the ket would do. */
1060
1061 case OP_COND:
1062 case OP_SCOND:
1063 codelink = GET(ecode, 1);
1064
1065 /* Because of the way auto-callout works during compile, a callout item is
1066 inserted between OP_COND and an assertion condition. */
1067
1068 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069 {
1070 if (pcre_callout != NULL)
1071 {
1072 pcre_callout_block cb;
1073 cb.version = 1; /* Version 1 of the callout block */
1074 cb.callout_number = ecode[LINK_SIZE+2];
1075 cb.offset_vector = md->offset_vector;
1076 cb.subject = (PCRE_SPTR)md->start_subject;
1077 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078 cb.start_match = (int)(mstart - md->start_subject);
1079 cb.current_position = (int)(eptr - md->start_subject);
1080 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082 cb.capture_top = offset_top/2;
1083 cb.capture_last = md->capture_last;
1084 cb.callout_data = md->callout_data;
1085 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1086 if (rrc < 0) RRETURN(rrc);
1087 }
1088 ecode += _pcre_OP_lengths[OP_CALLOUT];
1089 }
1090
1091 condcode = ecode[LINK_SIZE+1];
1092
1093 /* Now see what the actual condition is */
1094
1095 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1096 {
1097 if (md->recursive == NULL) /* Not recursing => FALSE */
1098 {
1099 condition = FALSE;
1100 ecode += GET(ecode, 1);
1101 }
1102 else
1103 {
1104 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1105 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1106
1107 /* If the test is for recursion into a specific subpattern, and it is
1108 false, but the test was set up by name, scan the table to see if the
1109 name refers to any other numbers, and test them. The condition is true
1110 if any one is set. */
1111
1112 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1113 {
1114 uschar *slotA = md->name_table;
1115 for (i = 0; i < md->name_count; i++)
1116 {
1117 if (GET2(slotA, 0) == recno) break;
1118 slotA += md->name_entry_size;
1119 }
1120
1121 /* Found a name for the number - there can be only one; duplicate
1122 names for different numbers are allowed, but not vice versa. First
1123 scan down for duplicates. */
1124
1125 if (i < md->name_count)
1126 {
1127 uschar *slotB = slotA;
1128 while (slotB > md->name_table)
1129 {
1130 slotB -= md->name_entry_size;
1131 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132 {
1133 condition = GET2(slotB, 0) == md->recursive->group_num;
1134 if (condition) break;
1135 }
1136 else break;
1137 }
1138
1139 /* Scan up for duplicates */
1140
1141 if (!condition)
1142 {
1143 slotB = slotA;
1144 for (i++; i < md->name_count; i++)
1145 {
1146 slotB += md->name_entry_size;
1147 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1148 {
1149 condition = GET2(slotB, 0) == md->recursive->group_num;
1150 if (condition) break;
1151 }
1152 else break;
1153 }
1154 }
1155 }
1156 }
1157
1158 /* Choose branch according to the condition */
1159
1160 ecode += condition? 3 : GET(ecode, 1);
1161 }
1162 }
1163
1164 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1165 {
1166 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1167 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1168
1169 /* If the numbered capture is unset, but the reference was by name,
1170 scan the table to see if the name refers to any other numbers, and test
1171 them. The condition is true if any one is set. This is tediously similar
1172 to the code above, but not close enough to try to amalgamate. */
1173
1174 if (!condition && condcode == OP_NCREF)
1175 {
1176 int refno = offset >> 1;
1177 uschar *slotA = md->name_table;
1178
1179 for (i = 0; i < md->name_count; i++)
1180 {
1181 if (GET2(slotA, 0) == refno) break;
1182 slotA += md->name_entry_size;
1183 }
1184
1185 /* Found a name for the number - there can be only one; duplicate names
1186 for different numbers are allowed, but not vice versa. First scan down
1187 for duplicates. */
1188
1189 if (i < md->name_count)
1190 {
1191 uschar *slotB = slotA;
1192 while (slotB > md->name_table)
1193 {
1194 slotB -= md->name_entry_size;
1195 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1196 {
1197 offset = GET2(slotB, 0) << 1;
1198 condition = offset < offset_top &&
1199 md->offset_vector[offset] >= 0;
1200 if (condition) break;
1201 }
1202 else break;
1203 }
1204
1205 /* Scan up for duplicates */
1206
1207 if (!condition)
1208 {
1209 slotB = slotA;
1210 for (i++; i < md->name_count; i++)
1211 {
1212 slotB += md->name_entry_size;
1213 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1214 {
1215 offset = GET2(slotB, 0) << 1;
1216 condition = offset < offset_top &&
1217 md->offset_vector[offset] >= 0;
1218 if (condition) break;
1219 }
1220 else break;
1221 }
1222 }
1223 }
1224 }
1225
1226 /* Choose branch according to the condition */
1227
1228 ecode += condition? 3 : GET(ecode, 1);
1229 }
1230
1231 else if (condcode == OP_DEF) /* DEFINE - always false */
1232 {
1233 condition = FALSE;
1234 ecode += GET(ecode, 1);
1235 }
1236
1237 /* The condition is an assertion. Call match() to evaluate it - setting
1238 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1239 an assertion. */
1240
1241 else
1242 {
1243 md->match_function_type = MATCH_CONDASSERT;
1244 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1245 if (rrc == MATCH_MATCH)
1246 {
1247 if (md->end_offset_top > offset_top)
1248 offset_top = md->end_offset_top; /* Captures may have happened */
1249 condition = TRUE;
1250 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1251 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1252 }
1253 else if (rrc != MATCH_NOMATCH &&
1254 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1255 {
1256 RRETURN(rrc); /* Need braces because of following else */
1257 }
1258 else
1259 {
1260 condition = FALSE;
1261 ecode += codelink;
1262 }
1263 }
1264
1265 /* We are now at the branch that is to be obeyed. As there is only one,
1266 we used to use tail recursion to avoid using another stack frame, except
1267 when there was unlimited repeat of a possibly empty group. However, that
1268 strategy no longer works because of the possibility of (*THEN) being
1269 encountered in the branch. A recursive call to match() is always required,
1270 unless the second alternative doesn't exist, in which case we can just
1271 plough on. */
1272
1273 if (condition || *ecode == OP_ALT)
1274 {
1275 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1276 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1277 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1278 rrc = MATCH_NOMATCH;
1279 RRETURN(rrc);
1280 }
1281 else /* Condition false & no alternative */
1282 {
1283 ecode += 1 + LINK_SIZE;
1284 }
1285 break;
1286
1287
1288 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1289 to close any currently open capturing brackets. */
1290
1291 case OP_CLOSE:
1292 number = GET2(ecode, 1);
1293 offset = number << 1;
1294
1295 #ifdef PCRE_DEBUG
1296 printf("end bracket %d at *ACCEPT", number);
1297 printf("\n");
1298 #endif
1299
1300 md->capture_last = number;
1301 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1302 {
1303 md->offset_vector[offset] =
1304 md->offset_vector[md->offset_end - number];
1305 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1306 if (offset_top <= offset) offset_top = offset + 2;
1307 }
1308 ecode += 3;
1309 break;
1310
1311
1312 /* End of the pattern, either real or forced. */
1313
1314 case OP_END:
1315 case OP_ACCEPT:
1316 case OP_ASSERT_ACCEPT:
1317
1318 /* If we have matched an empty string, fail if not in an assertion and not
1319 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1320 is set and we have matched at the start of the subject. In both cases,
1321 backtracking will then try other alternatives, if any. */
1322
1323 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1324 md->recursive == NULL &&
1325 (md->notempty ||
1326 (md->notempty_atstart &&
1327 mstart == md->start_subject + md->start_offset)))
1328 MRRETURN(MATCH_NOMATCH);
1329
1330 /* Otherwise, we have a match. */
1331
1332 md->end_match_ptr = eptr; /* Record where we ended */
1333 md->end_offset_top = offset_top; /* and how many extracts were taken */
1334 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1335
1336 /* For some reason, the macros don't work properly if an expression is
1337 given as the argument to MRRETURN when the heap is in use. */
1338
1339 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1340 MRRETURN(rrc);
1341
1342 /* Assertion brackets. Check the alternative branches in turn - the
1343 matching won't pass the KET for an assertion. If any one branch matches,
1344 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1345 start of each branch to move the current point backwards, so the code at
1346 this level is identical to the lookahead case. When the assertion is part
1347 of a condition, we want to return immediately afterwards. The caller of
1348 this incarnation of the match() function will have set MATCH_CONDASSERT in
1349 md->match_function_type, and one of these opcodes will be the first opcode
1350 that is processed. We use a local variable that is preserved over calls to
1351 match() to remember this case. */
1352
1353 case OP_ASSERT:
1354 case OP_ASSERTBACK:
1355 if (md->match_function_type == MATCH_CONDASSERT)
1356 {
1357 condassert = TRUE;
1358 md->match_function_type = 0;
1359 }
1360 else condassert = FALSE;
1361
1362 do
1363 {
1364 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1365 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1366 {
1367 mstart = md->start_match_ptr; /* In case \K reset it */
1368 markptr = md->mark;
1369 break;
1370 }
1371 if (rrc != MATCH_NOMATCH &&
1372 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1373 RRETURN(rrc);
1374 ecode += GET(ecode, 1);
1375 }
1376 while (*ecode == OP_ALT);
1377
1378 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1379
1380 /* If checking an assertion for a condition, return MATCH_MATCH. */
1381
1382 if (condassert) RRETURN(MATCH_MATCH);
1383
1384 /* Continue from after the assertion, updating the offsets high water
1385 mark, since extracts may have been taken during the assertion. */
1386
1387 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1388 ecode += 1 + LINK_SIZE;
1389 offset_top = md->end_offset_top;
1390 continue;
1391
1392 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1393 PRUNE, or COMMIT means we must assume failure without checking subsequent
1394 branches. */
1395
1396 case OP_ASSERT_NOT:
1397 case OP_ASSERTBACK_NOT:
1398 if (md->match_function_type == MATCH_CONDASSERT)
1399 {
1400 condassert = TRUE;
1401 md->match_function_type = 0;
1402 }
1403 else condassert = FALSE;
1404
1405 do
1406 {
1407 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1408 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1409 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1410 {
1411 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1412 break;
1413 }
1414 if (rrc != MATCH_NOMATCH &&
1415 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1416 RRETURN(rrc);
1417 ecode += GET(ecode,1);
1418 }
1419 while (*ecode == OP_ALT);
1420
1421 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1422
1423 ecode += 1 + LINK_SIZE;
1424 continue;
1425
1426 /* Move the subject pointer back. This occurs only at the start of
1427 each branch of a lookbehind assertion. If we are too close to the start to
1428 move back, this match function fails. When working with UTF-8 we move
1429 back a number of characters, not bytes. */
1430
1431 case OP_REVERSE:
1432 #ifdef SUPPORT_UTF8
1433 if (utf8)
1434 {
1435 i = GET(ecode, 1);
1436 while (i-- > 0)
1437 {
1438 eptr--;
1439 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1440 BACKCHAR(eptr);
1441 }
1442 }
1443 else
1444 #endif
1445
1446 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1447
1448 {
1449 eptr -= GET(ecode, 1);
1450 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1451 }
1452
1453 /* Save the earliest consulted character, then skip to next op code */
1454
1455 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1456 ecode += 1 + LINK_SIZE;
1457 break;
1458
1459 /* The callout item calls an external function, if one is provided, passing
1460 details of the match so far. This is mainly for debugging, though the
1461 function is able to force a failure. */
1462
1463 case OP_CALLOUT:
1464 if (pcre_callout != NULL)
1465 {
1466 pcre_callout_block cb;
1467 cb.version = 1; /* Version 1 of the callout block */
1468 cb.callout_number = ecode[1];
1469 cb.offset_vector = md->offset_vector;
1470 cb.subject = (PCRE_SPTR)md->start_subject;
1471 cb.subject_length = (int)(md->end_subject - md->start_subject);
1472 cb.start_match = (int)(mstart - md->start_subject);
1473 cb.current_position = (int)(eptr - md->start_subject);
1474 cb.pattern_position = GET(ecode, 2);
1475 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1476 cb.capture_top = offset_top/2;
1477 cb.capture_last = md->capture_last;
1478 cb.callout_data = md->callout_data;
1479 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1480 if (rrc < 0) RRETURN(rrc);
1481 }
1482 ecode += 2 + 2*LINK_SIZE;
1483 break;
1484
1485 /* Recursion either matches the current regex, or some subexpression. The
1486 offset data is the offset to the starting bracket from the start of the
1487 whole pattern. (This is so that it works from duplicated subpatterns.)
1488
1489 The state of the capturing groups is preserved over recursion, and
1490 re-instated afterwards. We don't know how many are started and not yet
1491 finished (offset_top records the completed total) so we just have to save
1492 all the potential data. There may be up to 65535 such values, which is too
1493 large to put on the stack, but using malloc for small numbers seems
1494 expensive. As a compromise, the stack is used when there are no more than
1495 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1496
1497 There are also other values that have to be saved. We use a chained
1498 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1499 for the original version of this logic. It has, however, been hacked around
1500 a lot, so he is not to blame for the current way it works. */
1501
1502 case OP_RECURSE:
1503 {
1504 callpat = md->start_code + GET(ecode, 1);
1505 new_recursive.group_num = (callpat == md->start_code)? 0 :
1506 GET2(callpat, 1 + LINK_SIZE);
1507
1508 /* Add to "recursing stack" */
1509
1510 new_recursive.prevrec = md->recursive;
1511 md->recursive = &new_recursive;
1512
1513 /* Where to continue from afterwards */
1514
1515 ecode += 1 + LINK_SIZE;
1516
1517 /* Now save the offset data */
1518
1519 new_recursive.saved_max = md->offset_end;
1520 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1521 new_recursive.offset_save = stacksave;
1522 else
1523 {
1524 new_recursive.offset_save =
1525 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1526 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1527 }
1528 memcpy(new_recursive.offset_save, md->offset_vector,
1529 new_recursive.saved_max * sizeof(int));
1530
1531 /* OK, now we can do the recursion. After processing each alternative,
1532 restore the offset data. If there were nested recursions, md->recursive
1533 might be changed, so reset it before looping. */
1534
1535 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1536 cbegroup = (*callpat >= OP_SBRA);
1537 do
1538 {
1539 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1540 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1541 md, eptrb, RM6);
1542 memcpy(md->offset_vector, new_recursive.offset_save,
1543 new_recursive.saved_max * sizeof(int));
1544 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1545 {
1546 DPRINTF(("Recursion matched\n"));
1547 md->recursive = new_recursive.prevrec;
1548 if (new_recursive.offset_save != stacksave)
1549 (pcre_free)(new_recursive.offset_save);
1550
1551 /* Set where we got to in the subject, and reset the start in case
1552 it was changed by \K. This *is* propagated back out of a recursion,
1553 for Perl compatibility. */
1554
1555 eptr = md->end_match_ptr;
1556 mstart = md->start_match_ptr;
1557 goto RECURSION_MATCHED; /* Exit loop; end processing */
1558 }
1559 else if (rrc != MATCH_NOMATCH &&
1560 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1561 {
1562 DPRINTF(("Recursion gave error %d\n", rrc));
1563 if (new_recursive.offset_save != stacksave)
1564 (pcre_free)(new_recursive.offset_save);
1565 RRETURN(rrc);
1566 }
1567
1568 md->recursive = &new_recursive;
1569 callpat += GET(callpat, 1);
1570 }
1571 while (*callpat == OP_ALT);
1572
1573 DPRINTF(("Recursion didn't match\n"));
1574 md->recursive = new_recursive.prevrec;
1575 if (new_recursive.offset_save != stacksave)
1576 (pcre_free)(new_recursive.offset_save);
1577 MRRETURN(MATCH_NOMATCH);
1578 }
1579
1580 RECURSION_MATCHED:
1581 break;
1582
1583 /* An alternation is the end of a branch; scan along to find the end of the
1584 bracketed group and go to there. */
1585
1586 case OP_ALT:
1587 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1588 break;
1589
1590 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1591 indicating that it may occur zero times. It may repeat infinitely, or not
1592 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1593 with fixed upper repeat limits are compiled as a number of copies, with the
1594 optional ones preceded by BRAZERO or BRAMINZERO. */
1595
1596 case OP_BRAZERO:
1597 next = ecode + 1;
1598 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1600 do next += GET(next, 1); while (*next == OP_ALT);
1601 ecode = next + 1 + LINK_SIZE;
1602 break;
1603
1604 case OP_BRAMINZERO:
1605 next = ecode + 1;
1606 do next += GET(next, 1); while (*next == OP_ALT);
1607 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1608 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1609 ecode++;
1610 break;
1611
1612 case OP_SKIPZERO:
1613 next = ecode+1;
1614 do next += GET(next,1); while (*next == OP_ALT);
1615 ecode = next + 1 + LINK_SIZE;
1616 break;
1617
1618 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1619 here; just jump to the group, with allow_zero set TRUE. */
1620
1621 case OP_BRAPOSZERO:
1622 op = *(++ecode);
1623 allow_zero = TRUE;
1624 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1625 goto POSSESSIVE_NON_CAPTURE;
1626
1627 /* End of a group, repeated or non-repeating. */
1628
1629 case OP_KET:
1630 case OP_KETRMIN:
1631 case OP_KETRMAX:
1632 case OP_KETRPOS:
1633 prev = ecode - GET(ecode, 1);
1634
1635 /* If this was a group that remembered the subject start, in order to break
1636 infinite repeats of empty string matches, retrieve the subject start from
1637 the chain. Otherwise, set it NULL. */
1638
1639 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1640 {
1641 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1642 eptrb = eptrb->epb_prev; /* Backup to previous group */
1643 }
1644 else saved_eptr = NULL;
1645
1646 /* If we are at the end of an assertion group, stop matching and return
1647 MATCH_MATCH, but record the current high water mark for use by positive
1648 assertions. We also need to record the match start in case it was changed
1649 by \K. */
1650
1651 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1652 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1653 {
1654 md->end_match_ptr = eptr; /* For ONCE */
1655 md->end_offset_top = offset_top;
1656 md->start_match_ptr = mstart;
1657 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1658 }
1659
1660 /* For capturing groups we have to check the group number back at the start
1661 and if necessary complete handling an extraction by setting the offsets and
1662 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1663 into group 0, so it won't be picked up here. Instead, we catch it when the
1664 OP_END is reached. Other recursion is handled here. We just have to record
1665 the current subject position and start match pointer and give a MATCH
1666 return. */
1667
1668 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1669 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1670 {
1671 number = GET2(prev, 1+LINK_SIZE);
1672 offset = number << 1;
1673
1674 #ifdef PCRE_DEBUG
1675 printf("end bracket %d", number);
1676 printf("\n");
1677 #endif
1678
1679 /* Handle a recursively called group. */
1680
1681 if (md->recursive != NULL && md->recursive->group_num == number)
1682 {
1683 md->end_match_ptr = eptr;
1684 md->start_match_ptr = mstart;
1685 RRETURN(MATCH_MATCH);
1686 }
1687
1688 /* Deal with capturing */
1689
1690 md->capture_last = number;
1691 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1692 {
1693 /* If offset is greater than offset_top, it means that we are
1694 "skipping" a capturing group, and that group's offsets must be marked
1695 unset. In earlier versions of PCRE, all the offsets were unset at the
1696 start of matching, but this doesn't work because atomic groups and
1697 assertions can cause a value to be set that should later be unset.
1698 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1699 part of the atomic group, but this is not on the final matching path,
1700 so must be unset when 2 is set. (If there is no group 2, there is no
1701 problem, because offset_top will then be 2, indicating no capture.) */
1702
1703 if (offset > offset_top)
1704 {
1705 register int *iptr = md->offset_vector + offset_top;
1706 register int *iend = md->offset_vector + offset;
1707 while (iptr < iend) *iptr++ = -1;
1708 }
1709
1710 /* Now make the extraction */
1711
1712 md->offset_vector[offset] =
1713 md->offset_vector[md->offset_end - number];
1714 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1715 if (offset_top <= offset) offset_top = offset + 2;
1716 }
1717 }
1718
1719 /* For an ordinary non-repeating ket, just continue at this level. This
1720 also happens for a repeating ket if no characters were matched in the
1721 group. This is the forcible breaking of infinite loops as implemented in
1722 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1723 processing the rest of the pattern at a lower level. If this results in a
1724 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1725 bypassing intermediate backup points, but resetting any captures that
1726 happened along the way. */
1727
1728 if (*ecode == OP_KET || eptr == saved_eptr)
1729 {
1730 if (*prev == OP_ONCE)
1731 {
1732 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1733 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1734 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1735 RRETURN(MATCH_ONCE);
1736 }
1737 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1738 break;
1739 }
1740
1741 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1742 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1743 at a time from the outer level, thus saving stack. */
1744
1745 if (*ecode == OP_KETRPOS)
1746 {
1747 md->end_match_ptr = eptr;
1748 md->end_offset_top = offset_top;
1749 RRETURN(MATCH_KETRPOS);
1750 }
1751
1752 /* The normal repeating kets try the rest of the pattern or restart from
1753 the preceding bracket, in the appropriate order. In the second case, we can
1754 use tail recursion to avoid using another stack frame, unless we have an
1755 atomic group or an unlimited repeat of a group that can match an empty
1756 string. */
1757
1758 if (*ecode == OP_KETRMIN)
1759 {
1760 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1761 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1762 if (*prev == OP_ONCE)
1763 {
1764 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1765 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1766 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1767 RRETURN(MATCH_ONCE);
1768 }
1769 if (*prev >= OP_SBRA) /* Could match an empty string */
1770 {
1771 md->match_function_type = MATCH_CBEGROUP;
1772 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1773 RRETURN(rrc);
1774 }
1775 ecode = prev;
1776 goto TAIL_RECURSE;
1777 }
1778 else /* OP_KETRMAX */
1779 {
1780 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1781 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1782 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784 if (*prev == OP_ONCE)
1785 {
1786 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1787 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1788 md->once_target = prev;
1789 RRETURN(MATCH_ONCE);
1790 }
1791 ecode += 1 + LINK_SIZE;
1792 goto TAIL_RECURSE;
1793 }
1794 /* Control never gets here */
1795
1796 /* Not multiline mode: start of subject assertion, unless notbol. */
1797
1798 case OP_CIRC:
1799 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1800
1801 /* Start of subject assertion */
1802
1803 case OP_SOD:
1804 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1805 ecode++;
1806 break;
1807
1808 /* Multiline mode: start of subject unless notbol, or after any newline. */
1809
1810 case OP_CIRCM:
1811 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1812 if (eptr != md->start_subject &&
1813 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1814 MRRETURN(MATCH_NOMATCH);
1815 ecode++;
1816 break;
1817
1818 /* Start of match assertion */
1819
1820 case OP_SOM:
1821 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1822 ecode++;
1823 break;
1824
1825 /* Reset the start of match point */
1826
1827 case OP_SET_SOM:
1828 mstart = eptr;
1829 ecode++;
1830 break;
1831
1832 /* Multiline mode: assert before any newline, or before end of subject
1833 unless noteol is set. */
1834
1835 case OP_DOLLM:
1836 if (eptr < md->end_subject)
1837 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1838 else
1839 {
1840 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1841 SCHECK_PARTIAL();
1842 }
1843 ecode++;
1844 break;
1845
1846 /* Not multiline mode: assert before a terminating newline or before end of
1847 subject unless noteol is set. */
1848
1849 case OP_DOLL:
1850 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1851 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1852
1853 /* ... else fall through for endonly */
1854
1855 /* End of subject assertion (\z) */
1856
1857 case OP_EOD:
1858 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1859 SCHECK_PARTIAL();
1860 ecode++;
1861 break;
1862
1863 /* End of subject or ending \n assertion (\Z) */
1864
1865 case OP_EODN:
1866 ASSERT_NL_OR_EOS:
1867 if (eptr < md->end_subject &&
1868 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1869 MRRETURN(MATCH_NOMATCH);
1870
1871 /* Either at end of string or \n before end. */
1872
1873 SCHECK_PARTIAL();
1874 ecode++;
1875 break;
1876
1877 /* Word boundary assertions */
1878
1879 case OP_NOT_WORD_BOUNDARY:
1880 case OP_WORD_BOUNDARY:
1881 {
1882
1883 /* Find out if the previous and current characters are "word" characters.
1884 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1885 be "non-word" characters. Remember the earliest consulted character for
1886 partial matching. */
1887
1888 #ifdef SUPPORT_UTF8
1889 if (utf8)
1890 {
1891 /* Get status of previous character */
1892
1893 if (eptr == md->start_subject) prev_is_word = FALSE; else
1894 {
1895 USPTR lastptr = eptr - 1;
1896 while((*lastptr & 0xc0) == 0x80) lastptr--;
1897 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1898 GETCHAR(c, lastptr);
1899 #ifdef SUPPORT_UCP
1900 if (md->use_ucp)
1901 {
1902 if (c == '_') prev_is_word = TRUE; else
1903 {
1904 int cat = UCD_CATEGORY(c);
1905 prev_is_word = (cat == ucp_L || cat == ucp_N);
1906 }
1907 }
1908 else
1909 #endif
1910 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1911 }
1912
1913 /* Get status of next character */
1914
1915 if (eptr >= md->end_subject)
1916 {
1917 SCHECK_PARTIAL();
1918 cur_is_word = FALSE;
1919 }
1920 else
1921 {
1922 GETCHAR(c, eptr);
1923 #ifdef SUPPORT_UCP
1924 if (md->use_ucp)
1925 {
1926 if (c == '_') cur_is_word = TRUE; else
1927 {
1928 int cat = UCD_CATEGORY(c);
1929 cur_is_word = (cat == ucp_L || cat == ucp_N);
1930 }
1931 }
1932 else
1933 #endif
1934 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1935 }
1936 }
1937 else
1938 #endif
1939
1940 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1941 consistency with the behaviour of \w we do use it in this case. */
1942
1943 {
1944 /* Get status of previous character */
1945
1946 if (eptr == md->start_subject) prev_is_word = FALSE; else
1947 {
1948 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1949 #ifdef SUPPORT_UCP
1950 if (md->use_ucp)
1951 {
1952 c = eptr[-1];
1953 if (c == '_') prev_is_word = TRUE; else
1954 {
1955 int cat = UCD_CATEGORY(c);
1956 prev_is_word = (cat == ucp_L || cat == ucp_N);
1957 }
1958 }
1959 else
1960 #endif
1961 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1962 }
1963
1964 /* Get status of next character */
1965
1966 if (eptr >= md->end_subject)
1967 {
1968 SCHECK_PARTIAL();
1969 cur_is_word = FALSE;
1970 }
1971 else
1972 #ifdef SUPPORT_UCP
1973 if (md->use_ucp)
1974 {
1975 c = *eptr;
1976 if (c == '_') cur_is_word = TRUE; else
1977 {
1978 int cat = UCD_CATEGORY(c);
1979 cur_is_word = (cat == ucp_L || cat == ucp_N);
1980 }
1981 }
1982 else
1983 #endif
1984 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1985 }
1986
1987 /* Now see if the situation is what we want */
1988
1989 if ((*ecode++ == OP_WORD_BOUNDARY)?
1990 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1991 MRRETURN(MATCH_NOMATCH);
1992 }
1993 break;
1994
1995 /* Match a single character type; inline for speed */
1996
1997 case OP_ANY:
1998 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1999 /* Fall through */
2000
2001 case OP_ALLANY:
2002 if (eptr++ >= md->end_subject)
2003 {
2004 SCHECK_PARTIAL();
2005 MRRETURN(MATCH_NOMATCH);
2006 }
2007 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2008 ecode++;
2009 break;
2010
2011 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2012 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2013
2014 case OP_ANYBYTE:
2015 if (eptr++ >= md->end_subject)
2016 {
2017 SCHECK_PARTIAL();
2018 MRRETURN(MATCH_NOMATCH);
2019 }
2020 ecode++;
2021 break;
2022
2023 case OP_NOT_DIGIT:
2024 if (eptr >= md->end_subject)
2025 {
2026 SCHECK_PARTIAL();
2027 MRRETURN(MATCH_NOMATCH);
2028 }
2029 GETCHARINCTEST(c, eptr);
2030 if (
2031 #ifdef SUPPORT_UTF8
2032 c < 256 &&
2033 #endif
2034 (md->ctypes[c] & ctype_digit) != 0
2035 )
2036 MRRETURN(MATCH_NOMATCH);
2037 ecode++;
2038 break;
2039
2040 case OP_DIGIT:
2041 if (eptr >= md->end_subject)
2042 {
2043 SCHECK_PARTIAL();
2044 MRRETURN(MATCH_NOMATCH);
2045 }
2046 GETCHARINCTEST(c, eptr);
2047 if (
2048 #ifdef SUPPORT_UTF8
2049 c >= 256 ||
2050 #endif
2051 (md->ctypes[c] & ctype_digit) == 0
2052 )
2053 MRRETURN(MATCH_NOMATCH);
2054 ecode++;
2055 break;
2056
2057 case OP_NOT_WHITESPACE:
2058 if (eptr >= md->end_subject)
2059 {
2060 SCHECK_PARTIAL();
2061 MRRETURN(MATCH_NOMATCH);
2062 }
2063 GETCHARINCTEST(c, eptr);
2064 if (
2065 #ifdef SUPPORT_UTF8
2066 c < 256 &&
2067 #endif
2068 (md->ctypes[c] & ctype_space) != 0
2069 )
2070 MRRETURN(MATCH_NOMATCH);
2071 ecode++;
2072 break;
2073
2074 case OP_WHITESPACE:
2075 if (eptr >= md->end_subject)
2076 {
2077 SCHECK_PARTIAL();
2078 MRRETURN(MATCH_NOMATCH);
2079 }
2080 GETCHARINCTEST(c, eptr);
2081 if (
2082 #ifdef SUPPORT_UTF8
2083 c >= 256 ||
2084 #endif
2085 (md->ctypes[c] & ctype_space) == 0
2086 )
2087 MRRETURN(MATCH_NOMATCH);
2088 ecode++;
2089 break;
2090
2091 case OP_NOT_WORDCHAR:
2092 if (eptr >= md->end_subject)
2093 {
2094 SCHECK_PARTIAL();
2095 MRRETURN(MATCH_NOMATCH);
2096 }
2097 GETCHARINCTEST(c, eptr);
2098 if (
2099 #ifdef SUPPORT_UTF8
2100 c < 256 &&
2101 #endif
2102 (md->ctypes[c] & ctype_word) != 0
2103 )
2104 MRRETURN(MATCH_NOMATCH);
2105 ecode++;
2106 break;
2107
2108 case OP_WORDCHAR:
2109 if (eptr >= md->end_subject)
2110 {
2111 SCHECK_PARTIAL();
2112 MRRETURN(MATCH_NOMATCH);
2113 }
2114 GETCHARINCTEST(c, eptr);
2115 if (
2116 #ifdef SUPPORT_UTF8
2117 c >= 256 ||
2118 #endif
2119 (md->ctypes[c] & ctype_word) == 0
2120 )
2121 MRRETURN(MATCH_NOMATCH);
2122 ecode++;
2123 break;
2124
2125 case OP_ANYNL:
2126 if (eptr >= md->end_subject)
2127 {
2128 SCHECK_PARTIAL();
2129 MRRETURN(MATCH_NOMATCH);
2130 }
2131 GETCHARINCTEST(c, eptr);
2132 switch(c)
2133 {
2134 default: MRRETURN(MATCH_NOMATCH);
2135
2136 case 0x000d:
2137 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2138 break;
2139
2140 case 0x000a:
2141 break;
2142
2143 case 0x000b:
2144 case 0x000c:
2145 case 0x0085:
2146 case 0x2028:
2147 case 0x2029:
2148 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2149 break;
2150 }
2151 ecode++;
2152 break;
2153
2154 case OP_NOT_HSPACE:
2155 if (eptr >= md->end_subject)
2156 {
2157 SCHECK_PARTIAL();
2158 MRRETURN(MATCH_NOMATCH);
2159 }
2160 GETCHARINCTEST(c, eptr);
2161 switch(c)
2162 {
2163 default: break;
2164 case 0x09: /* HT */
2165 case 0x20: /* SPACE */
2166 case 0xa0: /* NBSP */
2167 case 0x1680: /* OGHAM SPACE MARK */
2168 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2169 case 0x2000: /* EN QUAD */
2170 case 0x2001: /* EM QUAD */
2171 case 0x2002: /* EN SPACE */
2172 case 0x2003: /* EM SPACE */
2173 case 0x2004: /* THREE-PER-EM SPACE */
2174 case 0x2005: /* FOUR-PER-EM SPACE */
2175 case 0x2006: /* SIX-PER-EM SPACE */
2176 case 0x2007: /* FIGURE SPACE */
2177 case 0x2008: /* PUNCTUATION SPACE */
2178 case 0x2009: /* THIN SPACE */
2179 case 0x200A: /* HAIR SPACE */
2180 case 0x202f: /* NARROW NO-BREAK SPACE */
2181 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2182 case 0x3000: /* IDEOGRAPHIC SPACE */
2183 MRRETURN(MATCH_NOMATCH);
2184 }
2185 ecode++;
2186 break;
2187
2188 case OP_HSPACE:
2189 if (eptr >= md->end_subject)
2190 {
2191 SCHECK_PARTIAL();
2192 MRRETURN(MATCH_NOMATCH);
2193 }
2194 GETCHARINCTEST(c, eptr);
2195 switch(c)
2196 {
2197 default: MRRETURN(MATCH_NOMATCH);
2198 case 0x09: /* HT */
2199 case 0x20: /* SPACE */
2200 case 0xa0: /* NBSP */
2201 case 0x1680: /* OGHAM SPACE MARK */
2202 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2203 case 0x2000: /* EN QUAD */
2204 case 0x2001: /* EM QUAD */
2205 case 0x2002: /* EN SPACE */
2206 case 0x2003: /* EM SPACE */
2207 case 0x2004: /* THREE-PER-EM SPACE */
2208 case 0x2005: /* FOUR-PER-EM SPACE */
2209 case 0x2006: /* SIX-PER-EM SPACE */
2210 case 0x2007: /* FIGURE SPACE */
2211 case 0x2008: /* PUNCTUATION SPACE */
2212 case 0x2009: /* THIN SPACE */
2213 case 0x200A: /* HAIR SPACE */
2214 case 0x202f: /* NARROW NO-BREAK SPACE */
2215 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2216 case 0x3000: /* IDEOGRAPHIC SPACE */
2217 break;
2218 }
2219 ecode++;
2220 break;
2221
2222 case OP_NOT_VSPACE:
2223 if (eptr >= md->end_subject)
2224 {
2225 SCHECK_PARTIAL();
2226 MRRETURN(MATCH_NOMATCH);
2227 }
2228 GETCHARINCTEST(c, eptr);
2229 switch(c)
2230 {
2231 default: break;
2232 case 0x0a: /* LF */
2233 case 0x0b: /* VT */
2234 case 0x0c: /* FF */
2235 case 0x0d: /* CR */
2236 case 0x85: /* NEL */
2237 case 0x2028: /* LINE SEPARATOR */
2238 case 0x2029: /* PARAGRAPH SEPARATOR */
2239 MRRETURN(MATCH_NOMATCH);
2240 }
2241 ecode++;
2242 break;
2243
2244 case OP_VSPACE:
2245 if (eptr >= md->end_subject)
2246 {
2247 SCHECK_PARTIAL();
2248 MRRETURN(MATCH_NOMATCH);
2249 }
2250 GETCHARINCTEST(c, eptr);
2251 switch(c)
2252 {
2253 default: MRRETURN(MATCH_NOMATCH);
2254 case 0x0a: /* LF */
2255 case 0x0b: /* VT */
2256 case 0x0c: /* FF */
2257 case 0x0d: /* CR */
2258 case 0x85: /* NEL */
2259 case 0x2028: /* LINE SEPARATOR */
2260 case 0x2029: /* PARAGRAPH SEPARATOR */
2261 break;
2262 }
2263 ecode++;
2264 break;
2265
2266 #ifdef SUPPORT_UCP
2267 /* Check the next character by Unicode property. We will get here only
2268 if the support is in the binary; otherwise a compile-time error occurs. */
2269
2270 case OP_PROP:
2271 case OP_NOTPROP:
2272 if (eptr >= md->end_subject)
2273 {
2274 SCHECK_PARTIAL();
2275 MRRETURN(MATCH_NOMATCH);
2276 }
2277 GETCHARINCTEST(c, eptr);
2278 {
2279 const ucd_record *prop = GET_UCD(c);
2280
2281 switch(ecode[1])
2282 {
2283 case PT_ANY:
2284 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2285 break;
2286
2287 case PT_LAMP:
2288 if ((prop->chartype == ucp_Lu ||
2289 prop->chartype == ucp_Ll ||
2290 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2291 MRRETURN(MATCH_NOMATCH);
2292 break;
2293
2294 case PT_GC:
2295 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2296 MRRETURN(MATCH_NOMATCH);
2297 break;
2298
2299 case PT_PC:
2300 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2301 MRRETURN(MATCH_NOMATCH);
2302 break;
2303
2304 case PT_SC:
2305 if ((ecode[2] != prop->script) == (op == OP_PROP))
2306 MRRETURN(MATCH_NOMATCH);
2307 break;
2308
2309 /* These are specials */
2310
2311 case PT_ALNUM:
2312 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2313 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2314 MRRETURN(MATCH_NOMATCH);
2315 break;
2316
2317 case PT_SPACE: /* Perl space */
2318 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2319 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2320 == (op == OP_NOTPROP))
2321 MRRETURN(MATCH_NOMATCH);
2322 break;
2323
2324 case PT_PXSPACE: /* POSIX space */
2325 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2326 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2327 c == CHAR_FF || c == CHAR_CR)
2328 == (op == OP_NOTPROP))
2329 MRRETURN(MATCH_NOMATCH);
2330 break;
2331
2332 case PT_WORD:
2333 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2334 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2335 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2336 MRRETURN(MATCH_NOMATCH);
2337 break;
2338
2339 /* This should never occur */
2340
2341 default:
2342 RRETURN(PCRE_ERROR_INTERNAL);
2343 }
2344
2345 ecode += 3;
2346 }
2347 break;
2348
2349 /* Match an extended Unicode sequence. We will get here only if the support
2350 is in the binary; otherwise a compile-time error occurs. */
2351
2352 case OP_EXTUNI:
2353 if (eptr >= md->end_subject)
2354 {
2355 SCHECK_PARTIAL();
2356 MRRETURN(MATCH_NOMATCH);
2357 }
2358 GETCHARINCTEST(c, eptr);
2359 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2360 while (eptr < md->end_subject)
2361 {
2362 int len = 1;
2363 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2364 if (UCD_CATEGORY(c) != ucp_M) break;
2365 eptr += len;
2366 }
2367 ecode++;
2368 break;
2369 #endif
2370
2371
2372 /* Match a back reference, possibly repeatedly. Look past the end of the
2373 item to see if there is repeat information following. The code is similar
2374 to that for character classes, but repeated for efficiency. Then obey
2375 similar code to character type repeats - written out again for speed.
2376 However, if the referenced string is the empty string, always treat
2377 it as matched, any number of times (otherwise there could be infinite
2378 loops). */
2379
2380 case OP_REF:
2381 case OP_REFI:
2382 caseless = op == OP_REFI;
2383 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2384 ecode += 3;
2385
2386 /* If the reference is unset, there are two possibilities:
2387
2388 (a) In the default, Perl-compatible state, set the length negative;
2389 this ensures that every attempt at a match fails. We can't just fail
2390 here, because of the possibility of quantifiers with zero minima.
2391
2392 (b) If the JavaScript compatibility flag is set, set the length to zero
2393 so that the back reference matches an empty string.
2394
2395 Otherwise, set the length to the length of what was matched by the
2396 referenced subpattern. */
2397
2398 if (offset >= offset_top || md->offset_vector[offset] < 0)
2399 length = (md->jscript_compat)? 0 : -1;
2400 else
2401 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2402
2403 /* Set up for repetition, or handle the non-repeated case */
2404
2405 switch (*ecode)
2406 {
2407 case OP_CRSTAR:
2408 case OP_CRMINSTAR:
2409 case OP_CRPLUS:
2410 case OP_CRMINPLUS:
2411 case OP_CRQUERY:
2412 case OP_CRMINQUERY:
2413 c = *ecode++ - OP_CRSTAR;
2414 minimize = (c & 1) != 0;
2415 min = rep_min[c]; /* Pick up values from tables; */
2416 max = rep_max[c]; /* zero for max => infinity */
2417 if (max == 0) max = INT_MAX;
2418 break;
2419
2420 case OP_CRRANGE:
2421 case OP_CRMINRANGE:
2422 minimize = (*ecode == OP_CRMINRANGE);
2423 min = GET2(ecode, 1);
2424 max = GET2(ecode, 3);
2425 if (max == 0) max = INT_MAX;
2426 ecode += 5;
2427 break;
2428
2429 default: /* No repeat follows */
2430 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2431 {
2432 CHECK_PARTIAL();
2433 MRRETURN(MATCH_NOMATCH);
2434 }
2435 eptr += length;
2436 continue; /* With the main loop */
2437 }
2438
2439 /* Handle repeated back references. If the length of the reference is
2440 zero, just continue with the main loop. */
2441
2442 if (length == 0) continue;
2443
2444 /* First, ensure the minimum number of matches are present. We get back
2445 the length of the reference string explicitly rather than passing the
2446 address of eptr, so that eptr can be a register variable. */
2447
2448 for (i = 1; i <= min; i++)
2449 {
2450 int slength;
2451 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2452 {
2453 CHECK_PARTIAL();
2454 MRRETURN(MATCH_NOMATCH);
2455 }
2456 eptr += slength;
2457 }
2458
2459 /* If min = max, continue at the same level without recursion.
2460 They are not both allowed to be zero. */
2461
2462 if (min == max) continue;
2463
2464 /* If minimizing, keep trying and advancing the pointer */
2465
2466 if (minimize)
2467 {
2468 for (fi = min;; fi++)
2469 {
2470 int slength;
2471 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2472 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2473 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2474 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2475 {
2476 CHECK_PARTIAL();
2477 MRRETURN(MATCH_NOMATCH);
2478 }
2479 eptr += slength;
2480 }
2481 /* Control never gets here */
2482 }
2483
2484 /* If maximizing, find the longest string and work backwards */
2485
2486 else
2487 {
2488 pp = eptr;
2489 for (i = min; i < max; i++)
2490 {
2491 int slength;
2492 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2493 {
2494 CHECK_PARTIAL();
2495 break;
2496 }
2497 eptr += slength;
2498 }
2499 while (eptr >= pp)
2500 {
2501 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2502 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2503 eptr -= length;
2504 }
2505 MRRETURN(MATCH_NOMATCH);
2506 }
2507 /* Control never gets here */
2508
2509 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2510 used when all the characters in the class have values in the range 0-255,
2511 and either the matching is caseful, or the characters are in the range
2512 0-127 when UTF-8 processing is enabled. The only difference between
2513 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2514 encountered.
2515
2516 First, look past the end of the item to see if there is repeat information
2517 following. Then obey similar code to character type repeats - written out
2518 again for speed. */
2519
2520 case OP_NCLASS:
2521 case OP_CLASS:
2522 {
2523 data = ecode + 1; /* Save for matching */
2524 ecode += 33; /* Advance past the item */
2525
2526 switch (*ecode)
2527 {
2528 case OP_CRSTAR:
2529 case OP_CRMINSTAR:
2530 case OP_CRPLUS:
2531 case OP_CRMINPLUS:
2532 case OP_CRQUERY:
2533 case OP_CRMINQUERY:
2534 c = *ecode++ - OP_CRSTAR;
2535 minimize = (c & 1) != 0;
2536 min = rep_min[c]; /* Pick up values from tables; */
2537 max = rep_max[c]; /* zero for max => infinity */
2538 if (max == 0) max = INT_MAX;
2539 break;
2540
2541 case OP_CRRANGE:
2542 case OP_CRMINRANGE:
2543 minimize = (*ecode == OP_CRMINRANGE);
2544 min = GET2(ecode, 1);
2545 max = GET2(ecode, 3);
2546 if (max == 0) max = INT_MAX;
2547 ecode += 5;
2548 break;
2549
2550 default: /* No repeat follows */
2551 min = max = 1;
2552 break;
2553 }
2554
2555 /* First, ensure the minimum number of matches are present. */
2556
2557 #ifdef SUPPORT_UTF8
2558 /* UTF-8 mode */
2559 if (utf8)
2560 {
2561 for (i = 1; i <= min; i++)
2562 {
2563 if (eptr >= md->end_subject)
2564 {
2565 SCHECK_PARTIAL();
2566 MRRETURN(MATCH_NOMATCH);
2567 }
2568 GETCHARINC(c, eptr);
2569 if (c > 255)
2570 {
2571 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2572 }
2573 else
2574 {
2575 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2576 }
2577 }
2578 }
2579 else
2580 #endif
2581 /* Not UTF-8 mode */
2582 {
2583 for (i = 1; i <= min; i++)
2584 {
2585 if (eptr >= md->end_subject)
2586 {
2587 SCHECK_PARTIAL();
2588 MRRETURN(MATCH_NOMATCH);
2589 }
2590 c = *eptr++;
2591 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2592 }
2593 }
2594
2595 /* If max == min we can continue with the main loop without the
2596 need to recurse. */
2597
2598 if (min == max) continue;
2599
2600 /* If minimizing, keep testing the rest of the expression and advancing
2601 the pointer while it matches the class. */
2602
2603 if (minimize)
2604 {
2605 #ifdef SUPPORT_UTF8
2606 /* UTF-8 mode */
2607 if (utf8)
2608 {
2609 for (fi = min;; fi++)
2610 {
2611 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2613 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2614 if (eptr >= md->end_subject)
2615 {
2616 SCHECK_PARTIAL();
2617 MRRETURN(MATCH_NOMATCH);
2618 }
2619 GETCHARINC(c, eptr);
2620 if (c > 255)
2621 {
2622 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2623 }
2624 else
2625 {
2626 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2627 }
2628 }
2629 }
2630 else
2631 #endif
2632 /* Not UTF-8 mode */
2633 {
2634 for (fi = min;; fi++)
2635 {
2636 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2638 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2639 if (eptr >= md->end_subject)
2640 {
2641 SCHECK_PARTIAL();
2642 MRRETURN(MATCH_NOMATCH);
2643 }
2644 c = *eptr++;
2645 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2646 }
2647 }
2648 /* Control never gets here */
2649 }
2650
2651 /* If maximizing, find the longest possible run, then work backwards. */
2652
2653 else
2654 {
2655 pp = eptr;
2656
2657 #ifdef SUPPORT_UTF8
2658 /* UTF-8 mode */
2659 if (utf8)
2660 {
2661 for (i = min; i < max; i++)
2662 {
2663 int len = 1;
2664 if (eptr >= md->end_subject)
2665 {
2666 SCHECK_PARTIAL();
2667 break;
2668 }
2669 GETCHARLEN(c, eptr, len);
2670 if (c > 255)
2671 {
2672 if (op == OP_CLASS) break;
2673 }
2674 else
2675 {
2676 if ((data[c/8] & (1 << (c&7))) == 0) break;
2677 }
2678 eptr += len;
2679 }
2680 for (;;)
2681 {
2682 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2683 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2684 if (eptr-- == pp) break; /* Stop if tried at original pos */
2685 BACKCHAR(eptr);
2686 }
2687 }
2688 else
2689 #endif
2690 /* Not UTF-8 mode */
2691 {
2692 for (i = min; i < max; i++)
2693 {
2694 if (eptr >= md->end_subject)
2695 {
2696 SCHECK_PARTIAL();
2697 break;
2698 }
2699 c = *eptr;
2700 if ((data[c/8] & (1 << (c&7))) == 0) break;
2701 eptr++;
2702 }
2703 while (eptr >= pp)
2704 {
2705 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2706 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2707 eptr--;
2708 }
2709 }
2710
2711 MRRETURN(MATCH_NOMATCH);
2712 }
2713 }
2714 /* Control never gets here */
2715
2716
2717 /* Match an extended character class. This opcode is encountered only
2718 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2719 mode, because Unicode properties are supported in non-UTF-8 mode. */
2720
2721 #ifdef SUPPORT_UTF8
2722 case OP_XCLASS:
2723 {
2724 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2725 ecode += GET(ecode, 1); /* Advance past the item */
2726
2727 switch (*ecode)
2728 {
2729 case OP_CRSTAR:
2730 case OP_CRMINSTAR:
2731 case OP_CRPLUS:
2732 case OP_CRMINPLUS:
2733 case OP_CRQUERY:
2734 case OP_CRMINQUERY:
2735 c = *ecode++ - OP_CRSTAR;
2736 minimize = (c & 1) != 0;
2737 min = rep_min[c]; /* Pick up values from tables; */
2738 max = rep_max[c]; /* zero for max => infinity */
2739 if (max == 0) max = INT_MAX;
2740 break;
2741
2742 case OP_CRRANGE:
2743 case OP_CRMINRANGE:
2744 minimize = (*ecode == OP_CRMINRANGE);
2745 min = GET2(ecode, 1);
2746 max = GET2(ecode, 3);
2747 if (max == 0) max = INT_MAX;
2748 ecode += 5;
2749 break;
2750
2751 default: /* No repeat follows */
2752 min = max = 1;
2753 break;
2754 }
2755
2756 /* First, ensure the minimum number of matches are present. */
2757
2758 for (i = 1; i <= min; i++)
2759 {
2760 if (eptr >= md->end_subject)
2761 {
2762 SCHECK_PARTIAL();
2763 MRRETURN(MATCH_NOMATCH);
2764 }
2765 GETCHARINCTEST(c, eptr);
2766 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2767 }
2768
2769 /* If max == min we can continue with the main loop without the
2770 need to recurse. */
2771
2772 if (min == max) continue;
2773
2774 /* If minimizing, keep testing the rest of the expression and advancing
2775 the pointer while it matches the class. */
2776
2777 if (minimize)
2778 {
2779 for (fi = min;; fi++)
2780 {
2781 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2783 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2784 if (eptr >= md->end_subject)
2785 {
2786 SCHECK_PARTIAL();
2787 MRRETURN(MATCH_NOMATCH);
2788 }
2789 GETCHARINCTEST(c, eptr);
2790 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2791 }
2792 /* Control never gets here */
2793 }
2794
2795 /* If maximizing, find the longest possible run, then work backwards. */
2796
2797 else
2798 {
2799 pp = eptr;
2800 for (i = min; i < max; i++)
2801 {
2802 int len = 1;
2803 if (eptr >= md->end_subject)
2804 {
2805 SCHECK_PARTIAL();
2806 break;
2807 }
2808 GETCHARLENTEST(c, eptr, len);
2809 if (!_pcre_xclass(c, data)) break;
2810 eptr += len;
2811 }
2812 for(;;)
2813 {
2814 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2816 if (eptr-- == pp) break; /* Stop if tried at original pos */
2817 if (utf8) BACKCHAR(eptr);
2818 }
2819 MRRETURN(MATCH_NOMATCH);
2820 }
2821
2822 /* Control never gets here */
2823 }
2824 #endif /* End of XCLASS */
2825
2826 /* Match a single character, casefully */
2827
2828 case OP_CHAR:
2829 #ifdef SUPPORT_UTF8
2830 if (utf8)
2831 {
2832 length = 1;
2833 ecode++;
2834 GETCHARLEN(fc, ecode, length);
2835 if (length > md->end_subject - eptr)
2836 {
2837 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2838 MRRETURN(MATCH_NOMATCH);
2839 }
2840 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2841 }
2842 else
2843 #endif
2844
2845 /* Non-UTF-8 mode */
2846 {
2847 if (md->end_subject - eptr < 1)
2848 {
2849 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2850 MRRETURN(MATCH_NOMATCH);
2851 }
2852 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2853 ecode += 2;
2854 }
2855 break;
2856
2857 /* Match a single character, caselessly */
2858
2859 case OP_CHARI:
2860 #ifdef SUPPORT_UTF8
2861 if (utf8)
2862 {
2863 length = 1;
2864 ecode++;
2865 GETCHARLEN(fc, ecode, length);
2866
2867 if (length > md->end_subject - eptr)
2868 {
2869 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2870 MRRETURN(MATCH_NOMATCH);
2871 }
2872
2873 /* If the pattern character's value is < 128, we have only one byte, and
2874 can use the fast lookup table. */
2875
2876 if (fc < 128)
2877 {
2878 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2879 }
2880
2881 /* Otherwise we must pick up the subject character */
2882
2883 else
2884 {
2885 unsigned int dc;
2886 GETCHARINC(dc, eptr);
2887 ecode += length;
2888
2889 /* If we have Unicode property support, we can use it to test the other
2890 case of the character, if there is one. */
2891
2892 if (fc != dc)
2893 {
2894 #ifdef SUPPORT_UCP
2895 if (dc != UCD_OTHERCASE(fc))
2896 #endif
2897 MRRETURN(MATCH_NOMATCH);
2898 }
2899 }
2900 }
2901 else
2902 #endif /* SUPPORT_UTF8 */
2903
2904 /* Non-UTF-8 mode */
2905 {
2906 if (md->end_subject - eptr < 1)
2907 {
2908 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2909 MRRETURN(MATCH_NOMATCH);
2910 }
2911 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2912 ecode += 2;
2913 }
2914 break;
2915
2916 /* Match a single character repeatedly. */
2917
2918 case OP_EXACT:
2919 case OP_EXACTI:
2920 min = max = GET2(ecode, 1);
2921 ecode += 3;
2922 goto REPEATCHAR;
2923
2924 case OP_POSUPTO:
2925 case OP_POSUPTOI:
2926 possessive = TRUE;
2927 /* Fall through */
2928
2929 case OP_UPTO:
2930 case OP_UPTOI:
2931 case OP_MINUPTO:
2932 case OP_MINUPTOI:
2933 min = 0;
2934 max = GET2(ecode, 1);
2935 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2936 ecode += 3;
2937 goto REPEATCHAR;
2938
2939 case OP_POSSTAR:
2940 case OP_POSSTARI:
2941 possessive = TRUE;
2942 min = 0;
2943 max = INT_MAX;
2944 ecode++;
2945 goto REPEATCHAR;
2946
2947 case OP_POSPLUS:
2948 case OP_POSPLUSI:
2949 possessive = TRUE;
2950 min = 1;
2951 max = INT_MAX;
2952 ecode++;
2953 goto REPEATCHAR;
2954
2955 case OP_POSQUERY:
2956 case OP_POSQUERYI:
2957 possessive = TRUE;
2958 min = 0;
2959 max = 1;
2960 ecode++;
2961 goto REPEATCHAR;
2962
2963 case OP_STAR:
2964 case OP_STARI:
2965 case OP_MINSTAR:
2966 case OP_MINSTARI:
2967 case OP_PLUS:
2968 case OP_PLUSI:
2969 case OP_MINPLUS:
2970 case OP_MINPLUSI:
2971 case OP_QUERY:
2972 case OP_QUERYI:
2973 case OP_MINQUERY:
2974 case OP_MINQUERYI:
2975 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2976 minimize = (c & 1) != 0;
2977 min = rep_min[c]; /* Pick up values from tables; */
2978 max = rep_max[c]; /* zero for max => infinity */
2979 if (max == 0) max = INT_MAX;
2980
2981 /* Common code for all repeated single-character matches. */
2982
2983 REPEATCHAR:
2984 #ifdef SUPPORT_UTF8
2985 if (utf8)
2986 {
2987 length = 1;
2988 charptr = ecode;
2989 GETCHARLEN(fc, ecode, length);
2990 ecode += length;
2991
2992 /* Handle multibyte character matching specially here. There is
2993 support for caseless matching if UCP support is present. */
2994
2995 if (length > 1)
2996 {
2997 #ifdef SUPPORT_UCP
2998 unsigned int othercase;
2999 if (op >= OP_STARI && /* Caseless */
3000 (othercase = UCD_OTHERCASE(fc)) != fc)
3001 oclength = _pcre_ord2utf8(othercase, occhars);
3002 else oclength = 0;
3003 #endif /* SUPPORT_UCP */
3004
3005 for (i = 1; i <= min; i++)
3006 {
3007 if (eptr <= md->end_subject - length &&
3008 memcmp(eptr, charptr, length) == 0) eptr += length;
3009 #ifdef SUPPORT_UCP
3010 else if (oclength > 0 &&
3011 eptr <= md->end_subject - oclength &&
3012 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3013 #endif /* SUPPORT_UCP */
3014 else
3015 {
3016 CHECK_PARTIAL();
3017 MRRETURN(MATCH_NOMATCH);
3018 }
3019 }
3020
3021 if (min == max) continue;
3022
3023 if (minimize)
3024 {
3025 for (fi = min;; fi++)
3026 {
3027 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3029 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3030 if (eptr <= md->end_subject - length &&
3031 memcmp(eptr, charptr, length) == 0) eptr += length;
3032 #ifdef SUPPORT_UCP
3033 else if (oclength > 0 &&
3034 eptr <= md->end_subject - oclength &&
3035 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3036 #endif /* SUPPORT_UCP */
3037 else
3038 {
3039 CHECK_PARTIAL();
3040 MRRETURN(MATCH_NOMATCH);
3041 }
3042 }
3043 /* Control never gets here */
3044 }
3045
3046 else /* Maximize */
3047 {
3048 pp = eptr;
3049 for (i = min; i < max; i++)
3050 {
3051 if (eptr <= md->end_subject - length &&
3052 memcmp(eptr, charptr, length) == 0) eptr += length;
3053 #ifdef SUPPORT_UCP
3054 else if (oclength > 0 &&
3055 eptr <= md->end_subject - oclength &&
3056 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3057 #endif /* SUPPORT_UCP */
3058 else
3059 {
3060 CHECK_PARTIAL();
3061 break;
3062 }
3063 }
3064
3065 if (possessive) continue;
3066
3067 for(;;)
3068 {
3069 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3071 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3072 #ifdef SUPPORT_UCP
3073 eptr--;
3074 BACKCHAR(eptr);
3075 #else /* without SUPPORT_UCP */
3076 eptr -= length;
3077 #endif /* SUPPORT_UCP */
3078 }
3079 }
3080 /* Control never gets here */
3081 }
3082
3083 /* If the length of a UTF-8 character is 1, we fall through here, and
3084 obey the code as for non-UTF-8 characters below, though in this case the
3085 value of fc will always be < 128. */
3086 }
3087 else
3088 #endif /* SUPPORT_UTF8 */
3089
3090 /* When not in UTF-8 mode, load a single-byte character. */
3091
3092 fc = *ecode++;
3093
3094 /* The value of fc at this point is always less than 256, though we may or
3095 may not be in UTF-8 mode. The code is duplicated for the caseless and
3096 caseful cases, for speed, since matching characters is likely to be quite
3097 common. First, ensure the minimum number of matches are present. If min =
3098 max, continue at the same level without recursing. Otherwise, if
3099 minimizing, keep trying the rest of the expression and advancing one
3100 matching character if failing, up to the maximum. Alternatively, if
3101 maximizing, find the maximum number of characters and work backwards. */
3102
3103 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3104 max, eptr));
3105
3106 if (op >= OP_STARI) /* Caseless */
3107 {
3108 fc = md->lcc[fc];
3109 for (i = 1; i <= min; i++)
3110 {
3111 if (eptr >= md->end_subject)
3112 {
3113 SCHECK_PARTIAL();
3114 MRRETURN(MATCH_NOMATCH);
3115 }
3116 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3117 }
3118 if (min == max) continue;
3119 if (minimize)
3120 {
3121 for (fi = min;; fi++)
3122 {
3123 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3124 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3125 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3126 if (eptr >= md->end_subject)
3127 {
3128 SCHECK_PARTIAL();
3129 MRRETURN(MATCH_NOMATCH);
3130 }
3131 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3132 }
3133 /* Control never gets here */
3134 }
3135 else /* Maximize */
3136 {
3137 pp = eptr;
3138 for (i = min; i < max; i++)
3139 {
3140 if (eptr >= md->end_subject)
3141 {
3142 SCHECK_PARTIAL();
3143 break;
3144 }
3145 if (fc != md->lcc[*eptr]) break;
3146 eptr++;
3147 }
3148
3149 if (possessive) continue;
3150
3151 while (eptr >= pp)
3152 {
3153 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3154 eptr--;
3155 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3156 }
3157 MRRETURN(MATCH_NOMATCH);
3158 }
3159 /* Control never gets here */
3160 }
3161
3162 /* Caseful comparisons (includes all multi-byte characters) */
3163
3164 else
3165 {
3166 for (i = 1; i <= min; i++)
3167 {
3168 if (eptr >= md->end_subject)
3169 {
3170 SCHECK_PARTIAL();
3171 MRRETURN(MATCH_NOMATCH);
3172 }
3173 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3174 }
3175
3176 if (min == max) continue;
3177
3178 if (minimize)
3179 {
3180 for (fi = min;; fi++)
3181 {
3182 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3183 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3184 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3185 if (eptr >= md->end_subject)
3186 {
3187 SCHECK_PARTIAL();
3188 MRRETURN(MATCH_NOMATCH);
3189 }
3190 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3191 }
3192 /* Control never gets here */
3193 }
3194 else /* Maximize */
3195 {
3196 pp = eptr;
3197 for (i = min; i < max; i++)
3198 {
3199 if (eptr >= md->end_subject)
3200 {
3201 SCHECK_PARTIAL();
3202 break;
3203 }
3204 if (fc != *eptr) break;
3205 eptr++;
3206 }
3207 if (possessive) continue;
3208
3209 while (eptr >= pp)
3210 {
3211 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3212 eptr--;
3213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3214 }
3215 MRRETURN(MATCH_NOMATCH);
3216 }
3217 }
3218 /* Control never gets here */
3219
3220 /* Match a negated single one-byte character. Only the pattern character is
3221 restricted to one byte; the subject character being checked can be multibyte. */
3222
3223 case OP_NOT:
3224 case OP_NOTI:
3225 if (eptr >= md->end_subject)
3226 {
3227 SCHECK_PARTIAL();
3228 MRRETURN(MATCH_NOMATCH);
3229 }
3230 ecode++;
3231 GETCHARINCTEST(c, eptr);
3232 if (op == OP_NOTI) /* The caseless case */
3233 {
3234 #ifdef SUPPORT_UTF8
3235 if (c < 256)
3236 #endif
3237 c = md->lcc[c];
3238 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3239 }
3240 else /* Caseful */
3241 {
3242 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3243 }
3244 break;
3245
3246 /* Match a negated single one-byte character repeatedly. This is almost a
3247 repeat of the code for a repeated single character, but I haven't found a
3248 nice way of commoning these up that doesn't require a test of the
3249 positive/negative option for each character match. Maybe that wouldn't add
3250 very much to the time taken, but character matching *is* what this is all
3251 about... */
3252
3253 case OP_NOTEXACT:
3254 case OP_NOTEXACTI:
3255 min = max = GET2(ecode, 1);
3256 ecode += 3;
3257 goto REPEATNOTCHAR;
3258
3259 case OP_NOTUPTO:
3260 case OP_NOTUPTOI:
3261 case OP_NOTMINUPTO:
3262 case OP_NOTMINUPTOI:
3263 min = 0;
3264 max = GET2(ecode, 1);
3265 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3266 ecode += 3;
3267 goto REPEATNOTCHAR;
3268
3269 case OP_NOTPOSSTAR:
3270 case OP_NOTPOSSTARI:
3271 possessive = TRUE;
3272 min = 0;
3273 max = INT_MAX;
3274 ecode++;
3275 goto REPEATNOTCHAR;
3276
3277 case OP_NOTPOSPLUS:
3278 case OP_NOTPOSPLUSI:
3279 possessive = TRUE;
3280 min = 1;
3281 max = INT_MAX;
3282 ecode++;
3283 goto REPEATNOTCHAR;
3284
3285 case OP_NOTPOSQUERY:
3286 case OP_NOTPOSQUERYI:
3287 possessive = TRUE;
3288 min = 0;
3289 max = 1;
3290 ecode++;
3291 goto REPEATNOTCHAR;
3292
3293 case OP_NOTPOSUPTO:
3294 case OP_NOTPOSUPTOI:
3295 possessive = TRUE;
3296 min = 0;
3297 max = GET2(ecode, 1);
3298 ecode += 3;
3299 goto REPEATNOTCHAR;
3300
3301 case OP_NOTSTAR:
3302 case OP_NOTSTARI:
3303 case OP_NOTMINSTAR:
3304 case OP_NOTMINSTARI:
3305 case OP_NOTPLUS:
3306 case OP_NOTPLUSI:
3307 case OP_NOTMINPLUS:
3308 case OP_NOTMINPLUSI:
3309 case OP_NOTQUERY:
3310 case OP_NOTQUERYI:
3311 case OP_NOTMINQUERY:
3312 case OP_NOTMINQUERYI:
3313 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3314 minimize = (c & 1) != 0;
3315 min = rep_min[c]; /* Pick up values from tables; */
3316 max = rep_max[c]; /* zero for max => infinity */
3317 if (max == 0) max = INT_MAX;
3318
3319 /* Common code for all repeated single-byte matches. */
3320
3321 REPEATNOTCHAR:
3322 fc = *ecode++;
3323
3324 /* The code is duplicated for the caseless and caseful cases, for speed,
3325 since matching characters is likely to be quite common. First, ensure the
3326 minimum number of matches are present. If min = max, continue at the same
3327 level without recursing. Otherwise, if minimizing, keep trying the rest of
3328 the expression and advancing one matching character if failing, up to the
3329 maximum. Alternatively, if maximizing, find the maximum number of
3330 characters and work backwards. */
3331
3332 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3333 max, eptr));
3334
3335 if (op >= OP_NOTSTARI) /* Caseless */
3336 {
3337 fc = md->lcc[fc];
3338
3339 #ifdef SUPPORT_UTF8
3340 /* UTF-8 mode */
3341 if (utf8)
3342 {
3343 register unsigned int d;
3344 for (i = 1; i <= min; i++)
3345 {
3346 if (eptr >= md->end_subject)
3347 {
3348 SCHECK_PARTIAL();
3349 MRRETURN(MATCH_NOMATCH);
3350 }
3351 GETCHARINC(d, eptr);
3352 if (d < 256) d = md->lcc[d];
3353 if (fc == d) MRRETURN(MATCH_NOMATCH);
3354 }
3355 }
3356 else
3357 #endif
3358
3359 /* Not UTF-8 mode */
3360 {
3361 for (i = 1; i <= min; i++)
3362 {
3363 if (eptr >= md->end_subject)
3364 {
3365 SCHECK_PARTIAL();
3366 MRRETURN(MATCH_NOMATCH);
3367 }
3368 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3369 }
3370 }
3371
3372 if (min == max) continue;
3373
3374 if (minimize)
3375 {
3376 #ifdef SUPPORT_UTF8
3377 /* UTF-8 mode */
3378 if (utf8)
3379 {
3380 register unsigned int d;
3381 for (fi = min;; fi++)
3382 {
3383 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3384 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3385 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3386 if (eptr >= md->end_subject)
3387 {
3388 SCHECK_PARTIAL();
3389 MRRETURN(MATCH_NOMATCH);
3390 }
3391 GETCHARINC(d, eptr);
3392 if (d < 256) d = md->lcc[d];
3393 if (fc == d) MRRETURN(MATCH_NOMATCH);
3394 }
3395 }
3396 else
3397 #endif
3398 /* Not UTF-8 mode */
3399 {
3400 for (fi = min;; fi++)
3401 {
3402 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3403 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3404 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3405 if (eptr >= md->end_subject)
3406 {
3407 SCHECK_PARTIAL();
3408 MRRETURN(MATCH_NOMATCH);
3409 }
3410 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3411 }
3412 }
3413 /* Control never gets here */
3414 }
3415
3416 /* Maximize case */
3417
3418 else
3419 {
3420 pp = eptr;
3421
3422 #ifdef SUPPORT_UTF8
3423 /* UTF-8 mode */
3424 if (utf8)
3425 {
3426 register unsigned int d;
3427 for (i = min; i < max; i++)
3428 {
3429 int len = 1;
3430 if (eptr >= md->end_subject)
3431 {
3432 SCHECK_PARTIAL();
3433 break;
3434 }
3435 GETCHARLEN(d, eptr, len);
3436 if (d < 256) d = md->lcc[d];
3437 if (fc == d) break;
3438 eptr += len;
3439 }
3440 if (possessive) continue;
3441 for(;;)
3442 {
3443 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445 if (eptr-- == pp) break; /* Stop if tried at original pos */
3446 BACKCHAR(eptr);
3447 }
3448 }
3449 else
3450 #endif
3451 /* Not UTF-8 mode */
3452 {
3453 for (i = min; i < max; i++)
3454 {
3455 if (eptr >= md->end_subject)
3456 {
3457 SCHECK_PARTIAL();
3458 break;
3459 }
3460 if (fc == md->lcc[*eptr]) break;
3461 eptr++;
3462 }
3463 if (possessive) continue;
3464 while (eptr >= pp)
3465 {
3466 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3467 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3468 eptr--;
3469 }
3470 }
3471
3472 MRRETURN(MATCH_NOMATCH);
3473 }
3474 /* Control never gets here */
3475 }
3476
3477 /* Caseful comparisons */
3478
3479 else
3480 {
3481 #ifdef SUPPORT_UTF8
3482 /* UTF-8 mode */
3483 if (utf8)
3484 {
3485 register unsigned int d;
3486 for (i = 1; i <= min; i++)
3487 {
3488 if (eptr >= md->end_subject)
3489 {
3490 SCHECK_PARTIAL();
3491 MRRETURN(MATCH_NOMATCH);
3492 }
3493 GETCHARINC(d, eptr);
3494 if (fc == d) MRRETURN(MATCH_NOMATCH);
3495 }
3496 }
3497 else
3498 #endif
3499 /* Not UTF-8 mode */
3500 {
3501 for (i = 1; i <= min; i++)
3502 {
3503 if (eptr >= md->end_subject)
3504 {
3505 SCHECK_PARTIAL();
3506 MRRETURN(MATCH_NOMATCH);
3507 }
3508 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3509 }
3510 }
3511
3512 if (min == max) continue;
3513
3514 if (minimize)
3515 {
3516 #ifdef SUPPORT_UTF8
3517 /* UTF-8 mode */
3518 if (utf8)
3519 {
3520 register unsigned int d;
3521 for (fi = min;; fi++)
3522 {
3523 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3524 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3525 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3526 if (eptr >= md->end_subject)
3527 {
3528 SCHECK_PARTIAL();
3529 MRRETURN(MATCH_NOMATCH);
3530 }
3531 GETCHARINC(d, eptr);
3532 if (fc == d) MRRETURN(MATCH_NOMATCH);
3533 }
3534 }
3535 else
3536 #endif
3537 /* Not UTF-8 mode */
3538 {
3539 for (fi = min;; fi++)
3540 {
3541 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3542 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3543 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3544 if (eptr >= md->end_subject)
3545 {
3546 SCHECK_PARTIAL();
3547 MRRETURN(MATCH_NOMATCH);
3548 }
3549 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3550 }
3551 }
3552 /* Control never gets here */
3553 }
3554
3555 /* Maximize case */
3556
3557 else
3558 {
3559 pp = eptr;
3560
3561 #ifdef SUPPORT_UTF8
3562 /* UTF-8 mode */
3563 if (utf8)
3564 {
3565 register unsigned int d;
3566 for (i = min; i < max; i++)
3567 {
3568 int len = 1;
3569 if (eptr >= md->end_subject)
3570 {
3571 SCHECK_PARTIAL();
3572 break;
3573 }
3574 GETCHARLEN(d, eptr, len);
3575 if (fc == d) break;
3576 eptr += len;
3577 }
3578 if (possessive) continue;
3579 for(;;)
3580 {
3581 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3582 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3583 if (eptr-- == pp) break; /* Stop if tried at original pos */
3584 BACKCHAR(eptr);
3585 }
3586 }
3587 else
3588 #endif
3589 /* Not UTF-8 mode */
3590 {
3591 for (i = min; i < max; i++)
3592 {
3593 if (eptr >= md->end_subject)
3594 {
3595 SCHECK_PARTIAL();
3596 break;
3597 }
3598 if (fc == *eptr) break;
3599 eptr++;
3600 }
3601 if (possessive) continue;
3602 while (eptr >= pp)
3603 {
3604 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3605 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3606 eptr--;
3607 }
3608 }
3609
3610 MRRETURN(MATCH_NOMATCH);
3611 }
3612 }
3613 /* Control never gets here */
3614
3615 /* Match a single character type repeatedly; several different opcodes
3616 share code. This is very similar to the code for single characters, but we
3617 repeat it in the interests of efficiency. */
3618
3619 case OP_TYPEEXACT:
3620 min = max = GET2(ecode, 1);
3621 minimize = TRUE;
3622 ecode += 3;
3623 goto REPEATTYPE;
3624
3625 case OP_TYPEUPTO:
3626 case OP_TYPEMINUPTO:
3627 min = 0;
3628 max = GET2(ecode, 1);
3629 minimize = *ecode == OP_TYPEMINUPTO;
3630 ecode += 3;
3631 goto REPEATTYPE;
3632
3633 case OP_TYPEPOSSTAR:
3634 possessive = TRUE;
3635 min = 0;
3636 max = INT_MAX;
3637 ecode++;
3638 goto REPEATTYPE;
3639
3640 case OP_TYPEPOSPLUS:
3641 possessive = TRUE;
3642 min = 1;
3643 max = INT_MAX;
3644 ecode++;
3645 goto REPEATTYPE;
3646
3647 case OP_TYPEPOSQUERY:
3648 possessive = TRUE;
3649 min = 0;
3650 max = 1;
3651 ecode++;
3652 goto REPEATTYPE;
3653
3654 case OP_TYPEPOSUPTO:
3655 possessive = TRUE;
3656 min = 0;
3657 max = GET2(ecode, 1);
3658 ecode += 3;
3659 goto REPEATTYPE;
3660
3661 case OP_TYPESTAR:
3662 case OP_TYPEMINSTAR:
3663 case OP_TYPEPLUS:
3664 case OP_TYPEMINPLUS:
3665 case OP_TYPEQUERY:
3666 case OP_TYPEMINQUERY:
3667 c = *ecode++ - OP_TYPESTAR;
3668 minimize = (c & 1) != 0;
3669 min = rep_min[c]; /* Pick up values from tables; */
3670 max = rep_max[c]; /* zero for max => infinity */
3671 if (max == 0) max = INT_MAX;
3672
3673 /* Common code for all repeated single character type matches. Note that in
3674 UTF-8 mode, '.', the negated types, OP_ANYNL, OP_HSPACE, and OP_VSPACE can match
3675 multibyte characters; OP_DIGIT, OP_WHITESPACE, and OP_WORDCHAR match one byte only. */
3676
3677 REPEATTYPE:
3678 ctype = *ecode++; /* Code for the character type */
3679
3680 #ifdef SUPPORT_UCP
3681 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3682 {
3683 prop_fail_result = ctype == OP_NOTPROP;
3684 prop_type = *ecode++;
3685 prop_value = *ecode++;
3686 }
3687 else prop_type = -1;
3688 #endif
3689
3690 /* First, ensure the minimum number of matches are present. Use inline
3691 code for maximizing the speed, and do the type test once at the start
3692 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3693 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3694 and single-bytes. */
3695
3696 if (min > 0)
3697 {
3698 #ifdef SUPPORT_UCP
3699 if (prop_type >= 0)
3700 {
3701 switch(prop_type)
3702 {
3703 case PT_ANY:
3704 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3705 for (i = 1; i <= min; i++)
3706 {
3707 if (eptr >= md->end_subject)
3708 {
3709 SCHECK_PARTIAL();
3710 MRRETURN(MATCH_NOMATCH);
3711 }
3712 GETCHARINCTEST(c, eptr);
3713 }
3714 break;
3715
3716 case PT_LAMP:
3717 for (i = 1; i <= min; i++)
3718 {
3719 int chartype;
3720 if (eptr >= md->end_subject)
3721 {
3722 SCHECK_PARTIAL();
3723 MRRETURN(MATCH_NOMATCH);
3724 }
3725 GETCHARINCTEST(c, eptr);
3726 chartype = UCD_CHARTYPE(c);
3727 if ((chartype == ucp_Lu ||
3728 chartype == ucp_Ll ||
3729 chartype == ucp_Lt) == prop_fail_result)
3730 MRRETURN(MATCH_NOMATCH);
3731 }
3732 break;
3733
3734 case PT_GC:
3735 for (i = 1; i <= min; i++)
3736 {
3737 if (eptr >= md->end_subject)
3738 {
3739 SCHECK_PARTIAL();
3740 MRRETURN(MATCH_NOMATCH);
3741 }
3742 GETCHARINCTEST(c, eptr);
3743 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3744 MRRETURN(MATCH_NOMATCH);
3745 }
3746 break;
3747
3748 case PT_PC:
3749 for (i = 1; i <= min; i++)
3750 {
3751 if (eptr >= md->end_subject)
3752 {
3753 SCHECK_PARTIAL();
3754 MRRETURN(MATCH_NOMATCH);
3755 }
3756 GETCHARINCTEST(c, eptr);
3757 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3758 MRRETURN(MATCH_NOMATCH);
3759 }
3760 break;
3761
3762 case PT_SC:
3763 for (i = 1; i <= min; i++)
3764 {
3765 if (eptr >= md->end_subject)
3766 {
3767 SCHECK_PARTIAL();
3768 MRRETURN(MATCH_NOMATCH);
3769 }
3770 GETCHARINCTEST(c, eptr);
3771 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3772 MRRETURN(MATCH_NOMATCH);
3773 }
3774 break;
3775
3776 case PT_ALNUM:
3777 for (i = 1; i <= min; i++)
3778 {
3779 int category;
3780 if (eptr >= md->end_subject)
3781 {
3782 SCHECK_PARTIAL();
3783 MRRETURN(MATCH_NOMATCH);
3784 }
3785 GETCHARINCTEST(c, eptr);
3786 category = UCD_CATEGORY(c);
3787 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3788 MRRETURN(MATCH_NOMATCH);
3789 }
3790 break;
3791
3792 case PT_SPACE: /* Perl space */
3793 for (i = 1; i <= min; i++)
3794 {
3795 if (eptr >= md->end_subject)
3796 {
3797 SCHECK_PARTIAL();
3798 MRRETURN(MATCH_NOMATCH);
3799 }
3800 GETCHARINCTEST(c, eptr);
3801 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3802 c == CHAR_FF || c == CHAR_CR)
3803 == prop_fail_result)
3804 MRRETURN(MATCH_NOMATCH);
3805 }
3806 break;
3807
3808 case PT_PXSPACE: /* POSIX space */
3809 for (i = 1; i <= min; i++)
3810 {
3811 if (eptr >= md->end_subject)
3812 {
3813 SCHECK_PARTIAL();
3814 MRRETURN(MATCH_NOMATCH);
3815 }
3816 GETCHARINCTEST(c, eptr);
3817 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3818 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3819 == prop_fail_result)
3820 MRRETURN(MATCH_NOMATCH);
3821 }
3822 break;
3823
3824 case PT_WORD:
3825 for (i = 1; i <= min; i++)
3826 {
3827 int category;
3828 if (eptr >= md->end_subject)
3829 {
3830 SCHECK_PARTIAL();
3831 MRRETURN(MATCH_NOMATCH);
3832 }
3833 GETCHARINCTEST(c, eptr);
3834 category = UCD_CATEGORY(c);
3835 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3836 == prop_fail_result)
3837 MRRETURN(MATCH_NOMATCH);
3838 }
3839 break;
3840
3841 /* This should not occur */
3842
3843 default:
3844 RRETURN(PCRE_ERROR_INTERNAL);
3845 }
3846 }
3847
3848 /* Match extended Unicode sequences. We will get here only if the
3849 support is in the binary; otherwise a compile-time error occurs. */
3850
3851 else if (ctype == OP_EXTUNI)
3852 {
3853 for (i = 1; i <= min; i++)
3854 {
3855 if (eptr >= md->end_subject)
3856 {
3857 SCHECK_PARTIAL();
3858 MRRETURN(MATCH_NOMATCH);
3859 }
3860 GETCHARINCTEST(c, eptr);
3861 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3862 while (eptr < md->end_subject)
3863 {
3864 int len = 1;
3865 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3866 if (UCD_CATEGORY(c) != ucp_M) break;
3867 eptr += len;
3868 }
3869 }
3870 }
3871
3872 else
3873 #endif /* SUPPORT_UCP */
3874
3875 /* Handle all other cases when the coding is UTF-8 */
3876
3877 #ifdef SUPPORT_UTF8
3878 if (utf8) switch(ctype)
3879 {
3880 case OP_ANY:
3881 for (i = 1; i <= min; i++)
3882 {
3883 if (eptr >= md->end_subject)
3884 {
3885 SCHECK_PARTIAL();
3886 MRRETURN(MATCH_NOMATCH);
3887 }
3888 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3889 eptr++;
3890 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3891 }
3892 break;
3893
3894 case OP_ALLANY:
3895 for (i = 1; i <= min; i++)
3896 {
3897 if (eptr >= md->end_subject)
3898 {
3899 SCHECK_PARTIAL();
3900 MRRETURN(MATCH_NOMATCH);
3901 }
3902 eptr++;
3903 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3904 }
3905 break;
3906
3907 case OP_ANYBYTE:
3908 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3909 eptr += min;
3910 break;
3911
3912 case OP_ANYNL:
3913 for (i = 1; i <= min; i++)
3914 {
3915 if (eptr >= md->end_subject)
3916 {
3917 SCHECK_PARTIAL();
3918 MRRETURN(MATCH_NOMATCH);
3919 }
3920 GETCHARINC(c, eptr);
3921 switch(c)
3922 {
3923 default: MRRETURN(MATCH_NOMATCH);
3924
3925 case 0x000d:
3926 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3927 break;
3928
3929 case 0x000a:
3930 break;
3931
3932 case 0x000b:
3933 case 0x000c:
3934 case 0x0085:
3935 case 0x2028:
3936 case 0x2029:
3937 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3938 break;
3939 }
3940 }
3941 break;
3942
3943 case OP_NOT_HSPACE:
3944 for (i = 1; i <= min; i++)
3945 {
3946 if (eptr >= md->end_subject)
3947 {
3948 SCHECK_PARTIAL();
3949 MRRETURN(MATCH_NOMATCH);
3950 }
3951 GETCHARINC(c, eptr);
3952 switch(c)
3953 {
3954 default: break;
3955 case 0x09: /* HT */
3956 case 0x20: /* SPACE */
3957 case 0xa0: /* NBSP */
3958 case 0x1680: /* OGHAM SPACE MARK */
3959 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3960 case 0x2000: /* EN QUAD */
3961 case 0x2001: /* EM QUAD */
3962 case 0x2002: /* EN SPACE */
3963 case 0x2003: /* EM SPACE */
3964 case 0x2004: /* THREE-PER-EM SPACE */
3965 case 0x2005: /* FOUR-PER-EM SPACE */
3966 case 0x2006: /* SIX-PER-EM SPACE */
3967 case 0x2007: /* FIGURE SPACE */
3968 case 0x2008: /* PUNCTUATION SPACE */
3969 case 0x2009: /* THIN SPACE */
3970 case 0x200A: /* HAIR SPACE */
3971 case 0x202f: /* NARROW NO-BREAK SPACE */
3972 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3973 case 0x3000: /* IDEOGRAPHIC SPACE */
3974 MRRETURN(MATCH_NOMATCH);
3975 }
3976 }
3977 break;
3978
3979 case OP_HSPACE:
3980 for (i = 1; i <= min; i++)
3981 {
3982 if (eptr >= md->end_subject)
3983 {
3984 SCHECK_PARTIAL();
3985 MRRETURN(MATCH_NOMATCH);
3986 }
3987 GETCHARINC(c, eptr);
3988 switch(c)
3989 {
3990 default: MRRETURN(MATCH_NOMATCH);
3991 case 0x09: /* HT */
3992 case 0x20: /* SPACE */
3993 case 0xa0: /* NBSP */
3994 case 0x1680: /* OGHAM SPACE MARK */
3995 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3996 case 0x2000: /* EN QUAD */
3997 case 0x2001: /* EM QUAD */
3998 case 0x2002: /* EN SPACE */
3999 case 0x2003: /* EM SPACE */
4000 case 0x2004: /* THREE-PER-EM SPACE */
4001 case 0x2005: /* FOUR-PER-EM SPACE */
4002 case 0x2006: /* SIX-PER-EM SPACE */
4003 case 0x2007: /* FIGURE SPACE */
4004 case 0x2008: /* PUNCTUATION SPACE */
4005 case 0x2009: /* THIN SPACE */
4006 case 0x200A: /* HAIR SPACE */
4007 case 0x202f: /* NARROW NO-BREAK SPACE */
4008 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4009 case 0x3000: /* IDEOGRAPHIC SPACE */
4010 break;
4011 }
4012 }
4013 break;
4014
4015 case OP_NOT_VSPACE:
4016 for (i = 1; i <= min; i++)
4017 {
4018 if (eptr >= md->end_subject)
4019 {
4020 SCHECK_PARTIAL();
4021 MRRETURN(MATCH_NOMATCH);
4022 }
4023 GETCHARINC(c, eptr);
4024 switch(c)
4025 {
4026 default: break;
4027 case 0x0a: /* LF */
4028 case 0x0b: /* VT */
4029 case 0x0c: /* FF */
4030 case 0x0d: /* CR */
4031 case 0x85: /* NEL */
4032 case 0x2028: /* LINE SEPARATOR */
4033 case 0x2029: /* PARAGRAPH SEPARATOR */
4034 MRRETURN(MATCH_NOMATCH);
4035 }
4036 }
4037 break;
4038
4039 case OP_VSPACE:
4040 for (i = 1; i <= min; i++)
4041 {
4042 if (eptr >= md->end_subject)
4043 {
4044 SCHECK_PARTIAL();
4045 MRRETURN(MATCH_NOMATCH);
4046 }
4047 GETCHARINC(c, eptr);
4048 switch(c)
4049 {
4050 default: MRRETURN(MATCH_NOMATCH);
4051 case 0x0a: /* LF */
4052 case 0x0b: /* VT */
4053 case 0x0c: /* FF */
4054 case 0x0d: /* CR */
4055 case 0x85: /* NEL */
4056 case 0x2028: /* LINE SEPARATOR */
4057 case 0x2029: /* PARAGRAPH SEPARATOR */
4058 break;
4059 }
4060 }
4061 break;
4062
4063 case OP_NOT_DIGIT:
4064 for (i = 1; i <= min; i++)
4065 {
4066 if (eptr >= md->end_subject)
4067 {
4068 SCHECK_PARTIAL();
4069 MRRETURN(MATCH_NOMATCH);
4070 }
4071 GETCHARINC(c, eptr);
4072 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4073 MRRETURN(MATCH_NOMATCH);
4074 }
4075 break;
4076
4077 case OP_DIGIT:
4078 for (i = 1; i <= min; i++)
4079 {
4080 if (eptr >= md->end_subject)
4081 {
4082 SCHECK_PARTIAL();
4083 MRRETURN(MATCH_NOMATCH);
4084 }
4085 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4086 MRRETURN(MATCH_NOMATCH);
4087 /* No need to skip more bytes - we know it's a 1-byte character */
4088 }
4089 break;
4090
4091 case OP_NOT_WHITESPACE:
4092 for (i = 1; i <= min; i++)
4093 {
4094 if (eptr >= md->end_subject)
4095 {
4096 SCHECK_PARTIAL();
4097 MRRETURN(MATCH_NOMATCH);
4098 }
4099 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4100 MRRETURN(MATCH_NOMATCH);
4101 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4102 }
4103 break;
4104
4105 case OP_WHITESPACE:
4106 for (i = 1; i <= min; i++)
4107 {
4108 if (eptr >= md->end_subject)
4109 {
4110 SCHECK_PARTIAL();
4111 MRRETURN(MATCH_NOMATCH);
4112 }
4113 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4114 MRRETURN(MATCH_NOMATCH);
4115 /* No need to skip more bytes - we know it's a 1-byte character */
4116 }
4117 break;
4118
4119 case OP_NOT_WORDCHAR:
4120 for (i = 1; i <= min; i++)
4121 {
4122 if (eptr >= md->end_subject)
4123 {
4124 SCHECK_PARTIAL();
4125 MRRETURN(MATCH_NOMATCH);
4126 }
4127 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4128 MRRETURN(MATCH_NOMATCH);
4129 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4130 }
4131 break;
4132
4133 case OP_WORDCHAR:
4134 for (i = 1; i <= min; i++)
4135 {
4136 if (eptr >= md->end_subject)
4137 {
4138 SCHECK_PARTIAL();
4139 MRRETURN(MATCH_NOMATCH);
4140 }
4141 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4142 MRRETURN(MATCH_NOMATCH);
4143 /* No need to skip more bytes - we know it's a 1-byte character */
4144 }
4145 break;
4146
4147 default:
4148 RRETURN(PCRE_ERROR_INTERNAL);
4149 } /* End switch(ctype) */
4150
4151 else
4152 #endif /* SUPPORT_UTF8 */
4153
4154 /* Code for the non-UTF-8 case for minimum matching of operators other
4155 than OP_PROP and OP_NOTPROP. */
4156
4157 switch(ctype)
4158 {
4159 case OP_ANY:
4160 for (i = 1; i <= min; i++)
4161 {
4162 if (eptr >= md->end_subject)
4163 {
4164 SCHECK_PARTIAL();
4165 MRRETURN(MATCH_NOMATCH);
4166 }
4167 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4168 eptr++;
4169 }
4170 break;
4171
4172 case OP_ALLANY:
4173 if (eptr > md->end_subject - min)
4174 {
4175 SCHECK_PARTIAL();
4176 MRRETURN(MATCH_NOMATCH);
4177 }
4178 eptr += min;
4179 break;
4180
4181 case OP_ANYBYTE:
4182 if (eptr > md->end_subject - min)
4183 {
4184 SCHECK_PARTIAL();
4185 MRRETURN(MATCH_NOMATCH);
4186 }
4187 eptr += min;
4188 break;
4189
4190 case OP_ANYNL:
4191 for (i = 1; i <= min; i++)
4192 {
4193 if (eptr >= md->end_subject)
4194 {
4195 SCHECK_PARTIAL();
4196 MRRETURN(MATCH_NOMATCH);
4197 }
4198 switch(*eptr++)
4199 {
4200 default: MRRETURN(MATCH_NOMATCH);
4201
4202 case 0x000d:
4203 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4204 break;
4205
4206 case 0x000a:
4207 break;
4208
4209 case 0x000b:
4210 case 0x000c:
4211 case 0x0085:
4212 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4213 break;
4214 }
4215 }
4216 break;
4217
4218 case OP_NOT_HSPACE:
4219 for (i = 1; i <= min; i++)
4220 {
4221 if (eptr >= md->end_subject)
4222 {
4223 SCHECK_PARTIAL();
4224 MRRETURN(MATCH_NOMATCH);
4225 }
4226 switch(*eptr++)
4227 {
4228 default: break;
4229 case 0x09: /* HT */
4230 case 0x20: /* SPACE */
4231 case 0xa0: /* NBSP */
4232 MRRETURN(MATCH_NOMATCH);
4233 }
4234 }
4235 break;
4236
4237 case OP_HSPACE:
4238 for (i = 1; i <= min; i++)
4239 {
4240 if (eptr >= md->end_subject)
4241 {
4242 SCHECK_PARTIAL();
4243 MRRETURN(MATCH_NOMATCH);
4244 }
4245 switch(*eptr++)
4246 {
4247 default: MRRETURN(MATCH_NOMATCH);
4248 case 0x09: /* HT */
4249 case 0x20: /* SPACE */
4250 case 0xa0: /* NBSP */
4251 break;
4252 }
4253 }
4254 break;
4255
4256 case OP_NOT_VSPACE:
4257 for (i = 1; i <= min; i++)
4258 {
4259 if (eptr >= md->end_subject)
4260 {
4261 SCHECK_PARTIAL();
4262 MRRETURN(MATCH_NOMATCH);
4263 }
4264 switch(*eptr++)
4265 {
4266 default: break;
4267 case 0x0a: /* LF */
4268 case 0x0b: /* VT */
4269 case 0x0c: /* FF */
4270 case 0x0d: /* CR */
4271 case 0x85: /* NEL */
4272 MRRETURN(MATCH_NOMATCH);
4273 }
4274 }
4275 break;
4276
4277 case OP_VSPACE:
4278 for (i = 1; i <= min; i++)
4279 {
4280 if (eptr >= md->end_subject)
4281 {
4282 SCHECK_PARTIAL();
4283 MRRETURN(MATCH_NOMATCH);
4284 }
4285 switch(*eptr++)
4286 {
4287 default: MRRETURN(MATCH_NOMATCH);
4288 case 0x0a: /* LF */
4289 case 0x0b: /* VT */
4290 case 0x0c: /* FF */
4291 case 0x0d: /* CR */
4292 case 0x85: /* NEL */
4293 break;
4294 }
4295 }
4296 break;
4297
4298 case OP_NOT_DIGIT:
4299 for (i = 1; i <= min; i++)
4300 {
4301 if (eptr >= md->end_subject)
4302 {
4303 SCHECK_PARTIAL();
4304 MRRETURN(MATCH_NOMATCH);
4305 }
4306 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4307 }
4308 break;
4309
4310 case OP_DIGIT:
4311 for (i = 1; i <= min; i++)
4312 {
4313 if (eptr >= md->end_subject)
4314 {
4315 SCHECK_PARTIAL();
4316 MRRETURN(MATCH_NOMATCH);
4317 }
4318 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4319 }
4320 break;
4321
4322 case OP_NOT_WHITESPACE:
4323 for (i = 1; i <= min; i++)
4324 {
4325 if (eptr >= md->end_subject)
4326 {
4327 SCHECK_PARTIAL();
4328 MRRETURN(MATCH_NOMATCH);
4329 }
4330 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4331 }
4332 break;
4333
4334 case OP_WHITESPACE:
4335 for (i = 1; i <= min; i++)
4336 {
4337 if (eptr >= md->end_subject)
4338 {
4339 SCHECK_PARTIAL();
4340 MRRETURN(MATCH_NOMATCH);
4341 }
4342 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4343 }
4344 break;
4345
4346 case OP_NOT_WORDCHAR:
4347 for (i = 1; i <= min; i++)
4348 {
4349 if (eptr >= md->end_subject)
4350 {
4351 SCHECK_PARTIAL();
4352 MRRETURN(MATCH_NOMATCH);
4353 }
4354 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4355 MRRETURN(MATCH_NOMATCH);
4356 }
4357 break;
4358
4359 case OP_WORDCHAR:
4360 for (i = 1; i <= min; i++)
4361 {
4362 if (eptr >= md->end_subject)
4363 {
4364 SCHECK_PARTIAL();
4365 MRRETURN(MATCH_NOMATCH);
4366 }
4367 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4368 MRRETURN(MATCH_NOMATCH);
4369 }
4370 break;
4371
4372 default:
4373 RRETURN(PCRE_ERROR_INTERNAL);
4374 }
4375 }
4376
4377 /* If min = max, continue at the same level without recursing */
4378
4379 if (min == max) continue;
4380
4381 /* If minimizing, we have to test the rest of the pattern before each
4382 subsequent match. Again, separate the UTF-8 case for speed, and also
4383 separate the UCP cases. */
4384
4385 if (minimize)
4386 {
4387 #ifdef SUPPORT_UCP
4388 if (prop_type >= 0)
4389 {
4390 switch(prop_type)
4391 {
4392 case PT_ANY:
4393 for (fi = min;; fi++)
4394 {
4395 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4396 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4397 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4398 if (eptr >= md->end_subject)
4399 {
4400 SCHECK_PARTIAL();
4401 MRRETURN(MATCH_NOMATCH);
4402 }
4403 GETCHARINCTEST(c, eptr);
4404 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4405 }
4406 /* Control never gets here */
4407
4408 case PT_LAMP:
4409 for (fi = min;; fi++)
4410 {
4411 int chartype;
4412 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4413 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4414 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4415 if (eptr >= md->end_subject)
4416 {
4417 SCHECK_PARTIAL();
4418 MRRETURN(MATCH_NOMATCH);
4419 }
4420 GETCHARINCTEST(c, eptr);
4421 chartype = UCD_CHARTYPE(c);
4422 if ((chartype == ucp_Lu ||
4423 chartype == ucp_Ll ||
4424 chartype == ucp_Lt) == prop_fail_result)
4425 MRRETURN(MATCH_NOMATCH);
4426 }
4427 /* Control never gets here */
4428
4429 case PT_GC:
4430 for (fi = min;; fi++)
4431 {
4432 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4433 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4434 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4435 if (eptr >= md->end_subject)
4436 {
4437 SCHECK_PARTIAL();
4438 MRRETURN(MATCH_NOMATCH);
4439 }
4440 GETCHARINCTEST(c, eptr);
4441 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4442 MRRETURN(MATCH_NOMATCH);
4443 }
4444 /* Control never gets here */
4445
4446 case PT_PC:
4447 for (fi = min;; fi++)
4448 {
4449 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4450 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4451 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4452 if (eptr >= md->end_subject)
4453 {
4454 SCHECK_PARTIAL();
4455 MRRETURN(MATCH_NOMATCH);
4456 }
4457 GETCHARINCTEST(c, eptr);
4458 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4459 MRRETURN(MATCH_NOMATCH);
4460 }
4461 /* Control never gets here */
4462
4463 case PT_SC:
4464 for (fi = min;; fi++)
4465 {
4466 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4467 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4468 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4469 if (eptr >= md->end_subject)
4470 {
4471 SCHECK_PARTIAL();
4472 MRRETURN(MATCH_NOMATCH);
4473 }
4474 GETCHARINCTEST(c, eptr);
4475 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4476 MRRETURN(MATCH_NOMATCH);
4477 }
4478 /* Control never gets here */
4479
4480 case PT_ALNUM:
4481 for (fi = min;; fi++)
4482 {
4483 int category;
4484 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4485 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4486 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 MRRETURN(MATCH_NOMATCH);
4491 }
4492 GETCHARINCTEST(c, eptr);
4493 category = UCD_CATEGORY(c);
4494 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4495 MRRETURN(MATCH_NOMATCH);
4496 }
4497 /* Control never gets here */
4498
4499 case PT_SPACE: /* Perl space */
4500 for (fi = min;; fi++)
4501 {
4502 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4503 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4504 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4505 if (eptr >= md->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 MRRETURN(MATCH_NOMATCH);
4509 }
4510 GETCHARINCTEST(c, eptr);
4511 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4512 c == CHAR_FF || c == CHAR_CR)
4513 == prop_fail_result)
4514 MRRETURN(MATCH_NOMATCH);
4515 }
4516 /* Control never gets here */
4517
4518 case PT_PXSPACE: /* POSIX space */
4519 for (fi = min;; fi++)
4520 {
4521 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4522 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4523 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4524 if (eptr >= md->end_subject)
4525 {
4526 SCHECK_PARTIAL();
4527 MRRETURN(MATCH_NOMATCH);
4528 }
4529 GETCHARINCTEST(c, eptr);
4530 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4531 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4532 == prop_fail_result)
4533 MRRETURN(MATCH_NOMATCH);
4534 }
4535 /* Control never gets here */
4536
4537 case PT_WORD:
4538 for (fi = min;; fi++)
4539 {
4540 int category;
4541 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4542 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4543 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4544 if (eptr >= md->end_subject)
4545 {
4546 SCHECK_PARTIAL();
4547 MRRETURN(MATCH_NOMATCH);
4548 }
4549 GETCHARINCTEST(c, eptr);
4550 category = UCD_CATEGORY(c);
4551 if ((category == ucp_L ||
4552 category == ucp_N ||
4553 c == CHAR_UNDERSCORE)
4554 == prop_fail_result)
4555 MRRETURN(MATCH_NOMATCH);
4556 }
4557 /* Control never gets here */
4558
4559 /* This should never occur */
4560
4561 default:
4562 RRETURN(PCRE_ERROR_INTERNAL);
4563 }
4564 }
4565
4566 /* Match extended Unicode sequences. We will get here only if the
4567 support is in the binary; otherwise a compile-time error occurs. */
4568
4569 else if (ctype == OP_EXTUNI)
4570 {
4571 for (fi = min;; fi++)
4572 {
4573 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4574 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4575 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4576 if (eptr >= md->end_subject)
4577 {
4578 SCHECK_PARTIAL();
4579 MRRETURN(MATCH_NOMATCH);
4580 }
4581 GETCHARINCTEST(c, eptr);
4582 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4583 while (eptr < md->end_subject)
4584 {
4585 int len = 1;
4586 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4587 if (UCD_CATEGORY(c) != ucp_M) break;
4588 eptr += len;
4589 }
4590 }
4591 }
4592 else
4593 #endif /* SUPPORT_UCP */
4594
4595 #ifdef SUPPORT_UTF8
4596 /* UTF-8 mode */
4597 if (utf8)
4598 {
4599 for (fi = min;; fi++)
4600 {
4601 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4603 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4604 if (eptr >= md->end_subject)
4605 {
4606 SCHECK_PARTIAL();
4607 MRRETURN(MATCH_NOMATCH);
4608 }
4609 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4610 MRRETURN(MATCH_NOMATCH);
4611 GETCHARINC(c, eptr);
4612 switch(ctype)
4613 {
4614 case OP_ANY: /* This is the non-NL case */
4615 case OP_ALLANY:
4616 case OP_ANYBYTE:
4617 break;
4618
4619 case OP_ANYNL:
4620 switch(c)
4621 {
4622 default: MRRETURN(MATCH_NOMATCH);
4623 case 0x000d:
4624 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4625 break;
4626 case 0x000a:
4627 break;
4628
4629 case 0x000b:
4630 case 0x000c:
4631 case 0x0085:
4632 case 0x2028:
4633 case 0x2029:
4634 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4635 break;
4636 }
4637 break;
4638
4639 case OP_NOT_HSPACE:
4640 switch(c)
4641 {
4642 default: break;
4643 case 0x09: /* HT */
4644 case 0x20: /* SPACE */
4645 case 0xa0: /* NBSP */
4646 case 0x1680: /* OGHAM SPACE MARK */
4647 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4648 case 0x2000: /* EN QUAD */
4649 case 0x2001: /* EM QUAD */
4650 case 0x2002: /* EN SPACE */
4651 case 0x2003: /* EM SPACE */
4652 case 0x2004: /* THREE-PER-EM SPACE */
4653 case 0x2005: /* FOUR-PER-EM SPACE */
4654 case 0x2006: /* SIX-PER-EM SPACE */
4655 case 0x2007: /* FIGURE SPACE */
4656 case 0x2008: /* PUNCTUATION SPACE */
4657 case 0x2009: /* THIN SPACE */
4658 case 0x200A: /* HAIR SPACE */
4659 case 0x202f: /* NARROW NO-BREAK SPACE */
4660 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4661 case 0x3000: /* IDEOGRAPHIC SPACE */
4662 MRRETURN(MATCH_NOMATCH);
4663 }
4664 break;
4665
4666 case OP_HSPACE:
4667 switch(c)
4668 {
4669 default: MRRETURN(MATCH_NOMATCH);
4670 case 0x09: /* HT */
4671 case 0x20: /* SPACE */
4672 case 0xa0: /* NBSP */
4673 case 0x1680: /* OGHAM SPACE MARK */
4674 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4675 case 0x2000: /* EN QUAD */
4676 case 0x2001: /* EM QUAD */
4677 case 0x2002: /* EN SPACE */
4678 case 0x2003: /* EM SPACE */
4679 case 0x2004: /* THREE-PER-EM SPACE */
4680 case 0x2005: /* FOUR-PER-EM SPACE */
4681 case 0x2006: /* SIX-PER-EM SPACE */
4682 case 0x2007: /* FIGURE SPACE */
4683 case 0x2008: /* PUNCTUATION SPACE */
4684 case 0x2009: /* THIN SPACE */
4685 case 0x200A: /* HAIR SPACE */
4686 case 0x202f: /* NARROW NO-BREAK SPACE */
4687 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4688 case 0x3000: /* IDEOGRAPHIC SPACE */
4689 break;
4690 }
4691 break;
4692
4693 case OP_NOT_VSPACE:
4694 switch(c)
4695 {
4696 default: break;
4697 case 0x0a: /* LF */
4698 case 0x0b: /* VT */
4699 case 0x0c: /* FF */
4700 case 0x0d: /* CR */
4701 case 0x85: /* NEL */
4702 case 0x2028: /* LINE SEPARATOR */
4703 case 0x2029: /* PARAGRAPH SEPARATOR */
4704 MRRETURN(MATCH_NOMATCH);
4705 }
4706 break;
4707
4708 case OP_VSPACE:
4709 switch(c)
4710 {
4711 default: MRRETURN(MATCH_NOMATCH);
4712 case 0x0a: /* LF */
4713 case 0x0b: /* VT */
4714 case 0x0c: /* FF */
4715 case 0x0d: /* CR */
4716 case 0x85: /* NEL */
4717 case 0x2028: /* LINE SEPARATOR */
4718 case 0x2029: /* PARAGRAPH SEPARATOR */
4719 break;
4720 }
4721 break;
4722
4723 case OP_NOT_DIGIT:
4724 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4725 MRRETURN(MATCH_NOMATCH);
4726 break;
4727
4728 case OP_DIGIT:
4729 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4730 MRRETURN(MATCH_NOMATCH);
4731 break;
4732
4733 case OP_NOT_WHITESPACE:
4734 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4735 MRRETURN(MATCH_NOMATCH);
4736 break;
4737
4738 case OP_WHITESPACE:
4739 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4740 MRRETURN(MATCH_NOMATCH);
4741 break;
4742
4743 case OP_NOT_WORDCHAR:
4744 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4745 MRRETURN(MATCH_NOMATCH);
4746 break;
4747
4748 case OP_WORDCHAR:
4749 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4750 MRRETURN(MATCH_NOMATCH);
4751 break;
4752
4753 default:
4754 RRETURN(PCRE_ERROR_INTERNAL);
4755 }
4756 }
4757 }
4758 else
4759 #endif
4760 /* Not UTF-8 mode */
4761 {
4762 for (fi = min;; fi++)
4763 {
4764 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4765 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4766 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4767 if (eptr >= md->end_subject)
4768 {
4769 SCHECK_PARTIAL();
4770 MRRETURN(MATCH_NOMATCH);
4771 }
4772 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4773 MRRETURN(MATCH_NOMATCH);
4774 c = *eptr++;
4775 switch(ctype)
4776 {
4777 case OP_ANY: /* This is the non-NL case */
4778 case OP_ALLANY:
4779 case OP_ANYBYTE:
4780 break;
4781
4782 case OP_ANYNL:
4783 switch(c)
4784 {
4785 default: MRRETURN(MATCH_NOMATCH);
4786 case 0x000d:
4787 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4788 break;
4789
4790 case 0x000a:
4791 break;
4792
4793 case 0x000b:
4794 case 0x000c:
4795 case 0x0085:
4796 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4797 break;
4798 }
4799 break;
4800
4801 case OP_NOT_HSPACE:
4802 switch(c)
4803 {
4804 default: break;
4805 case 0x09: /* HT */
4806 case 0x20: /* SPACE */
4807 case 0xa0: /* NBSP */
4808 MRRETURN(MATCH_NOMATCH);
4809 }
4810 break;
4811
4812 case OP_HSPACE:
4813 switch(c)
4814 {
4815 default: MRRETURN(MATCH_NOMATCH);
4816 case 0x09: /* HT */
4817 case 0x20: /* SPACE */
4818 case 0xa0: /* NBSP */
4819 break;
4820 }
4821 break;
4822
4823 case OP_NOT_VSPACE:
4824 switch(c)
4825 {
4826 default: break;
4827 case 0x0a: /* LF */
4828 case 0x0b: /* VT */
4829 case 0x0c: /* FF */
4830 case 0x0d: /* CR */
4831 case 0x85: /* NEL */
4832 MRRETURN(MATCH_NOMATCH);
4833 }
4834 break;
4835
4836 case OP_VSPACE:
4837 switch(c)
4838 {
4839 default: MRRETURN(MATCH_NOMATCH);
4840 case 0x0a: /* LF */
4841 case 0x0b: /* VT */
4842 case 0x0c: /* FF */
4843 case 0x0d: /* CR */
4844 case 0x85: /* NEL */
4845 break;
4846 }
4847 break;
4848
4849 case OP_NOT_DIGIT:
4850 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4851 break;
4852
4853 case OP_DIGIT:
4854 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4855 break;
4856
4857 case OP_NOT_WHITESPACE:
4858 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4859 break;
4860
4861 case OP_WHITESPACE:
4862 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4863 break;
4864
4865 case OP_NOT_WORDCHAR:
4866 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4867 break;
4868
4869 case OP_WORDCHAR:
4870 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4871 break;
4872
4873 default:
4874 RRETURN(PCRE_ERROR_INTERNAL);
4875 }
4876 }
4877 }
4878 /* Control never gets here */
4879 }
4880
4881 /* If maximizing, it is worth using inline code for speed, doing the type
4882 test once at the start (i.e. keep it out of the loop). Again, keep the
4883 UTF-8 and UCP stuff separate. */
4884
4885 else
4886 {
4887 pp = eptr; /* Remember where we started */
4888
4889 #ifdef SUPPORT_UCP
4890 if (prop_type >= 0)
4891 {
4892 switch(prop_type)
4893 {
4894 case PT_ANY:
4895 for (i = min; i < max; i++)
4896 {
4897 int len = 1;
4898 if (eptr >= md->end_subject)
4899 {
4900 SCHECK_PARTIAL();
4901 break;
4902 }
4903 GETCHARLENTEST(c, eptr, len);
4904 if (prop_fail_result) break;
4905 eptr+= len;
4906 }
4907 break;
4908
4909 case PT_LAMP:
4910 for (i = min; i < max; i++)
4911 {
4912 int chartype;
4913 int len = 1;
4914 if (eptr >= md->end_subject)
4915 {
4916 SCHECK_PARTIAL();
4917 break;
4918 }
4919 GETCHARLENTEST(c, eptr, len);
4920 chartype = UCD_CHARTYPE(c);
4921 if ((chartype == ucp_Lu ||
4922 chartype == ucp_Ll ||
4923 chartype == ucp_Lt) == prop_fail_result)
4924 break;
4925 eptr+= len;
4926 }
4927 break;
4928
4929 case PT_GC:
4930 for (i = min; i < max; i++)
4931 {
4932 int len = 1;
4933 if (eptr >= md->end_subject)
4934 {
4935 SCHECK_PARTIAL();
4936 break;
4937 }
4938 GETCHARLENTEST(c, eptr, len);
4939 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
4940 eptr+= len;
4941 }
4942 break;
4943
4944 case PT_PC:
4945 for (i = min; i < max; i++)
4946 {
4947 int len = 1;
4948 if (eptr >= md->end_subject)
4949 {
4950 SCHECK_PARTIAL();
4951 break;
4952 }
4953 GETCHARLENTEST(c, eptr, len);
4954 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
4955 eptr+= len;
4956 }
4957 break;
4958
4959 case PT_SC:
4960 for (i = min; i < max; i++)
4961 {
4962 int len = 1;
4963 if (eptr >= md->end_subject)
4964 {
4965 SCHECK_PARTIAL();
4966 break;
4967 }
4968 GETCHARLENTEST(c, eptr, len);
4969 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
4970 eptr+= len;
4971 }
4972 break;
4973
4974 case PT_ALNUM:
4975 for (i = min; i < max; i++)
4976 {
4977 int category;
4978 int len = 1;
4979 if (eptr >= md->end_subject)
4980 {
4981 SCHECK_PARTIAL();
4982 break;
4983 }
4984 GETCHARLENTEST(c, eptr, len);
4985 category = UCD_CATEGORY(c);
4986 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4987 break;
4988 eptr+= len;
4989 }
4990 break;
4991
4992 case PT_SPACE: /* Perl space */
4993 for (i = min; i < max; i++)
4994 {
4995 int len = 1;
4996 if (eptr >= md->end_subject)
4997 {
4998 SCHECK_PARTIAL();
4999 break;
5000 }
5001 GETCHARLENTEST(c, eptr, len);
5002 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5003 c == CHAR_FF || c == CHAR_CR)
5004 == prop_fail_result)
5005 break;
5006 eptr+= len;
5007 }
5008 break;
5009
5010 case PT_PXSPACE: /* POSIX space */
5011 for (i = min; i < max; i++)
5012 {
5013 int len = 1;
5014 if (eptr >= md->end_subject)
5015 {
5016 SCHECK_PARTIAL();
5017 break;
5018 }
5019 GETCHARLENTEST(c, eptr, len);
5020 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5021 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5022 == prop_fail_result)
5023 break;
5024 eptr+= len;
5025 }
5026 break;
5027
5028 case PT_WORD:
5029 for (i = min; i < max; i++)
5030 {
5031 int category;
5032 int len = 1;
5033 if (eptr >= md->end_subject)
5034 {
5035 SCHECK_PARTIAL();
5036 break;
5037 }
5038 GETCHARLENTEST(c, eptr, len);
5039 category = UCD_CATEGORY(c);
5040 if ((category == ucp_L || category == ucp_N ||
5041 c == CHAR_UNDERSCORE) == prop_fail_result)
5042 break;
5043 eptr+= len;
5044 }
5045 break;
5046
5047 default:
5048 RRETURN(PCRE_ERROR_INTERNAL);
5049 }
5050
5051 /* eptr is now past the end of the maximum run */
5052
5053 if (possessive) continue;
5054 for(;;)
5055 {
5056 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5058 if (eptr-- == pp) break; /* Stop if tried at original pos */
5059 if (utf8) BACKCHAR(eptr);
5060 }
5061 }
5062
5063 /* Match extended Unicode sequences. We will get here only if the
5064 support is in the binary; otherwise a compile-time error occurs. */
5065
5066 else if (ctype == OP_EXTUNI)
5067 {
5068 for (i = min; i < max; i++)
5069 {
5070 int len = 1;
5071 if (eptr >= md->end_subject)
5072 {
5073 SCHECK_PARTIAL();
5074 break;
5075 }
5076 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5077 if (UCD_CATEGORY(c) == ucp_M) break;
5078 eptr += len;
5079 while (eptr < md->end_subject)
5080 {
5081 len = 1;
5082 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5083 if (UCD_CATEGORY(c) != ucp_M) break;
5084 eptr += len;
5085 }
5086 }
5087
5088 /* eptr is now past the end of the maximum run */
5089
5090 if (possessive) continue;
5091
5092 for(;;)
5093 {
5094 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5096 if (eptr-- == pp) break; /* Stop if tried at original pos */
5097 for (;;) /* Move back over one extended */
5098 {
5099 if (!utf8) c = *eptr; else
5100 {
5101 BACKCHAR(eptr);
5102 GETCHAR(c, eptr);
5103 }
5104 if (UCD_CATEGORY(c) != ucp_M) break;
5105 eptr--;
5106 }
5107 }
5108 }
5109
5110 else
5111 #endif /* SUPPORT_UCP */
5112
5113 #ifdef SUPPORT_UTF8
5114 /* UTF-8 mode */
5115
5116 if (utf8)
5117 {
5118 switch(ctype)
5119 {
5120 case OP_ANY:
5121 if (max < INT_MAX)
5122 {
5123 for (i = min; i < max; i++)
5124 {
5125 if (eptr >= md->end_subject)
5126 {
5127 SCHECK_PARTIAL();
5128 break;
5129 }
5130 if (IS_NEWLINE(eptr)) break;
5131 eptr++;
5132 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5133 }
5134 }
5135
5136 /* Handle unlimited UTF-8 repeat */
5137
5138 else
5139 {
5140 for (i = min; i < max; i++)
5141 {
5142 if (eptr >= md->end_subject)
5143 {
5144 SCHECK_PARTIAL();
5145 break;
5146 }
5147 if (IS_NEWLINE(eptr)) break;
5148 eptr++;
5149 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5150 }
5151 }
5152 break;
5153
5154 case OP_ALLANY:
5155 if (max < INT_MAX)
5156 {
5157 for (i = min; i < max; i++)
5158 {
5159 if (eptr >= md->end_subject)
5160 {
5161 SCHECK_PARTIAL();
5162 break;
5163 }
5164 eptr++;
5165 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5166 }
5167 }
5168 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5169 break;
5170
5171 /* The byte case is the same as non-UTF8 */
5172
5173 case OP_ANYBYTE:
5174 c = max - min;
5175 if (c > (unsigned int)(md->end_subject - eptr))
5176 {
5177 eptr = md->end_subject;
5178 SCHECK_PARTIAL();
5179 }
5180 else eptr += c;
5181 break;
5182
5183 case OP_ANYNL:
5184 for (i = min; i < max; i++)
5185 {
5186 int len = 1;
5187 if (eptr >= md->end_subject)
5188 {
5189 SCHECK_PARTIAL();
5190 break;
5191 }
5192 GETCHARLEN(c, eptr, len);
5193 if (c == 0x000d)
5194 {
5195 if (++eptr >= md->end_subject) break;
5196 if (*eptr == 0x000a) eptr++;
5197 }
5198 else
5199 {
5200 if (c != 0x000a &&
5201 (md->bsr_anycrlf ||
5202 (c != 0x000b && c != 0x000c &&
5203 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5204 break;
5205 eptr += len;
5206 }
5207 }
5208 break;
5209
5210 case OP_NOT_HSPACE:
5211 case OP_HSPACE:
5212 for (i = min; i < max; i++)
5213 {
5214 BOOL gotspace;
5215 int len = 1;
5216 if (eptr >= md->end_subject)
5217 {
5218 SCHECK_PARTIAL();
5219 break;
5220 }
5221 GETCHARLEN(c, eptr, len);
5222 switch(c)
5223 {
5224 default: gotspace = FALSE; break;
5225 case 0x09: /* HT */
5226 case 0x20: /* SPACE */
5227 case 0xa0: /* NBSP */
5228 case 0x1680: /* OGHAM SPACE MARK */
5229 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5230 case 0x2000: /* EN QUAD */
5231 case 0x2001: /* EM QUAD */
5232 case 0x2002: /* EN SPACE */
5233 case 0x2003: /* EM SPACE */
5234 case 0x2004: /* THREE-PER-EM SPACE */
5235 case 0x2005: /* FOUR-PER-EM SPACE */
5236 case 0x2006: /* SIX-PER-EM SPACE */
5237 case 0x2007: /* FIGURE SPACE */
5238 case 0x2008: /* PUNCTUATION SPACE */
5239 case 0x2009: /* THIN SPACE */
5240 case 0x200A: /* HAIR SPACE */
5241 case 0x202f: /* NARROW NO-BREAK SPACE */
5242 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5243 case 0x3000: /* IDEOGRAPHIC SPACE */
5244 gotspace = TRUE;
5245 break;
5246 }
5247 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5248 eptr += len;
5249 }
5250 break;
5251
5252 case OP_NOT_VSPACE:
5253 case OP_VSPACE:
5254 for (i = min; i < max; i++)
5255 {
5256 BOOL gotspace;
5257 int len = 1;
5258 if (eptr >= md->end_subject)
5259 {
5260 SCHECK_PARTIAL();
5261 break;
5262 }
5263 GETCHARLEN(c, eptr, len);
5264 switch(c)
5265 {
5266 default: gotspace = FALSE; break;
5267 case 0x0a: /* LF */
5268 case 0x0b: /* VT */
5269 case 0x0c: /* FF */
5270 case 0x0d: /* CR */
5271 case 0x85: /* NEL */
5272 case 0x2028: /* LINE SEPARATOR */
5273 case 0x2029: /* PARAGRAPH SEPARATOR */
5274 gotspace = TRUE;
5275 break;
5276 }
5277 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5278 eptr += len;
5279 }
5280 break;
5281
5282 case OP_NOT_DIGIT:
5283 for (i = min; i < max; i++)
5284 {
5285 int len = 1;
5286 if (eptr >= md->end_subject)
5287 {
5288 SCHECK_PARTIAL();
5289 break;
5290 }
5291 GETCHARLEN(c, eptr, len);
5292 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5293 eptr+= len;
5294 }
5295 break;
5296
5297 case OP_DIGIT:
5298 for (i = min; i < max; i++)
5299 {
5300 int len = 1;
5301 if (eptr >= md->end_subject)
5302 {
5303 SCHECK_PARTIAL();
5304 break;
5305 }
5306 GETCHARLEN(c, eptr, len);
5307 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5308 eptr+= len;
5309 }
5310 break;
5311
5312 case OP_NOT_WHITESPACE:
5313 for (i = min; i < max; i++)
5314 {
5315 int len = 1;
5316 if (eptr >= md->end_subject)
5317 {
5318 SCHECK_PARTIAL();
5319 break;
5320 }
5321 GETCHARLEN(c, eptr, len);
5322 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5323 eptr+= len;
5324 }
5325 break;
5326
5327 case OP_WHITESPACE:
5328 for (i = min; i < max; i++)
5329 {
5330 int len = 1;
5331 if (eptr >= md->end_subject)
5332 {
5333 SCHECK_PARTIAL();
5334 break;
5335 }
5336 GETCHARLEN(c, eptr, len);
5337 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5338 eptr+= len;
5339 }
5340 break;
5341
5342 case OP_NOT_WORDCHAR:
5343 for (i = min; i < max; i++)
5344 {
5345 int len = 1;
5346 if (eptr >= md->end_subject)
5347 {
5348 SCHECK_PARTIAL();
5349 break;
5350 }
5351 GETCHARLEN(c, eptr, len);
5352 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5353 eptr+= len;
5354 }
5355 break;
5356
5357 case OP_WORDCHAR:
5358 for (i = min; i < max; i++)
5359 {
5360 int len = 1;
5361 if (eptr >= md->end_subject)
5362 {
5363 SCHECK_PARTIAL();
5364 break;
5365 }
5366 GETCHARLEN(c, eptr, len);
5367 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5368 eptr+= len;
5369 }
5370 break;
5371
5372 default:
5373 RRETURN(PCRE_ERROR_INTERNAL);
5374 }
5375
5376 /* eptr is now past the end of the maximum run. If possessive, we are
5377 done (no backing up). Otherwise, match at this position; anything other
5378 than no match is immediately returned. For nomatch, back up one
5379 character, unless we are matching \R and the last thing matched was
5380 \r\n, in which case, back up two bytes. */
5381
5382 if (possessive) continue;
5383 for(;;)
5384 {
5385 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5387 if (eptr-- == pp) break; /* Stop if tried at original pos */
5388 BACKCHAR(eptr);
5389 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5390 eptr[-1] == '\r') eptr--;
5391 }
5392 }
5393 else
5394 #endif /* SUPPORT_UTF8 */
5395
5396 /* Not UTF-8 mode */
5397 {
5398 switch(ctype)
5399 {
5400 case OP_ANY:
5401 for (i = min; i < max; i++)
5402 {
5403 if (eptr >= md->end_subject)
5404 {
5405 SCHECK_PARTIAL();
5406 break;
5407 }
5408 if (IS_NEWLINE(eptr)) break;
5409 eptr++;
5410 }
5411 break;
5412
5413 case OP_ALLANY:
5414 case OP_ANYBYTE:
5415 c = max - min;
5416 if (c > (unsigned int)(md->end_subject - eptr))
5417 {
5418 eptr = md->end_subject;
5419 SCHECK_PARTIAL();
5420 }
5421 else eptr += c;
5422 break;
5423
5424 case OP_ANYNL:
5425 for (i = min; i < max; i++)
5426 {
5427 if (eptr >= md->end_subject)
5428 {
5429 SCHECK_PARTIAL();
5430 break;
5431 }
5432 c = *eptr;
5433 if (c == 0x000d)
5434 {
5435 if (++eptr >= md->end_subject) break;
5436 if (*eptr == 0x000a) eptr++;
5437 }
5438 else
5439 {
5440 if (c != 0x000a &&
5441 (md->bsr_anycrlf ||
5442 (c != 0x000b && c != 0x000c && c != 0x0085)))
5443 break;
5444 eptr++;
5445 }
5446 }
5447 break;
5448
5449 case OP_NOT_HSPACE:
5450 for (i = min; i < max; i++)
5451 {
5452 if (eptr >= md->end_subject)
5453 {
5454 SCHECK_PARTIAL();
5455 break;
5456 }
5457 c = *eptr;
5458 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5459 eptr++;
5460 }
5461 break;
5462
5463 case OP_HSPACE:
5464 for (i = min; i < max; i++)
5465 {
5466 if (eptr >= md->end_subject)
5467 {
5468 SCHECK_PARTIAL();
5469 break;
5470 }
5471 c = *eptr;
5472 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5473 eptr++;
5474 }
5475 break;
5476
5477 case OP_NOT_VSPACE:
5478 for (i = min; i < max; i++)
5479 {
5480 if (eptr >= md->end_subject)
5481 {
5482 SCHECK_PARTIAL();
5483 break;
5484 }
5485 c = *eptr;
5486 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5487 break;
5488 eptr++;
5489 }
5490 break;
5491
5492 case OP_VSPACE:
5493 for (i = min; i < max; i++)
5494 {
5495 if (eptr >= md->end_subject)
5496 {
5497 SCHECK_PARTIAL();
5498 break;
5499 }
5500 c = *eptr;
5501 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5502 break;
5503 eptr++;
5504 }
5505 break;
5506
5507 case OP_NOT_DIGIT:
5508 for (i = min; i < max; i++)
5509 {
5510 if (eptr >= md->end_subject)
5511 {
5512 SCHECK_PARTIAL();
5513 break;
5514 }
5515 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5516 eptr++;
5517 }
5518 break;
5519
5520 case OP_DIGIT:
5521 for (i = min; i < max; i++)
5522 {
5523 if (eptr >= md->end_subject)
5524 {
5525 SCHECK_PARTIAL();
5526 break;
5527 }
5528 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5529 eptr++;
5530 }
5531 break;
5532
5533 case OP_NOT_WHITESPACE:
5534 for (i = min; i < max; i++)
5535 {
5536 if (eptr >= md->end_subject)
5537 {
5538 SCHECK_PARTIAL();
5539 break;
5540 }
5541 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5542 eptr++;
5543 }
5544 break;
5545
5546 case OP_WHITESPACE:
5547 for (i = min; i < max; i++)
5548 {
5549 if (eptr >= md->end_subject)
5550 {
5551 SCHECK_PARTIAL();
5552 break;
5553 }
5554 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5555 eptr++;
5556 }
5557 break;
5558
5559 case OP_NOT_WORDCHAR:
5560 for (i = min; i < max; i++)
5561 {
5562 if (eptr >= md->end_subject)
5563 {
5564 SCHECK_PARTIAL();
5565 break;
5566 }
5567 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5568 eptr++;
5569 }
5570 break;
5571
5572 case OP_WORDCHAR:
5573 for (i = min; i < max; i++)
5574 {
5575 if (eptr >= md->end_subject)
5576 {
5577 SCHECK_PARTIAL();
5578 break;
5579 }
5580 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5581 eptr++;
5582 }
5583 break;
5584
5585 default:
5586 RRETURN(PCRE_ERROR_INTERNAL);
5587 }
5588
5589 /* eptr is now past the end of the maximum run. If possessive, we are
5590 done (no backing up). Otherwise, match at this position; anything other
5591 than no match is immediately returned. For nomatch, back up one
5592 character (byte), unless we are matching \R and the last thing matched
5593 was \r\n, in which case, back up two bytes. */
5594
5595 if (possessive) continue;
5596 while (eptr >= pp)
5597 {
5598 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5600 eptr--;
5601 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5602 eptr[-1] == '\r') eptr--;
5603 }
5604 }
5605
5606 /* Get here if we can't make it match with any permitted repetitions */
5607
5608 MRRETURN(MATCH_NOMATCH);
5609 }
5610 /* Control never gets here */
5611
5612 /* There's been some horrible disaster. Arrival here can only mean there is
5613 something seriously wrong in the code above or the OP_xxx definitions. */
5614
5615 default:
5616 DPRINTF(("Unknown opcode %d\n", *ecode));
5617 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5618 }
5619
5620 /* Do not stick any code in here without much thought; it is assumed
5621 that "continue" in the code above comes out to here to repeat the main
5622 loop. */
5623
5624 } /* End of main loop */
5625 /* Control never reaches here */
5626
5627
5628 /* When compiling to use the heap rather than the stack for recursive calls to
5629 match(), the RRETURN() macro jumps here. The number that is saved in
5630 frame->Xwhere indicates which label we actually want to return to. */
5631
5632 #ifdef NO_RECURSE
5633 #define LBL(val) case val: goto L_RM##val;
5634 HEAP_RETURN:
5635 switch (frame->Xwhere)
5636 {
5637 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5638 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5639 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5640 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5641 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5642 #ifdef SUPPORT_UTF8
5643 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5644 LBL(32) LBL(34) LBL(42) LBL(46)
5645 #ifdef SUPPORT_UCP
5646 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5647 LBL(59) LBL(60) LBL(61) LBL(62)
5648 #endif /* SUPPORT_UCP */
5649 #endif /* SUPPORT_UTF8 */
5650 default:
5651 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5652 return PCRE_ERROR_INTERNAL;
5653 }
5654 #undef LBL
5655 #endif /* NO_RECURSE */
5656 }
5657
5658
5659 /***************************************************************************
5660 ****************************************************************************
5661 RECURSION IN THE match() FUNCTION
5662
5663 Undefine all the macros that were defined above to handle this. */
5664
5665 #ifdef NO_RECURSE /* build that uses the heap rather than the stack for match() recursion; NOTE(review): the names below are presumably the per-frame variable macros defined earlier in this file - confirm against those #defines */
5666 #undef eptr
5667 #undef ecode
5668 #undef mstart
5669 #undef offset_top
5670 #undef eptrb
5671 #undef flags
5672
5673 #undef callpat
5674 #undef charptr
5675 #undef data
5676 #undef next
5677 #undef pp
5678 #undef prev
5679 #undef saved_eptr
5680
5681 #undef new_recursive
5682
5683 #undef cur_is_word
5684 #undef condition
5685 #undef prev_is_word
5686
5687 #undef ctype
5688 #undef length
5689 #undef max
5690 #undef min
5691 #undef number
5692 #undef offset
5693 #undef op
5694 #undef save_capture_last
5695 #undef save_offset1
5696 #undef save_offset2
5697 #undef save_offset3
5698 #undef stacksave
5699
5700 #undef newptrb
5701
5702 #endif /* NO_RECURSE */
5703
5704 /* These two are defined as macros in both cases (that is, whether or not NO_RECURSE is defined), so they are undefined outside the conditional block above. */
5705
5706 #undef fc
5707 #undef fi
5708
5709 /***************************************************************************
5710 ***************************************************************************/
5711
5712
5713
5714 /*************************************************
5715 * Execute a Regular Expression *
5716 *************************************************/
5717
5718 /* This function applies a compiled re to a subject string and picks out
5719 portions of the string if it matches. Two elements in the vector are set for
5720 each substring: the offsets to the start and end of the substring.
5721
5722 Arguments:
5723 argument_re points to the compiled expression
5724 extra_data points to extra data or is NULL
5725 subject points to the subject string
5726 length length of subject string (may contain binary zeros)
5727 start_offset where to start in the subject string
5728 options option bits
5729 offsets points to a vector of ints to be filled in with offsets
5730 offsetcount the number of elements in the vector
5731
5732 Returns: > 0 => success; value is the number of elements filled in
5733 = 0 => success, but offsets is not big enough
5734 -1 => failed to match
5735 < -1 => some kind of unexpected problem
5736 */
5737
5738 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5739 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5740 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5741 int offsetcount)
5742 {
5743 int rc, ocount;
5744 int first_byte = -1;
5745 int req_byte = -1;
5746 int req_byte2 = -1;
5747 int newline;
5748 BOOL using_temporary_offsets = FALSE;
5749 BOOL anchored;
5750 BOOL startline;
5751 BOOL firstline;
5752 BOOL first_byte_caseless = FALSE;
5753 BOOL req_byte_caseless = FALSE;
5754 BOOL utf8;
5755 match_data match_block;
5756 match_data *md = &match_block;
5757 const uschar *tables;
5758 const uschar *start_bits = NULL;
5759 USPTR start_match = (USPTR)subject + start_offset;
5760 USPTR end_subject;
5761 USPTR start_partial = NULL;
5762 USPTR req_byte_ptr = start_match - 1;
5763
5764 pcre_study_data internal_study;
5765 const pcre_study_data *study;
5766
5767 real_pcre internal_re;
5768 const real_pcre *external_re = (const real_pcre *)argument_re;
5769 const real_pcre *re = external_re;
5770
5771 /* Plausibility checks */
5772
5773 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5774 if (re == NULL || subject == NULL ||
5775 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5776 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5777 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5778
5779 /* This information is for finding all the numbers associated with a given
5780 name, for condition testing. */
5781
5782 md->name_table = (uschar *)re + re->name_table_offset;
5783 md->name_count = re->name_count;
5784 md->name_entry_size = re->name_entry_size;
5785
5786 /* Fish out the optional data from the extra_data structure, first setting
5787 the default values. */
5788
5789 study = NULL;
5790 md->match_limit = MATCH_LIMIT;
5791 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5792 md->callout_data = NULL;
5793
5794 /* The table pointer is always in native byte order. */
5795
5796 tables = external_re->tables;
5797
5798 if (extra_data != NULL)
5799 {
5800 register unsigned int flags = extra_data->flags;
5801 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5802 study = (const pcre_study_data *)extra_data->study_data;
5803 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5804 md->match_limit = extra_data->match_limit;
5805 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5806 md->match_limit_recursion = extra_data->match_limit_recursion;
5807 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5808 md->callout_data = extra_data->callout_data;
5809 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5810 }
5811
5812 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5813 is a feature that makes it possible to save compiled regexes and re-use them
5814 in other programs later. */
5815
5816 if (tables == NULL) tables = _pcre_default_tables;
5817
5818 /* Check that the first field in the block is the magic number. If it is not,
5819 test for a regex that was compiled on a host of opposite endianness. If this is
5820 the case, flipped values are put in internal_re and internal_study if there was
5821 study data too. */
5822
5823 if (re->magic_number != MAGIC_NUMBER)
5824 {
5825 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5826 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5827 if (study != NULL) study = &internal_study;
5828 }
5829
5830 /* Set up other data */
5831
5832 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5833 startline = (re->flags & PCRE_STARTLINE) != 0;
5834 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5835
5836 /* The code starts after the real_pcre block and the capture name table. */
5837
5838 md->start_code = (const uschar *)external_re + re->name_table_offset +
5839 re->name_count * re->name_entry_size;
5840
5841 md->start_subject = (USPTR)subject;
5842 md->start_offset = start_offset;
5843 md->end_subject = md->start_subject + length;
5844 end_subject = md->end_subject;
5845
5846 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5847 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5848 md->use_ucp = (re->options & PCRE_UCP) != 0;
5849 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5850
5851 /* Some options are unpacked into BOOL variables in the hope that testing
5852 them will be faster than individual option bits. */
5853
5854 md->notbol = (options & PCRE_NOTBOL) != 0;
5855 md->noteol = (options & PCRE_NOTEOL) != 0;
5856 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5857 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5858 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5859 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5860
5861
5862 md->hitend = FALSE;
5863 md->mark = NULL; /* In case never set */
5864
5865 md->recursive = NULL; /* No recursion at top level */
5866
5867 md->lcc = tables + lcc_offset;
5868 md->ctypes = tables + ctypes_offset;
5869
5870 /* Handle different \R options. */
5871
5872 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5873 {
5874 case 0:
5875 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5876 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5877 else
5878 #ifdef BSR_ANYCRLF
5879 md->bsr_anycrlf = TRUE;
5880 #else
5881 md->bsr_anycrlf = FALSE;
5882 #endif
5883 break;
5884
5885 case PCRE_BSR_ANYCRLF:
5886 md->bsr_anycrlf = TRUE;
5887 break;
5888
5889 case PCRE_BSR_UNICODE:
5890 md->bsr_anycrlf = FALSE;
5891 break;
5892
5893 default: return PCRE_ERROR_BADNEWLINE;
5894 }
5895
5896 /* Handle different types of newline. The three bits give eight cases. If
5897 nothing is set at run time, whatever was used at compile time applies. */
5898
5899 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5900 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5901 {
5902 case 0: newline = NEWLINE; break; /* Compile-time default */
5903 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5904 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5905 case PCRE_NEWLINE_CR+
5906 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5907 case PCRE_NEWLINE_ANY: newline = -1; break;
5908 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5909 default: return PCRE_ERROR_BADNEWLINE;
5910 }
5911
5912 if (newline == -2)
5913 {
5914 md->nltype = NLTYPE_ANYCRLF;
5915 }
5916 else if (newline < 0)
5917 {
5918 md->nltype = NLTYPE_ANY;
5919 }
5920 else
5921 {
5922 md->nltype = NLTYPE_FIXED;
5923 if (newline > 255)
5924 {
5925 md->nllen = 2;
5926 md->nl[0] = (newline >> 8) & 255;
5927 md->nl[1] = newline & 255;
5928 }
5929 else
5930 {
5931 md->nllen = 1;
5932 md->nl[0] = newline;
5933 }
5934 }
5935
5936 /* Partial matching was originally supported only for a restricted set of
5937 regexes; from release 8.00 there are no restrictions, but the bits are still
5938 defined (though never set). So there's no harm in leaving this code. */
5939
5940 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5941 return PCRE_ERROR_BADPARTIAL;
5942
5943 /* Check a UTF-8 string if required. Pass back the character offset and error
5944 code for an invalid string if a results vector is available. */
5945
5946 #ifdef SUPPORT_UTF8
5947 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5948 {
5949 int erroroffset;
5950 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5951 if (errorcode != 0)
5952 {
5953 if (offsetcount >= 2)
5954 {
5955 offsets[0] = erroroffset;
5956 offsets[1] = errorcode;
5957 }
5958 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5959 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5960 }
5961
5962 /* Check that a start_offset points to the start of a UTF-8 character. */
5963
5964 if (start_offset > 0 && start_offset < length &&
5965 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5966 return PCRE_ERROR_BADUTF8_OFFSET;
5967 }
5968 #endif
5969
5970 /* If the expression has got more back references than the offsets supplied can
5971 hold, we get a temporary chunk of working store to use during the matching.
5972 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5973 of 3. */
5974
5975 ocount = offsetcount - (offsetcount % 3);
5976
5977 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5978 {
5979 ocount = re->top_backref * 3 + 3;
5980 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5981 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5982 using_temporary_offsets = TRUE;
5983 DPRINTF(("Got memory to hold back references\n"));
5984 }
5985 else md->offset_vector = offsets;
5986
5987 md->offset_end = ocount;
5988 md->offset_max = (2*ocount)/3;
5989 md->offset_overflow = FALSE;
5990 md->capture_last = -1;
5991
5992 /* Reset the working variable associated with each extraction. These should
5993 never be used unless previously set, but they get saved and restored, and so we
5994 initialize them to avoid reading uninitialized locations. Also, unset the
5995 offsets for the matched string. This is really just for tidiness with callouts,
5996 in case they inspect these fields. */
5997
5998 if (md->offset_vector != NULL)
5999 {
6000 register int *iptr = md->offset_vector + ocount;
6001 register int *iend = iptr - re->top_bracket;
6002 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6003 while (--iptr >= iend) *iptr = -1;
6004 md->offset_vector[0] = md->offset_vector[1] = -1;
6005 }
6006
6007 /* Set up the first character to match, if available. The first_byte value is
6008 never set for an anchored regular expression, but the anchoring may be forced
6009 at run time, so we have to test for anchoring. The first char may be unset for
6010 an unanchored pattern, of course. If there's no first char and the pattern was
6011 studied, there may be a bitmap of possible first characters. */
6012
6013 if (!anchored)
6014 {
6015 if ((re->flags & PCRE_FIRSTSET) != 0)
6016 {
6017 first_byte = re->first_byte & 255;
6018 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6019 first_byte = md->lcc[first_byte];
6020 }
6021 else
6022 if (!startline && study != NULL &&
6023 (study->flags & PCRE_STUDY_MAPPED) != 0)
6024 start_bits = study->start_bits;
6025 }
6026
6027 /* For anchored or unanchored matches, there may be a "last known required
6028 character" set. */
6029
6030 if ((re->flags & PCRE_REQCHSET) != 0)
6031 {
6032 req_byte = re->req_byte & 255;
6033 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6034 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6035 }
6036
6037
6038
6039
6040 /* ==========================================================================*/
6041
6042 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6043 the loop runs just once. */
6044
6045 for(;;)
6046 {
6047 USPTR save_end_subject = end_subject;
6048 USPTR new_start_match;
6049
6050 /* If firstline is TRUE, the start of the match is constrained to the first
6051 line of a multiline string. That is, the match must be before or at the first
6052 newline. Implement this by temporarily adjusting end_subject so that we stop
6053 scanning at a newline. If the match fails at the newline, later code breaks
6054 this loop. */
6055
6056 if (firstline)
6057 {
6058 USPTR t = start_match;
6059 #ifdef SUPPORT_UTF8
6060 if (utf8)
6061 {
6062 while (t < md->end_subject && !IS_NEWLINE(t))
6063 {
6064 t++;
6065 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6066 }
6067 }
6068 else
6069 #endif
6070 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6071 end_subject = t;
6072 }
6073
6074 /* There are some optimizations that avoid running the match if a known
6075 starting point is not found, or if a known later character is not present.
6076 However, there is an option that disables these, for testing and for ensuring
6077 that all callouts do actually occur. The option can be set in the regex by
6078 (*NO_START_OPT) or passed in match-time options. */
6079
6080 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6081 {
6082 /* Advance to a unique first byte if there is one. */
6083
6084 if (first_byte >= 0)
6085 {
6086 if (first_byte_caseless)
6087 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6088 start_match++;
6089 else
6090 while (start_match < end_subject && *start_match != first_byte)
6091 start_match++;
6092 }
6093
6094 /* Or to just after a linebreak for a multiline match */
6095
6096 else if (startline)
6097 {
6098 if (start_match > md->start_subject + start_offset)
6099 {
6100 #ifdef SUPPORT_UTF8
6101 if (utf8)
6102 {
6103 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6104 {
6105 start_match++;
6106 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6107 start_match++;
6108 }
6109 }
6110 else
6111 #endif
6112 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6113 start_match++;
6114
6115 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6116 and we are now at a LF, advance the match position by one more character.
6117 */
6118
6119 if (start_match[-1] == CHAR_CR &&
6120 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6121 start_match < end_subject &&
6122 *start_match == CHAR_NL)
6123 start_match++;
6124 }
6125 }
6126
6127 /* Or to a non-unique first byte after study */
6128
6129 else if (start_bits != NULL)
6130 {
6131 while (start_match < end_subject)
6132 {
6133 register unsigned int c = *start_match;
6134 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6135 {
6136 start_match++;
6137 #ifdef SUPPORT_UTF8
6138 if (utf8)
6139 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6140 start_match++;
6141 #endif
6142 }
6143 else break;
6144 }
6145 }
6146 } /* Starting optimizations */
6147
6148 /* Restore fudged end_subject */
6149
6150 end_subject = save_end_subject;
6151
6152 /* The following two optimizations are disabled for partial matching or if
6153 disabling is explicitly requested. */
6154
6155 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6156 {
6157 /* If the pattern was studied, a minimum subject length may be set. This is
6158 only a lower bound; there may be no string of that length that actually
6159 matches the pattern. Although the value is, strictly, in characters, we treat
6160 it as bytes to avoid spending too much time in this optimization. */
6161
6162 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6163 (pcre_uint32)(end_subject - start_match) < study->minlength)
6164 {
6165 rc = MATCH_NOMATCH;
6166 break;
6167 }
6168
6169 /* If req_byte is set, we know that that character must appear in the
6170 subject for the match to succeed. If the first character is set, req_byte
6171 must be later in the subject; otherwise the test starts at the match point.
6172 This optimization can save a huge amount of backtracking in patterns with
6173 nested unlimited repeats that aren't going to match. Writing separate code
6174 for cased/caseless versions makes it go faster, as does using an
6175 autoincrement and backing off on a match.
6176
6177 HOWEVER: when the subject string is very, very long, searching to its end
6178 can take a long time, and give bad performance on quite ordinary patterns.
6179 This showed up when somebody was matching something like /^\d+C/ on a
6180 32-megabyte string... so we don't do this when the string is sufficiently
6181 long. */
6182
6183 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6184 {
6185 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6186
6187 /* We don't need to repeat the search if we haven't yet reached the
6188 place we found it at last time. */
6189
6190 if (p > req_byte_ptr)
6191 {
6192 if (req_byte_caseless)
6193 {
6194 while (p < end_subject)
6195 {
6196 register int pp = *p++;
6197 if (pp == req_byte || pp == req_byte2) { p--; break; }
6198 }
6199 }
6200 else
6201 {
6202 while (p < end_subject)
6203 {
6204 if (*p++ == req_byte) { p--; break; }
6205 }
6206 }
6207
6208 /* If we can't find the required character, break the matching loop,
6209 forcing a match failure. */
6210
6211 if (p >= end_subject)
6212 {
6213 rc = MATCH_NOMATCH;
6214 break;
6215 }
6216
6217 /* If we have found the required character, save the point where we
6218 found it, so that we don't search again next time round the loop if
6219 the start hasn't passed this character yet. */
6220
6221 req_byte_ptr = p;
6222 }
6223 }
6224 }
6225
6226 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6227 printf(">>>> Match against: ");
6228 pchars(start_match, end_subject - start_match, TRUE, md);
6229 printf("\n");
6230 #endif
6231
6232 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6233 first starting point for which a partial match was found. */
6234
6235 md->start_match_ptr = start_match;
6236 md->start_used_ptr = start_match;
6237 md->match_call_count = 0;
6238 md->match_function_type = 0;
6239 md->end_offset_top = 0;
6240 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6241 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6242
6243 switch(rc)
6244 {
6245 /* SKIP passes back the next starting point explicitly, but if it is the
6246 same as the match we have just done, treat it as NOMATCH. */
6247
6248 case MATCH_SKIP:
6249 if (md->start_match_ptr != start_match)
6250 {
6251 new_start_match = md->start_match_ptr;
6252 break;
6253 }
6254 /* Fall through */
6255
6256 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6257 the SKIP's arg was not found. We also treat this as NOMATCH. */
6258
6259 case MATCH_SKIP_ARG:
6260 /* Fall through */
6261
6262 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6263 exactly like PRUNE. */
6264
6265 case MATCH_NOMATCH:
6266 case MATCH_PRUNE:
6267 case MATCH_THEN:
6268 new_start_match = start_match + 1;
6269 #ifdef SUPPORT_UTF8
6270 if (utf8)
6271 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6272 new_start_match++;
6273 #endif
6274 break;
6275
6276 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6277
6278 case MATCH_COMMIT:
6279 rc = MATCH_NOMATCH;
6280 goto ENDLOOP;
6281
6282 /* Any other return is either a match, or some kind of error. */
6283
6284 default:
6285 goto ENDLOOP;
6286 }
6287
6288 /* Control reaches here for the various types of "no match at this point"
6289 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6290
6291 rc = MATCH_NOMATCH;
6292
6293 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6294 newline in the subject (though it may continue over the newline). Therefore,
6295 if we have just failed to match, starting at a newline, do not continue. */
6296
6297 if (firstline && IS_NEWLINE(start_match)) break;
6298
6299 /* Advance to new matching position */
6300
6301 start_match = new_start_match;
6302
6303 /* Break the loop if the pattern is anchored or if we have passed the end of
6304 the subject. */
6305
6306 if (anchored || start_match > end_subject) break;
6307
6308 /* If we have just passed a CR and we are now at a LF, and the pattern does
6309 not contain any explicit matches for \r or \n, and the newline option is CRLF
6310 or ANY or ANYCRLF, advance the match position by one more character. */
6311
6312 if (start_match[-1] == CHAR_CR &&
6313 start_match < end_subject &&
6314 *start_match == CHAR_NL &&
6315 (re->flags & PCRE_HASCRORLF) == 0 &&
6316 (md->nltype == NLTYPE_ANY ||
6317 md->nltype == NLTYPE_ANYCRLF ||
6318 md->nllen == 2))
6319 start_match++;
6320
6321 md->mark = NULL; /* Reset for start of next match attempt */
6322 } /* End of for(;;) "bumpalong" loop */
6323
6324 /* ==========================================================================*/
6325
6326 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6327 conditions is true:
6328
6329 (1) The pattern is anchored or the match was failed by (*COMMIT);
6330
6331 (2) We are past the end of the subject;
6332
6333 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6334 this option requests that a match occur at or before the first newline in
6335 the subject.
6336
6337 When we have a match and the offset vector is big enough to deal with any
6338 backreferences, captured substring offsets will already be set up. In the case
6339 where we had to get some local store to hold offsets for backreference
6340 processing, copy those that we can. In this case there need not be overflow if
6341 certain parts of the pattern were not used, even though there are more
6342 capturing parentheses than vector slots. */
6343
6344 ENDLOOP:
6345
6346 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6347 {
6348 if (using_temporary_offsets)
6349 {
6350 if (offsetcount >= 4)
6351 {
6352 memcpy(offsets + 2, md->offset_vector + 2,
6353 (offsetcount - 2) * sizeof(int));
6354 DPRINTF(("Copied offsets from temporary memory\n"));
6355 }
6356 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6357 DPRINTF(("Freeing temporary memory\n"));
6358 (pcre_free)(md->offset_vector);
6359 }
6360
6361 /* Set the return code to the number of captured strings, or 0 if there are
6362 too many to fit into the vector. */
6363
6364 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6365
6366 /* If there is space in the offset vector, set any unused pairs at the end of
6367 the pattern to -1 for backwards compatibility. It is documented that this
6368 happens. In earlier versions, the whole set of potential capturing offsets
6369 was set to -1 each time round the loop, but this is handled differently now.
6370 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6371 those at the end that need unsetting here. We can't just unset them all at
6372 the start of the whole thing because they may get set in one branch that is
6373 not the final matching branch. */
6374
6375 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6376 {
6377 register int *iptr, *iend;
6378 int resetcount = 2 + re->top_bracket * 2;
6379 if (resetcount > offsetcount) resetcount = ocount;
6380 iptr = offsets + md->end_offset_top;
6381 iend = offsets + resetcount;
6382 while (iptr < iend) *iptr++ = -1;
6383 }
6384
6385 /* If there is space, set up the whole thing as substring 0. The value of
6386 md->start_match_ptr might be modified if \K was encountered on the successful
6387 matching path. */
6388
6389 if (offsetcount < 2) rc = 0; else
6390 {
6391 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6392 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6393 }
6394
6395 DPRINTF((">>>> returning %d\n", rc));
6396 goto RETURN_MARK;
6397 }
6398
6399 /* Control gets here if there has been an error, or if the overall match
6400 attempt has failed at all permitted starting positions. */
6401
6402 if (using_temporary_offsets)
6403 {
6404 DPRINTF(("Freeing temporary memory\n"));
6405 (pcre_free)(md->offset_vector);
6406 }
6407
6408 /* For anything other than nomatch or partial match, just return the code. */
6409
6410 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6411 {
6412 DPRINTF((">>>> error: returning %d\n", rc));
6413 return rc;
6414 }
6415
6416 /* Handle partial matches - disable any mark data */
6417
6418 if (start_partial != NULL)
6419 {
6420 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6421 md->mark = NULL;
6422 if (offsetcount > 1)
6423 {
6424 offsets[0] = (int)(start_partial - (USPTR)subject);
6425 offsets[1] = (int)(end_subject - (USPTR)subject);
6426 }
6427 rc = PCRE_ERROR_PARTIAL;
6428 }
6429
6430 /* This is the classic nomatch case */
6431
6432 else
6433 {
6434 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6435 rc = PCRE_ERROR_NOMATCH;
6436 }
6437
6438 /* Return the MARK data if it has been requested. */
6439
6440 RETURN_MARK:
6441
6442 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6443 *(extra_data->mark) = (unsigned char *)(md->mark);
6444 return rc;
6445 }
6446
6447 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5