/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 645 - (show annotations)
Sun Jul 31 17:02:18 2011 UTC (8 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 194381 byte(s)
Error occurred while calculating annotation data.
Pass *MARK name to callouts
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* match() learns about two special kinds of call from the field
md->match_function_type instead of from an extra argument, because a further
stack variable is something to be avoided. These are the values that the
field can be given. */

#define MATCH_CONDASSERT  1   /* The call is checking a condition assertion */
#define MATCH_CBEGROUP    2   /* Unlimited repeat group that could be empty */
66
/* Results from match() that are not errors. Errors are reported with the
externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_MATCH      1
#define MATCH_NOMATCH    0

/* Results from match() that are purely internal. They are placed far enough
below zero to keep clear of the external error codes. */

#define MATCH_ACCEPT     (-999)
#define MATCH_COMMIT     (-998)
#define MATCH_KETRPOS    (-997)
#define MATCH_ONCE       (-996)
#define MATCH_PRUNE      (-995)
#define MATCH_SKIP       (-994)
#define MATCH_SKIP_ARG   (-993)
#define MATCH_THEN       (-992)
84
/* Shorthand for a sequence that is needed in many places: copy the current
mark pointer into the match data block, and then leave via RRETURN with the
given value. */

#define MRRETURN(rval) \
  { \
  md->mark = markptr; \
  RRETURN(rval); \
  }
92
/* The largest number of offset-vector ints that are saved on the stack for a
recursive call; malloc is used when the offset vector is bigger than this. Keep
the value a multiple of 3, because the length of the offset vector is always a
multiple of 3. */

#define REC_STACK_SAVE_MAX  30
98
/* Minimum and maximum counts for the common repeats. In the table of maxima a
zero entry stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if the requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register USPTR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 USPTR eptr_start = eptr;
159 register USPTR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 USPTR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
/* Identifiers for the individual RMATCH calls. The numbering starts at 1 and
runs consecutively. Whenever this list is altered, the code at HEAP_RETURN
below has to be altered to match. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,
  RM8,     RM9,  RM10, RM11, RM12, RM13, RM14,
  RM15,    RM16, RM17, RM18, RM19, RM20, RM21,
  RM22,    RM23, RM24, RM25, RM26, RM27, RM28,
  RM29,    RM30, RM31, RM32, RM33, RM34, RM35,
  RM36,    RM37, RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48, RM49,
  RM50,    RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,    RM58, RM59, RM60, RM61, RM62, RM63
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG

/* Debugging flavour: the source line is reported on the way into, and on the
way out of, every recursive call, and on every return. The final argument of
RMATCH (the call number) plays no part here. */

#define RMATCH(r_eptr,r_ecode,r_offtop,r_md,r_eptrb,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,markptr,r_offtop,r_md,r_eptrb,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(r_val) \
  { \
  printf("match() returned %d from line %d ", r_val, __LINE__); \
  return r_val; \
  }

#else

/* Production flavour: a plain recursive call and a plain return. Again, the
final argument of RMATCH is ignored. */

#define RMATCH(r_eptr,r_ecode,r_offtop,r_md,r_eptrb,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,markptr,r_offtop,r_md,r_eptrb,rdepth+1)

#define RRETURN(r_val) return r_val

#endif
306
307 #else
308
309
/* In these versions of the macros the "recursion" is kept on a private stack
of heap blocks. RMATCH obtains a new frame, records in the current frame the
number of the call so that the right place can be resumed later, fills the new
frame with the values for the inner level, makes it current, and jumps to
HEAP_RECURSE. The label that is built from the call number is where control
arrives when the inner level has finished. The fourth argument is ignored: it
is the md argument of match(), which never changes. */

#define REGISTER

#define RMATCH(r_eptr,r_ecode,r_offtop,r_md,r_eptrb,r_where)\
  {\
  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = r_where; \
  newframe->Xeptr = r_eptr;\
  newframe->Xecode = r_ecode;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = r_offtop;\
  newframe->Xeptrb = r_eptrb;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##r_where:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
335
/* Leave the current level of simulated recursion. The current frame is
released and the one before it becomes current. If there is such a frame, the
value is left in rrc and control passes to HEAP_RETURN; otherwise this was the
outermost level, and the value is returned from the function itself.

match() also uses this macro when the allocation of its very first frame has
failed, at which point frame is NULL. The unlinking and freeing are therefore
skipped when there is no current frame, so that the error code is simply
returned instead of a NULL pointer being dereferenced. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 USPTR Xeptr;
358 const uschar *Xecode;
359 USPTR Xmstart;
360 USPTR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 USPTR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 USPTR Xcharptr;
370 #endif
371 USPTR Xdata;
372 USPTR Xnext;
373 USPTR Xpp;
374 USPTR Xprev;
375 USPTR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 uschar Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
/* Partial-matching test for use where the pointer may or may not have reached
the end of the subject. The "hit end" flag is set when partial matching is on,
the pointer is at or beyond the end of the subject, and it is also beyond
md->start_used_ptr. When md->partial is greater than 1, the macro then returns
PCRE_ERROR_PARTIAL straight away. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }
443
/* Partial-matching test for use where the pointer is already known to be
beyond the end of the subject, so that comparison is left out. The "hit end"
flag is set when partial matching is on and the pointer is beyond
md->start_used_ptr. When md->partial is greater than 1, the macro then returns
PCRE_ERROR_PARTIAL straight away. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
478 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
479 unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const uschar *charptr;
590 #endif
591 const uschar *callpat;
592 const uschar *data;
593 const uschar *next;
594 USPTR pp;
595 const uschar *prev;
596 USPTR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 uschar occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
779 the alt that is at the start of the current branch. This makes it possible
780 to skip back past alternatives that precede the THEN within the current
781 branch. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode - GET(ecode, 1);
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
792 offset_top, md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode - GET(ecode, 1);
795 md->mark = ecode + LINK_SIZE + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle a capturing bracket, other than those that are possessive with an
799 unlimited repeat. If there is space in the offset vector, save the current
800 subject position in the working slot at the top of the vector. We mustn't
801 change the current values of the data slot, because they may be set from a
802 previous iteration of this group, and be referred to by a reference inside
803 the group. A failure to match might occur after the group has succeeded,
804 if something later on doesn't match. For this reason, we need to restore
805 the working value and also the values of the final offsets, in case they
806 were set by a previous iteration of the same bracket.
807
808 If there isn't enough space in the offset vector, treat this as if it were
809 a non-capturing bracket. Don't worry about setting the flag for the error
810 case here; that is handled in the code for KET. */
811
812 case OP_CBRA:
813 case OP_SCBRA:
814 number = GET2(ecode, 1+LINK_SIZE);
815 offset = number << 1;
816
817 #ifdef PCRE_DEBUG
818 printf("start bracket %d\n", number);
819 printf("subject=");
820 pchars(eptr, 16, TRUE, md);
821 printf("\n");
822 #endif
823
824 if (offset < md->offset_max)
825 {
826 save_offset1 = md->offset_vector[offset];
827 save_offset2 = md->offset_vector[offset+1];
828 save_offset3 = md->offset_vector[md->offset_end - number];
829 save_capture_last = md->capture_last;
830
831 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
832 md->offset_vector[md->offset_end - number] =
833 (int)(eptr - md->start_subject);
834
835 for (;;)
836 {
837 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
838 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
839 eptrb, RM1);
840 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
841 if (rrc != MATCH_NOMATCH &&
842 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
843 RRETURN(rrc);
844 md->capture_last = save_capture_last;
845 ecode += GET(ecode, 1);
846 if (*ecode != OP_ALT) break;
847 }
848
849 DPRINTF(("bracket %d failed\n", number));
850 md->offset_vector[offset] = save_offset1;
851 md->offset_vector[offset+1] = save_offset2;
852 md->offset_vector[md->offset_end - number] = save_offset3;
853
854 /* At this point, rrc will be one of MATCH_ONCE, MATCH_NOMATCH, or
855 MATCH_THEN. */
856
857 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
858 RRETURN(((rrc == MATCH_ONCE)? MATCH_ONCE:MATCH_NOMATCH));
859 }
860
861 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
862 as a non-capturing bracket. */
863
864 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
865 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
866
867 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
868
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
871
872 /* Non-capturing or atomic group, except for possessive with unlimited
873 repeat. Loop for all the alternatives. When we get to the final alternative
874 within the brackets, we used to return the result of a recursive call to
875 match() whatever happened so it was possible to reduce stack usage by
876 turning this into a tail recursion, except in the case of a possibly empty
877 group. However, now that there is the possibility of (*THEN) occurring in
878 the final alternative, this optimization is no longer possible.
879
880 MATCH_ONCE is returned when the end of an atomic group is successfully
881 reached, but subsequent matching fails. It passes back up the tree (causing
882 captured values to be reset) until the original atomic group level is
883 reached. This is tested by comparing md->once_target with the start of the
884 group. At this point, the return is converted into MATCH_NOMATCH so that
885 previous backup points can be taken. */
886
887 case OP_ONCE:
888 case OP_BRA:
889 case OP_SBRA:
890 DPRINTF(("start non-capturing bracket\n"));
891
892 for (;;)
893 {
894 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
895 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
896 RM2);
897 if (rrc != MATCH_NOMATCH &&
898 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
899 {
900 if (rrc == MATCH_ONCE)
901 {
902 const uschar *scode = ecode;
903 if (*scode != OP_ONCE) /* If not at start, find it */
904 {
905 while (*scode == OP_ALT) scode += GET(scode, 1);
906 scode -= GET(scode, 1);
907 }
908 if (md->once_target == scode) rrc = MATCH_NOMATCH;
909 }
910 RRETURN(rrc);
911 }
912 ecode += GET(ecode, 1);
913 if (*ecode != OP_ALT) break;
914 }
915 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
916 RRETURN(MATCH_NOMATCH);
917
918 /* Handle possessive capturing brackets with an unlimited repeat. We come
919 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
920 handled similarly to the normal case above. However, the matching is
921 different. The end of these brackets will always be OP_KETRPOS, which
922 returns MATCH_KETRPOS without going further in the pattern. By this means
923 we can handle the group by iteration rather than recursion, thereby
924 reducing the amount of stack needed. */
925
926 case OP_CBRAPOS:
927 case OP_SCBRAPOS:
928 allow_zero = FALSE;
929
930 POSSESSIVE_CAPTURE:
931 number = GET2(ecode, 1+LINK_SIZE);
932 offset = number << 1;
933
934 #ifdef PCRE_DEBUG
935 printf("start possessive bracket %d\n", number);
936 printf("subject=");
937 pchars(eptr, 16, TRUE, md);
938 printf("\n");
939 #endif
940
941 if (offset < md->offset_max)
942 {
943 matched_once = FALSE;
944 code_offset = ecode - md->start_code;
945
946 save_offset1 = md->offset_vector[offset];
947 save_offset2 = md->offset_vector[offset+1];
948 save_offset3 = md->offset_vector[md->offset_end - number];
949 save_capture_last = md->capture_last;
950
951 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
952
953 /* Each time round the loop, save the current subject position for use
954 when the group matches. For MATCH_MATCH, the group has matched, so we
955 restart it with a new subject starting position, remembering that we had
956 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
957 usual. If we haven't matched any alternatives in any iteration, check to
958 see if a previous iteration matched. If so, the group has matched;
959 continue from afterwards. Otherwise it has failed; restore the previous
960 capture values before returning NOMATCH. */
961
962 for (;;)
963 {
964 md->offset_vector[md->offset_end - number] =
965 (int)(eptr - md->start_subject);
966 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
967 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
968 eptrb, RM63);
969 if (rrc == MATCH_KETRPOS)
970 {
971 offset_top = md->end_offset_top;
972 eptr = md->end_match_ptr;
973 ecode = md->start_code + code_offset;
974 save_capture_last = md->capture_last;
975 matched_once = TRUE;
976 continue;
977 }
978 if (rrc != MATCH_NOMATCH &&
979 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
980 RRETURN(rrc);
981 md->capture_last = save_capture_last;
982 ecode += GET(ecode, 1);
983 if (*ecode != OP_ALT) break;
984 }
985
986 if (!matched_once)
987 {
988 md->offset_vector[offset] = save_offset1;
989 md->offset_vector[offset+1] = save_offset2;
990 md->offset_vector[md->offset_end - number] = save_offset3;
991 }
992
993 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
994 if (allow_zero || matched_once)
995 {
996 ecode += 1 + LINK_SIZE;
997 break;
998 }
999
1000 RRETURN(MATCH_NOMATCH);
1001 }
1002
1003 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1004 as a non-capturing bracket. */
1005
1006 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1007 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1008
1009 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1010
1011 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1012 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1013
1014 /* Non-capturing possessive bracket with unlimited repeat. We come here
1015 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1016 without the capturing complication. It is written out separately for speed
1017 and cleanliness. */
1018
1019 case OP_BRAPOS:
1020 case OP_SBRAPOS:
1021 allow_zero = FALSE;
1022
1023 POSSESSIVE_NON_CAPTURE:
1024 matched_once = FALSE;
1025 code_offset = ecode - md->start_code;
1026
1027 for (;;)
1028 {
1029 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1030 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1031 eptrb, RM48);
1032 if (rrc == MATCH_KETRPOS)
1033 {
1034 offset_top = md->end_offset_top;
1035 eptr = md->end_match_ptr;
1036 ecode = md->start_code + code_offset;
1037 matched_once = TRUE;
1038 continue;
1039 }
1040 if (rrc != MATCH_NOMATCH &&
1041 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1042 RRETURN(rrc);
1043 ecode += GET(ecode, 1);
1044 if (*ecode != OP_ALT) break;
1045 }
1046
1047 if (matched_once || allow_zero)
1048 {
1049 ecode += 1 + LINK_SIZE;
1050 break;
1051 }
1052 RRETURN(MATCH_NOMATCH);
1053
1054 /* Control never reaches here. */
1055
1056 /* Conditional group: compilation checked that there are no more than
1057 two branches. If the condition is false, skipping the first branch takes us
1058 past the end if there is only one branch, but that's OK because that is
1059 exactly what going to the ket would do. */
1060
1061 case OP_COND:
1062 case OP_SCOND:
1063 codelink = GET(ecode, 1);
1064
1065 /* Because of the way auto-callout works during compile, a callout item is
1066 inserted between OP_COND and an assertion condition. */
1067
1068 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1069 {
1070 if (pcre_callout != NULL)
1071 {
1072 pcre_callout_block cb;
1073 cb.version = 2; /* Version 2 of the callout block */
1074 cb.callout_number = ecode[LINK_SIZE+2];
1075 cb.offset_vector = md->offset_vector;
1076 cb.subject = (PCRE_SPTR)md->start_subject;
1077 cb.subject_length = (int)(md->end_subject - md->start_subject);
1078 cb.start_match = (int)(mstart - md->start_subject);
1079 cb.current_position = (int)(eptr - md->start_subject);
1080 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1081 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1082 cb.capture_top = offset_top/2;
1083 cb.capture_last = md->capture_last;
1084 cb.callout_data = md->callout_data;
1085 cb.mark = markptr;
1086 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1087 if (rrc < 0) RRETURN(rrc);
1088 }
1089 ecode += _pcre_OP_lengths[OP_CALLOUT];
1090 }
1091
1092 condcode = ecode[LINK_SIZE+1];
1093
1094 /* Now see what the actual condition is */
1095
1096 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1097 {
1098 if (md->recursive == NULL) /* Not recursing => FALSE */
1099 {
1100 condition = FALSE;
1101 ecode += GET(ecode, 1);
1102 }
1103 else
1104 {
1105 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1106 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1107
1108 /* If the test is for recursion into a specific subpattern, and it is
1109 false, but the test was set up by name, scan the table to see if the
1110 name refers to any other numbers, and test them. The condition is true
1111 if any one is set. */
1112
1113 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1114 {
1115 uschar *slotA = md->name_table;
1116 for (i = 0; i < md->name_count; i++)
1117 {
1118 if (GET2(slotA, 0) == recno) break;
1119 slotA += md->name_entry_size;
1120 }
1121
1122 /* Found a name for the number - there can be only one; duplicate
1123 names for different numbers are allowed, but not vice versa. First
1124 scan down for duplicates. */
1125
1126 if (i < md->name_count)
1127 {
1128 uschar *slotB = slotA;
1129 while (slotB > md->name_table)
1130 {
1131 slotB -= md->name_entry_size;
1132 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1133 {
1134 condition = GET2(slotB, 0) == md->recursive->group_num;
1135 if (condition) break;
1136 }
1137 else break;
1138 }
1139
1140 /* Scan up for duplicates */
1141
1142 if (!condition)
1143 {
1144 slotB = slotA;
1145 for (i++; i < md->name_count; i++)
1146 {
1147 slotB += md->name_entry_size;
1148 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1149 {
1150 condition = GET2(slotB, 0) == md->recursive->group_num;
1151 if (condition) break;
1152 }
1153 else break;
1154 }
1155 }
1156 }
1157 }
1158
1159 /* Choose branch according to the condition */
1160
1161 ecode += condition? 3 : GET(ecode, 1);
1162 }
1163 }
1164
1165 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1166 {
1167 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1168 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1169
1170 /* If the numbered capture is unset, but the reference was by name,
1171 scan the table to see if the name refers to any other numbers, and test
1172 them. The condition is true if any one is set. This is tediously similar
1173 to the code above, but not close enough to try to amalgamate. */
1174
1175 if (!condition && condcode == OP_NCREF)
1176 {
1177 int refno = offset >> 1;
1178 uschar *slotA = md->name_table;
1179
1180 for (i = 0; i < md->name_count; i++)
1181 {
1182 if (GET2(slotA, 0) == refno) break;
1183 slotA += md->name_entry_size;
1184 }
1185
1186 /* Found a name for the number - there can be only one; duplicate names
1187 for different numbers are allowed, but not vice versa. First scan down
1188 for duplicates. */
1189
1190 if (i < md->name_count)
1191 {
1192 uschar *slotB = slotA;
1193 while (slotB > md->name_table)
1194 {
1195 slotB -= md->name_entry_size;
1196 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1197 {
1198 offset = GET2(slotB, 0) << 1;
1199 condition = offset < offset_top &&
1200 md->offset_vector[offset] >= 0;
1201 if (condition) break;
1202 }
1203 else break;
1204 }
1205
1206 /* Scan up for duplicates */
1207
1208 if (!condition)
1209 {
1210 slotB = slotA;
1211 for (i++; i < md->name_count; i++)
1212 {
1213 slotB += md->name_entry_size;
1214 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1215 {
1216 offset = GET2(slotB, 0) << 1;
1217 condition = offset < offset_top &&
1218 md->offset_vector[offset] >= 0;
1219 if (condition) break;
1220 }
1221 else break;
1222 }
1223 }
1224 }
1225 }
1226
1227 /* Choose branch according to the condition */
1228
1229 ecode += condition? 3 : GET(ecode, 1);
1230 }
1231
1232 else if (condcode == OP_DEF) /* DEFINE - always false */
1233 {
1234 condition = FALSE;
1235 ecode += GET(ecode, 1);
1236 }
1237
1238 /* The condition is an assertion. Call match() to evaluate it - setting
1239 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1240 an assertion. */
1241
1242 else
1243 {
1244 md->match_function_type = MATCH_CONDASSERT;
1245 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1246 if (rrc == MATCH_MATCH)
1247 {
1248 if (md->end_offset_top > offset_top)
1249 offset_top = md->end_offset_top; /* Captures may have happened */
1250 condition = TRUE;
1251 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1252 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1253 }
1254 else if (rrc != MATCH_NOMATCH &&
1255 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1256 {
1257 RRETURN(rrc); /* Need braces because of following else */
1258 }
1259 else
1260 {
1261 condition = FALSE;
1262 ecode += codelink;
1263 }
1264 }
1265
1266 /* We are now at the branch that is to be obeyed. As there is only one,
1267 we used to use tail recursion to avoid using another stack frame, except
1268 when there was unlimited repeat of a possibly empty group. However, that
1269 strategy no longer works because of the possibility of (*THEN) being
1270 encountered in the branch. A recursive call to match() is always required,
1271 unless the second alternative doesn't exist, in which case we can just
1272 plough on. */
1273
1274 if (condition || *ecode == OP_ALT)
1275 {
1276 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1277 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1278 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1279 rrc = MATCH_NOMATCH;
1280 RRETURN(rrc);
1281 }
1282 else /* Condition false & no alternative */
1283 {
1284 ecode += 1 + LINK_SIZE;
1285 }
1286 break;
1287
1288
1289 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1290 to close any currently open capturing brackets. */
1291
1292 case OP_CLOSE:
1293 number = GET2(ecode, 1);
1294 offset = number << 1;
1295
1296 #ifdef PCRE_DEBUG
1297 printf("end bracket %d at *ACCEPT", number);
1298 printf("\n");
1299 #endif
1300
1301 md->capture_last = number;
1302 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1303 {
1304 md->offset_vector[offset] =
1305 md->offset_vector[md->offset_end - number];
1306 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1307 if (offset_top <= offset) offset_top = offset + 2;
1308 }
1309 ecode += 3;
1310 break;
1311
1312
1313 /* End of the pattern, either real or forced. */
1314
1315 case OP_END:
1316 case OP_ACCEPT:
1317 case OP_ASSERT_ACCEPT:
1318
1319 /* If we have matched an empty string, fail if not in an assertion and not
1320 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1321 is set and we have matched at the start of the subject. In both cases,
1322 backtracking will then try other alternatives, if any. */
1323
1324 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1325 md->recursive == NULL &&
1326 (md->notempty ||
1327 (md->notempty_atstart &&
1328 mstart == md->start_subject + md->start_offset)))
1329 MRRETURN(MATCH_NOMATCH);
1330
1331 /* Otherwise, we have a match. */
1332
1333 md->end_match_ptr = eptr; /* Record where we ended */
1334 md->end_offset_top = offset_top; /* and how many extracts were taken */
1335 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1336
1337 /* For some reason, the macros don't work properly if an expression is
1338 given as the argument to MRRETURN when the heap is in use. */
1339
1340 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1341 MRRETURN(rrc);
1342
1343 /* Assertion brackets. Check the alternative branches in turn - the
1344 matching won't pass the KET for an assertion. If any one branch matches,
1345 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1346 start of each branch to move the current point backwards, so the code at
1347 this level is identical to the lookahead case. When the assertion is part
1348 of a condition, we want to return immediately afterwards. The caller of
1349 this incarnation of the match() function will have set MATCH_CONDASSERT in
1350 md->match_function_type, and one of these opcodes will be the first opcode
1351 that is processed. We use a local variable that is preserved over calls to
1352 match() to remember this case. */
1353
1354 case OP_ASSERT:
1355 case OP_ASSERTBACK:
1356 if (md->match_function_type == MATCH_CONDASSERT)
1357 {
1358 condassert = TRUE;
1359 md->match_function_type = 0;
1360 }
1361 else condassert = FALSE;
1362
1363 do
1364 {
1365 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1366 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1367 {
1368 mstart = md->start_match_ptr; /* In case \K reset it */
1369 markptr = md->mark;
1370 break;
1371 }
1372 if (rrc != MATCH_NOMATCH &&
1373 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1374 RRETURN(rrc);
1375 ecode += GET(ecode, 1);
1376 }
1377 while (*ecode == OP_ALT);
1378
1379 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1380
1381 /* If checking an assertion for a condition, return MATCH_MATCH. */
1382
1383 if (condassert) RRETURN(MATCH_MATCH);
1384
1385 /* Continue from after the assertion, updating the offsets high water
1386 mark, since extracts may have been taken during the assertion. */
1387
1388 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1389 ecode += 1 + LINK_SIZE;
1390 offset_top = md->end_offset_top;
1391 continue;
1392
1393 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1394 PRUNE, or COMMIT means we must assume failure without checking subsequent
1395 branches. */
1396
1397 case OP_ASSERT_NOT:
1398 case OP_ASSERTBACK_NOT:
1399 if (md->match_function_type == MATCH_CONDASSERT)
1400 {
1401 condassert = TRUE;
1402 md->match_function_type = 0;
1403 }
1404 else condassert = FALSE;
1405
1406 do
1407 {
1408 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1409 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1410 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1411 {
1412 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1413 break;
1414 }
1415 if (rrc != MATCH_NOMATCH &&
1416 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1417 RRETURN(rrc);
1418 ecode += GET(ecode,1);
1419 }
1420 while (*ecode == OP_ALT);
1421
1422 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1423
1424 ecode += 1 + LINK_SIZE;
1425 continue;
1426
1427 /* Move the subject pointer back. This occurs only at the start of
1428 each branch of a lookbehind assertion. If we are too close to the start to
1429 move back, this match function fails. When working with UTF-8 we move
1430 back a number of characters, not bytes. */
1431
1432 case OP_REVERSE:
1433 #ifdef SUPPORT_UTF8
1434 if (utf8)
1435 {
1436 i = GET(ecode, 1);
1437 while (i-- > 0)
1438 {
1439 eptr--;
1440 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1441 BACKCHAR(eptr);
1442 }
1443 }
1444 else
1445 #endif
1446
1447 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1448
1449 {
1450 eptr -= GET(ecode, 1);
1451 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1452 }
1453
1454 /* Save the earliest consulted character, then skip to next op code */
1455
1456 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1457 ecode += 1 + LINK_SIZE;
1458 break;
1459
1460 /* The callout item calls an external function, if one is provided, passing
1461 details of the match so far. This is mainly for debugging, though the
1462 function is able to force a failure. */
1463
1464 case OP_CALLOUT:
1465 if (pcre_callout != NULL)
1466 {
1467 pcre_callout_block cb;
1468 cb.version = 2; /* Version 2 of the callout block */
1469 cb.callout_number = ecode[1];
1470 cb.offset_vector = md->offset_vector;
1471 cb.subject = (PCRE_SPTR)md->start_subject;
1472 cb.subject_length = (int)(md->end_subject - md->start_subject);
1473 cb.start_match = (int)(mstart - md->start_subject);
1474 cb.current_position = (int)(eptr - md->start_subject);
1475 cb.pattern_position = GET(ecode, 2);
1476 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1477 cb.capture_top = offset_top/2;
1478 cb.capture_last = md->capture_last;
1479 cb.callout_data = md->callout_data;
1480 cb.mark = markptr;
1481 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1482 if (rrc < 0) RRETURN(rrc);
1483 }
1484 ecode += 2 + 2*LINK_SIZE;
1485 break;
1486
1487 /* Recursion either matches the current regex, or some subexpression. The
1488 offset data is the offset to the starting bracket from the start of the
1489 whole pattern. (This is so that it works from duplicated subpatterns.)
1490
1491 The state of the capturing groups is preserved over recursion, and
1492 re-instated afterwards. We don't know how many are started and not yet
1493 finished (offset_top records the completed total) so we just have to save
1494 all the potential data. There may be up to 65535 such values, which is too
1495 large to put on the stack, but using malloc for small numbers seems
1496 expensive. As a compromise, the stack is used when there are no more than
1497 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1498
1499 There are also other values that have to be saved. We use a chained
1500 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1501 for the original version of this logic. It has, however, been hacked around
1502 a lot, so he is not to blame for the current way it works. */
1503
1504 case OP_RECURSE:
1505 {
1506 recursion_info *ri;
1507 int recno;
1508
1509 callpat = md->start_code + GET(ecode, 1);
1510 recno = (callpat == md->start_code)? 0 :
1511 GET2(callpat, 1 + LINK_SIZE);
1512
1513 /* Check for repeating a recursion without advancing the subject pointer.
1514 This should catch convoluted mutual recursions. (Some simple cases are
1515 caught at compile time.) */
1516
1517 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1518 if (recno == ri->group_num && eptr == ri->subject_position)
1519 RRETURN(PCRE_ERROR_RECURSELOOP);
1520
1521 /* Add to "recursing stack" */
1522
1523 new_recursive.group_num = recno;
1524 new_recursive.subject_position = eptr;
1525 new_recursive.prevrec = md->recursive;
1526 md->recursive = &new_recursive;
1527
1528 /* Where to continue from afterwards */
1529
1530 ecode += 1 + LINK_SIZE;
1531
1532 /* Now save the offset data */
1533
1534 new_recursive.saved_max = md->offset_end;
1535 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1536 new_recursive.offset_save = stacksave;
1537 else
1538 {
1539 new_recursive.offset_save =
1540 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1541 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1542 }
1543 memcpy(new_recursive.offset_save, md->offset_vector,
1544 new_recursive.saved_max * sizeof(int));
1545
1546 /* OK, now we can do the recursion. After processing each alternative,
1547 restore the offset data. If there were nested recursions, md->recursive
1548 might be changed, so reset it before looping. */
1549
1550 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1551 cbegroup = (*callpat >= OP_SBRA);
1552 do
1553 {
1554 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1555 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1556 md, eptrb, RM6);
1557 memcpy(md->offset_vector, new_recursive.offset_save,
1558 new_recursive.saved_max * sizeof(int));
1559 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1560 {
1561 DPRINTF(("Recursion matched\n"));
1562 md->recursive = new_recursive.prevrec;
1563 if (new_recursive.offset_save != stacksave)
1564 (pcre_free)(new_recursive.offset_save);
1565
1566 /* Set where we got to in the subject, and reset the start in case
1567 it was changed by \K. This *is* propagated back out of a recursion,
1568 for Perl compatibility. */
1569
1570 eptr = md->end_match_ptr;
1571 mstart = md->start_match_ptr;
1572 goto RECURSION_MATCHED; /* Exit loop; end processing */
1573 }
1574 else if (rrc != MATCH_NOMATCH &&
1575 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1576 {
1577 DPRINTF(("Recursion gave error %d\n", rrc));
1578 if (new_recursive.offset_save != stacksave)
1579 (pcre_free)(new_recursive.offset_save);
1580 RRETURN(rrc);
1581 }
1582
1583 md->recursive = &new_recursive;
1584 callpat += GET(callpat, 1);
1585 }
1586 while (*callpat == OP_ALT);
1587
1588 DPRINTF(("Recursion didn't match\n"));
1589 md->recursive = new_recursive.prevrec;
1590 if (new_recursive.offset_save != stacksave)
1591 (pcre_free)(new_recursive.offset_save);
1592 MRRETURN(MATCH_NOMATCH);
1593 }
1594
1595 RECURSION_MATCHED:
1596 break;
1597
1598 /* An alternation is the end of a branch; scan along to find the end of the
1599 bracketed group and go to there. */
1600
1601 case OP_ALT:
1602 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1603 break;
1604
1605 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1606 indicating that it may occur zero times. It may repeat infinitely, or not
1607 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1608 with fixed upper repeat limits are compiled as a number of copies, with the
1609 optional ones preceded by BRAZERO or BRAMINZERO. */
1610
1611 case OP_BRAZERO:
1612 next = ecode + 1;
1613 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1615 do next += GET(next, 1); while (*next == OP_ALT);
1616 ecode = next + 1 + LINK_SIZE;
1617 break;
1618
1619 case OP_BRAMINZERO:
1620 next = ecode + 1;
1621 do next += GET(next, 1); while (*next == OP_ALT);
1622 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1623 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1624 ecode++;
1625 break;
1626
1627 case OP_SKIPZERO:
1628 next = ecode+1;
1629 do next += GET(next,1); while (*next == OP_ALT);
1630 ecode = next + 1 + LINK_SIZE;
1631 break;
1632
1633 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1634 here; just jump to the group, with allow_zero set TRUE. */
1635
1636 case OP_BRAPOSZERO:
1637 op = *(++ecode);
1638 allow_zero = TRUE;
1639 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1640 goto POSSESSIVE_NON_CAPTURE;
1641
1642 /* End of a group, repeated or non-repeating. */
1643
1644 case OP_KET:
1645 case OP_KETRMIN:
1646 case OP_KETRMAX:
1647 case OP_KETRPOS:
1648 prev = ecode - GET(ecode, 1);
1649
1650 /* If this was a group that remembered the subject start, in order to break
1651 infinite repeats of empty string matches, retrieve the subject start from
1652 the chain. Otherwise, set it NULL. */
1653
1654 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1655 {
1656 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1657 eptrb = eptrb->epb_prev; /* Backup to previous group */
1658 }
1659 else saved_eptr = NULL;
1660
1661 /* If we are at the end of an assertion group, stop matching and return
1662 MATCH_MATCH, but record the current high water mark for use by positive
1663 assertions. We also need to record the match start in case it was changed
1664 by \K. */
1665
1666 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1667 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT)
1668 {
1669 md->end_match_ptr = eptr; /* For ONCE */
1670 md->end_offset_top = offset_top;
1671 md->start_match_ptr = mstart;
1672 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1673 }
1674
1675 /* For capturing groups we have to check the group number back at the start
1676 and if necessary complete handling an extraction by setting the offsets and
1677 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1678 into group 0, so it won't be picked up here. Instead, we catch it when the
1679 OP_END is reached. Other recursion is handled here. We just have to record
1680 the current subject position and start match pointer and give a MATCH
1681 return. */
1682
1683 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1684 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1685 {
1686 number = GET2(prev, 1+LINK_SIZE);
1687 offset = number << 1;
1688
1689 #ifdef PCRE_DEBUG
1690 printf("end bracket %d", number);
1691 printf("\n");
1692 #endif
1693
1694 /* Handle a recursively called group. */
1695
1696 if (md->recursive != NULL && md->recursive->group_num == number)
1697 {
1698 md->end_match_ptr = eptr;
1699 md->start_match_ptr = mstart;
1700 RRETURN(MATCH_MATCH);
1701 }
1702
1703 /* Deal with capturing */
1704
1705 md->capture_last = number;
1706 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1707 {
1708 /* If offset is greater than offset_top, it means that we are
1709 "skipping" a capturing group, and that group's offsets must be marked
1710 unset. In earlier versions of PCRE, all the offsets were unset at the
1711 start of matching, but this doesn't work because atomic groups and
1712 assertions can cause a value to be set that should later be unset.
1713 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1714 part of the atomic group, but this is not on the final matching path,
1715 so must be unset when 2 is set. (If there is no group 2, there is no
1716 problem, because offset_top will then be 2, indicating no capture.) */
1717
1718 if (offset > offset_top)
1719 {
1720 register int *iptr = md->offset_vector + offset_top;
1721 register int *iend = md->offset_vector + offset;
1722 while (iptr < iend) *iptr++ = -1;
1723 }
1724
1725 /* Now make the extraction */
1726
1727 md->offset_vector[offset] =
1728 md->offset_vector[md->offset_end - number];
1729 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1730 if (offset_top <= offset) offset_top = offset + 2;
1731 }
1732 }
1733
1734 /* For an ordinary non-repeating ket, just continue at this level. This
1735 also happens for a repeating ket if no characters were matched in the
1736 group. This is the forcible breaking of infinite loops as implemented in
1737 Perl 5.005. For a non-repeating atomic group, establish a backup point by
1738 processing the rest of the pattern at a lower level. If this results in a
1739 NOMATCH return, pass MATCH_ONCE back to the original OP_ONCE level, thereby
1740 bypassing intermediate backup points, but resetting any captures that
1741 happened along the way. */
1742
1743 if (*ecode == OP_KET || eptr == saved_eptr)
1744 {
1745 if (*prev == OP_ONCE)
1746 {
1747 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1748 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1749 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1750 RRETURN(MATCH_ONCE);
1751 }
1752 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1753 break;
1754 }
1755
1756 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1757 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1758 at a time from the outer level, thus saving stack. */
1759
1760 if (*ecode == OP_KETRPOS)
1761 {
1762 md->end_match_ptr = eptr;
1763 md->end_offset_top = offset_top;
1764 RRETURN(MATCH_KETRPOS);
1765 }
1766
1767 /* The normal repeating kets try the rest of the pattern or restart from
1768 the preceding bracket, in the appropriate order. In the second case, we can
1769 use tail recursion to avoid using another stack frame, unless we have an
1770 atomic group or an unlimited repeat of a group that can match an empty
1771 string. */
1772
1773 if (*ecode == OP_KETRMIN)
1774 {
1775 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1776 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1777 if (*prev == OP_ONCE)
1778 {
1779 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1780 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1781 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1782 RRETURN(MATCH_ONCE);
1783 }
1784 if (*prev >= OP_SBRA) /* Could match an empty string */
1785 {
1786 md->match_function_type = MATCH_CBEGROUP;
1787 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1788 RRETURN(rrc);
1789 }
1790 ecode = prev;
1791 goto TAIL_RECURSE;
1792 }
1793 else /* OP_KETRMAX */
1794 {
1795 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1796 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1797 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1799 if (*prev == OP_ONCE)
1800 {
1801 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1803 md->once_target = prev;
1804 RRETURN(MATCH_ONCE);
1805 }
1806 ecode += 1 + LINK_SIZE;
1807 goto TAIL_RECURSE;
1808 }
1809 /* Control never gets here */
1810
1811 /* Not multiline mode: start of subject assertion, unless notbol. */
1812
1813 case OP_CIRC:
1814 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1815
1816 /* Start of subject assertion */
1817
1818 case OP_SOD:
1819 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1820 ecode++;
1821 break;
1822
1823 /* Multiline mode: start of subject unless notbol, or after any newline. */
1824
1825 case OP_CIRCM:
1826 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1827 if (eptr != md->start_subject &&
1828 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1829 MRRETURN(MATCH_NOMATCH);
1830 ecode++;
1831 break;
1832
1833 /* Start of match assertion */
1834
1835 case OP_SOM:
1836 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1837 ecode++;
1838 break;
1839
1840 /* Reset the start of match point */
1841
1842 case OP_SET_SOM:
1843 mstart = eptr;
1844 ecode++;
1845 break;
1846
1847 /* Multiline mode: assert before any newline, or before end of subject
1848 unless noteol is set. */
1849
1850 case OP_DOLLM:
1851 if (eptr < md->end_subject)
1852 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1853 else
1854 {
1855 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1856 SCHECK_PARTIAL();
1857 }
1858 ecode++;
1859 break;
1860
1861 /* Not multiline mode: assert before a terminating newline or before end of
1862 subject unless noteol is set. */
1863
1864 case OP_DOLL:
1865 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1866 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1867
1868 /* ... else fall through for endonly */
1869
1870 /* End of subject assertion (\z) */
1871
1872 case OP_EOD:
1873 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1874 SCHECK_PARTIAL();
1875 ecode++;
1876 break;
1877
1878 /* End of subject or ending \n assertion (\Z) */
1879
1880 case OP_EODN:
1881 ASSERT_NL_OR_EOS:
1882 if (eptr < md->end_subject &&
1883 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1884 MRRETURN(MATCH_NOMATCH);
1885
1886 /* Either at end of string or \n before end. */
1887
1888 SCHECK_PARTIAL();
1889 ecode++;
1890 break;
1891
1892 /* Word boundary assertions */
1893
1894 case OP_NOT_WORD_BOUNDARY:
1895 case OP_WORD_BOUNDARY:
1896 {
1897
1898 /* Find out if the previous and current characters are "word" characters.
1899 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1900 be "non-word" characters. Remember the earliest consulted character for
1901 partial matching. */
1902
1903 #ifdef SUPPORT_UTF8
1904 if (utf8)
1905 {
1906 /* Get status of previous character */
1907
1908 if (eptr == md->start_subject) prev_is_word = FALSE; else
1909 {
1910 USPTR lastptr = eptr - 1;
1911 while((*lastptr & 0xc0) == 0x80) lastptr--;
1912 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1913 GETCHAR(c, lastptr);
1914 #ifdef SUPPORT_UCP
1915 if (md->use_ucp)
1916 {
1917 if (c == '_') prev_is_word = TRUE; else
1918 {
1919 int cat = UCD_CATEGORY(c);
1920 prev_is_word = (cat == ucp_L || cat == ucp_N);
1921 }
1922 }
1923 else
1924 #endif
1925 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1926 }
1927
1928 /* Get status of next character */
1929
1930 if (eptr >= md->end_subject)
1931 {
1932 SCHECK_PARTIAL();
1933 cur_is_word = FALSE;
1934 }
1935 else
1936 {
1937 GETCHAR(c, eptr);
1938 #ifdef SUPPORT_UCP
1939 if (md->use_ucp)
1940 {
1941 if (c == '_') cur_is_word = TRUE; else
1942 {
1943 int cat = UCD_CATEGORY(c);
1944 cur_is_word = (cat == ucp_L || cat == ucp_N);
1945 }
1946 }
1947 else
1948 #endif
1949 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1950 }
1951 }
1952 else
1953 #endif
1954
1955 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1956 consistency with the behaviour of \w we do use it in this case. */
1957
1958 {
1959 /* Get status of previous character */
1960
1961 if (eptr == md->start_subject) prev_is_word = FALSE; else
1962 {
1963 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1964 #ifdef SUPPORT_UCP
1965 if (md->use_ucp)
1966 {
1967 c = eptr[-1];
1968 if (c == '_') prev_is_word = TRUE; else
1969 {
1970 int cat = UCD_CATEGORY(c);
1971 prev_is_word = (cat == ucp_L || cat == ucp_N);
1972 }
1973 }
1974 else
1975 #endif
1976 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1977 }
1978
1979 /* Get status of next character */
1980
1981 if (eptr >= md->end_subject)
1982 {
1983 SCHECK_PARTIAL();
1984 cur_is_word = FALSE;
1985 }
1986 else
1987 #ifdef SUPPORT_UCP
1988 if (md->use_ucp)
1989 {
1990 c = *eptr;
1991 if (c == '_') cur_is_word = TRUE; else
1992 {
1993 int cat = UCD_CATEGORY(c);
1994 cur_is_word = (cat == ucp_L || cat == ucp_N);
1995 }
1996 }
1997 else
1998 #endif
1999 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2000 }
2001
2002 /* Now see if the situation is what we want */
2003
2004 if ((*ecode++ == OP_WORD_BOUNDARY)?
2005 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2006 MRRETURN(MATCH_NOMATCH);
2007 }
2008 break;
2009
2010 /* Match a single character type; inline for speed */
2011
2012 case OP_ANY:
2013 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2014 /* Fall through */
2015
2016 case OP_ALLANY:
2017 if (eptr++ >= md->end_subject)
2018 {
2019 SCHECK_PARTIAL();
2020 MRRETURN(MATCH_NOMATCH);
2021 }
2022 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2023 ecode++;
2024 break;
2025
2026 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2027 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2028
2029 case OP_ANYBYTE:
2030 if (eptr++ >= md->end_subject)
2031 {
2032 SCHECK_PARTIAL();
2033 MRRETURN(MATCH_NOMATCH);
2034 }
2035 ecode++;
2036 break;
2037
2038 case OP_NOT_DIGIT:
2039 if (eptr >= md->end_subject)
2040 {
2041 SCHECK_PARTIAL();
2042 MRRETURN(MATCH_NOMATCH);
2043 }
2044 GETCHARINCTEST(c, eptr);
2045 if (
2046 #ifdef SUPPORT_UTF8
2047 c < 256 &&
2048 #endif
2049 (md->ctypes[c] & ctype_digit) != 0
2050 )
2051 MRRETURN(MATCH_NOMATCH);
2052 ecode++;
2053 break;
2054
2055 case OP_DIGIT:
2056 if (eptr >= md->end_subject)
2057 {
2058 SCHECK_PARTIAL();
2059 MRRETURN(MATCH_NOMATCH);
2060 }
2061 GETCHARINCTEST(c, eptr);
2062 if (
2063 #ifdef SUPPORT_UTF8
2064 c >= 256 ||
2065 #endif
2066 (md->ctypes[c] & ctype_digit) == 0
2067 )
2068 MRRETURN(MATCH_NOMATCH);
2069 ecode++;
2070 break;
2071
2072 case OP_NOT_WHITESPACE:
2073 if (eptr >= md->end_subject)
2074 {
2075 SCHECK_PARTIAL();
2076 MRRETURN(MATCH_NOMATCH);
2077 }
2078 GETCHARINCTEST(c, eptr);
2079 if (
2080 #ifdef SUPPORT_UTF8
2081 c < 256 &&
2082 #endif
2083 (md->ctypes[c] & ctype_space) != 0
2084 )
2085 MRRETURN(MATCH_NOMATCH);
2086 ecode++;
2087 break;
2088
2089 case OP_WHITESPACE:
2090 if (eptr >= md->end_subject)
2091 {
2092 SCHECK_PARTIAL();
2093 MRRETURN(MATCH_NOMATCH);
2094 }
2095 GETCHARINCTEST(c, eptr);
2096 if (
2097 #ifdef SUPPORT_UTF8
2098 c >= 256 ||
2099 #endif
2100 (md->ctypes[c] & ctype_space) == 0
2101 )
2102 MRRETURN(MATCH_NOMATCH);
2103 ecode++;
2104 break;
2105
2106 case OP_NOT_WORDCHAR:
2107 if (eptr >= md->end_subject)
2108 {
2109 SCHECK_PARTIAL();
2110 MRRETURN(MATCH_NOMATCH);
2111 }
2112 GETCHARINCTEST(c, eptr);
2113 if (
2114 #ifdef SUPPORT_UTF8
2115 c < 256 &&
2116 #endif
2117 (md->ctypes[c] & ctype_word) != 0
2118 )
2119 MRRETURN(MATCH_NOMATCH);
2120 ecode++;
2121 break;
2122
2123 case OP_WORDCHAR:
2124 if (eptr >= md->end_subject)
2125 {
2126 SCHECK_PARTIAL();
2127 MRRETURN(MATCH_NOMATCH);
2128 }
2129 GETCHARINCTEST(c, eptr);
2130 if (
2131 #ifdef SUPPORT_UTF8
2132 c >= 256 ||
2133 #endif
2134 (md->ctypes[c] & ctype_word) == 0
2135 )
2136 MRRETURN(MATCH_NOMATCH);
2137 ecode++;
2138 break;
2139
2140 case OP_ANYNL:
2141 if (eptr >= md->end_subject)
2142 {
2143 SCHECK_PARTIAL();
2144 MRRETURN(MATCH_NOMATCH);
2145 }
2146 GETCHARINCTEST(c, eptr);
2147 switch(c)
2148 {
2149 default: MRRETURN(MATCH_NOMATCH);
2150
2151 case 0x000d:
2152 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2153 break;
2154
2155 case 0x000a:
2156 break;
2157
2158 case 0x000b:
2159 case 0x000c:
2160 case 0x0085:
2161 case 0x2028:
2162 case 0x2029:
2163 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2164 break;
2165 }
2166 ecode++;
2167 break;
2168
2169 case OP_NOT_HSPACE:
2170 if (eptr >= md->end_subject)
2171 {
2172 SCHECK_PARTIAL();
2173 MRRETURN(MATCH_NOMATCH);
2174 }
2175 GETCHARINCTEST(c, eptr);
2176 switch(c)
2177 {
2178 default: break;
2179 case 0x09: /* HT */
2180 case 0x20: /* SPACE */
2181 case 0xa0: /* NBSP */
2182 case 0x1680: /* OGHAM SPACE MARK */
2183 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2184 case 0x2000: /* EN QUAD */
2185 case 0x2001: /* EM QUAD */
2186 case 0x2002: /* EN SPACE */
2187 case 0x2003: /* EM SPACE */
2188 case 0x2004: /* THREE-PER-EM SPACE */
2189 case 0x2005: /* FOUR-PER-EM SPACE */
2190 case 0x2006: /* SIX-PER-EM SPACE */
2191 case 0x2007: /* FIGURE SPACE */
2192 case 0x2008: /* PUNCTUATION SPACE */
2193 case 0x2009: /* THIN SPACE */
2194 case 0x200A: /* HAIR SPACE */
2195 case 0x202f: /* NARROW NO-BREAK SPACE */
2196 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2197 case 0x3000: /* IDEOGRAPHIC SPACE */
2198 MRRETURN(MATCH_NOMATCH);
2199 }
2200 ecode++;
2201 break;
2202
2203 case OP_HSPACE:
2204 if (eptr >= md->end_subject)
2205 {
2206 SCHECK_PARTIAL();
2207 MRRETURN(MATCH_NOMATCH);
2208 }
2209 GETCHARINCTEST(c, eptr);
2210 switch(c)
2211 {
2212 default: MRRETURN(MATCH_NOMATCH);
2213 case 0x09: /* HT */
2214 case 0x20: /* SPACE */
2215 case 0xa0: /* NBSP */
2216 case 0x1680: /* OGHAM SPACE MARK */
2217 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2218 case 0x2000: /* EN QUAD */
2219 case 0x2001: /* EM QUAD */
2220 case 0x2002: /* EN SPACE */
2221 case 0x2003: /* EM SPACE */
2222 case 0x2004: /* THREE-PER-EM SPACE */
2223 case 0x2005: /* FOUR-PER-EM SPACE */
2224 case 0x2006: /* SIX-PER-EM SPACE */
2225 case 0x2007: /* FIGURE SPACE */
2226 case 0x2008: /* PUNCTUATION SPACE */
2227 case 0x2009: /* THIN SPACE */
2228 case 0x200A: /* HAIR SPACE */
2229 case 0x202f: /* NARROW NO-BREAK SPACE */
2230 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2231 case 0x3000: /* IDEOGRAPHIC SPACE */
2232 break;
2233 }
2234 ecode++;
2235 break;
2236
2237 case OP_NOT_VSPACE:
2238 if (eptr >= md->end_subject)
2239 {
2240 SCHECK_PARTIAL();
2241 MRRETURN(MATCH_NOMATCH);
2242 }
2243 GETCHARINCTEST(c, eptr);
2244 switch(c)
2245 {
2246 default: break;
2247 case 0x0a: /* LF */
2248 case 0x0b: /* VT */
2249 case 0x0c: /* FF */
2250 case 0x0d: /* CR */
2251 case 0x85: /* NEL */
2252 case 0x2028: /* LINE SEPARATOR */
2253 case 0x2029: /* PARAGRAPH SEPARATOR */
2254 MRRETURN(MATCH_NOMATCH);
2255 }
2256 ecode++;
2257 break;
2258
2259 case OP_VSPACE:
2260 if (eptr >= md->end_subject)
2261 {
2262 SCHECK_PARTIAL();
2263 MRRETURN(MATCH_NOMATCH);
2264 }
2265 GETCHARINCTEST(c, eptr);
2266 switch(c)
2267 {
2268 default: MRRETURN(MATCH_NOMATCH);
2269 case 0x0a: /* LF */
2270 case 0x0b: /* VT */
2271 case 0x0c: /* FF */
2272 case 0x0d: /* CR */
2273 case 0x85: /* NEL */
2274 case 0x2028: /* LINE SEPARATOR */
2275 case 0x2029: /* PARAGRAPH SEPARATOR */
2276 break;
2277 }
2278 ecode++;
2279 break;
2280
2281 #ifdef SUPPORT_UCP
2282 /* Check the next character by Unicode property. We will get here only
2283 if the support is in the binary; otherwise a compile-time error occurs. */
2284
2285 case OP_PROP:
2286 case OP_NOTPROP:
2287 if (eptr >= md->end_subject)
2288 {
2289 SCHECK_PARTIAL();
2290 MRRETURN(MATCH_NOMATCH);
2291 }
2292 GETCHARINCTEST(c, eptr);
2293 {
2294 const ucd_record *prop = GET_UCD(c);
2295
2296 switch(ecode[1])
2297 {
2298 case PT_ANY:
2299 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2300 break;
2301
2302 case PT_LAMP:
2303 if ((prop->chartype == ucp_Lu ||
2304 prop->chartype == ucp_Ll ||
2305 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2306 MRRETURN(MATCH_NOMATCH);
2307 break;
2308
2309 case PT_GC:
2310 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2311 MRRETURN(MATCH_NOMATCH);
2312 break;
2313
2314 case PT_PC:
2315 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2316 MRRETURN(MATCH_NOMATCH);
2317 break;
2318
2319 case PT_SC:
2320 if ((ecode[2] != prop->script) == (op == OP_PROP))
2321 MRRETURN(MATCH_NOMATCH);
2322 break;
2323
2324 /* These are specials */
2325
2326 case PT_ALNUM:
2327 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2328 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2329 MRRETURN(MATCH_NOMATCH);
2330 break;
2331
2332 case PT_SPACE: /* Perl space */
2333 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2334 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2335 == (op == OP_NOTPROP))
2336 MRRETURN(MATCH_NOMATCH);
2337 break;
2338
2339 case PT_PXSPACE: /* POSIX space */
2340 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2341 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2342 c == CHAR_FF || c == CHAR_CR)
2343 == (op == OP_NOTPROP))
2344 MRRETURN(MATCH_NOMATCH);
2345 break;
2346
2347 case PT_WORD:
2348 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2349 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2350 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2351 MRRETURN(MATCH_NOMATCH);
2352 break;
2353
2354 /* This should never occur */
2355
2356 default:
2357 RRETURN(PCRE_ERROR_INTERNAL);
2358 }
2359
2360 ecode += 3;
2361 }
2362 break;
2363
2364 /* Match an extended Unicode sequence. We will get here only if the support
2365 is in the binary; otherwise a compile-time error occurs. */
2366
2367 case OP_EXTUNI:
2368 if (eptr >= md->end_subject)
2369 {
2370 SCHECK_PARTIAL();
2371 MRRETURN(MATCH_NOMATCH);
2372 }
2373 GETCHARINCTEST(c, eptr);
2374 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2375 while (eptr < md->end_subject)
2376 {
2377 int len = 1;
2378 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2379 if (UCD_CATEGORY(c) != ucp_M) break;
2380 eptr += len;
2381 }
2382 ecode++;
2383 break;
2384 #endif
2385
2386
2387 /* Match a back reference, possibly repeatedly. Look past the end of the
2388 item to see if there is repeat information following. The code is similar
2389 to that for character classes, but repeated for efficiency. Then obey
2390 similar code to character type repeats - written out again for speed.
2391 However, if the referenced string is the empty string, always treat
2392 it as matched, any number of times (otherwise there could be infinite
2393 loops). */
2394
2395 case OP_REF:
2396 case OP_REFI:
2397 caseless = op == OP_REFI;
2398 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2399 ecode += 3;
2400
2401 /* If the reference is unset, there are two possibilities:
2402
2403 (a) In the default, Perl-compatible state, set the length negative;
2404 this ensures that every attempt at a match fails. We can't just fail
2405 here, because of the possibility of quantifiers with zero minima.
2406
2407 (b) If the JavaScript compatibility flag is set, set the length to zero
2408 so that the back reference matches an empty string.
2409
2410 Otherwise, set the length to the length of what was matched by the
2411 referenced subpattern. */
2412
2413 if (offset >= offset_top || md->offset_vector[offset] < 0)
2414 length = (md->jscript_compat)? 0 : -1;
2415 else
2416 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2417
2418 /* Set up for repetition, or handle the non-repeated case */
2419
2420 switch (*ecode)
2421 {
2422 case OP_CRSTAR:
2423 case OP_CRMINSTAR:
2424 case OP_CRPLUS:
2425 case OP_CRMINPLUS:
2426 case OP_CRQUERY:
2427 case OP_CRMINQUERY:
2428 c = *ecode++ - OP_CRSTAR;
2429 minimize = (c & 1) != 0;
2430 min = rep_min[c]; /* Pick up values from tables; */
2431 max = rep_max[c]; /* zero for max => infinity */
2432 if (max == 0) max = INT_MAX;
2433 break;
2434
2435 case OP_CRRANGE:
2436 case OP_CRMINRANGE:
2437 minimize = (*ecode == OP_CRMINRANGE);
2438 min = GET2(ecode, 1);
2439 max = GET2(ecode, 3);
2440 if (max == 0) max = INT_MAX;
2441 ecode += 5;
2442 break;
2443
2444 default: /* No repeat follows */
2445 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2446 {
2447 CHECK_PARTIAL();
2448 MRRETURN(MATCH_NOMATCH);
2449 }
2450 eptr += length;
2451 continue; /* With the main loop */
2452 }
2453
2454 /* Handle repeated back references. If the length of the reference is
2455 zero, just continue with the main loop. */
2456
2457 if (length == 0) continue;
2458
2459 /* First, ensure the minimum number of matches are present. We get back
2460 the length of the reference string explicitly rather than passing the
2461 address of eptr, so that eptr can be a register variable. */
2462
2463 for (i = 1; i <= min; i++)
2464 {
2465 int slength;
2466 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2467 {
2468 CHECK_PARTIAL();
2469 MRRETURN(MATCH_NOMATCH);
2470 }
2471 eptr += slength;
2472 }
2473
2474 /* If min = max, continue at the same level without recursion.
2475 They are not both allowed to be zero. */
2476
2477 if (min == max) continue;
2478
2479 /* If minimizing, keep trying and advancing the pointer */
2480
2481 if (minimize)
2482 {
2483 for (fi = min;; fi++)
2484 {
2485 int slength;
2486 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2487 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2488 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2489 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2490 {
2491 CHECK_PARTIAL();
2492 MRRETURN(MATCH_NOMATCH);
2493 }
2494 eptr += slength;
2495 }
2496 /* Control never gets here */
2497 }
2498
2499 /* If maximizing, find the longest string and work backwards */
2500
2501 else
2502 {
2503 pp = eptr;
2504 for (i = min; i < max; i++)
2505 {
2506 int slength;
2507 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2508 {
2509 CHECK_PARTIAL();
2510 break;
2511 }
2512 eptr += slength;
2513 }
2514 while (eptr >= pp)
2515 {
2516 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2517 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2518 eptr -= length;
2519 }
2520 MRRETURN(MATCH_NOMATCH);
2521 }
2522 /* Control never gets here */
2523
2524 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2525 used when all the characters in the class have values in the range 0-255,
2526 and either the matching is caseful, or the characters are in the range
2527 0-127 when UTF-8 processing is enabled. The only difference between
2528 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2529 encountered.
2530
2531 First, look past the end of the item to see if there is repeat information
2532 following. Then obey similar code to character type repeats - written out
2533 again for speed. */
2534
2535 case OP_NCLASS:
2536 case OP_CLASS:
2537 {
2538 data = ecode + 1; /* Save for matching */
2539 ecode += 33; /* Advance past the item */
2540
2541 switch (*ecode)
2542 {
2543 case OP_CRSTAR:
2544 case OP_CRMINSTAR:
2545 case OP_CRPLUS:
2546 case OP_CRMINPLUS:
2547 case OP_CRQUERY:
2548 case OP_CRMINQUERY:
2549 c = *ecode++ - OP_CRSTAR;
2550 minimize = (c & 1) != 0;
2551 min = rep_min[c]; /* Pick up values from tables; */
2552 max = rep_max[c]; /* zero for max => infinity */
2553 if (max == 0) max = INT_MAX;
2554 break;
2555
2556 case OP_CRRANGE:
2557 case OP_CRMINRANGE:
2558 minimize = (*ecode == OP_CRMINRANGE);
2559 min = GET2(ecode, 1);
2560 max = GET2(ecode, 3);
2561 if (max == 0) max = INT_MAX;
2562 ecode += 5;
2563 break;
2564
2565 default: /* No repeat follows */
2566 min = max = 1;
2567 break;
2568 }
2569
2570 /* First, ensure the minimum number of matches are present. */
2571
2572 #ifdef SUPPORT_UTF8
2573 /* UTF-8 mode */
2574 if (utf8)
2575 {
2576 for (i = 1; i <= min; i++)
2577 {
2578 if (eptr >= md->end_subject)
2579 {
2580 SCHECK_PARTIAL();
2581 MRRETURN(MATCH_NOMATCH);
2582 }
2583 GETCHARINC(c, eptr);
2584 if (c > 255)
2585 {
2586 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2587 }
2588 else
2589 {
2590 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2591 }
2592 }
2593 }
2594 else
2595 #endif
2596 /* Not UTF-8 mode */
2597 {
2598 for (i = 1; i <= min; i++)
2599 {
2600 if (eptr >= md->end_subject)
2601 {
2602 SCHECK_PARTIAL();
2603 MRRETURN(MATCH_NOMATCH);
2604 }
2605 c = *eptr++;
2606 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2607 }
2608 }
2609
2610 /* If max == min we can continue with the main loop without the
2611 need to recurse. */
2612
2613 if (min == max) continue;
2614
2615 /* If minimizing, keep testing the rest of the expression and advancing
2616 the pointer while it matches the class. */
2617
2618 if (minimize)
2619 {
2620 #ifdef SUPPORT_UTF8
2621 /* UTF-8 mode */
2622 if (utf8)
2623 {
2624 for (fi = min;; fi++)
2625 {
2626 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2628 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2629 if (eptr >= md->end_subject)
2630 {
2631 SCHECK_PARTIAL();
2632 MRRETURN(MATCH_NOMATCH);
2633 }
2634 GETCHARINC(c, eptr);
2635 if (c > 255)
2636 {
2637 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2638 }
2639 else
2640 {
2641 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2642 }
2643 }
2644 }
2645 else
2646 #endif
2647 /* Not UTF-8 mode */
2648 {
2649 for (fi = min;; fi++)
2650 {
2651 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2652 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2654 if (eptr >= md->end_subject)
2655 {
2656 SCHECK_PARTIAL();
2657 MRRETURN(MATCH_NOMATCH);
2658 }
2659 c = *eptr++;
2660 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2661 }
2662 }
2663 /* Control never gets here */
2664 }
2665
2666 /* If maximizing, find the longest possible run, then work backwards. */
2667
2668 else
2669 {
2670 pp = eptr;
2671
2672 #ifdef SUPPORT_UTF8
2673 /* UTF-8 mode */
2674 if (utf8)
2675 {
2676 for (i = min; i < max; i++)
2677 {
2678 int len = 1;
2679 if (eptr >= md->end_subject)
2680 {
2681 SCHECK_PARTIAL();
2682 break;
2683 }
2684 GETCHARLEN(c, eptr, len);
2685 if (c > 255)
2686 {
2687 if (op == OP_CLASS) break;
2688 }
2689 else
2690 {
2691 if ((data[c/8] & (1 << (c&7))) == 0) break;
2692 }
2693 eptr += len;
2694 }
2695 for (;;)
2696 {
2697 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2698 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2699 if (eptr-- == pp) break; /* Stop if tried at original pos */
2700 BACKCHAR(eptr);
2701 }
2702 }
2703 else
2704 #endif
2705 /* Not UTF-8 mode */
2706 {
2707 for (i = min; i < max; i++)
2708 {
2709 if (eptr >= md->end_subject)
2710 {
2711 SCHECK_PARTIAL();
2712 break;
2713 }
2714 c = *eptr;
2715 if ((data[c/8] & (1 << (c&7))) == 0) break;
2716 eptr++;
2717 }
2718 while (eptr >= pp)
2719 {
2720 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722 eptr--;
2723 }
2724 }
2725
2726 MRRETURN(MATCH_NOMATCH);
2727 }
2728 }
2729 /* Control never gets here */
2730
2731
2732 /* Match an extended character class. This opcode is encountered only
2733 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2734 mode, because Unicode properties are supported in non-UTF-8 mode. */
2735
2736 #ifdef SUPPORT_UTF8
2737 case OP_XCLASS:
2738 {
2739 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2740 ecode += GET(ecode, 1); /* Advance past the item */
2741
2742 switch (*ecode)
2743 {
2744 case OP_CRSTAR:
2745 case OP_CRMINSTAR:
2746 case OP_CRPLUS:
2747 case OP_CRMINPLUS:
2748 case OP_CRQUERY:
2749 case OP_CRMINQUERY:
2750 c = *ecode++ - OP_CRSTAR;
2751 minimize = (c & 1) != 0;
2752 min = rep_min[c]; /* Pick up values from tables; */
2753 max = rep_max[c]; /* zero for max => infinity */
2754 if (max == 0) max = INT_MAX;
2755 break;
2756
2757 case OP_CRRANGE:
2758 case OP_CRMINRANGE:
2759 minimize = (*ecode == OP_CRMINRANGE);
2760 min = GET2(ecode, 1);
2761 max = GET2(ecode, 3);
2762 if (max == 0) max = INT_MAX;
2763 ecode += 5;
2764 break;
2765
2766 default: /* No repeat follows */
2767 min = max = 1;
2768 break;
2769 }
2770
2771 /* First, ensure the minimum number of matches are present. */
2772
2773 for (i = 1; i <= min; i++)
2774 {
2775 if (eptr >= md->end_subject)
2776 {
2777 SCHECK_PARTIAL();
2778 MRRETURN(MATCH_NOMATCH);
2779 }
2780 GETCHARINCTEST(c, eptr);
2781 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2782 }
2783
2784 /* If max == min we can continue with the main loop without the
2785 need to recurse. */
2786
2787 if (min == max) continue;
2788
2789 /* If minimizing, keep testing the rest of the expression and advancing
2790 the pointer while it matches the class. */
2791
2792 if (minimize)
2793 {
2794 for (fi = min;; fi++)
2795 {
2796 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2798 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2799 if (eptr >= md->end_subject)
2800 {
2801 SCHECK_PARTIAL();
2802 MRRETURN(MATCH_NOMATCH);
2803 }
2804 GETCHARINCTEST(c, eptr);
2805 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2806 }
2807 /* Control never gets here */
2808 }
2809
2810 /* If maximizing, find the longest possible run, then work backwards. */
2811
2812 else
2813 {
2814 pp = eptr;
2815 for (i = min; i < max; i++)
2816 {
2817 int len = 1;
2818 if (eptr >= md->end_subject)
2819 {
2820 SCHECK_PARTIAL();
2821 break;
2822 }
2823 GETCHARLENTEST(c, eptr, len);
2824 if (!_pcre_xclass(c, data)) break;
2825 eptr += len;
2826 }
2827 for(;;)
2828 {
2829 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2831 if (eptr-- == pp) break; /* Stop if tried at original pos */
2832 if (utf8) BACKCHAR(eptr);
2833 }
2834 MRRETURN(MATCH_NOMATCH);
2835 }
2836
2837 /* Control never gets here */
2838 }
2839 #endif /* End of XCLASS */
2840
2841 /* Match a single character, casefully */
2842
2843 case OP_CHAR:
2844 #ifdef SUPPORT_UTF8
2845 if (utf8)
2846 {
2847 length = 1;
2848 ecode++;
2849 GETCHARLEN(fc, ecode, length);
2850 if (length > md->end_subject - eptr)
2851 {
2852 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2853 MRRETURN(MATCH_NOMATCH);
2854 }
2855 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2856 }
2857 else
2858 #endif
2859
2860 /* Non-UTF-8 mode */
2861 {
2862 if (md->end_subject - eptr < 1)
2863 {
2864 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2865 MRRETURN(MATCH_NOMATCH);
2866 }
2867 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2868 ecode += 2;
2869 }
2870 break;
2871
2872 /* Match a single character, caselessly */
2873
2874 case OP_CHARI:
2875 #ifdef SUPPORT_UTF8
2876 if (utf8)
2877 {
2878 length = 1;
2879 ecode++;
2880 GETCHARLEN(fc, ecode, length);
2881
2882 if (length > md->end_subject - eptr)
2883 {
2884 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2885 MRRETURN(MATCH_NOMATCH);
2886 }
2887
2888 /* If the pattern character's value is < 128, we have only one byte, and
2889 can use the fast lookup table. */
2890
2891 if (fc < 128)
2892 {
2893 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2894 }
2895
2896 /* Otherwise we must pick up the subject character */
2897
2898 else
2899 {
2900 unsigned int dc;
2901 GETCHARINC(dc, eptr);
2902 ecode += length;
2903
2904 /* If we have Unicode property support, we can use it to test the other
2905 case of the character, if there is one. */
2906
2907 if (fc != dc)
2908 {
2909 #ifdef SUPPORT_UCP
2910 if (dc != UCD_OTHERCASE(fc))
2911 #endif
2912 MRRETURN(MATCH_NOMATCH);
2913 }
2914 }
2915 }
2916 else
2917 #endif /* SUPPORT_UTF8 */
2918
2919 /* Non-UTF-8 mode */
2920 {
2921 if (md->end_subject - eptr < 1)
2922 {
2923 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2924 MRRETURN(MATCH_NOMATCH);
2925 }
2926 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2927 ecode += 2;
2928 }
2929 break;
2930
2931 /* Match a single character repeatedly. */
2932
2933 case OP_EXACT:
2934 case OP_EXACTI:
2935 min = max = GET2(ecode, 1);
2936 ecode += 3;
2937 goto REPEATCHAR;
2938
2939 case OP_POSUPTO:
2940 case OP_POSUPTOI:
2941 possessive = TRUE;
2942 /* Fall through */
2943
2944 case OP_UPTO:
2945 case OP_UPTOI:
2946 case OP_MINUPTO:
2947 case OP_MINUPTOI:
2948 min = 0;
2949 max = GET2(ecode, 1);
2950 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2951 ecode += 3;
2952 goto REPEATCHAR;
2953
2954 case OP_POSSTAR:
2955 case OP_POSSTARI:
2956 possessive = TRUE;
2957 min = 0;
2958 max = INT_MAX;
2959 ecode++;
2960 goto REPEATCHAR;
2961
2962 case OP_POSPLUS:
2963 case OP_POSPLUSI:
2964 possessive = TRUE;
2965 min = 1;
2966 max = INT_MAX;
2967 ecode++;
2968 goto REPEATCHAR;
2969
2970 case OP_POSQUERY:
2971 case OP_POSQUERYI:
2972 possessive = TRUE;
2973 min = 0;
2974 max = 1;
2975 ecode++;
2976 goto REPEATCHAR;
2977
2978 case OP_STAR:
2979 case OP_STARI:
2980 case OP_MINSTAR:
2981 case OP_MINSTARI:
2982 case OP_PLUS:
2983 case OP_PLUSI:
2984 case OP_MINPLUS:
2985 case OP_MINPLUSI:
2986 case OP_QUERY:
2987 case OP_QUERYI:
2988 case OP_MINQUERY:
2989 case OP_MINQUERYI:
2990 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
2991 minimize = (c & 1) != 0;
2992 min = rep_min[c]; /* Pick up values from tables; */
2993 max = rep_max[c]; /* zero for max => infinity */
2994 if (max == 0) max = INT_MAX;
2995
2996 /* Common code for all repeated single-character matches. */
2997
2998 REPEATCHAR:
2999 #ifdef SUPPORT_UTF8
3000 if (utf8)
3001 {
3002 length = 1;
3003 charptr = ecode;
3004 GETCHARLEN(fc, ecode, length);
3005 ecode += length;
3006
3007 /* Handle multibyte character matching specially here. There is
3008 support for caseless matching if UCP support is present. */
3009
3010 if (length > 1)
3011 {
3012 #ifdef SUPPORT_UCP
3013 unsigned int othercase;
3014 if (op >= OP_STARI && /* Caseless */
3015 (othercase = UCD_OTHERCASE(fc)) != fc)
3016 oclength = _pcre_ord2utf8(othercase, occhars);
3017 else oclength = 0;
3018 #endif /* SUPPORT_UCP */
3019
3020 for (i = 1; i <= min; i++)
3021 {
3022 if (eptr <= md->end_subject - length &&
3023 memcmp(eptr, charptr, length) == 0) eptr += length;
3024 #ifdef SUPPORT_UCP
3025 else if (oclength > 0 &&
3026 eptr <= md->end_subject - oclength &&
3027 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3028 #endif /* SUPPORT_UCP */
3029 else
3030 {
3031 CHECK_PARTIAL();
3032 MRRETURN(MATCH_NOMATCH);
3033 }
3034 }
3035
3036 if (min == max) continue;
3037
3038 if (minimize)
3039 {
3040 for (fi = min;; fi++)
3041 {
3042 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3044 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3045 if (eptr <= md->end_subject - length &&
3046 memcmp(eptr, charptr, length) == 0) eptr += length;
3047 #ifdef SUPPORT_UCP
3048 else if (oclength > 0 &&
3049 eptr <= md->end_subject - oclength &&
3050 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3051 #endif /* SUPPORT_UCP */
3052 else
3053 {
3054 CHECK_PARTIAL();
3055 MRRETURN(MATCH_NOMATCH);
3056 }
3057 }
3058 /* Control never gets here */
3059 }
3060
3061 else /* Maximize */
3062 {
3063 pp = eptr;
3064 for (i = min; i < max; i++)
3065 {
3066 if (eptr <= md->end_subject - length &&
3067 memcmp(eptr, charptr, length) == 0) eptr += length;
3068 #ifdef SUPPORT_UCP
3069 else if (oclength > 0 &&
3070 eptr <= md->end_subject - oclength &&
3071 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3072 #endif /* SUPPORT_UCP */
3073 else
3074 {
3075 CHECK_PARTIAL();
3076 break;
3077 }
3078 }
3079
3080 if (possessive) continue;
3081
3082 for(;;)
3083 {
3084 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3085 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3086 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3087 #ifdef SUPPORT_UCP
3088 eptr--;
3089 BACKCHAR(eptr);
3090 #else /* without SUPPORT_UCP */
3091 eptr -= length;
3092 #endif /* SUPPORT_UCP */
3093 }
3094 }
3095 /* Control never gets here */
3096 }
3097
3098 /* If the length of a UTF-8 character is 1, we fall through here, and
3099 obey the code as for non-UTF-8 characters below, though in this case the
3100 value of fc will always be < 128. */
3101 }
3102 else
3103 #endif /* SUPPORT_UTF8 */
3104
3105 /* When not in UTF-8 mode, load a single-byte character. */
3106
3107 fc = *ecode++;
3108
3109 /* The value of fc at this point is always less than 256, though we may or
3110 may not be in UTF-8 mode. The code is duplicated for the caseless and
3111 caseful cases, for speed, since matching characters is likely to be quite
3112 common. First, ensure the minimum number of matches are present. If min =
3113 max, continue at the same level without recursing. Otherwise, if
3114 minimizing, keep trying the rest of the expression and advancing one
3115 matching character if failing, up to the maximum. Alternatively, if
3116 maximizing, find the maximum number of characters and work backwards. */
3117
3118 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3119 max, eptr));
3120
3121 if (op >= OP_STARI) /* Caseless */
3122 {
3123 fc = md->lcc[fc];
3124 for (i = 1; i <= min; i++)
3125 {
3126 if (eptr >= md->end_subject)
3127 {
3128 SCHECK_PARTIAL();
3129 MRRETURN(MATCH_NOMATCH);
3130 }
3131 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3132 }
3133 if (min == max) continue;
3134 if (minimize)
3135 {
3136 for (fi = min;; fi++)
3137 {
3138 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3139 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3140 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3141 if (eptr >= md->end_subject)
3142 {
3143 SCHECK_PARTIAL();
3144 MRRETURN(MATCH_NOMATCH);
3145 }
3146 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3147 }
3148 /* Control never gets here */
3149 }
3150 else /* Maximize */
3151 {
3152 pp = eptr;
3153 for (i = min; i < max; i++)
3154 {
3155 if (eptr >= md->end_subject)
3156 {
3157 SCHECK_PARTIAL();
3158 break;
3159 }
3160 if (fc != md->lcc[*eptr]) break;
3161 eptr++;
3162 }
3163
3164 if (possessive) continue;
3165
3166 while (eptr >= pp)
3167 {
3168 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3169 eptr--;
3170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3171 }
3172 MRRETURN(MATCH_NOMATCH);
3173 }
3174 /* Control never gets here */
3175 }
3176
3177 /* Caseful comparisons (includes all multi-byte characters) */
3178
3179 else
3180 {
3181 for (i = 1; i <= min; i++)
3182 {
3183 if (eptr >= md->end_subject)
3184 {
3185 SCHECK_PARTIAL();
3186 MRRETURN(MATCH_NOMATCH);
3187 }
3188 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3189 }
3190
3191 if (min == max) continue;
3192
3193 if (minimize)
3194 {
3195 for (fi = min;; fi++)
3196 {
3197 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3198 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3199 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3200 if (eptr >= md->end_subject)
3201 {
3202 SCHECK_PARTIAL();
3203 MRRETURN(MATCH_NOMATCH);
3204 }
3205 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3206 }
3207 /* Control never gets here */
3208 }
3209 else /* Maximize */
3210 {
3211 pp = eptr;
3212 for (i = min; i < max; i++)
3213 {
3214 if (eptr >= md->end_subject)
3215 {
3216 SCHECK_PARTIAL();
3217 break;
3218 }
3219 if (fc != *eptr) break;
3220 eptr++;
3221 }
3222 if (possessive) continue;
3223
3224 while (eptr >= pp)
3225 {
3226 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3227 eptr--;
3228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3229 }
3230 MRRETURN(MATCH_NOMATCH);
3231 }
3232 }
3233 /* Control never gets here */
3234
3235 /* Match a negated single one-byte character. The subject character we are
3236 checking can be multibyte. */
3237
3238 case OP_NOT:
3239 case OP_NOTI:
3240 if (eptr >= md->end_subject)
3241 {
3242 SCHECK_PARTIAL();
3243 MRRETURN(MATCH_NOMATCH);
3244 }
3245 ecode++;
3246 GETCHARINCTEST(c, eptr);
3247 if (op == OP_NOTI) /* The caseless case */
3248 {
3249 #ifdef SUPPORT_UTF8
3250 if (c < 256)
3251 #endif
3252 c = md->lcc[c];
3253 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3254 }
3255 else /* Caseful */
3256 {
3257 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3258 }
3259 break;
3260
3261 /* Match a negated single one-byte character repeatedly. This is almost a
3262 repeat of the code for a repeated single character, but I haven't found a
3263 nice way of commoning these up that doesn't require a test of the
3264 positive/negative option for each character match. Maybe that wouldn't add
3265 very much to the time taken, but character matching *is* what this is all
3266 about... */
3267
3268 case OP_NOTEXACT:
3269 case OP_NOTEXACTI:
3270 min = max = GET2(ecode, 1);
3271 ecode += 3;
3272 goto REPEATNOTCHAR;
3273
3274 case OP_NOTUPTO:
3275 case OP_NOTUPTOI:
3276 case OP_NOTMINUPTO:
3277 case OP_NOTMINUPTOI:
3278 min = 0;
3279 max = GET2(ecode, 1);
3280 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3281 ecode += 3;
3282 goto REPEATNOTCHAR;
3283
3284 case OP_NOTPOSSTAR:
3285 case OP_NOTPOSSTARI:
3286 possessive = TRUE;
3287 min = 0;
3288 max = INT_MAX;
3289 ecode++;
3290 goto REPEATNOTCHAR;
3291
3292 case OP_NOTPOSPLUS:
3293 case OP_NOTPOSPLUSI:
3294 possessive = TRUE;
3295 min = 1;
3296 max = INT_MAX;
3297 ecode++;
3298 goto REPEATNOTCHAR;
3299
3300 case OP_NOTPOSQUERY:
3301 case OP_NOTPOSQUERYI:
3302 possessive = TRUE;
3303 min = 0;
3304 max = 1;
3305 ecode++;
3306 goto REPEATNOTCHAR;
3307
3308 case OP_NOTPOSUPTO:
3309 case OP_NOTPOSUPTOI:
3310 possessive = TRUE;
3311 min = 0;
3312 max = GET2(ecode, 1);
3313 ecode += 3;
3314 goto REPEATNOTCHAR;
3315
3316 case OP_NOTSTAR:
3317 case OP_NOTSTARI:
3318 case OP_NOTMINSTAR:
3319 case OP_NOTMINSTARI:
3320 case OP_NOTPLUS:
3321 case OP_NOTPLUSI:
3322 case OP_NOTMINPLUS:
3323 case OP_NOTMINPLUSI:
3324 case OP_NOTQUERY:
3325 case OP_NOTQUERYI:
3326 case OP_NOTMINQUERY:
3327 case OP_NOTMINQUERYI:
3328 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3329 minimize = (c & 1) != 0;
3330 min = rep_min[c]; /* Pick up values from tables; */
3331 max = rep_max[c]; /* zero for max => infinity */
3332 if (max == 0) max = INT_MAX;
3333
3334 /* Common code for all repeated negated single-byte matches. */
3335
3336 REPEATNOTCHAR:
3337 fc = *ecode++;
3338
3339 /* The code is duplicated for the caseless and caseful cases, for speed,
3340 since matching characters is likely to be quite common. First, ensure the
3341 minimum number of matches are present. If min = max, continue at the same
3342 level without recursing. Otherwise, if minimizing, keep trying the rest of
3343 the expression and advancing one matching character if failing, up to the
3344 maximum. Alternatively, if maximizing, find the maximum number of
3345 characters and work backwards. */
3346
3347 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3348 max, eptr));
3349
3350 if (op >= OP_NOTSTARI) /* Caseless */
3351 {
3352 fc = md->lcc[fc];
3353
3354 #ifdef SUPPORT_UTF8
3355 /* UTF-8 mode */
3356 if (utf8)
3357 {
3358 register unsigned int d;
3359 for (i = 1; i <= min; i++)
3360 {
3361 if (eptr >= md->end_subject)
3362 {
3363 SCHECK_PARTIAL();
3364 MRRETURN(MATCH_NOMATCH);
3365 }
3366 GETCHARINC(d, eptr);
3367 if (d < 256) d = md->lcc[d];
3368 if (fc == d) MRRETURN(MATCH_NOMATCH);
3369 }
3370 }
3371 else
3372 #endif
3373
3374 /* Not UTF-8 mode */
3375 {
3376 for (i = 1; i <= min; i++)
3377 {
3378 if (eptr >= md->end_subject)
3379 {
3380 SCHECK_PARTIAL();
3381 MRRETURN(MATCH_NOMATCH);
3382 }
3383 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3384 }
3385 }
3386
3387 if (min == max) continue;
3388
3389 if (minimize)
3390 {
3391 #ifdef SUPPORT_UTF8
3392 /* UTF-8 mode */
3393 if (utf8)
3394 {
3395 register unsigned int d;
3396 for (fi = min;; fi++)
3397 {
3398 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3399 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3400 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3401 if (eptr >= md->end_subject)
3402 {
3403 SCHECK_PARTIAL();
3404 MRRETURN(MATCH_NOMATCH);
3405 }
3406 GETCHARINC(d, eptr);
3407 if (d < 256) d = md->lcc[d];
3408 if (fc == d) MRRETURN(MATCH_NOMATCH);
3409 }
3410 }
3411 else
3412 #endif
3413 /* Not UTF-8 mode */
3414 {
3415 for (fi = min;; fi++)
3416 {
3417 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3418 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3419 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3420 if (eptr >= md->end_subject)
3421 {
3422 SCHECK_PARTIAL();
3423 MRRETURN(MATCH_NOMATCH);
3424 }
3425 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3426 }
3427 }
3428 /* Control never gets here */
3429 }
3430
3431 /* Maximize case */
3432
3433 else
3434 {
3435 pp = eptr;
3436
3437 #ifdef SUPPORT_UTF8
3438 /* UTF-8 mode */
3439 if (utf8)
3440 {
3441 register unsigned int d;
3442 for (i = min; i < max; i++)
3443 {
3444 int len = 1;
3445 if (eptr >= md->end_subject)
3446 {
3447 SCHECK_PARTIAL();
3448 break;
3449 }
3450 GETCHARLEN(d, eptr, len);
3451 if (d < 256) d = md->lcc[d];
3452 if (fc == d) break;
3453 eptr += len;
3454 }
3455 if (possessive) continue;
3456 for(;;)
3457 {
3458 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3460 if (eptr-- == pp) break; /* Stop if tried at original pos */
3461 BACKCHAR(eptr);
3462 }
3463 }
3464 else
3465 #endif
3466 /* Not UTF-8 mode */
3467 {
3468 for (i = min; i < max; i++)
3469 {
3470 if (eptr >= md->end_subject)
3471 {
3472 SCHECK_PARTIAL();
3473 break;
3474 }
3475 if (fc == md->lcc[*eptr]) break;
3476 eptr++;
3477 }
3478 if (possessive) continue;
3479 while (eptr >= pp)
3480 {
3481 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3482 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3483 eptr--;
3484 }
3485 }
3486
3487 MRRETURN(MATCH_NOMATCH);
3488 }
3489 /* Control never gets here */
3490 }
3491
3492 /* Caseful comparisons */
3493
3494 else
3495 {
3496 #ifdef SUPPORT_UTF8
3497 /* UTF-8 mode */
3498 if (utf8)
3499 {
3500 register unsigned int d;
3501 for (i = 1; i <= min; i++)
3502 {
3503 if (eptr >= md->end_subject)
3504 {
3505 SCHECK_PARTIAL();
3506 MRRETURN(MATCH_NOMATCH);
3507 }
3508 GETCHARINC(d, eptr);
3509 if (fc == d) MRRETURN(MATCH_NOMATCH);
3510 }
3511 }
3512 else
3513 #endif
3514 /* Not UTF-8 mode */
3515 {
3516 for (i = 1; i <= min; i++)
3517 {
3518 if (eptr >= md->end_subject)
3519 {
3520 SCHECK_PARTIAL();
3521 MRRETURN(MATCH_NOMATCH);
3522 }
3523 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3524 }
3525 }
3526
3527 if (min == max) continue;
3528
3529 if (minimize)
3530 {
3531 #ifdef SUPPORT_UTF8
3532 /* UTF-8 mode */
3533 if (utf8)
3534 {
3535 register unsigned int d;
3536 for (fi = min;; fi++)
3537 {
3538 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3540 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3541 if (eptr >= md->end_subject)
3542 {
3543 SCHECK_PARTIAL();
3544 MRRETURN(MATCH_NOMATCH);
3545 }
3546 GETCHARINC(d, eptr);
3547 if (fc == d) MRRETURN(MATCH_NOMATCH);
3548 }
3549 }
3550 else
3551 #endif
3552 /* Not UTF-8 mode */
3553 {
3554 for (fi = min;; fi++)
3555 {
3556 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3558 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3559 if (eptr >= md->end_subject)
3560 {
3561 SCHECK_PARTIAL();
3562 MRRETURN(MATCH_NOMATCH);
3563 }
3564 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3565 }
3566 }
3567 /* Control never gets here */
3568 }
3569
3570 /* Maximize case */
3571
3572 else
3573 {
3574 pp = eptr;
3575
3576 #ifdef SUPPORT_UTF8
3577 /* UTF-8 mode */
3578 if (utf8)
3579 {
3580 register unsigned int d;
3581 for (i = min; i < max; i++)
3582 {
3583 int len = 1;
3584 if (eptr >= md->end_subject)
3585 {
3586 SCHECK_PARTIAL();
3587 break;
3588 }
3589 GETCHARLEN(d, eptr, len);
3590 if (fc == d) break;
3591 eptr += len;
3592 }
3593 if (possessive) continue;
3594 for(;;)
3595 {
3596 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3598 if (eptr-- == pp) break; /* Stop if tried at original pos */
3599 BACKCHAR(eptr);
3600 }
3601 }
3602 else
3603 #endif
3604 /* Not UTF-8 mode */
3605 {
3606 for (i = min; i < max; i++)
3607 {
3608 if (eptr >= md->end_subject)
3609 {
3610 SCHECK_PARTIAL();
3611 break;
3612 }
3613 if (fc == *eptr) break;
3614 eptr++;
3615 }
3616 if (possessive) continue;
3617 while (eptr >= pp)
3618 {
3619 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3620 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3621 eptr--;
3622 }
3623 }
3624
3625 MRRETURN(MATCH_NOMATCH);
3626 }
3627 }
3628 /* Control never gets here */
3629
3630 /* Match a single character type repeatedly; several different opcodes
3631 share code. This is very similar to the code for single characters, but we
3632 repeat it in the interests of efficiency. */
3633
3634 case OP_TYPEEXACT:
3635 min = max = GET2(ecode, 1);
3636 minimize = TRUE;
3637 ecode += 3;
3638 goto REPEATTYPE;
3639
3640 case OP_TYPEUPTO:
3641 case OP_TYPEMINUPTO:
3642 min = 0;
3643 max = GET2(ecode, 1);
3644 minimize = *ecode == OP_TYPEMINUPTO;
3645 ecode += 3;
3646 goto REPEATTYPE;
3647
3648 case OP_TYPEPOSSTAR:
3649 possessive = TRUE;
3650 min = 0;
3651 max = INT_MAX;
3652 ecode++;
3653 goto REPEATTYPE;
3654
3655 case OP_TYPEPOSPLUS:
3656 possessive = TRUE;
3657 min = 1;
3658 max = INT_MAX;
3659 ecode++;
3660 goto REPEATTYPE;
3661
3662 case OP_TYPEPOSQUERY:
3663 possessive = TRUE;
3664 min = 0;
3665 max = 1;
3666 ecode++;
3667 goto REPEATTYPE;
3668
3669 case OP_TYPEPOSUPTO:
3670 possessive = TRUE;
3671 min = 0;
3672 max = GET2(ecode, 1);
3673 ecode += 3;
3674 goto REPEATTYPE;
3675
3676 case OP_TYPESTAR:
3677 case OP_TYPEMINSTAR:
3678 case OP_TYPEPLUS:
3679 case OP_TYPEMINPLUS:
3680 case OP_TYPEQUERY:
3681 case OP_TYPEMINQUERY:
3682 c = *ecode++ - OP_TYPESTAR;
3683 minimize = (c & 1) != 0;
3684 min = rep_min[c]; /* Pick up values from tables; */
3685 max = rep_max[c]; /* zero for max => infinity */
3686 if (max == 0) max = INT_MAX;
3687
3688 /* Common code for all repeated single character type matches. Note that
3689 in UTF-8 mode, '.' matches a character of any length, but for the other
3690 character types, the valid characters are all one-byte long. */
3691
3692 REPEATTYPE:
3693 ctype = *ecode++; /* Code for the character type */
3694
3695 #ifdef SUPPORT_UCP
3696 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3697 {
3698 prop_fail_result = ctype == OP_NOTPROP;
3699 prop_type = *ecode++;
3700 prop_value = *ecode++;
3701 }
3702 else prop_type = -1;
3703 #endif
3704
3705 /* First, ensure the minimum number of matches are present. Use inline
3706 code for maximizing the speed, and do the type test once at the start
3707 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3708 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3709 and single-bytes. */
3710
3711 if (min > 0)
3712 {
3713 #ifdef SUPPORT_UCP
3714 if (prop_type >= 0)
3715 {
3716 switch(prop_type)
3717 {
3718 case PT_ANY:
3719 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3720 for (i = 1; i <= min; i++)
3721 {
3722 if (eptr >= md->end_subject)
3723 {
3724 SCHECK_PARTIAL();
3725 MRRETURN(MATCH_NOMATCH);
3726 }
3727 GETCHARINCTEST(c, eptr);
3728 }
3729 break;
3730
3731 case PT_LAMP:
3732 for (i = 1; i <= min; i++)
3733 {
3734 int chartype;
3735 if (eptr >= md->end_subject)
3736 {
3737 SCHECK_PARTIAL();
3738 MRRETURN(MATCH_NOMATCH);
3739 }
3740 GETCHARINCTEST(c, eptr);
3741 chartype = UCD_CHARTYPE(c);
3742 if ((chartype == ucp_Lu ||
3743 chartype == ucp_Ll ||
3744 chartype == ucp_Lt) == prop_fail_result)
3745 MRRETURN(MATCH_NOMATCH);
3746 }
3747 break;
3748
3749 case PT_GC:
3750 for (i = 1; i <= min; i++)
3751 {
3752 if (eptr >= md->end_subject)
3753 {
3754 SCHECK_PARTIAL();
3755 MRRETURN(MATCH_NOMATCH);
3756 }
3757 GETCHARINCTEST(c, eptr);
3758 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3759 MRRETURN(MATCH_NOMATCH);
3760 }
3761 break;
3762
3763 case PT_PC:
3764 for (i = 1; i <= min; i++)
3765 {
3766 if (eptr >= md->end_subject)
3767 {
3768 SCHECK_PARTIAL();
3769 MRRETURN(MATCH_NOMATCH);
3770 }
3771 GETCHARINCTEST(c, eptr);
3772 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3773 MRRETURN(MATCH_NOMATCH);
3774 }
3775 break;
3776
3777 case PT_SC:
3778 for (i = 1; i <= min; i++)
3779 {
3780 if (eptr >= md->end_subject)
3781 {
3782 SCHECK_PARTIAL();
3783 MRRETURN(MATCH_NOMATCH);
3784 }
3785 GETCHARINCTEST(c, eptr);
3786 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3787 MRRETURN(MATCH_NOMATCH);
3788 }
3789 break;
3790
3791 case PT_ALNUM:
3792 for (i = 1; i <= min; i++)
3793 {
3794 int category;
3795 if (eptr >= md->end_subject)
3796 {
3797 SCHECK_PARTIAL();
3798 MRRETURN(MATCH_NOMATCH);
3799 }
3800 GETCHARINCTEST(c, eptr);
3801 category = UCD_CATEGORY(c);
3802 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3803 MRRETURN(MATCH_NOMATCH);
3804 }
3805 break;
3806
3807 case PT_SPACE: /* Perl space */
3808 for (i = 1; i <= min; i++)
3809 {
3810 if (eptr >= md->end_subject)
3811 {
3812 SCHECK_PARTIAL();
3813 MRRETURN(MATCH_NOMATCH);
3814 }
3815 GETCHARINCTEST(c, eptr);
3816 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3817 c == CHAR_FF || c == CHAR_CR)
3818 == prop_fail_result)
3819 MRRETURN(MATCH_NOMATCH);
3820 }
3821 break;
3822
3823 case PT_PXSPACE: /* POSIX space */
3824 for (i = 1; i <= min; i++)
3825 {
3826 if (eptr >= md->end_subject)
3827 {
3828 SCHECK_PARTIAL();
3829 MRRETURN(MATCH_NOMATCH);
3830 }
3831 GETCHARINCTEST(c, eptr);
3832 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3833 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3834 == prop_fail_result)
3835 MRRETURN(MATCH_NOMATCH);
3836 }
3837 break;
3838
3839 case PT_WORD:
3840 for (i = 1; i <= min; i++)
3841 {
3842 int category;
3843 if (eptr >= md->end_subject)
3844 {
3845 SCHECK_PARTIAL();
3846 MRRETURN(MATCH_NOMATCH);
3847 }
3848 GETCHARINCTEST(c, eptr);
3849 category = UCD_CATEGORY(c);
3850 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
3851 == prop_fail_result)
3852 MRRETURN(MATCH_NOMATCH);
3853 }
3854 break;
3855
3856 /* This should not occur */
3857
3858 default:
3859 RRETURN(PCRE_ERROR_INTERNAL);
3860 }
3861 }
3862
3863 /* Match extended Unicode sequences. We will get here only if the
3864 support is in the binary; otherwise a compile-time error occurs. */
3865
3866 else if (ctype == OP_EXTUNI)
3867 {
3868 for (i = 1; i <= min; i++)
3869 {
3870 if (eptr >= md->end_subject)
3871 {
3872 SCHECK_PARTIAL();
3873 MRRETURN(MATCH_NOMATCH);
3874 }
3875 GETCHARINCTEST(c, eptr);
3876 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
3877 while (eptr < md->end_subject)
3878 {
3879 int len = 1;
3880 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
3881 if (UCD_CATEGORY(c) != ucp_M) break;
3882 eptr += len;
3883 }
3884 }
3885 }
3886
3887 else
3888 #endif /* SUPPORT_UCP */
3889
3890 /* Handle all other cases when the coding is UTF-8 */
3891
3892 #ifdef SUPPORT_UTF8
3893 if (utf8) switch(ctype)
3894 {
3895 case OP_ANY:
3896 for (i = 1; i <= min; i++)
3897 {
3898 if (eptr >= md->end_subject)
3899 {
3900 SCHECK_PARTIAL();
3901 MRRETURN(MATCH_NOMATCH);
3902 }
3903 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3904 eptr++;
3905 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3906 }
3907 break;
3908
3909 case OP_ALLANY:
3910 for (i = 1; i <= min; i++)
3911 {
3912 if (eptr >= md->end_subject)
3913 {
3914 SCHECK_PARTIAL();
3915 MRRETURN(MATCH_NOMATCH);
3916 }
3917 eptr++;
3918 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3919 }
3920 break;
3921
3922 case OP_ANYBYTE:
3923 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3924 eptr += min;
3925 break;
3926
3927 case OP_ANYNL:
3928 for (i = 1; i <= min; i++)
3929 {
3930 if (eptr >= md->end_subject)
3931 {
3932 SCHECK_PARTIAL();
3933 MRRETURN(MATCH_NOMATCH);
3934 }
3935 GETCHARINC(c, eptr);
3936 switch(c)
3937 {
3938 default: MRRETURN(MATCH_NOMATCH);
3939
3940 case 0x000d:
3941 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3942 break;
3943
3944 case 0x000a:
3945 break;
3946
3947 case 0x000b:
3948 case 0x000c:
3949 case 0x0085:
3950 case 0x2028:
3951 case 0x2029:
3952 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3953 break;
3954 }
3955 }
3956 break;
3957
3958 case OP_NOT_HSPACE:
3959 for (i = 1; i <= min; i++)
3960 {
3961 if (eptr >= md->end_subject)
3962 {
3963 SCHECK_PARTIAL();
3964 MRRETURN(MATCH_NOMATCH);
3965 }
3966 GETCHARINC(c, eptr);
3967 switch(c)
3968 {
3969 default: break;
3970 case 0x09: /* HT */
3971 case 0x20: /* SPACE */
3972 case 0xa0: /* NBSP */
3973 case 0x1680: /* OGHAM SPACE MARK */
3974 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3975 case 0x2000: /* EN QUAD */
3976 case 0x2001: /* EM QUAD */
3977 case 0x2002: /* EN SPACE */
3978 case 0x2003: /* EM SPACE */
3979 case 0x2004: /* THREE-PER-EM SPACE */
3980 case 0x2005: /* FOUR-PER-EM SPACE */
3981 case 0x2006: /* SIX-PER-EM SPACE */
3982 case 0x2007: /* FIGURE SPACE */
3983 case 0x2008: /* PUNCTUATION SPACE */
3984 case 0x2009: /* THIN SPACE */
3985 case 0x200A: /* HAIR SPACE */
3986 case 0x202f: /* NARROW NO-BREAK SPACE */
3987 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3988 case 0x3000: /* IDEOGRAPHIC SPACE */
3989 MRRETURN(MATCH_NOMATCH);
3990 }
3991 }
3992 break;
3993
3994 case OP_HSPACE:
3995 for (i = 1; i <= min; i++)
3996 {
3997 if (eptr >= md->end_subject)
3998 {
3999 SCHECK_PARTIAL();
4000 MRRETURN(MATCH_NOMATCH);
4001 }
4002 GETCHARINC(c, eptr);
4003 switch(c)
4004 {
4005 default: MRRETURN(MATCH_NOMATCH);
4006 case 0x09: /* HT */
4007 case 0x20: /* SPACE */
4008 case 0xa0: /* NBSP */
4009 case 0x1680: /* OGHAM SPACE MARK */
4010 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4011 case 0x2000: /* EN QUAD */
4012 case 0x2001: /* EM QUAD */
4013 case 0x2002: /* EN SPACE */
4014 case 0x2003: /* EM SPACE */
4015 case 0x2004: /* THREE-PER-EM SPACE */
4016 case 0x2005: /* FOUR-PER-EM SPACE */
4017 case 0x2006: /* SIX-PER-EM SPACE */
4018 case 0x2007: /* FIGURE SPACE */
4019 case 0x2008: /* PUNCTUATION SPACE */
4020 case 0x2009: /* THIN SPACE */
4021 case 0x200A: /* HAIR SPACE */
4022 case 0x202f: /* NARROW NO-BREAK SPACE */
4023 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4024 case 0x3000: /* IDEOGRAPHIC SPACE */
4025 break;
4026 }
4027 }
4028 break;
4029
4030 case OP_NOT_VSPACE:
4031 for (i = 1; i <= min; i++)
4032 {
4033 if (eptr >= md->end_subject)
4034 {
4035 SCHECK_PARTIAL();
4036 MRRETURN(MATCH_NOMATCH);
4037 }
4038 GETCHARINC(c, eptr);
4039 switch(c)
4040 {
4041 default: break;
4042 case 0x0a: /* LF */
4043 case 0x0b: /* VT */
4044 case 0x0c: /* FF */
4045 case 0x0d: /* CR */
4046 case 0x85: /* NEL */
4047 case 0x2028: /* LINE SEPARATOR */
4048 case 0x2029: /* PARAGRAPH SEPARATOR */
4049 MRRETURN(MATCH_NOMATCH);
4050 }
4051 }
4052 break;
4053
4054 case OP_VSPACE:
4055 for (i = 1; i <= min; i++)
4056 {
4057 if (eptr >= md->end_subject)
4058 {
4059 SCHECK_PARTIAL();
4060 MRRETURN(MATCH_NOMATCH);
4061 }
4062 GETCHARINC(c, eptr);
4063 switch(c)
4064 {
4065 default: MRRETURN(MATCH_NOMATCH);
4066 case 0x0a: /* LF */
4067 case 0x0b: /* VT */
4068 case 0x0c: /* FF */
4069 case 0x0d: /* CR */
4070 case 0x85: /* NEL */
4071 case 0x2028: /* LINE SEPARATOR */
4072 case 0x2029: /* PARAGRAPH SEPARATOR */
4073 break;
4074 }
4075 }
4076 break;
4077
4078 case OP_NOT_DIGIT:
4079 for (i = 1; i <= min; i++)
4080 {
4081 if (eptr >= md->end_subject)
4082 {
4083 SCHECK_PARTIAL();
4084 MRRETURN(MATCH_NOMATCH);
4085 }
4086 GETCHARINC(c, eptr);
4087 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4088 MRRETURN(MATCH_NOMATCH);
4089 }
4090 break;
4091
4092 case OP_DIGIT:
4093 for (i = 1; i <= min; i++)
4094 {
4095 if (eptr >= md->end_subject)
4096 {
4097 SCHECK_PARTIAL();
4098 MRRETURN(MATCH_NOMATCH);
4099 }
4100 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4101 MRRETURN(MATCH_NOMATCH);
4102 /* No need to skip more bytes - we know it's a 1-byte character */
4103 }
4104 break;
4105
4106 case OP_NOT_WHITESPACE:
4107 for (i = 1; i <= min; i++)
4108 {
4109 if (eptr >= md->end_subject)
4110 {
4111 SCHECK_PARTIAL();
4112 MRRETURN(MATCH_NOMATCH);
4113 }
4114 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4115 MRRETURN(MATCH_NOMATCH);
4116 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4117 }
4118 break;
4119
4120 case OP_WHITESPACE:
4121 for (i = 1; i <= min; i++)
4122 {
4123 if (eptr >= md->end_subject)
4124 {
4125 SCHECK_PARTIAL();
4126 MRRETURN(MATCH_NOMATCH);
4127 }
4128 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4129 MRRETURN(MATCH_NOMATCH);
4130 /* No need to skip more bytes - we know it's a 1-byte character */
4131 }
4132 break;
4133
4134 case OP_NOT_WORDCHAR:
4135 for (i = 1; i <= min; i++)
4136 {
4137 if (eptr >= md->end_subject)
4138 {
4139 SCHECK_PARTIAL();
4140 MRRETURN(MATCH_NOMATCH);
4141 }
4142 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4143 MRRETURN(MATCH_NOMATCH);
4144 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4145 }
4146 break;
4147
4148 case OP_WORDCHAR:
4149 for (i = 1; i <= min; i++)
4150 {
4151 if (eptr >= md->end_subject)
4152 {
4153 SCHECK_PARTIAL();
4154 MRRETURN(MATCH_NOMATCH);
4155 }
4156 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4157 MRRETURN(MATCH_NOMATCH);
4158 /* No need to skip more bytes - we know it's a 1-byte character */
4159 }
4160 break;
4161
4162 default:
4163 RRETURN(PCRE_ERROR_INTERNAL);
4164 } /* End switch(ctype) */
4165
4166 else
4167 #endif /* SUPPORT_UTF8 */
4168
4169 /* Code for the non-UTF-8 case for minimum matching of operators other
4170 than OP_PROP and OP_NOTPROP. */
4171
4172 switch(ctype)
4173 {
4174 case OP_ANY:
4175 for (i = 1; i <= min; i++)
4176 {
4177 if (eptr >= md->end_subject)
4178 {
4179 SCHECK_PARTIAL();
4180 MRRETURN(MATCH_NOMATCH);
4181 }
4182 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4183 eptr++;
4184 }
4185 break;
4186
4187 case OP_ALLANY:
4188 if (eptr > md->end_subject - min)
4189 {
4190 SCHECK_PARTIAL();
4191 MRRETURN(MATCH_NOMATCH);
4192 }
4193 eptr += min;
4194 break;
4195
4196 case OP_ANYBYTE:
4197 if (eptr > md->end_subject - min)
4198 {
4199 SCHECK_PARTIAL();
4200 MRRETURN(MATCH_NOMATCH);
4201 }
4202 eptr += min;
4203 break;
4204
4205 case OP_ANYNL:
4206 for (i = 1; i <= min; i++)
4207 {
4208 if (eptr >= md->end_subject)
4209 {
4210 SCHECK_PARTIAL();
4211 MRRETURN(MATCH_NOMATCH);
4212 }
4213 switch(*eptr++)
4214 {
4215 default: MRRETURN(MATCH_NOMATCH);
4216
4217 case 0x000d:
4218 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4219 break;
4220
4221 case 0x000a:
4222 break;
4223
4224 case 0x000b:
4225 case 0x000c:
4226 case 0x0085:
4227 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4228 break;
4229 }
4230 }
4231 break;
4232
4233 case OP_NOT_HSPACE:
4234 for (i = 1; i <= min; i++)
4235 {
4236 if (eptr >= md->end_subject)
4237 {
4238 SCHECK_PARTIAL();
4239 MRRETURN(MATCH_NOMATCH);
4240 }
4241 switch(*eptr++)
4242 {
4243 default: break;
4244 case 0x09: /* HT */
4245 case 0x20: /* SPACE */
4246 case 0xa0: /* NBSP */
4247 MRRETURN(MATCH_NOMATCH);
4248 }
4249 }
4250 break;
4251
4252 case OP_HSPACE:
4253 for (i = 1; i <= min; i++)
4254 {
4255 if (eptr >= md->end_subject)
4256 {
4257 SCHECK_PARTIAL();
4258 MRRETURN(MATCH_NOMATCH);
4259 }
4260 switch(*eptr++)
4261 {
4262 default: MRRETURN(MATCH_NOMATCH);
4263 case 0x09: /* HT */
4264 case 0x20: /* SPACE */
4265 case 0xa0: /* NBSP */
4266 break;
4267 }
4268 }
4269 break;
4270
4271 case OP_NOT_VSPACE:
4272 for (i = 1; i <= min; i++)
4273 {
4274 if (eptr >= md->end_subject)
4275 {
4276 SCHECK_PARTIAL();
4277 MRRETURN(MATCH_NOMATCH);
4278 }
4279 switch(*eptr++)
4280 {
4281 default: break;
4282 case 0x0a: /* LF */
4283 case 0x0b: /* VT */
4284 case 0x0c: /* FF */
4285 case 0x0d: /* CR */
4286 case 0x85: /* NEL */
4287 MRRETURN(MATCH_NOMATCH);
4288 }
4289 }
4290 break;
4291
4292 case OP_VSPACE:
4293 for (i = 1; i <= min; i++)
4294 {
4295 if (eptr >= md->end_subject)
4296 {
4297 SCHECK_PARTIAL();
4298 MRRETURN(MATCH_NOMATCH);
4299 }
4300 switch(*eptr++)
4301 {
4302 default: MRRETURN(MATCH_NOMATCH);
4303 case 0x0a: /* LF */
4304 case 0x0b: /* VT */
4305 case 0x0c: /* FF */
4306 case 0x0d: /* CR */
4307 case 0x85: /* NEL */
4308 break;
4309 }
4310 }
4311 break;
4312
4313 case OP_NOT_DIGIT:
4314 for (i = 1; i <= min; i++)
4315 {
4316 if (eptr >= md->end_subject)
4317 {
4318 SCHECK_PARTIAL();
4319 MRRETURN(MATCH_NOMATCH);
4320 }
4321 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4322 }
4323 break;
4324
4325 case OP_DIGIT:
4326 for (i = 1; i <= min; i++)
4327 {
4328 if (eptr >= md->end_subject)
4329 {
4330 SCHECK_PARTIAL();
4331 MRRETURN(MATCH_NOMATCH);
4332 }
4333 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4334 }
4335 break;
4336
4337 case OP_NOT_WHITESPACE:
4338 for (i = 1; i <= min; i++)
4339 {
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 MRRETURN(MATCH_NOMATCH);
4344 }
4345 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4346 }
4347 break;
4348
4349 case OP_WHITESPACE:
4350 for (i = 1; i <= min; i++)
4351 {
4352 if (eptr >= md->end_subject)
4353 {
4354 SCHECK_PARTIAL();
4355 MRRETURN(MATCH_NOMATCH);
4356 }
4357 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4358 }
4359 break;
4360
4361 case OP_NOT_WORDCHAR:
4362 for (i = 1; i <= min; i++)
4363 {
4364 if (eptr >= md->end_subject)
4365 {
4366 SCHECK_PARTIAL();
4367 MRRETURN(MATCH_NOMATCH);
4368 }
4369 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4370 MRRETURN(MATCH_NOMATCH);
4371 }
4372 break;
4373
4374 case OP_WORDCHAR:
4375 for (i = 1; i <= min; i++)
4376 {
4377 if (eptr >= md->end_subject)
4378 {
4379 SCHECK_PARTIAL();
4380 MRRETURN(MATCH_NOMATCH);
4381 }
4382 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4383 MRRETURN(MATCH_NOMATCH);
4384 }
4385 break;
4386
4387 default:
4388 RRETURN(PCRE_ERROR_INTERNAL);
4389 }
4390 }
4391
4392 /* If min = max, continue at the same level without recursing */
4393
4394 if (min == max) continue;
4395
4396 /* If minimizing, we have to test the rest of the pattern before each
4397 subsequent match. Again, separate the UTF-8 case for speed, and also
4398 separate the UCP cases. */
4399
4400 if (minimize)
4401 {
4402 #ifdef SUPPORT_UCP
4403 if (prop_type >= 0)
4404 {
4405 switch(prop_type)
4406 {
4407 case PT_ANY:
4408 for (fi = min;; fi++)
4409 {
4410 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4411 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4412 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4413 if (eptr >= md->end_subject)
4414 {
4415 SCHECK_PARTIAL();
4416 MRRETURN(MATCH_NOMATCH);
4417 }
4418 GETCHARINCTEST(c, eptr);
4419 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4420 }
4421 /* Control never gets here */
4422
4423 case PT_LAMP:
4424 for (fi = min;; fi++)
4425 {
4426 int chartype;
4427 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4429 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4430 if (eptr >= md->end_subject)
4431 {
4432 SCHECK_PARTIAL();
4433 MRRETURN(MATCH_NOMATCH);
4434 }
4435 GETCHARINCTEST(c, eptr);
4436 chartype = UCD_CHARTYPE(c);
4437 if ((chartype == ucp_Lu ||
4438 chartype == ucp_Ll ||
4439 chartype == ucp_Lt) == prop_fail_result)
4440 MRRETURN(MATCH_NOMATCH);
4441 }
4442 /* Control never gets here */
4443
4444 case PT_GC:
4445 for (fi = min;; fi++)
4446 {
4447 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4449 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4450 if (eptr >= md->end_subject)
4451 {
4452 SCHECK_PARTIAL();
4453 MRRETURN(MATCH_NOMATCH);
4454 }
4455 GETCHARINCTEST(c, eptr);
4456 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4457 MRRETURN(MATCH_NOMATCH);
4458 }
4459 /* Control never gets here */
4460
4461 case PT_PC:
4462 for (fi = min;; fi++)
4463 {
4464 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4466 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4467 if (eptr >= md->end_subject)
4468 {
4469 SCHECK_PARTIAL();
4470 MRRETURN(MATCH_NOMATCH);
4471 }
4472 GETCHARINCTEST(c, eptr);
4473 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4474 MRRETURN(MATCH_NOMATCH);
4475 }
4476 /* Control never gets here */
4477
4478 case PT_SC:
4479 for (fi = min;; fi++)
4480 {
4481 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4482 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4483 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4484 if (eptr >= md->end_subject)
4485 {
4486 SCHECK_PARTIAL();
4487 MRRETURN(MATCH_NOMATCH);
4488 }
4489 GETCHARINCTEST(c, eptr);
4490 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4491 MRRETURN(MATCH_NOMATCH);
4492 }
4493 /* Control never gets here */
4494
4495 case PT_ALNUM:
4496 for (fi = min;; fi++)
4497 {
4498 int category;
4499 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4500 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4501 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4502 if (eptr >= md->end_subject)
4503 {
4504 SCHECK_PARTIAL();
4505 MRRETURN(MATCH_NOMATCH);
4506 }
4507 GETCHARINCTEST(c, eptr);
4508 category = UCD_CATEGORY(c);
4509 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4510 MRRETURN(MATCH_NOMATCH);
4511 }
4512 /* Control never gets here */
4513
4514 case PT_SPACE: /* Perl space */
4515 for (fi = min;; fi++)
4516 {
4517 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4518 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4519 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4520 if (eptr >= md->end_subject)
4521 {
4522 SCHECK_PARTIAL();
4523 MRRETURN(MATCH_NOMATCH);
4524 }
4525 GETCHARINCTEST(c, eptr);
4526 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4527 c == CHAR_FF || c == CHAR_CR)
4528 == prop_fail_result)
4529 MRRETURN(MATCH_NOMATCH);
4530 }
4531 /* Control never gets here */
4532
4533 case PT_PXSPACE: /* POSIX space */
4534 for (fi = min;; fi++)
4535 {
4536 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4537 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4538 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4539 if (eptr >= md->end_subject)
4540 {
4541 SCHECK_PARTIAL();
4542 MRRETURN(MATCH_NOMATCH);
4543 }
4544 GETCHARINCTEST(c, eptr);
4545 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4546 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4547 == prop_fail_result)
4548 MRRETURN(MATCH_NOMATCH);
4549 }
4550 /* Control never gets here */
4551
4552 case PT_WORD:
4553 for (fi = min;; fi++)
4554 {
4555 int category;
4556 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4558 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4559 if (eptr >= md->end_subject)
4560 {
4561 SCHECK_PARTIAL();
4562 MRRETURN(MATCH_NOMATCH);
4563 }
4564 GETCHARINCTEST(c, eptr);
4565 category = UCD_CATEGORY(c);
4566 if ((category == ucp_L ||
4567 category == ucp_N ||
4568 c == CHAR_UNDERSCORE)
4569 == prop_fail_result)
4570 MRRETURN(MATCH_NOMATCH);
4571 }
4572 /* Control never gets here */
4573
4574 /* This should never occur */
4575
4576 default:
4577 RRETURN(PCRE_ERROR_INTERNAL);
4578 }
4579 }
4580
4581 /* Match extended Unicode sequences. We will get here only if the
4582 support is in the binary; otherwise a compile-time error occurs. */
4583
4584 else if (ctype == OP_EXTUNI)
4585 {
4586 for (fi = min;; fi++)
4587 {
4588 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4590 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4591 if (eptr >= md->end_subject)
4592 {
4593 SCHECK_PARTIAL();
4594 MRRETURN(MATCH_NOMATCH);
4595 }
4596 GETCHARINCTEST(c, eptr);
4597 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4598 while (eptr < md->end_subject)
4599 {
4600 int len = 1;
4601 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4602 if (UCD_CATEGORY(c) != ucp_M) break;
4603 eptr += len;
4604 }
4605 }
4606 }
4607 else
4608 #endif /* SUPPORT_UCP */
4609
4610 #ifdef SUPPORT_UTF8
4611 /* UTF-8 mode */
4612 if (utf8)
4613 {
4614 for (fi = min;; fi++)
4615 {
4616 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4617 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4618 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4619 if (eptr >= md->end_subject)
4620 {
4621 SCHECK_PARTIAL();
4622 MRRETURN(MATCH_NOMATCH);
4623 }
4624 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4625 MRRETURN(MATCH_NOMATCH);
4626 GETCHARINC(c, eptr);
4627 switch(ctype)
4628 {
4629 case OP_ANY: /* This is the non-NL case */
4630 case OP_ALLANY:
4631 case OP_ANYBYTE:
4632 break;
4633
4634 case OP_ANYNL:
4635 switch(c)
4636 {
4637 default: MRRETURN(MATCH_NOMATCH);
4638 case 0x000d:
4639 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4640 break;
4641 case 0x000a:
4642 break;
4643
4644 case 0x000b:
4645 case 0x000c:
4646 case 0x0085:
4647 case 0x2028:
4648 case 0x2029:
4649 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4650 break;
4651 }
4652 break;
4653
4654 case OP_NOT_HSPACE:
4655 switch(c)
4656 {
4657 default: break;
4658 case 0x09: /* HT */
4659 case 0x20: /* SPACE */
4660 case 0xa0: /* NBSP */
4661 case 0x1680: /* OGHAM SPACE MARK */
4662 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4663 case 0x2000: /* EN QUAD */
4664 case 0x2001: /* EM QUAD */
4665 case 0x2002: /* EN SPACE */
4666 case 0x2003: /* EM SPACE */
4667 case 0x2004: /* THREE-PER-EM SPACE */
4668 case 0x2005: /* FOUR-PER-EM SPACE */
4669 case 0x2006: /* SIX-PER-EM SPACE */
4670 case 0x2007: /* FIGURE SPACE */
4671 case 0x2008: /* PUNCTUATION SPACE */
4672 case 0x2009: /* THIN SPACE */
4673 case 0x200A: /* HAIR SPACE */
4674 case 0x202f: /* NARROW NO-BREAK SPACE */
4675 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4676 case 0x3000: /* IDEOGRAPHIC SPACE */
4677 MRRETURN(MATCH_NOMATCH);
4678 }
4679 break;
4680
4681 case OP_HSPACE:
4682 switch(c)
4683 {
4684 default: MRRETURN(MATCH_NOMATCH);
4685 case 0x09: /* HT */
4686 case 0x20: /* SPACE */
4687 case 0xa0: /* NBSP */
4688 case 0x1680: /* OGHAM SPACE MARK */
4689 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4690 case 0x2000: /* EN QUAD */
4691 case 0x2001: /* EM QUAD */
4692 case 0x2002: /* EN SPACE */
4693 case 0x2003: /* EM SPACE */
4694 case 0x2004: /* THREE-PER-EM SPACE */
4695 case 0x2005: /* FOUR-PER-EM SPACE */
4696 case 0x2006: /* SIX-PER-EM SPACE */
4697 case 0x2007: /* FIGURE SPACE */
4698 case 0x2008: /* PUNCTUATION SPACE */
4699 case 0x2009: /* THIN SPACE */
4700 case 0x200A: /* HAIR SPACE */
4701 case 0x202f: /* NARROW NO-BREAK SPACE */
4702 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4703 case 0x3000: /* IDEOGRAPHIC SPACE */
4704 break;
4705 }
4706 break;
4707
4708 case OP_NOT_VSPACE:
4709 switch(c)
4710 {
4711 default: break;
4712 case 0x0a: /* LF */
4713 case 0x0b: /* VT */
4714 case 0x0c: /* FF */
4715 case 0x0d: /* CR */
4716 case 0x85: /* NEL */
4717 case 0x2028: /* LINE SEPARATOR */
4718 case 0x2029: /* PARAGRAPH SEPARATOR */
4719 MRRETURN(MATCH_NOMATCH);
4720 }
4721 break;
4722
4723 case OP_VSPACE:
4724 switch(c)
4725 {
4726 default: MRRETURN(MATCH_NOMATCH);
4727 case 0x0a: /* LF */
4728 case 0x0b: /* VT */
4729 case 0x0c: /* FF */
4730 case 0x0d: /* CR */
4731 case 0x85: /* NEL */
4732 case 0x2028: /* LINE SEPARATOR */
4733 case 0x2029: /* PARAGRAPH SEPARATOR */
4734 break;
4735 }
4736 break;
4737
4738 case OP_NOT_DIGIT:
4739 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4740 MRRETURN(MATCH_NOMATCH);
4741 break;
4742
4743 case OP_DIGIT:
4744 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4745 MRRETURN(MATCH_NOMATCH);
4746 break;
4747
4748 case OP_NOT_WHITESPACE:
4749 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4750 MRRETURN(MATCH_NOMATCH);
4751 break;
4752
4753 case OP_WHITESPACE:
4754 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4755 MRRETURN(MATCH_NOMATCH);
4756 break;
4757
4758 case OP_NOT_WORDCHAR:
4759 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4760 MRRETURN(MATCH_NOMATCH);
4761 break;
4762
4763 case OP_WORDCHAR:
4764 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4765 MRRETURN(MATCH_NOMATCH);
4766 break;
4767
4768 default:
4769 RRETURN(PCRE_ERROR_INTERNAL);
4770 }
4771 }
4772 }
4773 else
4774 #endif
4775 /* Not UTF-8 mode */
4776 {
4777 for (fi = min;; fi++)
4778 {
4779 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4780 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4781 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4782 if (eptr >= md->end_subject)
4783 {
4784 SCHECK_PARTIAL();
4785 MRRETURN(MATCH_NOMATCH);
4786 }
4787 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4788 MRRETURN(MATCH_NOMATCH);
4789 c = *eptr++;
4790 switch(ctype)
4791 {
4792 case OP_ANY: /* This is the non-NL case */
4793 case OP_ALLANY:
4794 case OP_ANYBYTE:
4795 break;
4796
4797 case OP_ANYNL:
4798 switch(c)
4799 {
4800 default: MRRETURN(MATCH_NOMATCH);
4801 case 0x000d:
4802 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4803 break;
4804
4805 case 0x000a:
4806 break;
4807
4808 case 0x000b:
4809 case 0x000c:
4810 case 0x0085:
4811 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4812 break;
4813 }
4814 break;
4815
4816 case OP_NOT_HSPACE:
4817 switch(c)
4818 {
4819 default: break;
4820 case 0x09: /* HT */
4821 case 0x20: /* SPACE */
4822 case 0xa0: /* NBSP */
4823 MRRETURN(MATCH_NOMATCH);
4824 }
4825 break;
4826
4827 case OP_HSPACE:
4828 switch(c)
4829 {
4830 default: MRRETURN(MATCH_NOMATCH);
4831 case 0x09: /* HT */
4832 case 0x20: /* SPACE */
4833 case 0xa0: /* NBSP */
4834 break;
4835 }
4836 break;
4837
4838 case OP_NOT_VSPACE:
4839 switch(c)
4840 {
4841 default: break;
4842 case 0x0a: /* LF */
4843 case 0x0b: /* VT */
4844 case 0x0c: /* FF */
4845 case 0x0d: /* CR */
4846 case 0x85: /* NEL */
4847 MRRETURN(MATCH_NOMATCH);
4848 }
4849 break;
4850
4851 case OP_VSPACE:
4852 switch(c)
4853 {
4854 default: MRRETURN(MATCH_NOMATCH);
4855 case 0x0a: /* LF */
4856 case 0x0b: /* VT */
4857 case 0x0c: /* FF */
4858 case 0x0d: /* CR */
4859 case 0x85: /* NEL */
4860 break;
4861 }
4862 break;
4863
4864 case OP_NOT_DIGIT:
4865 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4866 break;
4867
4868 case OP_DIGIT:
4869 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4870 break;
4871
4872 case OP_NOT_WHITESPACE:
4873 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4874 break;
4875
4876 case OP_WHITESPACE:
4877 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4878 break;
4879
4880 case OP_NOT_WORDCHAR:
4881 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4882 break;
4883
4884 case OP_WORDCHAR:
4885 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4886 break;
4887
4888 default:
4889 RRETURN(PCRE_ERROR_INTERNAL);
4890 }
4891 }
4892 }
4893 /* Control never gets here */
4894 }
4895
4896 /* If maximizing, it is worth using inline code for speed, doing the type
4897 test once at the start (i.e. keep it out of the loop). Again, keep the
4898 UTF-8 and UCP stuff separate. */
4899
4900 else
4901 {
4902 pp = eptr; /* Remember where we started */
4903
4904 #ifdef SUPPORT_UCP
4905 if (prop_type >= 0)
4906 {
4907 switch(prop_type)
4908 {
4909 case PT_ANY:
4910 for (i = min; i < max; i++)
4911 {
4912 int len = 1;
4913 if (eptr >= md->end_subject)
4914 {
4915 SCHECK_PARTIAL();
4916 break;
4917 }
4918 GETCHARLENTEST(c, eptr, len);
4919 if (prop_fail_result) break;
4920 eptr+= len;
4921 }
4922 break;
4923
4924 case PT_LAMP:
4925 for (i = min; i < max; i++)
4926 {
4927 int chartype;
4928 int len = 1;
4929 if (eptr >= md->end_subject)
4930 {
4931 SCHECK_PARTIAL();
4932 break;
4933 }
4934 GETCHARLENTEST(c, eptr, len);
4935 chartype = UCD_CHARTYPE(c);
4936 if ((chartype == ucp_Lu ||
4937 chartype == ucp_Ll ||
4938 chartype == ucp_Lt) == prop_fail_result)
4939 break;
4940 eptr+= len;
4941 }
4942 break;
4943
4944 case PT_GC:
4945 for (i = min; i < max; i++)
4946 {
4947 int len = 1;
4948 if (eptr >= md->end_subject)
4949 {
4950 SCHECK_PARTIAL();
4951 break;
4952 }
4953 GETCHARLENTEST(c, eptr, len);
4954 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
4955 eptr+= len;
4956 }
4957 break;
4958
4959 case PT_PC:
4960 for (i = min; i < max; i++)
4961 {
4962 int len = 1;
4963 if (eptr >= md->end_subject)
4964 {
4965 SCHECK_PARTIAL();
4966 break;
4967 }
4968 GETCHARLENTEST(c, eptr, len);
4969 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
4970 eptr+= len;
4971 }
4972 break;
4973
4974 case PT_SC:
4975 for (i = min; i < max; i++)
4976 {
4977 int len = 1;
4978 if (eptr >= md->end_subject)
4979 {
4980 SCHECK_PARTIAL();
4981 break;
4982 }
4983 GETCHARLENTEST(c, eptr, len);
4984 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
4985 eptr+= len;
4986 }
4987 break;
4988
4989 case PT_ALNUM:
4990 for (i = min; i < max; i++)
4991 {
4992 int category;
4993 int len = 1;
4994 if (eptr >= md->end_subject)
4995 {
4996 SCHECK_PARTIAL();
4997 break;
4998 }
4999 GETCHARLENTEST(c, eptr, len);
5000 category = UCD_CATEGORY(c);
5001 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5002 break;
5003 eptr+= len;
5004 }
5005 break;
5006
5007 case PT_SPACE: /* Perl space */
5008 for (i = min; i < max; i++)
5009 {
5010 int len = 1;
5011 if (eptr >= md->end_subject)
5012 {
5013 SCHECK_PARTIAL();
5014 break;
5015 }
5016 GETCHARLENTEST(c, eptr, len);
5017 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5018 c == CHAR_FF || c == CHAR_CR)
5019 == prop_fail_result)
5020 break;
5021 eptr+= len;
5022 }
5023 break;
5024
5025 case PT_PXSPACE: /* POSIX space */
5026 for (i = min; i < max; i++)
5027 {
5028 int len = 1;
5029 if (eptr >= md->end_subject)
5030 {
5031 SCHECK_PARTIAL();
5032 break;
5033 }
5034 GETCHARLENTEST(c, eptr, len);
5035 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5036 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5037 == prop_fail_result)
5038 break;
5039 eptr+= len;
5040 }
5041 break;
5042
5043 case PT_WORD:
5044 for (i = min; i < max; i++)
5045 {
5046 int category;
5047 int len = 1;
5048 if (eptr >= md->end_subject)
5049 {
5050 SCHECK_PARTIAL();
5051 break;
5052 }
5053 GETCHARLENTEST(c, eptr, len);
5054 category = UCD_CATEGORY(c);
5055 if ((category == ucp_L || category == ucp_N ||
5056 c == CHAR_UNDERSCORE) == prop_fail_result)
5057 break;
5058 eptr+= len;
5059 }
5060 break;
5061
5062 default:
5063 RRETURN(PCRE_ERROR_INTERNAL);
5064 }
5065
5066 /* eptr is now past the end of the maximum run */
5067
5068 if (possessive) continue;
5069 for(;;)
5070 {
5071 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5072 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5073 if (eptr-- == pp) break; /* Stop if tried at original pos */
5074 if (utf8) BACKCHAR(eptr);
5075 }
5076 }
5077
5078 /* Match extended Unicode sequences. We will get here only if the
5079 support is in the binary; otherwise a compile-time error occurs. */
5080
5081 else if (ctype == OP_EXTUNI)
5082 {
5083 for (i = min; i < max; i++)
5084 {
5085 int len = 1;
5086 if (eptr >= md->end_subject)
5087 {
5088 SCHECK_PARTIAL();
5089 break;
5090 }
5091 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5092 if (UCD_CATEGORY(c) == ucp_M) break;
5093 eptr += len;
5094 while (eptr < md->end_subject)
5095 {
5096 len = 1;
5097 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5098 if (UCD_CATEGORY(c) != ucp_M) break;
5099 eptr += len;
5100 }
5101 }
5102
5103 /* eptr is now past the end of the maximum run */
5104
5105 if (possessive) continue;
5106
5107 for(;;)
5108 {
5109 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5110 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5111 if (eptr-- == pp) break; /* Stop if tried at original pos */
5112 for (;;) /* Move back over one extended */
5113 {
5114 if (!utf8) c = *eptr; else
5115 {
5116 BACKCHAR(eptr);
5117 GETCHAR(c, eptr);
5118 }
5119 if (UCD_CATEGORY(c) != ucp_M) break;
5120 eptr--;
5121 }
5122 }
5123 }
5124
5125 else
5126 #endif /* SUPPORT_UCP */
5127
5128 #ifdef SUPPORT_UTF8
5129 /* UTF-8 mode */
5130
5131 if (utf8)
5132 {
5133 switch(ctype)
5134 {
5135 case OP_ANY:
5136 if (max < INT_MAX)
5137 {
5138 for (i = min; i < max; i++)
5139 {
5140 if (eptr >= md->end_subject)
5141 {
5142 SCHECK_PARTIAL();
5143 break;
5144 }
5145 if (IS_NEWLINE(eptr)) break;
5146 eptr++;
5147 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5148 }
5149 }
5150
5151 /* Handle unlimited UTF-8 repeat */
5152
5153 else
5154 {
5155 for (i = min; i < max; i++)
5156 {
5157 if (eptr >= md->end_subject)
5158 {
5159 SCHECK_PARTIAL();
5160 break;
5161 }
5162 if (IS_NEWLINE(eptr)) break;
5163 eptr++;
5164 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5165 }
5166 }
5167 break;
5168
5169 case OP_ALLANY:
5170 if (max < INT_MAX)
5171 {
5172 for (i = min; i < max; i++)
5173 {
5174 if (eptr >= md->end_subject)
5175 {
5176 SCHECK_PARTIAL();
5177 break;
5178 }
5179 eptr++;
5180 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5181 }
5182 }
5183 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5184 break;
5185
5186 /* The byte case is the same as non-UTF8 */
5187
5188 case OP_ANYBYTE:
5189 c = max - min;
5190 if (c > (unsigned int)(md->end_subject - eptr))
5191 {
5192 eptr = md->end_subject;
5193 SCHECK_PARTIAL();
5194 }
5195 else eptr += c;
5196 break;
5197
5198 case OP_ANYNL:
5199 for (i = min; i < max; i++)
5200 {
5201 int len = 1;
5202 if (eptr >= md->end_subject)
5203 {
5204 SCHECK_PARTIAL();
5205 break;
5206 }
5207 GETCHARLEN(c, eptr, len);
5208 if (c == 0x000d)
5209 {
5210 if (++eptr >= md->end_subject) break;
5211 if (*eptr == 0x000a) eptr++;
5212 }
5213 else
5214 {
5215 if (c != 0x000a &&
5216 (md->bsr_anycrlf ||
5217 (c != 0x000b && c != 0x000c &&
5218 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5219 break;
5220 eptr += len;
5221 }
5222 }
5223 break;
5224
5225 case OP_NOT_HSPACE:
5226 case OP_HSPACE:
5227 for (i = min; i < max; i++)
5228 {
5229 BOOL gotspace;
5230 int len = 1;
5231 if (eptr >= md->end_subject)
5232 {
5233 SCHECK_PARTIAL();
5234 break;
5235 }
5236 GETCHARLEN(c, eptr, len);
5237 switch(c)
5238 {
5239 default: gotspace = FALSE; break;
5240 case 0x09: /* HT */
5241 case 0x20: /* SPACE */
5242 case 0xa0: /* NBSP */
5243 case 0x1680: /* OGHAM SPACE MARK */
5244 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5245 case 0x2000: /* EN QUAD */
5246 case 0x2001: /* EM QUAD */
5247 case 0x2002: /* EN SPACE */
5248 case 0x2003: /* EM SPACE */
5249 case 0x2004: /* THREE-PER-EM SPACE */
5250 case 0x2005: /* FOUR-PER-EM SPACE */
5251 case 0x2006: /* SIX-PER-EM SPACE */
5252 case 0x2007: /* FIGURE SPACE */
5253 case 0x2008: /* PUNCTUATION SPACE */
5254 case 0x2009: /* THIN SPACE */
5255 case 0x200A: /* HAIR SPACE */
5256 case 0x202f: /* NARROW NO-BREAK SPACE */
5257 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5258 case 0x3000: /* IDEOGRAPHIC SPACE */
5259 gotspace = TRUE;
5260 break;
5261 }
5262 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5263 eptr += len;
5264 }
5265 break;
5266
5267 case OP_NOT_VSPACE:
5268 case OP_VSPACE:
5269 for (i = min; i < max; i++)
5270 {
5271 BOOL gotspace;
5272 int len = 1;
5273 if (eptr >= md->end_subject)
5274 {
5275 SCHECK_PARTIAL();
5276 break;
5277 }
5278 GETCHARLEN(c, eptr, len);
5279 switch(c)
5280 {
5281 default: gotspace = FALSE; break;
5282 case 0x0a: /* LF */
5283 case 0x0b: /* VT */
5284 case 0x0c: /* FF */
5285 case 0x0d: /* CR */
5286 case 0x85: /* NEL */
5287 case 0x2028: /* LINE SEPARATOR */
5288 case 0x2029: /* PARAGRAPH SEPARATOR */
5289 gotspace = TRUE;
5290 break;
5291 }
5292 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5293 eptr += len;
5294 }
5295 break;
5296
5297 case OP_NOT_DIGIT:
5298 for (i = min; i < max; i++)
5299 {
5300 int len = 1;
5301 if (eptr >= md->end_subject)
5302 {
5303 SCHECK_PARTIAL();
5304 break;
5305 }
5306 GETCHARLEN(c, eptr, len);
5307 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5308 eptr+= len;
5309 }
5310 break;
5311
5312 case OP_DIGIT:
5313 for (i = min; i < max; i++)
5314 {
5315 int len = 1;
5316 if (eptr >= md->end_subject)
5317 {
5318 SCHECK_PARTIAL();
5319 break;
5320 }
5321 GETCHARLEN(c, eptr, len);
5322 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5323 eptr+= len;
5324 }
5325 break;
5326
5327 case OP_NOT_WHITESPACE:
5328 for (i = min; i < max; i++)
5329 {
5330 int len = 1;
5331 if (eptr >= md->end_subject)
5332 {
5333 SCHECK_PARTIAL();
5334 break;
5335 }
5336 GETCHARLEN(c, eptr, len);
5337 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5338 eptr+= len;
5339 }
5340 break;
5341
5342 case OP_WHITESPACE:
5343 for (i = min; i < max; i++)
5344 {
5345 int len = 1;
5346 if (eptr >= md->end_subject)
5347 {
5348 SCHECK_PARTIAL();
5349 break;
5350 }
5351 GETCHARLEN(c, eptr, len);
5352 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5353 eptr+= len;
5354 }
5355 break;
5356
5357 case OP_NOT_WORDCHAR:
5358 for (i = min; i < max; i++)
5359 {
5360 int len = 1;
5361 if (eptr >= md->end_subject)
5362 {
5363 SCHECK_PARTIAL();
5364 break;
5365 }
5366 GETCHARLEN(c, eptr, len);
5367 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5368 eptr+= len;
5369 }
5370 break;
5371
5372 case OP_WORDCHAR:
5373 for (i = min; i < max; i++)
5374 {
5375 int len = 1;
5376 if (eptr >= md->end_subject)
5377 {
5378 SCHECK_PARTIAL();
5379 break;
5380 }
5381 GETCHARLEN(c, eptr, len);
5382 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5383 eptr+= len;
5384 }
5385 break;
5386
5387 default:
5388 RRETURN(PCRE_ERROR_INTERNAL);
5389 }
5390
5391 /* eptr is now past the end of the maximum run. If possessive, we are
5392 done (no backing up). Otherwise, match at this position; anything other
5393 than no match is immediately returned. For nomatch, back up one
5394 character, unless we are matching \R and the last thing matched was
5395 \r\n, in which case, back up two bytes. */
5396
5397 if (possessive) continue;
5398 for(;;)
5399 {
5400 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5401 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5402 if (eptr-- == pp) break; /* Stop if tried at original pos */
5403 BACKCHAR(eptr);
5404 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5405 eptr[-1] == '\r') eptr--;
5406 }
5407 }
5408 else
5409 #endif /* SUPPORT_UTF8 */
5410
5411 /* Not UTF-8 mode */
5412 {
5413 switch(ctype)
5414 {
5415 case OP_ANY:
5416 for (i = min; i < max; i++)
5417 {
5418 if (eptr >= md->end_subject)
5419 {
5420 SCHECK_PARTIAL();
5421 break;
5422 }
5423 if (IS_NEWLINE(eptr)) break;
5424 eptr++;
5425 }
5426 break;
5427
5428 case OP_ALLANY:
5429 case OP_ANYBYTE:
5430 c = max - min;
5431 if (c > (unsigned int)(md->end_subject - eptr))
5432 {
5433 eptr = md->end_subject;
5434 SCHECK_PARTIAL();
5435 }
5436 else eptr += c;
5437 break;
5438
5439 case OP_ANYNL:
5440 for (i = min; i < max; i++)
5441 {
5442 if (eptr >= md->end_subject)
5443 {
5444 SCHECK_PARTIAL();
5445 break;
5446 }
5447 c = *eptr;
5448 if (c == 0x000d)
5449 {
5450 if (++eptr >= md->end_subject) break;
5451 if (*eptr == 0x000a) eptr++;
5452 }
5453 else
5454 {
5455 if (c != 0x000a &&
5456 (md->bsr_anycrlf ||
5457 (c != 0x000b && c != 0x000c && c != 0x0085)))
5458 break;
5459 eptr++;
5460 }
5461 }
5462 break;
5463
5464 case OP_NOT_HSPACE:
5465 for (i = min; i < max; i++)
5466 {
5467 if (eptr >= md->end_subject)
5468 {
5469 SCHECK_PARTIAL();
5470 break;
5471 }
5472 c = *eptr;
5473 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5474 eptr++;
5475 }
5476 break;
5477
5478 case OP_HSPACE:
5479 for (i = min; i < max; i++)
5480 {
5481 if (eptr >= md->end_subject)
5482 {
5483 SCHECK_PARTIAL();
5484 break;
5485 }
5486 c = *eptr;
5487 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5488 eptr++;
5489 }
5490 break;
5491
5492 case OP_NOT_VSPACE:
5493 for (i = min; i < max; i++)
5494 {
5495 if (eptr >= md->end_subject)
5496 {
5497 SCHECK_PARTIAL();
5498 break;
5499 }
5500 c = *eptr;
5501 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5502 break;
5503 eptr++;
5504 }
5505 break;
5506
5507 case OP_VSPACE:
5508 for (i = min; i < max; i++)
5509 {
5510 if (eptr >= md->end_subject)
5511 {
5512 SCHECK_PARTIAL();
5513 break;
5514 }
5515 c = *eptr;
5516 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5517 break;
5518 eptr++;
5519 }
5520 break;
5521
5522 case OP_NOT_DIGIT:
5523 for (i = min; i < max; i++)
5524 {
5525 if (eptr >= md->end_subject)
5526 {
5527 SCHECK_PARTIAL();
5528 break;
5529 }
5530 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5531 eptr++;
5532 }
5533 break;
5534
5535 case OP_DIGIT:
5536 for (i = min; i < max; i++)
5537 {
5538 if (eptr >= md->end_subject)
5539 {
5540 SCHECK_PARTIAL();
5541 break;
5542 }
5543 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5544 eptr++;
5545 }
5546 break;
5547
5548 case OP_NOT_WHITESPACE:
5549 for (i = min; i < max; i++)
5550 {
5551 if (eptr >= md->end_subject)
5552 {
5553 SCHECK_PARTIAL();
5554 break;
5555 }
5556 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5557 eptr++;
5558 }
5559 break;
5560
5561 case OP_WHITESPACE:
5562 for (i = min; i < max; i++)
5563 {
5564 if (eptr >= md->end_subject)
5565 {
5566 SCHECK_PARTIAL();
5567 break;
5568 }
5569 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5570 eptr++;
5571 }
5572 break;
5573
5574 case OP_NOT_WORDCHAR:
5575 for (i = min; i < max; i++)
5576 {
5577 if (eptr >= md->end_subject)
5578 {
5579 SCHECK_PARTIAL();
5580 break;
5581 }
5582 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5583 eptr++;
5584 }
5585 break;
5586
5587 case OP_WORDCHAR:
5588 for (i = min; i < max; i++)
5589 {
5590 if (eptr >= md->end_subject)
5591 {
5592 SCHECK_PARTIAL();
5593 break;
5594 }
5595 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5596 eptr++;
5597 }
5598 break;
5599
5600 default:
5601 RRETURN(PCRE_ERROR_INTERNAL);
5602 }
5603
5604 /* eptr is now past the end of the maximum run. If possessive, we are
5605 done (no backing up). Otherwise, match at this position; anything other
5606 than no match is immediately returned. For nomatch, back up one
5607 character (byte), unless we are matching \R and the last thing matched
5608 was \r\n, in which case, back up two bytes. */
5609
5610 if (possessive) continue;
5611 while (eptr >= pp)
5612 {
5613 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5615 eptr--;
5616 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5617 eptr[-1] == '\r') eptr--;
5618 }
5619 }
5620
5621 /* Get here if we can't make it match with any permitted repetitions */
5622
5623 MRRETURN(MATCH_NOMATCH);
5624 }
5625 /* Control never gets here */
5626
5627 /* There's been some horrible disaster. Arrival here can only mean there is
5628 something seriously wrong in the code above or the OP_xxx definitions. */
5629
5630 default:
5631 DPRINTF(("Unknown opcode %d\n", *ecode));
5632 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5633 }
5634
5635 /* Do not stick any code in here without much thought; it is assumed
5636 that "continue" in the code above comes out to here to repeat the main
5637 loop. */
5638
5639 } /* End of main loop */
5640 /* Control never reaches here */
5641
5642
5643 /* When compiling to use the heap rather than the stack for recursive calls to
5644 match(), the RRETURN() macro jumps here. The number that is saved in
5645 frame->Xwhere indicates which label we actually want to return to. */
5646
5647 #ifdef NO_RECURSE
5648 #define LBL(val) case val: goto L_RM##val;
5649 HEAP_RETURN:
5650 switch (frame->Xwhere)
5651 {
5652 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5653 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5654 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5655 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5656 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5657 #ifdef SUPPORT_UTF8
5658 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5659 LBL(32) LBL(34) LBL(42) LBL(46)
5660 #ifdef SUPPORT_UCP
5661 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5662 LBL(59) LBL(60) LBL(61) LBL(62)
5663 #endif /* SUPPORT_UCP */
5664 #endif /* SUPPORT_UTF8 */
5665 default:
5666 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5667 return PCRE_ERROR_INTERNAL;
5668 }
5669 #undef LBL
5670 #endif /* NO_RECURSE */
5671 }
5672
5673
5674 /***************************************************************************
5675 ****************************************************************************
5676 RECURSION IN THE match() FUNCTION
5677
5678 Undefine all the macros that were defined above to handle this. */
5679
5680 #ifdef NO_RECURSE /* Heap-frame build only: remove the macro names that match() used, so they cannot leak into the code below */
5681 #undef eptr
5682 #undef ecode
5683 #undef mstart
5684 #undef offset_top
5685 #undef eptrb
5686 #undef flags
5687
5688 #undef callpat
5689 #undef charptr
5690 #undef data
5691 #undef next
5692 #undef pp
5693 #undef prev
5694 #undef saved_eptr
5695
5696 #undef new_recursive
5697
5698 #undef cur_is_word
5699 #undef condition
5700 #undef prev_is_word
5701
5702 #undef ctype
5703 #undef length
5704 #undef max
5705 #undef min
5706 #undef number
5707 #undef offset
5708 #undef op
5709 #undef save_capture_last
5710 #undef save_offset1
5711 #undef save_offset2
5712 #undef save_offset3
5713 #undef stacksave
5714
5715 #undef newptrb
5716
5717 #endif /* NO_RECURSE */
5718
5719 /* These two are defined as macros in both cases (presumably with and without NO_RECURSE), so they are undefined outside the #ifdef */
5720
5721 #undef fc
5722 #undef fi /* fi is the counter of the minimizing repeat loops in match(), e.g. for (fi = min;; fi++) */
5723
5724 /***************************************************************************
5725 ***************************************************************************/
5726
5727
5728
5729 /*************************************************
5730 * Execute a Regular Expression *
5731 *************************************************/
5732
5733 /* This function applies a compiled re to a subject string and picks out
5734 portions of the string if it matches. Two elements in the vector are set for
5735 each substring: the offsets to the start and end of the substring.
5736
5737 Arguments:
5738 argument_re points to the compiled expression
5739 extra_data points to extra data or is NULL
5740 subject points to the subject string
5741 length length of subject string (may contain binary zeros)
5742 start_offset where to start in the subject string
5743 options option bits
5744 offsets points to a vector of ints to be filled in with offsets
5745 offsetcount the number of elements in the vector
5746
5747 Returns: > 0 => success; value is the number of elements filled in
5748 = 0 => success, but offsets is not big enough
5749 -1 => failed to match
5750 < -1 => some kind of unexpected problem
5751 */
5752
5753 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5754 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5755 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5756 int offsetcount)
5757 {
5758 int rc, ocount;
5759 int first_byte = -1;
5760 int req_byte = -1;
5761 int req_byte2 = -1;
5762 int newline;
5763 BOOL using_temporary_offsets = FALSE;
5764 BOOL anchored;
5765 BOOL startline;
5766 BOOL firstline;
5767 BOOL first_byte_caseless = FALSE;
5768 BOOL req_byte_caseless = FALSE;
5769 BOOL utf8;
5770 match_data match_block;
5771 match_data *md = &match_block;
5772 const uschar *tables;
5773 const uschar *start_bits = NULL;
5774 USPTR start_match = (USPTR)subject + start_offset;
5775 USPTR end_subject;
5776 USPTR start_partial = NULL;
5777 USPTR req_byte_ptr = start_match - 1;
5778
5779 pcre_study_data internal_study;
5780 const pcre_study_data *study;
5781
5782 real_pcre internal_re;
5783 const real_pcre *external_re = (const real_pcre *)argument_re;
5784 const real_pcre *re = external_re;
5785
5786 /* Plausibility checks */
5787
5788 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5789 if (re == NULL || subject == NULL ||
5790 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5791 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5792 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5793
5794 /* This information is for finding all the numbers associated with a given
5795 name, for condition testing. */
5796
5797 md->name_table = (uschar *)re + re->name_table_offset;
5798 md->name_count = re->name_count;
5799 md->name_entry_size = re->name_entry_size;
5800
5801 /* Fish out the optional data from the extra_data structure, first setting
5802 the default values. */
5803
5804 study = NULL;
5805 md->match_limit = MATCH_LIMIT;
5806 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5807 md->callout_data = NULL;
5808
5809 /* The table pointer is always in native byte order. */
5810
5811 tables = external_re->tables;
5812
5813 if (extra_data != NULL)
5814 {
5815 register unsigned int flags = extra_data->flags;
5816 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5817 study = (const pcre_study_data *)extra_data->study_data;
5818 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5819 md->match_limit = extra_data->match_limit;
5820 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5821 md->match_limit_recursion = extra_data->match_limit_recursion;
5822 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5823 md->callout_data = extra_data->callout_data;
5824 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5825 }
5826
5827 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5828 is a feature that makes it possible to save compiled regex and re-use them
5829 in other programs later. */
5830
5831 if (tables == NULL) tables = _pcre_default_tables;
5832
5833 /* Check that the first field in the block is the magic number. If it is not,
5834 test for a regex that was compiled on a host of opposite endianness. If this is
5835 the case, flipped values are put in internal_re and internal_study if there was
5836 study data too. */
5837
5838 if (re->magic_number != MAGIC_NUMBER)
5839 {
5840 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5841 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5842 if (study != NULL) study = &internal_study;
5843 }
5844
5845 /* Set up other data */
5846
5847 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5848 startline = (re->flags & PCRE_STARTLINE) != 0;
5849 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5850
5851 /* The code starts after the real_pcre block and the capture name table. */
5852
5853 md->start_code = (const uschar *)external_re + re->name_table_offset +
5854 re->name_count * re->name_entry_size;
5855
5856 md->start_subject = (USPTR)subject;
5857 md->start_offset = start_offset;
5858 md->end_subject = md->start_subject + length;
5859 end_subject = md->end_subject;
5860
5861 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5862 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5863 md->use_ucp = (re->options & PCRE_UCP) != 0;
5864 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5865
5866 /* Some options are unpacked into BOOL variables in the hope that testing
5867 them will be faster than individual option bits. */
5868
5869 md->notbol = (options & PCRE_NOTBOL) != 0;
5870 md->noteol = (options & PCRE_NOTEOL) != 0;
5871 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5872 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5873 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5874 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5875
5876
5877 md->hitend = FALSE;
5878 md->mark = NULL; /* In case never set */
5879
5880 md->recursive = NULL; /* No recursion at top level */
5881
5882 md->lcc = tables + lcc_offset;
5883 md->ctypes = tables + ctypes_offset;
5884
5885 /* Handle different \R options. */
5886
5887 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5888 {
5889 case 0:
5890 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5891 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5892 else
5893 #ifdef BSR_ANYCRLF
5894 md->bsr_anycrlf = TRUE;
5895 #else
5896 md->bsr_anycrlf = FALSE;
5897 #endif
5898 break;
5899
5900 case PCRE_BSR_ANYCRLF:
5901 md->bsr_anycrlf = TRUE;
5902 break;
5903
5904 case PCRE_BSR_UNICODE:
5905 md->bsr_anycrlf = FALSE;
5906 break;
5907
5908 default: return PCRE_ERROR_BADNEWLINE;
5909 }
5910
5911 /* Handle different types of newline. The three bits give eight cases. If
5912 nothing is set at run time, whatever was used at compile time applies. */
5913
5914 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5915 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5916 {
5917 case 0: newline = NEWLINE; break; /* Compile-time default */
5918 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5919 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5920 case PCRE_NEWLINE_CR+
5921 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5922 case PCRE_NEWLINE_ANY: newline = -1; break;
5923 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5924 default: return PCRE_ERROR_BADNEWLINE;
5925 }
5926
5927 if (newline == -2)
5928 {
5929 md->nltype = NLTYPE_ANYCRLF;
5930 }
5931 else if (newline < 0)
5932 {
5933 md->nltype = NLTYPE_ANY;
5934 }
5935 else
5936 {
5937 md->nltype = NLTYPE_FIXED;
5938 if (newline > 255)
5939 {
5940 md->nllen = 2;
5941 md->nl[0] = (newline >> 8) & 255;
5942 md->nl[1] = newline & 255;
5943 }
5944 else
5945 {
5946 md->nllen = 1;
5947 md->nl[0] = newline;
5948 }
5949 }
5950
5951 /* Partial matching was originally supported only for a restricted set of
5952 regexes; from release 8.00 there are no restrictions, but the bits are still
5953 defined (though never set). So there's no harm in leaving this code. */
5954
5955 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5956 return PCRE_ERROR_BADPARTIAL;
5957
5958 /* Check a UTF-8 string if required. Pass back the character offset and error
5959 code for an invalid string if a results vector is available. */
5960
5961 #ifdef SUPPORT_UTF8
5962 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5963 {
5964 int erroroffset;
5965 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
5966 if (errorcode != 0)
5967 {
5968 if (offsetcount >= 2)
5969 {
5970 offsets[0] = erroroffset;
5971 offsets[1] = errorcode;
5972 }
5973 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
5974 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
5975 }
5976
5977 /* Check that a start_offset points to the start of a UTF-8 character. */
5978
5979 if (start_offset > 0 && start_offset < length &&
5980 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
5981 return PCRE_ERROR_BADUTF8_OFFSET;
5982 }
5983 #endif
5984
5985 /* If the expression has got more back references than the offsets supplied can
5986 hold, we get a temporary chunk of working store to use during the matching.
5987 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5988 of 3. */
5989
5990 ocount = offsetcount - (offsetcount % 3);
5991
5992 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5993 {
5994 ocount = re->top_backref * 3 + 3;
5995 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5996 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5997 using_temporary_offsets = TRUE;
5998 DPRINTF(("Got memory to hold back references\n"));
5999 }
6000 else md->offset_vector = offsets;
6001
6002 md->offset_end = ocount;
6003 md->offset_max = (2*ocount)/3;
6004 md->offset_overflow = FALSE;
6005 md->capture_last = -1;
6006
6007 /* Reset the working variable associated with each extraction. These should
6008 never be used unless previously set, but they get saved and restored, and so we
6009 initialize them to avoid reading uninitialized locations. Also, unset the
6010 offsets for the matched string. This is really just for tidiness with callouts,
6011 in case they inspect these fields. */
6012
6013 if (md->offset_vector != NULL)
6014 {
6015 register int *iptr = md->offset_vector + ocount;
6016 register int *iend = iptr - re->top_bracket;
6017 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6018 while (--iptr >= iend) *iptr = -1;
6019 md->offset_vector[0] = md->offset_vector[1] = -1;
6020 }
6021
6022 /* Set up the first character to match, if available. The first_byte value is
6023 never set for an anchored regular expression, but the anchoring may be forced
6024 at run time, so we have to test for anchoring. The first char may be unset for
6025 an unanchored pattern, of course. If there's no first char and the pattern was
6026 studied, there may be a bitmap of possible first characters. */
6027
6028 if (!anchored)
6029 {
6030 if ((re->flags & PCRE_FIRSTSET) != 0)
6031 {
6032 first_byte = re->first_byte & 255;
6033 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6034 first_byte = md->lcc[first_byte];
6035 }
6036 else
6037 if (!startline && study != NULL &&
6038 (study->flags & PCRE_STUDY_MAPPED) != 0)
6039 start_bits = study->start_bits;
6040 }
6041
6042 /* For anchored or unanchored matches, there may be a "last known required
6043 character" set. */
6044
6045 if ((re->flags & PCRE_REQCHSET) != 0)
6046 {
6047 req_byte = re->req_byte & 255;
6048 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6049 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6050 }
6051
6052
6053
6054
6055 /* ==========================================================================*/
6056
6057 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6058 the loop runs just once. */
6059
6060 for(;;)
6061 {
6062 USPTR save_end_subject = end_subject;
6063 USPTR new_start_match;
6064
6065 /* If firstline is TRUE, the start of the match is constrained to the first
6066 line of a multiline string. That is, the match must be before or at the first
6067 newline. Implement this by temporarily adjusting end_subject so that we stop
6068 scanning at a newline. If the match fails at the newline, later code breaks
6069 this loop. */
6070
6071 if (firstline)
6072 {
6073 USPTR t = start_match;
6074 #ifdef SUPPORT_UTF8
6075 if (utf8)
6076 {
6077 while (t < md->end_subject && !IS_NEWLINE(t))
6078 {
6079 t++;
6080 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6081 }
6082 }
6083 else
6084 #endif
6085 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6086 end_subject = t;
6087 }
6088
6089 /* There are some optimizations that avoid running the match if a known
6090 starting point is not found, or if a known later character is not present.
6091 However, there is an option that disables these, for testing and for ensuring
6092 that all callouts do actually occur. The option can be set in the regex by
6093 (*NO_START_OPT) or passed in match-time options. */
6094
6095 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6096 {
6097 /* Advance to a unique first byte if there is one. */
6098
6099 if (first_byte >= 0)
6100 {
6101 if (first_byte_caseless)
6102 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6103 start_match++;
6104 else
6105 while (start_match < end_subject && *start_match != first_byte)
6106 start_match++;
6107 }
6108
6109 /* Or to just after a linebreak for a multiline match */
6110
6111 else if (startline)
6112 {
6113 if (start_match > md->start_subject + start_offset)
6114 {
6115 #ifdef SUPPORT_UTF8
6116 if (utf8)
6117 {
6118 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6119 {
6120 start_match++;
6121 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6122 start_match++;
6123 }
6124 }
6125 else
6126 #endif
6127 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6128 start_match++;
6129
6130 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6131 and we are now at a LF, advance the match position by one more character.
6132 */
6133
6134 if (start_match[-1] == CHAR_CR &&
6135 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6136 start_match < end_subject &&
6137 *start_match == CHAR_NL)
6138 start_match++;
6139 }
6140 }
6141
6142 /* Or to a non-unique first byte after study */
6143
6144 else if (start_bits != NULL)
6145 {
6146 while (start_match < end_subject)
6147 {
6148 register unsigned int c = *start_match;
6149 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6150 {
6151 start_match++;
6152 #ifdef SUPPORT_UTF8
6153 if (utf8)
6154 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6155 start_match++;
6156 #endif
6157 }
6158 else break;
6159 }
6160 }
6161 } /* Starting optimizations */
6162
6163 /* Restore fudged end_subject */
6164
6165 end_subject = save_end_subject;
6166
6167 /* The following two optimizations are disabled for partial matching or if
6168 disabling is explicitly requested. */
6169
6170 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6171 {
6172 /* If the pattern was studied, a minimum subject length may be set. This is
6173 a lower bound only; there need not be any string of exactly that length that
6174 matches the pattern. Although the value is, strictly, in characters, we treat it as
6175 bytes to avoid spending too much time in this optimization. */
6176
6177 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6178 (pcre_uint32)(end_subject - start_match) < study->minlength)
6179 {
6180 rc = MATCH_NOMATCH;
6181 break;
6182 }
6183
6184 /* If req_byte is set, we know that that character must appear in the
6185 subject for the match to succeed. If the first character is set, req_byte
6186 must be later in the subject; otherwise the test starts at the match point.
6187 This optimization can save a huge amount of backtracking in patterns with
6188 nested unlimited repeats that aren't going to match. Writing separate code
6189 for cased/caseless versions makes it go faster, as does using an
6190 autoincrement and backing off on a match.
6191
6192 HOWEVER: when the subject string is very, very long, searching to its end
6193 can take a long time, and give bad performance on quite ordinary patterns.
6194 This showed up when somebody was matching something like /^\d+C/ on a
6195 32-megabyte string... so we don't do this when the string is sufficiently
6196 long. */
6197
6198 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6199 {
6200 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6201
6202 /* We don't need to repeat the search if we haven't yet reached the
6203 place we found it at last time. */
6204
6205 if (p > req_byte_ptr)
6206 {
6207 if (req_byte_caseless)
6208 {
6209 while (p < end_subject)
6210 {
6211 register int pp = *p++;
6212 if (pp == req_byte || pp == req_byte2) { p--; break; }
6213 }
6214 }
6215 else
6216 {
6217 while (p < end_subject)
6218 {
6219 if (*p++ == req_byte) { p--; break; }
6220 }
6221 }
6222
6223 /* If we can't find the required character, break the matching loop,
6224 forcing a match failure. */
6225
6226 if (p >= end_subject)
6227 {
6228 rc = MATCH_NOMATCH;
6229 break;
6230 }
6231
6232 /* If we have found the required character, save the point where we
6233 found it, so that we don't search again next time round the loop if
6234 the start hasn't passed this character yet. */
6235
6236 req_byte_ptr = p;
6237 }
6238 }
6239 }
6240
6241 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6242 printf(">>>> Match against: ");
6243 pchars(start_match, end_subject - start_match, TRUE, md);
6244 printf("\n");
6245 #endif
6246
6247 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6248 first starting point for which a partial match was found. */
6249
6250 md->start_match_ptr = start_match;
6251 md->start_used_ptr = start_match;
6252 md->match_call_count = 0;
6253 md->match_function_type = 0;
6254 md->end_offset_top = 0;
6255 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6256 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6257
6258 switch(rc)
6259 {
6260 /* SKIP passes back the next starting point explicitly, but if it is the
6261 same as the match we have just done, treat it as NOMATCH. */
6262
6263 case MATCH_SKIP:
6264 if (md->start_match_ptr != start_match)
6265 {
6266 new_start_match = md->start_match_ptr;
6267 break;
6268 }
6269 /* Fall through */
6270
6271 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6272 the SKIP's arg was not found. We also treat this as NOMATCH. */
6273
6274 case MATCH_SKIP_ARG:
6275 /* Fall through */
6276
6277 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6278 exactly like PRUNE. */
6279
6280 case MATCH_NOMATCH:
6281 case MATCH_PRUNE:
6282 case MATCH_THEN:
6283 new_start_match = start_match + 1;
6284 #ifdef SUPPORT_UTF8
6285 if (utf8)
6286 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6287 new_start_match++;
6288 #endif
6289 break;
6290
6291 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6292
6293 case MATCH_COMMIT:
6294 rc = MATCH_NOMATCH;
6295 goto ENDLOOP;
6296
6297 /* Any other return is either a match, or some kind of error. */
6298
6299 default:
6300 goto ENDLOOP;
6301 }
6302
6303 /* Control reaches here for the various types of "no match at this point"
6304 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6305
6306 rc = MATCH_NOMATCH;
6307
6308 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6309 newline in the subject (though it may continue over the newline). Therefore,
6310 if we have just failed to match, starting at a newline, do not continue. */
6311
6312 if (firstline && IS_NEWLINE(start_match)) break;
6313
6314 /* Advance to new matching position */
6315
6316 start_match = new_start_match;
6317
6318 /* Break the loop if the pattern is anchored or if we have passed the end of
6319 the subject. */
6320
6321 if (anchored || start_match > end_subject) break;
6322
6323 /* If we have just passed a CR and we are now at a LF, and the pattern does
6324 not contain any explicit matches for \r or \n, and the newline option is CRLF
6325 or ANY or ANYCRLF, advance the match position by one more character. */
6326
6327 if (start_match[-1] == CHAR_CR &&
6328 start_match < end_subject &&
6329 *start_match == CHAR_NL &&
6330 (re->flags & PCRE_HASCRORLF) == 0 &&
6331 (md->nltype == NLTYPE_ANY ||
6332 md->nltype == NLTYPE_ANYCRLF ||
6333 md->nllen == 2))
6334 start_match++;
6335
6336 md->mark = NULL; /* Reset for start of next match attempt */
6337 } /* End of for(;;) "bumpalong" loop */
6338
6339 /* ==========================================================================*/
6340
6341 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6342 conditions is true:
6343
6344 (1) The pattern is anchored or the match was failed by (*COMMIT);
6345
6346 (2) We are past the end of the subject;
6347
6348 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6349 this option requests that a match occur at or before the first newline in
6350 the subject.
6351
6352 When we have a match and the offset vector is big enough to deal with any
6353 backreferences, captured substring offsets will already be set up. In the case
6354 where we had to get some local store to hold offsets for backreference
6355 processing, copy those that we can. In this case there need not be overflow if
6356 certain parts of the pattern were not used, even though there are more
6357 capturing parentheses than vector slots. */
6358
6359 ENDLOOP:
6360
6361 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6362 {
6363 if (using_temporary_offsets)
6364 {
6365 if (offsetcount >= 4)
6366 {
6367 memcpy(offsets + 2, md->offset_vector + 2,
6368 (offsetcount - 2) * sizeof(int));
6369 DPRINTF(("Copied offsets from temporary memory\n"));
6370 }
6371 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6372 DPRINTF(("Freeing temporary memory\n"));
6373 (pcre_free)(md->offset_vector);
6374 }
6375
6376 /* Set the return code to the number of captured strings, or 0 if there are
6377 too many to fit into the vector. */
6378
6379 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6380
6381 /* If there is space in the offset vector, set any unused pairs at the end of
6382 the pattern to -1 for backwards compatibility. It is documented that this
6383 happens. In earlier versions, the whole set of potential capturing offsets
6384 was set to -1 each time round the loop, but this is handled differently now.
6385 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6386 those at the end that need unsetting here. We can't just unset them all at
6387 the start of the whole thing because they may get set in one branch that is
6388 not the final matching branch. */
6389
6390 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6391 {
6392 register int *iptr, *iend;
6393 int resetcount = 2 + re->top_bracket * 2;
6394 if (resetcount > offsetcount) resetcount = ocount;
6395 iptr = offsets + md->end_offset_top;
6396 iend = offsets + resetcount;
6397 while (iptr < iend) *iptr++ = -1;
6398 }
6399
6400 /* If there is space, set up the whole thing as substring 0. The value of
6401 md->start_match_ptr might be modified if \K was encountered on the successful
6402 matching path. */
6403
6404 if (offsetcount < 2) rc = 0; else
6405 {
6406 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6407 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6408 }
6409
6410 DPRINTF((">>>> returning %d\n", rc));
6411 goto RETURN_MARK;
6412 }
6413
6414 /* Control gets here if there has been an error, or if the overall match
6415 attempt has failed at all permitted starting positions. */
6416
6417 if (using_temporary_offsets)
6418 {
6419 DPRINTF(("Freeing temporary memory\n"));
6420 (pcre_free)(md->offset_vector);
6421 }
6422
6423 /* For anything other than nomatch or partial match, just return the code. */
6424
6425 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6426 {
6427 DPRINTF((">>>> error: returning %d\n", rc));
6428 return rc;
6429 }
6430
6431 /* Handle partial matches - disable any mark data */
6432
6433 if (start_partial != NULL)
6434 {
6435 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6436 md->mark = NULL;
6437 if (offsetcount > 1)
6438 {
6439 offsets[0] = (int)(start_partial - (USPTR)subject);
6440 offsets[1] = (int)(end_subject - (USPTR)subject);
6441 }
6442 rc = PCRE_ERROR_PARTIAL;
6443 }
6444
6445 /* This is the classic nomatch case */
6446
6447 else
6448 {
6449 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6450 rc = PCRE_ERROR_NOMATCH;
6451 }
6452
6453 /* Return the MARK data if it has been requested. */
6454
6455 RETURN_MARK:
6456
6457 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6458 *(extra_data->mark) = (unsigned char *)(md->mark);
6459 return rc;
6460 }
6461
6462 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5