/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 786 - (show annotations)
Tue Dec 6 11:33:41 2011 UTC (7 years, 9 months ago) by zherczeg
File MIME type: text/plain
File size: 203316 byte(s)
Updating pcre_jit_test. Most of the JIT tests are working now in 16 bit mode.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
/* MRRETURN(ra): copy the markptr that is in scope at the point of use into
md->mark, then leave through RRETURN(ra). The expansion is a brace-enclosed
compound statement, not a do/while(0) wrapper. */
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if the requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 PCRE_PUCHAR eptr_start = eptr;
159 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 PCRE_PUCHAR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
/* The identifiers run consecutively from RM1 = 1 up to RM66 = 66. */
enum
{
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54,
  RM55,    RM56, RM57, RM58, RM59, RM60,
  RM61,    RM62, RM63, RM64, RM65, RM66
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
/* Stack-recursion forms of RMATCH and RRETURN.

RMATCH(ra,rb,rc,rd,re,rw) calls match() and stores the result in rrc. The
arguments are forwarded positionally: ra and rb become the first two arguments
of match(), rc, rd and re become the fifth, sixth and seventh, while mstart and
markptr are taken from the scope in which the macro is used and the depth
passed is rdepth+1. rw does not appear in the expansion.

RRETURN(ra) returns ra from the enclosing function.

The PCRE_DEBUG forms additionally print the source line number around the call
and the returned value; they expand to brace-enclosed compound statements,
whereas the non-debug forms expand to a plain expression/return without a
trailing semicolon. */
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
/* Production forms: no tracing output. */
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
/* Heap-frame form of RMATCH(ra,rb,rc,rd,re,rw).

Obtains a new heapframe from pcre_stack_malloc; if that yields NULL the macro
leaves via RRETURN(PCRE_ERROR_NOMEMORY). Otherwise it records rw in the
current frame's Xwhere, fills the new frame from ra (Xeptr), rb (Xecode), the
current mstart and markptr, rc (Xoffset_top) and re (Xeptrb), sets the new
depth to the current frame's Xrdepth + 1, links the new frame back to the
current one through Xprevframe, makes it the current frame and jumps to
HEAP_RECURSE. The label L_<rw> generated by the macro marks the point that
follows the jump, so each use needs a distinct rw. rd is not used. */
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
/* Heap-frame form of RRETURN(ra).

Makes the previous frame current and releases the one being left with
pcre_stack_free. If a previous frame exists, ra is stored in rrc and control
goes to HEAP_RETURN; if there is none (the frame being left was the top-level
one) the macro returns ra from the function. Note that ra is evaluated only
after "frame" has been switched back to the previous frame. The macro
dereferences the current frame unconditionally, so it must not be used while
"frame" is NULL. */
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
/* One heapframe holds the state of a single "incarnation" of match() when
NO_RECURSE is defined. Inside match() each member is reached through a macro
that has the member's name without the leading X (for example eptr is
frame->Xeptr). */
352 typedef struct heapframe {
/* Link to the frame that was current when this one was created by RMATCH;
NULL marks the top-level frame. */
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
/* These are filled from the arguments of match() for the top-level frame and
by RMATCH for every later frame. */
357 PCRE_PUCHAR Xeptr;
358 const pcre_uchar *Xecode;
359 PCRE_PUCHAR Xmstart;
360 PCRE_PUCHAR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
/* Counterparts of the locals that match() declares on the stack when
NO_RECURSE is not defined. */
367 PCRE_PUCHAR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 PCRE_PUCHAR Xcharptr;
370 #endif
371 PCRE_PUCHAR Xdata;
372 PCRE_PUCHAR Xnext;
373 PCRE_PUCHAR Xpp;
374 PCRE_PUCHAR Xprev;
375 PCRE_PUCHAR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 pcre_uchar Xocchars[6];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
/* RMATCH stores its rw argument (one of the RMnn values) here before starting
the nested frame. NOTE(review): presumably read at HEAP_RETURN to select the
L_RMnn label to resume at; that code is not visible in this part of the
file. */
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
/* CHECK_PARTIAL(): acts only when md->partial is non-zero, eptr has reached
or passed md->end_subject, and eptr is beyond md->start_used_ptr. It then sets
md->hitend and, if md->partial is greater than 1, leaves the function through
MRRETURN(PCRE_ERROR_PARTIAL). The expansion is a bare "if" statement with no
enclosing braces or do/while(0), so take care when using it directly before an
"else". */
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
/* SCHECK_PARTIAL(): the same as CHECK_PARTIAL() but without the comparison of
eptr against md->end_subject. */
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
478 PCRE_PUCHAR mstart, const pcre_uchar *markptr, int offset_top,
479 match_data *md, eptrblock *eptrb, unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf; /* Local copy of UTF flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const pcre_uchar *charptr;
590 #endif
591 const pcre_uchar *callpat;
592 const pcre_uchar *data;
593 const pcre_uchar *next;
594 PCRE_PUCHAR pp;
595 const pcre_uchar *prev;
596 PCRE_PUCHAR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 pcre_uchar occhars[6];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf = md->utf; /* Local copy of the flag */
664 #else
665 utf = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 STRCMP_UC_UC(markptr, md->start_match_ptr) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779 the branch in which it occurs can be determined. Overload the start of
780 match pointer to do this. */
781
782 case OP_THEN:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM54);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 md->start_match_ptr = ecode;
787 MRRETURN(MATCH_THEN);
788
789 case OP_THEN_ARG:
790 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
791 md, eptrb, RM58);
792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 md->start_match_ptr = ecode;
794 md->mark = ecode + 2;
795 RRETURN(MATCH_THEN);
796
797 /* Handle an atomic group that does not contain any capturing parentheses.
798 This can be handled like an assertion. Prior to 8.13, all atomic groups
799 were handled this way. In 8.13, the code was changed as below for ONCE, so
800 that backups pass through the group and thereby reset captured values.
801 However, this uses a lot more stack, so in 8.20, atomic groups that do not
802 contain any captures generate OP_ONCE_NC, which can be handled in the old,
803 less stack intensive way.
804
805 Check the alternative branches in turn - the matching won't pass the KET
806 for this kind of subpattern. If any one branch matches, we carry on as at
807 the end of a normal bracket, leaving the subject pointer, but resetting
808 the start-of-match value in case it was changed by \K. */
809
810 case OP_ONCE_NC:
811 prev = ecode;
812 saved_eptr = eptr;
813 do
814 {
815 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
816 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
817 {
818 mstart = md->start_match_ptr;
819 markptr = md->mark;
820 break;
821 }
822 if (rrc == MATCH_THEN)
823 {
824 next = ecode + GET(ecode,1);
825 if (md->start_match_ptr < next &&
826 (*ecode == OP_ALT || *next == OP_ALT))
827 rrc = MATCH_NOMATCH;
828 }
829
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 ecode += GET(ecode,1);
832 }
833 while (*ecode == OP_ALT);
834
835 /* If hit the end of the group (which could be repeated), fail */
836
837 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
838
839 /* Continue as from after the group, updating the offsets high water
840 mark, since extracts may have been taken. */
841
842 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
843
844 offset_top = md->end_offset_top;
845 eptr = md->end_match_ptr;
846
847 /* For a non-repeating ket, just continue at this level. This also
848 happens for a repeating ket if no characters were matched in the group.
849 This is the forcible breaking of infinite loops as implemented in Perl
850 5.005. */
851
852 if (*ecode == OP_KET || eptr == saved_eptr)
853 {
854 ecode += 1+LINK_SIZE;
855 break;
856 }
857
858 /* The repeating kets try the rest of the pattern or restart from the
859 preceding bracket, in the appropriate order. The second "call" of match()
860 uses tail recursion, to avoid using another stack frame. */
861
862 if (*ecode == OP_KETRMIN)
863 {
864 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
866 ecode = prev;
867 goto TAIL_RECURSE;
868 }
869 else /* OP_KETRMAX */
870 {
871 md->match_function_type = MATCH_CBEGROUP;
872 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
874 ecode += 1 + LINK_SIZE;
875 goto TAIL_RECURSE;
876 }
877 /* Control never gets here */
878
879 /* Handle a capturing bracket, other than those that are possessive with an
880 unlimited repeat. If there is space in the offset vector, save the current
881 subject position in the working slot at the top of the vector. We mustn't
882 change the current values of the data slot, because they may be set from a
883 previous iteration of this group, and be referred to by a reference inside
884 the group. A failure to match might occur after the group has succeeded,
885 if something later on doesn't match. For this reason, we need to restore
886 the working value and also the values of the final offsets, in case they
887 were set by a previous iteration of the same bracket.
888
889 If there isn't enough space in the offset vector, treat this as if it were
890 a non-capturing bracket. Don't worry about setting the flag for the error
891 case here; that is handled in the code for KET. */
892
893 case OP_CBRA:
894 case OP_SCBRA:
895 number = GET2(ecode, 1+LINK_SIZE);
896 offset = number << 1;
897
898 #ifdef PCRE_DEBUG
899 printf("start bracket %d\n", number);
900 printf("subject=");
901 pchars(eptr, 16, TRUE, md);
902 printf("\n");
903 #endif
904
905 if (offset < md->offset_max)
906 {
907 save_offset1 = md->offset_vector[offset];
908 save_offset2 = md->offset_vector[offset+1];
909 save_offset3 = md->offset_vector[md->offset_end - number];
910 save_capture_last = md->capture_last;
911
912 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
913 md->offset_vector[md->offset_end - number] =
914 (int)(eptr - md->start_subject);
915
916 for (;;)
917 {
918 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
919 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
920 eptrb, RM1);
921 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
922
923 /* If we backed up to a THEN, check whether it is within the current
924 branch by comparing the address of the THEN that is passed back with
925 the end of the branch. If it is within the current branch, and the
926 branch is one of two or more alternatives (it either starts or ends
927 with OP_ALT), we have reached the limit of THEN's action, so convert
928 the return code to NOMATCH, which will cause normal backtracking to
929 happen from now on. Otherwise, THEN is passed back to an outer
930 alternative. This implements Perl's treatment of parenthesized groups,
931 where a group not containing | does not affect the current alternative,
932 that is, (X) is NOT the same as (X|(*F)). */
933
934 if (rrc == MATCH_THEN)
935 {
936 next = ecode + GET(ecode,1);
937 if (md->start_match_ptr < next &&
938 (*ecode == OP_ALT || *next == OP_ALT))
939 rrc = MATCH_NOMATCH;
940 }
941
942 /* Anything other than NOMATCH is passed back. */
943
944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
945 md->capture_last = save_capture_last;
946 ecode += GET(ecode, 1);
947 if (*ecode != OP_ALT) break;
948 }
949
950 DPRINTF(("bracket %d failed\n", number));
951 md->offset_vector[offset] = save_offset1;
952 md->offset_vector[offset+1] = save_offset2;
953 md->offset_vector[md->offset_end - number] = save_offset3;
954
955 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
956
957 if (md->mark == NULL) md->mark = markptr;
958 RRETURN(rrc);
959 }
960
961 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
962 as a non-capturing bracket. */
963
964 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966
967 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
968
969 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971
972 /* Non-capturing or atomic group, except for possessive with unlimited
973 repeat and ONCE group with no captures. Loop for all the alternatives.
974
975 When we get to the final alternative within the brackets, we used to return
976 the result of a recursive call to match() whatever happened so it was
977 possible to reduce stack usage by turning this into a tail recursion,
978 except in the case of a possibly empty group. However, now that there is
979 the possibility of (*THEN) occurring in the final alternative, this
980 optimization is no longer always possible.
981
982 We can optimize if we know there are no (*THEN)s in the pattern; at present
983 this is the best that can be done.
984
985 MATCH_ONCE is returned when the end of an atomic group is successfully
986 reached, but subsequent matching fails. It passes back up the tree (causing
987 captured values to be reset) until the original atomic group level is
988 reached. This is tested by comparing md->once_target with the start of the
989 group. At this point, the return is converted into MATCH_NOMATCH so that
990 previous backup points can be taken. */
991
992 case OP_ONCE:
993 case OP_BRA:
994 case OP_SBRA:
995 DPRINTF(("start non-capturing bracket\n"));
996
997 for (;;)
998 {
999 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1000
1001 /* If this is not a possibly empty group, and there are no (*THEN)s in
1002 the pattern, and this is the final alternative, optimize as described
1003 above. */
1004
1005 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1006 {
1007 ecode += PRIV(OP_lengths)[*ecode];
1008 goto TAIL_RECURSE;
1009 }
1010
1011 /* In all other cases, we have to make another call to match(). */
1012
1013 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1014 RM2);
1015
1016 /* See comment in the code for capturing groups above about handling
1017 THEN. */
1018
1019 if (rrc == MATCH_THEN)
1020 {
1021 next = ecode + GET(ecode,1);
1022 if (md->start_match_ptr < next &&
1023 (*ecode == OP_ALT || *next == OP_ALT))
1024 rrc = MATCH_NOMATCH;
1025 }
1026
1027 if (rrc != MATCH_NOMATCH)
1028 {
1029 if (rrc == MATCH_ONCE)
1030 {
1031 const pcre_uchar *scode = ecode;
1032 if (*scode != OP_ONCE) /* If not at start, find it */
1033 {
1034 while (*scode == OP_ALT) scode += GET(scode, 1);
1035 scode -= GET(scode, 1);
1036 }
1037 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1038 }
1039 RRETURN(rrc);
1040 }
1041 ecode += GET(ecode, 1);
1042 if (*ecode != OP_ALT) break;
1043 }
1044
1045 if (md->mark == NULL) md->mark = markptr;
1046 RRETURN(MATCH_NOMATCH);
1047
1048 /* Handle possessive capturing brackets with an unlimited repeat. We come
1049 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1050 handled similarly to the normal case above. However, the matching is
1051 different. The end of these brackets will always be OP_KETRPOS, which
1052 returns MATCH_KETRPOS without going further in the pattern. By this means
1053 we can handle the group by iteration rather than recursion, thereby
1054 reducing the amount of stack needed. */
1055
1056 case OP_CBRAPOS:
1057 case OP_SCBRAPOS:
1058 allow_zero = FALSE;
1059
1060 POSSESSIVE_CAPTURE:
1061 number = GET2(ecode, 1+LINK_SIZE);
1062 offset = number << 1;
1063
1064 #ifdef PCRE_DEBUG
1065 printf("start possessive bracket %d\n", number);
1066 printf("subject=");
1067 pchars(eptr, 16, TRUE, md);
1068 printf("\n");
1069 #endif
1070
1071 if (offset < md->offset_max)
1072 {
1073 matched_once = FALSE;
1074 code_offset = ecode - md->start_code;
1075
1076 save_offset1 = md->offset_vector[offset];
1077 save_offset2 = md->offset_vector[offset+1];
1078 save_offset3 = md->offset_vector[md->offset_end - number];
1079 save_capture_last = md->capture_last;
1080
1081 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1082
1083 /* Each time round the loop, save the current subject position for use
1084 when the group matches. For MATCH_MATCH, the group has matched, so we
1085 restart it with a new subject starting position, remembering that we had
1086 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1087 usual. If we haven't matched any alternatives in any iteration, check to
1088 see if a previous iteration matched. If so, the group has matched;
1089 continue from afterwards. Otherwise it has failed; restore the previous
1090 capture values before returning NOMATCH. */
1091
1092 for (;;)
1093 {
1094 md->offset_vector[md->offset_end - number] =
1095 (int)(eptr - md->start_subject);
1096 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1097 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1098 eptrb, RM63);
1099 if (rrc == MATCH_KETRPOS)
1100 {
1101 offset_top = md->end_offset_top;
1102 eptr = md->end_match_ptr;
1103 ecode = md->start_code + code_offset;
1104 save_capture_last = md->capture_last;
1105 matched_once = TRUE;
1106 continue;
1107 }
1108
1109 /* See comment in the code for capturing groups above about handling
1110 THEN. */
1111
1112 if (rrc == MATCH_THEN)
1113 {
1114 next = ecode + GET(ecode,1);
1115 if (md->start_match_ptr < next &&
1116 (*ecode == OP_ALT || *next == OP_ALT))
1117 rrc = MATCH_NOMATCH;
1118 }
1119
1120 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1121 md->capture_last = save_capture_last;
1122 ecode += GET(ecode, 1);
1123 if (*ecode != OP_ALT) break;
1124 }
1125
1126 if (!matched_once)
1127 {
1128 md->offset_vector[offset] = save_offset1;
1129 md->offset_vector[offset+1] = save_offset2;
1130 md->offset_vector[md->offset_end - number] = save_offset3;
1131 }
1132
1133 if (md->mark == NULL) md->mark = markptr;
1134 if (allow_zero || matched_once)
1135 {
1136 ecode += 1 + LINK_SIZE;
1137 break;
1138 }
1139
1140 RRETURN(MATCH_NOMATCH);
1141 }
1142
1143 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1144 as a non-capturing bracket. */
1145
1146 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1147 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1148
1149 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1150
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1153
1154 /* Non-capturing possessive bracket with unlimited repeat. We come here
1155 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1156 without the capturing complication. It is written out separately for speed
1157 and cleanliness. */
1158
1159 case OP_BRAPOS:
1160 case OP_SBRAPOS:
1161 allow_zero = FALSE;
1162
1163 POSSESSIVE_NON_CAPTURE:
1164 matched_once = FALSE;
1165 code_offset = ecode - md->start_code;
1166
1167 for (;;)
1168 {
1169 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1170 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1171 eptrb, RM48);
1172 if (rrc == MATCH_KETRPOS)
1173 {
1174 offset_top = md->end_offset_top;
1175 eptr = md->end_match_ptr;
1176 ecode = md->start_code + code_offset;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 ecode += GET(ecode, 1);
1194 if (*ecode != OP_ALT) break;
1195 }
1196
1197 if (matched_once || allow_zero)
1198 {
1199 ecode += 1 + LINK_SIZE;
1200 break;
1201 }
1202 RRETURN(MATCH_NOMATCH);
1203
1204 /* Control never reaches here. */
1205
1206 /* Conditional group: compilation checked that there are no more than
1207 two branches. If the condition is false, skipping the first branch takes us
1208 past the end if there is only one branch, but that's OK because that is
1209 exactly what going to the ket would do. */
1210
1211 case OP_COND:
1212 case OP_SCOND:
1213 codelink = GET(ecode, 1);
1214
1215 /* Because of the way auto-callout works during compile, a callout item is
1216 inserted between OP_COND and an assertion condition. */
1217
1218 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1219 {
1220 if (pcre_callout != NULL)
1221 {
1222 pcre_callout_block cb;
1223 cb.version = 2; /* Version 1 of the callout block */
1224 cb.callout_number = ecode[LINK_SIZE+2];
1225 cb.offset_vector = md->offset_vector;
1226 cb.subject = (PCRE_SPTR)md->start_subject;
1227 cb.subject_length = (int)(md->end_subject - md->start_subject);
1228 cb.start_match = (int)(mstart - md->start_subject);
1229 cb.current_position = (int)(eptr - md->start_subject);
1230 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1231 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1232 cb.capture_top = offset_top/2;
1233 cb.capture_last = md->capture_last;
1234 cb.callout_data = md->callout_data;
1235 cb.mark = (unsigned char *)markptr;
1236 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1237 if (rrc < 0) RRETURN(rrc);
1238 }
1239 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1240 }
1241
1242 condcode = ecode[LINK_SIZE+1];
1243
1244 /* Now see what the actual condition is */
1245
1246 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1247 {
1248 if (md->recursive == NULL) /* Not recursing => FALSE */
1249 {
1250 condition = FALSE;
1251 ecode += GET(ecode, 1);
1252 }
1253 else
1254 {
1255 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1256 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1257
1258 /* If the test is for recursion into a specific subpattern, and it is
1259 false, but the test was set up by name, scan the table to see if the
1260 name refers to any other numbers, and test them. The condition is true
1261 if any one is set. */
1262
1263 if (!condition && condcode == OP_NRREF)
1264 {
1265 pcre_uchar *slotA = md->name_table;
1266 for (i = 0; i < md->name_count; i++)
1267 {
1268 if (GET2(slotA, 0) == recno) break;
1269 slotA += md->name_entry_size;
1270 }
1271
1272 /* Found a name for the number - there can be only one; duplicate
1273 names for different numbers are allowed, but not vice versa. First
1274 scan down for duplicates. */
1275
1276 if (i < md->name_count)
1277 {
1278 pcre_uchar *slotB = slotA;
1279 while (slotB > md->name_table)
1280 {
1281 slotB -= md->name_entry_size;
1282 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1283 {
1284 condition = GET2(slotB, 0) == md->recursive->group_num;
1285 if (condition) break;
1286 }
1287 else break;
1288 }
1289
1290 /* Scan up for duplicates */
1291
1292 if (!condition)
1293 {
1294 slotB = slotA;
1295 for (i++; i < md->name_count; i++)
1296 {
1297 slotB += md->name_entry_size;
1298 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1299 {
1300 condition = GET2(slotB, 0) == md->recursive->group_num;
1301 if (condition) break;
1302 }
1303 else break;
1304 }
1305 }
1306 }
1307 }
1308
1309 /* Choose branch according to the condition */
1310
1311 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1312 }
1313 }
1314
1315 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1316 {
1317 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1318 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1319
1320 /* If the numbered capture is unset, but the reference was by name,
1321 scan the table to see if the name refers to any other numbers, and test
1322 them. The condition is true if any one is set. This is tediously similar
1323 to the code above, but not close enough to try to amalgamate. */
1324
1325 if (!condition && condcode == OP_NCREF)
1326 {
1327 int refno = offset >> 1;
1328 pcre_uchar *slotA = md->name_table;
1329
1330 for (i = 0; i < md->name_count; i++)
1331 {
1332 if (GET2(slotA, 0) == refno) break;
1333 slotA += md->name_entry_size;
1334 }
1335
1336 /* Found a name for the number - there can be only one; duplicate names
1337 for different numbers are allowed, but not vice versa. First scan down
1338 for duplicates. */
1339
1340 if (i < md->name_count)
1341 {
1342 pcre_uchar *slotB = slotA;
1343 while (slotB > md->name_table)
1344 {
1345 slotB -= md->name_entry_size;
1346 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1347 {
1348 offset = GET2(slotB, 0) << 1;
1349 condition = offset < offset_top &&
1350 md->offset_vector[offset] >= 0;
1351 if (condition) break;
1352 }
1353 else break;
1354 }
1355
1356 /* Scan up for duplicates */
1357
1358 if (!condition)
1359 {
1360 slotB = slotA;
1361 for (i++; i < md->name_count; i++)
1362 {
1363 slotB += md->name_entry_size;
1364 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1365 {
1366 offset = GET2(slotB, 0) << 1;
1367 condition = offset < offset_top &&
1368 md->offset_vector[offset] >= 0;
1369 if (condition) break;
1370 }
1371 else break;
1372 }
1373 }
1374 }
1375 }
1376
1377 /* Choose branch according to the condition */
1378
1379 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1380 }
1381
1382 else if (condcode == OP_DEF) /* DEFINE - always false */
1383 {
1384 condition = FALSE;
1385 ecode += GET(ecode, 1);
1386 }
1387
1388 /* The condition is an assertion. Call match() to evaluate it - setting
1389 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1390 an assertion. */
1391
1392 else
1393 {
1394 md->match_function_type = MATCH_CONDASSERT;
1395 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1396 if (rrc == MATCH_MATCH)
1397 {
1398 if (md->end_offset_top > offset_top)
1399 offset_top = md->end_offset_top; /* Captures may have happened */
1400 condition = TRUE;
1401 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1402 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1403 }
1404
1405 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1406 assertion; it is therefore treated as NOMATCH. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 else
1413 {
1414 condition = FALSE;
1415 ecode += codelink;
1416 }
1417 }
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, can
1420 use tail recursion to avoid using another stack frame, except when there is
1421 unlimited repeat of a possibly empty group. In the latter case, a recursive
1422 call to match() is always required, unless the second alternative doesn't
1423 exist, in which case we can just plough on. Note that, for compatibility
1424 with Perl, the | in a conditional group is NOT treated as creating two
1425 alternatives. If a THEN is encountered in the branch, it propagates out to
1426 the enclosing alternative (unless nested in a deeper set of alternatives,
1427 of course). */
1428
1429 if (condition || *ecode == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 ecode += 1 + LINK_SIZE;
1434 goto TAIL_RECURSE;
1435 }
1436
1437 md->match_function_type = MATCH_CBEGROUP;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1439 RRETURN(rrc);
1440 }
1441
1442 /* Condition false & no alternative; continue after the group. */
1443
1444 else
1445 {
1446 ecode += 1 + LINK_SIZE;
1447 }
1448 break;
1449
1450
1451 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1452 to close any currently open capturing brackets. */
1453
1454 case OP_CLOSE:
1455 number = GET2(ecode, 1);
1456 offset = number << 1;
1457
1458 #ifdef PCRE_DEBUG
1459 printf("end bracket %d at *ACCEPT", number);
1460 printf("\n");
1461 #endif
1462
1463 md->capture_last = number;
1464 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1465 {
1466 md->offset_vector[offset] =
1467 md->offset_vector[md->offset_end - number];
1468 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1469 if (offset_top <= offset) offset_top = offset + 2;
1470 }
1471 ecode += 1 + IMM2_SIZE;
1472 break;
1473
1474
1475 /* End of the pattern, either real or forced. */
1476
1477 case OP_END:
1478 case OP_ACCEPT:
1479 case OP_ASSERT_ACCEPT:
1480
1481 /* If we have matched an empty string, fail if not in an assertion and not
1482 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1483 is set and we have matched at the start of the subject. In both cases,
1484 backtracking will then try other alternatives, if any. */
1485
1486 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1487 md->recursive == NULL &&
1488 (md->notempty ||
1489 (md->notempty_atstart &&
1490 mstart == md->start_subject + md->start_offset)))
1491 MRRETURN(MATCH_NOMATCH);
1492
1493 /* Otherwise, we have a match. */
1494
1495 md->end_match_ptr = eptr; /* Record where we ended */
1496 md->end_offset_top = offset_top; /* and how many extracts were taken */
1497 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1498
1499 /* For some reason, the macros don't work properly if an expression is
1500 given as the argument to MRRETURN when the heap is in use. */
1501
1502 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1503 MRRETURN(rrc);
1504
1505 /* Assertion brackets. Check the alternative branches in turn - the
1506 matching won't pass the KET for an assertion. If any one branch matches,
1507 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1508 start of each branch to move the current point backwards, so the code at
1509 this level is identical to the lookahead case. When the assertion is part
1510 of a condition, we want to return immediately afterwards. The caller of
1511 this incarnation of the match() function will have set MATCH_CONDASSERT in
1512 md->match_function_type, and one of these opcodes will be the first opcode
1513 that is processed. We use a local variable that is preserved over calls to
1514 match() to remember this case. */
1515
1516 case OP_ASSERT:
1517 case OP_ASSERTBACK:
1518 if (md->match_function_type == MATCH_CONDASSERT)
1519 {
1520 condassert = TRUE;
1521 md->match_function_type = 0;
1522 }
1523 else condassert = FALSE;
1524
1525 do
1526 {
1527 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1528 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1529 {
1530 mstart = md->start_match_ptr; /* In case \K reset it */
1531 markptr = md->mark;
1532 break;
1533 }
1534
1535 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1536 as NOMATCH. */
1537
1538 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1539 ecode += GET(ecode, 1);
1540 }
1541 while (*ecode == OP_ALT);
1542
1543 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1544
1545 /* If checking an assertion for a condition, return MATCH_MATCH. */
1546
1547 if (condassert) RRETURN(MATCH_MATCH);
1548
1549 /* Continue from after the assertion, updating the offsets high water
1550 mark, since extracts may have been taken during the assertion. */
1551
1552 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1553 ecode += 1 + LINK_SIZE;
1554 offset_top = md->end_offset_top;
1555 continue;
1556
1557 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1558 PRUNE, or COMMIT means we must assume failure without checking subsequent
1559 branches. */
1560
1561 case OP_ASSERT_NOT:
1562 case OP_ASSERTBACK_NOT:
1563 if (md->match_function_type == MATCH_CONDASSERT)
1564 {
1565 condassert = TRUE;
1566 md->match_function_type = 0;
1567 }
1568 else condassert = FALSE;
1569
1570 do
1571 {
1572 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1573 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1574 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1575 {
1576 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1577 break;
1578 }
1579
1580 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1581 as NOMATCH. */
1582
1583 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1584 ecode += GET(ecode,1);
1585 }
1586 while (*ecode == OP_ALT);
1587
1588 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1589
1590 ecode += 1 + LINK_SIZE;
1591 continue;
1592
1593 /* Move the subject pointer back. This occurs only at the start of
1594 each branch of a lookbehind assertion. If we are too close to the start to
1595 move back, this match function fails. When working with UTF-8 we move
1596 back a number of characters, not bytes. */
1597
1598 case OP_REVERSE:
1599 #ifdef SUPPORT_UTF8
1600 if (utf)
1601 {
1602 i = GET(ecode, 1);
1603 while (i-- > 0)
1604 {
1605 eptr--;
1606 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1607 BACKCHAR(eptr);
1608 }
1609 }
1610 else
1611 #endif
1612
1613 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1614
1615 {
1616 eptr -= GET(ecode, 1);
1617 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1618 }
1619
1620 /* Save the earliest consulted character, then skip to next op code */
1621
1622 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1623 ecode += 1 + LINK_SIZE;
1624 break;
1625
1626 /* The callout item calls an external function, if one is provided, passing
1627 details of the match so far. This is mainly for debugging, though the
1628 function is able to force a failure. */
1629
1630 case OP_CALLOUT:
1631 if (pcre_callout != NULL)
1632 {
1633 pcre_callout_block cb;
1634 cb.version = 2; /* Version 1 of the callout block */
1635 cb.callout_number = ecode[1];
1636 cb.offset_vector = md->offset_vector;
1637 cb.subject = (PCRE_SPTR)md->start_subject;
1638 cb.subject_length = (int)(md->end_subject - md->start_subject);
1639 cb.start_match = (int)(mstart - md->start_subject);
1640 cb.current_position = (int)(eptr - md->start_subject);
1641 cb.pattern_position = GET(ecode, 2);
1642 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1643 cb.capture_top = offset_top/2;
1644 cb.capture_last = md->capture_last;
1645 cb.callout_data = md->callout_data;
1646 cb.mark = (unsigned char *)markptr;
1647 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1648 if (rrc < 0) RRETURN(rrc);
1649 }
1650 ecode += 2 + 2*LINK_SIZE;
1651 break;
1652
1653 /* Recursion either matches the current regex, or some subexpression. The
1654 offset data is the offset to the starting bracket from the start of the
1655 whole pattern. (This is so that it works from duplicated subpatterns.)
1656
1657 The state of the capturing groups is preserved over recursion, and
1658 re-instated afterwards. We don't know how many are started and not yet
1659 finished (offset_top records the completed total) so we just have to save
1660 all the potential data. There may be up to 65535 such values, which is too
1661 large to put on the stack, but using malloc for small numbers seems
1662 expensive. As a compromise, the stack is used when there are no more than
1663 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1664
1665 There are also other values that have to be saved. We use a chained
1666 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1667 for the original version of this logic. It has, however, been hacked around
1668 a lot, so he is not to blame for the current way it works. */
1669
1670 case OP_RECURSE:
1671 {
1672 recursion_info *ri;
1673 int recno;
1674
1675 callpat = md->start_code + GET(ecode, 1);
1676 recno = (callpat == md->start_code)? 0 :
1677 GET2(callpat, 1 + LINK_SIZE);
1678
1679 /* Check for repeating a recursion without advancing the subject pointer.
1680 This should catch convoluted mutual recursions. (Some simple cases are
1681 caught at compile time.) */
1682
1683 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1684 if (recno == ri->group_num && eptr == ri->subject_position)
1685 RRETURN(PCRE_ERROR_RECURSELOOP);
1686
1687 /* Add to "recursing stack" */
1688
1689 new_recursive.group_num = recno;
1690 new_recursive.subject_position = eptr;
1691 new_recursive.prevrec = md->recursive;
1692 md->recursive = &new_recursive;
1693
1694 /* Where to continue from afterwards */
1695
1696 ecode += 1 + LINK_SIZE;
1697
1698 /* Now save the offset data */
1699
1700 new_recursive.saved_max = md->offset_end;
1701 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1702 new_recursive.offset_save = stacksave;
1703 else
1704 {
1705 new_recursive.offset_save =
1706 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1707 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1708 }
1709 memcpy(new_recursive.offset_save, md->offset_vector,
1710 new_recursive.saved_max * sizeof(int));
1711
1712 /* OK, now we can do the recursion. After processing each alternative,
1713 restore the offset data. If there were nested recursions, md->recursive
1714 might be changed, so reset it before looping. */
1715
1716 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1717 cbegroup = (*callpat >= OP_SBRA);
1718 do
1719 {
1720 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1721 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1722 md, eptrb, RM6);
1723 memcpy(md->offset_vector, new_recursive.offset_save,
1724 new_recursive.saved_max * sizeof(int));
1725 md->recursive = new_recursive.prevrec;
1726 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1727 {
1728 DPRINTF(("Recursion matched\n"));
1729 if (new_recursive.offset_save != stacksave)
1730 (pcre_free)(new_recursive.offset_save);
1731
1732 /* Set where we got to in the subject, and reset the start in case
1733 it was changed by \K. This *is* propagated back out of a recursion,
1734 for Perl compatibility. */
1735
1736 eptr = md->end_match_ptr;
1737 mstart = md->start_match_ptr;
1738 goto RECURSION_MATCHED; /* Exit loop; end processing */
1739 }
1740
1741 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1742 as NOMATCH. */
1743
1744 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1745 {
1746 DPRINTF(("Recursion gave error %d\n", rrc));
1747 if (new_recursive.offset_save != stacksave)
1748 (pcre_free)(new_recursive.offset_save);
1749 RRETURN(rrc);
1750 }
1751
1752 md->recursive = &new_recursive;
1753 callpat += GET(callpat, 1);
1754 }
1755 while (*callpat == OP_ALT);
1756
1757 DPRINTF(("Recursion didn't match\n"));
1758 md->recursive = new_recursive.prevrec;
1759 if (new_recursive.offset_save != stacksave)
1760 (pcre_free)(new_recursive.offset_save);
1761 MRRETURN(MATCH_NOMATCH);
1762 }
1763
1764 RECURSION_MATCHED:
1765 break;
1766
1767 /* An alternation is the end of a branch; scan along to find the end of the
1768 bracketed group and go to there. */
1769
1770 case OP_ALT:
1771 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1772 break;
1773
1774 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1775 indicating that it may occur zero times. It may repeat infinitely, or not
1776 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1777 with fixed upper repeat limits are compiled as a number of copies, with the
1778 optional ones preceded by BRAZERO or BRAMINZERO. */
1779
1780 case OP_BRAZERO:
1781 next = ecode + 1;
1782 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784 do next += GET(next, 1); while (*next == OP_ALT);
1785 ecode = next + 1 + LINK_SIZE;
1786 break;
1787
1788 case OP_BRAMINZERO:
1789 next = ecode + 1;
1790 do next += GET(next, 1); while (*next == OP_ALT);
1791 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793 ecode++;
1794 break;
1795
1796 case OP_SKIPZERO:
1797 next = ecode+1;
1798 do next += GET(next,1); while (*next == OP_ALT);
1799 ecode = next + 1 + LINK_SIZE;
1800 break;
1801
1802 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1803 here; just jump to the group, with allow_zero set TRUE. */
1804
1805 case OP_BRAPOSZERO:
1806 op = *(++ecode);
1807 allow_zero = TRUE;
1808 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1809 goto POSSESSIVE_NON_CAPTURE;
1810
1811 /* End of a group, repeated or non-repeating. */
1812
1813 case OP_KET:
1814 case OP_KETRMIN:
1815 case OP_KETRMAX:
1816 case OP_KETRPOS:
1817 prev = ecode - GET(ecode, 1);
1818
1819 /* If this was a group that remembered the subject start, in order to break
1820 infinite repeats of empty string matches, retrieve the subject start from
1821 the chain. Otherwise, set it NULL. */
1822
1823 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1824 {
1825 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1826 eptrb = eptrb->epb_prev; /* Backup to previous group */
1827 }
1828 else saved_eptr = NULL;
1829
1830 /* If we are at the end of an assertion group or a non-capturing atomic
1831 group, stop matching and return MATCH_MATCH, but record the current high
1832 water mark for use by positive assertions. We also need to record the match
1833 start in case it was changed by \K. */
1834
1835 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1836 *prev == OP_ONCE_NC)
1837 {
1838 md->end_match_ptr = eptr; /* For ONCE_NC */
1839 md->end_offset_top = offset_top;
1840 md->start_match_ptr = mstart;
1841 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1842 }
1843
1844 /* For capturing groups we have to check the group number back at the start
1845 and if necessary complete handling an extraction by setting the offsets and
1846 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1847 into group 0, so it won't be picked up here. Instead, we catch it when the
1848 OP_END is reached. Other recursion is handled here. We just have to record
1849 the current subject position and start match pointer and give a MATCH
1850 return. */
1851
1852 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1853 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1854 {
1855 number = GET2(prev, 1+LINK_SIZE);
1856 offset = number << 1;
1857
1858 #ifdef PCRE_DEBUG
1859 printf("end bracket %d", number);
1860 printf("\n");
1861 #endif
1862
1863 /* Handle a recursively called group. */
1864
1865 if (md->recursive != NULL && md->recursive->group_num == number)
1866 {
1867 md->end_match_ptr = eptr;
1868 md->start_match_ptr = mstart;
1869 RRETURN(MATCH_MATCH);
1870 }
1871
1872 /* Deal with capturing */
1873
1874 md->capture_last = number;
1875 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1876 {
1877 /* If offset is greater than offset_top, it means that we are
1878 "skipping" a capturing group, and that group's offsets must be marked
1879 unset. In earlier versions of PCRE, all the offsets were unset at the
1880 start of matching, but this doesn't work because atomic groups and
1881 assertions can cause a value to be set that should later be unset.
1882 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1883 part of the atomic group, but this is not on the final matching path,
1884 so must be unset when 2 is set. (If there is no group 2, there is no
1885 problem, because offset_top will then be 2, indicating no capture.) */
1886
1887 if (offset > offset_top)
1888 {
1889 register int *iptr = md->offset_vector + offset_top;
1890 register int *iend = md->offset_vector + offset;
1891 while (iptr < iend) *iptr++ = -1;
1892 }
1893
1894 /* Now make the extraction */
1895
1896 md->offset_vector[offset] =
1897 md->offset_vector[md->offset_end - number];
1898 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1899 if (offset_top <= offset) offset_top = offset + 2;
1900 }
1901 }
1902
1903 /* For an ordinary non-repeating ket, just continue at this level. This
1904 also happens for a repeating ket if no characters were matched in the
1905 group. This is the forcible breaking of infinite loops as implemented in
1906 Perl 5.005. For a non-repeating atomic group that includes captures,
1907 establish a backup point by processing the rest of the pattern at a lower
1908 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1909 original OP_ONCE level, thereby bypassing intermediate backup points, but
1910 resetting any captures that happened along the way. */
1911
1912 if (*ecode == OP_KET || eptr == saved_eptr)
1913 {
1914 if (*prev == OP_ONCE)
1915 {
1916 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1917 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1918 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1919 RRETURN(MATCH_ONCE);
1920 }
1921 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1922 break;
1923 }
1924
1925 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1926 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1927 at a time from the outer level, thus saving stack. */
1928
1929 if (*ecode == OP_KETRPOS)
1930 {
1931 md->end_match_ptr = eptr;
1932 md->end_offset_top = offset_top;
1933 RRETURN(MATCH_KETRPOS);
1934 }
1935
1936 /* The normal repeating kets try the rest of the pattern or restart from
1937 the preceding bracket, in the appropriate order. In the second case, we can
1938 use tail recursion to avoid using another stack frame, unless we have an
1939 atomic group or an unlimited repeat of a group that can match an empty
1940 string. */
1941
1942 if (*ecode == OP_KETRMIN)
1943 {
1944 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1946 if (*prev == OP_ONCE)
1947 {
1948 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1950 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1951 RRETURN(MATCH_ONCE);
1952 }
1953 if (*prev >= OP_SBRA) /* Could match an empty string */
1954 {
1955 md->match_function_type = MATCH_CBEGROUP;
1956 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1957 RRETURN(rrc);
1958 }
1959 ecode = prev;
1960 goto TAIL_RECURSE;
1961 }
1962 else /* OP_KETRMAX */
1963 {
1964 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1965 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1966 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 if (*prev == OP_ONCE)
1969 {
1970 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1972 md->once_target = prev;
1973 RRETURN(MATCH_ONCE);
1974 }
1975 ecode += 1 + LINK_SIZE;
1976 goto TAIL_RECURSE;
1977 }
1978 /* Control never gets here */
1979
1980 /* Not multiline mode: start of subject assertion, unless notbol. */
1981
1982 case OP_CIRC:
1983 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1984
1985 /* Start of subject assertion */
1986
1987 case OP_SOD:
1988 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1989 ecode++;
1990 break;
1991
1992 /* Multiline mode: start of subject unless notbol, or after any newline. */
1993
1994 case OP_CIRCM:
1995 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1996 if (eptr != md->start_subject &&
1997 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1998 MRRETURN(MATCH_NOMATCH);
1999 ecode++;
2000 break;
2001
2002 /* Start of match assertion */
2003
2004 case OP_SOM:
2005 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
2006 ecode++;
2007 break;
2008
2009 /* Reset the start of match point */
2010
2011 case OP_SET_SOM:
2012 mstart = eptr;
2013 ecode++;
2014 break;
2015
2016 /* Multiline mode: assert before any newline, or before end of subject
2017 unless noteol is set. */
2018
2019 case OP_DOLLM:
2020 if (eptr < md->end_subject)
2021 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
2022 else
2023 {
2024 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2025 SCHECK_PARTIAL();
2026 }
2027 ecode++;
2028 break;
2029
2030 /* Not multiline mode: assert before a terminating newline or before end of
2031 subject unless noteol is set. */
2032
2033 case OP_DOLL:
2034 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2035 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2036
2037 /* ... else fall through for endonly */
2038
2039 /* End of subject assertion (\z) */
2040
2041 case OP_EOD:
2042 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
2043 SCHECK_PARTIAL();
2044 ecode++;
2045 break;
2046
2047 /* End of subject or ending \n assertion (\Z) */
2048
2049 case OP_EODN:
2050 ASSERT_NL_OR_EOS:
2051 if (eptr < md->end_subject &&
2052 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2053 MRRETURN(MATCH_NOMATCH);
2054
2055 /* Either at end of string or \n before end. */
2056
2057 SCHECK_PARTIAL();
2058 ecode++;
2059 break;
2060
2061 /* Word boundary assertions */
2062
2063 case OP_NOT_WORD_BOUNDARY:
2064 case OP_WORD_BOUNDARY:
2065 {
2066
2067 /* Find out if the previous and current characters are "word" characters.
2068 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2069 be "non-word" characters. Remember the earliest consulted character for
2070 partial matching. */
2071
2072 #ifdef SUPPORT_UTF
2073 if (utf)
2074 {
2075 /* Get status of previous character */
2076
2077 if (eptr == md->start_subject) prev_is_word = FALSE; else
2078 {
2079 PCRE_PUCHAR lastptr = eptr - 1;
2080 BACKCHAR(lastptr);
2081 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2082 GETCHAR(c, lastptr);
2083 #ifdef SUPPORT_UCP
2084 if (md->use_ucp)
2085 {
2086 if (c == '_') prev_is_word = TRUE; else
2087 {
2088 int cat = UCD_CATEGORY(c);
2089 prev_is_word = (cat == ucp_L || cat == ucp_N);
2090 }
2091 }
2092 else
2093 #endif
2094 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2095 }
2096
2097 /* Get status of next character */
2098
2099 if (eptr >= md->end_subject)
2100 {
2101 SCHECK_PARTIAL();
2102 cur_is_word = FALSE;
2103 }
2104 else
2105 {
2106 GETCHAR(c, eptr);
2107 #ifdef SUPPORT_UCP
2108 if (md->use_ucp)
2109 {
2110 if (c == '_') cur_is_word = TRUE; else
2111 {
2112 int cat = UCD_CATEGORY(c);
2113 cur_is_word = (cat == ucp_L || cat == ucp_N);
2114 }
2115 }
2116 else
2117 #endif
2118 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2119 }
2120 }
2121 else
2122 #endif
2123
2124 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2125 consistency with the behaviour of \w we do use it in this case. */
2126
2127 {
2128 /* Get status of previous character */
2129
2130 if (eptr == md->start_subject) prev_is_word = FALSE; else
2131 {
2132 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2133 #ifdef SUPPORT_UCP
2134 if (md->use_ucp)
2135 {
2136 c = eptr[-1];
2137 if (c == '_') prev_is_word = TRUE; else
2138 {
2139 int cat = UCD_CATEGORY(c);
2140 prev_is_word = (cat == ucp_L || cat == ucp_N);
2141 }
2142 }
2143 else
2144 #endif
2145 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2146 }
2147
2148 /* Get status of next character */
2149
2150 if (eptr >= md->end_subject)
2151 {
2152 SCHECK_PARTIAL();
2153 cur_is_word = FALSE;
2154 }
2155 else
2156 #ifdef SUPPORT_UCP
2157 if (md->use_ucp)
2158 {
2159 c = *eptr;
2160 if (c == '_') cur_is_word = TRUE; else
2161 {
2162 int cat = UCD_CATEGORY(c);
2163 cur_is_word = (cat == ucp_L || cat == ucp_N);
2164 }
2165 }
2166 else
2167 #endif
2168 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2169 }
2170
2171 /* Now see if the situation is what we want */
2172
2173 if ((*ecode++ == OP_WORD_BOUNDARY)?
2174 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2175 MRRETURN(MATCH_NOMATCH);
2176 }
2177 break;
2178
2179 /* Match a single character type; inline for speed */
2180
2181 case OP_ANY:
2182 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2183 /* Fall through */
2184
2185 case OP_ALLANY:
2186 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2187 { /* not be updated before SCHECK_PARTIAL. */
2188 SCHECK_PARTIAL();
2189 MRRETURN(MATCH_NOMATCH);
2190 }
2191 eptr++;
2192 #ifdef SUPPORT_UTF
2193 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2194 #endif
2195 ecode++;
2196 break;
2197
2198 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2199 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2200
2201 case OP_ANYBYTE:
2202 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2203 { /* not be updated before SCHECK_PARTIAL. */
2204 SCHECK_PARTIAL();
2205 MRRETURN(MATCH_NOMATCH);
2206 }
2207 eptr++;
2208 ecode++;
2209 break;
2210
2211 case OP_NOT_DIGIT:
2212 if (eptr >= md->end_subject)
2213 {
2214 SCHECK_PARTIAL();
2215 MRRETURN(MATCH_NOMATCH);
2216 }
2217 GETCHARINCTEST(c, eptr);
2218 if (
2219 #ifdef SUPPORT_UTF8
2220 c < 256 &&
2221 #endif
2222 (md->ctypes[c] & ctype_digit) != 0
2223 )
2224 MRRETURN(MATCH_NOMATCH);
2225 ecode++;
2226 break;
2227
2228 case OP_DIGIT:
2229 if (eptr >= md->end_subject)
2230 {
2231 SCHECK_PARTIAL();
2232 MRRETURN(MATCH_NOMATCH);
2233 }
2234 GETCHARINCTEST(c, eptr);
2235 if (
2236 #ifdef SUPPORT_UTF8
2237 c >= 256 ||
2238 #endif
2239 (md->ctypes[c] & ctype_digit) == 0
2240 )
2241 MRRETURN(MATCH_NOMATCH);
2242 ecode++;
2243 break;
2244
2245 case OP_NOT_WHITESPACE:
2246 if (eptr >= md->end_subject)
2247 {
2248 SCHECK_PARTIAL();
2249 MRRETURN(MATCH_NOMATCH);
2250 }
2251 GETCHARINCTEST(c, eptr);
2252 if (
2253 #ifdef SUPPORT_UTF8
2254 c < 256 &&
2255 #endif
2256 (md->ctypes[c] & ctype_space) != 0
2257 )
2258 MRRETURN(MATCH_NOMATCH);
2259 ecode++;
2260 break;
2261
2262 case OP_WHITESPACE:
2263 if (eptr >= md->end_subject)
2264 {
2265 SCHECK_PARTIAL();
2266 MRRETURN(MATCH_NOMATCH);
2267 }
2268 GETCHARINCTEST(c, eptr);
2269 if (
2270 #ifdef SUPPORT_UTF8
2271 c >= 256 ||
2272 #endif
2273 (md->ctypes[c] & ctype_space) == 0
2274 )
2275 MRRETURN(MATCH_NOMATCH);
2276 ecode++;
2277 break;
2278
2279 case OP_NOT_WORDCHAR:
2280 if (eptr >= md->end_subject)
2281 {
2282 SCHECK_PARTIAL();
2283 MRRETURN(MATCH_NOMATCH);
2284 }
2285 GETCHARINCTEST(c, eptr);
2286 if (
2287 #ifdef SUPPORT_UTF8
2288 c < 256 &&
2289 #endif
2290 (md->ctypes[c] & ctype_word) != 0
2291 )
2292 MRRETURN(MATCH_NOMATCH);
2293 ecode++;
2294 break;
2295
2296 case OP_WORDCHAR:
2297 if (eptr >= md->end_subject)
2298 {
2299 SCHECK_PARTIAL();
2300 MRRETURN(MATCH_NOMATCH);
2301 }
2302 GETCHARINCTEST(c, eptr);
2303 if (
2304 #ifdef SUPPORT_UTF8
2305 c >= 256 ||
2306 #endif
2307 (md->ctypes[c] & ctype_word) == 0
2308 )
2309 MRRETURN(MATCH_NOMATCH);
2310 ecode++;
2311 break;
2312
2313 case OP_ANYNL:
2314 if (eptr >= md->end_subject)
2315 {
2316 SCHECK_PARTIAL();
2317 MRRETURN(MATCH_NOMATCH);
2318 }
2319 GETCHARINCTEST(c, eptr);
2320 switch(c)
2321 {
2322 default: MRRETURN(MATCH_NOMATCH);
2323
2324 case 0x000d:
2325 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2326 break;
2327
2328 case 0x000a:
2329 break;
2330
2331 case 0x000b:
2332 case 0x000c:
2333 case 0x0085:
2334 case 0x2028:
2335 case 0x2029:
2336 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2337 break;
2338 }
2339 ecode++;
2340 break;
2341
2342 case OP_NOT_HSPACE:
2343 if (eptr >= md->end_subject)
2344 {
2345 SCHECK_PARTIAL();
2346 MRRETURN(MATCH_NOMATCH);
2347 }
2348 GETCHARINCTEST(c, eptr);
2349 switch(c)
2350 {
2351 default: break;
2352 case 0x09: /* HT */
2353 case 0x20: /* SPACE */
2354 case 0xa0: /* NBSP */
2355 case 0x1680: /* OGHAM SPACE MARK */
2356 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2357 case 0x2000: /* EN QUAD */
2358 case 0x2001: /* EM QUAD */
2359 case 0x2002: /* EN SPACE */
2360 case 0x2003: /* EM SPACE */
2361 case 0x2004: /* THREE-PER-EM SPACE */
2362 case 0x2005: /* FOUR-PER-EM SPACE */
2363 case 0x2006: /* SIX-PER-EM SPACE */
2364 case 0x2007: /* FIGURE SPACE */
2365 case 0x2008: /* PUNCTUATION SPACE */
2366 case 0x2009: /* THIN SPACE */
2367 case 0x200A: /* HAIR SPACE */
2368 case 0x202f: /* NARROW NO-BREAK SPACE */
2369 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2370 case 0x3000: /* IDEOGRAPHIC SPACE */
2371 MRRETURN(MATCH_NOMATCH);
2372 }
2373 ecode++;
2374 break;
2375
2376 case OP_HSPACE:
2377 if (eptr >= md->end_subject)
2378 {
2379 SCHECK_PARTIAL();
2380 MRRETURN(MATCH_NOMATCH);
2381 }
2382 GETCHARINCTEST(c, eptr);
2383 switch(c)
2384 {
2385 default: MRRETURN(MATCH_NOMATCH);
2386 case 0x09: /* HT */
2387 case 0x20: /* SPACE */
2388 case 0xa0: /* NBSP */
2389 case 0x1680: /* OGHAM SPACE MARK */
2390 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2391 case 0x2000: /* EN QUAD */
2392 case 0x2001: /* EM QUAD */
2393 case 0x2002: /* EN SPACE */
2394 case 0x2003: /* EM SPACE */
2395 case 0x2004: /* THREE-PER-EM SPACE */
2396 case 0x2005: /* FOUR-PER-EM SPACE */
2397 case 0x2006: /* SIX-PER-EM SPACE */
2398 case 0x2007: /* FIGURE SPACE */
2399 case 0x2008: /* PUNCTUATION SPACE */
2400 case 0x2009: /* THIN SPACE */
2401 case 0x200A: /* HAIR SPACE */
2402 case 0x202f: /* NARROW NO-BREAK SPACE */
2403 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2404 case 0x3000: /* IDEOGRAPHIC SPACE */
2405 break;
2406 }
2407 ecode++;
2408 break;
2409
2410 case OP_NOT_VSPACE:
2411 if (eptr >= md->end_subject)
2412 {
2413 SCHECK_PARTIAL();
2414 MRRETURN(MATCH_NOMATCH);
2415 }
2416 GETCHARINCTEST(c, eptr);
2417 switch(c)
2418 {
2419 default: break;
2420 case 0x0a: /* LF */
2421 case 0x0b: /* VT */
2422 case 0x0c: /* FF */
2423 case 0x0d: /* CR */
2424 case 0x85: /* NEL */
2425 case 0x2028: /* LINE SEPARATOR */
2426 case 0x2029: /* PARAGRAPH SEPARATOR */
2427 MRRETURN(MATCH_NOMATCH);
2428 }
2429 ecode++;
2430 break;
2431
2432 case OP_VSPACE:
2433 if (eptr >= md->end_subject)
2434 {
2435 SCHECK_PARTIAL();
2436 MRRETURN(MATCH_NOMATCH);
2437 }
2438 GETCHARINCTEST(c, eptr);
2439 switch(c)
2440 {
2441 default: MRRETURN(MATCH_NOMATCH);
2442 case 0x0a: /* LF */
2443 case 0x0b: /* VT */
2444 case 0x0c: /* FF */
2445 case 0x0d: /* CR */
2446 case 0x85: /* NEL */
2447 case 0x2028: /* LINE SEPARATOR */
2448 case 0x2029: /* PARAGRAPH SEPARATOR */
2449 break;
2450 }
2451 ecode++;
2452 break;
2453
2454 #ifdef SUPPORT_UCP
2455 /* Check the next character by Unicode property. We will get here only
2456 if the support is in the binary; otherwise a compile-time error occurs. */
2457
2458 case OP_PROP:
2459 case OP_NOTPROP:
2460 if (eptr >= md->end_subject)
2461 {
2462 SCHECK_PARTIAL();
2463 MRRETURN(MATCH_NOMATCH);
2464 }
2465 GETCHARINCTEST(c, eptr);
2466 {
2467 const ucd_record *prop = GET_UCD(c);
2468
2469 switch(ecode[1])
2470 {
2471 case PT_ANY:
2472 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2473 break;
2474
2475 case PT_LAMP:
2476 if ((prop->chartype == ucp_Lu ||
2477 prop->chartype == ucp_Ll ||
2478 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2479 MRRETURN(MATCH_NOMATCH);
2480 break;
2481
2482 case PT_GC:
2483 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2484 MRRETURN(MATCH_NOMATCH);
2485 break;
2486
2487 case PT_PC:
2488 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2489 MRRETURN(MATCH_NOMATCH);
2490 break;
2491
2492 case PT_SC:
2493 if ((ecode[2] != prop->script) == (op == OP_PROP))
2494 MRRETURN(MATCH_NOMATCH);
2495 break;
2496
2497 /* These are specials */
2498
2499 case PT_ALNUM:
2500 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2501 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2502 MRRETURN(MATCH_NOMATCH);
2503 break;
2504
2505 case PT_SPACE: /* Perl space */
2506 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2507 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2508 == (op == OP_NOTPROP))
2509 MRRETURN(MATCH_NOMATCH);
2510 break;
2511
2512 case PT_PXSPACE: /* POSIX space */
2513 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2514 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2515 c == CHAR_FF || c == CHAR_CR)
2516 == (op == OP_NOTPROP))
2517 MRRETURN(MATCH_NOMATCH);
2518 break;
2519
2520 case PT_WORD:
2521 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2522 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2523 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2524 MRRETURN(MATCH_NOMATCH);
2525 break;
2526
2527 /* This should never occur */
2528
2529 default:
2530 RRETURN(PCRE_ERROR_INTERNAL);
2531 }
2532
2533 ecode += 3;
2534 }
2535 break;
2536
2537 /* Match an extended Unicode sequence. We will get here only if the support
2538 is in the binary; otherwise a compile-time error occurs. */
2539
2540 case OP_EXTUNI:
2541 if (eptr >= md->end_subject)
2542 {
2543 SCHECK_PARTIAL();
2544 MRRETURN(MATCH_NOMATCH);
2545 }
2546 GETCHARINCTEST(c, eptr);
2547 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2548 while (eptr < md->end_subject)
2549 {
2550 int len = 1;
2551 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2552 if (UCD_CATEGORY(c) != ucp_M) break;
2553 eptr += len;
2554 }
2555 ecode++;
2556 break;
2557 #endif
2558
2559
2560 /* Match a back reference, possibly repeatedly. Look past the end of the
2561 item to see if there is repeat information following. The code is similar
2562 to that for character classes, but repeated for efficiency. Then obey
2563 similar code to character type repeats - written out again for speed.
2564 However, if the referenced string is the empty string, always treat
2565 it as matched, any number of times (otherwise there could be infinite
2566 loops). */
2567
2568 case OP_REF:
2569 case OP_REFI:
2570 caseless = op == OP_REFI;
2571 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2572 ecode += 1 + IMM2_SIZE;
2573
2574 /* If the reference is unset, there are two possibilities:
2575
2576 (a) In the default, Perl-compatible state, set the length negative;
2577 this ensures that every attempt at a match fails. We can't just fail
2578 here, because of the possibility of quantifiers with zero minima.
2579
2580 (b) If the JavaScript compatibility flag is set, set the length to zero
2581 so that the back reference matches an empty string.
2582
2583 Otherwise, set the length to the length of what was matched by the
2584 referenced subpattern. */
2585
2586 if (offset >= offset_top || md->offset_vector[offset] < 0)
2587 length = (md->jscript_compat)? 0 : -1;
2588 else
2589 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2590
2591 /* Set up for repetition, or handle the non-repeated case */
2592
2593 switch (*ecode)
2594 {
2595 case OP_CRSTAR:
2596 case OP_CRMINSTAR:
2597 case OP_CRPLUS:
2598 case OP_CRMINPLUS:
2599 case OP_CRQUERY:
2600 case OP_CRMINQUERY:
2601 c = *ecode++ - OP_CRSTAR;
2602 minimize = (c & 1) != 0;
2603 min = rep_min[c]; /* Pick up values from tables; */
2604 max = rep_max[c]; /* zero for max => infinity */
2605 if (max == 0) max = INT_MAX;
2606 break;
2607
2608 case OP_CRRANGE:
2609 case OP_CRMINRANGE:
2610 minimize = (*ecode == OP_CRMINRANGE);
2611 min = GET2(ecode, 1);
2612 max = GET2(ecode, 1 + IMM2_SIZE);
2613 if (max == 0) max = INT_MAX;
2614 ecode += 1 + 2 * IMM2_SIZE;
2615 break;
2616
2617 default: /* No repeat follows */
2618 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2619 {
2620 CHECK_PARTIAL();
2621 MRRETURN(MATCH_NOMATCH);
2622 }
2623 eptr += length;
2624 continue; /* With the main loop */
2625 }
2626
2627 /* Handle repeated back references. If the length of the reference is
2628 zero, just continue with the main loop. */
2629
2630 if (length == 0) continue;
2631
2632 /* First, ensure the minimum number of matches are present. We get back
2633 the length of the reference string explicitly rather than passing the
2634 address of eptr, so that eptr can be a register variable. */
2635
2636 for (i = 1; i <= min; i++)
2637 {
2638 int slength;
2639 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2640 {
2641 CHECK_PARTIAL();
2642 MRRETURN(MATCH_NOMATCH);
2643 }
2644 eptr += slength;
2645 }
2646
2647 /* If min = max, continue at the same level without recursion.
2648 They are not both allowed to be zero. */
2649
2650 if (min == max) continue;
2651
2652 /* If minimizing, keep trying and advancing the pointer */
2653
2654 if (minimize)
2655 {
2656 for (fi = min;; fi++)
2657 {
2658 int slength;
2659 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2660 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2661 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2662 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2663 {
2664 CHECK_PARTIAL();
2665 MRRETURN(MATCH_NOMATCH);
2666 }
2667 eptr += slength;
2668 }
2669 /* Control never gets here */
2670 }
2671
2672 /* If maximizing, find the longest string and work backwards */
2673
2674 else
2675 {
2676 pp = eptr;
2677 for (i = min; i < max; i++)
2678 {
2679 int slength;
2680 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2681 {
2682 CHECK_PARTIAL();
2683 break;
2684 }
2685 eptr += slength;
2686 }
2687 while (eptr >= pp)
2688 {
2689 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2690 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2691 eptr -= length;
2692 }
2693 MRRETURN(MATCH_NOMATCH);
2694 }
2695 /* Control never gets here */
2696
2697 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2698 used when all the characters in the class have values in the range 0-255,
2699 and either the matching is caseful, or the characters are in the range
2700 0-127 when UTF-8 processing is enabled. The only difference between
2701 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2702 encountered.
2703
2704 First, look past the end of the item to see if there is repeat information
2705 following. Then obey similar code to character type repeats - written out
2706 again for speed. */
2707
2708 case OP_NCLASS:
2709 case OP_CLASS:
2710 {
2711 /* The data variable is saved across frames, so the byte map needs to
2712 be stored there. */
2713 #define BYTE_MAP ((pcre_uint8 *)data)
2714 data = ecode + 1; /* Save for matching */
2715 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2716
2717 switch (*ecode)
2718 {
2719 case OP_CRSTAR:
2720 case OP_CRMINSTAR:
2721 case OP_CRPLUS:
2722 case OP_CRMINPLUS:
2723 case OP_CRQUERY:
2724 case OP_CRMINQUERY:
2725 c = *ecode++ - OP_CRSTAR;
2726 minimize = (c & 1) != 0;
2727 min = rep_min[c]; /* Pick up values from tables; */
2728 max = rep_max[c]; /* zero for max => infinity */
2729 if (max == 0) max = INT_MAX;
2730 break;
2731
2732 case OP_CRRANGE:
2733 case OP_CRMINRANGE:
2734 minimize = (*ecode == OP_CRMINRANGE);
2735 min = GET2(ecode, 1);
2736 max = GET2(ecode, 1 + IMM2_SIZE);
2737 if (max == 0) max = INT_MAX;
2738 ecode += 1 + 2 * IMM2_SIZE;
2739 break;
2740
2741 default: /* No repeat follows */
2742 min = max = 1;
2743 break;
2744 }
2745
2746 /* First, ensure the minimum number of matches are present. */
2747
2748 #ifdef SUPPORT_UTF
2749 if (utf)
2750 {
2751 for (i = 1; i <= min; i++)
2752 {
2753 if (eptr >= md->end_subject)
2754 {
2755 SCHECK_PARTIAL();
2756 MRRETURN(MATCH_NOMATCH);
2757 }
2758 GETCHARINC(c, eptr);
2759 if (c > 255)
2760 {
2761 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2762 }
2763 else
2764 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2765 }
2766 }
2767 else
2768 #endif
2769 /* Not UTF mode */
2770 {
2771 for (i = 1; i <= min; i++)
2772 {
2773 if (eptr >= md->end_subject)
2774 {
2775 SCHECK_PARTIAL();
2776 MRRETURN(MATCH_NOMATCH);
2777 }
2778 c = *eptr++;
2779 #ifndef COMPILE_PCRE8
2780 if (c > 255)
2781 {
2782 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2783 }
2784 else
2785 #endif
2786 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2787 }
2788 }
2789
2790 /* If max == min we can continue with the main loop without the
2791 need to recurse. */
2792
2793 if (min == max) continue;
2794
2795 /* If minimizing, keep testing the rest of the expression and advancing
2796 the pointer while it matches the class. */
2797
2798 if (minimize)
2799 {
2800 #ifdef SUPPORT_UTF
2801 if (utf)
2802 {
2803 for (fi = min;; fi++)
2804 {
2805 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2808 if (eptr >= md->end_subject)
2809 {
2810 SCHECK_PARTIAL();
2811 MRRETURN(MATCH_NOMATCH);
2812 }
2813 GETCHARINC(c, eptr);
2814 if (c > 255)
2815 {
2816 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2817 }
2818 else
2819 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2820 }
2821 }
2822 else
2823 #endif
2824 /* Not UTF mode */
2825 {
2826 for (fi = min;; fi++)
2827 {
2828 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2830 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2831 if (eptr >= md->end_subject)
2832 {
2833 SCHECK_PARTIAL();
2834 MRRETURN(MATCH_NOMATCH);
2835 }
2836 c = *eptr++;
2837 #ifndef COMPILE_PCRE8
2838 if (c > 255)
2839 {
2840 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2841 }
2842 else
2843 #endif
2844 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2845 }
2846 }
2847 /* Control never gets here */
2848 }
2849
2850 /* If maximizing, find the longest possible run, then work backwards. */
2851
2852 else
2853 {
2854 pp = eptr;
2855
2856 #ifdef SUPPORT_UTF
2857 if (utf)
2858 {
2859 for (i = min; i < max; i++)
2860 {
2861 int len = 1;
2862 if (eptr >= md->end_subject)
2863 {
2864 SCHECK_PARTIAL();
2865 break;
2866 }
2867 GETCHARLEN(c, eptr, len);
2868 if (c > 255)
2869 {
2870 if (op == OP_CLASS) break;
2871 }
2872 else
2873 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2874 eptr += len;
2875 }
2876 for (;;)
2877 {
2878 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2880 if (eptr-- == pp) break; /* Stop if tried at original pos */
2881 BACKCHAR(eptr);
2882 }
2883 }
2884 else
2885 #endif
2886 /* Not UTF mode */
2887 {
2888 for (i = min; i < max; i++)
2889 {
2890 if (eptr >= md->end_subject)
2891 {
2892 SCHECK_PARTIAL();
2893 break;
2894 }
2895 c = *eptr;
2896 #ifndef COMPILE_PCRE8
2897 if (c > 255)
2898 {
2899 if (op == OP_CLASS) break;
2900 }
2901 else
2902 #endif
2903 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2904 eptr++;
2905 }
2906 while (eptr >= pp)
2907 {
2908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910 eptr--;
2911 }
2912 }
2913
2914 MRRETURN(MATCH_NOMATCH);
2915 }
2916 #undef BYTE_MAP
2917 }
2918 /* Control never gets here */
2919
2920
2921 /* Match an extended character class. This opcode is encountered only
2922 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2923 mode, because Unicode properties are supported in non-UTF-8 mode. */
2924
2925 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2926 case OP_XCLASS:
2927 {
2928 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2929 ecode += GET(ecode, 1); /* Advance past the item */
2930
2931 switch (*ecode)
2932 {
2933 case OP_CRSTAR:
2934 case OP_CRMINSTAR:
2935 case OP_CRPLUS:
2936 case OP_CRMINPLUS:
2937 case OP_CRQUERY:
2938 case OP_CRMINQUERY:
2939 c = *ecode++ - OP_CRSTAR;
2940 minimize = (c & 1) != 0;
2941 min = rep_min[c]; /* Pick up values from tables; */
2942 max = rep_max[c]; /* zero for max => infinity */
2943 if (max == 0) max = INT_MAX;
2944 break;
2945
2946 case OP_CRRANGE:
2947 case OP_CRMINRANGE:
2948 minimize = (*ecode == OP_CRMINRANGE);
2949 min = GET2(ecode, 1);
2950 max = GET2(ecode, 1 + IMM2_SIZE);
2951 if (max == 0) max = INT_MAX;
2952 ecode += 1 + 2 * IMM2_SIZE;
2953 break;
2954
2955 default: /* No repeat follows */
2956 min = max = 1;
2957 break;
2958 }
2959
2960 /* First, ensure the minimum number of matches are present. */
2961
2962 for (i = 1; i <= min; i++)
2963 {
2964 if (eptr >= md->end_subject)
2965 {
2966 SCHECK_PARTIAL();
2967 MRRETURN(MATCH_NOMATCH);
2968 }
2969 GETCHARINCTEST(c, eptr);
2970 if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH);
2971 }
2972
2973 /* If max == min we can continue with the main loop without the
2974 need to recurse. */
2975
2976 if (min == max) continue;
2977
2978 /* If minimizing, keep testing the rest of the expression and advancing
2979 the pointer while it matches the class. */
2980
2981 if (minimize)
2982 {
2983 for (fi = min;; fi++)
2984 {
2985 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2986 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2987 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2988 if (eptr >= md->end_subject)
2989 {
2990 SCHECK_PARTIAL();
2991 MRRETURN(MATCH_NOMATCH);
2992 }
2993 GETCHARINCTEST(c, eptr);
2994 if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH);
2995 }
2996 /* Control never gets here */
2997 }
2998
2999 /* If maximizing, find the longest possible run, then work backwards. */
3000
3001 else
3002 {
3003 pp = eptr;
3004 for (i = min; i < max; i++)
3005 {
3006 int len = 1;
3007 if (eptr >= md->end_subject)
3008 {
3009 SCHECK_PARTIAL();
3010 break;
3011 }
3012 #ifdef SUPPORT_UTF
3013 GETCHARLENTEST(c, eptr, len);
3014 #else
3015 c = *eptr;
3016 #endif
3017 if (!PRIV(xclass)(c, data)) break;
3018 eptr += len;
3019 }
3020 for(;;)
3021 {
3022 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024 if (eptr-- == pp) break; /* Stop if tried at original pos */
3025 #ifdef SUPPORT_UTF
3026 if (utf) BACKCHAR(eptr);
3027 #endif
3028 }
3029 MRRETURN(MATCH_NOMATCH);
3030 }
3031
3032 /* Control never gets here */
3033 }
3034 #endif /* End of XCLASS */
3035
3036 /* Match a single character, casefully */
3037
3038 case OP_CHAR:
3039 #ifdef SUPPORT_UTF8
3040 if (utf)
3041 {
3042 length = 1;
3043 ecode++;
3044 GETCHARLEN(fc, ecode, length);
3045 if (length > md->end_subject - eptr)
3046 {
3047 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3048 MRRETURN(MATCH_NOMATCH);
3049 }
3050 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
3051 }
3052 else
3053 #endif
3054 /* Not UTF mode */
3055 {
3056 if (md->end_subject - eptr < 1)
3057 {
3058 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3059 MRRETURN(MATCH_NOMATCH);
3060 }
3061 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
3062 ecode += 2;
3063 }
3064 break;
3065
3066 /* Match a single character, caselessly */
3067
3068 case OP_CHARI:
3069 #ifdef SUPPORT_UTF
3070 if (utf)
3071 {
3072 length = 1;
3073 ecode++;
3074 GETCHARLEN(fc, ecode, length);
3075
3076 if (length > md->end_subject - eptr)
3077 {
3078 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3079 MRRETURN(MATCH_NOMATCH);
3080 }
3081
3082 /* If the pattern character's value is < 128, we have only one byte, and
3083 can use the fast lookup table. */
3084
3085 if (fc < 128)
3086 {
3087 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3088 }
3089
3090 /* Otherwise we must pick up the subject character */
3091
3092 else
3093 {
3094 unsigned int dc;
3095 GETCHARINC(dc, eptr);
3096 ecode += length;
3097
3098 /* If we have Unicode property support, we can use it to test the other
3099 case of the character, if there is one. */
3100
3101 if (fc != dc)
3102 {
3103 #ifdef SUPPORT_UCP
3104 if (dc != UCD_OTHERCASE(fc))
3105 #endif
3106 MRRETURN(MATCH_NOMATCH);
3107 }
3108 }
3109 }
3110 else
3111 #endif /* SUPPORT_UTF */
3112
3113 /* Not UTF mode */
3114 {
3115 if (md->end_subject - eptr < 1)
3116 {
3117 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3118 MRRETURN(MATCH_NOMATCH);
3119 }
3120 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3121 ecode += 2;
3122 }
3123 break;
3124
3125 /* Match a single character repeatedly. */
3126
3127 case OP_EXACT:
3128 case OP_EXACTI:
3129 min = max = GET2(ecode, 1);
3130 ecode += 1 + IMM2_SIZE;
3131 goto REPEATCHAR;
3132
3133 case OP_POSUPTO:
3134 case OP_POSUPTOI:
3135 possessive = TRUE;
3136 /* Fall through */
3137
3138 case OP_UPTO:
3139 case OP_UPTOI:
3140 case OP_MINUPTO:
3141 case OP_MINUPTOI:
3142 min = 0;
3143 max = GET2(ecode, 1);
3144 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3145 ecode += 1 + IMM2_SIZE;
3146 goto REPEATCHAR;
3147
3148 case OP_POSSTAR:
3149 case OP_POSSTARI:
3150 possessive = TRUE;
3151 min = 0;
3152 max = INT_MAX;
3153 ecode++;
3154 goto REPEATCHAR;
3155
3156 case OP_POSPLUS:
3157 case OP_POSPLUSI:
3158 possessive = TRUE;
3159 min = 1;
3160 max = INT_MAX;
3161 ecode++;
3162 goto REPEATCHAR;
3163
3164 case OP_POSQUERY:
3165 case OP_POSQUERYI:
3166 possessive = TRUE;
3167 min = 0;
3168 max = 1;
3169 ecode++;
3170 goto REPEATCHAR;
3171
3172 case OP_STAR:
3173 case OP_STARI:
3174 case OP_MINSTAR:
3175 case OP_MINSTARI:
3176 case OP_PLUS:
3177 case OP_PLUSI:
3178 case OP_MINPLUS:
3179 case OP_MINPLUSI:
3180 case OP_QUERY:
3181 case OP_QUERYI:
3182 case OP_MINQUERY:
3183 case OP_MINQUERYI:
3184 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3185 minimize = (c & 1) != 0;
3186 min = rep_min[c]; /* Pick up values from tables; */
3187 max = rep_max[c]; /* zero for max => infinity */
3188 if (max == 0) max = INT_MAX;
3189
3190 /* Common code for all repeated single-character matches. */
3191
3192 REPEATCHAR:
3193 #ifdef SUPPORT_UTF8
3194 if (utf)
3195 {
3196 length = 1;
3197 charptr = ecode;
3198 GETCHARLEN(fc, ecode, length);
3199 ecode += length;
3200
3201 /* Handle multibyte character matching specially here. There is
3202 support for caseless matching if UCP support is present. */
3203
3204 if (length > 1)
3205 {
3206 #ifdef SUPPORT_UCP
3207 unsigned int othercase;
3208 if (op >= OP_STARI && /* Caseless */
3209 (othercase = UCD_OTHERCASE(fc)) != fc)
3210 oclength = PRIV(ord2utf)(othercase, occhars);
3211 else oclength = 0;
3212 #endif /* SUPPORT_UCP */
3213
3214 for (i = 1; i <= min; i++)
3215 {
3216 if (eptr <= md->end_subject - length &&
3217 memcmp(eptr, charptr, length) == 0) eptr += length;
3218 #ifdef SUPPORT_UCP
3219 else if (oclength > 0 &&
3220 eptr <= md->end_subject - oclength &&
3221 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3222 #endif /* SUPPORT_UCP */
3223 else
3224 {
3225 CHECK_PARTIAL();
3226 MRRETURN(MATCH_NOMATCH);
3227 }
3228 }
3229
3230 if (min == max) continue;
3231
3232 if (minimize)
3233 {
3234 for (fi = min;; fi++)
3235 {
3236 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3237 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3238 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3239 if (eptr <= md->end_subject - length &&
3240 memcmp(eptr, charptr, length) == 0) eptr += length;
3241 #ifdef SUPPORT_UCP
3242 else if (oclength > 0 &&
3243 eptr <= md->end_subject - oclength &&
3244 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3245 #endif /* SUPPORT_UCP */
3246 else
3247 {
3248 CHECK_PARTIAL();
3249 MRRETURN(MATCH_NOMATCH);
3250 }
3251 }
3252 /* Control never gets here */
3253 }
3254
3255 else /* Maximize */
3256 {
3257 pp = eptr;
3258 for (i = min; i < max; i++)
3259 {
3260 if (eptr <= md->end_subject - length &&
3261 memcmp(eptr, charptr, length) == 0) eptr += length;
3262 #ifdef SUPPORT_UCP
3263 else if (oclength > 0 &&
3264 eptr <= md->end_subject - oclength &&
3265 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3266 #endif /* SUPPORT_UCP */
3267 else
3268 {
3269 CHECK_PARTIAL();
3270 break;
3271 }
3272 }
3273
3274 if (possessive) continue;
3275
3276 for(;;)
3277 {
3278 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3280 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3281 #ifdef SUPPORT_UCP
3282 eptr--;
3283 BACKCHAR(eptr);
3284 #else /* without SUPPORT_UCP */
3285 eptr -= length;
3286 #endif /* SUPPORT_UCP */
3287 }
3288 }
3289 /* Control never gets here */
3290 }
3291
3292 /* If the length of a UTF-8 character is 1, we fall through here, and
3293 obey the code as for non-UTF-8 characters below, though in this case the
3294 value of fc will always be < 128. */
3295 }
3296 else
3297 #endif /* SUPPORT_UTF8 */
3298
3299 /* When not in UTF-8 mode, load a single-byte character. */
3300
3301 fc = *ecode++;
3302
3303 /* The value of fc at this point is always less than 256, though we may or
3304 may not be in UTF-8 mode. The code is duplicated for the caseless and
3305 caseful cases, for speed, since matching characters is likely to be quite
3306 common. First, ensure the minimum number of matches are present. If min =
3307 max, continue at the same level without recursing. Otherwise, if
3308 minimizing, keep trying the rest of the expression and advancing one
3309 matching character if failing, up to the maximum. Alternatively, if
3310 maximizing, find the maximum number of characters and work backwards. */
3311
3312 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3313 max, eptr));
3314
3315 if (op >= OP_STARI) /* Caseless */
3316 {
3317 fc = md->lcc[fc];
3318 for (i = 1; i <= min; i++)
3319 {
3320 if (eptr >= md->end_subject)
3321 {
3322 SCHECK_PARTIAL();
3323 MRRETURN(MATCH_NOMATCH);
3324 }
3325 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3326 }
3327 if (min == max) continue;
3328 if (minimize)
3329 {
3330 for (fi = min;; fi++)
3331 {
3332 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3333 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3334 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3335 if (eptr >= md->end_subject)
3336 {
3337 SCHECK_PARTIAL();
3338 MRRETURN(MATCH_NOMATCH);
3339 }
3340 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3341 }
3342 /* Control never gets here */
3343 }
3344 else /* Maximize */
3345 {
3346 pp = eptr;
3347 for (i = min; i < max; i++)
3348 {
3349 if (eptr >= md->end_subject)
3350 {
3351 SCHECK_PARTIAL();
3352 break;
3353 }
3354 if (fc != md->lcc[*eptr]) break;
3355 eptr++;
3356 }
3357
3358 if (possessive) continue;
3359
3360 while (eptr >= pp)
3361 {
3362 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3363 eptr--;
3364 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3365 }
3366 MRRETURN(MATCH_NOMATCH);
3367 }
3368 /* Control never gets here */
3369 }
3370
3371 /* Caseful comparisons (includes all multi-byte characters) */
3372
3373 else
3374 {
3375 for (i = 1; i <= min; i++)
3376 {
3377 if (eptr >= md->end_subject)
3378 {
3379 SCHECK_PARTIAL();
3380 MRRETURN(MATCH_NOMATCH);
3381 }
3382 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3383 }
3384
3385 if (min == max) continue;
3386
3387 if (minimize)
3388 {
3389 for (fi = min;; fi++)
3390 {
3391 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3392 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3393 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3394 if (eptr >= md->end_subject)
3395 {
3396 SCHECK_PARTIAL();
3397 MRRETURN(MATCH_NOMATCH);
3398 }
3399 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3400 }
3401 /* Control never gets here */
3402 }
3403 else /* Maximize */
3404 {
3405 pp = eptr;
3406 for (i = min; i < max; i++)
3407 {
3408 if (eptr >= md->end_subject)
3409 {
3410 SCHECK_PARTIAL();
3411 break;
3412 }
3413 if (fc != *eptr) break;
3414 eptr++;
3415 }
3416 if (possessive) continue;
3417
3418 while (eptr >= pp)
3419 {
3420 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3421 eptr--;
3422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3423 }
3424 MRRETURN(MATCH_NOMATCH);
3425 }
3426 }
3427 /* Control never gets here */
3428
3429 /* Match a negated single one-byte character. The character we are
3430 checking can be multibyte. */
3431
3432 case OP_NOT:
3433 case OP_NOTI:
3434 if (eptr >= md->end_subject)
3435 {
3436 SCHECK_PARTIAL();
3437 MRRETURN(MATCH_NOMATCH);
3438 }
3439 ecode++;
3440 GETCHARINCTEST(c, eptr);
3441 if (op == OP_NOTI) /* The caseless case */
3442 {
3443 #ifdef SUPPORT_UTF8
3444 if (c < 256)
3445 #endif
3446 c = md->lcc[c];
3447 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3448 }
3449 else /* Caseful */
3450 {
3451 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3452 }
3453 break;
3454
3455 /* Match a negated single one-byte character repeatedly. This is almost a
3456 repeat of the code for a repeated single character, but I haven't found a
3457 nice way of commoning these up that doesn't require a test of the
3458 positive/negative option for each character match. Maybe that wouldn't add
3459 very much to the time taken, but character matching *is* what this is all
3460 about... */
3461
3462 case OP_NOTEXACT:
3463 case OP_NOTEXACTI:
3464 min = max = GET2(ecode, 1);
3465 ecode += 1 + IMM2_SIZE;
3466 goto REPEATNOTCHAR;
3467
3468 case OP_NOTUPTO:
3469 case OP_NOTUPTOI:
3470 case OP_NOTMINUPTO:
3471 case OP_NOTMINUPTOI:
3472 min = 0;
3473 max = GET2(ecode, 1);
3474 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3475 ecode += 1 + IMM2_SIZE;
3476 goto REPEATNOTCHAR;
3477
3478 case OP_NOTPOSSTAR:
3479 case OP_NOTPOSSTARI:
3480 possessive = TRUE;
3481 min = 0;
3482 max = INT_MAX;
3483 ecode++;
3484 goto REPEATNOTCHAR;
3485
3486 case OP_NOTPOSPLUS:
3487 case OP_NOTPOSPLUSI:
3488 possessive = TRUE;
3489 min = 1;
3490 max = INT_MAX;
3491 ecode++;
3492 goto REPEATNOTCHAR;
3493
3494 case OP_NOTPOSQUERY:
3495 case OP_NOTPOSQUERYI:
3496 possessive = TRUE;
3497 min = 0;
3498 max = 1;
3499 ecode++;
3500 goto REPEATNOTCHAR;
3501
3502 case OP_NOTPOSUPTO:
3503 case OP_NOTPOSUPTOI:
3504 possessive = TRUE;
3505 min = 0;
3506 max = GET2(ecode, 1);
3507 ecode += 1 + IMM2_SIZE;
3508 goto REPEATNOTCHAR;
3509
3510 case OP_NOTSTAR:
3511 case OP_NOTSTARI:
3512 case OP_NOTMINSTAR:
3513 case OP_NOTMINSTARI:
3514 case OP_NOTPLUS:
3515 case OP_NOTPLUSI:
3516 case OP_NOTMINPLUS:
3517 case OP_NOTMINPLUSI:
3518 case OP_NOTQUERY:
3519 case OP_NOTQUERYI:
3520 case OP_NOTMINQUERY:
3521 case OP_NOTMINQUERYI:
3522 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3523 minimize = (c & 1) != 0;
3524 min = rep_min[c]; /* Pick up values from tables; */
3525 max = rep_max[c]; /* zero for max => infinity */
3526 if (max == 0) max = INT_MAX;
3527
3528 /* Common code for all repeated single-byte matches. */
3529
3530 REPEATNOTCHAR:
3531 fc = *ecode++;
3532
3533 /* The code is duplicated for the caseless and caseful cases, for speed,
3534 since matching characters is likely to be quite common. First, ensure the
3535 minimum number of matches are present. If min = max, continue at the same
3536 level without recursing. Otherwise, if minimizing, keep trying the rest of
3537 the expression and advancing one matching character if failing, up to the
3538 maximum. Alternatively, if maximizing, find the maximum number of
3539 characters and work backwards. */
3540
3541 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3542 max, eptr));
3543
3544 if (op >= OP_NOTSTARI) /* Caseless */
3545 {
3546 fc = md->lcc[fc];
3547
3548 #ifdef SUPPORT_UTF8
3549 if (utf)
3550 {
3551 register unsigned int d;
3552 for (i = 1; i <= min; i++)
3553 {
3554 if (eptr >= md->end_subject)
3555 {
3556 SCHECK_PARTIAL();
3557 MRRETURN(MATCH_NOMATCH);
3558 }
3559 GETCHARINC(d, eptr);
3560 if (d < 256) d = md->lcc[d];
3561 if (fc == d) MRRETURN(MATCH_NOMATCH);
3562 }
3563 }
3564 else
3565 #endif
3566 /* Not UTF mode */
3567 {
3568 for (i = 1; i <= min; i++)
3569 {
3570 if (eptr >= md->end_subject)
3571 {
3572 SCHECK_PARTIAL();
3573 MRRETURN(MATCH_NOMATCH);
3574 }
3575 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3576 }
3577 }
3578
3579 if (min == max) continue;
3580
3581 if (minimize)
3582 {
3583 #ifdef SUPPORT_UTF8
3584 if (utf)
3585 {
3586 register unsigned int d;
3587 for (fi = min;; fi++)
3588 {
3589 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3590 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3591 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3592 if (eptr >= md->end_subject)
3593 {
3594 SCHECK_PARTIAL();
3595 MRRETURN(MATCH_NOMATCH);
3596 }
3597 GETCHARINC(d, eptr);
3598 if (d < 256) d = md->lcc[d];
3599 if (fc == d) MRRETURN(MATCH_NOMATCH);
3600 }
3601 }
3602 else
3603 #endif
3604 /* Not UTF mode */
3605 {
3606 for (fi = min;; fi++)
3607 {
3608 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3609 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3610 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3611 if (eptr >= md->end_subject)
3612 {
3613 SCHECK_PARTIAL();
3614 MRRETURN(MATCH_NOMATCH);
3615 }
3616 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3617 }
3618 }
3619 /* Control never gets here */
3620 }
3621
3622 /* Maximize case */
3623
3624 else
3625 {
3626 pp = eptr;
3627
3628 #ifdef SUPPORT_UTF8
3629 if (utf)
3630 {
3631 register unsigned int d;
3632 for (i = min; i < max; i++)
3633 {
3634 int len = 1;
3635 if (eptr >= md->end_subject)
3636 {
3637 SCHECK_PARTIAL();
3638 break;
3639 }
3640 GETCHARLEN(d, eptr, len);
3641 if (d < 256) d = md->lcc[d];
3642 if (fc == d) break;
3643 eptr += len;
3644 }
3645 if (possessive) continue;
3646 for(;;)
3647 {
3648 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3650 if (eptr-- == pp) break; /* Stop if tried at original pos */
3651 BACKCHAR(eptr);
3652 }
3653 }
3654 else
3655 #endif
3656 /* Not UTF mode */
3657 {
3658 for (i = min; i < max; i++)
3659 {
3660 if (eptr >= md->end_subject)
3661 {
3662 SCHECK_PARTIAL();
3663 break;
3664 }
3665 if (fc == md->lcc[*eptr]) break;
3666 eptr++;
3667 }
3668 if (possessive) continue;
3669 while (eptr >= pp)
3670 {
3671 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3672 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3673 eptr--;
3674 }
3675 }
3676
3677 MRRETURN(MATCH_NOMATCH);
3678 }
3679 /* Control never gets here */
3680 }
3681
3682 /* Caseful comparisons */
3683
3684 else
3685 {
3686 #ifdef SUPPORT_UTF8
3687 if (utf)
3688 {
3689 register unsigned int d;
3690 for (i = 1; i <= min; i++)
3691 {
3692 if (eptr >= md->end_subject)
3693 {
3694 SCHECK_PARTIAL();
3695 MRRETURN(MATCH_NOMATCH);
3696 }
3697 GETCHARINC(d, eptr);
3698 if (fc == d) MRRETURN(MATCH_NOMATCH);
3699 }
3700 }
3701 else
3702 #endif
3703 /* Not UTF mode */
3704 {
3705 for (i = 1; i <= min; i++)
3706 {
3707 if (eptr >= md->end_subject)
3708 {
3709 SCHECK_PARTIAL();
3710 MRRETURN(MATCH_NOMATCH);
3711 }
3712 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3713 }
3714 }
3715
3716 if (min == max) continue;
3717
3718 if (minimize)
3719 {
3720 #ifdef SUPPORT_UTF8
3721 if (utf)
3722 {
3723 register unsigned int d;
3724 for (fi = min;; fi++)
3725 {
3726 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3727 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3728 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3729 if (eptr >= md->end_subject)
3730 {
3731 SCHECK_PARTIAL();
3732 MRRETURN(MATCH_NOMATCH);
3733 }
3734 GETCHARINC(d, eptr);
3735 if (fc == d) MRRETURN(MATCH_NOMATCH);
3736 }
3737 }
3738 else
3739 #endif
3740 /* Not UTF mode */
3741 {
3742 for (fi = min;; fi++)
3743 {
3744 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3745 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3746 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3747 if (eptr >= md->end_subject)
3748 {
3749 SCHECK_PARTIAL();
3750 MRRETURN(MATCH_NOMATCH);
3751 }
3752 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3753 }
3754 }
3755 /* Control never gets here */
3756 }
3757
3758 /* Maximize case */
3759
3760 else
3761 {
3762 pp = eptr;
3763
3764 #ifdef SUPPORT_UTF8
3765 if (utf)
3766 {
3767 register unsigned int d;
3768 for (i = min; i < max; i++)
3769 {
3770 int len = 1;
3771 if (eptr >= md->end_subject)
3772 {
3773 SCHECK_PARTIAL();
3774 break;
3775 }
3776 GETCHARLEN(d, eptr, len);
3777 if (fc == d) break;
3778 eptr += len;
3779 }
3780 if (possessive) continue;
3781 for(;;)
3782 {
3783 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3785 if (eptr-- == pp) break; /* Stop if tried at original pos */
3786 BACKCHAR(eptr);
3787 }
3788 }
3789 else
3790 #endif
3791 /* Not UTF mode */
3792 {
3793 for (i = min; i < max; i++)
3794 {
3795 if (eptr >= md->end_subject)
3796 {
3797 SCHECK_PARTIAL();
3798 break;
3799 }
3800 if (fc == *eptr) break;
3801 eptr++;
3802 }
3803 if (possessive) continue;
3804 while (eptr >= pp)
3805 {
3806 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3808 eptr--;
3809 }
3810 }
3811
3812 MRRETURN(MATCH_NOMATCH);
3813 }
3814 }
3815 /* Control never gets here */
3816
3817 /* Match a single character type repeatedly; several different opcodes
3818 share code. This is very similar to the code for single characters, but we
3819 repeat it in the interests of efficiency. */
3820
3821 case OP_TYPEEXACT:
3822 min = max = GET2(ecode, 1);
3823 minimize = TRUE;
3824 ecode += 1 + IMM2_SIZE;
3825 goto REPEATTYPE;
3826
3827 case OP_TYPEUPTO:
3828 case OP_TYPEMINUPTO:
3829 min = 0;
3830 max = GET2(ecode, 1);
3831 minimize = *ecode == OP_TYPEMINUPTO;
3832 ecode += 1 + IMM2_SIZE;
3833 goto REPEATTYPE;
3834
3835 case OP_TYPEPOSSTAR:
3836 possessive = TRUE;
3837 min = 0;
3838 max = INT_MAX;
3839 ecode++;
3840 goto REPEATTYPE;
3841
3842 case OP_TYPEPOSPLUS:
3843 possessive = TRUE;
3844 min = 1;
3845 max = INT_MAX;
3846 ecode++;
3847 goto REPEATTYPE;
3848
3849 case OP_TYPEPOSQUERY:
3850 possessive = TRUE;
3851 min = 0;
3852 max = 1;
3853 ecode++;
3854 goto REPEATTYPE;
3855
3856 case OP_TYPEPOSUPTO:
3857 possessive = TRUE;
3858 min = 0;
3859 max = GET2(ecode, 1);
3860 ecode += 1 + IMM2_SIZE;
3861 goto REPEATTYPE;
3862
3863 case OP_TYPESTAR:
3864 case OP_TYPEMINSTAR:
3865 case OP_TYPEPLUS:
3866 case OP_TYPEMINPLUS:
3867 case OP_TYPEQUERY:
3868 case OP_TYPEMINQUERY:
3869 c = *ecode++ - OP_TYPESTAR;
3870 minimize = (c & 1) != 0;
3871 min = rep_min[c]; /* Pick up values from tables; */
3872 max = rep_max[c]; /* zero for max => infinity */
3873 if (max == 0) max = INT_MAX;
3874
3875 /* Common code for all repeated single character type matches. Note that
3876 in UTF-8 mode, '.' matches a character of any length, but for the other
3877 character types, the valid characters are all one-byte long. */
3878
3879 REPEATTYPE:
3880 ctype = *ecode++; /* Code for the character type */
3881
3882 #ifdef SUPPORT_UCP
3883 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3884 {
3885 prop_fail_result = ctype == OP_NOTPROP;
3886 prop_type = *ecode++;
3887 prop_value = *ecode++;
3888 }
3889 else prop_type = -1;
3890 #endif
3891
3892 /* First, ensure the minimum number of matches are present. Use inline
3893 code for maximizing the speed, and do the type test once at the start
3894 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3895 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3896 and single-bytes. */
3897
3898 if (min > 0)
3899 {
3900 #ifdef SUPPORT_UCP
3901 if (prop_type >= 0)
3902 {
3903 switch(prop_type)
3904 {
3905 case PT_ANY:
3906 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3907 for (i = 1; i <= min; i++)
3908 {
3909 if (eptr >= md->end_subject)
3910 {
3911 SCHECK_PARTIAL();
3912 MRRETURN(MATCH_NOMATCH);
3913 }
3914 GETCHARINCTEST(c, eptr);
3915 }
3916 break;
3917
3918 case PT_LAMP:
3919 for (i = 1; i <= min; i++)
3920 {
3921 int chartype;
3922 if (eptr >= md->end_subject)
3923 {
3924 SCHECK_PARTIAL();
3925 MRRETURN(MATCH_NOMATCH);
3926 }
3927 GETCHARINCTEST(c, eptr);
3928 chartype = UCD_CHARTYPE(c);
3929 if ((chartype == ucp_Lu ||
3930 chartype == ucp_Ll ||
3931 chartype == ucp_Lt) == prop_fail_result)
3932 MRRETURN(MATCH_NOMATCH);
3933 }
3934 break;
3935
3936 case PT_GC:
3937 for (i = 1; i <= min; i++)
3938 {
3939 if (eptr >= md->end_subject)
3940 {
3941 SCHECK_PARTIAL();
3942 MRRETURN(MATCH_NOMATCH);
3943 }
3944 GETCHARINCTEST(c, eptr);
3945 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3946 MRRETURN(MATCH_NOMATCH);
3947 }
3948 break;
3949
3950 case PT_PC:
3951 for (i = 1; i <= min; i++)
3952 {
3953 if (eptr >= md->end_subject)
3954 {
3955 SCHECK_PARTIAL();
3956 MRRETURN(MATCH_NOMATCH);
3957 }
3958 GETCHARINCTEST(c, eptr);
3959 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3960 MRRETURN(MATCH_NOMATCH);
3961 }
3962 break;
3963
3964 case PT_SC:
3965 for (i = 1; i <= min; i++)
3966 {
3967 if (eptr >= md->end_subject)
3968 {
3969 SCHECK_PARTIAL();
3970 MRRETURN(MATCH_NOMATCH);
3971 }
3972 GETCHARINCTEST(c, eptr);
3973 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3974 MRRETURN(MATCH_NOMATCH);
3975 }
3976 break;
3977
3978 case PT_ALNUM:
3979 for (i = 1; i <= min; i++)
3980 {
3981 int category;
3982 if (eptr >= md->end_subject)
3983 {
3984 SCHECK_PARTIAL();
3985 MRRETURN(MATCH_NOMATCH);
3986 }
3987 GETCHARINCTEST(c, eptr);
3988 category = UCD_CATEGORY(c);
3989 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3990 MRRETURN(MATCH_NOMATCH);
3991 }
3992 break;
3993
3994 case PT_SPACE: /* Perl space */
3995 for (i = 1; i <= min; i++)
3996 {
3997 if (eptr >= md->end_subject)
3998 {
3999 SCHECK_PARTIAL();
4000 MRRETURN(MATCH_NOMATCH);
4001 }
4002 GETCHARINCTEST(c, eptr);
4003 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4004 c == CHAR_FF || c == CHAR_CR)
4005 == prop_fail_result)
4006 MRRETURN(MATCH_NOMATCH);
4007 }
4008 break;
4009
4010 case PT_PXSPACE: /* POSIX space */
4011 for (i = 1; i <= min; i++)
4012 {
4013 if (eptr >= md->end_subject)
4014 {
4015 SCHECK_PARTIAL();
4016 MRRETURN(MATCH_NOMATCH);
4017 }
4018 GETCHARINCTEST(c, eptr);
4019 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4020 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4021 == prop_fail_result)
4022 MRRETURN(MATCH_NOMATCH);
4023 }
4024 break;
4025
4026 case PT_WORD:
4027 for (i = 1; i <= min; i++)
4028 {
4029 int category;
4030 if (eptr >= md->end_subject)
4031 {
4032 SCHECK_PARTIAL();
4033 MRRETURN(MATCH_NOMATCH);
4034 }
4035 GETCHARINCTEST(c, eptr);
4036 category = UCD_CATEGORY(c);
4037 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4038 == prop_fail_result)
4039 MRRETURN(MATCH_NOMATCH);
4040 }
4041 break;
4042
4043 /* This should not occur */
4044
4045 default:
4046 RRETURN(PCRE_ERROR_INTERNAL);
4047 }
4048 }
4049
4050 /* Match extended Unicode sequences. We will get here only if the
4051 support is in the binary; otherwise a compile-time error occurs. */
4052
4053 else if (ctype == OP_EXTUNI)
4054 {
4055 for (i = 1; i <= min; i++)
4056 {
4057 if (eptr >= md->end_subject)
4058 {
4059 SCHECK_PARTIAL();
4060 MRRETURN(MATCH_NOMATCH);
4061 }
4062 GETCHARINCTEST(c, eptr);
4063 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4064 while (eptr < md->end_subject)
4065 {
4066 int len = 1;
4067 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4068 if (UCD_CATEGORY(c) != ucp_M) break;
4069 eptr += len;
4070 }
4071 }
4072 }
4073
4074 else
4075 #endif /* SUPPORT_UCP */
4076
4077 /* Handle all other cases when the coding is UTF-8 */
4078
4079 #ifdef SUPPORT_UTF
4080 if (utf) switch(ctype)
4081 {
4082 case OP_ANY:
4083 for (i = 1; i <= min; i++)
4084 {
4085 if (eptr >= md->end_subject)
4086 {
4087 SCHECK_PARTIAL();
4088 MRRETURN(MATCH_NOMATCH);
4089 }
4090 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4091 eptr++;
4092 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4093 }
4094 break;
4095
4096 case OP_ALLANY:
4097 for (i = 1; i <= min; i++)
4098 {
4099 if (eptr >= md->end_subject)
4100 {
4101 SCHECK_PARTIAL();
4102 MRRETURN(MATCH_NOMATCH);
4103 }
4104 eptr++;
4105 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4106 }
4107 break;
4108
4109 case OP_ANYBYTE:
4110 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4111 eptr += min;
4112 break;
4113
4114 case OP_ANYNL:
4115 for (i = 1; i <= min; i++)
4116 {
4117 if (eptr >= md->end_subject)
4118 {
4119 SCHECK_PARTIAL();
4120 MRRETURN(MATCH_NOMATCH);
4121 }
4122 GETCHARINC(c, eptr);
4123 switch(c)
4124 {
4125 default: MRRETURN(MATCH_NOMATCH);
4126
4127 case 0x000d:
4128 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4129 break;
4130
4131 case 0x000a:
4132 break;
4133
4134 case 0x000b:
4135 case 0x000c:
4136 case 0x0085:
4137 case 0x2028:
4138 case 0x2029:
4139 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4140 break;
4141 }
4142 }
4143 break;
4144
4145 case OP_NOT_HSPACE:
4146 for (i = 1; i <= min; i++)
4147 {
4148 if (eptr >= md->end_subject)
4149 {
4150 SCHECK_PARTIAL();
4151 MRRETURN(MATCH_NOMATCH);
4152 }
4153 GETCHARINC(c, eptr);
4154 switch(c)
4155 {
4156 default: break;
4157 case 0x09: /* HT */
4158 case 0x20: /* SPACE */
4159 case 0xa0: /* NBSP */
4160 case 0x1680: /* OGHAM SPACE MARK */
4161 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4162 case 0x2000: /* EN QUAD */
4163 case 0x2001: /* EM QUAD */
4164 case 0x2002: /* EN SPACE */
4165 case 0x2003: /* EM SPACE */
4166 case 0x2004: /* THREE-PER-EM SPACE */
4167 case 0x2005: /* FOUR-PER-EM SPACE */
4168 case 0x2006: /* SIX-PER-EM SPACE */
4169 case 0x2007: /* FIGURE SPACE */
4170 case 0x2008: /* PUNCTUATION SPACE */
4171 case 0x2009: /* THIN SPACE */
4172 case 0x200A: /* HAIR SPACE */
4173 case 0x202f: /* NARROW NO-BREAK SPACE */
4174 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4175 case 0x3000: /* IDEOGRAPHIC SPACE */
4176 MRRETURN(MATCH_NOMATCH);
4177 }
4178 }
4179 break;
4180
4181 case OP_HSPACE:
4182 for (i = 1; i <= min; i++)
4183 {
4184 if (eptr >= md->end_subject)
4185 {
4186 SCHECK_PARTIAL();
4187 MRRETURN(MATCH_NOMATCH);
4188 }
4189 GETCHARINC(c, eptr);
4190 switch(c)
4191 {
4192 default: MRRETURN(MATCH_NOMATCH);
4193 case 0x09: /* HT */
4194 case 0x20: /* SPACE */
4195 case 0xa0: /* NBSP */
4196 case 0x1680: /* OGHAM SPACE MARK */
4197 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4198 case 0x2000: /* EN QUAD */
4199 case 0x2001: /* EM QUAD */
4200 case 0x2002: /* EN SPACE */
4201 case 0x2003: /* EM SPACE */
4202 case 0x2004: /* THREE-PER-EM SPACE */
4203 case 0x2005: /* FOUR-PER-EM SPACE */
4204 case 0x2006: /* SIX-PER-EM SPACE */
4205 case 0x2007: /* FIGURE SPACE */
4206 case 0x2008: /* PUNCTUATION SPACE */
4207 case 0x2009: /* THIN SPACE */
4208 case 0x200A: /* HAIR SPACE */
4209 case 0x202f: /* NARROW NO-BREAK SPACE */
4210 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4211 case 0x3000: /* IDEOGRAPHIC SPACE */
4212 break;
4213 }
4214 }
4215 break;
4216
4217 case OP_NOT_VSPACE:
4218 for (i = 1; i <= min; i++)
4219 {
4220 if (eptr >= md->end_subject)
4221 {
4222 SCHECK_PARTIAL();
4223 MRRETURN(MATCH_NOMATCH);
4224 }
4225 GETCHARINC(c, eptr);
4226 switch(c)
4227 {
4228 default: break;
4229 case 0x0a: /* LF */
4230 case 0x0b: /* VT */
4231 case 0x0c: /* FF */
4232 case 0x0d: /* CR */
4233 case 0x85: /* NEL */
4234 case 0x2028: /* LINE SEPARATOR */
4235 case 0x2029: /* PARAGRAPH SEPARATOR */
4236 MRRETURN(MATCH_NOMATCH);
4237 }
4238 }
4239 break;
4240
4241 case OP_VSPACE:
4242 for (i = 1; i <= min; i++)
4243 {
4244 if (eptr >= md->end_subject)
4245 {
4246 SCHECK_PARTIAL();
4247 MRRETURN(MATCH_NOMATCH);
4248 }
4249 GETCHARINC(c, eptr);
4250 switch(c)
4251 {
4252 default: MRRETURN(MATCH_NOMATCH);
4253 case 0x0a: /* LF */
4254 case 0x0b: /* VT */
4255 case 0x0c: /* FF */
4256 case 0x0d: /* CR */
4257 case 0x85: /* NEL */
4258 case 0x2028: /* LINE SEPARATOR */
4259 case 0x2029: /* PARAGRAPH SEPARATOR */
4260 break;
4261 }
4262 }
4263 break;
4264
4265 case OP_NOT_DIGIT:
4266 for (i = 1; i <= min; i++)
4267 {
4268 if (eptr >= md->end_subject)
4269 {
4270 SCHECK_PARTIAL();
4271 MRRETURN(MATCH_NOMATCH);
4272 }
4273 GETCHARINC(c, eptr);
4274 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4275 MRRETURN(MATCH_NOMATCH);
4276 }
4277 break;
4278
4279 case OP_DIGIT:
4280 for (i = 1; i <= min; i++)
4281 {
4282 if (eptr >= md->end_subject)
4283 {
4284 SCHECK_PARTIAL();
4285 MRRETURN(MATCH_NOMATCH);
4286 }
4287 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4288 MRRETURN(MATCH_NOMATCH);
4289 /* No need to skip more bytes - we know it's a 1-byte character */
4290 }
4291 break;
4292
4293 case OP_NOT_WHITESPACE:
4294 for (i = 1; i <= min; i++)
4295 {
4296 if (eptr >= md->end_subject)
4297 {
4298 SCHECK_PARTIAL();
4299 MRRETURN(MATCH_NOMATCH);
4300 }
4301 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4302 MRRETURN(MATCH_NOMATCH);
4303 eptr++;
4304 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4305 }
4306 break;
4307
4308 case OP_WHITESPACE:
4309 for (i = 1; i <= min; i++)
4310 {
4311 if (eptr >= md->end_subject)
4312 {
4313 SCHECK_PARTIAL();
4314 MRRETURN(MATCH_NOMATCH);
4315 }
4316 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4317 MRRETURN(MATCH_NOMATCH);
4318 /* No need to skip more bytes - we know it's a 1-byte character */
4319 }
4320 break;
4321
4322 case OP_NOT_WORDCHAR:
4323 for (i = 1; i <= min; i++)
4324 {
4325 if (eptr >= md->end_subject)
4326 {
4327 SCHECK_PARTIAL();
4328 MRRETURN(MATCH_NOMATCH);
4329 }
4330 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4331 MRRETURN(MATCH_NOMATCH);
4332 eptr++;
4333 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4334 }
4335 break;
4336
4337 case OP_WORDCHAR:
4338 for (i = 1; i <= min; i++)
4339 {
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 MRRETURN(MATCH_NOMATCH);
4344 }
4345 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4346 MRRETURN(MATCH_NOMATCH);
4347 /* No need to skip more bytes - we know it's a 1-byte character */
4348 }
4349 break;
4350
4351 default:
4352 RRETURN(PCRE_ERROR_INTERNAL);
4353 } /* End switch(ctype) */
4354
4355 else
4356 #endif /* SUPPORT_UTF */
4357
4358 /* Code for the non-UTF-8 case for minimum matching of operators other
4359 than OP_PROP and OP_NOTPROP. */
4360
4361 switch(ctype)
4362 {
4363 case OP_ANY:
4364 for (i = 1; i <= min; i++)
4365 {
4366 if (eptr >= md->end_subject)
4367 {
4368 SCHECK_PARTIAL();
4369 MRRETURN(MATCH_NOMATCH);
4370 }
4371 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4372 eptr++;
4373 }
4374 break;
4375
4376 case OP_ALLANY:
4377 if (eptr > md->end_subject - min)
4378 {
4379 SCHECK_PARTIAL();
4380 MRRETURN(MATCH_NOMATCH);
4381 }
4382 eptr += min;
4383 break;
4384
4385 case OP_ANYBYTE:
4386 if (eptr > md->end_subject - min)
4387 {
4388 SCHECK_PARTIAL();
4389 MRRETURN(MATCH_NOMATCH);
4390 }
4391 eptr += min;
4392 break;
4393
4394 case OP_ANYNL:
4395 for (i = 1; i <= min; i++)
4396 {
4397 if (eptr >= md->end_subject)
4398 {
4399 SCHECK_PARTIAL();
4400 MRRETURN(MATCH_NOMATCH);
4401 }
4402 switch(*eptr++)
4403 {
4404 default: MRRETURN(MATCH_NOMATCH);
4405
4406 case 0x000d:
4407 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4408 break;
4409
4410 case 0x000a:
4411 break;
4412
4413 case 0x000b:
4414 case 0x000c:
4415 case 0x0085:
4416 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4417 break;
4418 }
4419 }
4420 break;
4421
4422 case OP_NOT_HSPACE:
4423 for (i = 1; i <= min; i++)
4424 {
4425 if (eptr >= md->end_subject)
4426 {
4427 SCHECK_PARTIAL();
4428 MRRETURN(MATCH_NOMATCH);
4429 }
4430 switch(*eptr++)
4431 {
4432 default: break;
4433 case 0x09: /* HT */
4434 case 0x20: /* SPACE */
4435 case 0xa0: /* NBSP */
4436 MRRETURN(MATCH_NOMATCH);
4437 }
4438 }
4439 break;
4440
4441 case OP_HSPACE:
4442 for (i = 1; i <= min; i++)
4443 {
4444 if (eptr >= md->end_subject)
4445 {
4446 SCHECK_PARTIAL();
4447 MRRETURN(MATCH_NOMATCH);
4448 }
4449 switch(*eptr++)
4450 {
4451 default: MRRETURN(MATCH_NOMATCH);
4452 case 0x09: /* HT */
4453 case 0x20: /* SPACE */
4454 case 0xa0: /* NBSP */
4455 break;
4456 }
4457 }
4458 break;
4459
4460 case OP_NOT_VSPACE:
4461 for (i = 1; i <= min; i++)
4462 {
4463 if (eptr >= md->end_subject)
4464 {
4465 SCHECK_PARTIAL();
4466 MRRETURN(MATCH_NOMATCH);
4467 }
4468 switch(*eptr++)
4469 {
4470 default: break;
4471 case 0x0a: /* LF */
4472 case 0x0b: /* VT */
4473 case 0x0c: /* FF */
4474 case 0x0d: /* CR */
4475 case 0x85: /* NEL */
4476 MRRETURN(MATCH_NOMATCH);
4477 }
4478 }
4479 break;
4480
4481 case OP_VSPACE:
4482 for (i = 1; i <= min; i++)
4483 {
4484 if (eptr >= md->end_subject)
4485 {
4486 SCHECK_PARTIAL();
4487 MRRETURN(MATCH_NOMATCH);
4488 }
4489 switch(*eptr++)
4490 {
4491 default: MRRETURN(MATCH_NOMATCH);
4492 case 0x0a: /* LF */
4493 case 0x0b: /* VT */
4494 case 0x0c: /* FF */
4495 case 0x0d: /* CR */
4496 case 0x85: /* NEL */
4497 break;
4498 }
4499 }
4500 break;
4501
4502 case OP_NOT_DIGIT:
4503 for (i = 1; i <= min; i++)
4504 {
4505 if (eptr >= md->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 MRRETURN(MATCH_NOMATCH);
4509 }
4510 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4511 }
4512 break;
4513
4514 case OP_DIGIT:
4515 for (i = 1; i <= min; i++)
4516 {
4517 if (eptr >= md->end_subject)
4518 {
4519 SCHECK_PARTIAL();
4520 MRRETURN(MATCH_NOMATCH);
4521 }
4522 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4523 }
4524 break;
4525
4526 case OP_NOT_WHITESPACE:
4527 for (i = 1; i <= min; i++)
4528 {
4529 if (eptr >= md->end_subject)
4530 {
4531 SCHECK_PARTIAL();
4532 MRRETURN(MATCH_NOMATCH);
4533 }
4534 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4535 }
4536 break;
4537
4538 case OP_WHITESPACE:
4539 for (i = 1; i <= min; i++)
4540 {
4541 if (eptr >= md->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 MRRETURN(MATCH_NOMATCH);
4545 }
4546 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4547 }
4548 break;
4549
4550 case OP_NOT_WORDCHAR:
4551 for (i = 1; i <= min; i++)
4552 {
4553 if (eptr >= md->end_subject)
4554 {
4555 SCHECK_PARTIAL();
4556 MRRETURN(MATCH_NOMATCH);
4557 }
4558 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4559 MRRETURN(MATCH_NOMATCH);
4560 }
4561 break;
4562
4563 case OP_WORDCHAR:
4564 for (i = 1; i <= min; i++)
4565 {
4566 if (eptr >= md->end_subject)
4567 {
4568 SCHECK_PARTIAL();
4569 MRRETURN(MATCH_NOMATCH);
4570 }
4571 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4572 MRRETURN(MATCH_NOMATCH);
4573 }
4574 break;
4575
4576 default:
4577 RRETURN(PCRE_ERROR_INTERNAL);
4578 }
4579 }
4580
4581 /* If min = max, continue at the same level without recursing */
4582
4583 if (min == max) continue;
4584
4585 /* If minimizing, we have to test the rest of the pattern before each
4586 subsequent match. Again, separate the UTF-8 case for speed, and also
4587 separate the UCP cases. */
4588
4589 if (minimize)
4590 {
4591 #ifdef SUPPORT_UCP
4592 if (prop_type >= 0)
4593 {
4594 switch(prop_type)
4595 {
4596 case PT_ANY:
4597 for (fi = min;; fi++)
4598 {
4599 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4600 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4601 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4602 if (eptr >= md->end_subject)
4603 {
4604 SCHECK_PARTIAL();
4605 MRRETURN(MATCH_NOMATCH);
4606 }
4607 GETCHARINCTEST(c, eptr);
4608 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4609 }
4610 /* Control never gets here */
4611
4612 case PT_LAMP:
4613 for (fi = min;; fi++)
4614 {
4615 int chartype;
4616 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4617 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4618 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4619 if (eptr >= md->end_subject)
4620 {
4621 SCHECK_PARTIAL();
4622 MRRETURN(MATCH_NOMATCH);
4623 }
4624 GETCHARINCTEST(c, eptr);
4625 chartype = UCD_CHARTYPE(c);
4626 if ((chartype == ucp_Lu ||
4627 chartype == ucp_Ll ||
4628 chartype == ucp_Lt) == prop_fail_result)
4629 MRRETURN(MATCH_NOMATCH);
4630 }
4631 /* Control never gets here */
4632
4633 case PT_GC:
4634 for (fi = min;; fi++)
4635 {
4636 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4637 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4638 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4639 if (eptr >= md->end_subject)
4640 {
4641 SCHECK_PARTIAL();
4642 MRRETURN(MATCH_NOMATCH);
4643 }
4644 GETCHARINCTEST(c, eptr);
4645 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4646 MRRETURN(MATCH_NOMATCH);
4647 }
4648 /* Control never gets here */
4649
4650 case PT_PC:
4651 for (fi = min;; fi++)
4652 {
4653 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4655 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4656 if (eptr >= md->end_subject)
4657 {
4658 SCHECK_PARTIAL();
4659 MRRETURN(MATCH_NOMATCH);
4660 }
4661 GETCHARINCTEST(c, eptr);
4662 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4663 MRRETURN(MATCH_NOMATCH);
4664 }
4665 /* Control never gets here */
4666
4667 case PT_SC:
4668 for (fi = min;; fi++)
4669 {
4670 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4671 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4672 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4673 if (eptr >= md->end_subject)
4674 {
4675 SCHECK_PARTIAL();
4676 MRRETURN(MATCH_NOMATCH);
4677 }
4678 GETCHARINCTEST(c, eptr);
4679 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4680 MRRETURN(MATCH_NOMATCH);
4681 }
4682 /* Control never gets here */
4683
4684 case PT_ALNUM:
4685 for (fi = min;; fi++)
4686 {
4687 int category;
4688 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4689 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4690 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4691 if (eptr >= md->end_subject)
4692 {
4693 SCHECK_PARTIAL();
4694 MRRETURN(MATCH_NOMATCH);
4695 }
4696 GETCHARINCTEST(c, eptr);
4697 category = UCD_CATEGORY(c);
4698 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4699 MRRETURN(MATCH_NOMATCH);
4700 }
4701 /* Control never gets here */
4702
4703 case PT_SPACE: /* Perl space */
4704 for (fi = min;; fi++)
4705 {
4706 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4708 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4709 if (eptr >= md->end_subject)
4710 {
4711 SCHECK_PARTIAL();
4712 MRRETURN(MATCH_NOMATCH);
4713 }
4714 GETCHARINCTEST(c, eptr);
4715 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4716 c == CHAR_FF || c == CHAR_CR)
4717 == prop_fail_result)
4718 MRRETURN(MATCH_NOMATCH);
4719 }
4720 /* Control never gets here */
4721
4722 case PT_PXSPACE: /* POSIX space */
4723 for (fi = min;; fi++)
4724 {
4725 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4726 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4727 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4728 if (eptr >= md->end_subject)
4729 {
4730 SCHECK_PARTIAL();
4731 MRRETURN(MATCH_NOMATCH);
4732 }
4733 GETCHARINCTEST(c, eptr);
4734 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4735 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4736 == prop_fail_result)
4737 MRRETURN(MATCH_NOMATCH);
4738 }
4739 /* Control never gets here */
4740
4741 case PT_WORD:
4742 for (fi = min;; fi++)
4743 {
4744 int category;
4745 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4746 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4747 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4748 if (eptr >= md->end_subject)
4749 {
4750 SCHECK_PARTIAL();
4751 MRRETURN(MATCH_NOMATCH);
4752 }
4753 GETCHARINCTEST(c, eptr);
4754 category = UCD_CATEGORY(c);
4755 if ((category == ucp_L ||
4756 category == ucp_N ||
4757 c == CHAR_UNDERSCORE)
4758 == prop_fail_result)
4759 MRRETURN(MATCH_NOMATCH);
4760 }
4761 /* Control never gets here */
4762
4763 /* This should never occur */
4764
4765 default:
4766 RRETURN(PCRE_ERROR_INTERNAL);
4767 }
4768 }
4769
4770 /* Match extended Unicode sequences. We will get here only if the
4771 support is in the binary; otherwise a compile-time error occurs. */
4772
4773 else if (ctype == OP_EXTUNI)
4774 {
4775 for (fi = min;; fi++)
4776 {
4777 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4779 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4780 if (eptr >= md->end_subject)
4781 {
4782 SCHECK_PARTIAL();
4783 MRRETURN(MATCH_NOMATCH);
4784 }
4785 GETCHARINCTEST(c, eptr);
4786 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4787 while (eptr < md->end_subject)
4788 {
4789 int len = 1;
4790 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4791 if (UCD_CATEGORY(c) != ucp_M) break;
4792 eptr += len;
4793 }
4794 }
4795 }
4796 else
4797 #endif /* SUPPORT_UCP */
4798
4799 #ifdef SUPPORT_UTF8
4800 if (utf)
4801 {
4802 for (fi = min;; fi++)
4803 {
4804 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4805 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4806 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4807 if (eptr >= md->end_subject)
4808 {
4809 SCHECK_PARTIAL();
4810 MRRETURN(MATCH_NOMATCH);
4811 }
4812 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4813 MRRETURN(MATCH_NOMATCH);
4814 GETCHARINC(c, eptr);
4815 switch(ctype)
4816 {
4817 case OP_ANY: /* This is the non-NL case */
4818 case OP_ALLANY:
4819 case OP_ANYBYTE:
4820 break;
4821
4822 case OP_ANYNL:
4823 switch(c)
4824 {
4825 default: MRRETURN(MATCH_NOMATCH);
4826 case 0x000d:
4827 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4828 break;
4829 case 0x000a:
4830 break;
4831
4832 case 0x000b:
4833 case 0x000c:
4834 case 0x0085:
4835 case 0x2028:
4836 case 0x2029:
4837 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4838 break;
4839 }
4840 break;
4841
4842 case OP_NOT_HSPACE:
4843 switch(c)
4844 {
4845 default: break;
4846 case 0x09: /* HT */
4847 case 0x20: /* SPACE */
4848 case 0xa0: /* NBSP */
4849 case 0x1680: /* OGHAM SPACE MARK */
4850 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4851 case 0x2000: /* EN QUAD */
4852 case 0x2001: /* EM QUAD */
4853 case 0x2002: /* EN SPACE */
4854 case 0x2003: /* EM SPACE */
4855 case 0x2004: /* THREE-PER-EM SPACE */
4856 case 0x2005: /* FOUR-PER-EM SPACE */
4857 case 0x2006: /* SIX-PER-EM SPACE */
4858 case 0x2007: /* FIGURE SPACE */
4859 case 0x2008: /* PUNCTUATION SPACE */
4860 case 0x2009: /* THIN SPACE */
4861 case 0x200A: /* HAIR SPACE */
4862 case 0x202f: /* NARROW NO-BREAK SPACE */
4863 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4864 case 0x3000: /* IDEOGRAPHIC SPACE */
4865 MRRETURN(MATCH_NOMATCH);
4866 }
4867 break;
4868
4869 case OP_HSPACE:
4870 switch(c)
4871 {
4872 default: MRRETURN(MATCH_NOMATCH);
4873 case 0x09: /* HT */
4874 case 0x20: /* SPACE */
4875 case 0xa0: /* NBSP */
4876 case 0x1680: /* OGHAM SPACE MARK */
4877 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4878 case 0x2000: /* EN QUAD */
4879 case 0x2001: /* EM QUAD */
4880 case 0x2002: /* EN SPACE */
4881 case 0x2003: /* EM SPACE */
4882 case 0x2004: /* THREE-PER-EM SPACE */
4883 case 0x2005: /* FOUR-PER-EM SPACE */
4884 case 0x2006: /* SIX-PER-EM SPACE */
4885 case 0x2007: /* FIGURE SPACE */
4886 case 0x2008: /* PUNCTUATION SPACE */
4887 case 0x2009: /* THIN SPACE */
4888 case 0x200A: /* HAIR SPACE */
4889 case 0x202f: /* NARROW NO-BREAK SPACE */
4890 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4891 case 0x3000: /* IDEOGRAPHIC SPACE */
4892 break;
4893 }
4894 break;
4895
4896 case OP_NOT_VSPACE:
4897 switch(c)
4898 {
4899 default: break;
4900 case 0x0a: /* LF */
4901 case 0x0b: /* VT */
4902 case 0x0c: /* FF */
4903 case 0x0d: /* CR */
4904 case 0x85: /* NEL */
4905 case 0x2028: /* LINE SEPARATOR */
4906 case 0x2029: /* PARAGRAPH SEPARATOR */
4907 MRRETURN(MATCH_NOMATCH);
4908 }
4909 break;
4910
4911 case OP_VSPACE:
4912 switch(c)
4913 {
4914 default: MRRETURN(MATCH_NOMATCH);
4915 case 0x0a: /* LF */
4916 case 0x0b: /* VT */
4917 case 0x0c: /* FF */
4918 case 0x0d: /* CR */
4919 case 0x85: /* NEL */
4920 case 0x2028: /* LINE SEPARATOR */
4921 case 0x2029: /* PARAGRAPH SEPARATOR */
4922 break;
4923 }
4924 break;
4925
4926 case OP_NOT_DIGIT:
4927 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4928 MRRETURN(MATCH_NOMATCH);
4929 break;
4930
4931 case OP_DIGIT:
4932 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4933 MRRETURN(MATCH_NOMATCH);
4934 break;
4935
4936 case OP_NOT_WHITESPACE:
4937 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4938 MRRETURN(MATCH_NOMATCH);
4939 break;
4940
4941 case OP_WHITESPACE:
4942 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4943 MRRETURN(MATCH_NOMATCH);
4944 break;
4945
4946 case OP_NOT_WORDCHAR:
4947 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4948 MRRETURN(MATCH_NOMATCH);
4949 break;
4950
4951 case OP_WORDCHAR:
4952 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4953 MRRETURN(MATCH_NOMATCH);
4954 break;
4955
4956 default:
4957 RRETURN(PCRE_ERROR_INTERNAL);
4958 }
4959 }
4960 }
4961 else
4962 #endif
4963 /* Not UTF mode */
4964 {
4965 for (fi = min;; fi++)
4966 {
4967 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4969 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4970 if (eptr >= md->end_subject)
4971 {
4972 SCHECK_PARTIAL();
4973 MRRETURN(MATCH_NOMATCH);
4974 }
4975 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4976 MRRETURN(MATCH_NOMATCH);
4977 c = *eptr++;
4978 switch(ctype)
4979 {
4980 case OP_ANY: /* This is the non-NL case */
4981 case OP_ALLANY:
4982 case OP_ANYBYTE:
4983 break;
4984
4985 case OP_ANYNL:
4986 switch(c)
4987 {
4988 default: MRRETURN(MATCH_NOMATCH);
4989 case 0x000d:
4990 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4991 break;
4992
4993 case 0x000a:
4994 break;
4995
4996 case 0x000b:
4997 case 0x000c:
4998 case 0x0085:
4999 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
5000 break;
5001 }
5002 break;
5003
5004 case OP_NOT_HSPACE:
5005 switch(c)
5006 {
5007 default: break;
5008 case 0x09: /* HT */
5009 case 0x20: /* SPACE */
5010 case 0xa0: /* NBSP */
5011 MRRETURN(MATCH_NOMATCH);
5012 }
5013 break;
5014
5015 case OP_HSPACE:
5016 switch(c)
5017 {
5018 default: MRRETURN(MATCH_NOMATCH);
5019 case 0x09: /* HT */
5020 case 0x20: /* SPACE */
5021 case 0xa0: /* NBSP */
5022 break;
5023 }
5024 break;
5025
5026 case OP_NOT_VSPACE:
5027 switch(c)
5028 {
5029 default: break;
5030 case 0x0a: /* LF */
5031 case 0x0b: /* VT */
5032 case 0x0c: /* FF */
5033 case 0x0d: /* CR */
5034 case 0x85: /* NEL */
5035 MRRETURN(MATCH_NOMATCH);
5036 }
5037 break;
5038
5039 case OP_VSPACE:
5040 switch(c)
5041 {
5042 default: MRRETURN(MATCH_NOMATCH);
5043 case 0x0a: /* LF */
5044 case 0x0b: /* VT */
5045 case 0x0c: /* FF */
5046 case 0x0d: /* CR */
5047 case 0x85: /* NEL */
5048 break;
5049 }
5050 break;
5051
5052 case OP_NOT_DIGIT:
5053 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
5054 break;
5055
5056 case OP_DIGIT:
5057 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
5058 break;
5059
5060 case OP_NOT_WHITESPACE:
5061 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
5062 break;
5063
5064 case OP_WHITESPACE:
5065 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
5066 break;
5067
5068 case OP_NOT_WORDCHAR:
5069 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
5070 break;
5071
5072 case OP_WORDCHAR:
5073 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
5074 break;
5075
5076 default:
5077 RRETURN(PCRE_ERROR_INTERNAL);
5078 }
5079 }
5080 }
5081 /* Control never gets here */
5082 }
5083
5084 /* If maximizing, it is worth using inline code for speed, doing the type
5085 test once at the start (i.e. keep it out of the loop). Again, keep the
5086 UTF-8 and UCP stuff separate. */
5087
5088 else
5089 {
5090 pp = eptr; /* Remember where we started */
5091
5092 #ifdef SUPPORT_UCP
5093 if (prop_type >= 0)
5094 {
5095 switch(prop_type)
5096 {
5097 case PT_ANY:
5098 for (i = min; i < max; i++)
5099 {
5100 int len = 1;
5101 if (eptr >= md->end_subject)
5102 {
5103 SCHECK_PARTIAL();
5104 break;
5105 }
5106 GETCHARLENTEST(c, eptr, len);
5107 if (prop_fail_result) break;
5108 eptr+= len;
5109 }
5110 break;
5111
5112 case PT_LAMP:
5113 for (i = min; i < max; i++)
5114 {
5115 int chartype;
5116 int len = 1;
5117 if (eptr >= md->end_subject)
5118 {
5119 SCHECK_PARTIAL();
5120 break;
5121 }
5122 GETCHARLENTEST(c, eptr, len);
5123 chartype = UCD_CHARTYPE(c);
5124 if ((chartype == ucp_Lu ||
5125 chartype == ucp_Ll ||
5126 chartype == ucp_Lt) == prop_fail_result)
5127 break;
5128 eptr+= len;
5129 }
5130 break;
5131
5132 case PT_GC:
5133 for (i = min; i < max; i++)
5134 {
5135 int len = 1;
5136 if (eptr >= md->end_subject)
5137 {
5138 SCHECK_PARTIAL();
5139 break;
5140 }
5141 GETCHARLENTEST(c, eptr, len);
5142 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5143 eptr+= len;
5144 }
5145 break;
5146
5147 case PT_PC:
5148 for (i = min; i < max; i++)
5149 {
5150 int len = 1;
5151 if (eptr >= md->end_subject)
5152 {
5153 SCHECK_PARTIAL();
5154 break;
5155 }
5156 GETCHARLENTEST(c, eptr, len);
5157 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5158 eptr+= len;
5159 }
5160 break;
5161
5162 case PT_SC:
5163 for (i = min; i < max; i++)
5164 {
5165 int len = 1;
5166 if (eptr >= md->end_subject)
5167 {
5168 SCHECK_PARTIAL();
5169 break;
5170 }
5171 GETCHARLENTEST(c, eptr, len);
5172 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5173 eptr+= len;
5174 }
5175 break;
5176
5177 case PT_ALNUM:
5178 for (i = min; i < max; i++)
5179 {
5180 int category;
5181 int len = 1;
5182 if (eptr >= md->end_subject)
5183 {
5184 SCHECK_PARTIAL();
5185 break;
5186 }
5187 GETCHARLENTEST(c, eptr, len);
5188 category = UCD_CATEGORY(c);
5189 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5190 break;
5191 eptr+= len;
5192 }
5193 break;
5194
5195 case PT_SPACE: /* Perl space */
5196 for (i = min; i < max; i++)
5197 {
5198 int len = 1;
5199 if (eptr >= md->end_subject)
5200 {
5201 SCHECK_PARTIAL();
5202 break;
5203 }
5204 GETCHARLENTEST(c, eptr, len);
5205 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5206 c == CHAR_FF || c == CHAR_CR)
5207 == prop_fail_result)
5208 break;
5209 eptr+= len;
5210 }
5211 break;
5212
5213 case PT_PXSPACE: /* POSIX space */
5214 for (i = min; i < max; i++)
5215 {
5216 int len = 1;
5217 if (eptr >= md->end_subject)
5218 {
5219 SCHECK_PARTIAL();
5220 break;
5221 }
5222 GETCHARLENTEST(c, eptr, len);
5223 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5224 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5225 == prop_fail_result)
5226 break;
5227 eptr+= len;
5228 }
5229 break;
5230
5231 case PT_WORD:
5232 for (i = min; i < max; i++)
5233 {
5234 int category;
5235 int len = 1;
5236 if (eptr >= md->end_subject)
5237 {
5238 SCHECK_PARTIAL();
5239 break;
5240 }
5241 GETCHARLENTEST(c, eptr, len);
5242 category = UCD_CATEGORY(c);
5243 if ((category == ucp_L || category == ucp_N ||
5244 c == CHAR_UNDERSCORE) == prop_fail_result)
5245 break;
5246 eptr+= len;
5247 }
5248 break;
5249
5250 default:
5251 RRETURN(PCRE_ERROR_INTERNAL);
5252 }
5253
5254 /* eptr is now past the end of the maximum run */
5255
5256 if (possessive) continue;
5257 for(;;)
5258 {
5259 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5260 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5261 if (eptr-- == pp) break; /* Stop if tried at original pos */
5262 if (utf) BACKCHAR(eptr);
5263 }
5264 }
5265
5266 /* Match extended Unicode sequences. We will get here only if the
5267 support is in the binary; otherwise a compile-time error occurs. */
5268
5269 else if (ctype == OP_EXTUNI)
5270 {
5271 for (i = min; i < max; i++)
5272 {
5273 int len = 1;
5274 if (eptr >= md->end_subject)
5275 {
5276 SCHECK_PARTIAL();
5277 break;
5278 }
5279 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5280 if (UCD_CATEGORY(c) == ucp_M) break;
5281 eptr += len;
5282 while (eptr < md->end_subject)
5283 {
5284 len = 1;
5285 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5286 if (UCD_CATEGORY(c) != ucp_M) break;
5287 eptr += len;
5288 }
5289 }
5290
5291 /* eptr is now past the end of the maximum run */
5292
5293 if (possessive) continue;
5294
5295 for(;;)
5296 {
5297 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5298 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5299 if (eptr-- == pp) break; /* Stop if tried at original pos */
5300 for (;;) /* Move back over one extended */
5301 {
5302 if (!utf) c = *eptr; else
5303 {
5304 BACKCHAR(eptr);
5305 GETCHAR(c, eptr);
5306 }
5307 if (UCD_CATEGORY(c) != ucp_M) break;
5308 eptr--;
5309 }
5310 }
5311 }
5312
5313 else
5314 #endif /* SUPPORT_UCP */
5315
5316 #ifdef SUPPORT_UTF
5317 if (utf)
5318 {
5319 switch(ctype)
5320 {
5321 case OP_ANY:
5322 if (max < INT_MAX)
5323 {
5324 for (i = min; i < max; i++)
5325 {
5326 if (eptr >= md->end_subject)
5327 {
5328 SCHECK_PARTIAL();
5329 break;
5330 }
5331 if (IS_NEWLINE(eptr)) break;
5332 eptr++;
5333 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5334 }
5335 }
5336
5337 /* Handle unlimited UTF-8 repeat */
5338
5339 else
5340 {
5341 for (i = min; i < max; i++)
5342 {
5343 if (eptr >= md->end_subject)
5344 {
5345 SCHECK_PARTIAL();
5346 break;
5347 }
5348 if (IS_NEWLINE(eptr)) break;
5349 eptr++;
5350 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5351 }
5352 }
5353 break;
5354
5355 case OP_ALLANY:
5356 if (max < INT_MAX)
5357 {
5358 for (i = min; i < max; i++)
5359 {
5360 if (eptr >= md->end_subject)
5361 {
5362 SCHECK_PARTIAL();
5363 break;
5364 }
5365 eptr++;
5366 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5367 }
5368 }
5369 else
5370 {
5371 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5372 SCHECK_PARTIAL();
5373 }
5374 break;
5375
5376 /* The byte case is the same as non-UTF8 */
5377
5378 case OP_ANYBYTE:
5379 c = max - min;
5380 if (c > (unsigned int)(md->end_subject - eptr))
5381 {
5382 eptr = md->end_subject;
5383 SCHECK_PARTIAL();
5384 }
5385 else eptr += c;
5386 break;
5387
5388 case OP_ANYNL:
5389 for (i = min; i < max; i++)
5390 {
5391 int len = 1;
5392 if (eptr >= md->end_subject)
5393 {
5394 SCHECK_PARTIAL();
5395 break;
5396 }
5397 GETCHARLEN(c, eptr, len);
5398 if (c == 0x000d)
5399 {
5400 if (++eptr >= md->end_subject) break;
5401 if (*eptr == 0x000a) eptr++;
5402 }
5403 else
5404 {
5405 if (c != 0x000a &&
5406 (md->bsr_anycrlf ||
5407 (c != 0x000b && c != 0x000c &&
5408 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5409 break;
5410 eptr += len;
5411 }
5412 }
5413 break;
5414
5415 case OP_NOT_HSPACE:
5416 case OP_HSPACE:
5417 for (i = min; i < max; i++)
5418 {
5419 BOOL gotspace;
5420 int len = 1;
5421 if (eptr >= md->end_subject)
5422 {
5423 SCHECK_PARTIAL();
5424 break;
5425 }
5426 GETCHARLEN(c, eptr, len);
5427 switch(c)
5428 {
5429 default: gotspace = FALSE; break;
5430 case 0x09: /* HT */
5431 case 0x20: /* SPACE */
5432 case 0xa0: /* NBSP */
5433 case 0x1680: /* OGHAM SPACE MARK */
5434 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5435 case 0x2000: /* EN QUAD */
5436 case 0x2001: /* EM QUAD */
5437 case 0x2002: /* EN SPACE */
5438 case 0x2003: /* EM SPACE */
5439 case 0x2004: /* THREE-PER-EM SPACE */
5440 case 0x2005: /* FOUR-PER-EM SPACE */
5441 case 0x2006: /* SIX-PER-EM SPACE */
5442 case 0x2007: /* FIGURE SPACE */
5443 case 0x2008: /* PUNCTUATION SPACE */
5444 case 0x2009: /* THIN SPACE */
5445 case 0x200A: /* HAIR SPACE */
5446 case 0x202f: /* NARROW NO-BREAK SPACE */
5447 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5448 case 0x3000: /* IDEOGRAPHIC SPACE */
5449 gotspace = TRUE;
5450 break;
5451 }
5452 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5453 eptr += len;
5454 }
5455 break;
5456
5457 case OP_NOT_VSPACE:
5458 case OP_VSPACE:
5459 for (i = min; i < max; i++)
5460 {
5461 BOOL gotspace;
5462 int len = 1;
5463 if (eptr >= md->end_subject)
5464 {
5465 SCHECK_PARTIAL();
5466 break;
5467 }
5468 GETCHARLEN(c, eptr, len);
5469 switch(c)
5470 {
5471 default: gotspace = FALSE; break;
5472 case 0x0a: /* LF */
5473 case 0x0b: /* VT */
5474 case 0x0c: /* FF */
5475 case 0x0d: /* CR */
5476 case 0x85: /* NEL */
5477 case 0x2028: /* LINE SEPARATOR */
5478 case 0x2029: /* PARAGRAPH SEPARATOR */
5479 gotspace = TRUE;
5480 break;
5481 }
5482 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5483 eptr += len;
5484 }
5485 break;
5486
5487 case OP_NOT_DIGIT:
5488 for (i = min; i < max; i++)
5489 {
5490 int len = 1;
5491 if (eptr >= md->end_subject)
5492 {
5493 SCHECK_PARTIAL();
5494 break;
5495 }
5496 GETCHARLEN(c, eptr, len);
5497 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5498 eptr+= len;
5499 }
5500 break;
5501
5502 case OP_DIGIT:
5503 for (i = min; i < max; i++)
5504 {
5505 int len = 1;
5506 if (eptr >= md->end_subject)
5507 {
5508 SCHECK_PARTIAL();
5509 break;
5510 }
5511 GETCHARLEN(c, eptr, len);
5512 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5513 eptr+= len;
5514 }
5515 break;
5516
5517 case OP_NOT_WHITESPACE:
5518 for (i = min; i < max; i++)
5519 {
5520 int len = 1;
5521 if (eptr >= md->end_subject)
5522 {
5523 SCHECK_PARTIAL();
5524 break;
5525 }
5526 GETCHARLEN(c, eptr, len);
5527 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5528 eptr+= len;
5529 }
5530 break;
5531
5532 case OP_WHITESPACE:
5533 for (i = min; i < max; i++)
5534 {
5535 int len = 1;
5536 if (eptr >= md->end_subject)
5537 {
5538 SCHECK_PARTIAL();
5539 break;
5540 }
5541 GETCHARLEN(c, eptr, len);
5542 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5543 eptr+= len;
5544 }
5545 break;
5546
5547 case OP_NOT_WORDCHAR:
5548 for (i = min; i < max; i++)
5549 {
5550 int len = 1;
5551 if (eptr >= md->end_subject)
5552 {
5553 SCHECK_PARTIAL();
5554 break;
5555 }
5556 GETCHARLEN(c, eptr, len);
5557 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5558 eptr+= len;
5559 }
5560 break;
5561
5562 case OP_WORDCHAR:
5563 for (i = min; i < max; i++)
5564 {
5565 int len = 1;
5566 if (eptr >= md->end_subject)
5567 {
5568 SCHECK_PARTIAL();
5569 break;
5570 }
5571 GETCHARLEN(c, eptr, len);
5572 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5573 eptr+= len;
5574 }
5575 break;
5576
5577 default:
5578 RRETURN(PCRE_ERROR_INTERNAL);
5579 }
5580
5581 /* eptr is now past the end of the maximum run. If possessive, we are
5582 done (no backing up). Otherwise, match at this position; anything other
5583 than no match is immediately returned. For nomatch, back up one
5584 character, unless we are matching \R and the last thing matched was
5585 \r\n, in which case, back up two bytes. */
5586
5587 if (possessive) continue;
5588 for(;;)
5589 {
5590 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5592 if (eptr-- == pp) break; /* Stop if tried at original pos */
5593 BACKCHAR(eptr);
5594 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5595 eptr[-1] == '\r') eptr--;
5596 }
5597 }
5598 else
5599 #endif /* SUPPORT_UTF8 */
5600 /* Not UTF mode */
5601 {
5602 switch(ctype)
5603 {
5604 case OP_ANY:
5605 for (i = min; i < max; i++)
5606 {
5607 if (eptr >= md->end_subject)
5608 {
5609 SCHECK_PARTIAL();
5610 break;
5611 }
5612 if (IS_NEWLINE(eptr)) break;
5613 eptr++;
5614 }
5615 break;
5616
5617 case OP_ALLANY:
5618 case OP_ANYBYTE:
5619 c = max - min;
5620 if (c > (unsigned int)(md->end_subject - eptr))
5621 {
5622 eptr = md->end_subject;
5623 SCHECK_PARTIAL();
5624 }
5625 else eptr += c;
5626 break;
5627
5628 case OP_ANYNL:
5629 for (i = min; i < max; i++)
5630 {
5631 if (eptr >= md->end_subject)
5632 {
5633 SCHECK_PARTIAL();
5634 break;
5635 }
5636 c = *eptr;
5637 if (c == 0x000d)
5638 {
5639 if (++eptr >= md->end_subject) break;
5640 if (*eptr == 0x000a) eptr++;
5641 }
5642 else
5643 {
5644 if (c != 0x000a &&
5645 (md->bsr_anycrlf ||
5646 (c != 0x000b && c != 0x000c && c != 0x0085)))
5647 break;
5648 eptr++;
5649 }
5650 }
5651 break;
5652
5653 case OP_NOT_HSPACE:
5654 for (i = min; i < max; i++)
5655 {
5656 if (eptr >= md->end_subject)
5657 {
5658 SCHECK_PARTIAL();
5659 break;
5660 }
5661 c = *eptr;
5662 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5663 eptr++;
5664 }
5665 break;
5666
5667 case OP_HSPACE:
5668 for (i = min; i < max; i++)
5669 {
5670 if (eptr >= md->end_subject)
5671 {
5672 SCHECK_PARTIAL();
5673 break;
5674 }
5675 c = *eptr;
5676 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5677 eptr++;
5678 }
5679 break;
5680
5681 case OP_NOT_VSPACE:
5682 for (i = min; i < max; i++)
5683 {
5684 if (eptr >= md->end_subject)
5685 {
5686 SCHECK_PARTIAL();
5687 break;
5688 }
5689 c = *eptr;
5690 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5691 break;
5692 eptr++;
5693 }
5694 break;
5695
5696 case OP_VSPACE:
5697 for (i = min; i < max; i++)
5698 {
5699 if (eptr >= md->end_subject)
5700 {
5701 SCHECK_PARTIAL();
5702 break;
5703 }
5704 c = *eptr;
5705 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5706 break;
5707 eptr++;
5708 }
5709 break;
5710
5711 case OP_NOT_DIGIT:
5712 for (i = min; i < max; i++)
5713 {
5714 if (eptr >= md->end_subject)
5715 {
5716 SCHECK_PARTIAL();
5717 break;
5718 }
5719 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5720 eptr++;
5721 }
5722 break;
5723
5724 case OP_DIGIT:
5725 for (i = min; i < max; i++)
5726 {
5727 if (eptr >= md->end_subject)
5728 {
5729 SCHECK_PARTIAL();
5730 break;
5731 }
5732 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5733 eptr++;
5734 }
5735 break;
5736
5737 case OP_NOT_WHITESPACE:
5738 for (i = min; i < max; i++)
5739 {
5740 if (eptr >= md->end_subject)
5741 {
5742 SCHECK_PARTIAL();
5743 break;
5744 }
5745 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5746 eptr++;
5747 }
5748 break;
5749
5750 case OP_WHITESPACE:
5751 for (i = min; i < max; i++)
5752 {
5753 if (eptr >= md->end_subject)
5754 {
5755 SCHECK_PARTIAL();
5756 break;
5757 }
5758 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5759 eptr++;
5760 }
5761 break;
5762
5763 case OP_NOT_WORDCHAR:
5764 for (i = min; i < max; i++)
5765 {
5766 if (eptr >= md->end_subject)
5767 {
5768 SCHECK_PARTIAL();
5769 break;
5770 }
5771 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5772 eptr++;
5773 }
5774 break;
5775
5776 case OP_WORDCHAR:
5777 for (i = min; i < max; i++)
5778 {
5779 if (eptr >= md->end_subject)
5780 {
5781 SCHECK_PARTIAL();
5782 break;
5783 }
5784 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5785 eptr++;
5786 }
5787 break;
5788
5789 default:
5790 RRETURN(PCRE_ERROR_INTERNAL);
5791 }
5792
5793 /* eptr is now past the end of the maximum run. If possessive, we are
5794 done (no backing up). Otherwise, match at this position; anything other
5795 than no match is immediately returned. For nomatch, back up one
5796 character (byte), unless we are matching \R and the last thing matched
5797 was \r\n, in which case, back up two bytes. */
5798
5799 if (possessive) continue;
5800 while (eptr >= pp)
5801 {
5802 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5804 eptr--;
5805 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5806 eptr[-1] == '\r') eptr--;
5807 }
5808 }
5809
5810 /* Get here if we can't make it match with any permitted repetitions */
5811
5812 MRRETURN(MATCH_NOMATCH);
5813 }
5814 /* Control never gets here */
5815
5816 /* There's been some horrible disaster. Arrival here can only mean there is
5817 something seriously wrong in the code above or the OP_xxx definitions. */
5818
5819 default:
5820 DPRINTF(("Unknown opcode %d\n", *ecode));
5821 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5822 }
5823
5824 /* Do not stick any code in here without much thought; it is assumed
5825 that "continue" in the code above comes out to here to repeat the main
5826 loop. */
5827
5828 } /* End of main loop */
5829 /* Control never reaches here */
5830
5831
5832 /* When compiling to use the heap rather than the stack for recursive calls to
5833 match(), the RRETURN() macro jumps here. The number that is saved in
5834 frame->Xwhere indicates which label we actually want to return to. */
5835
5836 #ifdef NO_RECURSE
5837 #define LBL(val) case val: goto L_RM##val;
5838 HEAP_RETURN:
5839 switch (frame->Xwhere)
5840 {
5841 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5842 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5843 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5844 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5845 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5846 LBL(65) LBL(66)
5847 #ifdef SUPPORT_UTF8
5848 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5849 LBL(32) LBL(34) LBL(42) LBL(46)
5850 #ifdef SUPPORT_UCP
5851 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5852 LBL(59) LBL(60) LBL(61) LBL(62)
5853 #endif /* SUPPORT_UCP */
5854 #endif /* SUPPORT_UTF8 */
5855 default:
5856 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5857 return PCRE_ERROR_INTERNAL;
5858 }
5859 #undef LBL
5860 #endif /* NO_RECURSE */
5861 }
5862
5863
5864 /***************************************************************************
5865 ****************************************************************************
5866 RECURSION IN THE match() FUNCTION
5867
5868 Undefine all the macros that were defined above to handle this. */
5869
/* The names in the first list are #undef'd only when NO_RECURSE is defined.
They are listed alphabetically so that a particular name is easy to find. */

#ifdef NO_RECURSE
#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave
#endif  /* NO_RECURSE */

/* fc and fi are macros whether or not NO_RECURSE is defined, so they are
removed unconditionally. */

#undef fc
#undef fi
5913
5914 /***************************************************************************
5915 ***************************************************************************/
5916
5917
5918
5919 /*************************************************
5920 * Execute a Regular Expression *
5921 *************************************************/
5922
5923 /* This function applies a compiled re to a subject string and picks out
5924 portions of the string if it matches. Two elements in the vector are set for
5925 each substring: the offsets to the start and end of the substring.
5926
5927 Arguments:
5928 argument_re points to the compiled expression
5929 extra_data points to extra data or is NULL
5930 subject points to the subject string
5931 length length of subject string (may contain binary zeros)
5932 start_offset where to start in the subject string
5933 options option bits
5934 offsets points to a vector of ints to be filled in with offsets
5935 offsetcount the number of elements in the vector
5936
5937 Returns: > 0 => success; value is the number of elements filled in
5938 = 0 => success, but offsets is not big enough
5939 -1 => failed to match
5940 < -1 => some kind of unexpected problem
5941 */
5942
5943 #ifdef COMPILE_PCRE8
5944 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5945 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5946 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5947 int offsetcount)
5948 #else
5949 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5950 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
5951 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
5952 int offsetcount)
5953 #endif
5954 {
5955 int rc, ocount, arg_offset_max;
5956 int newline;
5957 BOOL using_temporary_offsets = FALSE;
5958 BOOL anchored;
5959 BOOL startline;
5960 BOOL firstline;
5961 BOOL utf;
5962 BOOL has_first_char = FALSE;
5963 BOOL has_req_char = FALSE;
5964 pcre_uchar first_char = 0;
5965 pcre_uchar first_char2 = 0;
5966 pcre_uchar req_char = 0;
5967 pcre_uchar req_char2 = 0;
5968 match_data match_block;
5969 match_data *md = &match_block;
5970 const pcre_uint8 *tables;
5971 const pcre_uint8 *start_bits = NULL;
5972 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
5973 PCRE_PUCHAR end_subject;
5974 PCRE_PUCHAR start_partial = NULL;
5975 PCRE_PUCHAR req_char_ptr = start_match - 1;
5976
5977 pcre_study_data internal_study;
5978 const pcre_study_data *study;
5979
5980 real_pcre internal_re;
5981 const real_pcre *external_re = (const real_pcre *)argument_re;
5982 const real_pcre *re = external_re;
5983
5984 /* Plausibility checks */
5985
5986 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5987 if (re == NULL || subject == NULL ||
5988 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5989 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5990 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5991
5992 /* These two settings are used in the code for checking a UTF-8 string that
5993 follows immediately afterwards. Other values in the md block are used only
5994 during "normal" pcre_exec() processing, not when the JIT support is in use,
5995 so they are set up later. */
5996
5997 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
5998 utf = md->utf = (re->options & PCRE_UTF8) != 0;
5999 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6000 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6001
6002 /* Check a UTF-8 string if required. Pass back the character offset and error
6003 code for an invalid string if a results vector is available. */
6004
6005 #ifdef SUPPORT_UTF8
6006 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6007 {
6008 int erroroffset;
6009 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6010 if (errorcode != 0)
6011 {
6012 if (offsetcount >= 2)
6013 {
6014 offsets[0] = erroroffset;
6015 offsets[1] = errorcode;
6016 }
6017 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6018 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6019 }
6020
6021 /* Check that a start_offset points to the start of a UTF character. */
6022 #ifdef COMPILE_PCRE8
6023 if (start_offset > 0 && start_offset < length &&
6024 (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
6025 return PCRE_ERROR_BADUTF8_OFFSET;
6026 #else
6027 #ifdef COMPILE_PCRE16
6028 if (start_offset > 0 && start_offset < length &&
6029 (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00)
6030 return PCRE_ERROR_BADUTF8_OFFSET;
6031 #endif /* COMPILE_PCRE16 */
6032 #endif /* COMPILE_PCRE8 */
6033 }
6034 #endif
6035
6036 /* If the pattern was successfully studied with JIT support, run the JIT
6037 executable instead of the rest of this function. Most options must be set at
6038 compile time for the JIT code to be usable. Fallback to the normal code path if
6039 an unsupported flag is set. In particular, JIT does not support partial
6040 matching. */
6041
6042 #ifdef SUPPORT_JIT
6043 if (extra_data != NULL
6044 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6045 && extra_data->executable_jit != NULL
6046 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6047 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6048 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6049 return PRIV(jit_exec)(re, extra_data->executable_jit,
6050 (const pcre_uchar *)subject, length, start_offset, options,
6051 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6052 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6053 #endif
6054
6055 /* Carry on with non-JIT matching. This information is for finding all the
6056 numbers associated with a given name, for condition testing. */
6057
6058 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6059 md->name_count = re->name_count;
6060 md->name_entry_size = re->name_entry_size;
6061
6062 /* Fish out the optional data from the extra_data structure, first setting
6063 the default values. */
6064
6065 study = NULL;
6066 md->match_limit = MATCH_LIMIT;
6067 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6068 md->callout_data = NULL;
6069
6070 /* The table pointer is always in native byte order. */
6071
6072 tables = external_re->tables;
6073
6074 if (extra_data != NULL)
6075 {
6076 register unsigned int flags = extra_data->flags;
6077 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6078 study = (const pcre_study_data *)extra_data->study_data;
6079 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6080 md->match_limit = extra_data->match_limit;
6081 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6082 md->match_limit_recursion = extra_data->match_limit_recursion;
6083 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6084 md->callout_data = extra_data->callout_data;
6085 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6086 }
6087
6088 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6089 is a feature that makes it possible to save compiled regex and re-use them
6090 in other programs later. */
6091
6092 if (tables == NULL) tables = PRIV(default_tables);
6093
6094 /* Check that the first field in the block is the magic number. If it is not,
6095 test for a regex that was compiled on a host of opposite endianness. If this is
6096 the case, flipped values are put in internal_re and internal_study if there was
6097 study data too. */
6098
6099 if (re->magic_number != MAGIC_NUMBER)
6100 {
6101 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
6102 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6103 if (study != NULL) study = &internal_study;
6104 }
6105
6106 /* Set up other data */
6107
6108 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6109 startline = (re->flags & PCRE_STARTLINE) != 0;
6110 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6111
6112 /* The code starts after the real_pcre block and the capture name table. */
6113
6114 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6115 re->name_count * re->name_entry_size;
6116
6117 md->start_subject = (PCRE_PUCHAR)subject;
6118 md->start_offset = start_offset;
6119 md->end_subject = md->start_subject + length;
6120 end_subject = md->end_subject;
6121
6122 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6123 md->use_ucp = (re->options & PCRE_UCP) != 0;
6124 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6125
6126 /* Some options are unpacked into BOOL variables in the hope that testing
6127 them will be faster than individual option bits. */
6128
6129 md->notbol = (options & PCRE_NOTBOL) != 0;
6130 md->noteol = (options & PCRE_NOTEOL) != 0;
6131 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6132 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6133
6134 md->hitend = FALSE;
6135 md->mark = NULL; /* In case never set */
6136
6137 md->recursive = NULL; /* No recursion at top level */
6138 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6139
6140 md->lcc = tables + lcc_offset;
6141 md->ctypes = tables + ctypes_offset;
6142
6143 /* Handle different \R options. */
6144
6145 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6146 {
6147 case 0:
6148 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6149 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6150 else
6151 #ifdef BSR_ANYCRLF
6152 md->bsr_anycrlf = TRUE;
6153 #else
6154 md->bsr_anycrlf = FALSE;
6155 #endif
6156 break;
6157
6158 case PCRE_BSR_ANYCRLF:
6159 md->bsr_anycrlf = TRUE;
6160 break;
6161
6162 case PCRE_BSR_UNICODE:
6163 md->bsr_anycrlf = FALSE;
6164 break;
6165
6166 default: return PCRE_ERROR_BADNEWLINE;
6167 }
6168
6169 /* Handle different types of newline. The three bits give eight cases. If
6170 nothing is set at run time, whatever was used at compile time applies. */
6171
6172 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6173 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6174 {
6175 case 0: newline = NEWLINE; break; /* Compile-time default */
6176 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6177 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6178 case PCRE_NEWLINE_CR+
6179 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6180 case PCRE_NEWLINE_ANY: newline = -1; break;
6181 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6182 default: return PCRE_ERROR_BADNEWLINE;
6183 }
6184
6185 if (newline == -2)
6186 {
6187 md->nltype = NLTYPE_ANYCRLF;
6188 }
6189 else if (newline < 0)
6190 {
6191 md->nltype = NLTYPE_ANY;
6192 }
6193 else
6194 {
6195 md->nltype = NLTYPE_FIXED;
6196 if (newline > 255)
6197 {
6198 md->nllen = 2;
6199 md->nl[0] = (newline >> 8) & 255;
6200 md->nl[1] = newline & 255;
6201 }
6202 else
6203 {
6204 md->nllen = 1;
6205 md->nl[0] = newline;
6206 }
6207 }
6208
6209 /* Partial matching was originally supported only for a restricted set of
6210 regexes; from release 8.00 there are no restrictions, but the bits are still
6211 defined (though never set). So there's no harm in leaving this code. */
6212
6213 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6214 return PCRE_ERROR_BADPARTIAL;
6215
6216 /* If the expression has got more back references than the offsets supplied can
6217 hold, we get a temporary chunk of working store to use during the matching.
6218 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6219 of 3. */
6220
6221 ocount = offsetcount - (offsetcount % 3);
6222 arg_offset_max = (2*ocount)/3;
6223
6224 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6225 {
6226 ocount = re->top_backref * 3 + 3;
6227 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6228 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6229 using_temporary_offsets = TRUE;
6230 DPRINTF(("Got memory to hold back references\n"));
6231 }
6232 else md->offset_vector = offsets;
6233
6234 md->offset_end = ocount;
6235 md->offset_max = (2*ocount)/3;
6236 md->offset_overflow = FALSE;
6237 md->capture_last = -1;
6238
6239 /* Reset the working variable associated with each extraction. These should
6240 never be used unless previously set, but they get saved and restored, and so we
6241 initialize them to avoid reading uninitialized locations. Also, unset the
6242 offsets for the matched string. This is really just for tidiness with callouts,
6243 in case they inspect these fields. */
6244
6245 if (md->offset_vector != NULL)
6246 {
6247 register int *iptr = md->offset_vector + ocount;
6248 register int *iend = iptr - re->top_bracket;
6249 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6250 while (--iptr >= iend) *iptr = -1;
6251 md->offset_vector[0] = md->offset_vector[1] = -1;
6252 }
6253
6254 /* Set up the first character to match, if available. The first_char value is
6255 never set for an anchored regular expression, but the anchoring may be forced
6256 at run time, so we have to test for anchoring. The first char may be unset for
6257 an unanchored pattern, of course. If there's no first char and the pattern was
6258 studied, there may be a bitmap of possible first characters. */
6259
6260 if (!anchored)
6261 {
6262 if ((re->flags & PCRE_FIRSTSET) != 0)
6263 {
6264 has_first_char = TRUE;
6265 first_char = first_char2 = re->first_char;
6266 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6267 {
6268 first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
6269 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6270 if (utf && first_char > 127)
6271 first_char2 = UCD_OTHERCASE(first_char);
6272 #endif
6273 }
6274 }
6275 else
6276 if (!startline && study != NULL &&
6277 (study->flags & PCRE_STUDY_MAPPED) != 0)
6278 start_bits = study->start_bits;
6279 }
6280
6281 /* For anchored or unanchored matches, there may be a "last known required
6282 character" set. */
6283
6284 if ((re->flags & PCRE_REQCHSET) != 0)
6285 {
6286 has_req_char = TRUE;
6287 req_char = req_char2 = re->req_char;
6288 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6289 {
6290 req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
6291 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6292 if (utf && req_char > 127)
6293 req_char2 = UCD_OTHERCASE(req_char);
6294 #endif
6295 }
6296 }
6297
6298
6299 /* ==========================================================================*/
6300
6301 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6302 the loop runs just once. */
6303
6304 for(;;)
6305 {
6306 PCRE_PUCHAR save_end_subject = end_subject;
6307 PCRE_PUCHAR new_start_match;
6308
6309 /* If firstline is TRUE, the start of the match is constrained to the first
6310 line of a multiline string. That is, the match must be before or at the first
6311 newline. Implement this by temporarily adjusting end_subject so that we stop
6312 scanning at a newline. If the match fails at the newline, later code breaks
6313 this loop. */
6314
6315 if (firstline)
6316 {
6317 PCRE_PUCHAR t = start_match;
6318 #ifdef SUPPORT_UTF
6319 if (utf)
6320 {
6321 while (t < md->end_subject && !IS_NEWLINE(t))
6322 {
6323 t++;
6324 ACROSSCHAR(t < end_subject, *t, t++);
6325 }
6326 }
6327 else
6328 #endif
6329 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6330 end_subject = t;
6331 }
6332
6333 /* There are some optimizations that avoid running the match if a known
6334 starting point is not found, or if a known later character is not present.
6335 However, there is an option that disables these, for testing and for ensuring
6336 that all callouts do actually occur. The option can be set in the regex by
6337 (*NO_START_OPT) or passed in match-time options. */
6338
6339 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6340 {
6341 /* Advance to a unique first char if there is one. */
6342
6343 if (has_first_char)
6344 {
6345 if (first_char != first_char2)
6346 while (start_match < end_subject &&
6347 *start_match != first_char && *start_match != first_char2)
6348 start_match++;
6349 else
6350 while (start_match < end_subject && *start_match != first_char)
6351 start_match++;
6352 }
6353
6354 /* Or to just after a linebreak for a multiline match */
6355
6356 else if (startline)
6357 {
6358 if (start_match > md->start_subject + start_offset)
6359 {
6360 #ifdef SUPPORT_UTF
6361 if (utf)
6362 {
6363 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6364 {
6365 start_match++;
6366 ACROSSCHAR(start_match < end_subject, *start_match,
6367 start_match++);
6368 }
6369 }
6370 else
6371 #endif
6372 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6373 start_match++;
6374
6375 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6376 and we are now at a LF, advance the match position by one more character.
6377 */
6378
6379 if (start_match[-1] == CHAR_CR &&
6380 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6381 start_match < end_subject &&
6382 *start_match == CHAR_NL)
6383 start_match++;
6384 }
6385 }
6386
6387 /* Or to a non-unique first byte after study */
6388
6389 else if (start_bits != NULL)
6390 {
6391 while (start_match < end_subject)
6392 {
6393 register unsigned int c = *start_match;
6394 #ifndef COMPILE_PCRE8
6395 if (c > 255) c = 255;
6396 #endif
6397 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6398 {
6399 start_match++;
6400 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6401 /* In non 8-bit mode, the iteration will stop for
6402 characters > 255 at the beginning or not stop at all. */
6403 if (utf)
6404 ACROSSCHAR(start_match < end_subject, *start_match,
6405 start_match++);
6406 #endif
6407 }
6408 else break;
6409 }
6410 }
6411 } /* Starting optimizations */
6412
6413 /* Restore fudged end_subject */
6414
6415 end_subject = save_end_subject;
6416
6417 /* The following two optimizations are disabled for partial matching or if
6418 disabling is explicitly requested. */
6419
6420 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6421 {
6422 /* If the pattern was studied, a minimum subject length may be set. This is
6423 a lower bound; no actual string of that length may actually match the
6424 pattern. Although the value is, strictly, in characters, we treat it as
6425 bytes to avoid spending too much time in this optimization. */
6426
6427 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6428 (pcre_uint32)(end_subject - start_match) < study->minlength)
6429 {
6430 rc = MATCH_NOMATCH;
6431 break;
6432 }
6433
6434 /* If req_char is set, we know that that character must appear in the
6435 subject for the match to succeed. If the first character is set, req_char
6436 must be later in the subject; otherwise the test starts at the match point.
6437 This optimization can save a huge amount of backtracking in patterns with
6438 nested unlimited repeats that aren't going to match. Writing separate code
6439 for cased/caseless versions makes it go faster, as does using an
6440 autoincrement and backing off on a match.
6441
6442 HOWEVER: when the subject string is very, very long, searching to its end
6443 can take a long time, and give bad performance on quite ordinary patterns.
6444 This showed up when somebody was matching something like /^\d+C/ on a
6445 32-megabyte string... so we don't do this when the string is sufficiently
6446 long. */
6447
6448 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6449 {
6450 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6451
6452 /* We don't need to repeat the search if we haven't yet reached the
6453 place we found it at last time. */
6454
6455 if (p > req_char_ptr)
6456 {
6457 if (req_char != req_char2)
6458 {
6459 while (p < end_subject)
6460 {
6461 register int pp = *p++;
6462 if (pp == req_char || pp == req_char2) { p--; break; }
6463 }
6464 }
6465 else
6466 {
6467 while (p < end_subject)
6468 {
6469 if (*p++ == req_char) { p--; break; }
6470 }
6471 }
6472
6473 /* If we can't find the required character, break the matching loop,
6474 forcing a match failure. */
6475
6476 if (p >= end_subject)
6477 {
6478 rc = MATCH_NOMATCH;
6479 break;
6480 }
6481
6482 /* If we have found the required character, save the point where we
6483 found it, so that we don't search again next time round the loop if
6484 the start hasn't passed this character yet. */
6485
6486 req_char_ptr = p;
6487 }
6488 }
6489 }
6490
6491 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6492 printf(">>>> Match against: ");
6493 pchars(start_match, end_subject - start_match, TRUE, md);
6494 printf("\n");
6495 #endif
6496
6497 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6498 first starting point for which a partial match was found. */
6499
6500 md->start_match_ptr = start_match;
6501 md->start_used_ptr = start_match;
6502 md->match_call_count = 0;
6503 md->match_function_type = 0;
6504 md->end_offset_top = 0;
6505 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6506 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6507
6508 switch(rc)
6509 {
6510 /* SKIP passes back the next starting point explicitly, but if it is the
6511 same as the match we have just done, treat it as NOMATCH. */
6512
6513 case MATCH_SKIP:
6514 if (md->start_match_ptr != start_match)
6515 {
6516 new_start_match = md->start_match_ptr;
6517 break;
6518 }
6519 /* Fall through */
6520
6521 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6522 the SKIP's arg was not found. We also treat this as NOMATCH. */
6523
6524 case MATCH_SKIP_ARG:
6525 /* Fall through */
6526
6527 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6528 exactly like PRUNE. */
6529
6530 case MATCH_NOMATCH:
6531 case MATCH_PRUNE:
6532 case MATCH_THEN:
6533 new_start_match = start_match + 1;
6534 #ifdef SUPPORT_UTF
6535 if (utf)
6536 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6537 new_start_match++);
6538 #endif
6539 break;
6540
6541 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6542
6543 case MATCH_COMMIT:
6544 rc = MATCH_NOMATCH;
6545 goto ENDLOOP;
6546
6547 /* Any other return is either a match, or some kind of error. */
6548
6549 default:
6550 goto ENDLOOP;
6551 }
6552
6553 /* Control reaches here for the various types of "no match at this point"
6554 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6555
6556 rc = MATCH_NOMATCH;
6557
6558 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6559 newline in the subject (though it may continue over the newline). Therefore,
6560 if we have just failed to match, starting at a newline, do not continue. */
6561
6562 if (firstline && IS_NEWLINE(start_match)) break;
6563
6564 /* Advance to new matching position */
6565
6566 start_match = new_start_match;
6567
6568 /* Break the loop if the pattern is anchored or if we have passed the end of
6569 the subject. */
6570
6571 if (anchored || start_match > end_subject) break;
6572
6573 /* If we have just passed a CR and we are now at a LF, and the pattern does
6574 not contain any explicit matches for \r or \n, and the newline option is CRLF
6575 or ANY or ANYCRLF, advance the match position by one more character. */
6576
6577 if (start_match[-1] == CHAR_CR &&
6578 start_match < end_subject &&
6579 *start_match == CHAR_NL &&
6580 (re->flags & PCRE_HASCRORLF) == 0 &&
6581 (md->nltype == NLTYPE_ANY ||
6582 md->nltype == NLTYPE_ANYCRLF ||
6583 md->nllen == 2))
6584 start_match++;
6585
6586 md->mark = NULL; /* Reset for start of next match attempt */
6587 } /* End of for(;;) "bumpalong" loop */
6588
6589 /* ==========================================================================*/
6590
6591 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6592 conditions is true:
6593
6594 (1) The pattern is anchored or the match was failed by (*COMMIT);
6595
6596 (2) We are past the end of the subject;
6597
6598 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6599 this option requests that a match occur at or before the first newline in
6600 the subject.
6601
6602 When we have a match and the offset vector is big enough to deal with any
6603 backreferences, captured substring offsets will already be set up. In the case
6604 where we had to get some local store to hold offsets for backreference
6605 processing, copy those that we can. In this case there need not be overflow if
6606 certain parts of the pattern were not used, even though there are more
6607 capturing parentheses than vector slots. */
6608
6609 ENDLOOP:
6610
6611 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6612 {
6613 if (using_temporary_offsets)
6614 {
6615 if (arg_offset_max >= 4)
6616 {
6617 memcpy(offsets + 2, md->offset_vector + 2,
6618 (arg_offset_max - 2) * sizeof(int));
6619 DPRINTF(("Copied offsets from temporary memory\n"));
6620 }
6621 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6622 DPRINTF(("Freeing temporary memory\n"));
6623 (pcre_free)(md->offset_vector);
6624 }
6625
6626 /* Set the return code to the number of captured strings, or 0 if there were
6627 too many to fit into the vector. */
6628
6629 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6630 0 : md->end_offset_top/2;
6631
6632 /* If there is space in the offset vector, set any unused pairs at the end of
6633 the pattern to -1 for backwards compatibility. It is documented that this
6634 happens. In earlier versions, the whole set of potential capturing offsets
6635 was set to -1 each time round the loop, but this is handled differently now.
6636 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6637 those at the end that need unsetting here. We can't just unset them all at
6638 the start of the whole thing because they may get set in one branch that is
6639 not the final matching branch. */
6640
6641 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6642 {
6643 register int *iptr, *iend;
6644 int resetcount = 2 + re->top_bracket * 2;
6645 if (resetcount > offsetcount) resetcount = ocount;
6646 iptr = offsets + md->end_offset_top;
6647 iend = offsets + resetcount;
6648 while (iptr < iend) *iptr++ = -1;
6649 }
6650
6651 /* If there is space, set up the whole thing as substring 0. The value of
6652 md->start_match_ptr might be modified if \K was encountered on the success
6653 matching path. */
6654
6655 if (offsetcount < 2) rc = 0; else
6656 {
6657 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6658 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6659 }
6660
6661 DPRINTF((">>>> returning %d\n", rc));
6662 goto RETURN_MARK;
6663 }
6664
6665 /* Control gets here if there has been an error, or if the overall match
6666 attempt has failed at all permitted starting positions. */
6667
6668 if (using_temporary_offsets)
6669 {
6670 DPRINTF(("Freeing temporary memory\n"));
6671 (pcre_free)(md->offset_vector);
6672 }
6673
6674 /* For anything other than nomatch or partial match, just return the code. */
6675
6676 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6677 {
6678 DPRINTF((">>>> error: returning %d\n", rc));
6679 return rc;
6680 }
6681
6682 /* Handle partial matches - disable any mark data */
6683
6684 if (start_partial != NULL)
6685 {
6686 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6687 md->mark = NULL;
6688 if (offsetcount > 1)
6689 {
6690 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
6691 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
6692 }
6693 rc = PCRE_ERROR_PARTIAL;
6694 }
6695
6696 /* This is the classic nomatch case */
6697
6698 else
6699 {
6700 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6701 rc = PCRE_ERROR_NOMATCH;
6702 }
6703
6704 /* Return the MARK data if it has been requested. */
6705
6706 RETURN_MARK:
6707
6708 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6709 *(extra_data->mark) = (unsigned char *)(md->mark);
6710 return rc;
6711 }
6712
6713 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5