/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 781 - (show annotations)
Sat Dec 3 07:58:30 2011 UTC (8 years, 7 months ago) by zherczeg
File MIME type: text/plain
File size: 202755 byte(s)
renaming utf8 to utf, JIT compiler update, disallowing invalid utf chars
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if the requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 PCRE_PUCHAR eptr_start = eptr;
159 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 PCRE_PUCHAR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
274 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 RM61, RM62, RM63, RM64, RM65, RM66 };
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes.

RMATCH simulates a recursive call of match(): it obtains a new heapframe from
pcre_stack_malloc, stores the resume number "rw" in the Xwhere field of the
current frame, copies the call arguments (ra, rb, mstart, markptr, rc, re and
the depth plus one) into the new frame, links the new frame to the current one
through Xprevframe, makes it the current frame, and jumps to HEAP_RECURSE. If
the allocation fails, RRETURN(PCRE_ERROR_NOMEMORY) is executed instead.

Each use of the macro also defines the label L_<rw>, so every use needs its own
rw value. Presumably the code at HEAP_RETURN (not visible at this point) jumps
to that label using the saved Xwhere value - confirm there. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
/* Return from a simulated recursion: release the current frame and make its
predecessor current. If there is a predecessor, the result is handed over in
rrc and control jumps to HEAP_RETURN; if the released frame was the top-level
one (Xprevframe is NULL), match() really returns.

match() also uses RRETURN when the allocation of the top-level frame itself has
failed, at which point "frame" is NULL. There is then nothing to unlink or
free, and dereferencing the pointer must be avoided, so just return. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe == NULL) return ra;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
348
349
350 /* Structure for remembering the local variables in a private frame. When
NO_RECURSE is defined, RMATCH obtains one of these from pcre_stack_malloc for
every simulated call of match(), and RRETURN releases it again. Inside match()
each member is reached through a macro of the same name without the leading X
(for example, eptr expands to frame->Xeptr). */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe; /* Frame of the caller; NULL marks the top level */
354
355 /* Function arguments that may change. RMATCH fills these in when it creates a
new frame; Xrdepth is set to the calling frame's depth plus one. */
356
357 PCRE_PUCHAR Xeptr;
358 const pcre_uchar *Xecode;
359 PCRE_PUCHAR Xmstart;
360 PCRE_PUCHAR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables. These mirror the variables that match() declares
on the stack when NO_RECURSE is not defined. */
366
367 PCRE_PUCHAR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 PCRE_PUCHAR Xcharptr;
370 #endif
371 PCRE_PUCHAR Xdata;
372 PCRE_PUCHAR Xnext;
373 PCRE_PUCHAR Xpp;
374 PCRE_PUCHAR Xprev;
375 PCRE_PUCHAR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word; /* Doubles as allow_zero in match() */
380 BOOL Xcondition; /* Doubles as cbegroup and condassert in match() */
381 BOOL Xprev_is_word; /* Doubles as matched_once in match() */
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 pcre_uchar Xocchars[6];
389 #endif
390
391 int Xcodelink; /* Doubles as code_offset in match() */
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb; /* Linked into the eptrb chain for MATCH_CBEGROUP calls */
406
407 /* Where to jump back to: RMATCH stores its "rw" argument (one of the RMn
numbers) here, in the calling frame, before switching to the new frame. */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
478 PCRE_PUCHAR mstart, const pcre_uchar *markptr, int offset_top,
479 match_data *md, eptrblock *eptrb, unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf; /* Local copy of UTF flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to in order to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const pcre_uchar *charptr;
590 #endif
591 const pcre_uchar *callpat;
592 const pcre_uchar *data;
593 const pcre_uchar *next;
594 PCRE_PUCHAR pp;
595 const pcre_uchar *prev;
596 PCRE_PUCHAR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 pcre_uchar occhars[6];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf = md->utf; /* Local copy of the flag */
664 #else
665 utf = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 STRCMP_UC_UC(markptr, md->start_match_ptr) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779 the branch in which it occurs can be determined. Overload the start of
780 match pointer to do this. */
781
782 case OP_THEN:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM54);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 md->start_match_ptr = ecode;
787 MRRETURN(MATCH_THEN);
788
789 case OP_THEN_ARG:
790 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
791 md, eptrb, RM58);
792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 md->start_match_ptr = ecode;
794 md->mark = ecode + 2;
795 RRETURN(MATCH_THEN);
796
797 /* Handle an atomic group that does not contain any capturing parentheses.
798 This can be handled like an assertion. Prior to 8.13, all atomic groups
799 were handled this way. In 8.13, the code was changed as below for ONCE, so
800 that backups pass through the group and thereby reset captured values.
801 However, this uses a lot more stack, so in 8.20, atomic groups that do not
802 contain any captures generate OP_ONCE_NC, which can be handled in the old,
803 less stack intensive way.
804
805 Check the alternative branches in turn - the matching won't pass the KET
806 for this kind of subpattern. If any one branch matches, we carry on as at
807 the end of a normal bracket, leaving the subject pointer, but resetting
808 the start-of-match value in case it was changed by \K. */
809
810 case OP_ONCE_NC:
811 prev = ecode;
812 saved_eptr = eptr;
813 do
814 {
815 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
816 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
817 {
818 mstart = md->start_match_ptr;
819 markptr = md->mark;
820 break;
821 }
822 if (rrc == MATCH_THEN)
823 {
824 next = ecode + GET(ecode,1);
825 if (md->start_match_ptr < next &&
826 (*ecode == OP_ALT || *next == OP_ALT))
827 rrc = MATCH_NOMATCH;
828 }
829
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 ecode += GET(ecode,1);
832 }
833 while (*ecode == OP_ALT);
834
835 /* If hit the end of the group (which could be repeated), fail */
836
837 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
838
839 /* Continue as from after the group, updating the offsets high water
840 mark, since extracts may have been taken. */
841
842 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
843
844 offset_top = md->end_offset_top;
845 eptr = md->end_match_ptr;
846
847 /* For a non-repeating ket, just continue at this level. This also
848 happens for a repeating ket if no characters were matched in the group.
849 This is the forcible breaking of infinite loops as implemented in Perl
850 5.005. */
851
852 if (*ecode == OP_KET || eptr == saved_eptr)
853 {
854 ecode += 1+LINK_SIZE;
855 break;
856 }
857
858 /* The repeating kets try the rest of the pattern or restart from the
859 preceding bracket, in the appropriate order. The second "call" of match()
860 uses tail recursion, to avoid using another stack frame. */
861
862 if (*ecode == OP_KETRMIN)
863 {
864 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
866 ecode = prev;
867 goto TAIL_RECURSE;
868 }
869 else /* OP_KETRMAX */
870 {
871 md->match_function_type = MATCH_CBEGROUP;
872 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
874 ecode += 1 + LINK_SIZE;
875 goto TAIL_RECURSE;
876 }
877 /* Control never gets here */
878
879 /* Handle a capturing bracket, other than those that are possessive with an
880 unlimited repeat. If there is space in the offset vector, save the current
881 subject position in the working slot at the top of the vector. We mustn't
882 change the current values of the data slot, because they may be set from a
883 previous iteration of this group, and be referred to by a reference inside
884 the group. A failure to match might occur after the group has succeeded,
885 if something later on doesn't match. For this reason, we need to restore
886 the working value and also the values of the final offsets, in case they
887 were set by a previous iteration of the same bracket.
888
889 If there isn't enough space in the offset vector, treat this as if it were
890 a non-capturing bracket. Don't worry about setting the flag for the error
891 case here; that is handled in the code for KET. */
892
893 case OP_CBRA:
894 case OP_SCBRA:
895 number = GET2(ecode, 1+LINK_SIZE);
896 offset = number << 1;
897
898 #ifdef PCRE_DEBUG
899 printf("start bracket %d\n", number);
900 printf("subject=");
901 pchars(eptr, 16, TRUE, md);
902 printf("\n");
903 #endif
904
905 if (offset < md->offset_max)
906 {
907 save_offset1 = md->offset_vector[offset];
908 save_offset2 = md->offset_vector[offset+1];
909 save_offset3 = md->offset_vector[md->offset_end - number];
910 save_capture_last = md->capture_last;
911
912 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
913 md->offset_vector[md->offset_end - number] =
914 (int)(eptr - md->start_subject);
915
916 for (;;)
917 {
918 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
919 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
920 eptrb, RM1);
921 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
922
923 /* If we backed up to a THEN, check whether it is within the current
924 branch by comparing the address of the THEN that is passed back with
925 the end of the branch. If it is within the current branch, and the
926 branch is one of two or more alternatives (it either starts or ends
927 with OP_ALT), we have reached the limit of THEN's action, so convert
928 the return code to NOMATCH, which will cause normal backtracking to
929 happen from now on. Otherwise, THEN is passed back to an outer
930 alternative. This implements Perl's treatment of parenthesized groups,
931 where a group not containing | does not affect the current alternative,
932 that is, (X) is NOT the same as (X|(*F)). */
933
934 if (rrc == MATCH_THEN)
935 {
936 next = ecode + GET(ecode,1);
937 if (md->start_match_ptr < next &&
938 (*ecode == OP_ALT || *next == OP_ALT))
939 rrc = MATCH_NOMATCH;
940 }
941
942 /* Anything other than NOMATCH is passed back. */
943
944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
945 md->capture_last = save_capture_last;
946 ecode += GET(ecode, 1);
947 if (*ecode != OP_ALT) break;
948 }
949
950 DPRINTF(("bracket %d failed\n", number));
951 md->offset_vector[offset] = save_offset1;
952 md->offset_vector[offset+1] = save_offset2;
953 md->offset_vector[md->offset_end - number] = save_offset3;
954
955 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
956
957 if (md->mark == NULL) md->mark = markptr;
958 RRETURN(rrc);
959 }
960
961 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
962 as a non-capturing bracket. */
963
964 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966
967 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
968
969 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971
972 /* Non-capturing or atomic group, except for possessive with unlimited
973 repeat and ONCE group with no captures. Loop for all the alternatives.
974
975 When we get to the final alternative within the brackets, we used to return
976 the result of a recursive call to match() whatever happened so it was
977 possible to reduce stack usage by turning this into a tail recursion,
978 except in the case of a possibly empty group. However, now that there is
979 the possibility of (*THEN) occurring in the final alternative, this
980 optimization is no longer always possible.
981
982 We can optimize if we know there are no (*THEN)s in the pattern; at present
983 this is the best that can be done.
984
985 MATCH_ONCE is returned when the end of an atomic group is successfully
986 reached, but subsequent matching fails. It passes back up the tree (causing
987 captured values to be reset) until the original atomic group level is
988 reached. This is tested by comparing md->once_target with the start of the
989 group. At this point, the return is converted into MATCH_NOMATCH so that
990 previous backup points can be taken. */
991
992 case OP_ONCE:
993 case OP_BRA:
994 case OP_SBRA:
995 DPRINTF(("start non-capturing bracket\n"));
996
997 for (;;)
998 {
999 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1000
1001 /* If this is not a possibly empty group, and there are no (*THEN)s in
1002 the pattern, and this is the final alternative, optimize as described
1003 above. */
1004
1005 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1006 {
1007 ecode += PRIV(OP_lengths)[*ecode];
1008 goto TAIL_RECURSE;
1009 }
1010
1011 /* In all other cases, we have to make another call to match(). */
1012
1013 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1014 RM2);
1015
1016 /* See comment in the code for capturing groups above about handling
1017 THEN. */
1018
1019 if (rrc == MATCH_THEN)
1020 {
1021 next = ecode + GET(ecode,1);
1022 if (md->start_match_ptr < next &&
1023 (*ecode == OP_ALT || *next == OP_ALT))
1024 rrc = MATCH_NOMATCH;
1025 }
1026
1027 if (rrc != MATCH_NOMATCH)
1028 {
1029 if (rrc == MATCH_ONCE)
1030 {
1031 const pcre_uchar *scode = ecode;
1032 if (*scode != OP_ONCE) /* If not at start, find it */
1033 {
1034 while (*scode == OP_ALT) scode += GET(scode, 1);
1035 scode -= GET(scode, 1);
1036 }
1037 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1038 }
1039 RRETURN(rrc);
1040 }
1041 ecode += GET(ecode, 1);
1042 if (*ecode != OP_ALT) break;
1043 }
1044
1045 if (md->mark == NULL) md->mark = markptr;
1046 RRETURN(MATCH_NOMATCH);
1047
1048 /* Handle possessive capturing brackets with an unlimited repeat. We come
1049 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1050 handled similarly to the normal case above. However, the matching is
1051 different. The end of these brackets will always be OP_KETRPOS, which
1052 returns MATCH_KETRPOS without going further in the pattern. By this means
1053 we can handle the group by iteration rather than recursion, thereby
1054 reducing the amount of stack needed. */
1055
1056 case OP_CBRAPOS:
1057 case OP_SCBRAPOS:
1058 allow_zero = FALSE;
1059
1060 POSSESSIVE_CAPTURE:
1061 number = GET2(ecode, 1+LINK_SIZE);
1062 offset = number << 1;
1063
1064 #ifdef PCRE_DEBUG
1065 printf("start possessive bracket %d\n", number);
1066 printf("subject=");
1067 pchars(eptr, 16, TRUE, md);
1068 printf("\n");
1069 #endif
1070
1071 if (offset < md->offset_max)
1072 {
1073 matched_once = FALSE;
1074 code_offset = ecode - md->start_code;
1075
1076 save_offset1 = md->offset_vector[offset];
1077 save_offset2 = md->offset_vector[offset+1];
1078 save_offset3 = md->offset_vector[md->offset_end - number];
1079 save_capture_last = md->capture_last;
1080
1081 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1082
1083 /* Each time round the loop, save the current subject position for use
1084 when the group matches. For MATCH_MATCH, the group has matched, so we
1085 restart it with a new subject starting position, remembering that we had
1086 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1087 usual. If we haven't matched any alternatives in any iteration, check to
1088 see if a previous iteration matched. If so, the group has matched;
1089 continue from afterwards. Otherwise it has failed; restore the previous
1090 capture values before returning NOMATCH. */
1091
1092 for (;;)
1093 {
1094 md->offset_vector[md->offset_end - number] =
1095 (int)(eptr - md->start_subject);
1096 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1097 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1098 eptrb, RM63);
1099 if (rrc == MATCH_KETRPOS)
1100 {
1101 offset_top = md->end_offset_top;
1102 eptr = md->end_match_ptr;
1103 ecode = md->start_code + code_offset;
1104 save_capture_last = md->capture_last;
1105 matched_once = TRUE;
1106 continue;
1107 }
1108
1109 /* See comment in the code for capturing groups above about handling
1110 THEN. */
1111
1112 if (rrc == MATCH_THEN)
1113 {
1114 next = ecode + GET(ecode,1);
1115 if (md->start_match_ptr < next &&
1116 (*ecode == OP_ALT || *next == OP_ALT))
1117 rrc = MATCH_NOMATCH;
1118 }
1119
1120 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1121 md->capture_last = save_capture_last;
1122 ecode += GET(ecode, 1);
1123 if (*ecode != OP_ALT) break;
1124 }
1125
1126 if (!matched_once)
1127 {
1128 md->offset_vector[offset] = save_offset1;
1129 md->offset_vector[offset+1] = save_offset2;
1130 md->offset_vector[md->offset_end - number] = save_offset3;
1131 }
1132
1133 if (md->mark == NULL) md->mark = markptr;
1134 if (allow_zero || matched_once)
1135 {
1136 ecode += 1 + LINK_SIZE;
1137 break;
1138 }
1139
1140 RRETURN(MATCH_NOMATCH);
1141 }
1142
1143 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1144 as a non-capturing bracket. */
1145
1146 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1147 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1148
1149 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1150
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1153
1154 /* Non-capturing possessive bracket with unlimited repeat. We come here
1155 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1156 without the capturing complication. It is written out separately for speed
1157 and cleanliness. */
1158
1159 case OP_BRAPOS:
1160 case OP_SBRAPOS:
1161 allow_zero = FALSE;
1162
1163 POSSESSIVE_NON_CAPTURE:
1164 matched_once = FALSE;
1165 code_offset = ecode - md->start_code;
1166
1167 for (;;)
1168 {
1169 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1170 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1171 eptrb, RM48);
1172 if (rrc == MATCH_KETRPOS)
1173 {
1174 offset_top = md->end_offset_top;
1175 eptr = md->end_match_ptr;
1176 ecode = md->start_code + code_offset;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 ecode += GET(ecode, 1);
1194 if (*ecode != OP_ALT) break;
1195 }
1196
1197 if (matched_once || allow_zero)
1198 {
1199 ecode += 1 + LINK_SIZE;
1200 break;
1201 }
1202 RRETURN(MATCH_NOMATCH);
1203
1204 /* Control never reaches here. */
1205
1206 /* Conditional group: compilation checked that there are no more than
1207 two branches. If the condition is false, skipping the first branch takes us
1208 past the end if there is only one branch, but that's OK because that is
1209 exactly what going to the ket would do. */
1210
1211 case OP_COND:
1212 case OP_SCOND:
1213 codelink = GET(ecode, 1);
1214
1215 /* Because of the way auto-callout works during compile, a callout item is
1216 inserted between OP_COND and an assertion condition. */
1217
1218 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1219 {
1220 if (pcre_callout != NULL)
1221 {
1222 pcre_callout_block cb;
1223 cb.version = 2; /* Version 2 of the callout block */
1224 cb.callout_number = ecode[LINK_SIZE+2];
1225 cb.offset_vector = md->offset_vector;
1226 cb.subject = (PCRE_SPTR)md->start_subject;
1227 cb.subject_length = (int)(md->end_subject - md->start_subject);
1228 cb.start_match = (int)(mstart - md->start_subject);
1229 cb.current_position = (int)(eptr - md->start_subject);
1230 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1231 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1232 cb.capture_top = offset_top/2;
1233 cb.capture_last = md->capture_last;
1234 cb.callout_data = md->callout_data;
1235 cb.mark = (unsigned char *)markptr;
1236 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1237 if (rrc < 0) RRETURN(rrc);
1238 }
1239 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1240 }
1241
1242 condcode = ecode[LINK_SIZE+1];
1243
1244 /* Now see what the actual condition is */
1245
1246 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1247 {
1248 if (md->recursive == NULL) /* Not recursing => FALSE */
1249 {
1250 condition = FALSE;
1251 ecode += GET(ecode, 1);
1252 }
1253 else
1254 {
1255 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1256 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1257
1258 /* If the test is for recursion into a specific subpattern, and it is
1259 false, but the test was set up by name, scan the table to see if the
1260 name refers to any other numbers, and test them. The condition is true
1261 if any one is set. */
1262
1263 if (!condition && condcode == OP_NRREF)
1264 {
1265 pcre_uchar *slotA = md->name_table;
1266 for (i = 0; i < md->name_count; i++)
1267 {
1268 if (GET2(slotA, 0) == recno) break;
1269 slotA += md->name_entry_size;
1270 }
1271
1272 /* Found a name for the number - there can be only one; duplicate
1273 names for different numbers are allowed, but not vice versa. First
1274 scan down for duplicates. */
1275
1276 if (i < md->name_count)
1277 {
1278 pcre_uchar *slotB = slotA;
1279 while (slotB > md->name_table)
1280 {
1281 slotB -= md->name_entry_size;
1282 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1283 {
1284 condition = GET2(slotB, 0) == md->recursive->group_num;
1285 if (condition) break;
1286 }
1287 else break;
1288 }
1289
1290 /* Scan up for duplicates */
1291
1292 if (!condition)
1293 {
1294 slotB = slotA;
1295 for (i++; i < md->name_count; i++)
1296 {
1297 slotB += md->name_entry_size;
1298 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1299 {
1300 condition = GET2(slotB, 0) == md->recursive->group_num;
1301 if (condition) break;
1302 }
1303 else break;
1304 }
1305 }
1306 }
1307 }
1308
1309 /* Choose branch according to the condition */
1310
1311 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1312 }
1313 }
1314
1315 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1316 {
1317 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1318 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1319
1320 /* If the numbered capture is unset, but the reference was by name,
1321 scan the table to see if the name refers to any other numbers, and test
1322 them. The condition is true if any one is set. This is tediously similar
1323 to the code above, but not close enough to try to amalgamate. */
1324
1325 if (!condition && condcode == OP_NCREF)
1326 {
1327 int refno = offset >> 1;
1328 pcre_uchar *slotA = md->name_table;
1329
1330 for (i = 0; i < md->name_count; i++)
1331 {
1332 if (GET2(slotA, 0) == refno) break;
1333 slotA += md->name_entry_size;
1334 }
1335
1336 /* Found a name for the number - there can be only one; duplicate names
1337 for different numbers are allowed, but not vice versa. First scan down
1338 for duplicates. */
1339
1340 if (i < md->name_count)
1341 {
1342 pcre_uchar *slotB = slotA;
1343 while (slotB > md->name_table)
1344 {
1345 slotB -= md->name_entry_size;
1346 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1347 {
1348 offset = GET2(slotB, 0) << 1;
1349 condition = offset < offset_top &&
1350 md->offset_vector[offset] >= 0;
1351 if (condition) break;
1352 }
1353 else break;
1354 }
1355
1356 /* Scan up for duplicates */
1357
1358 if (!condition)
1359 {
1360 slotB = slotA;
1361 for (i++; i < md->name_count; i++)
1362 {
1363 slotB += md->name_entry_size;
1364 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1365 {
1366 offset = GET2(slotB, 0) << 1;
1367 condition = offset < offset_top &&
1368 md->offset_vector[offset] >= 0;
1369 if (condition) break;
1370 }
1371 else break;
1372 }
1373 }
1374 }
1375 }
1376
1377 /* Choose branch according to the condition */
1378
1379 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1380 }
1381
1382 else if (condcode == OP_DEF) /* DEFINE - always false */
1383 {
1384 condition = FALSE;
1385 ecode += GET(ecode, 1);
1386 }
1387
1388 /* The condition is an assertion. Call match() to evaluate it - setting
1389 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1390 an assertion. */
1391
1392 else
1393 {
1394 md->match_function_type = MATCH_CONDASSERT;
1395 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1396 if (rrc == MATCH_MATCH)
1397 {
1398 if (md->end_offset_top > offset_top)
1399 offset_top = md->end_offset_top; /* Captures may have happened */
1400 condition = TRUE;
1401 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1402 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1403 }
1404
1405 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1406 assertion; it is therefore treated as NOMATCH. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 else
1413 {
1414 condition = FALSE;
1415 ecode += codelink;
1416 }
1417 }
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, can
1420 use tail recursion to avoid using another stack frame, except when there is
1421 unlimited repeat of a possibly empty group. In the latter case, a recursive
1422 call to match() is always required, unless the second alternative doesn't
1423 exist, in which case we can just plough on. Note that, for compatibility
1424 with Perl, the | in a conditional group is NOT treated as creating two
1425 alternatives. If a THEN is encountered in the branch, it propagates out to
1426 the enclosing alternative (unless nested in a deeper set of alternatives,
1427 of course). */
1428
1429 if (condition || *ecode == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 ecode += 1 + LINK_SIZE;
1434 goto TAIL_RECURSE;
1435 }
1436
1437 md->match_function_type = MATCH_CBEGROUP;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1439 RRETURN(rrc);
1440 }
1441
1442 /* Condition false & no alternative; continue after the group. */
1443
1444 else
1445 {
1446 ecode += 1 + LINK_SIZE;
1447 }
1448 break;
1449
1450
1451 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1452 to close any currently open capturing brackets. */
1453
1454 case OP_CLOSE:
1455 number = GET2(ecode, 1);
1456 offset = number << 1;
1457
1458 #ifdef PCRE_DEBUG
1459 printf("end bracket %d at *ACCEPT", number);
1460 printf("\n");
1461 #endif
1462
1463 md->capture_last = number;
1464 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1465 {
1466 md->offset_vector[offset] =
1467 md->offset_vector[md->offset_end - number];
1468 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1469 if (offset_top <= offset) offset_top = offset + 2;
1470 }
1471 ecode += 1 + IMM2_SIZE;
1472 break;
1473
1474
1475 /* End of the pattern, either real or forced. */
1476
1477 case OP_END:
1478 case OP_ACCEPT:
1479 case OP_ASSERT_ACCEPT:
1480
1481 /* If we have matched an empty string, fail if not in an assertion and not
1482 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1483 is set and we have matched at the start of the subject. In both cases,
1484 backtracking will then try other alternatives, if any. */
1485
1486 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1487 md->recursive == NULL &&
1488 (md->notempty ||
1489 (md->notempty_atstart &&
1490 mstart == md->start_subject + md->start_offset)))
1491 MRRETURN(MATCH_NOMATCH);
1492
1493 /* Otherwise, we have a match. */
1494
1495 md->end_match_ptr = eptr; /* Record where we ended */
1496 md->end_offset_top = offset_top; /* and how many extracts were taken */
1497 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1498
1499 /* For some reason, the macros don't work properly if an expression is
1500 given as the argument to MRRETURN when the heap is in use. */
1501
1502 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1503 MRRETURN(rrc);
1504
1505 /* Assertion brackets. Check the alternative branches in turn - the
1506 matching won't pass the KET for an assertion. If any one branch matches,
1507 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1508 start of each branch to move the current point backwards, so the code at
1509 this level is identical to the lookahead case. When the assertion is part
1510 of a condition, we want to return immediately afterwards. The caller of
1511 this incarnation of the match() function will have set MATCH_CONDASSERT in
1512 md->match_function_type, and one of these opcodes will be the first opcode
1513 that is processed. We use a local variable that is preserved over calls to
1514 match() to remember this case. */
1515
1516 case OP_ASSERT:
1517 case OP_ASSERTBACK:
1518 if (md->match_function_type == MATCH_CONDASSERT)
1519 {
1520 condassert = TRUE;
1521 md->match_function_type = 0;
1522 }
1523 else condassert = FALSE;
1524
1525 do
1526 {
1527 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1528 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1529 {
1530 mstart = md->start_match_ptr; /* In case \K reset it */
1531 markptr = md->mark;
1532 break;
1533 }
1534
1535 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1536 as NOMATCH. */
1537
1538 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1539 ecode += GET(ecode, 1);
1540 }
1541 while (*ecode == OP_ALT);
1542
1543 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1544
1545 /* If checking an assertion for a condition, return MATCH_MATCH. */
1546
1547 if (condassert) RRETURN(MATCH_MATCH);
1548
1549 /* Continue from after the assertion, updating the offsets high water
1550 mark, since extracts may have been taken during the assertion. */
1551
1552 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1553 ecode += 1 + LINK_SIZE;
1554 offset_top = md->end_offset_top;
1555 continue;
1556
1557 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1558 PRUNE, or COMMIT means we must assume failure without checking subsequent
1559 branches. */
1560
1561 case OP_ASSERT_NOT:
1562 case OP_ASSERTBACK_NOT:
1563 if (md->match_function_type == MATCH_CONDASSERT)
1564 {
1565 condassert = TRUE;
1566 md->match_function_type = 0;
1567 }
1568 else condassert = FALSE;
1569
1570 do
1571 {
1572 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1573 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1574 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1575 {
1576 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1577 break;
1578 }
1579
1580 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1581 as NOMATCH. */
1582
1583 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1584 ecode += GET(ecode,1);
1585 }
1586 while (*ecode == OP_ALT);
1587
1588 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1589
1590 ecode += 1 + LINK_SIZE;
1591 continue;
1592
1593 /* Move the subject pointer back. This occurs only at the start of
1594 each branch of a lookbehind assertion. If we are too close to the start to
1595 move back, this match function fails. When working with UTF-8 we move
1596 back a number of characters, not bytes. */
1597
1598 case OP_REVERSE:
1599 #ifdef SUPPORT_UTF8
1600 if (utf)
1601 {
1602 i = GET(ecode, 1);
1603 while (i-- > 0)
1604 {
1605 eptr--;
1606 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1607 BACKCHAR(eptr);
1608 }
1609 }
1610 else
1611 #endif
1612
1613 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1614
1615 {
1616 eptr -= GET(ecode, 1);
1617 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1618 }
1619
1620 /* Save the earliest consulted character, then skip to next op code */
1621
1622 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1623 ecode += 1 + LINK_SIZE;
1624 break;
1625
1626 /* The callout item calls an external function, if one is provided, passing
1627 details of the match so far. This is mainly for debugging, though the
1628 function is able to force a failure. */
1629
1630 case OP_CALLOUT:
1631 if (pcre_callout != NULL)
1632 {
1633 pcre_callout_block cb;
1634 cb.version = 2; /* Version 2 of the callout block */
1635 cb.callout_number = ecode[1];
1636 cb.offset_vector = md->offset_vector;
1637 cb.subject = (PCRE_SPTR)md->start_subject;
1638 cb.subject_length = (int)(md->end_subject - md->start_subject);
1639 cb.start_match = (int)(mstart - md->start_subject);
1640 cb.current_position = (int)(eptr - md->start_subject);
1641 cb.pattern_position = GET(ecode, 2);
1642 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1643 cb.capture_top = offset_top/2;
1644 cb.capture_last = md->capture_last;
1645 cb.callout_data = md->callout_data;
1646 cb.mark = (unsigned char *)markptr;
1647 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1648 if (rrc < 0) RRETURN(rrc);
1649 }
1650 ecode += 2 + 2*LINK_SIZE;
1651 break;
1652
1653 /* Recursion either matches the current regex, or some subexpression. The
1654 offset data is the offset to the starting bracket from the start of the
1655 whole pattern. (This is so that it works from duplicated subpatterns.)
1656
1657 The state of the capturing groups is preserved over recursion, and
1658 re-instated afterwards. We don't know how many are started and not yet
1659 finished (offset_top records the completed total) so we just have to save
1660 all the potential data. There may be up to 65535 such values, which is too
1661 large to put on the stack, but using malloc for small numbers seems
1662 expensive. As a compromise, the stack is used when there are no more than
1663 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1664
1665 There are also other values that have to be saved. We use a chained
1666 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1667 for the original version of this logic. It has, however, been hacked around
1668 a lot, so he is not to blame for the current way it works. */
1669
1670 case OP_RECURSE:
1671 {
1672 recursion_info *ri;
1673 int recno;
1674
1675 callpat = md->start_code + GET(ecode, 1);
1676 recno = (callpat == md->start_code)? 0 :
1677 GET2(callpat, 1 + LINK_SIZE);
1678
1679 /* Check for repeating a recursion without advancing the subject pointer.
1680 This should catch convoluted mutual recursions. (Some simple cases are
1681 caught at compile time.) */
1682
1683 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1684 if (recno == ri->group_num && eptr == ri->subject_position)
1685 RRETURN(PCRE_ERROR_RECURSELOOP);
1686
1687 /* Add to "recursing stack" */
1688
1689 new_recursive.group_num = recno;
1690 new_recursive.subject_position = eptr;
1691 new_recursive.prevrec = md->recursive;
1692 md->recursive = &new_recursive;
1693
1694 /* Where to continue from afterwards */
1695
1696 ecode += 1 + LINK_SIZE;
1697
1698 /* Now save the offset data */
1699
1700 new_recursive.saved_max = md->offset_end;
1701 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1702 new_recursive.offset_save = stacksave;
1703 else
1704 {
1705 new_recursive.offset_save =
1706 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1707 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1708 }
1709 memcpy(new_recursive.offset_save, md->offset_vector,
1710 new_recursive.saved_max * sizeof(int));
1711
1712 /* OK, now we can do the recursion. After processing each alternative,
1713 restore the offset data. If there were nested recursions, md->recursive
1714 might be changed, so reset it before looping. */
1715
1716 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1717 cbegroup = (*callpat >= OP_SBRA);
1718 do
1719 {
1720 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1721 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1722 md, eptrb, RM6);
1723 memcpy(md->offset_vector, new_recursive.offset_save,
1724 new_recursive.saved_max * sizeof(int));
1725 md->recursive = new_recursive.prevrec;
1726 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1727 {
1728 DPRINTF(("Recursion matched\n"));
1729 if (new_recursive.offset_save != stacksave)
1730 (pcre_free)(new_recursive.offset_save);
1731
1732 /* Set where we got to in the subject, and reset the start in case
1733 it was changed by \K. This *is* propagated back out of a recursion,
1734 for Perl compatibility. */
1735
1736 eptr = md->end_match_ptr;
1737 mstart = md->start_match_ptr;
1738 goto RECURSION_MATCHED; /* Exit loop; end processing */
1739 }
1740
1741 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1742 as NOMATCH. */
1743
1744 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1745 {
1746 DPRINTF(("Recursion gave error %d\n", rrc));
1747 if (new_recursive.offset_save != stacksave)
1748 (pcre_free)(new_recursive.offset_save);
1749 RRETURN(rrc);
1750 }
1751
1752 md->recursive = &new_recursive;
1753 callpat += GET(callpat, 1);
1754 }
1755 while (*callpat == OP_ALT);
1756
1757 DPRINTF(("Recursion didn't match\n"));
1758 md->recursive = new_recursive.prevrec;
1759 if (new_recursive.offset_save != stacksave)
1760 (pcre_free)(new_recursive.offset_save);
1761 MRRETURN(MATCH_NOMATCH);
1762 }
1763
1764 RECURSION_MATCHED:
1765 break;
1766
1767 /* An alternation is the end of a branch; scan along to find the end of the
1768 bracketed group and go to there. */
1769
1770 case OP_ALT:
1771 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1772 break;
1773
1774 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1775 indicating that it may occur zero times. It may repeat infinitely, or not
1776 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1777 with fixed upper repeat limits are compiled as a number of copies, with the
1778 optional ones preceded by BRAZERO or BRAMINZERO. */
1779
1780 case OP_BRAZERO:
1781 next = ecode + 1;
1782 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784 do next += GET(next, 1); while (*next == OP_ALT);
1785 ecode = next + 1 + LINK_SIZE;
1786 break;
1787
1788 case OP_BRAMINZERO:
1789 next = ecode + 1;
1790 do next += GET(next, 1); while (*next == OP_ALT);
1791 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793 ecode++;
1794 break;
1795
1796 case OP_SKIPZERO:
1797 next = ecode+1;
1798 do next += GET(next,1); while (*next == OP_ALT);
1799 ecode = next + 1 + LINK_SIZE;
1800 break;
1801
1802 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1803 here; just jump to the group, with allow_zero set TRUE. */
1804
1805 case OP_BRAPOSZERO:
1806 op = *(++ecode);
1807 allow_zero = TRUE;
1808 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1809 goto POSSESSIVE_NON_CAPTURE;
1810
1811 /* End of a group, repeated or non-repeating. */
1812
1813 case OP_KET:
1814 case OP_KETRMIN:
1815 case OP_KETRMAX:
1816 case OP_KETRPOS:
1817 prev = ecode - GET(ecode, 1);
1818
1819 /* If this was a group that remembered the subject start, in order to break
1820 infinite repeats of empty string matches, retrieve the subject start from
1821 the chain. Otherwise, set it NULL. */
1822
1823 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1824 {
1825 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1826 eptrb = eptrb->epb_prev; /* Backup to previous group */
1827 }
1828 else saved_eptr = NULL;
1829
1830 /* If we are at the end of an assertion group or a non-capturing atomic
1831 group, stop matching and return MATCH_MATCH, but record the current high
1832 water mark for use by positive assertions. We also need to record the match
1833 start in case it was changed by \K. */
1834
1835 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1836 *prev == OP_ONCE_NC)
1837 {
1838 md->end_match_ptr = eptr; /* For ONCE_NC */
1839 md->end_offset_top = offset_top;
1840 md->start_match_ptr = mstart;
1841 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1842 }
1843
1844 /* For capturing groups we have to check the group number back at the start
1845 and if necessary complete handling an extraction by setting the offsets and
1846 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1847 into group 0, so it won't be picked up here. Instead, we catch it when the
1848 OP_END is reached. Other recursion is handled here. We just have to record
1849 the current subject position and start match pointer and give a MATCH
1850 return. */
1851
1852 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1853 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1854 {
1855 number = GET2(prev, 1+LINK_SIZE);
1856 offset = number << 1;
1857
1858 #ifdef PCRE_DEBUG
1859 printf("end bracket %d", number);
1860 printf("\n");
1861 #endif
1862
1863 /* Handle a recursively called group. */
1864
1865 if (md->recursive != NULL && md->recursive->group_num == number)
1866 {
1867 md->end_match_ptr = eptr;
1868 md->start_match_ptr = mstart;
1869 RRETURN(MATCH_MATCH);
1870 }
1871
1872 /* Deal with capturing */
1873
1874 md->capture_last = number;
1875 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1876 {
1877 /* If offset is greater than offset_top, it means that we are
1878 "skipping" a capturing group, and that group's offsets must be marked
1879 unset. In earlier versions of PCRE, all the offsets were unset at the
1880 start of matching, but this doesn't work because atomic groups and
1881 assertions can cause a value to be set that should later be unset.
1882 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1883 part of the atomic group, but this is not on the final matching path,
1884 so must be unset when 2 is set. (If there is no group 2, there is no
1885 problem, because offset_top will then be 2, indicating no capture.) */
1886
1887 if (offset > offset_top)
1888 {
1889 register int *iptr = md->offset_vector + offset_top;
1890 register int *iend = md->offset_vector + offset;
1891 while (iptr < iend) *iptr++ = -1;
1892 }
1893
1894 /* Now make the extraction */
1895
1896 md->offset_vector[offset] =
1897 md->offset_vector[md->offset_end - number];
1898 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1899 if (offset_top <= offset) offset_top = offset + 2;
1900 }
1901 }
1902
1903 /* For an ordinary non-repeating ket, just continue at this level. This
1904 also happens for a repeating ket if no characters were matched in the
1905 group. This is the forcible breaking of infinite loops as implemented in
1906 Perl 5.005. For a non-repeating atomic group that includes captures,
1907 establish a backup point by processing the rest of the pattern at a lower
1908 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1909 original OP_ONCE level, thereby bypassing intermediate backup points, but
1910 resetting any captures that happened along the way. */
1911
1912 if (*ecode == OP_KET || eptr == saved_eptr)
1913 {
1914 if (*prev == OP_ONCE)
1915 {
1916 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1917 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1918 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1919 RRETURN(MATCH_ONCE);
1920 }
1921 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1922 break;
1923 }
1924
1925 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1926 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1927 at a time from the outer level, thus saving stack. */
1928
1929 if (*ecode == OP_KETRPOS)
1930 {
1931 md->end_match_ptr = eptr;
1932 md->end_offset_top = offset_top;
1933 RRETURN(MATCH_KETRPOS);
1934 }
1935
1936 /* The normal repeating kets try the rest of the pattern or restart from
1937 the preceding bracket, in the appropriate order. In the second case, we can
1938 use tail recursion to avoid using another stack frame, unless we have an
1939 atomic group or an unlimited repeat of a group that can match an empty
1940 string. */
1941
1942 if (*ecode == OP_KETRMIN)
1943 {
1944 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1946 if (*prev == OP_ONCE)
1947 {
1948 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1950 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1951 RRETURN(MATCH_ONCE);
1952 }
1953 if (*prev >= OP_SBRA) /* Could match an empty string */
1954 {
1955 md->match_function_type = MATCH_CBEGROUP;
1956 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1957 RRETURN(rrc);
1958 }
1959 ecode = prev;
1960 goto TAIL_RECURSE;
1961 }
1962 else /* OP_KETRMAX */
1963 {
1964 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1965 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1966 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 if (*prev == OP_ONCE)
1969 {
1970 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1972 md->once_target = prev;
1973 RRETURN(MATCH_ONCE);
1974 }
1975 ecode += 1 + LINK_SIZE;
1976 goto TAIL_RECURSE;
1977 }
1978 /* Control never gets here */
1979
1980 /* Not multiline mode: start of subject assertion, unless notbol. */
1981
1982 case OP_CIRC:
1983 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1984
1985 /* Start of subject assertion */
1986
1987 case OP_SOD:
1988 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1989 ecode++;
1990 break;
1991
1992 /* Multiline mode: start of subject unless notbol, or after any newline. */
1993
1994 case OP_CIRCM:
1995 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1996 if (eptr != md->start_subject &&
1997 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1998 MRRETURN(MATCH_NOMATCH);
1999 ecode++;
2000 break;
2001
2002 /* Start of match assertion */
2003
2004 case OP_SOM:
2005 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
2006 ecode++;
2007 break;
2008
2009 /* Reset the start of match point */
2010
2011 case OP_SET_SOM:
2012 mstart = eptr;
2013 ecode++;
2014 break;
2015
2016 /* Multiline mode: assert before any newline, or before end of subject
2017 unless noteol is set. */
2018
2019 case OP_DOLLM:
2020 if (eptr < md->end_subject)
2021 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
2022 else
2023 {
2024 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2025 SCHECK_PARTIAL();
2026 }
2027 ecode++;
2028 break;
2029
2030 /* Not multiline mode: assert before a terminating newline or before end of
2031 subject unless noteol is set. */
2032
2033 case OP_DOLL:
2034 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2035 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2036
2037 /* ... else fall through for endonly */
2038
2039 /* End of subject assertion (\z) */
2040
2041 case OP_EOD:
2042 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
2043 SCHECK_PARTIAL();
2044 ecode++;
2045 break;
2046
2047 /* End of subject or ending \n assertion (\Z) */
2048
2049 case OP_EODN:
2050 ASSERT_NL_OR_EOS:
2051 if (eptr < md->end_subject &&
2052 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2053 MRRETURN(MATCH_NOMATCH);
2054
2055 /* Either at end of string or \n before end. */
2056
2057 SCHECK_PARTIAL();
2058 ecode++;
2059 break;
2060
2061 /* Word boundary assertions */
2062
2063 case OP_NOT_WORD_BOUNDARY:
2064 case OP_WORD_BOUNDARY:
2065 {
2066
2067 /* Find out if the previous and current characters are "word" characters.
2068 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2069 be "non-word" characters. Remember the earliest consulted character for
2070 partial matching. */
2071
2072 #ifdef SUPPORT_UTF8
2073 if (utf)
2074 {
2075 /* Get status of previous character */
2076
2077 if (eptr == md->start_subject) prev_is_word = FALSE; else
2078 {
2079 PCRE_PUCHAR lastptr = eptr - 1;
2080 while((*lastptr & 0xc0) == 0x80) lastptr--;
2081 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2082 GETCHAR(c, lastptr);
2083 #ifdef SUPPORT_UCP
2084 if (md->use_ucp)
2085 {
2086 if (c == '_') prev_is_word = TRUE; else
2087 {
2088 int cat = UCD_CATEGORY(c);
2089 prev_is_word = (cat == ucp_L || cat == ucp_N);
2090 }
2091 }
2092 else
2093 #endif
2094 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2095 }
2096
2097 /* Get status of next character */
2098
2099 if (eptr >= md->end_subject)
2100 {
2101 SCHECK_PARTIAL();
2102 cur_is_word = FALSE;
2103 }
2104 else
2105 {
2106 GETCHAR(c, eptr);
2107 #ifdef SUPPORT_UCP
2108 if (md->use_ucp)
2109 {
2110 if (c == '_') cur_is_word = TRUE; else
2111 {
2112 int cat = UCD_CATEGORY(c);
2113 cur_is_word = (cat == ucp_L || cat == ucp_N);
2114 }
2115 }
2116 else
2117 #endif
2118 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2119 }
2120 }
2121 else
2122 #endif
2123
2124 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2125 consistency with the behaviour of \w we do use it in this case. */
2126
2127 {
2128 /* Get status of previous character */
2129
2130 if (eptr == md->start_subject) prev_is_word = FALSE; else
2131 {
2132 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2133 #ifdef SUPPORT_UCP
2134 if (md->use_ucp)
2135 {
2136 c = eptr[-1];
2137 if (c == '_') prev_is_word = TRUE; else
2138 {
2139 int cat = UCD_CATEGORY(c);
2140 prev_is_word = (cat == ucp_L || cat == ucp_N);
2141 }
2142 }
2143 else
2144 #endif
2145 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2146 }
2147
2148 /* Get status of next character */
2149
2150 if (eptr >= md->end_subject)
2151 {
2152 SCHECK_PARTIAL();
2153 cur_is_word = FALSE;
2154 }
2155 else
2156 #ifdef SUPPORT_UCP
2157 if (md->use_ucp)
2158 {
2159 c = *eptr;
2160 if (c == '_') cur_is_word = TRUE; else
2161 {
2162 int cat = UCD_CATEGORY(c);
2163 cur_is_word = (cat == ucp_L || cat == ucp_N);
2164 }
2165 }
2166 else
2167 #endif
2168 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2169 }
2170
2171 /* Now see if the situation is what we want */
2172
2173 if ((*ecode++ == OP_WORD_BOUNDARY)?
2174 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2175 MRRETURN(MATCH_NOMATCH);
2176 }
2177 break;
2178
2179 /* Match a single character type; inline for speed */
2180
2181 case OP_ANY:
2182 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2183 /* Fall through */
2184
2185 case OP_ALLANY:
2186 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2187 { /* not be updated before SCHECK_PARTIAL. */
2188 SCHECK_PARTIAL();
2189 MRRETURN(MATCH_NOMATCH);
2190 }
2191 eptr++;
2192 if (utf) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2193 ecode++;
2194 break;
2195
2196 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2197 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2198
2199 case OP_ANYBYTE:
2200 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2201 { /* not be updated before SCHECK_PARTIAL. */
2202 SCHECK_PARTIAL();
2203 MRRETURN(MATCH_NOMATCH);
2204 }
2205 eptr++;
2206 ecode++;
2207 break;
2208
2209 case OP_NOT_DIGIT:
2210 if (eptr >= md->end_subject)
2211 {
2212 SCHECK_PARTIAL();
2213 MRRETURN(MATCH_NOMATCH);
2214 }
2215 GETCHARINCTEST(c, eptr);
2216 if (
2217 #ifdef SUPPORT_UTF8
2218 c < 256 &&
2219 #endif
2220 (md->ctypes[c] & ctype_digit) != 0
2221 )
2222 MRRETURN(MATCH_NOMATCH);
2223 ecode++;
2224 break;
2225
2226 case OP_DIGIT:
2227 if (eptr >= md->end_subject)
2228 {
2229 SCHECK_PARTIAL();
2230 MRRETURN(MATCH_NOMATCH);
2231 }
2232 GETCHARINCTEST(c, eptr);
2233 if (
2234 #ifdef SUPPORT_UTF8
2235 c >= 256 ||
2236 #endif
2237 (md->ctypes[c] & ctype_digit) == 0
2238 )
2239 MRRETURN(MATCH_NOMATCH);
2240 ecode++;
2241 break;
2242
2243 case OP_NOT_WHITESPACE:
2244 if (eptr >= md->end_subject)
2245 {
2246 SCHECK_PARTIAL();
2247 MRRETURN(MATCH_NOMATCH);
2248 }
2249 GETCHARINCTEST(c, eptr);
2250 if (
2251 #ifdef SUPPORT_UTF8
2252 c < 256 &&
2253 #endif
2254 (md->ctypes[c] & ctype_space) != 0
2255 )
2256 MRRETURN(MATCH_NOMATCH);
2257 ecode++;
2258 break;
2259
2260 case OP_WHITESPACE:
2261 if (eptr >= md->end_subject)
2262 {
2263 SCHECK_PARTIAL();
2264 MRRETURN(MATCH_NOMATCH);
2265 }
2266 GETCHARINCTEST(c, eptr);
2267 if (
2268 #ifdef SUPPORT_UTF8
2269 c >= 256 ||
2270 #endif
2271 (md->ctypes[c] & ctype_space) == 0
2272 )
2273 MRRETURN(MATCH_NOMATCH);
2274 ecode++;
2275 break;
2276
2277 case OP_NOT_WORDCHAR:
2278 if (eptr >= md->end_subject)
2279 {
2280 SCHECK_PARTIAL();
2281 MRRETURN(MATCH_NOMATCH);
2282 }
2283 GETCHARINCTEST(c, eptr);
2284 if (
2285 #ifdef SUPPORT_UTF8
2286 c < 256 &&
2287 #endif
2288 (md->ctypes[c] & ctype_word) != 0
2289 )
2290 MRRETURN(MATCH_NOMATCH);
2291 ecode++;
2292 break;
2293
2294 case OP_WORDCHAR:
2295 if (eptr >= md->end_subject)
2296 {
2297 SCHECK_PARTIAL();
2298 MRRETURN(MATCH_NOMATCH);
2299 }
2300 GETCHARINCTEST(c, eptr);
2301 if (
2302 #ifdef SUPPORT_UTF8
2303 c >= 256 ||
2304 #endif
2305 (md->ctypes[c] & ctype_word) == 0
2306 )
2307 MRRETURN(MATCH_NOMATCH);
2308 ecode++;
2309 break;
2310
2311 case OP_ANYNL:
2312 if (eptr >= md->end_subject)
2313 {
2314 SCHECK_PARTIAL();
2315 MRRETURN(MATCH_NOMATCH);
2316 }
2317 GETCHARINCTEST(c, eptr);
2318 switch(c)
2319 {
2320 default: MRRETURN(MATCH_NOMATCH);
2321
2322 case 0x000d:
2323 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2324 break;
2325
2326 case 0x000a:
2327 break;
2328
2329 case 0x000b:
2330 case 0x000c:
2331 case 0x0085:
2332 case 0x2028:
2333 case 0x2029:
2334 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2335 break;
2336 }
2337 ecode++;
2338 break;
2339
2340 case OP_NOT_HSPACE:
2341 if (eptr >= md->end_subject)
2342 {
2343 SCHECK_PARTIAL();
2344 MRRETURN(MATCH_NOMATCH);
2345 }
2346 GETCHARINCTEST(c, eptr);
2347 switch(c)
2348 {
2349 default: break;
2350 case 0x09: /* HT */
2351 case 0x20: /* SPACE */
2352 case 0xa0: /* NBSP */
2353 case 0x1680: /* OGHAM SPACE MARK */
2354 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2355 case 0x2000: /* EN QUAD */
2356 case 0x2001: /* EM QUAD */
2357 case 0x2002: /* EN SPACE */
2358 case 0x2003: /* EM SPACE */
2359 case 0x2004: /* THREE-PER-EM SPACE */
2360 case 0x2005: /* FOUR-PER-EM SPACE */
2361 case 0x2006: /* SIX-PER-EM SPACE */
2362 case 0x2007: /* FIGURE SPACE */
2363 case 0x2008: /* PUNCTUATION SPACE */
2364 case 0x2009: /* THIN SPACE */
2365 case 0x200A: /* HAIR SPACE */
2366 case 0x202f: /* NARROW NO-BREAK SPACE */
2367 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2368 case 0x3000: /* IDEOGRAPHIC SPACE */
2369 MRRETURN(MATCH_NOMATCH);
2370 }
2371 ecode++;
2372 break;
2373
2374 case OP_HSPACE:
2375 if (eptr >= md->end_subject)
2376 {
2377 SCHECK_PARTIAL();
2378 MRRETURN(MATCH_NOMATCH);
2379 }
2380 GETCHARINCTEST(c, eptr);
2381 switch(c)
2382 {
2383 default: MRRETURN(MATCH_NOMATCH);
2384 case 0x09: /* HT */
2385 case 0x20: /* SPACE */
2386 case 0xa0: /* NBSP */
2387 case 0x1680: /* OGHAM SPACE MARK */
2388 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2389 case 0x2000: /* EN QUAD */
2390 case 0x2001: /* EM QUAD */
2391 case 0x2002: /* EN SPACE */
2392 case 0x2003: /* EM SPACE */
2393 case 0x2004: /* THREE-PER-EM SPACE */
2394 case 0x2005: /* FOUR-PER-EM SPACE */
2395 case 0x2006: /* SIX-PER-EM SPACE */
2396 case 0x2007: /* FIGURE SPACE */
2397 case 0x2008: /* PUNCTUATION SPACE */
2398 case 0x2009: /* THIN SPACE */
2399 case 0x200A: /* HAIR SPACE */
2400 case 0x202f: /* NARROW NO-BREAK SPACE */
2401 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2402 case 0x3000: /* IDEOGRAPHIC SPACE */
2403 break;
2404 }
2405 ecode++;
2406 break;
2407
2408 case OP_NOT_VSPACE:
2409 if (eptr >= md->end_subject)
2410 {
2411 SCHECK_PARTIAL();
2412 MRRETURN(MATCH_NOMATCH);
2413 }
2414 GETCHARINCTEST(c, eptr);
2415 switch(c)
2416 {
2417 default: break;
2418 case 0x0a: /* LF */
2419 case 0x0b: /* VT */
2420 case 0x0c: /* FF */
2421 case 0x0d: /* CR */
2422 case 0x85: /* NEL */
2423 case 0x2028: /* LINE SEPARATOR */
2424 case 0x2029: /* PARAGRAPH SEPARATOR */
2425 MRRETURN(MATCH_NOMATCH);
2426 }
2427 ecode++;
2428 break;
2429
2430 case OP_VSPACE:
2431 if (eptr >= md->end_subject)
2432 {
2433 SCHECK_PARTIAL();
2434 MRRETURN(MATCH_NOMATCH);
2435 }
2436 GETCHARINCTEST(c, eptr);
2437 switch(c)
2438 {
2439 default: MRRETURN(MATCH_NOMATCH);
2440 case 0x0a: /* LF */
2441 case 0x0b: /* VT */
2442 case 0x0c: /* FF */
2443 case 0x0d: /* CR */
2444 case 0x85: /* NEL */
2445 case 0x2028: /* LINE SEPARATOR */
2446 case 0x2029: /* PARAGRAPH SEPARATOR */
2447 break;
2448 }
2449 ecode++;
2450 break;
2451
2452 #ifdef SUPPORT_UCP
2453 /* Check the next character by Unicode property. We will get here only
2454 if the support is in the binary; otherwise a compile-time error occurs. */
2455
2456 case OP_PROP:
2457 case OP_NOTPROP:
2458 if (eptr >= md->end_subject)
2459 {
2460 SCHECK_PARTIAL();
2461 MRRETURN(MATCH_NOMATCH);
2462 }
2463 GETCHARINCTEST(c, eptr);
2464 {
2465 const ucd_record *prop = GET_UCD(c);
2466
2467 switch(ecode[1])
2468 {
2469 case PT_ANY:
2470 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2471 break;
2472
2473 case PT_LAMP:
2474 if ((prop->chartype == ucp_Lu ||
2475 prop->chartype == ucp_Ll ||
2476 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2477 MRRETURN(MATCH_NOMATCH);
2478 break;
2479
2480 case PT_GC:
2481 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2482 MRRETURN(MATCH_NOMATCH);
2483 break;
2484
2485 case PT_PC:
2486 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2487 MRRETURN(MATCH_NOMATCH);
2488 break;
2489
2490 case PT_SC:
2491 if ((ecode[2] != prop->script) == (op == OP_PROP))
2492 MRRETURN(MATCH_NOMATCH);
2493 break;
2494
2495 /* These are specials */
2496
2497 case PT_ALNUM:
2498 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2499 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2500 MRRETURN(MATCH_NOMATCH);
2501 break;
2502
2503 case PT_SPACE: /* Perl space */
2504 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2505 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2506 == (op == OP_NOTPROP))
2507 MRRETURN(MATCH_NOMATCH);
2508 break;
2509
2510 case PT_PXSPACE: /* POSIX space */
2511 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2512 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2513 c == CHAR_FF || c == CHAR_CR)
2514 == (op == OP_NOTPROP))
2515 MRRETURN(MATCH_NOMATCH);
2516 break;
2517
2518 case PT_WORD:
2519 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2520 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2521 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2522 MRRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 /* This should never occur */
2526
2527 default:
2528 RRETURN(PCRE_ERROR_INTERNAL);
2529 }
2530
2531 ecode += 3;
2532 }
2533 break;
2534
2535 /* Match an extended Unicode sequence. We will get here only if the support
2536 is in the binary; otherwise a compile-time error occurs. */
2537
2538 case OP_EXTUNI:
2539 if (eptr >= md->end_subject)
2540 {
2541 SCHECK_PARTIAL();
2542 MRRETURN(MATCH_NOMATCH);
2543 }
2544 GETCHARINCTEST(c, eptr);
2545 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2546 while (eptr < md->end_subject)
2547 {
2548 int len = 1;
2549 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2550 if (UCD_CATEGORY(c) != ucp_M) break;
2551 eptr += len;
2552 }
2553 ecode++;
2554 break;
2555 #endif
2556
2557
2558 /* Match a back reference, possibly repeatedly. Look past the end of the
2559 item to see if there is repeat information following. The code is similar
2560 to that for character classes, but repeated for efficiency. Then obey
2561 similar code to character type repeats - written out again for speed.
2562 However, if the referenced string is the empty string, always treat
2563 it as matched, any number of times (otherwise there could be infinite
2564 loops). */
2565
2566 case OP_REF:
2567 case OP_REFI:
2568 caseless = op == OP_REFI;
2569 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2570 ecode += 1 + IMM2_SIZE;
2571
2572 /* If the reference is unset, there are two possibilities:
2573
2574 (a) In the default, Perl-compatible state, set the length negative;
2575 this ensures that every attempt at a match fails. We can't just fail
2576 here, because of the possibility of quantifiers with zero minima.
2577
2578 (b) If the JavaScript compatibility flag is set, set the length to zero
2579 so that the back reference matches an empty string.
2580
2581 Otherwise, set the length to the length of what was matched by the
2582 referenced subpattern. */
2583
2584 if (offset >= offset_top || md->offset_vector[offset] < 0)
2585 length = (md->jscript_compat)? 0 : -1;
2586 else
2587 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2588
2589 /* Set up for repetition, or handle the non-repeated case */
2590
2591 switch (*ecode)
2592 {
2593 case OP_CRSTAR:
2594 case OP_CRMINSTAR:
2595 case OP_CRPLUS:
2596 case OP_CRMINPLUS:
2597 case OP_CRQUERY:
2598 case OP_CRMINQUERY:
2599 c = *ecode++ - OP_CRSTAR;
2600 minimize = (c & 1) != 0;
2601 min = rep_min[c]; /* Pick up values from tables; */
2602 max = rep_max[c]; /* zero for max => infinity */
2603 if (max == 0) max = INT_MAX;
2604 break;
2605
2606 case OP_CRRANGE:
2607 case OP_CRMINRANGE:
2608 minimize = (*ecode == OP_CRMINRANGE);
2609 min = GET2(ecode, 1);
2610 max = GET2(ecode, 1 + IMM2_SIZE);
2611 if (max == 0) max = INT_MAX;
2612 ecode += 1 + 2 * IMM2_SIZE;
2613 break;
2614
2615 default: /* No repeat follows */
2616 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2617 {
2618 CHECK_PARTIAL();
2619 MRRETURN(MATCH_NOMATCH);
2620 }
2621 eptr += length;
2622 continue; /* With the main loop */
2623 }
2624
2625 /* Handle repeated back references. If the length of the reference is
2626 zero, just continue with the main loop. */
2627
2628 if (length == 0) continue;
2629
2630 /* First, ensure the minimum number of matches are present. We get back
2631 the length of the reference string explicitly rather than passing the
2632 address of eptr, so that eptr can be a register variable. */
2633
2634 for (i = 1; i <= min; i++)
2635 {
2636 int slength;
2637 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2638 {
2639 CHECK_PARTIAL();
2640 MRRETURN(MATCH_NOMATCH);
2641 }
2642 eptr += slength;
2643 }
2644
2645 /* If min = max, continue at the same level without recursion.
2646 They are not both allowed to be zero. */
2647
2648 if (min == max) continue;
2649
2650 /* If minimizing, keep trying and advancing the pointer */
2651
2652 if (minimize)
2653 {
2654 for (fi = min;; fi++)
2655 {
2656 int slength;
2657 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2659 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2660 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2661 {
2662 CHECK_PARTIAL();
2663 MRRETURN(MATCH_NOMATCH);
2664 }
2665 eptr += slength;
2666 }
2667 /* Control never gets here */
2668 }
2669
2670 /* If maximizing, find the longest string and work backwards */
2671
2672 else
2673 {
2674 pp = eptr;
2675 for (i = min; i < max; i++)
2676 {
2677 int slength;
2678 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2679 {
2680 CHECK_PARTIAL();
2681 break;
2682 }
2683 eptr += slength;
2684 }
2685 while (eptr >= pp)
2686 {
2687 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2689 eptr -= length;
2690 }
2691 MRRETURN(MATCH_NOMATCH);
2692 }
2693 /* Control never gets here */
2694
2695 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2696 used when all the characters in the class have values in the range 0-255,
2697 and either the matching is caseful, or the characters are in the range
2698 0-127 when UTF-8 processing is enabled. The only difference between
2699 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2700 encountered.
2701
2702 First, look past the end of the item to see if there is repeat information
2703 following. Then obey similar code to character type repeats - written out
2704 again for speed. */
2705
2706 case OP_NCLASS:
2707 case OP_CLASS:
2708 {
2709 /* The data variable is saved across frames, so the byte map needs to
2710 be stored there. */
2711 #define BYTE_MAP ((pcre_uint8 *)data)
2712 data = ecode + 1; /* Save for matching */
2713 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2714
2715 switch (*ecode)
2716 {
2717 case OP_CRSTAR:
2718 case OP_CRMINSTAR:
2719 case OP_CRPLUS:
2720 case OP_CRMINPLUS:
2721 case OP_CRQUERY:
2722 case OP_CRMINQUERY:
2723 c = *ecode++ - OP_CRSTAR;
2724 minimize = (c & 1) != 0;
2725 min = rep_min[c]; /* Pick up values from tables; */
2726 max = rep_max[c]; /* zero for max => infinity */
2727 if (max == 0) max = INT_MAX;
2728 break;
2729
2730 case OP_CRRANGE:
2731 case OP_CRMINRANGE:
2732 minimize = (*ecode == OP_CRMINRANGE);
2733 min = GET2(ecode, 1);
2734 max = GET2(ecode, 1 + IMM2_SIZE);
2735 if (max == 0) max = INT_MAX;
2736 ecode += 1 + 2 * IMM2_SIZE;
2737 break;
2738
2739 default: /* No repeat follows */
2740 min = max = 1;
2741 break;
2742 }
2743
2744 /* First, ensure the minimum number of matches are present. */
2745
2746 #ifdef SUPPORT_UTF
2747 if (utf)
2748 {
2749 for (i = 1; i <= min; i++)
2750 {
2751 if (eptr >= md->end_subject)
2752 {
2753 SCHECK_PARTIAL();
2754 MRRETURN(MATCH_NOMATCH);
2755 }
2756 GETCHARINC(c, eptr);
2757 if (c > 255)
2758 {
2759 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2760 }
2761 else
2762 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2763 }
2764 }
2765 else
2766 #endif
2767 /* Not UTF mode */
2768 {
2769 for (i = 1; i <= min; i++)
2770 {
2771 if (eptr >= md->end_subject)
2772 {
2773 SCHECK_PARTIAL();
2774 MRRETURN(MATCH_NOMATCH);
2775 }
2776 c = *eptr++;
2777 #ifndef COMPILE_PCRE8
2778 if (c > 255)
2779 {
2780 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2781 }
2782 else
2783 #endif
2784 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2785 }
2786 }
2787
2788 /* If max == min we can continue with the main loop without the
2789 need to recurse. */
2790
2791 if (min == max) continue;
2792
2793 /* If minimizing, keep testing the rest of the expression and advancing
2794 the pointer while it matches the class. */
2795
2796 if (minimize)
2797 {
2798 #ifdef SUPPORT_UTF
2799 if (utf)
2800 {
2801 for (fi = min;; fi++)
2802 {
2803 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2805 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2806 if (eptr >= md->end_subject)
2807 {
2808 SCHECK_PARTIAL();
2809 MRRETURN(MATCH_NOMATCH);
2810 }
2811 GETCHARINC(c, eptr);
2812 if (c > 255)
2813 {
2814 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2815 }
2816 else
2817 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2818 }
2819 }
2820 else
2821 #endif
2822 /* Not UTF mode */
2823 {
2824 for (fi = min;; fi++)
2825 {
2826 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2827 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2828 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2829 if (eptr >= md->end_subject)
2830 {
2831 SCHECK_PARTIAL();
2832 MRRETURN(MATCH_NOMATCH);
2833 }
2834 c = *eptr++;
2835 #ifndef COMPILE_PCRE8
2836 if (c > 255)
2837 {
2838 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2839 }
2840 else
2841 #endif
2842 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2843 }
2844 }
2845 /* Control never gets here */
2846 }
2847
2848 /* If maximizing, find the longest possible run, then work backwards. */
2849
2850 else
2851 {
2852 pp = eptr;
2853
2854 #ifdef SUPPORT_UTF
2855 if (utf)
2856 {
2857 for (i = min; i < max; i++)
2858 {
2859 int len = 1;
2860 if (eptr >= md->end_subject)
2861 {
2862 SCHECK_PARTIAL();
2863 break;
2864 }
2865 GETCHARLEN(c, eptr, len);
2866 if (c > 255)
2867 {
2868 if (op == OP_CLASS) break;
2869 }
2870 else
2871 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2872 eptr += len;
2873 }
2874 for (;;)
2875 {
2876 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2878 if (eptr-- == pp) break; /* Stop if tried at original pos */
2879 BACKCHAR(eptr);
2880 }
2881 }
2882 else
2883 #endif
2884 /* Not UTF mode */
2885 {
2886 for (i = min; i < max; i++)
2887 {
2888 if (eptr >= md->end_subject)
2889 {
2890 SCHECK_PARTIAL();
2891 break;
2892 }
2893 c = *eptr;
2894 #ifndef COMPILE_PCRE8
2895 if (c > 255)
2896 {
2897 if (op == OP_CLASS) break;
2898 }
2899 else
2900 #endif
2901 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2902 eptr++;
2903 }
2904 while (eptr >= pp)
2905 {
2906 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2907 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2908 eptr--;
2909 }
2910 }
2911
2912 MRRETURN(MATCH_NOMATCH);
2913 }
2914 #undef BYTE_MAP
2915 }
2916 /* Control never gets here */
2917
2918
2919 /* Match an extended character class. This opcode is encountered only
2920 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2921 mode, because Unicode properties are supported in non-UTF-8 mode. */
2922
2923 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2924 case OP_XCLASS:
2925 {
2926 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2927 ecode += GET(ecode, 1); /* Advance past the item */
2928
2929 switch (*ecode)
2930 {
2931 case OP_CRSTAR:
2932 case OP_CRMINSTAR:
2933 case OP_CRPLUS:
2934 case OP_CRMINPLUS:
2935 case OP_CRQUERY:
2936 case OP_CRMINQUERY:
2937 c = *ecode++ - OP_CRSTAR;
2938 minimize = (c & 1) != 0;
2939 min = rep_min[c]; /* Pick up values from tables; */
2940 max = rep_max[c]; /* zero for max => infinity */
2941 if (max == 0) max = INT_MAX;
2942 break;
2943
2944 case OP_CRRANGE:
2945 case OP_CRMINRANGE:
2946 minimize = (*ecode == OP_CRMINRANGE);
2947 min = GET2(ecode, 1);
2948 max = GET2(ecode, 1 + IMM2_SIZE);
2949 if (max == 0) max = INT_MAX;
2950 ecode += 1 + 2 * IMM2_SIZE;
2951 break;
2952
2953 default: /* No repeat follows */
2954 min = max = 1;
2955 break;
2956 }
2957
2958 /* First, ensure the minimum number of matches are present. */
2959
2960 for (i = 1; i <= min; i++)
2961 {
2962 if (eptr >= md->end_subject)
2963 {
2964 SCHECK_PARTIAL();
2965 MRRETURN(MATCH_NOMATCH);
2966 }
2967 GETCHARINCTEST(c, eptr);
2968 if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH);
2969 }
2970
2971 /* If max == min we can continue with the main loop without the
2972 need to recurse. */
2973
2974 if (min == max) continue;
2975
2976 /* If minimizing, keep testing the rest of the expression and advancing
2977 the pointer while it matches the class. */
2978
2979 if (minimize)
2980 {
2981 for (fi = min;; fi++)
2982 {
2983 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2985 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2986 if (eptr >= md->end_subject)
2987 {
2988 SCHECK_PARTIAL();
2989 MRRETURN(MATCH_NOMATCH);
2990 }
2991 GETCHARINCTEST(c, eptr);
2992 if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH);
2993 }
2994 /* Control never gets here */
2995 }
2996
2997 /* If maximizing, find the longest possible run, then work backwards. */
2998
2999 else
3000 {
3001 pp = eptr;
3002 for (i = min; i < max; i++)
3003 {
3004 int len = 1;
3005 if (eptr >= md->end_subject)
3006 {
3007 SCHECK_PARTIAL();
3008 break;
3009 }
3010 #ifdef SUPPORT_UTF
3011 GETCHARLENTEST(c, eptr, len);
3012 #else
3013 c = *eptr;
3014 #endif
3015 if (!PRIV(xclass)(c, data)) break;
3016 eptr += len;
3017 }
3018 for(;;)
3019 {
3020 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3022 if (eptr-- == pp) break; /* Stop if tried at original pos */
3023 #ifdef SUPPORT_UTF
3024 if (utf) BACKCHAR(eptr);
3025 #endif
3026 }
3027 MRRETURN(MATCH_NOMATCH);
3028 }
3029
3030 /* Control never gets here */
3031 }
3032 #endif /* End of XCLASS */
3033
3034 /* Match a single character, casefully */
3035
3036 case OP_CHAR:
3037 #ifdef SUPPORT_UTF8
3038 if (utf)
3039 {
3040 length = 1;
3041 ecode++;
3042 GETCHARLEN(fc, ecode, length);
3043 if (length > md->end_subject - eptr)
3044 {
3045 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3046 MRRETURN(MATCH_NOMATCH);
3047 }
3048 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
3049 }
3050 else
3051 #endif
3052 /* Not UTF mode */
3053 {
3054 if (md->end_subject - eptr < 1)
3055 {
3056 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3057 MRRETURN(MATCH_NOMATCH);
3058 }
3059 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
3060 ecode += 2;
3061 }
3062 break;
3063
3064 /* Match a single character, caselessly */
3065
3066 case OP_CHARI:
3067 #ifdef SUPPORT_UTF8
3068 if (utf)
3069 {
3070 length = 1;
3071 ecode++;
3072 GETCHARLEN(fc, ecode, length);
3073
3074 if (length > md->end_subject - eptr)
3075 {
3076 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3077 MRRETURN(MATCH_NOMATCH);
3078 }
3079
3080 /* If the pattern character's value is < 128, we have only one byte, and
3081 can use the fast lookup table. */
3082
3083 if (fc < 128)
3084 {
3085 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3086 }
3087
3088 /* Otherwise we must pick up the subject character */
3089
3090 else
3091 {
3092 unsigned int dc;
3093 GETCHARINC(dc, eptr);
3094 ecode += length;
3095
3096 /* If we have Unicode property support, we can use it to test the other
3097 case of the character, if there is one. */
3098
3099 if (fc != dc)
3100 {
3101 #ifdef SUPPORT_UCP
3102 if (dc != UCD_OTHERCASE(fc))
3103 #endif
3104 MRRETURN(MATCH_NOMATCH);
3105 }
3106 }
3107 }
3108 else
3109 #endif /* SUPPORT_UTF8 */
3110
3111 /* Not UTF mode */
3112 {
3113 if (md->end_subject - eptr < 1)
3114 {
3115 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3116 MRRETURN(MATCH_NOMATCH);
3117 }
3118 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3119 ecode += 2;
3120 }
3121 break;
3122
3123 /* Match a single character repeatedly. */
3124
3125 case OP_EXACT:
3126 case OP_EXACTI:
3127 min = max = GET2(ecode, 1);
3128 ecode += 1 + IMM2_SIZE;
3129 goto REPEATCHAR;
3130
3131 case OP_POSUPTO:
3132 case OP_POSUPTOI:
3133 possessive = TRUE;
3134 /* Fall through */
3135
3136 case OP_UPTO:
3137 case OP_UPTOI:
3138 case OP_MINUPTO:
3139 case OP_MINUPTOI:
3140 min = 0;
3141 max = GET2(ecode, 1);
3142 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3143 ecode += 1 + IMM2_SIZE;
3144 goto REPEATCHAR;
3145
3146 case OP_POSSTAR:
3147 case OP_POSSTARI:
3148 possessive = TRUE;
3149 min = 0;
3150 max = INT_MAX;
3151 ecode++;
3152 goto REPEATCHAR;
3153
3154 case OP_POSPLUS:
3155 case OP_POSPLUSI:
3156 possessive = TRUE;
3157 min = 1;
3158 max = INT_MAX;
3159 ecode++;
3160 goto REPEATCHAR;
3161
3162 case OP_POSQUERY:
3163 case OP_POSQUERYI:
3164 possessive = TRUE;
3165 min = 0;
3166 max = 1;
3167 ecode++;
3168 goto REPEATCHAR;
3169
3170 case OP_STAR:
3171 case OP_STARI:
3172 case OP_MINSTAR:
3173 case OP_MINSTARI:
3174 case OP_PLUS:
3175 case OP_PLUSI:
3176 case OP_MINPLUS:
3177 case OP_MINPLUSI:
3178 case OP_QUERY:
3179 case OP_QUERYI:
3180 case OP_MINQUERY:
3181 case OP_MINQUERYI:
3182 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3183 minimize = (c & 1) != 0;
3184 min = rep_min[c]; /* Pick up values from tables; */
3185 max = rep_max[c]; /* zero for max => infinity */
3186 if (max == 0) max = INT_MAX;
3187
3188 /* Common code for all repeated single-character matches. */
3189
3190 REPEATCHAR:
3191 #ifdef SUPPORT_UTF8
3192 if (utf)
3193 {
3194 length = 1;
3195 charptr = ecode;
3196 GETCHARLEN(fc, ecode, length);
3197 ecode += length;
3198
3199 /* Handle multibyte character matching specially here. There is
3200 support for caseless matching if UCP support is present. */
3201
3202 if (length > 1)
3203 {
3204 #ifdef SUPPORT_UCP
3205 unsigned int othercase;
3206 if (op >= OP_STARI && /* Caseless */
3207 (othercase = UCD_OTHERCASE(fc)) != fc)
3208 oclength = PRIV(ord2utf)(othercase, occhars);
3209 else oclength = 0;
3210 #endif /* SUPPORT_UCP */
3211
3212 for (i = 1; i <= min; i++)
3213 {
3214 if (eptr <= md->end_subject - length &&
3215 memcmp(eptr, charptr, length) == 0) eptr += length;
3216 #ifdef SUPPORT_UCP
3217 else if (oclength > 0 &&
3218 eptr <= md->end_subject - oclength &&
3219 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3220 #endif /* SUPPORT_UCP */
3221 else
3222 {
3223 CHECK_PARTIAL();
3224 MRRETURN(MATCH_NOMATCH);
3225 }
3226 }
3227
3228 if (min == max) continue;
3229
3230 if (minimize)
3231 {
3232 for (fi = min;; fi++)
3233 {
3234 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3235 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3236 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3237 if (eptr <= md->end_subject - length &&
3238 memcmp(eptr, charptr, length) == 0) eptr += length;
3239 #ifdef SUPPORT_UCP
3240 else if (oclength > 0 &&
3241 eptr <= md->end_subject - oclength &&
3242 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3243 #endif /* SUPPORT_UCP */
3244 else
3245 {
3246 CHECK_PARTIAL();
3247 MRRETURN(MATCH_NOMATCH);
3248 }
3249 }
3250 /* Control never gets here */
3251 }
3252
3253 else /* Maximize */
3254 {
3255 pp = eptr;
3256 for (i = min; i < max; i++)
3257 {
3258 if (eptr <= md->end_subject - length &&
3259 memcmp(eptr, charptr, length) == 0) eptr += length;
3260 #ifdef SUPPORT_UCP
3261 else if (oclength > 0 &&
3262 eptr <= md->end_subject - oclength &&
3263 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3264 #endif /* SUPPORT_UCP */
3265 else
3266 {
3267 CHECK_PARTIAL();
3268 break;
3269 }
3270 }
3271
3272 if (possessive) continue;
3273
3274 for(;;)
3275 {
3276 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3278 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3279 #ifdef SUPPORT_UCP
3280 eptr--;
3281 BACKCHAR(eptr);
3282 #else /* without SUPPORT_UCP */
3283 eptr -= length;
3284 #endif /* SUPPORT_UCP */
3285 }
3286 }
3287 /* Control never gets here */
3288 }
3289
3290 /* If the length of a UTF-8 character is 1, we fall through here, and
3291 obey the code as for non-UTF-8 characters below, though in this case the
3292 value of fc will always be < 128. */
3293 }
3294 else
3295 #endif /* SUPPORT_UTF8 */
3296
3297 /* When not in UTF-8 mode, load a single-byte character. */
3298
3299 fc = *ecode++;
3300
3301 /* The value of fc at this point is always less than 256, though we may or
3302 may not be in UTF-8 mode. The code is duplicated for the caseless and
3303 caseful cases, for speed, since matching characters is likely to be quite
3304 common. First, ensure the minimum number of matches are present. If min =
3305 max, continue at the same level without recursing. Otherwise, if
3306 minimizing, keep trying the rest of the expression and advancing one
3307 matching character if failing, up to the maximum. Alternatively, if
3308 maximizing, find the maximum number of characters and work backwards. */
3309
3310 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3311 max, eptr));
3312
3313 if (op >= OP_STARI) /* Caseless */
3314 {
3315 fc = md->lcc[fc];
3316 for (i = 1; i <= min; i++)
3317 {
3318 if (eptr >= md->end_subject)
3319 {
3320 SCHECK_PARTIAL();
3321 MRRETURN(MATCH_NOMATCH);
3322 }
3323 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3324 }
3325 if (min == max) continue;
3326 if (minimize)
3327 {
3328 for (fi = min;; fi++)
3329 {
3330 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3331 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3332 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3333 if (eptr >= md->end_subject)
3334 {
3335 SCHECK_PARTIAL();
3336 MRRETURN(MATCH_NOMATCH);
3337 }
3338 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3339 }
3340 /* Control never gets here */
3341 }
3342 else /* Maximize */
3343 {
3344 pp = eptr;
3345 for (i = min; i < max; i++)
3346 {
3347 if (eptr >= md->end_subject)
3348 {
3349 SCHECK_PARTIAL();
3350 break;
3351 }
3352 if (fc != md->lcc[*eptr]) break;
3353 eptr++;
3354 }
3355
3356 if (possessive) continue;
3357
3358 while (eptr >= pp)
3359 {
3360 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3361 eptr--;
3362 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3363 }
3364 MRRETURN(MATCH_NOMATCH);
3365 }
3366 /* Control never gets here */
3367 }
3368
3369 /* Caseful comparisons (includes all multi-byte characters) */
3370
3371 else
3372 {
3373 for (i = 1; i <= min; i++)
3374 {
3375 if (eptr >= md->end_subject)
3376 {
3377 SCHECK_PARTIAL();
3378 MRRETURN(MATCH_NOMATCH);
3379 }
3380 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3381 }
3382
3383 if (min == max) continue;
3384
3385 if (minimize)
3386 {
3387 for (fi = min;; fi++)
3388 {
3389 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3390 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3391 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3392 if (eptr >= md->end_subject)
3393 {
3394 SCHECK_PARTIAL();
3395 MRRETURN(MATCH_NOMATCH);
3396 }
3397 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3398 }
3399 /* Control never gets here */
3400 }
3401 else /* Maximize */
3402 {
3403 pp = eptr;
3404 for (i = min; i < max; i++)
3405 {
3406 if (eptr >= md->end_subject)
3407 {
3408 SCHECK_PARTIAL();
3409 break;
3410 }
3411 if (fc != *eptr) break;
3412 eptr++;
3413 }
3414 if (possessive) continue;
3415
3416 while (eptr >= pp)
3417 {
3418 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3419 eptr--;
3420 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3421 }
3422 MRRETURN(MATCH_NOMATCH);
3423 }
3424 }
3425 /* Control never gets here */
3426
3427 /* Match a negated single one-byte character. The character we are
3428 checking can be multibyte. */
3429
3430 case OP_NOT:
3431 case OP_NOTI:
3432 if (eptr >= md->end_subject)
3433 {
3434 SCHECK_PARTIAL();
3435 MRRETURN(MATCH_NOMATCH);
3436 }
3437 ecode++;
3438 GETCHARINCTEST(c, eptr);
3439 if (op == OP_NOTI) /* The caseless case */
3440 {
3441 #ifdef SUPPORT_UTF8
3442 if (c < 256)
3443 #endif
3444 c = md->lcc[c];
3445 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3446 }
3447 else /* Caseful */
3448 {
3449 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3450 }
3451 break;
3452
3453 /* Match a negated single one-byte character repeatedly. This is almost a
3454 repeat of the code for a repeated single character, but I haven't found a
3455 nice way of commoning these up that doesn't require a test of the
3456 positive/negative option for each character match. Maybe that wouldn't add
3457 very much to the time taken, but character matching *is* what this is all
3458 about... */
3459
3460 case OP_NOTEXACT:
3461 case OP_NOTEXACTI:
3462 min = max = GET2(ecode, 1);
3463 ecode += 1 + IMM2_SIZE;
3464 goto REPEATNOTCHAR;
3465
3466 case OP_NOTUPTO:
3467 case OP_NOTUPTOI:
3468 case OP_NOTMINUPTO:
3469 case OP_NOTMINUPTOI:
3470 min = 0;
3471 max = GET2(ecode, 1);
3472 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3473 ecode += 1 + IMM2_SIZE;
3474 goto REPEATNOTCHAR;
3475
3476 case OP_NOTPOSSTAR:
3477 case OP_NOTPOSSTARI:
3478 possessive = TRUE;
3479 min = 0;
3480 max = INT_MAX;
3481 ecode++;
3482 goto REPEATNOTCHAR;
3483
3484 case OP_NOTPOSPLUS:
3485 case OP_NOTPOSPLUSI:
3486 possessive = TRUE;
3487 min = 1;
3488 max = INT_MAX;
3489 ecode++;
3490 goto REPEATNOTCHAR;
3491
3492 case OP_NOTPOSQUERY:
3493 case OP_NOTPOSQUERYI:
3494 possessive = TRUE;
3495 min = 0;
3496 max = 1;
3497 ecode++;
3498 goto REPEATNOTCHAR;
3499
3500 case OP_NOTPOSUPTO:
3501 case OP_NOTPOSUPTOI:
3502 possessive = TRUE;
3503 min = 0;
3504 max = GET2(ecode, 1);
3505 ecode += 1 + IMM2_SIZE;
3506 goto REPEATNOTCHAR;
3507
3508 case OP_NOTSTAR:
3509 case OP_NOTSTARI:
3510 case OP_NOTMINSTAR:
3511 case OP_NOTMINSTARI:
3512 case OP_NOTPLUS:
3513 case OP_NOTPLUSI:
3514 case OP_NOTMINPLUS:
3515 case OP_NOTMINPLUSI:
3516 case OP_NOTQUERY:
3517 case OP_NOTQUERYI:
3518 case OP_NOTMINQUERY:
3519 case OP_NOTMINQUERYI:
3520 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3521 minimize = (c & 1) != 0;
3522 min = rep_min[c]; /* Pick up values from tables; */
3523 max = rep_max[c]; /* zero for max => infinity */
3524 if (max == 0) max = INT_MAX;
3525
3526 /* Common code for all repeated single-byte matches. */
3527
3528 REPEATNOTCHAR:
3529 fc = *ecode++;
3530
3531 /* The code is duplicated for the caseless and caseful cases, for speed,
3532 since matching characters is likely to be quite common. First, ensure the
3533 minimum number of matches are present. If min = max, continue at the same
3534 level without recursing. Otherwise, if minimizing, keep trying the rest of
3535 the expression and advancing one matching character if failing, up to the
3536 maximum. Alternatively, if maximizing, find the maximum number of
3537 characters and work backwards. */
3538
3539 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3540 max, eptr));
3541
3542 if (op >= OP_NOTSTARI) /* Caseless */
3543 {
3544 fc = md->lcc[fc];
3545
3546 #ifdef SUPPORT_UTF8
3547 if (utf)
3548 {
3549 register unsigned int d;
3550 for (i = 1; i <= min; i++)
3551 {
3552 if (eptr >= md->end_subject)
3553 {
3554 SCHECK_PARTIAL();
3555 MRRETURN(MATCH_NOMATCH);
3556 }
3557 GETCHARINC(d, eptr);
3558 if (d < 256) d = md->lcc[d];
3559 if (fc == d) MRRETURN(MATCH_NOMATCH);
3560 }
3561 }
3562 else
3563 #endif
3564 /* Not UTF mode */
3565 {
3566 for (i = 1; i <= min; i++)
3567 {
3568 if (eptr >= md->end_subject)
3569 {
3570 SCHECK_PARTIAL();
3571 MRRETURN(MATCH_NOMATCH);
3572 }
3573 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3574 }
3575 }
3576
3577 if (min == max) continue;
3578
3579 if (minimize)
3580 {
3581 #ifdef SUPPORT_UTF8
3582 if (utf)
3583 {
3584 register unsigned int d;
3585 for (fi = min;; fi++)
3586 {
3587 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3588 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3589 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3590 if (eptr >= md->end_subject)
3591 {
3592 SCHECK_PARTIAL();
3593 MRRETURN(MATCH_NOMATCH);
3594 }
3595 GETCHARINC(d, eptr);
3596 if (d < 256) d = md->lcc[d];
3597 if (fc == d) MRRETURN(MATCH_NOMATCH);
3598 }
3599 }
3600 else
3601 #endif
3602 /* Not UTF mode */
3603 {
3604 for (fi = min;; fi++)
3605 {
3606 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3608 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3609 if (eptr >= md->end_subject)
3610 {
3611 SCHECK_PARTIAL();
3612 MRRETURN(MATCH_NOMATCH);
3613 }
3614 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3615 }
3616 }
3617 /* Control never gets here */
3618 }
3619
3620 /* Maximize case */
3621
3622 else
3623 {
3624 pp = eptr;
3625
3626 #ifdef SUPPORT_UTF8
3627 if (utf)
3628 {
3629 register unsigned int d;
3630 for (i = min; i < max; i++)
3631 {
3632 int len = 1;
3633 if (eptr >= md->end_subject)
3634 {
3635 SCHECK_PARTIAL();
3636 break;
3637 }
3638 GETCHARLEN(d, eptr, len);
3639 if (d < 256) d = md->lcc[d];
3640 if (fc == d) break;
3641 eptr += len;
3642 }
3643 if (possessive) continue;
3644 for(;;)
3645 {
3646 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3647 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3648 if (eptr-- == pp) break; /* Stop if tried at original pos */
3649 BACKCHAR(eptr);
3650 }
3651 }
3652 else
3653 #endif
3654 /* Not UTF mode */
3655 {
3656 for (i = min; i < max; i++)
3657 {
3658 if (eptr >= md->end_subject)
3659 {
3660 SCHECK_PARTIAL();
3661 break;
3662 }
3663 if (fc == md->lcc[*eptr]) break;
3664 eptr++;
3665 }
3666 if (possessive) continue;
3667 while (eptr >= pp)
3668 {
3669 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3670 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3671 eptr--;
3672 }
3673 }
3674
3675 MRRETURN(MATCH_NOMATCH);
3676 }
3677 /* Control never gets here */
3678 }
3679
3680 /* Caseful comparisons */
3681
3682 else
3683 {
3684 #ifdef SUPPORT_UTF8
3685 if (utf)
3686 {
3687 register unsigned int d;
3688 for (i = 1; i <= min; i++)
3689 {
3690 if (eptr >= md->end_subject)
3691 {
3692 SCHECK_PARTIAL();
3693 MRRETURN(MATCH_NOMATCH);
3694 }
3695 GETCHARINC(d, eptr);
3696 if (fc == d) MRRETURN(MATCH_NOMATCH);
3697 }
3698 }
3699 else
3700 #endif
3701 /* Not UTF mode */
3702 {
3703 for (i = 1; i <= min; i++)
3704 {
3705 if (eptr >= md->end_subject)
3706 {
3707 SCHECK_PARTIAL();
3708 MRRETURN(MATCH_NOMATCH);
3709 }
3710 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3711 }
3712 }
3713
3714 if (min == max) continue;
3715
3716 if (minimize)
3717 {
3718 #ifdef SUPPORT_UTF8
3719 if (utf)
3720 {
3721 register unsigned int d;
3722 for (fi = min;; fi++)
3723 {
3724 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3726 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3727 if (eptr >= md->end_subject)
3728 {
3729 SCHECK_PARTIAL();
3730 MRRETURN(MATCH_NOMATCH);
3731 }
3732 GETCHARINC(d, eptr);
3733 if (fc == d) MRRETURN(MATCH_NOMATCH);
3734 }
3735 }
3736 else
3737 #endif
3738 /* Not UTF mode */
3739 {
3740 for (fi = min;; fi++)
3741 {
3742 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3743 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3744 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3745 if (eptr >= md->end_subject)
3746 {
3747 SCHECK_PARTIAL();
3748 MRRETURN(MATCH_NOMATCH);
3749 }
3750 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3751 }
3752 }
3753 /* Control never gets here */
3754 }
3755
3756 /* Maximize case */
3757
3758 else
3759 {
3760 pp = eptr;
3761
3762 #ifdef SUPPORT_UTF8
3763 if (utf)
3764 {
3765 register unsigned int d;
3766 for (i = min; i < max; i++)
3767 {
3768 int len = 1;
3769 if (eptr >= md->end_subject)
3770 {
3771 SCHECK_PARTIAL();
3772 break;
3773 }
3774 GETCHARLEN(d, eptr, len);
3775 if (fc == d) break;
3776 eptr += len;
3777 }
3778 if (possessive) continue;
3779 for(;;)
3780 {
3781 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3783 if (eptr-- == pp) break; /* Stop if tried at original pos */
3784 BACKCHAR(eptr);
3785 }
3786 }
3787 else
3788 #endif
3789 /* Not UTF mode */
3790 {
3791 for (i = min; i < max; i++)
3792 {
3793 if (eptr >= md->end_subject)
3794 {
3795 SCHECK_PARTIAL();
3796 break;
3797 }
3798 if (fc == *eptr) break;
3799 eptr++;
3800 }
3801 if (possessive) continue;
3802 while (eptr >= pp)
3803 {
3804 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3805 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3806 eptr--;
3807 }
3808 }
3809
3810 MRRETURN(MATCH_NOMATCH);
3811 }
3812 }
3813 /* Control never gets here */
3814
3815 /* Match a single character type repeatedly; several different opcodes
3816 share code. This is very similar to the code for single characters, but we
3817 repeat it in the interests of efficiency. */
3818
3819 case OP_TYPEEXACT:
3820 min = max = GET2(ecode, 1);
3821 minimize = TRUE;
3822 ecode += 1 + IMM2_SIZE;
3823 goto REPEATTYPE;
3824
3825 case OP_TYPEUPTO:
3826 case OP_TYPEMINUPTO:
3827 min = 0;
3828 max = GET2(ecode, 1);
3829 minimize = *ecode == OP_TYPEMINUPTO;
3830 ecode += 1 + IMM2_SIZE;
3831 goto REPEATTYPE;
3832
3833 case OP_TYPEPOSSTAR:
3834 possessive = TRUE;
3835 min = 0;
3836 max = INT_MAX;
3837 ecode++;
3838 goto REPEATTYPE;
3839
3840 case OP_TYPEPOSPLUS:
3841 possessive = TRUE;
3842 min = 1;
3843 max = INT_MAX;
3844 ecode++;
3845 goto REPEATTYPE;
3846
3847 case OP_TYPEPOSQUERY:
3848 possessive = TRUE;
3849 min = 0;
3850 max = 1;
3851 ecode++;
3852 goto REPEATTYPE;
3853
3854 case OP_TYPEPOSUPTO:
3855 possessive = TRUE;
3856 min = 0;
3857 max = GET2(ecode, 1);
3858 ecode += 1 + IMM2_SIZE;
3859 goto REPEATTYPE;
3860
3861 case OP_TYPESTAR:
3862 case OP_TYPEMINSTAR:
3863 case OP_TYPEPLUS:
3864 case OP_TYPEMINPLUS:
3865 case OP_TYPEQUERY:
3866 case OP_TYPEMINQUERY:
3867 c = *ecode++ - OP_TYPESTAR;
3868 minimize = (c & 1) != 0;
3869 min = rep_min[c]; /* Pick up values from tables; */
3870 max = rep_max[c]; /* zero for max => infinity */
3871 if (max == 0) max = INT_MAX;
3872
3873 /* Common code for all repeated single character type matches. Note that
3874 in UTF-8 mode, '.' matches a character of any length, but for the other
3875 character types, the valid characters are all one-byte long. */
3876
3877 REPEATTYPE:
3878 ctype = *ecode++; /* Code for the character type */
3879
3880 #ifdef SUPPORT_UCP
3881 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3882 {
3883 prop_fail_result = ctype == OP_NOTPROP;
3884 prop_type = *ecode++;
3885 prop_value = *ecode++;
3886 }
3887 else prop_type = -1;
3888 #endif
3889
3890 /* First, ensure the minimum number of matches are present. Use inline
3891 code for maximizing the speed, and do the type test once at the start
3892 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3893 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3894 and single-bytes. */
3895
3896 if (min > 0)
3897 {
3898 #ifdef SUPPORT_UCP
3899 if (prop_type >= 0)
3900 {
3901 switch(prop_type)
3902 {
3903 case PT_ANY:
3904 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3905 for (i = 1; i <= min; i++)
3906 {
3907 if (eptr >= md->end_subject)
3908 {
3909 SCHECK_PARTIAL();
3910 MRRETURN(MATCH_NOMATCH);
3911 }
3912 GETCHARINCTEST(c, eptr);
3913 }
3914 break;
3915
3916 case PT_LAMP:
3917 for (i = 1; i <= min; i++)
3918 {
3919 int chartype;
3920 if (eptr >= md->end_subject)
3921 {
3922 SCHECK_PARTIAL();
3923 MRRETURN(MATCH_NOMATCH);
3924 }
3925 GETCHARINCTEST(c, eptr);
3926 chartype = UCD_CHARTYPE(c);
3927 if ((chartype == ucp_Lu ||
3928 chartype == ucp_Ll ||
3929 chartype == ucp_Lt) == prop_fail_result)
3930 MRRETURN(MATCH_NOMATCH);
3931 }
3932 break;
3933
3934 case PT_GC:
3935 for (i = 1; i <= min; i++)
3936 {
3937 if (eptr >= md->end_subject)
3938 {
3939 SCHECK_PARTIAL();
3940 MRRETURN(MATCH_NOMATCH);
3941 }
3942 GETCHARINCTEST(c, eptr);
3943 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3944 MRRETURN(MATCH_NOMATCH);
3945 }
3946 break;
3947
3948 case PT_PC:
3949 for (i = 1; i <= min; i++)
3950 {
3951 if (eptr >= md->end_subject)
3952 {
3953 SCHECK_PARTIAL();
3954 MRRETURN(MATCH_NOMATCH);
3955 }
3956 GETCHARINCTEST(c, eptr);
3957 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3958 MRRETURN(MATCH_NOMATCH);
3959 }
3960 break;
3961
3962 case PT_SC:
3963 for (i = 1; i <= min; i++)
3964 {
3965 if (eptr >= md->end_subject)
3966 {
3967 SCHECK_PARTIAL();
3968 MRRETURN(MATCH_NOMATCH);
3969 }
3970 GETCHARINCTEST(c, eptr);
3971 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3972 MRRETURN(MATCH_NOMATCH);
3973 }
3974 break;
3975
3976 case PT_ALNUM:
3977 for (i = 1; i <= min; i++)
3978 {
3979 int category;
3980 if (eptr >= md->end_subject)
3981 {
3982 SCHECK_PARTIAL();
3983 MRRETURN(MATCH_NOMATCH);
3984 }
3985 GETCHARINCTEST(c, eptr);
3986 category = UCD_CATEGORY(c);
3987 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3988 MRRETURN(MATCH_NOMATCH);
3989 }
3990 break;
3991
3992 case PT_SPACE: /* Perl space */
3993 for (i = 1; i <= min; i++)
3994 {
3995 if (eptr >= md->end_subject)
3996 {
3997 SCHECK_PARTIAL();
3998 MRRETURN(MATCH_NOMATCH);
3999 }
4000 GETCHARINCTEST(c, eptr);
4001 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4002 c == CHAR_FF || c == CHAR_CR)
4003 == prop_fail_result)
4004 MRRETURN(MATCH_NOMATCH);
4005 }
4006 break;
4007
4008 case PT_PXSPACE: /* POSIX space */
4009 for (i = 1; i <= min; i++)
4010 {
4011 if (eptr >= md->end_subject)
4012 {
4013 SCHECK_PARTIAL();
4014 MRRETURN(MATCH_NOMATCH);
4015 }
4016 GETCHARINCTEST(c, eptr);
4017 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4018 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4019 == prop_fail_result)
4020 MRRETURN(MATCH_NOMATCH);
4021 }
4022 break;
4023
4024 case PT_WORD:
4025 for (i = 1; i <= min; i++)
4026 {
4027 int category;
4028 if (eptr >= md->end_subject)
4029 {
4030 SCHECK_PARTIAL();
4031 MRRETURN(MATCH_NOMATCH);
4032 }
4033 GETCHARINCTEST(c, eptr);
4034 category = UCD_CATEGORY(c);
4035 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4036 == prop_fail_result)
4037 MRRETURN(MATCH_NOMATCH);
4038 }
4039 break;
4040
4041 /* This should not occur */
4042
4043 default:
4044 RRETURN(PCRE_ERROR_INTERNAL);
4045 }
4046 }
4047
4048 /* Match extended Unicode sequences. We will get here only if the
4049 support is in the binary; otherwise a compile-time error occurs. */
4050
4051 else if (ctype == OP_EXTUNI)
4052 {
4053 for (i = 1; i <= min; i++)
4054 {
4055 if (eptr >= md->end_subject)
4056 {
4057 SCHECK_PARTIAL();
4058 MRRETURN(MATCH_NOMATCH);
4059 }
4060 GETCHARINCTEST(c, eptr);
4061 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4062 while (eptr < md->end_subject)
4063 {
4064 int len = 1;
4065 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4066 if (UCD_CATEGORY(c) != ucp_M) break;
4067 eptr += len;
4068 }
4069 }
4070 }
4071
4072 else
4073 #endif /* SUPPORT_UCP */
4074
4075 /* Handle all other cases when the coding is UTF-8 */
4076
4077 #ifdef SUPPORT_UTF8
4078 if (utf) switch(ctype)
4079 {
4080 case OP_ANY:
4081 for (i = 1; i <= min; i++)
4082 {
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 MRRETURN(MATCH_NOMATCH);
4087 }
4088 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4089 eptr++;
4090 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4091 }
4092 break;
4093
4094 case OP_ALLANY:
4095 for (i = 1; i <= min; i++)
4096 {
4097 if (eptr >= md->end_subject)
4098 {
4099 SCHECK_PARTIAL();
4100 MRRETURN(MATCH_NOMATCH);
4101 }
4102 eptr++;
4103 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4104 }
4105 break;
4106
4107 case OP_ANYBYTE:
4108 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4109 eptr += min;
4110 break;
4111
4112 case OP_ANYNL:
4113 for (i = 1; i <= min; i++)
4114 {
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 MRRETURN(MATCH_NOMATCH);
4119 }
4120 GETCHARINC(c, eptr);
4121 switch(c)
4122 {
4123 default: MRRETURN(MATCH_NOMATCH);
4124
4125 case 0x000d:
4126 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4127 break;
4128
4129 case 0x000a:
4130 break;
4131
4132 case 0x000b:
4133 case 0x000c:
4134 case 0x0085:
4135 case 0x2028:
4136 case 0x2029:
4137 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4138 break;
4139 }
4140 }
4141 break;
4142
4143 case OP_NOT_HSPACE:
4144 for (i = 1; i <= min; i++)
4145 {
4146 if (eptr >= md->end_subject)
4147 {
4148 SCHECK_PARTIAL();
4149 MRRETURN(MATCH_NOMATCH);
4150 }
4151 GETCHARINC(c, eptr);
4152 switch(c)
4153 {
4154 default: break;
4155 case 0x09: /* HT */
4156 case 0x20: /* SPACE */
4157 case 0xa0: /* NBSP */
4158 case 0x1680: /* OGHAM SPACE MARK */
4159 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4160 case 0x2000: /* EN QUAD */
4161 case 0x2001: /* EM QUAD */
4162 case 0x2002: /* EN SPACE */
4163 case 0x2003: /* EM SPACE */
4164 case 0x2004: /* THREE-PER-EM SPACE */
4165 case 0x2005: /* FOUR-PER-EM SPACE */
4166 case 0x2006: /* SIX-PER-EM SPACE */
4167 case 0x2007: /* FIGURE SPACE */
4168 case 0x2008: /* PUNCTUATION SPACE */
4169 case 0x2009: /* THIN SPACE */
4170 case 0x200A: /* HAIR SPACE */
4171 case 0x202f: /* NARROW NO-BREAK SPACE */
4172 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4173 case 0x3000: /* IDEOGRAPHIC SPACE */
4174 MRRETURN(MATCH_NOMATCH);
4175 }
4176 }
4177 break;
4178
4179 case OP_HSPACE:
4180 for (i = 1; i <= min; i++)
4181 {
4182 if (eptr >= md->end_subject)
4183 {
4184 SCHECK_PARTIAL();
4185 MRRETURN(MATCH_NOMATCH);
4186 }
4187 GETCHARINC(c, eptr);
4188 switch(c)
4189 {
4190 default: MRRETURN(MATCH_NOMATCH);
4191 case 0x09: /* HT */
4192 case 0x20: /* SPACE */
4193 case 0xa0: /* NBSP */
4194 case 0x1680: /* OGHAM SPACE MARK */
4195 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4196 case 0x2000: /* EN QUAD */
4197 case 0x2001: /* EM QUAD */
4198 case 0x2002: /* EN SPACE */
4199 case 0x2003: /* EM SPACE */
4200 case 0x2004: /* THREE-PER-EM SPACE */
4201 case 0x2005: /* FOUR-PER-EM SPACE */
4202 case 0x2006: /* SIX-PER-EM SPACE */
4203 case 0x2007: /* FIGURE SPACE */
4204 case 0x2008: /* PUNCTUATION SPACE */
4205 case 0x2009: /* THIN SPACE */
4206 case 0x200A: /* HAIR SPACE */
4207 case 0x202f: /* NARROW NO-BREAK SPACE */
4208 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4209 case 0x3000: /* IDEOGRAPHIC SPACE */
4210 break;
4211 }
4212 }
4213 break;
4214
4215 case OP_NOT_VSPACE:
4216 for (i = 1; i <= min; i++)
4217 {
4218 if (eptr >= md->end_subject)
4219 {
4220 SCHECK_PARTIAL();
4221 MRRETURN(MATCH_NOMATCH);
4222 }
4223 GETCHARINC(c, eptr);
4224 switch(c)
4225 {
4226 default: break;
4227 case 0x0a: /* LF */
4228 case 0x0b: /* VT */
4229 case 0x0c: /* FF */
4230 case 0x0d: /* CR */
4231 case 0x85: /* NEL */
4232 case 0x2028: /* LINE SEPARATOR */
4233 case 0x2029: /* PARAGRAPH SEPARATOR */
4234 MRRETURN(MATCH_NOMATCH);
4235 }
4236 }
4237 break;
4238
4239 case OP_VSPACE:
4240 for (i = 1; i <= min; i++)
4241 {
4242 if (eptr >= md->end_subject)
4243 {
4244 SCHECK_PARTIAL();
4245 MRRETURN(MATCH_NOMATCH);
4246 }
4247 GETCHARINC(c, eptr);
4248 switch(c)
4249 {
4250 default: MRRETURN(MATCH_NOMATCH);
4251 case 0x0a: /* LF */
4252 case 0x0b: /* VT */
4253 case 0x0c: /* FF */
4254 case 0x0d: /* CR */
4255 case 0x85: /* NEL */
4256 case 0x2028: /* LINE SEPARATOR */
4257 case 0x2029: /* PARAGRAPH SEPARATOR */
4258 break;
4259 }
4260 }
4261 break;
4262
4263 case OP_NOT_DIGIT:
4264 for (i = 1; i <= min; i++)
4265 {
4266 if (eptr >= md->end_subject)
4267 {
4268 SCHECK_PARTIAL();
4269 MRRETURN(MATCH_NOMATCH);
4270 }
4271 GETCHARINC(c, eptr);
4272 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4273 MRRETURN(MATCH_NOMATCH);
4274 }
4275 break;
4276
4277 case OP_DIGIT:
4278 for (i = 1; i <= min; i++)
4279 {
4280 if (eptr >= md->end_subject)
4281 {
4282 SCHECK_PARTIAL();
4283 MRRETURN(MATCH_NOMATCH);
4284 }
4285 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4286 MRRETURN(MATCH_NOMATCH);
4287 /* No need to skip more bytes - we know it's a 1-byte character */
4288 }
4289 break;
4290
4291 case OP_NOT_WHITESPACE:
4292 for (i = 1; i <= min; i++)
4293 {
4294 if (eptr >= md->end_subject)
4295 {
4296 SCHECK_PARTIAL();
4297 MRRETURN(MATCH_NOMATCH);
4298 }
4299 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4300 MRRETURN(MATCH_NOMATCH);
4301 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4302 }
4303 break;
4304
4305 case OP_WHITESPACE:
4306 for (i = 1; i <= min; i++)
4307 {
4308 if (eptr >= md->end_subject)
4309 {
4310 SCHECK_PARTIAL();
4311 MRRETURN(MATCH_NOMATCH);
4312 }
4313 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4314 MRRETURN(MATCH_NOMATCH);
4315 /* No need to skip more bytes - we know it's a 1-byte character */
4316 }
4317 break;
4318
4319 case OP_NOT_WORDCHAR:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 MRRETURN(MATCH_NOMATCH);
4326 }
4327 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4328 MRRETURN(MATCH_NOMATCH);
4329 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4330 }
4331 break;
4332
4333 case OP_WORDCHAR:
4334 for (i = 1; i <= min; i++)
4335 {
4336 if (eptr >= md->end_subject)
4337 {
4338 SCHECK_PARTIAL();
4339 MRRETURN(MATCH_NOMATCH);
4340 }
4341 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4342 MRRETURN(MATCH_NOMATCH);
4343 /* No need to skip more bytes - we know it's a 1-byte character */
4344 }
4345 break;
4346
4347 default:
4348 RRETURN(PCRE_ERROR_INTERNAL);
4349 } /* End switch(ctype) */
4350
4351 else
4352 #endif /* SUPPORT_UTF8 */
4353
4354 /* Code for the non-UTF-8 case for minimum matching of operators other
4355 than OP_PROP and OP_NOTPROP. */
4356
4357 switch(ctype)
4358 {
4359 case OP_ANY:
4360 for (i = 1; i <= min; i++)
4361 {
4362 if (eptr >= md->end_subject)
4363 {
4364 SCHECK_PARTIAL();
4365 MRRETURN(MATCH_NOMATCH);
4366 }
4367 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4368 eptr++;
4369 }
4370 break;
4371
4372 case OP_ALLANY:
4373 if (eptr > md->end_subject - min)
4374 {
4375 SCHECK_PARTIAL();
4376 MRRETURN(MATCH_NOMATCH);
4377 }
4378 eptr += min;
4379 break;
4380
4381 case OP_ANYBYTE:
4382 if (eptr > md->end_subject - min)
4383 {
4384 SCHECK_PARTIAL();
4385 MRRETURN(MATCH_NOMATCH);
4386 }
4387 eptr += min;
4388 break;
4389
4390 case OP_ANYNL:
4391 for (i = 1; i <= min; i++)
4392 {
4393 if (eptr >= md->end_subject)
4394 {
4395 SCHECK_PARTIAL();
4396 MRRETURN(MATCH_NOMATCH);
4397 }
4398 switch(*eptr++)
4399 {
4400 default: MRRETURN(MATCH_NOMATCH);
4401
4402 case 0x000d:
4403 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4404 break;
4405
4406 case 0x000a:
4407 break;
4408
4409 case 0x000b:
4410 case 0x000c:
4411 case 0x0085:
4412 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4413 break;
4414 }
4415 }
4416 break;
4417
4418 case OP_NOT_HSPACE:
4419 for (i = 1; i <= min; i++)
4420 {
4421 if (eptr >= md->end_subject)
4422 {
4423 SCHECK_PARTIAL();
4424 MRRETURN(MATCH_NOMATCH);
4425 }
4426 switch(*eptr++)
4427 {
4428 default: break;
4429 case 0x09: /* HT */
4430 case 0x20: /* SPACE */
4431 case 0xa0: /* NBSP */
4432 MRRETURN(MATCH_NOMATCH);
4433 }
4434 }
4435 break;
4436
4437 case OP_HSPACE:
4438 for (i = 1; i <= min; i++)
4439 {
4440 if (eptr >= md->end_subject)
4441 {
4442 SCHECK_PARTIAL();
4443 MRRETURN(MATCH_NOMATCH);
4444 }
4445 switch(*eptr++)
4446 {
4447 default: MRRETURN(MATCH_NOMATCH);
4448 case 0x09: /* HT */
4449 case 0x20: /* SPACE */
4450 case 0xa0: /* NBSP */
4451 break;
4452 }
4453 }
4454 break;
4455
4456 case OP_NOT_VSPACE:
4457 for (i = 1; i <= min; i++)
4458 {
4459 if (eptr >= md->end_subject)
4460 {
4461 SCHECK_PARTIAL();
4462 MRRETURN(MATCH_NOMATCH);
4463 }
4464 switch(*eptr++)
4465 {
4466 default: break;
4467 case 0x0a: /* LF */
4468 case 0x0b: /* VT */
4469 case 0x0c: /* FF */
4470 case 0x0d: /* CR */
4471 case 0x85: /* NEL */
4472 MRRETURN(MATCH_NOMATCH);
4473 }
4474 }
4475 break;
4476
4477 case OP_VSPACE:
4478 for (i = 1; i <= min; i++)
4479 {
4480 if (eptr >= md->end_subject)
4481 {
4482 SCHECK_PARTIAL();
4483 MRRETURN(MATCH_NOMATCH);
4484 }
4485 switch(*eptr++)
4486 {
4487 default: MRRETURN(MATCH_NOMATCH);
4488 case 0x0a: /* LF */
4489 case 0x0b: /* VT */
4490 case 0x0c: /* FF */
4491 case 0x0d: /* CR */
4492 case 0x85: /* NEL */
4493 break;
4494 }
4495 }
4496 break;
4497
4498 case OP_NOT_DIGIT:
4499 for (i = 1; i <= min; i++)
4500 {
4501 if (eptr >= md->end_subject)
4502 {
4503 SCHECK_PARTIAL();
4504 MRRETURN(MATCH_NOMATCH);
4505 }
4506 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4507 }
4508 break;
4509
4510 case OP_DIGIT:
4511 for (i = 1; i <= min; i++)
4512 {
4513 if (eptr >= md->end_subject)
4514 {
4515 SCHECK_PARTIAL();
4516 MRRETURN(MATCH_NOMATCH);
4517 }
4518 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4519 }
4520 break;
4521
4522 case OP_NOT_WHITESPACE:
4523 for (i = 1; i <= min; i++)
4524 {
4525 if (eptr >= md->end_subject)
4526 {
4527 SCHECK_PARTIAL();
4528 MRRETURN(MATCH_NOMATCH);
4529 }
4530 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4531 }
4532 break;
4533
4534 case OP_WHITESPACE:
4535 for (i = 1; i <= min; i++)
4536 {
4537 if (eptr >= md->end_subject)
4538 {
4539 SCHECK_PARTIAL();
4540 MRRETURN(MATCH_NOMATCH);
4541 }
4542 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4543 }
4544 break;
4545
4546 case OP_NOT_WORDCHAR:
4547 for (i = 1; i <= min; i++)
4548 {
4549 if (eptr >= md->end_subject)
4550 {
4551 SCHECK_PARTIAL();
4552 MRRETURN(MATCH_NOMATCH);
4553 }
4554 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4555 MRRETURN(MATCH_NOMATCH);
4556 }
4557 break;
4558
4559 case OP_WORDCHAR:
4560 for (i = 1; i <= min; i++)
4561 {
4562 if (eptr >= md->end_subject)
4563 {
4564 SCHECK_PARTIAL();
4565 MRRETURN(MATCH_NOMATCH);
4566 }
4567 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4568 MRRETURN(MATCH_NOMATCH);
4569 }
4570 break;
4571
4572 default:
4573 RRETURN(PCRE_ERROR_INTERNAL);
4574 }
4575 }
4576
4577 /* If min = max, continue at the same level without recursing */
4578
4579 if (min == max) continue;
4580
4581 /* If minimizing, we have to test the rest of the pattern before each
4582 subsequent match. Again, separate the UTF-8 case for speed, and also
4583 separate the UCP cases. */
4584
4585 if (minimize)
4586 {
4587 #ifdef SUPPORT_UCP
4588 if (prop_type >= 0)
4589 {
4590 switch(prop_type)
4591 {
4592 case PT_ANY:
4593 for (fi = min;; fi++)
4594 {
4595 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4596 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4597 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4598 if (eptr >= md->end_subject)
4599 {
4600 SCHECK_PARTIAL();
4601 MRRETURN(MATCH_NOMATCH);
4602 }
4603 GETCHARINCTEST(c, eptr);
4604 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4605 }
4606 /* Control never gets here */
4607
4608 case PT_LAMP:
4609 for (fi = min;; fi++)
4610 {
4611 int chartype;
4612 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4613 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4614 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4615 if (eptr >= md->end_subject)
4616 {
4617 SCHECK_PARTIAL();
4618 MRRETURN(MATCH_NOMATCH);
4619 }
4620 GETCHARINCTEST(c, eptr);
4621 chartype = UCD_CHARTYPE(c);
4622 if ((chartype == ucp_Lu ||
4623 chartype == ucp_Ll ||
4624 chartype == ucp_Lt) == prop_fail_result)
4625 MRRETURN(MATCH_NOMATCH);
4626 }
4627 /* Control never gets here */
4628
4629 case PT_GC:
4630 for (fi = min;; fi++)
4631 {
4632 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4634 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4635 if (eptr >= md->end_subject)
4636 {
4637 SCHECK_PARTIAL();
4638 MRRETURN(MATCH_NOMATCH);
4639 }
4640 GETCHARINCTEST(c, eptr);
4641 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4642 MRRETURN(MATCH_NOMATCH);
4643 }
4644 /* Control never gets here */
4645
4646 case PT_PC:
4647 for (fi = min;; fi++)
4648 {
4649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4651 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4652 if (eptr >= md->end_subject)
4653 {
4654 SCHECK_PARTIAL();
4655 MRRETURN(MATCH_NOMATCH);
4656 }
4657 GETCHARINCTEST(c, eptr);
4658 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4659 MRRETURN(MATCH_NOMATCH);
4660 }
4661 /* Control never gets here */
4662
4663 case PT_SC:
4664 for (fi = min;; fi++)
4665 {
4666 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4668 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4669 if (eptr >= md->end_subject)
4670 {
4671 SCHECK_PARTIAL();
4672 MRRETURN(MATCH_NOMATCH);
4673 }
4674 GETCHARINCTEST(c, eptr);
4675 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4676 MRRETURN(MATCH_NOMATCH);
4677 }
4678 /* Control never gets here */
4679
4680 case PT_ALNUM:
4681 for (fi = min;; fi++)
4682 {
4683 int category;
4684 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4685 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4686 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4687 if (eptr >= md->end_subject)
4688 {
4689 SCHECK_PARTIAL();
4690 MRRETURN(MATCH_NOMATCH);
4691 }
4692 GETCHARINCTEST(c, eptr);
4693 category = UCD_CATEGORY(c);
4694 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4695 MRRETURN(MATCH_NOMATCH);
4696 }
4697 /* Control never gets here */
4698
4699 case PT_SPACE: /* Perl space */
4700 for (fi = min;; fi++)
4701 {
4702 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4703 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4704 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4705 if (eptr >= md->end_subject)
4706 {
4707 SCHECK_PARTIAL();
4708 MRRETURN(MATCH_NOMATCH);
4709 }
4710 GETCHARINCTEST(c, eptr);
4711 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4712 c == CHAR_FF || c == CHAR_CR)
4713 == prop_fail_result)
4714 MRRETURN(MATCH_NOMATCH);
4715 }
4716 /* Control never gets here */
4717
4718 case PT_PXSPACE: /* POSIX space */
4719 for (fi = min;; fi++)
4720 {
4721 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4723 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4724 if (eptr >= md->end_subject)
4725 {
4726 SCHECK_PARTIAL();
4727 MRRETURN(MATCH_NOMATCH);
4728 }
4729 GETCHARINCTEST(c, eptr);
4730 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4731 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4732 == prop_fail_result)
4733 MRRETURN(MATCH_NOMATCH);
4734 }
4735 /* Control never gets here */
4736
4737 case PT_WORD:
4738 for (fi = min;; fi++)
4739 {
4740 int category;
4741 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4742 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4743 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4744 if (eptr >= md->end_subject)
4745 {
4746 SCHECK_PARTIAL();
4747 MRRETURN(MATCH_NOMATCH);
4748 }
4749 GETCHARINCTEST(c, eptr);
4750 category = UCD_CATEGORY(c);
4751 if ((category == ucp_L ||
4752 category == ucp_N ||
4753 c == CHAR_UNDERSCORE)
4754 == prop_fail_result)
4755 MRRETURN(MATCH_NOMATCH);
4756 }
4757 /* Control never gets here */
4758
4759 /* This should never occur */
4760
4761 default:
4762 RRETURN(PCRE_ERROR_INTERNAL);
4763 }
4764 }
4765
4766 /* Match extended Unicode sequences. We will get here only if the
4767 support is in the binary; otherwise a compile-time error occurs. */
4768
4769 else if (ctype == OP_EXTUNI)
4770 {
4771 for (fi = min;; fi++)
4772 {
4773 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4775 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4776 if (eptr >= md->end_subject)
4777 {
4778 SCHECK_PARTIAL();
4779 MRRETURN(MATCH_NOMATCH);
4780 }
4781 GETCHARINCTEST(c, eptr);
4782 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4783 while (eptr < md->end_subject)
4784 {
4785 int len = 1;
4786 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4787 if (UCD_CATEGORY(c) != ucp_M) break;
4788 eptr += len;
4789 }
4790 }
4791 }
4792 else
4793 #endif /* SUPPORT_UCP */
4794
4795 #ifdef SUPPORT_UTF8
4796 if (utf)
4797 {
4798 for (fi = min;; fi++)
4799 {
4800 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4802 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4803 if (eptr >= md->end_subject)
4804 {
4805 SCHECK_PARTIAL();
4806 MRRETURN(MATCH_NOMATCH);
4807 }
4808 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4809 MRRETURN(MATCH_NOMATCH);
4810 GETCHARINC(c, eptr);
4811 switch(ctype)
4812 {
4813 case OP_ANY: /* This is the non-NL case */
4814 case OP_ALLANY:
4815 case OP_ANYBYTE:
4816 break;
4817
4818 case OP_ANYNL:
4819 switch(c)
4820 {
4821 default: MRRETURN(MATCH_NOMATCH);
4822 case 0x000d:
4823 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4824 break;
4825 case 0x000a:
4826 break;
4827
4828 case 0x000b:
4829 case 0x000c:
4830 case 0x0085:
4831 case 0x2028:
4832 case 0x2029:
4833 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4834 break;
4835 }
4836 break;
4837
4838 case OP_NOT_HSPACE:
4839 switch(c)
4840 {
4841 default: break;
4842 case 0x09: /* HT */
4843 case 0x20: /* SPACE */
4844 case 0xa0: /* NBSP */
4845 case 0x1680: /* OGHAM SPACE MARK */
4846 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4847 case 0x2000: /* EN QUAD */
4848 case 0x2001: /* EM QUAD */
4849 case 0x2002: /* EN SPACE */
4850 case 0x2003: /* EM SPACE */
4851 case 0x2004: /* THREE-PER-EM SPACE */
4852 case 0x2005: /* FOUR-PER-EM SPACE */
4853 case 0x2006: /* SIX-PER-EM SPACE */
4854 case 0x2007: /* FIGURE SPACE */
4855 case 0x2008: /* PUNCTUATION SPACE */
4856 case 0x2009: /* THIN SPACE */
4857 case 0x200A: /* HAIR SPACE */
4858 case 0x202f: /* NARROW NO-BREAK SPACE */
4859 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4860 case 0x3000: /* IDEOGRAPHIC SPACE */
4861 MRRETURN(MATCH_NOMATCH);
4862 }
4863 break;
4864
4865 case OP_HSPACE:
4866 switch(c)
4867 {
4868 default: MRRETURN(MATCH_NOMATCH);
4869 case 0x09: /* HT */
4870 case 0x20: /* SPACE */
4871 case 0xa0: /* NBSP */
4872 case 0x1680: /* OGHAM SPACE MARK */
4873 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4874 case 0x2000: /* EN QUAD */
4875 case 0x2001: /* EM QUAD */
4876 case 0x2002: /* EN SPACE */
4877 case 0x2003: /* EM SPACE */
4878 case 0x2004: /* THREE-PER-EM SPACE */
4879 case 0x2005: /* FOUR-PER-EM SPACE */
4880 case 0x2006: /* SIX-PER-EM SPACE */
4881 case 0x2007: /* FIGURE SPACE */
4882 case 0x2008: /* PUNCTUATION SPACE */
4883 case 0x2009: /* THIN SPACE */
4884 case 0x200A: /* HAIR SPACE */
4885 case 0x202f: /* NARROW NO-BREAK SPACE */
4886 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4887 case 0x3000: /* IDEOGRAPHIC SPACE */
4888 break;
4889 }
4890 break;
4891
4892 case OP_NOT_VSPACE:
4893 switch(c)
4894 {
4895 default: break;
4896 case 0x0a: /* LF */
4897 case 0x0b: /* VT */
4898 case 0x0c: /* FF */
4899 case 0x0d: /* CR */
4900 case 0x85: /* NEL */
4901 case 0x2028: /* LINE SEPARATOR */
4902 case 0x2029: /* PARAGRAPH SEPARATOR */
4903 MRRETURN(MATCH_NOMATCH);
4904 }
4905 break;
4906
4907 case OP_VSPACE:
4908 switch(c)
4909 {
4910 default: MRRETURN(MATCH_NOMATCH);
4911 case 0x0a: /* LF */
4912 case 0x0b: /* VT */
4913 case 0x0c: /* FF */
4914 case 0x0d: /* CR */
4915 case 0x85: /* NEL */
4916 case 0x2028: /* LINE SEPARATOR */
4917 case 0x2029: /* PARAGRAPH SEPARATOR */
4918 break;
4919 }
4920 break;
4921
4922 case OP_NOT_DIGIT:
4923 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4924 MRRETURN(MATCH_NOMATCH);
4925 break;
4926
4927 case OP_DIGIT:
4928 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4929 MRRETURN(MATCH_NOMATCH);
4930 break;
4931
4932 case OP_NOT_WHITESPACE:
4933 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4934 MRRETURN(MATCH_NOMATCH);
4935 break;
4936
4937 case OP_WHITESPACE:
4938 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4939 MRRETURN(MATCH_NOMATCH);
4940 break;
4941
4942 case OP_NOT_WORDCHAR:
4943 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4944 MRRETURN(MATCH_NOMATCH);
4945 break;
4946
4947 case OP_WORDCHAR:
4948 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4949 MRRETURN(MATCH_NOMATCH);
4950 break;
4951
4952 default:
4953 RRETURN(PCRE_ERROR_INTERNAL);
4954 }
4955 }
4956 }
4957 else
4958 #endif
4959 /* Not UTF mode */
4960 {
4961 for (fi = min;; fi++)
4962 {
4963 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4965 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4966 if (eptr >= md->end_subject)
4967 {
4968 SCHECK_PARTIAL();
4969 MRRETURN(MATCH_NOMATCH);
4970 }
4971 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4972 MRRETURN(MATCH_NOMATCH);
4973 c = *eptr++;
4974 switch(ctype)
4975 {
4976 case OP_ANY: /* This is the non-NL case */
4977 case OP_ALLANY:
4978 case OP_ANYBYTE:
4979 break;
4980
4981 case OP_ANYNL:
4982 switch(c)
4983 {
4984 default: MRRETURN(MATCH_NOMATCH);
4985 case 0x000d:
4986 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4987 break;
4988
4989 case 0x000a:
4990 break;
4991
4992 case 0x000b:
4993 case 0x000c:
4994 case 0x0085:
4995 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4996 break;
4997 }
4998 break;
4999
5000 case OP_NOT_HSPACE:
5001 switch(c)
5002 {
5003 default: break;
5004 case 0x09: /* HT */
5005 case 0x20: /* SPACE */
5006 case 0xa0: /* NBSP */
5007 MRRETURN(MATCH_NOMATCH);
5008 }
5009 break;
5010
5011 case OP_HSPACE:
5012 switch(c)
5013 {
5014 default: MRRETURN(MATCH_NOMATCH);
5015 case 0x09: /* HT */
5016 case 0x20: /* SPACE */
5017 case 0xa0: /* NBSP */
5018 break;
5019 }
5020 break;
5021
5022 case OP_NOT_VSPACE:
5023 switch(c)
5024 {
5025 default: break;
5026 case 0x0a: /* LF */
5027 case 0x0b: /* VT */
5028 case 0x0c: /* FF */
5029 case 0x0d: /* CR */
5030 case 0x85: /* NEL */
5031 MRRETURN(MATCH_NOMATCH);
5032 }
5033 break;
5034
5035 case OP_VSPACE:
5036 switch(c)
5037 {
5038 default: MRRETURN(MATCH_NOMATCH);
5039 case 0x0a: /* LF */
5040 case 0x0b: /* VT */
5041 case 0x0c: /* FF */
5042 case 0x0d: /* CR */
5043 case 0x85: /* NEL */
5044 break;
5045 }
5046 break;
5047
5048 case OP_NOT_DIGIT:
5049 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
5050 break;
5051
5052 case OP_DIGIT:
5053 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
5054 break;
5055
5056 case OP_NOT_WHITESPACE:
5057 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
5058 break;
5059
5060 case OP_WHITESPACE:
5061 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
5062 break;
5063
5064 case OP_NOT_WORDCHAR:
5065 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
5066 break;
5067
5068 case OP_WORDCHAR:
5069 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
5070 break;
5071
5072 default:
5073 RRETURN(PCRE_ERROR_INTERNAL);
5074 }
5075 }
5076 }
5077 /* Control never gets here */
5078 }
5079
5080 /* If maximizing, it is worth using inline code for speed, doing the type
5081 test once at the start (i.e. keep it out of the loop). Again, keep the
5082 UTF-8 and UCP stuff separate. */
5083
5084 else
5085 {
5086 pp = eptr; /* Remember where we started */
5087
5088 #ifdef SUPPORT_UCP
5089 if (prop_type >= 0)
5090 {
5091 switch(prop_type)
5092 {
5093 case PT_ANY:
5094 for (i = min; i < max; i++)
5095 {
5096 int len = 1;
5097 if (eptr >= md->end_subject)
5098 {
5099 SCHECK_PARTIAL();
5100 break;
5101 }
5102 GETCHARLENTEST(c, eptr, len);
5103 if (prop_fail_result) break;
5104 eptr+= len;
5105 }
5106 break;
5107
5108 case PT_LAMP:
5109 for (i = min; i < max; i++)
5110 {
5111 int chartype;
5112 int len = 1;
5113 if (eptr >= md->end_subject)
5114 {
5115 SCHECK_PARTIAL();
5116 break;
5117 }
5118 GETCHARLENTEST(c, eptr, len);
5119 chartype = UCD_CHARTYPE(c);
5120 if ((chartype == ucp_Lu ||
5121 chartype == ucp_Ll ||
5122 chartype == ucp_Lt) == prop_fail_result)
5123 break;
5124 eptr+= len;
5125 }
5126 break;
5127
5128 case PT_GC:
5129 for (i = min; i < max; i++)
5130 {
5131 int len = 1;
5132 if (eptr >= md->end_subject)
5133 {
5134 SCHECK_PARTIAL();
5135 break;
5136 }
5137 GETCHARLENTEST(c, eptr, len);
5138 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5139 eptr+= len;
5140 }
5141 break;
5142
5143 case PT_PC:
5144 for (i = min; i < max; i++)
5145 {
5146 int len = 1;
5147 if (eptr >= md->end_subject)
5148 {
5149 SCHECK_PARTIAL();
5150 break;
5151 }
5152 GETCHARLENTEST(c, eptr, len);
5153 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5154 eptr+= len;
5155 }
5156 break;
5157
5158 case PT_SC:
5159 for (i = min; i < max; i++)
5160 {
5161 int len = 1;
5162 if (eptr >= md->end_subject)
5163 {
5164 SCHECK_PARTIAL();
5165 break;
5166 }
5167 GETCHARLENTEST(c, eptr, len);
5168 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5169 eptr+= len;
5170 }
5171 break;
5172
5173 case PT_ALNUM:
5174 for (i = min; i < max; i++)
5175 {
5176 int category;
5177 int len = 1;
5178 if (eptr >= md->end_subject)
5179 {
5180 SCHECK_PARTIAL();
5181 break;
5182 }
5183 GETCHARLENTEST(c, eptr, len);
5184 category = UCD_CATEGORY(c);
5185 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5186 break;
5187 eptr+= len;
5188 }
5189 break;
5190
5191 case PT_SPACE: /* Perl space */
5192 for (i = min; i < max; i++)
5193 {
5194 int len = 1;
5195 if (eptr >= md->end_subject)
5196 {
5197 SCHECK_PARTIAL();
5198 break;
5199 }
5200 GETCHARLENTEST(c, eptr, len);
5201 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5202 c == CHAR_FF || c == CHAR_CR)
5203 == prop_fail_result)
5204 break;
5205 eptr+= len;
5206 }
5207 break;
5208
5209 case PT_PXSPACE: /* POSIX space */
5210 for (i = min; i < max; i++)
5211 {
5212 int len = 1;
5213 if (eptr >= md->end_subject)
5214 {
5215 SCHECK_PARTIAL();
5216 break;
5217 }
5218 GETCHARLENTEST(c, eptr, len);
5219 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5220 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5221 == prop_fail_result)
5222 break;
5223 eptr+= len;
5224 }
5225 break;
5226
5227 case PT_WORD:
5228 for (i = min; i < max; i++)
5229 {
5230 int category;
5231 int len = 1;
5232 if (eptr >= md->end_subject)
5233 {
5234 SCHECK_PARTIAL();
5235 break;
5236 }
5237 GETCHARLENTEST(c, eptr, len);
5238 category = UCD_CATEGORY(c);
5239 if ((category == ucp_L || category == ucp_N ||
5240 c == CHAR_UNDERSCORE) == prop_fail_result)
5241 break;
5242 eptr+= len;
5243 }
5244 break;
5245
5246 default:
5247 RRETURN(PCRE_ERROR_INTERNAL);
5248 }
5249
5250 /* eptr is now past the end of the maximum run */
5251
5252 if (possessive) continue;
5253 for(;;)
5254 {
5255 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5256 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5257 if (eptr-- == pp) break; /* Stop if tried at original pos */
5258 if (utf) BACKCHAR(eptr);
5259 }
5260 }
5261
5262 /* Match extended Unicode sequences. We will get here only if the
5263 support is in the binary; otherwise a compile-time error occurs. */
5264
5265 else if (ctype == OP_EXTUNI)
5266 {
5267 for (i = min; i < max; i++)
5268 {
5269 int len = 1;
5270 if (eptr >= md->end_subject)
5271 {
5272 SCHECK_PARTIAL();
5273 break;
5274 }
5275 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5276 if (UCD_CATEGORY(c) == ucp_M) break;
5277 eptr += len;
5278 while (eptr < md->end_subject)
5279 {
5280 len = 1;
5281 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5282 if (UCD_CATEGORY(c) != ucp_M) break;
5283 eptr += len;
5284 }
5285 }
5286
5287 /* eptr is now past the end of the maximum run */
5288
5289 if (possessive) continue;
5290
5291 for(;;)
5292 {
5293 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5294 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5295 if (eptr-- == pp) break; /* Stop if tried at original pos */
5296 for (;;) /* Move back over one extended */
5297 {
5298 if (!utf) c = *eptr; else
5299 {
5300 BACKCHAR(eptr);
5301 GETCHAR(c, eptr);
5302 }
5303 if (UCD_CATEGORY(c) != ucp_M) break;
5304 eptr--;
5305 }
5306 }
5307 }
5308
5309 else
5310 #endif /* SUPPORT_UCP */
5311
5312 #ifdef SUPPORT_UTF8
5313 if (utf)
5314 {
5315 switch(ctype)
5316 {
5317 case OP_ANY:
5318 if (max < INT_MAX)
5319 {
5320 for (i = min; i < max; i++)
5321 {
5322 if (eptr >= md->end_subject)
5323 {
5324 SCHECK_PARTIAL();
5325 break;
5326 }
5327 if (IS_NEWLINE(eptr)) break;
5328 eptr++;
5329 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5330 }
5331 }
5332
5333 /* Handle unlimited UTF-8 repeat */
5334
5335 else
5336 {
5337 for (i = min; i < max; i++)
5338 {
5339 if (eptr >= md->end_subject)
5340 {
5341 SCHECK_PARTIAL();
5342 break;
5343 }
5344 if (IS_NEWLINE(eptr)) break;
5345 eptr++;
5346 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5347 }
5348 }
5349 break;
5350
5351 case OP_ALLANY:
5352 if (max < INT_MAX)
5353 {
5354 for (i = min; i < max; i++)
5355 {
5356 if (eptr >= md->end_subject)
5357 {
5358 SCHECK_PARTIAL();
5359 break;
5360 }
5361 eptr++;
5362 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5363 }
5364 }
5365 else
5366 {
5367 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5368 SCHECK_PARTIAL();
5369 }
5370 break;
5371
5372 /* The byte case is the same as non-UTF8 */
5373
5374 case OP_ANYBYTE:
5375 c = max - min;
5376 if (c > (unsigned int)(md->end_subject - eptr))
5377 {
5378 eptr = md->end_subject;
5379 SCHECK_PARTIAL();
5380 }
5381 else eptr += c;
5382 break;
5383
5384 case OP_ANYNL:
5385 for (i = min; i < max; i++)
5386 {
5387 int len = 1;
5388 if (eptr >= md->end_subject)
5389 {
5390 SCHECK_PARTIAL();
5391 break;
5392 }
5393 GETCHARLEN(c, eptr, len);
5394 if (c == 0x000d)
5395 {
5396 if (++eptr >= md->end_subject) break;
5397 if (*eptr == 0x000a) eptr++;
5398 }
5399 else
5400 {
5401 if (c != 0x000a &&
5402 (md->bsr_anycrlf ||
5403 (c != 0x000b && c != 0x000c &&
5404 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5405 break;
5406 eptr += len;
5407 }
5408 }
5409 break;
5410
5411 case OP_NOT_HSPACE:
5412 case OP_HSPACE:
5413 for (i = min; i < max; i++)
5414 {
5415 BOOL gotspace;
5416 int len = 1;
5417 if (eptr >= md->end_subject)
5418 {
5419 SCHECK_PARTIAL();
5420 break;
5421 }
5422 GETCHARLEN(c, eptr, len);
5423 switch(c)
5424 {
5425 default: gotspace = FALSE; break;
5426 case 0x09: /* HT */
5427 case 0x20: /* SPACE */
5428 case 0xa0: /* NBSP */
5429 case 0x1680: /* OGHAM SPACE MARK */
5430 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5431 case 0x2000: /* EN QUAD */
5432 case 0x2001: /* EM QUAD */
5433 case 0x2002: /* EN SPACE */
5434 case 0x2003: /* EM SPACE */
5435 case 0x2004: /* THREE-PER-EM SPACE */
5436 case 0x2005: /* FOUR-PER-EM SPACE */
5437 case 0x2006: /* SIX-PER-EM SPACE */
5438 case 0x2007: /* FIGURE SPACE */
5439 case 0x2008: /* PUNCTUATION SPACE */
5440 case 0x2009: /* THIN SPACE */
5441 case 0x200A: /* HAIR SPACE */
5442 case 0x202f: /* NARROW NO-BREAK SPACE */
5443 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5444 case 0x3000: /* IDEOGRAPHIC SPACE */
5445 gotspace = TRUE;
5446 break;
5447 }
5448 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5449 eptr += len;
5450 }
5451 break;
5452
5453 case OP_NOT_VSPACE:
5454 case OP_VSPACE:
5455 for (i = min; i < max; i++)
5456 {
5457 BOOL gotspace;
5458 int len = 1;
5459 if (eptr >= md->end_subject)
5460 {
5461 SCHECK_PARTIAL();
5462 break;
5463 }
5464 GETCHARLEN(c, eptr, len);
5465 switch(c)
5466 {
5467 default: gotspace = FALSE; break;
5468 case 0x0a: /* LF */
5469 case 0x0b: /* VT */
5470 case 0x0c: /* FF */
5471 case 0x0d: /* CR */
5472 case 0x85: /* NEL */
5473 case 0x2028: /* LINE SEPARATOR */
5474 case 0x2029: /* PARAGRAPH SEPARATOR */
5475 gotspace = TRUE;
5476 break;
5477 }
5478 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5479 eptr += len;
5480 }
5481 break;
5482
5483 case OP_NOT_DIGIT:
5484 for (i = min; i < max; i++)
5485 {
5486 int len = 1;
5487 if (eptr >= md->end_subject)
5488 {
5489 SCHECK_PARTIAL();
5490 break;
5491 }
5492 GETCHARLEN(c, eptr, len);
5493 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5494 eptr+= len;
5495 }
5496 break;
5497
5498 case OP_DIGIT:
5499 for (i = min; i < max; i++)
5500 {
5501 int len = 1;
5502 if (eptr >= md->end_subject)
5503 {
5504 SCHECK_PARTIAL();
5505 break;
5506 }
5507 GETCHARLEN(c, eptr, len);
5508 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5509 eptr+= len;
5510 }
5511 break;
5512
5513 case OP_NOT_WHITESPACE:
5514 for (i = min; i < max; i++)
5515 {
5516 int len = 1;
5517 if (eptr >= md->end_subject)
5518 {
5519 SCHECK_PARTIAL();
5520 break;
5521 }
5522 GETCHARLEN(c, eptr, len);
5523 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5524 eptr+= len;
5525 }
5526 break;
5527
5528 case OP_WHITESPACE:
5529 for (i = min; i < max; i++)
5530 {
5531 int len = 1;
5532 if (eptr >= md->end_subject)
5533 {
5534 SCHECK_PARTIAL();
5535 break;
5536 }
5537 GETCHARLEN(c, eptr, len);
5538 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5539 eptr+= len;
5540 }
5541 break;
5542
5543 case OP_NOT_WORDCHAR:
5544 for (i = min; i < max; i++)
5545 {
5546 int len = 1;
5547 if (eptr >= md->end_subject)
5548 {
5549 SCHECK_PARTIAL();
5550 break;
5551 }
5552 GETCHARLEN(c, eptr, len);
5553 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5554 eptr+= len;
5555 }
5556 break;
5557
5558 case OP_WORDCHAR:
5559 for (i = min; i < max; i++)
5560 {
5561 int len = 1;
5562 if (eptr >= md->end_subject)
5563 {
5564 SCHECK_PARTIAL();
5565 break;
5566 }
5567 GETCHARLEN(c, eptr, len);
5568 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5569 eptr+= len;
5570 }
5571 break;
5572
5573 default:
5574 RRETURN(PCRE_ERROR_INTERNAL);
5575 }
5576
5577 /* eptr is now past the end of the maximum run. If possessive, we are
5578 done (no backing up). Otherwise, match at this position; anything other
5579 than no match is immediately returned. For nomatch, back up one
5580 character, unless we are matching \R and the last thing matched was
5581 \r\n, in which case, back up two bytes. */
5582
5583 if (possessive) continue;
5584 for(;;)
5585 {
5586 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5588 if (eptr-- == pp) break; /* Stop if tried at original pos */
5589 BACKCHAR(eptr);
5590 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5591 eptr[-1] == '\r') eptr--;
5592 }
5593 }
5594 else
5595 #endif /* SUPPORT_UTF8 */
5596 /* Not UTF mode */
5597 {
5598 switch(ctype)
5599 {
5600 case OP_ANY:
5601 for (i = min; i < max; i++)
5602 {
5603 if (eptr >= md->end_subject)
5604 {
5605 SCHECK_PARTIAL();
5606 break;
5607 }
5608 if (IS_NEWLINE(eptr)) break;
5609 eptr++;
5610 }
5611 break;
5612
5613 case OP_ALLANY:
5614 case OP_ANYBYTE:
5615 c = max - min;
5616 if (c > (unsigned int)(md->end_subject - eptr))
5617 {
5618 eptr = md->end_subject;
5619 SCHECK_PARTIAL();
5620 }
5621 else eptr += c;
5622 break;
5623
5624 case OP_ANYNL:
5625 for (i = min; i < max; i++)
5626 {
5627 if (eptr >= md->end_subject)
5628 {
5629 SCHECK_PARTIAL();
5630 break;
5631 }
5632 c = *eptr;
5633 if (c == 0x000d)
5634 {
5635 if (++eptr >= md->end_subject) break;
5636 if (*eptr == 0x000a) eptr++;
5637 }
5638 else
5639 {
5640 if (c != 0x000a &&
5641 (md->bsr_anycrlf ||
5642 (c != 0x000b && c != 0x000c && c != 0x0085)))
5643 break;
5644 eptr++;
5645 }
5646 }
5647 break;
5648
5649 case OP_NOT_HSPACE:
5650 for (i = min; i < max; i++)
5651 {
5652 if (eptr >= md->end_subject)
5653 {
5654 SCHECK_PARTIAL();
5655 break;
5656 }
5657 c = *eptr;
5658 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5659 eptr++;
5660 }
5661 break;
5662
5663 case OP_HSPACE:
5664 for (i = min; i < max; i++)
5665 {
5666 if (eptr >= md->end_subject)
5667 {
5668 SCHECK_PARTIAL();
5669 break;
5670 }
5671 c = *eptr;
5672 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5673 eptr++;
5674 }
5675 break;
5676
5677 case OP_NOT_VSPACE:
5678 for (i = min; i < max; i++)
5679 {
5680 if (eptr >= md->end_subject)
5681 {
5682 SCHECK_PARTIAL();
5683 break;
5684 }
5685 c = *eptr;
5686 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5687 break;
5688 eptr++;
5689 }
5690 break;
5691
5692 case OP_VSPACE:
5693 for (i = min; i < max; i++)
5694 {
5695 if (eptr >= md->end_subject)
5696 {
5697 SCHECK_PARTIAL();
5698 break;
5699 }
5700 c = *eptr;
5701 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5702 break;
5703 eptr++;
5704 }
5705 break;
5706
5707 case OP_NOT_DIGIT:
5708 for (i = min; i < max; i++)
5709 {
5710 if (eptr >= md->end_subject)
5711 {
5712 SCHECK_PARTIAL();
5713 break;
5714 }
5715 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5716 eptr++;
5717 }
5718 break;
5719
5720 case OP_DIGIT:
5721 for (i = min; i < max; i++)
5722 {
5723 if (eptr >= md->end_subject)
5724 {
5725 SCHECK_PARTIAL();
5726 break;
5727 }
5728 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5729 eptr++;
5730 }
5731 break;
5732
5733 case OP_NOT_WHITESPACE:
5734 for (i = min; i < max; i++)
5735 {
5736 if (eptr >= md->end_subject)
5737 {
5738 SCHECK_PARTIAL();
5739 break;
5740 }
5741 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5742 eptr++;
5743 }
5744 break;
5745
5746 case OP_WHITESPACE:
5747 for (i = min; i < max; i++)
5748 {
5749 if (eptr >= md->end_subject)
5750 {
5751 SCHECK_PARTIAL();
5752 break;
5753 }
5754 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5755 eptr++;
5756 }
5757 break;
5758
5759 case OP_NOT_WORDCHAR:
5760 for (i = min; i < max; i++)
5761 {
5762 if (eptr >= md->end_subject)
5763 {
5764 SCHECK_PARTIAL();
5765 break;
5766 }
5767 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5768 eptr++;
5769 }
5770 break;
5771
5772 case OP_WORDCHAR:
5773 for (i = min; i < max; i++)
5774 {
5775 if (eptr >= md->end_subject)
5776 {
5777 SCHECK_PARTIAL();
5778 break;
5779 }
5780 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5781 eptr++;
5782 }
5783 break;
5784
5785 default:
5786 RRETURN(PCRE_ERROR_INTERNAL);
5787 }
5788
5789 /* eptr is now past the end of the maximum run. If possessive, we are
5790 done (no backing up). Otherwise, match at this position; anything other
5791 than no match is immediately returned. For nomatch, back up one
5792 character (byte), unless we are matching \R and the last thing matched
5793 was \r\n, in which case, back up two bytes. */
5794
5795 if (possessive) continue;
5796 while (eptr >= pp)
5797 {
5798 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5800 eptr--;
5801 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5802 eptr[-1] == '\r') eptr--;
5803 }
5804 }
5805
5806 /* Get here if we can't make it match with any permitted repetitions */
5807
5808 MRRETURN(MATCH_NOMATCH);
5809 }
5810 /* Control never gets here */
5811
5812 /* There's been some horrible disaster. Arrival here can only mean there is
5813 something seriously wrong in the code above or the OP_xxx definitions. */
5814
5815 default:
5816 DPRINTF(("Unknown opcode %d\n", *ecode));
5817 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5818 }
5819
5820 /* Do not stick any code in here without much thought; it is assumed
5821 that "continue" in the code above comes out to here to repeat the main
5822 loop. */
5823
5824 } /* End of main loop */
5825 /* Control never reaches here */
5826
5827
5828 /* When compiling to use the heap rather than the stack for recursive calls to
5829 match(), the RRETURN() macro jumps here. The number that is saved in
5830 frame->Xwhere indicates which label we actually want to return to. */
5831
5832 #ifdef NO_RECURSE
5833 #define LBL(val) case val: goto L_RM##val;
5834 HEAP_RETURN:
5835 switch (frame->Xwhere)
5836 {
5837 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5838 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5839 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5840 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5841 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5842 LBL(65) LBL(66)
5843 #ifdef SUPPORT_UTF8
5844 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5845 LBL(32) LBL(34) LBL(42) LBL(46)
5846 #ifdef SUPPORT_UCP
5847 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5848 LBL(59) LBL(60) LBL(61) LBL(62)
5849 #endif /* SUPPORT_UCP */
5850 #endif /* SUPPORT_UTF8 */
5851 default:
5852 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5853 return PCRE_ERROR_INTERNAL;
5854 }
5855 #undef LBL
5856 #endif /* NO_RECURSE */
5857 }
5858
5859
5860 /***************************************************************************
5861 ****************************************************************************
5862 RECURSION IN THE match() FUNCTION
5863
5864 Undefine all the macros that were defined above to handle this. */
5865
/* fc and fi are macros whether or not NO_RECURSE is defined, so they are
always removed. */

#undef fi
#undef fc

/* Every other name below is removed only in the NO_RECURSE build; these are
the macros that were set up earlier in this file to handle recursion in
match(). The order of the #undef lines is not significant. */

#ifdef NO_RECURSE

#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef mstart
#undef offset_top

#undef callpat
#undef charptr
#undef data
#undef next
#undef pp
#undef prev
#undef saved_eptr

#undef new_recursive

#undef condition
#undef cur_is_word
#undef prev_is_word

#undef ctype
#undef length
#undef max
#undef min
#undef number
#undef offset
#undef op
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef stacksave

#undef newptrb

#endif  /* NO_RECURSE */
5909
5910 /***************************************************************************
5911 ***************************************************************************/
5912
5913
5914
5915 /*************************************************
5916 * Execute a Regular Expression *
5917 *************************************************/
5918
5919 /* This function applies a compiled re to a subject string and picks out
5920 portions of the string if it matches. Two elements in the vector are set for
5921 each substring: the offsets to the start and end of the substring.
5922
5923 Arguments:
5924 argument_re points to the compiled expression
5925 extra_data points to extra data or is NULL
5926 subject points to the subject string
5927 length length of subject string (may contain binary zeros)
5928 start_offset where to start in the subject string
5929 options option bits
5930 offsets points to a vector of ints to be filled in with offsets
5931 offsetcount the number of elements in the vector
5932
5933 Returns: > 0 => success; value is the number of elements filled in
5934 = 0 => success, but offsets is not big enough
5935 -1 => failed to match
5936 < -1 => some kind of unexpected problem
5937 */
5938
5939 #ifdef COMPILE_PCRE8
5940 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5941 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5942 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5943 int offsetcount)
5944 #else
5945 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5946 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
5947 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
5948 int offsetcount)
5949 #endif
5950 {
5951 int rc, ocount, arg_offset_max;
5952 int newline;
5953 BOOL using_temporary_offsets = FALSE;
5954 BOOL anchored;
5955 BOOL startline;
5956 BOOL firstline;
5957 BOOL utf;
5958 BOOL has_first_char = FALSE;
5959 BOOL has_req_char = FALSE;
5960 pcre_uchar first_char = 0;
5961 pcre_uchar first_char2 = 0;
5962 pcre_uchar req_char = 0;
5963 pcre_uchar req_char2 = 0;
5964 match_data match_block;
5965 match_data *md = &match_block;
5966 const pcre_uint8 *tables;
5967 const pcre_uint8 *start_bits = NULL;
5968 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
5969 PCRE_PUCHAR end_subject;
5970 PCRE_PUCHAR start_partial = NULL;
5971 PCRE_PUCHAR req_char_ptr = start_match - 1;
5972
5973 pcre_study_data internal_study;
5974 const pcre_study_data *study;
5975
5976 real_pcre internal_re;
5977 const real_pcre *external_re = (const real_pcre *)argument_re;
5978 const real_pcre *re = external_re;
5979
5980 /* Plausibility checks */
5981
5982 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5983 if (re == NULL || subject == NULL ||
5984 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5985 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5986 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5987
5988 /* These two settings are used in the code for checking a UTF-8 string that
5989 follows immediately afterwards. Other values in the md block are used only
5990 during "normal" pcre_exec() processing, not when the JIT support is in use,
5991 so they are set up later. */
5992
5993 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
5994 utf = md->utf = (re->options & PCRE_UTF8) != 0;
5995 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5996 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5997
5998 /* Check a UTF-8 string if required. Pass back the character offset and error
5999 code for an invalid string if a results vector is available. */
6000
6001 #ifdef SUPPORT_UTF8
6002 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6003 {
6004 int erroroffset;
6005 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6006 if (errorcode != 0)
6007 {
6008 if (offsetcount >= 2)
6009 {
6010 offsets[0] = erroroffset;
6011 offsets[1] = errorcode;
6012 }
6013 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6014 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6015 }
6016
6017 /* Check that a start_offset points to the start of a UTF-8 character. */
6018 if (start_offset > 0 && start_offset < length &&
6019 (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
6020 return PCRE_ERROR_BADUTF8_OFFSET;
6021 }
6022 #endif
6023
6024 /* If the pattern was successfully studied with JIT support, run the JIT
6025 executable instead of the rest of this function. Most options must be set at
6026 compile time for the JIT code to be usable. Fallback to the normal code path if
6027 an unsupported flag is set. In particular, JIT does not support partial
6028 matching. */
6029
6030 #ifdef SUPPORT_JIT
6031 if (extra_data != NULL
6032 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6033 && extra_data->executable_jit != NULL
6034 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6035 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6036 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6037 return PRIV(jit_exec)(re, extra_data->executable_jit,
6038 (const pcre_uchar *)subject, length, start_offset, options,
6039 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6040 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6041 #endif
6042
6043 /* Carry on with non-JIT matching. This information is for finding all the
6044 numbers associated with a given name, for condition testing. */
6045
6046 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6047 md->name_count = re->name_count;
6048 md->name_entry_size = re->name_entry_size;
6049
6050 /* Fish out the optional data from the extra_data structure, first setting
6051 the default values. */
6052
6053 study = NULL;
6054 md->match_limit = MATCH_LIMIT;
6055 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6056 md->callout_data = NULL;
6057
6058 /* The table pointer is always in native byte order. */
6059
6060 tables = external_re->tables;
6061
6062 if (extra_data != NULL)
6063 {
6064 register unsigned int flags = extra_data->flags;
6065 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6066 study = (const pcre_study_data *)extra_data->study_data;
6067 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6068 md->match_limit = extra_data->match_limit;
6069 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6070 md->match_limit_recursion = extra_data->match_limit_recursion;
6071 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6072 md->callout_data = extra_data->callout_data;
6073 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6074 }
6075
6076 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6077 is a feature that makes it possible to save compiled regex and re-use them
6078 in other programs later. */
6079
6080 if (tables == NULL) tables = PRIV(default_tables);
6081
6082 /* Check that the first field in the block is the magic number. If it is not,
6083 test for a regex that was compiled on a host of opposite endianness. If this is
6084 the case, flipped values are put in internal_re and internal_study if there was
6085 study data too. */
6086
6087 if (re->magic_number != MAGIC_NUMBER)
6088 {
6089 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
6090 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6091 if (study != NULL) study = &internal_study;
6092 }
6093
6094 /* Set up other data */
6095
6096 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6097 startline = (re->flags & PCRE_STARTLINE) != 0;
6098 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6099
6100 /* The code starts after the real_pcre block and the capture name table. */
6101
6102 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6103 re->name_count * re->name_entry_size;
6104
6105 md->start_subject = (PCRE_PUCHAR)subject;
6106 md->start_offset = start_offset;
6107 md->end_subject = md->start_subject + length;
6108 end_subject = md->end_subject;
6109
6110 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6111 md->use_ucp = (re->options & PCRE_UCP) != 0;
6112 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6113
6114 /* Some options are unpacked into BOOL variables in the hope that testing
6115 them will be faster than individual option bits. */
6116
6117 md->notbol = (options & PCRE_NOTBOL) != 0;
6118 md->noteol = (options & PCRE_NOTEOL) != 0;
6119 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6120 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6121
6122 md->hitend = FALSE;
6123 md->mark = NULL; /* In case never set */
6124
6125 md->recursive = NULL; /* No recursion at top level */
6126 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6127
6128 md->lcc = tables + lcc_offset;
6129 md->ctypes = tables + ctypes_offset;
6130
6131 /* Handle different \R options. */
6132
6133 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6134 {
6135 case 0:
6136 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6137 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6138 else
6139 #ifdef BSR_ANYCRLF
6140 md->bsr_anycrlf = TRUE;
6141 #else
6142 md->bsr_anycrlf = FALSE;
6143 #endif
6144 break;
6145
6146 case PCRE_BSR_ANYCRLF:
6147 md->bsr_anycrlf = TRUE;
6148 break;
6149
6150 case PCRE_BSR_UNICODE:
6151 md->bsr_anycrlf = FALSE;
6152 break;
6153
6154 default: return PCRE_ERROR_BADNEWLINE;
6155 }
6156
6157 /* Handle different types of newline. The three bits give eight cases. If
6158 nothing is set at run time, whatever was used at compile time applies. */
6159
6160 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6161 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6162 {
6163 case 0: newline = NEWLINE; break; /* Compile-time default */
6164 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6165 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6166 case PCRE_NEWLINE_CR+
6167 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6168 case PCRE_NEWLINE_ANY: newline = -1; break;
6169 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6170 default: return PCRE_ERROR_BADNEWLINE;
6171 }
6172
6173 if (newline == -2)
6174 {
6175 md->nltype = NLTYPE_ANYCRLF;
6176 }
6177 else if (newline < 0)
6178 {
6179 md->nltype = NLTYPE_ANY;
6180 }
6181 else
6182 {
6183 md->nltype = NLTYPE_FIXED;
6184 if (newline > 255)
6185 {
6186 md->nllen = 2;
6187 md->nl[0] = (newline >> 8) & 255;
6188 md->nl[1] = newline & 255;
6189 }
6190 else
6191 {
6192 md->nllen = 1;
6193 md->nl[0] = newline;
6194 }
6195 }
6196
6197 /* Partial matching was originally supported only for a restricted set of
6198 regexes; from release 8.00 there are no restrictions, but the bits are still
6199 defined (though never set). So there's no harm in leaving this code. */
6200
6201 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6202 return PCRE_ERROR_BADPARTIAL;
6203
6204 /* If the expression has got more back references than the offsets supplied can
6205 hold, we get a temporary chunk of working store to use during the matching.
6206 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6207 of 3. */
6208
6209 ocount = offsetcount - (offsetcount % 3);
6210 arg_offset_max = (2*ocount)/3;
6211
6212 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6213 {
6214 ocount = re->top_backref * 3 + 3;
6215 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6216 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6217 using_temporary_offsets = TRUE;
6218 DPRINTF(("Got memory to hold back references\n"));
6219 }
6220 else md->offset_vector = offsets;
6221
6222 md->offset_end = ocount;
6223 md->offset_max = (2*ocount)/3;
6224 md->offset_overflow = FALSE;
6225 md->capture_last = -1;
6226
6227 /* Reset the working variable associated with each extraction. These should
6228 never be used unless previously set, but they get saved and restored, and so we
6229 initialize them to avoid reading uninitialized locations. Also, unset the
6230 offsets for the matched string. This is really just for tidiness with callouts,
6231 in case they inspect these fields. */
6232
6233 if (md->offset_vector != NULL)
6234 {
6235 register int *iptr = md->offset_vector + ocount;
6236 register int *iend = iptr - re->top_bracket;
6237 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6238 while (--iptr >= iend) *iptr = -1;
6239 md->offset_vector[0] = md->offset_vector[1] = -1;
6240 }
6241
6242 /* Set up the first character to match, if available. The first_char value is
6243 never set for an anchored regular expression, but the anchoring may be forced
6244 at run time, so we have to test for anchoring. The first char may be unset for
6245 an unanchored pattern, of course. If there's no first char and the pattern was
6246 studied, there may be a bitmap of possible first characters. */
6247
6248 if (!anchored)
6249 {
6250 if ((re->flags & PCRE_FIRSTSET) != 0)
6251 {
6252 has_first_char = TRUE;
6253 first_char = first_char2 = re->first_char;
6254 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6255 first_char2 = TABLE_GET(first_char, tables + fcc_offset, first_char);
6256 }
6257 else
6258 if (!startline && study != NULL &&
6259 (study->flags & PCRE_STUDY_MAPPED) != 0)
6260 start_bits = study->start_bits;
6261 }
6262
6263 /* For anchored or unanchored matches, there may be a "last known required
6264 character" set. */
6265
6266 if ((re->flags & PCRE_REQCHSET) != 0)
6267 {
6268 has_req_char = TRUE;
6269 req_char = req_char2 = re->req_char;
6270 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6271 req_char2 = TABLE_GET(req_char, tables + fcc_offset, req_char);
6272 }
6273
6274
6275 /* ==========================================================================*/
6276
6277 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6278 the loop runs just once. */
6279
6280 for(;;)
6281 {
6282 PCRE_PUCHAR save_end_subject = end_subject;
6283 PCRE_PUCHAR new_start_match;
6284
6285 /* If firstline is TRUE, the start of the match is constrained to the first
6286 line of a multiline string. That is, the match must be before or at the first
6287 newline. Implement this by temporarily adjusting end_subject so that we stop
6288 scanning at a newline. If the match fails at the newline, later code breaks
6289 this loop. */
6290
6291 if (firstline)
6292 {
6293 PCRE_PUCHAR t = start_match;
6294 #ifdef SUPPORT_UTF8
6295 if (utf)
6296 {
6297 while (t < md->end_subject && !IS_NEWLINE(t))
6298 {
6299 t++;
6300 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6301 }
6302 }
6303 else
6304 #endif
6305 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6306 end_subject = t;
6307 }
6308
6309 /* There are some optimizations that avoid running the match if a known
6310 starting point is not found, or if a known later character is not present.
6311 However, there is an option that disables these, for testing and for ensuring
6312 that all callouts do actually occur. The option can be set in the regex by
6313 (*NO_START_OPT) or passed in match-time options. */
6314
6315 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6316 {
6317 /* Advance to a unique first char if there is one. */
6318
6319 if (has_first_char)
6320 {
6321 if (first_char != first_char2)
6322 while (start_match < end_subject &&
6323 *start_match != first_char && *start_match != first_char2)
6324 start_match++;
6325 else
6326 while (start_match < end_subject && *start_match != first_char)
6327 start_match++;
6328 }
6329
6330 /* Or to just after a linebreak for a multiline match */
6331
6332 else if (startline)
6333 {
6334 if (start_match > md->start_subject + start_offset)
6335 {
6336 #ifdef SUPPORT_UTF8
6337 if (utf)
6338 {
6339 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6340 {
6341 start_match++;
6342 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6343 start_match++;
6344 }
6345 }
6346 else
6347 #endif
6348 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6349 start_match++;
6350
6351 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6352 and we are now at a LF, advance the match position by one more character.
6353 */
6354
6355 if (start_match[-1] == CHAR_CR &&
6356 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6357 start_match < end_subject &&
6358 *start_match == CHAR_NL)
6359 start_match++;
6360 }
6361 }
6362
6363 /* Or to a non-unique first byte after study */
6364
6365 else if (start_bits != NULL)
6366 {
6367 while (start_match < end_subject)
6368 {
6369 #ifdef COMPILE_PCRE8
6370 register unsigned int c = *start_match;
6371 #else
6372 register unsigned int c = *start_match & 0xff;
6373 #endif
6374 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6375 {
6376 start_match++;
6377 #ifdef SUPPORT_UTF8
6378 if (utf)
6379 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6380 start_match++;
6381 #endif
6382 }
6383 else break;
6384 }
6385 }
6386 } /* Starting optimizations */
6387
6388 /* Restore fudged end_subject */
6389
6390 end_subject = save_end_subject;
6391
6392 /* The following two optimizations are disabled for partial matching or if
6393 disabling is explicitly requested. */
6394
6395 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6396 {
6397 /* If the pattern was studied, a minimum subject length may be set. This is
6398 a lower bound; no actual string of that length may actually match the
6399 pattern. Although the value is, strictly, in characters, we treat it as
6400 bytes to avoid spending too much time in this optimization. */
6401
6402 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6403 (pcre_uint32)(end_subject - start_match) < study->minlength)
6404 {
6405 rc = MATCH_NOMATCH;
6406 break;
6407 }
6408
6409 /* If req_char is set, we know that that character must appear in the
6410 subject for the match to succeed. If the first character is set, req_char
6411 must be later in the subject; otherwise the test starts at the match point.
6412 This optimization can save a huge amount of backtracking in patterns with
6413 nested unlimited repeats that aren't going to match. Writing separate code
6414 for cased/caseless versions makes it go faster, as does using an
6415 autoincrement and backing off on a match.
6416
6417 HOWEVER: when the subject string is very, very long, searching to its end
6418 can take a long time, and give bad performance on quite ordinary patterns.
6419 This showed up when somebody was matching something like /^\d+C/ on a
6420 32-megabyte string... so we don't do this when the string is sufficiently
6421 long. */
6422
6423 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6424 {
6425 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6426
6427 /* We don't need to repeat the search if we haven't yet reached the
6428 place we found it at last time. */
6429
6430 if (p > req_char_ptr)
6431 {
6432 if (req_char != req_char2)
6433 {
6434 while (p < end_subject)
6435 {
6436 register int pp = *p++;
6437 if (pp == req_char || pp == req_char2) { p--; break; }
6438 }
6439 }
6440 else
6441 {
6442 while (p < end_subject)
6443 {
6444 if (*p++ == req_char) { p--; break; }
6445 }
6446 }
6447
6448 /* If we can't find the required character, break the matching loop,
6449 forcing a match failure. */
6450
6451 if (p >= end_subject)
6452 {
6453 rc = MATCH_NOMATCH;
6454 break;
6455 }
6456
6457 /* If we have found the required character, save the point where we
6458 found it, so that we don't search again next time round the loop if
6459 the start hasn't passed this character yet. */
6460
6461 req_char_ptr = p;
6462 }
6463 }
6464 }
6465
6466 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6467 printf(">>>> Match against: ");
6468 pchars(start_match, end_subject - start_match, TRUE, md);
6469 printf("\n");
6470 #endif
6471
6472 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6473 first starting point for which a partial match was found. */
6474
6475 md->start_match_ptr = start_match;
6476 md->start_used_ptr = start_match;
6477 md->match_call_count = 0;
6478 md->match_function_type = 0;
6479 md->end_offset_top = 0;
6480 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6481 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6482
6483 switch(rc)
6484 {
6485 /* SKIP passes back the next starting point explicitly, but if it is the
6486 same as the match we have just done, treat it as NOMATCH. */
6487
6488 case MATCH_SKIP:
6489 if (md->start_match_ptr != start_match)
6490 {
6491 new_start_match = md->start_match_ptr;
6492 break;
6493 }
6494 /* Fall through */
6495
6496 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6497 the SKIP's arg was not found. We also treat this as NOMATCH. */
6498
6499 case MATCH_SKIP_ARG:
6500 /* Fall through */
6501
6502 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6503 exactly like PRUNE. */
6504
6505 case MATCH_NOMATCH:
6506 case MATCH_PRUNE:
6507 case MATCH_THEN:
6508 new_start_match = start_match + 1;
6509 #ifdef SUPPORT_UTF8
6510 if (utf)
6511 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6512 new_start_match++;
6513 #endif
6514 break;
6515
6516 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6517
6518 case MATCH_COMMIT:
6519 rc = MATCH_NOMATCH;
6520 goto ENDLOOP;
6521
6522 /* Any other return is either a match, or some kind of error. */
6523
6524 default:
6525 goto ENDLOOP;
6526 }
6527
6528 /* Control reaches here for the various types of "no match at this point"
6529 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6530
6531 rc = MATCH_NOMATCH;
6532
6533 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6534 newline in the subject (though it may continue over the newline). Therefore,
6535 if we have just failed to match, starting at a newline, do not continue. */
6536
6537 if (firstline && IS_NEWLINE(start_match)) break;
6538
6539 /* Advance to new matching position */
6540
6541 start_match = new_start_match;
6542
6543 /* Break the loop if the pattern is anchored or if we have passed the end of
6544 the subject. */
6545
6546 if (anchored || start_match > end_subject) break;
6547
6548 /* If we have just passed a CR and we are now at a LF, and the pattern does
6549 not contain any explicit matches for \r or \n, and the newline option is CRLF
6550 or ANY or ANYCRLF, advance the match position by one more character. */
6551
6552 if (start_match[-1] == CHAR_CR &&
6553 start_match < end_subject &&
6554 *start_match == CHAR_NL &&
6555 (re->flags & PCRE_HASCRORLF) == 0 &&
6556 (md->nltype == NLTYPE_ANY ||
6557 md->nltype == NLTYPE_ANYCRLF ||
6558 md->nllen == 2))
6559 start_match++;
6560
6561 md->mark = NULL; /* Reset for start of next match attempt */
6562 } /* End of for(;;) "bumpalong" loop */
6563
6564 /* ==========================================================================*/
6565
6566 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6567 conditions is true:
6568
6569 (1) The pattern is anchored or the match was failed by (*COMMIT);
6570
6571 (2) We are past the end of the subject;
6572
6573 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6574 this option requests that a match occur at or before the first newline in
6575 the subject.
6576
6577 When we have a match and the offset vector is big enough to deal with any
6578 backreferences, captured substring offsets will already be set up. In the case
6579 where we had to get some local store to hold offsets for backreference
6580 processing, copy those that we can. In this case there need not be overflow if
6581 certain parts of the pattern were not used, even though there are more
6582 capturing parentheses than vector slots. */
6583
6584 ENDLOOP:
6585
6586 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6587 {
6588 if (using_temporary_offsets)
6589 {
6590 if (arg_offset_max >= 4)
6591 {
6592 memcpy(offsets + 2, md->offset_vector + 2,
6593 (arg_offset_max - 2) * sizeof(int));
6594 DPRINTF(("Copied offsets from temporary memory\n"));
6595 }
6596 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6597 DPRINTF(("Freeing temporary memory\n"));
6598 (pcre_free)(md->offset_vector);
6599 }
6600
6601 /* Set the return code to the number of captured strings, or 0 if there were
6602 too many to fit into the vector. */
6603
6604 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6605 0 : md->end_offset_top/2;
6606
6607 /* If there is space in the offset vector, set any unused pairs at the end of
6608 the pattern to -1 for backwards compatibility. It is documented that this
6609 happens. In earlier versions, the whole set of potential capturing offsets
6610 was set to -1 each time round the loop, but this is handled differently now.
6611 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6612 those at the end that need unsetting here. We can't just unset them all at
6613 the start of the whole thing because they may get set in one branch that is
6614 not the final matching branch. */
6615
6616 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6617 {
6618 register int *iptr, *iend;
6619 int resetcount = 2 + re->top_bracket * 2;
6620 if (resetcount > offsetcount) resetcount = ocount;
6621 iptr = offsets + md->end_offset_top;
6622 iend = offsets + resetcount;
6623 while (iptr < iend) *iptr++ = -1;
6624 }
6625
6626 /* If there is space, set up the whole thing as substring 0. The value of
6627 md->start_match_ptr might be modified if \K was encountered on the success
6628 matching path. */
6629
6630 if (offsetcount < 2) rc = 0; else
6631 {
6632 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6633 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6634 }
6635
6636 DPRINTF((">>>> returning %d\n", rc));
6637 goto RETURN_MARK;
6638 }
6639
6640 /* Control gets here if there has been an error, or if the overall match
6641 attempt has failed at all permitted starting positions. */
6642
6643 if (using_temporary_offsets)
6644 {
6645 DPRINTF(("Freeing temporary memory\n"));
6646 (pcre_free)(md->offset_vector);
6647 }
6648
6649 /* For anything other than nomatch or partial match, just return the code. */
6650
6651 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6652 {
6653 DPRINTF((">>>> error: returning %d\n", rc));
6654 return rc;
6655 }
6656
6657 /* Handle partial matches - disable any mark data */
6658
6659 if (start_partial != NULL)
6660 {
6661 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6662 md->mark = NULL;
6663 if (offsetcount > 1)
6664 {
6665 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
6666 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
6667 }
6668 rc = PCRE_ERROR_PARTIAL;
6669 }
6670
6671 /* This is the classic nomatch case */
6672
6673 else
6674 {
6675 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6676 rc = PCRE_ERROR_NOMATCH;
6677 }
6678
6679 /* Return the MARK data if it has been requested. */
6680
6681 RETURN_MARK:
6682
6683 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6684 *(extra_data->mark) = (unsigned char *)(md->mark);
6685 return rc;
6686 }
6687
6688 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5