/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 770 - (show annotations)
Mon Nov 28 20:39:30 2011 UTC (7 years, 8 months ago) by zherczeg
File MIME type: text/plain
File size: 202882 byte(s)
Make character ranges 16 bit friendly
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
/* MRRETURN(ra): store the current markptr in md->mark and then leave via
RRETURN(ra). It expands to a brace-enclosed block that refers to "md" and
"markptr" by name, so both must be in scope wherever the macro is used. */
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
/* Two parallel six-entry tables of minimum and maximum repeat counts. */
/* NOTE(review): the entry order looks like *, *?, +, +?, ?, ?? (greedy/lazy
pairs); the indexing expression is not in this part of the file - confirm at
the use sites. */
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 PCRE_PUCHAR eptr_start = eptr;
159 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF8
185 #ifdef SUPPORT_UCP
186 if (md->utf8)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 PCRE_PUCHAR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
/* Identifiers RM1 (value 1) to RM66, one per RMATCH call site. Each is passed
as the last ("rw") argument of RMATCH. In the NO_RECURSE build the macro stores
it in the frame's Xwhere field and pastes it into a resume label (L_RMnn). */
274 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 RM61, RM62, RM63, RM64, RM65, RM66 };
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
/* Stack-recursion forms of RMATCH and RRETURN. RMATCH calls match() with the
caller's current mstart and markptr and with rdepth+1, leaving the result in
rrc. RRETURN is a plain return. All of rrc, mstart, markptr and rdepth are
picked up by name from the expansion site. */
289 #ifdef PCRE_DEBUG
/* Debug build: the same call, bracketed by trace output giving the source
line of the call. */
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
/* Debug build: print the value being returned, then return it. Note that
"ra" is expanded twice. */
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
/* Production build: a bare assignment expression (no enclosing braces and no
trailing semicolon) and a bare return statement. */
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
/* Heap version of RMATCH. It obtains a new heapframe with pcre_stack_malloc
(leaving via RRETURN(PCRE_ERROR_NOMEMORY) if that fails), records in the
current frame's Xwhere which call site this is, fills in the new frame's
argument fields (Xmstart and Xmarkptr are copied from the current values, and
Xrdepth is the current depth plus one), links it to the current frame through
Xprevframe, makes it the current frame, and jumps to HEAP_RECURSE. The label
L_<rw> that follows the goto is the resume point for this call site; by then
the result is in rrc. */
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
/* Heap version of RRETURN. It pops the current frame and frees it with
pcre_stack_free. If there is a previous frame, the result is placed in rrc and
control goes to HEAP_RETURN; if the popped frame was the top-level one
(Xprevframe == NULL), match() really returns "ra". Note that "ra" is evaluated
after the old frame has been freed and "frame" has been moved back. */
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
/* One frame of the private heap "stack" used by match() in the NO_RECURSE
build. Inside match(), #define lines map each argument and preserved local
name onto the correspondingly named X field of the current frame. */
352 typedef struct heapframe {
/* Link to the frame that was current when this one was created by RMATCH;
NULL marks the top-level frame. */
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
/* The top-level frame copies these from the arguments of match(); RMATCH
fills them in for every later frame. */
357 PCRE_PUCHAR Xeptr;
358 const pcre_uchar *Xecode;
359 PCRE_PUCHAR Xmstart;
360 PCRE_PUCHAR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 PCRE_PUCHAR Xcallpat;
368 #ifdef SUPPORT_UTF8
369 PCRE_PUCHAR Xcharptr;
370 #endif
371 PCRE_PUCHAR Xdata;
372 PCRE_PUCHAR Xnext;
373 PCRE_PUCHAR Xpp;
374 PCRE_PUCHAR Xprev;
375 PCRE_PUCHAR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 pcre_uint8 Xocchars[8];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
/* Holds up to REC_STACK_SAVE_MAX saved ints. */
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
/* Set by RMATCH to the RMnn identifier of the call site. */
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
/* CHECK_PARTIAL(): when partial matching is enabled (md->partial != 0), eptr
is at or beyond md->end_subject, and eptr is beyond md->start_used_ptr, set
md->hitend; if md->partial is greater than 1, also leave immediately with
PCRE_ERROR_PARTIAL via MRRETURN. */
/* NOTE(review): this expands to a bare "if" statement with no enclosing block,
so using it as the body of an if that has an else would bind that else to the
macro's own if - check the call sites. */
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
/* SCHECK_PARTIAL(): the same as CHECK_PARTIAL() but without the comparison of
eptr against md->end_subject. */
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf8, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
478 PCRE_PUCHAR mstart, const pcre_uchar *markptr, int offset_top,
479 match_data *md, eptrblock *eptrb, unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF8
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF8
589 const pcre_uchar *charptr;
590 #endif
591 const pcre_uchar *callpat;
592 const pcre_uchar *data;
593 const pcre_uchar *next;
594 PCRE_PUCHAR pp;
595 const pcre_uchar *prev;
596 PCRE_PUCHAR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 pcre_uint8 occhars[8];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637
638 /* These statements are here to stop the compiler complaining about uninitialized
639 variables. */
640
641 #ifdef SUPPORT_UCP
642 prop_value = 0;
643 prop_fail_result = 0;
644 #endif
645
646
647 /* This label is used for tail recursion, which is used in a few cases even
648 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
649 used. Thanks to Ian Taylor for noticing this possibility and sending the
650 original patch. */
651
652 TAIL_RECURSE:
653
654 /* OK, now we can get on with the real code of the function. Recursive calls
655 are specified by the macro RMATCH and RRETURN is used to return. When
656 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
657 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
658 defined). However, RMATCH isn't like a function call because it's quite a
659 complicated macro. It has to be used in one particular way. This shouldn't,
660 however, impact performance when true recursion is being used. */
661
662 #ifdef SUPPORT_UTF8
663 utf8 = md->utf8; /* Local copy of the flag */
664 #else
665 utf8 = FALSE;
666 #endif
667
668 /* First check that we haven't called match() too many times, or that we
669 haven't exceeded the recursive call limit. */
670
671 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
672 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
673
674 /* At the start of a group with an unlimited repeat that may match an empty
675 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
676 done this way to save having to use another function argument, which would take
677 up space on the stack. See also MATCH_CONDASSERT below.
678
679 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
680 such remembered pointers, to be checked when we hit the closing ket, in order
681 to break infinite loops that match no characters. When match() is called in
682 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
683 NOT be used with tail recursion, because the memory block that is used is on
684 the stack, so a new one may be required for each match(). */
685
686 if (md->match_function_type == MATCH_CBEGROUP)
687 {
688 newptrb.epb_saved_eptr = eptr;
689 newptrb.epb_prev = eptrb;
690 eptrb = &newptrb;
691 md->match_function_type = 0;
692 }
693
694 /* Now start processing the opcodes. */
695
696 for (;;)
697 {
698 minimize = possessive = FALSE;
699 op = *ecode;
700
701 switch(op)
702 {
703 case OP_MARK:
704 markptr = ecode + 2;
705 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
706 eptrb, RM55);
707
708 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
709 argument, and we must check whether that argument matches this MARK's
710 argument. It is passed back in md->start_match_ptr (an overloading of that
711 variable). If it does match, we reset that variable to the current subject
712 position and return MATCH_SKIP. Otherwise, pass back the return code
713 unaltered. */
714
715 if (rrc == MATCH_SKIP_ARG &&
716 STRCMP_UC_UC(markptr, md->start_match_ptr) == 0)
717 {
718 md->start_match_ptr = eptr;
719 RRETURN(MATCH_SKIP);
720 }
721
722 if (md->mark == NULL) md->mark = markptr;
723 RRETURN(rrc);
724
725 case OP_FAIL:
726 MRRETURN(MATCH_NOMATCH);
727
728 /* COMMIT overrides PRUNE, SKIP, and THEN */
729
730 case OP_COMMIT:
731 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
732 eptrb, RM52);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
734 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
735 rrc != MATCH_THEN)
736 RRETURN(rrc);
737 MRRETURN(MATCH_COMMIT);
738
739 /* PRUNE overrides THEN */
740
741 case OP_PRUNE:
742 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
743 eptrb, RM51);
744 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
745 MRRETURN(MATCH_PRUNE);
746
747 case OP_PRUNE_ARG:
748 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
749 eptrb, RM56);
750 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
751 md->mark = ecode + 2;
752 RRETURN(MATCH_PRUNE);
753
754 /* SKIP overrides PRUNE and THEN */
755
756 case OP_SKIP:
757 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
758 eptrb, RM53);
759 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
760 RRETURN(rrc);
761 md->start_match_ptr = eptr; /* Pass back current position */
762 MRRETURN(MATCH_SKIP);
763
764 case OP_SKIP_ARG:
765 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
766 eptrb, RM57);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769
770 /* Pass back the current skip name by overloading md->start_match_ptr and
771 returning the special MATCH_SKIP_ARG return code. This will either be
772 caught by a matching MARK, or get to the top, where it is treated the same
773 as PRUNE. */
774
775 md->start_match_ptr = ecode + 2;
776 RRETURN(MATCH_SKIP_ARG);
777
778 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
779 the branch in which it occurs can be determined. Overload the start of
780 match pointer to do this. */
781
782 case OP_THEN:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM54);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 md->start_match_ptr = ecode;
787 MRRETURN(MATCH_THEN);
788
789 case OP_THEN_ARG:
790 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
791 md, eptrb, RM58);
792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
793 md->start_match_ptr = ecode;
794 md->mark = ecode + 2;
795 RRETURN(MATCH_THEN);
796
797 /* Handle an atomic group that does not contain any capturing parentheses.
798 This can be handled like an assertion. Prior to 8.13, all atomic groups
799 were handled this way. In 8.13, the code was changed as below for ONCE, so
800 that backups pass through the group and thereby reset captured values.
801 However, this uses a lot more stack, so in 8.20, atomic groups that do not
802 contain any captures generate OP_ONCE_NC, which can be handled in the old,
803 less stack intensive way.
804
805 Check the alternative branches in turn - the matching won't pass the KET
806 for this kind of subpattern. If any one branch matches, we carry on as at
807 the end of a normal bracket, leaving the subject pointer, but resetting
808 the start-of-match value in case it was changed by \K. */
809
810 case OP_ONCE_NC:
811 prev = ecode;
812 saved_eptr = eptr;
813 do
814 {
815 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
816 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
817 {
818 mstart = md->start_match_ptr;
819 markptr = md->mark;
820 break;
821 }
822 if (rrc == MATCH_THEN)
823 {
824 next = ecode + GET(ecode,1);
825 if (md->start_match_ptr < next &&
826 (*ecode == OP_ALT || *next == OP_ALT))
827 rrc = MATCH_NOMATCH;
828 }
829
830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
831 ecode += GET(ecode,1);
832 }
833 while (*ecode == OP_ALT);
834
835 /* If hit the end of the group (which could be repeated), fail */
836
837 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
838
839 /* Continue as from after the group, updating the offsets high water
840 mark, since extracts may have been taken. */
841
842 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
843
844 offset_top = md->end_offset_top;
845 eptr = md->end_match_ptr;
846
847 /* For a non-repeating ket, just continue at this level. This also
848 happens for a repeating ket if no characters were matched in the group.
849 This is the forcible breaking of infinite loops as implemented in Perl
850 5.005. */
851
852 if (*ecode == OP_KET || eptr == saved_eptr)
853 {
854 ecode += 1+LINK_SIZE;
855 break;
856 }
857
858 /* The repeating kets try the rest of the pattern or restart from the
859 preceding bracket, in the appropriate order. The second "call" of match()
860 uses tail recursion, to avoid using another stack frame. */
861
862 if (*ecode == OP_KETRMIN)
863 {
864 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
866 ecode = prev;
867 goto TAIL_RECURSE;
868 }
869 else /* OP_KETRMAX */
870 {
871 md->match_function_type = MATCH_CBEGROUP;
872 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
874 ecode += 1 + LINK_SIZE;
875 goto TAIL_RECURSE;
876 }
877 /* Control never gets here */
878
879 /* Handle a capturing bracket, other than those that are possessive with an
880 unlimited repeat. If there is space in the offset vector, save the current
881 subject position in the working slot at the top of the vector. We mustn't
882 change the current values of the data slot, because they may be set from a
883 previous iteration of this group, and be referred to by a reference inside
884 the group. A failure to match might occur after the group has succeeded,
885 if something later on doesn't match. For this reason, we need to restore
886 the working value and also the values of the final offsets, in case they
887 were set by a previous iteration of the same bracket.
888
889 If there isn't enough space in the offset vector, treat this as if it were
890 a non-capturing bracket. Don't worry about setting the flag for the error
891 case here; that is handled in the code for KET. */
892
893 case OP_CBRA:
894 case OP_SCBRA:
895 number = GET2(ecode, 1+LINK_SIZE);
896 offset = number << 1;
897
898 #ifdef PCRE_DEBUG
899 printf("start bracket %d\n", number);
900 printf("subject=");
901 pchars(eptr, 16, TRUE, md);
902 printf("\n");
903 #endif
904
905 if (offset < md->offset_max)
906 {
907 save_offset1 = md->offset_vector[offset];
908 save_offset2 = md->offset_vector[offset+1];
909 save_offset3 = md->offset_vector[md->offset_end - number];
910 save_capture_last = md->capture_last;
911
912 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
913 md->offset_vector[md->offset_end - number] =
914 (int)(eptr - md->start_subject);
915
916 for (;;)
917 {
918 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
919 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
920 eptrb, RM1);
921 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
922
923 /* If we backed up to a THEN, check whether it is within the current
924 branch by comparing the address of the THEN that is passed back with
925 the end of the branch. If it is within the current branch, and the
926 branch is one of two or more alternatives (it either starts or ends
927 with OP_ALT), we have reached the limit of THEN's action, so convert
928 the return code to NOMATCH, which will cause normal backtracking to
929 happen from now on. Otherwise, THEN is passed back to an outer
930 alternative. This implements Perl's treatment of parenthesized groups,
931 where a group not containing | does not affect the current alternative,
932 that is, (X) is NOT the same as (X|(*F)). */
933
934 if (rrc == MATCH_THEN)
935 {
936 next = ecode + GET(ecode,1);
937 if (md->start_match_ptr < next &&
938 (*ecode == OP_ALT || *next == OP_ALT))
939 rrc = MATCH_NOMATCH;
940 }
941
942 /* Anything other than NOMATCH is passed back. */
943
944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
945 md->capture_last = save_capture_last;
946 ecode += GET(ecode, 1);
947 if (*ecode != OP_ALT) break;
948 }
949
950 DPRINTF(("bracket %d failed\n", number));
951 md->offset_vector[offset] = save_offset1;
952 md->offset_vector[offset+1] = save_offset2;
953 md->offset_vector[md->offset_end - number] = save_offset3;
954
955 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
956
957 if (md->mark == NULL) md->mark = markptr;
958 RRETURN(rrc);
959 }
960
961 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
962 as a non-capturing bracket. */
963
964 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
965 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966
967 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
968
969 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971
972 /* Non-capturing or atomic group, except for possessive with unlimited
973 repeat and ONCE group with no captures. Loop for all the alternatives.
974
975 When we get to the final alternative within the brackets, we used to return
976 the result of a recursive call to match() whatever happened so it was
977 possible to reduce stack usage by turning this into a tail recursion,
978 except in the case of a possibly empty group. However, now that there is
979 the possibility of (*THEN) occurring in the final alternative, this
980 optimization is no longer always possible.
981
982 We can optimize if we know there are no (*THEN)s in the pattern; at present
983 this is the best that can be done.
984
985 MATCH_ONCE is returned when the end of an atomic group is successfully
986 reached, but subsequent matching fails. It passes back up the tree (causing
987 captured values to be reset) until the original atomic group level is
988 reached. This is tested by comparing md->once_target with the start of the
989 group. At this point, the return is converted into MATCH_NOMATCH so that
990 previous backup points can be taken. */
991
992 case OP_ONCE:
993 case OP_BRA:
994 case OP_SBRA:
995 DPRINTF(("start non-capturing bracket\n"));
996
997 for (;;)
998 {
999 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1000
1001 /* If this is not a possibly empty group, and there are no (*THEN)s in
1002 the pattern, and this is the final alternative, optimize as described
1003 above. */
1004
1005 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1006 {
1007 ecode += PRIV(OP_lengths)[*ecode];
1008 goto TAIL_RECURSE;
1009 }
1010
1011 /* In all other cases, we have to make another call to match(). */
1012
1013 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1014 RM2);
1015
1016 /* See comment in the code for capturing groups above about handling
1017 THEN. */
1018
1019 if (rrc == MATCH_THEN)
1020 {
1021 next = ecode + GET(ecode,1);
1022 if (md->start_match_ptr < next &&
1023 (*ecode == OP_ALT || *next == OP_ALT))
1024 rrc = MATCH_NOMATCH;
1025 }
1026
1027 if (rrc != MATCH_NOMATCH)
1028 {
1029 if (rrc == MATCH_ONCE)
1030 {
1031 const pcre_uchar *scode = ecode;
1032 if (*scode != OP_ONCE) /* If not at start, find it */
1033 {
1034 while (*scode == OP_ALT) scode += GET(scode, 1);
1035 scode -= GET(scode, 1);
1036 }
1037 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1038 }
1039 RRETURN(rrc);
1040 }
1041 ecode += GET(ecode, 1);
1042 if (*ecode != OP_ALT) break;
1043 }
1044
1045 if (md->mark == NULL) md->mark = markptr;
1046 RRETURN(MATCH_NOMATCH);
1047
1048 /* Handle possessive capturing brackets with an unlimited repeat. We come
1049 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1050 handled similarly to the normal case above. However, the matching is
1051 different. The end of these brackets will always be OP_KETRPOS, which
1052 returns MATCH_KETRPOS without going further in the pattern. By this means
1053 we can handle the group by iteration rather than recursion, thereby
1054 reducing the amount of stack needed. */
1055
1056 case OP_CBRAPOS:
1057 case OP_SCBRAPOS:
1058 allow_zero = FALSE;
1059
1060 POSSESSIVE_CAPTURE:
1061 number = GET2(ecode, 1+LINK_SIZE);
1062 offset = number << 1;
1063
1064 #ifdef PCRE_DEBUG
1065 printf("start possessive bracket %d\n", number);
1066 printf("subject=");
1067 pchars(eptr, 16, TRUE, md);
1068 printf("\n");
1069 #endif
1070
1071 if (offset < md->offset_max)
1072 {
1073 matched_once = FALSE;
1074 code_offset = ecode - md->start_code;
1075
1076 save_offset1 = md->offset_vector[offset];
1077 save_offset2 = md->offset_vector[offset+1];
1078 save_offset3 = md->offset_vector[md->offset_end - number];
1079 save_capture_last = md->capture_last;
1080
1081 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1082
1083 /* Each time round the loop, save the current subject position for use
1084 when the group matches. For MATCH_MATCH, the group has matched, so we
1085 restart it with a new subject starting position, remembering that we had
1086 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1087 usual. If we haven't matched any alternatives in any iteration, check to
1088 see if a previous iteration matched. If so, the group has matched;
1089 continue from afterwards. Otherwise it has failed; restore the previous
1090 capture values before returning NOMATCH. */
1091
1092 for (;;)
1093 {
1094 md->offset_vector[md->offset_end - number] =
1095 (int)(eptr - md->start_subject);
1096 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1097 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1098 eptrb, RM63);
1099 if (rrc == MATCH_KETRPOS)
1100 {
1101 offset_top = md->end_offset_top;
1102 eptr = md->end_match_ptr;
1103 ecode = md->start_code + code_offset;
1104 save_capture_last = md->capture_last;
1105 matched_once = TRUE;
1106 continue;
1107 }
1108
1109 /* See comment in the code for capturing groups above about handling
1110 THEN. */
1111
1112 if (rrc == MATCH_THEN)
1113 {
1114 next = ecode + GET(ecode,1);
1115 if (md->start_match_ptr < next &&
1116 (*ecode == OP_ALT || *next == OP_ALT))
1117 rrc = MATCH_NOMATCH;
1118 }
1119
1120 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1121 md->capture_last = save_capture_last;
1122 ecode += GET(ecode, 1);
1123 if (*ecode != OP_ALT) break;
1124 }
1125
1126 if (!matched_once)
1127 {
1128 md->offset_vector[offset] = save_offset1;
1129 md->offset_vector[offset+1] = save_offset2;
1130 md->offset_vector[md->offset_end - number] = save_offset3;
1131 }
1132
1133 if (md->mark == NULL) md->mark = markptr;
1134 if (allow_zero || matched_once)
1135 {
1136 ecode += 1 + LINK_SIZE;
1137 break;
1138 }
1139
1140 RRETURN(MATCH_NOMATCH);
1141 }
1142
1143 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1144 as a non-capturing bracket. */
1145
1146 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1147 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1148
1149 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1150
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1153
1154 /* Non-capturing possessive bracket with unlimited repeat. We come here
1155 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1156 without the capturing complication. It is written out separately for speed
1157 and cleanliness. */
1158
1159 case OP_BRAPOS:
1160 case OP_SBRAPOS:
1161 allow_zero = FALSE;
1162
1163 POSSESSIVE_NON_CAPTURE:
1164 matched_once = FALSE;
1165 code_offset = ecode - md->start_code;
1166
1167 for (;;)
1168 {
1169 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1170 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1171 eptrb, RM48);
1172 if (rrc == MATCH_KETRPOS)
1173 {
1174 offset_top = md->end_offset_top;
1175 eptr = md->end_match_ptr;
1176 ecode = md->start_code + code_offset;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 ecode += GET(ecode, 1);
1194 if (*ecode != OP_ALT) break;
1195 }
1196
1197 if (matched_once || allow_zero)
1198 {
1199 ecode += 1 + LINK_SIZE;
1200 break;
1201 }
1202 RRETURN(MATCH_NOMATCH);
1203
1204 /* Control never reaches here. */
1205
1206 /* Conditional group: compilation checked that there are no more than
1207 two branches. If the condition is false, skipping the first branch takes us
1208 past the end if there is only one branch, but that's OK because that is
1209 exactly what going to the ket would do. */
1210
1211 case OP_COND:
1212 case OP_SCOND:
1213 codelink = GET(ecode, 1);
1214
1215 /* Because of the way auto-callout works during compile, a callout item is
1216 inserted between OP_COND and an assertion condition. */
1217
1218 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1219 {
1220 if (pcre_callout != NULL)
1221 {
1222 pcre_callout_block cb;
1223 cb.version = 2; /* Version 1 of the callout block */
1224 cb.callout_number = ecode[LINK_SIZE+2];
1225 cb.offset_vector = md->offset_vector;
1226 cb.subject = (PCRE_SPTR)md->start_subject;
1227 cb.subject_length = (int)(md->end_subject - md->start_subject);
1228 cb.start_match = (int)(mstart - md->start_subject);
1229 cb.current_position = (int)(eptr - md->start_subject);
1230 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1231 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1232 cb.capture_top = offset_top/2;
1233 cb.capture_last = md->capture_last;
1234 cb.callout_data = md->callout_data;
1235 cb.mark = (unsigned char *)markptr;
1236 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1237 if (rrc < 0) RRETURN(rrc);
1238 }
1239 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1240 }
1241
1242 condcode = ecode[LINK_SIZE+1];
1243
1244 /* Now see what the actual condition is */
1245
1246 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1247 {
1248 if (md->recursive == NULL) /* Not recursing => FALSE */
1249 {
1250 condition = FALSE;
1251 ecode += GET(ecode, 1);
1252 }
1253 else
1254 {
1255 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1256 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1257
1258 /* If the test is for recursion into a specific subpattern, and it is
1259 false, but the test was set up by name, scan the table to see if the
1260 name refers to any other numbers, and test them. The condition is true
1261 if any one is set. */
1262
1263 if (!condition && condcode == OP_NRREF)
1264 {
1265 pcre_uchar *slotA = md->name_table;
1266 for (i = 0; i < md->name_count; i++)
1267 {
1268 if (GET2(slotA, 0) == recno) break;
1269 slotA += md->name_entry_size;
1270 }
1271
1272 /* Found a name for the number - there can be only one; duplicate
1273 names for different numbers are allowed, but not vice versa. First
1274 scan down for duplicates. */
1275
1276 if (i < md->name_count)
1277 {
1278 pcre_uchar *slotB = slotA;
1279 while (slotB > md->name_table)
1280 {
1281 slotB -= md->name_entry_size;
1282 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1283 {
1284 condition = GET2(slotB, 0) == md->recursive->group_num;
1285 if (condition) break;
1286 }
1287 else break;
1288 }
1289
1290 /* Scan up for duplicates */
1291
1292 if (!condition)
1293 {
1294 slotB = slotA;
1295 for (i++; i < md->name_count; i++)
1296 {
1297 slotB += md->name_entry_size;
1298 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1299 {
1300 condition = GET2(slotB, 0) == md->recursive->group_num;
1301 if (condition) break;
1302 }
1303 else break;
1304 }
1305 }
1306 }
1307 }
1308
1309 /* Choose branch according to the condition */
1310
1311 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1312 }
1313 }
1314
1315 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1316 {
1317 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1318 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1319
1320 /* If the numbered capture is unset, but the reference was by name,
1321 scan the table to see if the name refers to any other numbers, and test
1322 them. The condition is true if any one is set. This is tediously similar
1323 to the code above, but not close enough to try to amalgamate. */
1324
1325 if (!condition && condcode == OP_NCREF)
1326 {
1327 int refno = offset >> 1;
1328 pcre_uchar *slotA = md->name_table;
1329
1330 for (i = 0; i < md->name_count; i++)
1331 {
1332 if (GET2(slotA, 0) == refno) break;
1333 slotA += md->name_entry_size;
1334 }
1335
1336 /* Found a name for the number - there can be only one; duplicate names
1337 for different numbers are allowed, but not vice versa. First scan down
1338 for duplicates. */
1339
1340 if (i < md->name_count)
1341 {
1342 pcre_uchar *slotB = slotA;
1343 while (slotB > md->name_table)
1344 {
1345 slotB -= md->name_entry_size;
1346 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1347 {
1348 offset = GET2(slotB, 0) << 1;
1349 condition = offset < offset_top &&
1350 md->offset_vector[offset] >= 0;
1351 if (condition) break;
1352 }
1353 else break;
1354 }
1355
1356 /* Scan up for duplicates */
1357
1358 if (!condition)
1359 {
1360 slotB = slotA;
1361 for (i++; i < md->name_count; i++)
1362 {
1363 slotB += md->name_entry_size;
1364 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1365 {
1366 offset = GET2(slotB, 0) << 1;
1367 condition = offset < offset_top &&
1368 md->offset_vector[offset] >= 0;
1369 if (condition) break;
1370 }
1371 else break;
1372 }
1373 }
1374 }
1375 }
1376
1377 /* Choose branch according to the condition */
1378
1379 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1380 }
1381
1382 else if (condcode == OP_DEF) /* DEFINE - always false */
1383 {
1384 condition = FALSE;
1385 ecode += GET(ecode, 1);
1386 }
1387
1388 /* The condition is an assertion. Call match() to evaluate it - setting
1389 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1390 an assertion. */
1391
1392 else
1393 {
1394 md->match_function_type = MATCH_CONDASSERT;
1395 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1396 if (rrc == MATCH_MATCH)
1397 {
1398 if (md->end_offset_top > offset_top)
1399 offset_top = md->end_offset_top; /* Captures may have happened */
1400 condition = TRUE;
1401 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1402 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1403 }
1404
1405 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1406 assertion; it is therefore treated as NOMATCH. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 else
1413 {
1414 condition = FALSE;
1415 ecode += codelink;
1416 }
1417 }
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, can
1420 use tail recursion to avoid using another stack frame, except when there is
1421 unlimited repeat of a possibly empty group. In the latter case, a recursive
1422 call to match() is always required, unless the second alternative doesn't
1423 exist, in which case we can just plough on. Note that, for compatibility
1424 with Perl, the | in a conditional group is NOT treated as creating two
1425 alternatives. If a THEN is encountered in the branch, it propagates out to
1426 the enclosing alternative (unless nested in a deeper set of alternatives,
1427 of course). */
1428
1429 if (condition || *ecode == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 ecode += 1 + LINK_SIZE;
1434 goto TAIL_RECURSE;
1435 }
1436
1437 md->match_function_type = MATCH_CBEGROUP;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1439 RRETURN(rrc);
1440 }
1441
1442 /* Condition false & no alternative; continue after the group. */
1443
1444 else
1445 {
1446 ecode += 1 + LINK_SIZE;
1447 }
1448 break;
1449
1450
1451 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1452 to close any currently open capturing brackets. */
1453
1454 case OP_CLOSE:
1455 number = GET2(ecode, 1);
1456 offset = number << 1;
1457
1458 #ifdef PCRE_DEBUG
1459 printf("end bracket %d at *ACCEPT", number);
1460 printf("\n");
1461 #endif
1462
1463 md->capture_last = number;
1464 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1465 {
1466 md->offset_vector[offset] =
1467 md->offset_vector[md->offset_end - number];
1468 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1469 if (offset_top <= offset) offset_top = offset + 2;
1470 }
1471 ecode += 1 + IMM2_SIZE;
1472 break;
1473
1474
1475 /* End of the pattern, either real or forced. */
1476
1477 case OP_END:
1478 case OP_ACCEPT:
1479 case OP_ASSERT_ACCEPT:
1480
1481 /* If we have matched an empty string, fail if not in an assertion and not
1482 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1483 is set and we have matched at the start of the subject. In both cases,
1484 backtracking will then try other alternatives, if any. */
1485
1486 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1487 md->recursive == NULL &&
1488 (md->notempty ||
1489 (md->notempty_atstart &&
1490 mstart == md->start_subject + md->start_offset)))
1491 MRRETURN(MATCH_NOMATCH);
1492
1493 /* Otherwise, we have a match. */
1494
1495 md->end_match_ptr = eptr; /* Record where we ended */
1496 md->end_offset_top = offset_top; /* and how many extracts were taken */
1497 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1498
1499 /* For some reason, the macros don't work properly if an expression is
1500 given as the argument to MRRETURN when the heap is in use. */
1501
1502 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1503 MRRETURN(rrc);
1504
1505 /* Assertion brackets. Check the alternative branches in turn - the
1506 matching won't pass the KET for an assertion. If any one branch matches,
1507 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1508 start of each branch to move the current point backwards, so the code at
1509 this level is identical to the lookahead case. When the assertion is part
1510 of a condition, we want to return immediately afterwards. The caller of
1511 this incarnation of the match() function will have set MATCH_CONDASSERT in
1512 md->match_function_type, and one of these opcodes will be the first opcode
1513 that is processed. We use a local variable that is preserved over calls to
1514 match() to remember this case. */
1515
1516 case OP_ASSERT:
1517 case OP_ASSERTBACK:
1518 if (md->match_function_type == MATCH_CONDASSERT)
1519 {
1520 condassert = TRUE;
1521 md->match_function_type = 0;
1522 }
1523 else condassert = FALSE;
1524
1525 do
1526 {
1527 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1528 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1529 {
1530 mstart = md->start_match_ptr; /* In case \K reset it */
1531 markptr = md->mark;
1532 break;
1533 }
1534
1535 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1536 as NOMATCH. */
1537
1538 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1539 ecode += GET(ecode, 1);
1540 }
1541 while (*ecode == OP_ALT);
1542
1543 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1544
1545 /* If checking an assertion for a condition, return MATCH_MATCH. */
1546
1547 if (condassert) RRETURN(MATCH_MATCH);
1548
1549 /* Continue from after the assertion, updating the offsets high water
1550 mark, since extracts may have been taken during the assertion. */
1551
1552 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1553 ecode += 1 + LINK_SIZE;
1554 offset_top = md->end_offset_top;
1555 continue;
1556
1557 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1558 PRUNE, or COMMIT means we must assume failure without checking subsequent
1559 branches. */
1560
1561 case OP_ASSERT_NOT:
1562 case OP_ASSERTBACK_NOT:
1563 if (md->match_function_type == MATCH_CONDASSERT)
1564 {
1565 condassert = TRUE;
1566 md->match_function_type = 0;
1567 }
1568 else condassert = FALSE;
1569
1570 do
1571 {
1572 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1573 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1574 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1575 {
1576 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1577 break;
1578 }
1579
1580 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1581 as NOMATCH. */
1582
1583 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1584 ecode += GET(ecode,1);
1585 }
1586 while (*ecode == OP_ALT);
1587
1588 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1589
1590 ecode += 1 + LINK_SIZE;
1591 continue;
1592
1593 /* Move the subject pointer back. This occurs only at the start of
1594 each branch of a lookbehind assertion. If we are too close to the start to
1595 move back, this match function fails. When working with UTF-8 we move
1596 back a number of characters, not bytes. */
1597
1598 case OP_REVERSE:
1599 #ifdef SUPPORT_UTF8
1600 if (utf8)
1601 {
1602 i = GET(ecode, 1);
1603 while (i-- > 0)
1604 {
1605 eptr--;
1606 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1607 BACKCHAR(eptr);
1608 }
1609 }
1610 else
1611 #endif
1612
1613 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1614
1615 {
1616 eptr -= GET(ecode, 1);
1617 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1618 }
1619
1620 /* Save the earliest consulted character, then skip to next op code */
1621
1622 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1623 ecode += 1 + LINK_SIZE;
1624 break;
1625
1626 /* The callout item calls an external function, if one is provided, passing
1627 details of the match so far. This is mainly for debugging, though the
1628 function is able to force a failure. */
1629
1630 case OP_CALLOUT:
1631 if (pcre_callout != NULL)
1632 {
1633 pcre_callout_block cb;
1634 cb.version = 2; /* Version 1 of the callout block */
1635 cb.callout_number = ecode[1];
1636 cb.offset_vector = md->offset_vector;
1637 cb.subject = (PCRE_SPTR)md->start_subject;
1638 cb.subject_length = (int)(md->end_subject - md->start_subject);
1639 cb.start_match = (int)(mstart - md->start_subject);
1640 cb.current_position = (int)(eptr - md->start_subject);
1641 cb.pattern_position = GET(ecode, 2);
1642 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1643 cb.capture_top = offset_top/2;
1644 cb.capture_last = md->capture_last;
1645 cb.callout_data = md->callout_data;
1646 cb.mark = (unsigned char *)markptr;
1647 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1648 if (rrc < 0) RRETURN(rrc);
1649 }
1650 ecode += 2 + 2*LINK_SIZE;
1651 break;
1652
1653 /* Recursion either matches the current regex, or some subexpression. The
1654 offset data is the offset to the starting bracket from the start of the
1655 whole pattern. (This is so that it works from duplicated subpatterns.)
1656
1657 The state of the capturing groups is preserved over recursion, and
1658 re-instated afterwards. We don't know how many are started and not yet
1659 finished (offset_top records the completed total) so we just have to save
1660 all the potential data. There may be up to 65535 such values, which is too
1661 large to put on the stack, but using malloc for small numbers seems
1662 expensive. As a compromise, the stack is used when there are no more than
1663 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1664
1665 There are also other values that have to be saved. We use a chained
1666 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1667 for the original version of this logic. It has, however, been hacked around
1668 a lot, so he is not to blame for the current way it works. */
1669
1670 case OP_RECURSE:
1671 {
1672 recursion_info *ri;
1673 int recno;
1674
1675 callpat = md->start_code + GET(ecode, 1);
1676 recno = (callpat == md->start_code)? 0 :
1677 GET2(callpat, 1 + LINK_SIZE);
1678
1679 /* Check for repeating a recursion without advancing the subject pointer.
1680 This should catch convoluted mutual recursions. (Some simple cases are
1681 caught at compile time.) */
1682
1683 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1684 if (recno == ri->group_num && eptr == ri->subject_position)
1685 RRETURN(PCRE_ERROR_RECURSELOOP);
1686
1687 /* Add to "recursing stack" */
1688
1689 new_recursive.group_num = recno;
1690 new_recursive.subject_position = eptr;
1691 new_recursive.prevrec = md->recursive;
1692 md->recursive = &new_recursive;
1693
1694 /* Where to continue from afterwards */
1695
1696 ecode += 1 + LINK_SIZE;
1697
1698 /* Now save the offset data */
1699
1700 new_recursive.saved_max = md->offset_end;
1701 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1702 new_recursive.offset_save = stacksave;
1703 else
1704 {
1705 new_recursive.offset_save =
1706 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1707 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1708 }
1709 memcpy(new_recursive.offset_save, md->offset_vector,
1710 new_recursive.saved_max * sizeof(int));
1711
1712 /* OK, now we can do the recursion. After processing each alternative,
1713 restore the offset data. If there were nested recursions, md->recursive
1714 might be changed, so reset it before looping. */
1715
1716 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1717 cbegroup = (*callpat >= OP_SBRA);
1718 do
1719 {
1720 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1721 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1722 md, eptrb, RM6);
1723 memcpy(md->offset_vector, new_recursive.offset_save,
1724 new_recursive.saved_max * sizeof(int));
1725 md->recursive = new_recursive.prevrec;
1726 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1727 {
1728 DPRINTF(("Recursion matched\n"));
1729 if (new_recursive.offset_save != stacksave)
1730 (pcre_free)(new_recursive.offset_save);
1731
1732 /* Set where we got to in the subject, and reset the start in case
1733 it was changed by \K. This *is* propagated back out of a recursion,
1734 for Perl compatibility. */
1735
1736 eptr = md->end_match_ptr;
1737 mstart = md->start_match_ptr;
1738 goto RECURSION_MATCHED; /* Exit loop; end processing */
1739 }
1740
1741 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1742 as NOMATCH. */
1743
1744 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1745 {
1746 DPRINTF(("Recursion gave error %d\n", rrc));
1747 if (new_recursive.offset_save != stacksave)
1748 (pcre_free)(new_recursive.offset_save);
1749 RRETURN(rrc);
1750 }
1751
1752 md->recursive = &new_recursive;
1753 callpat += GET(callpat, 1);
1754 }
1755 while (*callpat == OP_ALT);
1756
1757 DPRINTF(("Recursion didn't match\n"));
1758 md->recursive = new_recursive.prevrec;
1759 if (new_recursive.offset_save != stacksave)
1760 (pcre_free)(new_recursive.offset_save);
1761 MRRETURN(MATCH_NOMATCH);
1762 }
1763
1764 RECURSION_MATCHED:
1765 break;
1766
1767 /* An alternation is the end of a branch; scan along to find the end of the
1768 bracketed group and go to there. */
1769
1770 case OP_ALT:
1771 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1772 break;
1773
1774 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1775 indicating that it may occur zero times. It may repeat infinitely, or not
1776 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1777 with fixed upper repeat limits are compiled as a number of copies, with the
1778 optional ones preceded by BRAZERO or BRAMINZERO. */
1779
1780 case OP_BRAZERO:
1781 next = ecode + 1;
1782 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784 do next += GET(next, 1); while (*next == OP_ALT);
1785 ecode = next + 1 + LINK_SIZE;
1786 break;
1787
1788 case OP_BRAMINZERO:
1789 next = ecode + 1;
1790 do next += GET(next, 1); while (*next == OP_ALT);
1791 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793 ecode++;
1794 break;
1795
1796 case OP_SKIPZERO:
1797 next = ecode+1;
1798 do next += GET(next,1); while (*next == OP_ALT);
1799 ecode = next + 1 + LINK_SIZE;
1800 break;
1801
1802 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1803 here; just jump to the group, with allow_zero set TRUE. */
1804
1805 case OP_BRAPOSZERO:
1806 op = *(++ecode);
1807 allow_zero = TRUE;
1808 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1809 goto POSSESSIVE_NON_CAPTURE;
1810
1811 /* End of a group, repeated or non-repeating. */
1812
1813 case OP_KET:
1814 case OP_KETRMIN:
1815 case OP_KETRMAX:
1816 case OP_KETRPOS:
1817 prev = ecode - GET(ecode, 1);
1818
1819 /* If this was a group that remembered the subject start, in order to break
1820 infinite repeats of empty string matches, retrieve the subject start from
1821 the chain. Otherwise, set it NULL. */
1822
1823 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1824 {
1825 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1826 eptrb = eptrb->epb_prev; /* Backup to previous group */
1827 }
1828 else saved_eptr = NULL;
1829
1830 /* If we are at the end of an assertion group or a non-capturing atomic
1831 group, stop matching and return MATCH_MATCH, but record the current high
1832 water mark for use by positive assertions. We also need to record the match
1833 start in case it was changed by \K. */
1834
1835 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1836 *prev == OP_ONCE_NC)
1837 {
1838 md->end_match_ptr = eptr; /* For ONCE_NC */
1839 md->end_offset_top = offset_top;
1840 md->start_match_ptr = mstart;
1841 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1842 }
1843
1844 /* For capturing groups we have to check the group number back at the start
1845 and if necessary complete handling an extraction by setting the offsets and
1846 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1847 into group 0, so it won't be picked up here. Instead, we catch it when the
1848 OP_END is reached. Other recursion is handled here. We just have to record
1849 the current subject position and start match pointer and give a MATCH
1850 return. */
1851
1852 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1853 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1854 {
1855 number = GET2(prev, 1+LINK_SIZE);
1856 offset = number << 1;
1857
1858 #ifdef PCRE_DEBUG
1859 printf("end bracket %d", number);
1860 printf("\n");
1861 #endif
1862
1863 /* Handle a recursively called group. */
1864
1865 if (md->recursive != NULL && md->recursive->group_num == number)
1866 {
1867 md->end_match_ptr = eptr;
1868 md->start_match_ptr = mstart;
1869 RRETURN(MATCH_MATCH);
1870 }
1871
1872 /* Deal with capturing */
1873
1874 md->capture_last = number;
1875 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1876 {
1877 /* If offset is greater than offset_top, it means that we are
1878 "skipping" a capturing group, and that group's offsets must be marked
1879 unset. In earlier versions of PCRE, all the offsets were unset at the
1880 start of matching, but this doesn't work because atomic groups and
1881 assertions can cause a value to be set that should later be unset.
1882 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1883 part of the atomic group, but this is not on the final matching path,
1884 so must be unset when 2 is set. (If there is no group 2, there is no
1885 problem, because offset_top will then be 2, indicating no capture.) */
1886
1887 if (offset > offset_top)
1888 {
1889 register int *iptr = md->offset_vector + offset_top;
1890 register int *iend = md->offset_vector + offset;
1891 while (iptr < iend) *iptr++ = -1;
1892 }
1893
1894 /* Now make the extraction */
1895
1896 md->offset_vector[offset] =
1897 md->offset_vector[md->offset_end - number];
1898 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1899 if (offset_top <= offset) offset_top = offset + 2;
1900 }
1901 }
1902
1903 /* For an ordinary non-repeating ket, just continue at this level. This
1904 also happens for a repeating ket if no characters were matched in the
1905 group. This is the forcible breaking of infinite loops as implemented in
1906 Perl 5.005. For a non-repeating atomic group that includes captures,
1907 establish a backup point by processing the rest of the pattern at a lower
1908 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1909 original OP_ONCE level, thereby bypassing intermediate backup points, but
1910 resetting any captures that happened along the way. */
1911
1912 if (*ecode == OP_KET || eptr == saved_eptr)
1913 {
1914 if (*prev == OP_ONCE)
1915 {
1916 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1917 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1918 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1919 RRETURN(MATCH_ONCE);
1920 }
1921 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1922 break;
1923 }
1924
1925 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1926 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1927 at a time from the outer level, thus saving stack. */
1928
1929 if (*ecode == OP_KETRPOS)
1930 {
1931 md->end_match_ptr = eptr;
1932 md->end_offset_top = offset_top;
1933 RRETURN(MATCH_KETRPOS);
1934 }
1935
1936 /* The normal repeating kets try the rest of the pattern or restart from
1937 the preceding bracket, in the appropriate order. In the second case, we can
1938 use tail recursion to avoid using another stack frame, unless we have an
1939 atomic group or an unlimited repeat of a group that can match an empty
1940 string. */
1941
1942 if (*ecode == OP_KETRMIN)
1943 {
1944 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1946 if (*prev == OP_ONCE)
1947 {
1948 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1950 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1951 RRETURN(MATCH_ONCE);
1952 }
1953 if (*prev >= OP_SBRA) /* Could match an empty string */
1954 {
1955 md->match_function_type = MATCH_CBEGROUP;
1956 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1957 RRETURN(rrc);
1958 }
1959 ecode = prev;
1960 goto TAIL_RECURSE;
1961 }
1962 else /* OP_KETRMAX */
1963 {
1964 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1965 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1966 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1968 if (*prev == OP_ONCE)
1969 {
1970 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1972 md->once_target = prev;
1973 RRETURN(MATCH_ONCE);
1974 }
1975 ecode += 1 + LINK_SIZE;
1976 goto TAIL_RECURSE;
1977 }
1978 /* Control never gets here */
1979
1980 /* Not multiline mode: start of subject assertion, unless notbol. */
1981
1982 case OP_CIRC:
1983 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1984
1985 /* Start of subject assertion */
1986
1987 case OP_SOD:
1988 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1989 ecode++;
1990 break;
1991
1992 /* Multiline mode: start of subject unless notbol, or after any newline. */
1993
1994 case OP_CIRCM:
1995 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1996 if (eptr != md->start_subject &&
1997 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1998 MRRETURN(MATCH_NOMATCH);
1999 ecode++;
2000 break;
2001
2002 /* Start of match assertion */
2003
2004 case OP_SOM:
2005 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
2006 ecode++;
2007 break;
2008
2009 /* Reset the start of match point */
2010
2011 case OP_SET_SOM:
2012 mstart = eptr;
2013 ecode++;
2014 break;
2015
2016 /* Multiline mode: assert before any newline, or before end of subject
2017 unless noteol is set. */
2018
2019 case OP_DOLLM:
2020 if (eptr < md->end_subject)
2021 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
2022 else
2023 {
2024 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2025 SCHECK_PARTIAL();
2026 }
2027 ecode++;
2028 break;
2029
2030 /* Not multiline mode: assert before a terminating newline or before end of
2031 subject unless noteol is set. */
2032
2033 case OP_DOLL:
2034 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2035 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2036
2037 /* ... else fall through for endonly */
2038
2039 /* End of subject assertion (\z) */
2040
2041 case OP_EOD:
2042 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
2043 SCHECK_PARTIAL();
2044 ecode++;
2045 break;
2046
2047 /* End of subject or ending \n assertion (\Z) */
2048
2049 case OP_EODN:
2050 ASSERT_NL_OR_EOS:
2051 if (eptr < md->end_subject &&
2052 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2053 MRRETURN(MATCH_NOMATCH);
2054
2055 /* Either at end of string or \n before end. */
2056
2057 SCHECK_PARTIAL();
2058 ecode++;
2059 break;
2060
2061 /* Word boundary assertions */
2062
2063 case OP_NOT_WORD_BOUNDARY:
2064 case OP_WORD_BOUNDARY:
2065 {
2066
2067 /* Find out if the previous and current characters are "word" characters.
2068 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2069 be "non-word" characters. Remember the earliest consulted character for
2070 partial matching. */
2071
2072 #ifdef SUPPORT_UTF8
2073 if (utf8)
2074 {
2075 /* Get status of previous character */
2076
2077 if (eptr == md->start_subject) prev_is_word = FALSE; else
2078 {
2079 PCRE_PUCHAR lastptr = eptr - 1;
2080 while((*lastptr & 0xc0) == 0x80) lastptr--;
2081 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2082 GETCHAR(c, lastptr);
2083 #ifdef SUPPORT_UCP
2084 if (md->use_ucp)
2085 {
2086 if (c == '_') prev_is_word = TRUE; else
2087 {
2088 int cat = UCD_CATEGORY(c);
2089 prev_is_word = (cat == ucp_L || cat == ucp_N);
2090 }
2091 }
2092 else
2093 #endif
2094 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2095 }
2096
2097 /* Get status of next character */
2098
2099 if (eptr >= md->end_subject)
2100 {
2101 SCHECK_PARTIAL();
2102 cur_is_word = FALSE;
2103 }
2104 else
2105 {
2106 GETCHAR(c, eptr);
2107 #ifdef SUPPORT_UCP
2108 if (md->use_ucp)
2109 {
2110 if (c == '_') cur_is_word = TRUE; else
2111 {
2112 int cat = UCD_CATEGORY(c);
2113 cur_is_word = (cat == ucp_L || cat == ucp_N);
2114 }
2115 }
2116 else
2117 #endif
2118 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2119 }
2120 }
2121 else
2122 #endif
2123
2124 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2125 consistency with the behaviour of \w we do use it in this case. */
2126
2127 {
2128 /* Get status of previous character */
2129
2130 if (eptr == md->start_subject) prev_is_word = FALSE; else
2131 {
2132 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2133 #ifdef SUPPORT_UCP
2134 if (md->use_ucp)
2135 {
2136 c = eptr[-1];
2137 if (c == '_') prev_is_word = TRUE; else
2138 {
2139 int cat = UCD_CATEGORY(c);
2140 prev_is_word = (cat == ucp_L || cat == ucp_N);
2141 }
2142 }
2143 else
2144 #endif
2145 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2146 }
2147
2148 /* Get status of next character */
2149
2150 if (eptr >= md->end_subject)
2151 {
2152 SCHECK_PARTIAL();
2153 cur_is_word = FALSE;
2154 }
2155 else
2156 #ifdef SUPPORT_UCP
2157 if (md->use_ucp)
2158 {
2159 c = *eptr;
2160 if (c == '_') cur_is_word = TRUE; else
2161 {
2162 int cat = UCD_CATEGORY(c);
2163 cur_is_word = (cat == ucp_L || cat == ucp_N);
2164 }
2165 }
2166 else
2167 #endif
2168 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2169 }
2170
2171 /* Now see if the situation is what we want */
2172
2173 if ((*ecode++ == OP_WORD_BOUNDARY)?
2174 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2175 MRRETURN(MATCH_NOMATCH);
2176 }
2177 break;
2178
2179 /* Match a single character type; inline for speed */
2180
2181 case OP_ANY:
2182 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2183 /* Fall through */
2184
2185 case OP_ALLANY:
2186 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2187 { /* not be updated before SCHECK_PARTIAL. */
2188 SCHECK_PARTIAL();
2189 MRRETURN(MATCH_NOMATCH);
2190 }
2191 eptr++;
2192 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2193 ecode++;
2194 break;
2195
2196 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2197 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2198
2199 case OP_ANYBYTE:
2200 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2201 { /* not be updated before SCHECK_PARTIAL. */
2202 SCHECK_PARTIAL();
2203 MRRETURN(MATCH_NOMATCH);
2204 }
2205 eptr++;
2206 ecode++;
2207 break;
2208
2209 case OP_NOT_DIGIT:
2210 if (eptr >= md->end_subject)
2211 {
2212 SCHECK_PARTIAL();
2213 MRRETURN(MATCH_NOMATCH);
2214 }
2215 GETCHARINCTEST(c, eptr);
2216 if (
2217 #ifdef SUPPORT_UTF8
2218 c < 256 &&
2219 #endif
2220 (md->ctypes[c] & ctype_digit) != 0
2221 )
2222 MRRETURN(MATCH_NOMATCH);
2223 ecode++;
2224 break;
2225
2226 case OP_DIGIT:
2227 if (eptr >= md->end_subject)
2228 {
2229 SCHECK_PARTIAL();
2230 MRRETURN(MATCH_NOMATCH);
2231 }
2232 GETCHARINCTEST(c, eptr);
2233 if (
2234 #ifdef SUPPORT_UTF8
2235 c >= 256 ||
2236 #endif
2237 (md->ctypes[c] & ctype_digit) == 0
2238 )
2239 MRRETURN(MATCH_NOMATCH);
2240 ecode++;
2241 break;
2242
2243 case OP_NOT_WHITESPACE:
2244 if (eptr >= md->end_subject)
2245 {
2246 SCHECK_PARTIAL();
2247 MRRETURN(MATCH_NOMATCH);
2248 }
2249 GETCHARINCTEST(c, eptr);
2250 if (
2251 #ifdef SUPPORT_UTF8
2252 c < 256 &&
2253 #endif
2254 (md->ctypes[c] & ctype_space) != 0
2255 )
2256 MRRETURN(MATCH_NOMATCH);
2257 ecode++;
2258 break;
2259
2260 case OP_WHITESPACE:
2261 if (eptr >= md->end_subject)
2262 {
2263 SCHECK_PARTIAL();
2264 MRRETURN(MATCH_NOMATCH);
2265 }
2266 GETCHARINCTEST(c, eptr);
2267 if (
2268 #ifdef SUPPORT_UTF8
2269 c >= 256 ||
2270 #endif
2271 (md->ctypes[c] & ctype_space) == 0
2272 )
2273 MRRETURN(MATCH_NOMATCH);
2274 ecode++;
2275 break;
2276
2277 case OP_NOT_WORDCHAR:
2278 if (eptr >= md->end_subject)
2279 {
2280 SCHECK_PARTIAL();
2281 MRRETURN(MATCH_NOMATCH);
2282 }
2283 GETCHARINCTEST(c, eptr);
2284 if (
2285 #ifdef SUPPORT_UTF8
2286 c < 256 &&
2287 #endif
2288 (md->ctypes[c] & ctype_word) != 0
2289 )
2290 MRRETURN(MATCH_NOMATCH);
2291 ecode++;
2292 break;
2293
2294 case OP_WORDCHAR:
2295 if (eptr >= md->end_subject)
2296 {
2297 SCHECK_PARTIAL();
2298 MRRETURN(MATCH_NOMATCH);
2299 }
2300 GETCHARINCTEST(c, eptr);
2301 if (
2302 #ifdef SUPPORT_UTF8
2303 c >= 256 ||
2304 #endif
2305 (md->ctypes[c] & ctype_word) == 0
2306 )
2307 MRRETURN(MATCH_NOMATCH);
2308 ecode++;
2309 break;
2310
2311 case OP_ANYNL:
2312 if (eptr >= md->end_subject)
2313 {
2314 SCHECK_PARTIAL();
2315 MRRETURN(MATCH_NOMATCH);
2316 }
2317 GETCHARINCTEST(c, eptr);
2318 switch(c)
2319 {
2320 default: MRRETURN(MATCH_NOMATCH);
2321
2322 case 0x000d:
2323 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2324 break;
2325
2326 case 0x000a:
2327 break;
2328
2329 case 0x000b:
2330 case 0x000c:
2331 case 0x0085:
2332 case 0x2028:
2333 case 0x2029:
2334 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2335 break;
2336 }
2337 ecode++;
2338 break;
2339
2340 case OP_NOT_HSPACE:
2341 if (eptr >= md->end_subject)
2342 {
2343 SCHECK_PARTIAL();
2344 MRRETURN(MATCH_NOMATCH);
2345 }
2346 GETCHARINCTEST(c, eptr);
2347 switch(c)
2348 {
2349 default: break;
2350 case 0x09: /* HT */
2351 case 0x20: /* SPACE */
2352 case 0xa0: /* NBSP */
2353 case 0x1680: /* OGHAM SPACE MARK */
2354 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2355 case 0x2000: /* EN QUAD */
2356 case 0x2001: /* EM QUAD */
2357 case 0x2002: /* EN SPACE */
2358 case 0x2003: /* EM SPACE */
2359 case 0x2004: /* THREE-PER-EM SPACE */
2360 case 0x2005: /* FOUR-PER-EM SPACE */
2361 case 0x2006: /* SIX-PER-EM SPACE */
2362 case 0x2007: /* FIGURE SPACE */
2363 case 0x2008: /* PUNCTUATION SPACE */
2364 case 0x2009: /* THIN SPACE */
2365 case 0x200A: /* HAIR SPACE */
2366 case 0x202f: /* NARROW NO-BREAK SPACE */
2367 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2368 case 0x3000: /* IDEOGRAPHIC SPACE */
2369 MRRETURN(MATCH_NOMATCH);
2370 }
2371 ecode++;
2372 break;
2373
2374 case OP_HSPACE:
2375 if (eptr >= md->end_subject)
2376 {
2377 SCHECK_PARTIAL();
2378 MRRETURN(MATCH_NOMATCH);
2379 }
2380 GETCHARINCTEST(c, eptr);
2381 switch(c)
2382 {
2383 default: MRRETURN(MATCH_NOMATCH);
2384 case 0x09: /* HT */
2385 case 0x20: /* SPACE */
2386 case 0xa0: /* NBSP */
2387 case 0x1680: /* OGHAM SPACE MARK */
2388 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2389 case 0x2000: /* EN QUAD */
2390 case 0x2001: /* EM QUAD */
2391 case 0x2002: /* EN SPACE */
2392 case 0x2003: /* EM SPACE */
2393 case 0x2004: /* THREE-PER-EM SPACE */
2394 case 0x2005: /* FOUR-PER-EM SPACE */
2395 case 0x2006: /* SIX-PER-EM SPACE */
2396 case 0x2007: /* FIGURE SPACE */
2397 case 0x2008: /* PUNCTUATION SPACE */
2398 case 0x2009: /* THIN SPACE */
2399 case 0x200A: /* HAIR SPACE */
2400 case 0x202f: /* NARROW NO-BREAK SPACE */
2401 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2402 case 0x3000: /* IDEOGRAPHIC SPACE */
2403 break;
2404 }
2405 ecode++;
2406 break;
2407
2408 case OP_NOT_VSPACE:
2409 if (eptr >= md->end_subject)
2410 {
2411 SCHECK_PARTIAL();
2412 MRRETURN(MATCH_NOMATCH);
2413 }
2414 GETCHARINCTEST(c, eptr);
2415 switch(c)
2416 {
2417 default: break;
2418 case 0x0a: /* LF */
2419 case 0x0b: /* VT */
2420 case 0x0c: /* FF */
2421 case 0x0d: /* CR */
2422 case 0x85: /* NEL */
2423 case 0x2028: /* LINE SEPARATOR */
2424 case 0x2029: /* PARAGRAPH SEPARATOR */
2425 MRRETURN(MATCH_NOMATCH);
2426 }
2427 ecode++;
2428 break;
2429
2430 case OP_VSPACE:
2431 if (eptr >= md->end_subject)
2432 {
2433 SCHECK_PARTIAL();
2434 MRRETURN(MATCH_NOMATCH);
2435 }
2436 GETCHARINCTEST(c, eptr);
2437 switch(c)
2438 {
2439 default: MRRETURN(MATCH_NOMATCH);
2440 case 0x0a: /* LF */
2441 case 0x0b: /* VT */
2442 case 0x0c: /* FF */
2443 case 0x0d: /* CR */
2444 case 0x85: /* NEL */
2445 case 0x2028: /* LINE SEPARATOR */
2446 case 0x2029: /* PARAGRAPH SEPARATOR */
2447 break;
2448 }
2449 ecode++;
2450 break;
2451
2452 #ifdef SUPPORT_UCP
2453 /* Check the next character by Unicode property. We will get here only
2454 if the support is in the binary; otherwise a compile-time error occurs. */
2455
2456 case OP_PROP:
2457 case OP_NOTPROP:
2458 if (eptr >= md->end_subject)
2459 {
2460 SCHECK_PARTIAL();
2461 MRRETURN(MATCH_NOMATCH);
2462 }
2463 GETCHARINCTEST(c, eptr);
2464 {
2465 const ucd_record *prop = GET_UCD(c);
2466
2467 switch(ecode[1])
2468 {
2469 case PT_ANY:
2470 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2471 break;
2472
2473 case PT_LAMP:
2474 if ((prop->chartype == ucp_Lu ||
2475 prop->chartype == ucp_Ll ||
2476 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2477 MRRETURN(MATCH_NOMATCH);
2478 break;
2479
2480 case PT_GC:
2481 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2482 MRRETURN(MATCH_NOMATCH);
2483 break;
2484
2485 case PT_PC:
2486 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2487 MRRETURN(MATCH_NOMATCH);
2488 break;
2489
2490 case PT_SC:
2491 if ((ecode[2] != prop->script) == (op == OP_PROP))
2492 MRRETURN(MATCH_NOMATCH);
2493 break;
2494
2495 /* These are specials */
2496
2497 case PT_ALNUM:
2498 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2499 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2500 MRRETURN(MATCH_NOMATCH);
2501 break;
2502
2503 case PT_SPACE: /* Perl space */
2504 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2505 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2506 == (op == OP_NOTPROP))
2507 MRRETURN(MATCH_NOMATCH);
2508 break;
2509
2510 case PT_PXSPACE: /* POSIX space */
2511 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2512 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2513 c == CHAR_FF || c == CHAR_CR)
2514 == (op == OP_NOTPROP))
2515 MRRETURN(MATCH_NOMATCH);
2516 break;
2517
2518 case PT_WORD:
2519 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2520 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2521 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2522 MRRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 /* This should never occur */
2526
2527 default:
2528 RRETURN(PCRE_ERROR_INTERNAL);
2529 }
2530
2531 ecode += 3;
2532 }
2533 break;
2534
2535 /* Match an extended Unicode sequence. We will get here only if the support
2536 is in the binary; otherwise a compile-time error occurs. */
2537
2538 case OP_EXTUNI:
2539 if (eptr >= md->end_subject)
2540 {
2541 SCHECK_PARTIAL();
2542 MRRETURN(MATCH_NOMATCH);
2543 }
2544 GETCHARINCTEST(c, eptr);
2545 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2546 while (eptr < md->end_subject)
2547 {
2548 int len = 1;
2549 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2550 if (UCD_CATEGORY(c) != ucp_M) break;
2551 eptr += len;
2552 }
2553 ecode++;
2554 break;
2555 #endif
2556
2557
2558 /* Match a back reference, possibly repeatedly. Look past the end of the
2559 item to see if there is repeat information following. The code is similar
2560 to that for character classes, but repeated for efficiency. Then obey
2561 similar code to character type repeats - written out again for speed.
2562 However, if the referenced string is the empty string, always treat
2563 it as matched, any number of times (otherwise there could be infinite
2564 loops). */
2565
2566 case OP_REF:
2567 case OP_REFI:
2568 caseless = op == OP_REFI;
2569 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2570 ecode += 1 + IMM2_SIZE;
2571
2572 /* If the reference is unset, there are two possibilities:
2573
2574 (a) In the default, Perl-compatible state, set the length negative;
2575 this ensures that every attempt at a match fails. We can't just fail
2576 here, because of the possibility of quantifiers with zero minima.
2577
2578 (b) If the JavaScript compatibility flag is set, set the length to zero
2579 so that the back reference matches an empty string.
2580
2581 Otherwise, set the length to the length of what was matched by the
2582 referenced subpattern. */
2583
2584 if (offset >= offset_top || md->offset_vector[offset] < 0)
2585 length = (md->jscript_compat)? 0 : -1;
2586 else
2587 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2588
2589 /* Set up for repetition, or handle the non-repeated case */
2590
2591 switch (*ecode)
2592 {
2593 case OP_CRSTAR:
2594 case OP_CRMINSTAR:
2595 case OP_CRPLUS:
2596 case OP_CRMINPLUS:
2597 case OP_CRQUERY:
2598 case OP_CRMINQUERY:
2599 c = *ecode++ - OP_CRSTAR;
2600 minimize = (c & 1) != 0;
2601 min = rep_min[c]; /* Pick up values from tables; */
2602 max = rep_max[c]; /* zero for max => infinity */
2603 if (max == 0) max = INT_MAX;
2604 break;
2605
2606 case OP_CRRANGE:
2607 case OP_CRMINRANGE:
2608 minimize = (*ecode == OP_CRMINRANGE);
2609 min = GET2(ecode, 1);
2610 max = GET2(ecode, 1 + IMM2_SIZE);
2611 if (max == 0) max = INT_MAX;
2612 ecode += 1 + 2 * IMM2_SIZE;
2613 break;
2614
2615 default: /* No repeat follows */
2616 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2617 {
2618 CHECK_PARTIAL();
2619 MRRETURN(MATCH_NOMATCH);
2620 }
2621 eptr += length;
2622 continue; /* With the main loop */
2623 }
2624
2625 /* Handle repeated back references. If the length of the reference is
2626 zero, just continue with the main loop. */
2627
2628 if (length == 0) continue;
2629
2630 /* First, ensure the minimum number of matches are present. We get back
2631 the length of the reference string explicitly rather than passing the
2632 address of eptr, so that eptr can be a register variable. */
2633
2634 for (i = 1; i <= min; i++)
2635 {
2636 int slength;
2637 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2638 {
2639 CHECK_PARTIAL();
2640 MRRETURN(MATCH_NOMATCH);
2641 }
2642 eptr += slength;
2643 }
2644
2645 /* If min = max, continue at the same level without recursion.
2646 They are not both allowed to be zero. */
2647
2648 if (min == max) continue;
2649
2650 /* If minimizing, keep trying and advancing the pointer */
2651
2652 if (minimize)
2653 {
2654 for (fi = min;; fi++)
2655 {
2656 int slength;
2657 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2659 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2660 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2661 {
2662 CHECK_PARTIAL();
2663 MRRETURN(MATCH_NOMATCH);
2664 }
2665 eptr += slength;
2666 }
2667 /* Control never gets here */
2668 }
2669
2670 /* If maximizing, find the longest string and work backwards */
2671
2672 else
2673 {
2674 pp = eptr;
2675 for (i = min; i < max; i++)
2676 {
2677 int slength;
2678 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2679 {
2680 CHECK_PARTIAL();
2681 break;
2682 }
2683 eptr += slength;
2684 }
2685 while (eptr >= pp)
2686 {
2687 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2689 eptr -= length;
2690 }
2691 MRRETURN(MATCH_NOMATCH);
2692 }
2693 /* Control never gets here */
2694
2695 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2696 used when all the characters in the class have values in the range 0-255,
2697 and either the matching is caseful, or the characters are in the range
2698 0-127 when UTF-8 processing is enabled. The only difference between
2699 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2700 encountered.
2701
2702 First, look past the end of the item to see if there is repeat information
2703 following. Then obey similar code to character type repeats - written out
2704 again for speed. */
2705
2706 case OP_NCLASS:
2707 case OP_CLASS:
2708 {
2709 /* The data variable is saved across frames, so the byte map needs to
2710 be stored there. */
2711 #define BYTE_MAP ((pcre_uint8 *)data)
2712 data = ecode + 1; /* Save for matching */
2713 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2714
2715 switch (*ecode)
2716 {
2717 case OP_CRSTAR:
2718 case OP_CRMINSTAR:
2719 case OP_CRPLUS:
2720 case OP_CRMINPLUS:
2721 case OP_CRQUERY:
2722 case OP_CRMINQUERY:
2723 c = *ecode++ - OP_CRSTAR;
2724 minimize = (c & 1) != 0;
2725 min = rep_min[c]; /* Pick up values from tables; */
2726 max = rep_max[c]; /* zero for max => infinity */
2727 if (max == 0) max = INT_MAX;
2728 break;
2729
2730 case OP_CRRANGE:
2731 case OP_CRMINRANGE:
2732 minimize = (*ecode == OP_CRMINRANGE);
2733 min = GET2(ecode, 1);
2734 max = GET2(ecode, 1 + IMM2_SIZE);
2735 if (max == 0) max = INT_MAX;
2736 ecode += 1 + 2 * IMM2_SIZE;
2737 break;
2738
2739 default: /* No repeat follows */
2740 min = max = 1;
2741 break;
2742 }
2743
2744 /* First, ensure the minimum number of matches are present. */
2745
2746 #ifdef SUPPORT_UTF
2747 /* UTF-8 mode */
2748 if (utf8)
2749 {
2750 for (i = 1; i <= min; i++)
2751 {
2752 if (eptr >= md->end_subject)
2753 {
2754 SCHECK_PARTIAL();
2755 MRRETURN(MATCH_NOMATCH);
2756 }
2757 GETCHARINC(c, eptr);
2758 if (c > 255)
2759 {
2760 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2761 }
2762 else
2763 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2764 }
2765 }
2766 else
2767 #endif
2768 /* Not UTF-8 mode */
2769 {
2770 for (i = 1; i <= min; i++)
2771 {
2772 if (eptr >= md->end_subject)
2773 {
2774 SCHECK_PARTIAL();
2775 MRRETURN(MATCH_NOMATCH);
2776 }
2777 c = *eptr++;
2778 #ifndef COMPILE_PCRE8
2779 if (c > 255)
2780 {
2781 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2782 }
2783 else
2784 #endif
2785 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2786 }
2787 }
2788
2789 /* If max == min we can continue with the main loop without the
2790 need to recurse. */
2791
2792 if (min == max) continue;
2793
2794 /* If minimizing, keep testing the rest of the expression and advancing
2795 the pointer while it matches the class. */
2796
2797 if (minimize)
2798 {
2799 #ifdef SUPPORT_UTF
2800 /* UTF-8 mode */
2801 if (utf8)
2802 {
2803 for (fi = min;; fi++)
2804 {
2805 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2808 if (eptr >= md->end_subject)
2809 {
2810 SCHECK_PARTIAL();
2811 MRRETURN(MATCH_NOMATCH);
2812 }
2813 GETCHARINC(c, eptr);
2814 if (c > 255)
2815 {
2816 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2817 }
2818 else
2819 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2820 }
2821 }
2822 else
2823 #endif
2824 /* Not UTF-8 mode */
2825 {
2826 for (fi = min;; fi++)
2827 {
2828 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2830 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2831 if (eptr >= md->end_subject)
2832 {
2833 SCHECK_PARTIAL();
2834 MRRETURN(MATCH_NOMATCH);
2835 }
2836 c = *eptr++;
2837 #ifndef COMPILE_PCRE8
2838 if (c > 255)
2839 {
2840 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2841 }
2842 else
2843 #endif
2844 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2845 }
2846 }
2847 /* Control never gets here */
2848 }
2849
2850 /* If maximizing, find the longest possible run, then work backwards. */
2851
2852 else
2853 {
2854 pp = eptr;
2855
2856 #ifdef SUPPORT_UTF
2857 /* UTF mode */
2858 if (utf8)
2859 {
2860 for (i = min; i < max; i++)
2861 {
2862 int len = 1;
2863 if (eptr >= md->end_subject)
2864 {
2865 SCHECK_PARTIAL();
2866 break;
2867 }
2868 GETCHARLEN(c, eptr, len);
2869 if (c > 255)
2870 {
2871 if (op == OP_CLASS) break;
2872 }
2873 else
2874 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2875 eptr += len;
2876 }
2877 for (;;)
2878 {
2879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2881 if (eptr-- == pp) break; /* Stop if tried at original pos */
2882 BACKCHAR(eptr);
2883 }
2884 }
2885 else
2886 #endif
2887 /* Not UTF mode */
2888 {
2889 for (i = min; i < max; i++)
2890 {
2891 if (eptr >= md->end_subject)
2892 {
2893 SCHECK_PARTIAL();
2894 break;
2895 }
2896 c = *eptr;
2897 #ifndef COMPILE_PCRE8
2898 if (c > 255)
2899 {
2900 if (op == OP_CLASS) break;
2901 }
2902 else
2903 #endif
2904 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2905 eptr++;
2906 }
2907 while (eptr >= pp)
2908 {
2909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2911 eptr--;
2912 }
2913 }
2914
2915 MRRETURN(MATCH_NOMATCH);
2916 }
2917 #undef BYTE_MAP
2918 }
2919 /* Control never gets here */
2920
2921
2922 /* Match an extended character class. This opcode is encountered only
2923 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2924 mode, because Unicode properties are supported in non-UTF-8 mode. */
2925
2926 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2927 case OP_XCLASS:
2928 {
2929 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2930 ecode += GET(ecode, 1); /* Advance past the item */
2931
2932 switch (*ecode)
2933 {
2934 case OP_CRSTAR:
2935 case OP_CRMINSTAR:
2936 case OP_CRPLUS:
2937 case OP_CRMINPLUS:
2938 case OP_CRQUERY:
2939 case OP_CRMINQUERY:
2940 c = *ecode++ - OP_CRSTAR;
2941 minimize = (c & 1) != 0;
2942 min = rep_min[c]; /* Pick up values from tables; */
2943 max = rep_max[c]; /* zero for max => infinity */
2944 if (max == 0) max = INT_MAX;
2945 break;
2946
2947 case OP_CRRANGE:
2948 case OP_CRMINRANGE:
2949 minimize = (*ecode == OP_CRMINRANGE);
2950 min = GET2(ecode, 1);
2951 max = GET2(ecode, 1 + IMM2_SIZE);
2952 if (max == 0) max = INT_MAX;
2953 ecode += 1 + 2 * IMM2_SIZE;
2954 break;
2955
2956 default: /* No repeat follows */
2957 min = max = 1;
2958 break;
2959 }
2960
2961 /* First, ensure the minimum number of matches are present. */
2962
2963 for (i = 1; i <= min; i++)
2964 {
2965 if (eptr >= md->end_subject)
2966 {
2967 SCHECK_PARTIAL();
2968 MRRETURN(MATCH_NOMATCH);
2969 }
2970 GETCHARINCTEST(c, eptr);
2971 if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH);
2972 }
2973
2974 /* If max == min we can continue with the main loop without the
2975 need to recurse. */
2976
2977 if (min == max) continue;
2978
2979 /* If minimizing, keep testing the rest of the expression and advancing
2980 the pointer while it matches the class. */
2981
2982 if (minimize)
2983 {
2984 for (fi = min;; fi++)
2985 {
2986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2989 if (eptr >= md->end_subject)
2990 {
2991 SCHECK_PARTIAL();
2992 MRRETURN(MATCH_NOMATCH);
2993 }
2994 GETCHARINCTEST(c, eptr);
2995 if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH);
2996 }
2997 /* Control never gets here */
2998 }
2999
3000 /* If maximizing, find the longest possible run, then work backwards. */
3001
3002 else
3003 {
3004 pp = eptr;
3005 for (i = min; i < max; i++)
3006 {
3007 int len = 1;
3008 if (eptr >= md->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 break;
3012 }
3013 #ifdef SUPPORT_UTF
3014 GETCHARLENTEST(c, eptr, len);
3015 #else
3016 c = *eptr;
3017 #endif
3018 if (!PRIV(xclass)(c, data)) break;
3019 eptr += len;
3020 }
3021 for(;;)
3022 {
3023 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3025 if (eptr-- == pp) break; /* Stop if tried at original pos */
3026 #ifdef SUPPORT_UTF
3027 if (utf8) BACKCHAR(eptr);
3028 #endif
3029 }
3030 MRRETURN(MATCH_NOMATCH);
3031 }
3032
3033 /* Control never gets here */
3034 }
3035 #endif /* End of XCLASS */
3036
3037 /* Match a single character, casefully */
3038
3039 case OP_CHAR:
3040 #ifdef SUPPORT_UTF8
3041 if (utf8)
3042 {
3043 length = 1;
3044 ecode++;
3045 GETCHARLEN(fc, ecode, length);
3046 if (length > md->end_subject - eptr)
3047 {
3048 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3049 MRRETURN(MATCH_NOMATCH);
3050 }
3051 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
3052 }
3053 else
3054 #endif
3055
3056 /* Non-UTF-8 mode */
3057 {
3058 if (md->end_subject - eptr < 1)
3059 {
3060 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3061 MRRETURN(MATCH_NOMATCH);
3062 }
3063 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
3064 ecode += 2;
3065 }
3066 break;
3067
3068 /* Match a single character, caselessly */
3069
3070 case OP_CHARI:
3071 #ifdef SUPPORT_UTF8
3072 if (utf8)
3073 {
3074 length = 1;
3075 ecode++;
3076 GETCHARLEN(fc, ecode, length);
3077
3078 if (length > md->end_subject - eptr)
3079 {
3080 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3081 MRRETURN(MATCH_NOMATCH);
3082 }
3083
3084 /* If the pattern character's value is < 128, we have only one byte, and
3085 can use the fast lookup table. */
3086
3087 if (fc < 128)
3088 {
3089 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3090 }
3091
3092 /* Otherwise we must pick up the subject character */
3093
3094 else
3095 {
3096 unsigned int dc;
3097 GETCHARINC(dc, eptr);
3098 ecode += length;
3099
3100 /* If we have Unicode property support, we can use it to test the other
3101 case of the character, if there is one. */
3102
3103 if (fc != dc)
3104 {
3105 #ifdef SUPPORT_UCP
3106 if (dc != UCD_OTHERCASE(fc))
3107 #endif
3108 MRRETURN(MATCH_NOMATCH);
3109 }
3110 }
3111 }
3112 else
3113 #endif /* SUPPORT_UTF8 */
3114
3115 /* Non-UTF-8 mode */
3116 {
3117 if (md->end_subject - eptr < 1)
3118 {
3119 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3120 MRRETURN(MATCH_NOMATCH);
3121 }
3122 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3123 ecode += 2;
3124 }
3125 break;
3126
3127 /* Match a single character repeatedly. */
3128
3129 case OP_EXACT:
3130 case OP_EXACTI:
3131 min = max = GET2(ecode, 1);
3132 ecode += 1 + IMM2_SIZE;
3133 goto REPEATCHAR;
3134
3135 case OP_POSUPTO:
3136 case OP_POSUPTOI:
3137 possessive = TRUE;
3138 /* Fall through */
3139
3140 case OP_UPTO:
3141 case OP_UPTOI:
3142 case OP_MINUPTO:
3143 case OP_MINUPTOI:
3144 min = 0;
3145 max = GET2(ecode, 1);
3146 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3147 ecode += 1 + IMM2_SIZE;
3148 goto REPEATCHAR;
3149
3150 case OP_POSSTAR:
3151 case OP_POSSTARI:
3152 possessive = TRUE;
3153 min = 0;
3154 max = INT_MAX;
3155 ecode++;
3156 goto REPEATCHAR;
3157
3158 case OP_POSPLUS:
3159 case OP_POSPLUSI:
3160 possessive = TRUE;
3161 min = 1;
3162 max = INT_MAX;
3163 ecode++;
3164 goto REPEATCHAR;
3165
3166 case OP_POSQUERY:
3167 case OP_POSQUERYI:
3168 possessive = TRUE;
3169 min = 0;
3170 max = 1;
3171 ecode++;
3172 goto REPEATCHAR;
3173
3174 case OP_STAR:
3175 case OP_STARI:
3176 case OP_MINSTAR:
3177 case OP_MINSTARI:
3178 case OP_PLUS:
3179 case OP_PLUSI:
3180 case OP_MINPLUS:
3181 case OP_MINPLUSI:
3182 case OP_QUERY:
3183 case OP_QUERYI:
3184 case OP_MINQUERY:
3185 case OP_MINQUERYI:
3186 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3187 minimize = (c & 1) != 0;
3188 min = rep_min[c]; /* Pick up values from tables; */
3189 max = rep_max[c]; /* zero for max => infinity */
3190 if (max == 0) max = INT_MAX;
3191
3192 /* Common code for all repeated single-character matches. */
3193
3194 REPEATCHAR:
3195 #ifdef SUPPORT_UTF8
3196 if (utf8)
3197 {
3198 length = 1;
3199 charptr = ecode;
3200 GETCHARLEN(fc, ecode, length);
3201 ecode += length;
3202
3203 /* Handle multibyte character matching specially here. There is
3204 support for caseless matching if UCP support is present. */
3205
3206 if (length > 1)
3207 {
3208 #ifdef SUPPORT_UCP
3209 unsigned int othercase;
3210 if (op >= OP_STARI && /* Caseless */
3211 (othercase = UCD_OTHERCASE(fc)) != fc)
3212 oclength = PRIV(ord2utf8)(othercase, occhars);
3213 else oclength = 0;
3214 #endif /* SUPPORT_UCP */
3215
3216 for (i = 1; i <= min; i++)
3217 {
3218 if (eptr <= md->end_subject - length &&
3219 memcmp(eptr, charptr, length) == 0) eptr += length;
3220 #ifdef SUPPORT_UCP
3221 else if (oclength > 0 &&
3222 eptr <= md->end_subject - oclength &&
3223 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3224 #endif /* SUPPORT_UCP */
3225 else
3226 {
3227 CHECK_PARTIAL();
3228 MRRETURN(MATCH_NOMATCH);
3229 }
3230 }
3231
3232 if (min == max) continue;
3233
3234 if (minimize)
3235 {
3236 for (fi = min;; fi++)
3237 {
3238 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3239 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3240 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3241 if (eptr <= md->end_subject - length &&
3242 memcmp(eptr, charptr, length) == 0) eptr += length;
3243 #ifdef SUPPORT_UCP
3244 else if (oclength > 0 &&
3245 eptr <= md->end_subject - oclength &&
3246 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3247 #endif /* SUPPORT_UCP */
3248 else
3249 {
3250 CHECK_PARTIAL();
3251 MRRETURN(MATCH_NOMATCH);
3252 }
3253 }
3254 /* Control never gets here */
3255 }
3256
3257 else /* Maximize */
3258 {
3259 pp = eptr;
3260 for (i = min; i < max; i++)
3261 {
3262 if (eptr <= md->end_subject - length &&
3263 memcmp(eptr, charptr, length) == 0) eptr += length;
3264 #ifdef SUPPORT_UCP
3265 else if (oclength > 0 &&
3266 eptr <= md->end_subject - oclength &&
3267 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3268 #endif /* SUPPORT_UCP */
3269 else
3270 {
3271 CHECK_PARTIAL();
3272 break;
3273 }
3274 }
3275
3276 if (possessive) continue;
3277
3278 for(;;)
3279 {
3280 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3281 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3282 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3283 #ifdef SUPPORT_UCP
3284 eptr--;
3285 BACKCHAR(eptr);
3286 #else /* without SUPPORT_UCP */
3287 eptr -= length;
3288 #endif /* SUPPORT_UCP */
3289 }
3290 }
3291 /* Control never gets here */
3292 }
3293
3294 /* If the length of a UTF-8 character is 1, we fall through here, and
3295 obey the code as for non-UTF-8 characters below, though in this case the
3296 value of fc will always be < 128. */
3297 }
3298 else
3299 #endif /* SUPPORT_UTF8 */
3300
3301 /* When not in UTF-8 mode, load a single-byte character. */
3302
3303 fc = *ecode++;
3304
3305 /* The value of fc at this point is always less than 256, though we may or
3306 may not be in UTF-8 mode. The code is duplicated for the caseless and
3307 caseful cases, for speed, since matching characters is likely to be quite
3308 common. First, ensure the minimum number of matches are present. If min =
3309 max, continue at the same level without recursing. Otherwise, if
3310 minimizing, keep trying the rest of the expression and advancing one
3311 matching character if failing, up to the maximum. Alternatively, if
3312 maximizing, find the maximum number of characters and work backwards. */
3313
3314 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3315 max, eptr));
3316
3317 if (op >= OP_STARI) /* Caseless */
3318 {
3319 fc = md->lcc[fc];
3320 for (i = 1; i <= min; i++)
3321 {
3322 if (eptr >= md->end_subject)
3323 {
3324 SCHECK_PARTIAL();
3325 MRRETURN(MATCH_NOMATCH);
3326 }
3327 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3328 }
3329 if (min == max) continue;
3330 if (minimize)
3331 {
3332 for (fi = min;; fi++)
3333 {
3334 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3335 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3336 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3337 if (eptr >= md->end_subject)
3338 {
3339 SCHECK_PARTIAL();
3340 MRRETURN(MATCH_NOMATCH);
3341 }
3342 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3343 }
3344 /* Control never gets here */
3345 }
3346 else /* Maximize */
3347 {
3348 pp = eptr;
3349 for (i = min; i < max; i++)
3350 {
3351 if (eptr >= md->end_subject)
3352 {
3353 SCHECK_PARTIAL();
3354 break;
3355 }
3356 if (fc != md->lcc[*eptr]) break;
3357 eptr++;
3358 }
3359
3360 if (possessive) continue;
3361
3362 while (eptr >= pp)
3363 {
3364 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3365 eptr--;
3366 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3367 }
3368 MRRETURN(MATCH_NOMATCH);
3369 }
3370 /* Control never gets here */
3371 }
3372
3373 /* Caseful comparisons (includes all multi-byte characters) */
3374
3375 else
3376 {
3377 for (i = 1; i <= min; i++)
3378 {
3379 if (eptr >= md->end_subject)
3380 {
3381 SCHECK_PARTIAL();
3382 MRRETURN(MATCH_NOMATCH);
3383 }
3384 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3385 }
3386
3387 if (min == max) continue;
3388
3389 if (minimize)
3390 {
3391 for (fi = min;; fi++)
3392 {
3393 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3394 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3395 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3396 if (eptr >= md->end_subject)
3397 {
3398 SCHECK_PARTIAL();
3399 MRRETURN(MATCH_NOMATCH);
3400 }
3401 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3402 }
3403 /* Control never gets here */
3404 }
3405 else /* Maximize */
3406 {
3407 pp = eptr;
3408 for (i = min; i < max; i++)
3409 {
3410 if (eptr >= md->end_subject)
3411 {
3412 SCHECK_PARTIAL();
3413 break;
3414 }
3415 if (fc != *eptr) break;
3416 eptr++;
3417 }
3418 if (possessive) continue;
3419
3420 while (eptr >= pp)
3421 {
3422 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3423 eptr--;
3424 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3425 }
3426 MRRETURN(MATCH_NOMATCH);
3427 }
3428 }
3429 /* Control never gets here */
3430
3431 /* Match a negated single one-byte character. The character we are
3432 checking can be multibyte. */
3433
3434 case OP_NOT:
3435 case OP_NOTI:
3436 if (eptr >= md->end_subject)
3437 {
3438 SCHECK_PARTIAL();
3439 MRRETURN(MATCH_NOMATCH);
3440 }
3441 ecode++;
3442 GETCHARINCTEST(c, eptr);
3443 if (op == OP_NOTI) /* The caseless case */
3444 {
3445 #ifdef SUPPORT_UTF8
3446 if (c < 256)
3447 #endif
3448 c = md->lcc[c];
3449 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3450 }
3451 else /* Caseful */
3452 {
3453 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3454 }
3455 break;
3456
3457 /* Match a negated single one-byte character repeatedly. This is almost a
3458 repeat of the code for a repeated single character, but I haven't found a
3459 nice way of commoning these up that doesn't require a test of the
3460 positive/negative option for each character match. Maybe that wouldn't add
3461 very much to the time taken, but character matching *is* what this is all
3462 about... */
3463
3464 case OP_NOTEXACT:
3465 case OP_NOTEXACTI:
3466 min = max = GET2(ecode, 1);
3467 ecode += 1 + IMM2_SIZE;
3468 goto REPEATNOTCHAR;
3469
3470 case OP_NOTUPTO:
3471 case OP_NOTUPTOI:
3472 case OP_NOTMINUPTO:
3473 case OP_NOTMINUPTOI:
3474 min = 0;
3475 max = GET2(ecode, 1);
3476 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3477 ecode += 1 + IMM2_SIZE;
3478 goto REPEATNOTCHAR;
3479
3480 case OP_NOTPOSSTAR:
3481 case OP_NOTPOSSTARI:
3482 possessive = TRUE;
3483 min = 0;
3484 max = INT_MAX;
3485 ecode++;
3486 goto REPEATNOTCHAR;
3487
3488 case OP_NOTPOSPLUS:
3489 case OP_NOTPOSPLUSI:
3490 possessive = TRUE;
3491 min = 1;
3492 max = INT_MAX;
3493 ecode++;
3494 goto REPEATNOTCHAR;
3495
3496 case OP_NOTPOSQUERY:
3497 case OP_NOTPOSQUERYI:
3498 possessive = TRUE;
3499 min = 0;
3500 max = 1;
3501 ecode++;
3502 goto REPEATNOTCHAR;
3503
3504 case OP_NOTPOSUPTO:
3505 case OP_NOTPOSUPTOI:
3506 possessive = TRUE;
3507 min = 0;
3508 max = GET2(ecode, 1);
3509 ecode += 1 + IMM2_SIZE;
3510 goto REPEATNOTCHAR;
3511
3512 case OP_NOTSTAR:
3513 case OP_NOTSTARI:
3514 case OP_NOTMINSTAR:
3515 case OP_NOTMINSTARI:
3516 case OP_NOTPLUS:
3517 case OP_NOTPLUSI:
3518 case OP_NOTMINPLUS:
3519 case OP_NOTMINPLUSI:
3520 case OP_NOTQUERY:
3521 case OP_NOTQUERYI:
3522 case OP_NOTMINQUERY:
3523 case OP_NOTMINQUERYI:
3524 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3525 minimize = (c & 1) != 0;
3526 min = rep_min[c]; /* Pick up values from tables; */
3527 max = rep_max[c]; /* zero for max => infinity */
3528 if (max == 0) max = INT_MAX;
3529
3530 /* Common code for all repeated single-byte matches. */
3531
3532 REPEATNOTCHAR:
3533 fc = *ecode++;
3534
3535 /* The code is duplicated for the caseless and caseful cases, for speed,
3536 since matching characters is likely to be quite common. First, ensure the
3537 minimum number of matches are present. If min = max, continue at the same
3538 level without recursing. Otherwise, if minimizing, keep trying the rest of
3539 the expression and advancing one matching character if failing, up to the
3540 maximum. Alternatively, if maximizing, find the maximum number of
3541 characters and work backwards. */
3542
3543 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3544 max, eptr));
3545
3546 if (op >= OP_NOTSTARI) /* Caseless */
3547 {
3548 fc = md->lcc[fc];
3549
3550 #ifdef SUPPORT_UTF8
3551 /* UTF-8 mode */
3552 if (utf8)
3553 {
3554 register unsigned int d;
3555 for (i = 1; i <= min; i++)
3556 {
3557 if (eptr >= md->end_subject)
3558 {
3559 SCHECK_PARTIAL();
3560 MRRETURN(MATCH_NOMATCH);
3561 }
3562 GETCHARINC(d, eptr);
3563 if (d < 256) d = md->lcc[d];
3564 if (fc == d) MRRETURN(MATCH_NOMATCH);
3565 }
3566 }
3567 else
3568 #endif
3569
3570 /* Not UTF-8 mode */
3571 {
3572 for (i = 1; i <= min; i++)
3573 {
3574 if (eptr >= md->end_subject)
3575 {
3576 SCHECK_PARTIAL();
3577 MRRETURN(MATCH_NOMATCH);
3578 }
3579 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3580 }
3581 }
3582
3583 if (min == max) continue;
3584
3585 if (minimize)
3586 {
3587 #ifdef SUPPORT_UTF8
3588 /* UTF-8 mode */
3589 if (utf8)
3590 {
3591 register unsigned int d;
3592 for (fi = min;; fi++)
3593 {
3594 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3596 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3597 if (eptr >= md->end_subject)
3598 {
3599 SCHECK_PARTIAL();
3600 MRRETURN(MATCH_NOMATCH);
3601 }
3602 GETCHARINC(d, eptr);
3603 if (d < 256) d = md->lcc[d];
3604 if (fc == d) MRRETURN(MATCH_NOMATCH);
3605 }
3606 }
3607 else
3608 #endif
3609 /* Not UTF-8 mode */
3610 {
3611 for (fi = min;; fi++)
3612 {
3613 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3615 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3616 if (eptr >= md->end_subject)
3617 {
3618 SCHECK_PARTIAL();
3619 MRRETURN(MATCH_NOMATCH);
3620 }
3621 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3622 }
3623 }
3624 /* Control never gets here */
3625 }
3626
3627 /* Maximize case */
3628
3629 else
3630 {
3631 pp = eptr;
3632
3633 #ifdef SUPPORT_UTF8
3634 /* UTF-8 mode */
3635 if (utf8)
3636 {
3637 register unsigned int d;
3638 for (i = min; i < max; i++)
3639 {
3640 int len = 1;
3641 if (eptr >= md->end_subject)
3642 {
3643 SCHECK_PARTIAL();
3644 break;
3645 }
3646 GETCHARLEN(d, eptr, len);
3647 if (d < 256) d = md->lcc[d];
3648 if (fc == d) break;
3649 eptr += len;
3650 }
3651 if (possessive) continue;
3652 for(;;)
3653 {
3654 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3655 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3656 if (eptr-- == pp) break; /* Stop if tried at original pos */
3657 BACKCHAR(eptr);
3658 }
3659 }
3660 else
3661 #endif
3662 /* Not UTF-8 mode */
3663 {
3664 for (i = min; i < max; i++)
3665 {
3666 if (eptr >= md->end_subject)
3667 {
3668 SCHECK_PARTIAL();
3669 break;
3670 }
3671 if (fc == md->lcc[*eptr]) break;
3672 eptr++;
3673 }
3674 if (possessive) continue;
3675 while (eptr >= pp)
3676 {
3677 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3679 eptr--;
3680 }
3681 }
3682
3683 MRRETURN(MATCH_NOMATCH);
3684 }
3685 /* Control never gets here */
3686 }
3687
3688 /* Caseful comparisons */
3689
3690 else
3691 {
3692 #ifdef SUPPORT_UTF8
3693 /* UTF-8 mode */
3694 if (utf8)
3695 {
3696 register unsigned int d;
3697 for (i = 1; i <= min; i++)
3698 {
3699 if (eptr >= md->end_subject)
3700 {
3701 SCHECK_PARTIAL();
3702 MRRETURN(MATCH_NOMATCH);
3703 }
3704 GETCHARINC(d, eptr);
3705 if (fc == d) MRRETURN(MATCH_NOMATCH);
3706 }
3707 }
3708 else
3709 #endif
3710 /* Not UTF-8 mode */
3711 {
3712 for (i = 1; i <= min; i++)
3713 {
3714 if (eptr >= md->end_subject)
3715 {
3716 SCHECK_PARTIAL();
3717 MRRETURN(MATCH_NOMATCH);
3718 }
3719 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3720 }
3721 }
3722
3723 if (min == max) continue;
3724
3725 if (minimize)
3726 {
3727 #ifdef SUPPORT_UTF8
3728 /* UTF-8 mode */
3729 if (utf8)
3730 {
3731 register unsigned int d;
3732 for (fi = min;; fi++)
3733 {
3734 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3735 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3736 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3737 if (eptr >= md->end_subject)
3738 {
3739 SCHECK_PARTIAL();
3740 MRRETURN(MATCH_NOMATCH);
3741 }
3742 GETCHARINC(d, eptr);
3743 if (fc == d) MRRETURN(MATCH_NOMATCH);
3744 }
3745 }
3746 else
3747 #endif
3748 /* Not UTF-8 mode */
3749 {
3750 for (fi = min;; fi++)
3751 {
3752 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3754 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3755 if (eptr >= md->end_subject)
3756 {
3757 SCHECK_PARTIAL();
3758 MRRETURN(MATCH_NOMATCH);
3759 }
3760 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3761 }
3762 }
3763 /* Control never gets here */
3764 }
3765
3766 /* Maximize case */
3767
3768 else
3769 {
3770 pp = eptr;
3771
3772 #ifdef SUPPORT_UTF8
3773 /* UTF-8 mode */
3774 if (utf8)
3775 {
3776 register unsigned int d;
3777 for (i = min; i < max; i++)
3778 {
3779 int len = 1;
3780 if (eptr >= md->end_subject)
3781 {
3782 SCHECK_PARTIAL();
3783 break;
3784 }
3785 GETCHARLEN(d, eptr, len);
3786 if (fc == d) break;
3787 eptr += len;
3788 }
3789 if (possessive) continue;
3790 for(;;)
3791 {
3792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3794 if (eptr-- == pp) break; /* Stop if tried at original pos */
3795 BACKCHAR(eptr);
3796 }
3797 }
3798 else
3799 #endif
3800 /* Not UTF-8 mode */
3801 {
3802 for (i = min; i < max; i++)
3803 {
3804 if (eptr >= md->end_subject)
3805 {
3806 SCHECK_PARTIAL();
3807 break;
3808 }
3809 if (fc == *eptr) break;
3810 eptr++;
3811 }
3812 if (possessive) continue;
3813 while (eptr >= pp)
3814 {
3815 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3817 eptr--;
3818 }
3819 }
3820
3821 MRRETURN(MATCH_NOMATCH);
3822 }
3823 }
3824 /* Control never gets here */
3825
3826 /* Match a single character type repeatedly; several different opcodes
3827 share code. This is very similar to the code for single characters, but we
3828 repeat it in the interests of efficiency. */
3829
3830 case OP_TYPEEXACT:
3831 min = max = GET2(ecode, 1);
3832 minimize = TRUE;
3833 ecode += 1 + IMM2_SIZE;
3834 goto REPEATTYPE;
3835
3836 case OP_TYPEUPTO:
3837 case OP_TYPEMINUPTO:
3838 min = 0;
3839 max = GET2(ecode, 1);
3840 minimize = *ecode == OP_TYPEMINUPTO;
3841 ecode += 1 + IMM2_SIZE;
3842 goto REPEATTYPE;
3843
3844 case OP_TYPEPOSSTAR:
3845 possessive = TRUE;
3846 min = 0;
3847 max = INT_MAX;
3848 ecode++;
3849 goto REPEATTYPE;
3850
3851 case OP_TYPEPOSPLUS:
3852 possessive = TRUE;
3853 min = 1;
3854 max = INT_MAX;
3855 ecode++;
3856 goto REPEATTYPE;
3857
3858 case OP_TYPEPOSQUERY:
3859 possessive = TRUE;
3860 min = 0;
3861 max = 1;
3862 ecode++;
3863 goto REPEATTYPE;
3864
3865 case OP_TYPEPOSUPTO:
3866 possessive = TRUE;
3867 min = 0;
3868 max = GET2(ecode, 1);
3869 ecode += 1 + IMM2_SIZE;
3870 goto REPEATTYPE;
3871
3872 case OP_TYPESTAR:
3873 case OP_TYPEMINSTAR:
3874 case OP_TYPEPLUS:
3875 case OP_TYPEMINPLUS:
3876 case OP_TYPEQUERY:
3877 case OP_TYPEMINQUERY:
3878 c = *ecode++ - OP_TYPESTAR;
3879 minimize = (c & 1) != 0;
3880 min = rep_min[c]; /* Pick up values from tables; */
3881 max = rep_max[c]; /* zero for max => infinity */
3882 if (max == 0) max = INT_MAX;
3883
3884 /* Common code for all repeated single character type matches. Note that
3885 in UTF-8 mode, '.' matches a character of any length, but for the other
3886 character types, the valid characters are all one-byte long. */
3887
3888 REPEATTYPE:
3889 ctype = *ecode++; /* Code for the character type */
3890
3891 #ifdef SUPPORT_UCP
3892 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3893 {
3894 prop_fail_result = ctype == OP_NOTPROP;
3895 prop_type = *ecode++;
3896 prop_value = *ecode++;
3897 }
3898 else prop_type = -1;
3899 #endif
3900
3901 /* First, ensure the minimum number of matches are present. Use inline
3902 code for maximizing the speed, and do the type test once at the start
3903 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3904 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3905 and single-bytes. */
3906
3907 if (min > 0)
3908 {
3909 #ifdef SUPPORT_UCP
3910 if (prop_type >= 0)
3911 {
3912 switch(prop_type)
3913 {
3914 case PT_ANY:
3915 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3916 for (i = 1; i <= min; i++)
3917 {
3918 if (eptr >= md->end_subject)
3919 {
3920 SCHECK_PARTIAL();
3921 MRRETURN(MATCH_NOMATCH);
3922 }
3923 GETCHARINCTEST(c, eptr);
3924 }
3925 break;
3926
3927 case PT_LAMP:
3928 for (i = 1; i <= min; i++)
3929 {
3930 int chartype;
3931 if (eptr >= md->end_subject)
3932 {
3933 SCHECK_PARTIAL();
3934 MRRETURN(MATCH_NOMATCH);
3935 }
3936 GETCHARINCTEST(c, eptr);
3937 chartype = UCD_CHARTYPE(c);
3938 if ((chartype == ucp_Lu ||
3939 chartype == ucp_Ll ||
3940 chartype == ucp_Lt) == prop_fail_result)
3941 MRRETURN(MATCH_NOMATCH);
3942 }
3943 break;
3944
3945 case PT_GC:
3946 for (i = 1; i <= min; i++)
3947 {
3948 if (eptr >= md->end_subject)
3949 {
3950 SCHECK_PARTIAL();
3951 MRRETURN(MATCH_NOMATCH);
3952 }
3953 GETCHARINCTEST(c, eptr);
3954 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3955 MRRETURN(MATCH_NOMATCH);
3956 }
3957 break;
3958
3959 case PT_PC:
3960 for (i = 1; i <= min; i++)
3961 {
3962 if (eptr >= md->end_subject)
3963 {
3964 SCHECK_PARTIAL();
3965 MRRETURN(MATCH_NOMATCH);
3966 }
3967 GETCHARINCTEST(c, eptr);
3968 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3969 MRRETURN(MATCH_NOMATCH);
3970 }
3971 break;
3972
3973 case PT_SC:
3974 for (i = 1; i <= min; i++)
3975 {
3976 if (eptr >= md->end_subject)
3977 {
3978 SCHECK_PARTIAL();
3979 MRRETURN(MATCH_NOMATCH);
3980 }
3981 GETCHARINCTEST(c, eptr);
3982 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3983 MRRETURN(MATCH_NOMATCH);
3984 }
3985 break;
3986
3987 case PT_ALNUM:
3988 for (i = 1; i <= min; i++)
3989 {
3990 int category;
3991 if (eptr >= md->end_subject)
3992 {
3993 SCHECK_PARTIAL();
3994 MRRETURN(MATCH_NOMATCH);
3995 }
3996 GETCHARINCTEST(c, eptr);
3997 category = UCD_CATEGORY(c);
3998 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
3999 MRRETURN(MATCH_NOMATCH);
4000 }
4001 break;
4002
4003 case PT_SPACE: /* Perl space */
4004 for (i = 1; i <= min; i++)
4005 {
4006 if (eptr >= md->end_subject)
4007 {
4008 SCHECK_PARTIAL();
4009 MRRETURN(MATCH_NOMATCH);
4010 }
4011 GETCHARINCTEST(c, eptr);
4012 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4013 c == CHAR_FF || c == CHAR_CR)
4014 == prop_fail_result)
4015 MRRETURN(MATCH_NOMATCH);
4016 }
4017 break;
4018
4019 case PT_PXSPACE: /* POSIX space */
4020 for (i = 1; i <= min; i++)
4021 {
4022 if (eptr >= md->end_subject)
4023 {
4024 SCHECK_PARTIAL();
4025 MRRETURN(MATCH_NOMATCH);
4026 }
4027 GETCHARINCTEST(c, eptr);
4028 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4029 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4030 == prop_fail_result)
4031 MRRETURN(MATCH_NOMATCH);
4032 }
4033 break;
4034
4035 case PT_WORD:
4036 for (i = 1; i <= min; i++)
4037 {
4038 int category;
4039 if (eptr >= md->end_subject)
4040 {
4041 SCHECK_PARTIAL();
4042 MRRETURN(MATCH_NOMATCH);
4043 }
4044 GETCHARINCTEST(c, eptr);
4045 category = UCD_CATEGORY(c);
4046 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4047 == prop_fail_result)
4048 MRRETURN(MATCH_NOMATCH);
4049 }
4050 break;
4051
4052 /* This should not occur */
4053
4054 default:
4055 RRETURN(PCRE_ERROR_INTERNAL);
4056 }
4057 }
4058
4059 /* Match extended Unicode sequences. We will get here only if the
4060 support is in the binary; otherwise a compile-time error occurs. */
4061
4062 else if (ctype == OP_EXTUNI)
4063 {
4064 for (i = 1; i <= min; i++)
4065 {
4066 if (eptr >= md->end_subject)
4067 {
4068 SCHECK_PARTIAL();
4069 MRRETURN(MATCH_NOMATCH);
4070 }
4071 GETCHARINCTEST(c, eptr);
4072 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4073 while (eptr < md->end_subject)
4074 {
4075 int len = 1;
4076 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4077 if (UCD_CATEGORY(c) != ucp_M) break;
4078 eptr += len;
4079 }
4080 }
4081 }
4082
4083 else
4084 #endif /* SUPPORT_UCP */
4085
4086 /* Handle all other cases when the coding is UTF-8 */
4087
4088 #ifdef SUPPORT_UTF8
4089 if (utf8) switch(ctype)
4090 {
4091 case OP_ANY:
4092 for (i = 1; i <= min; i++)
4093 {
4094 if (eptr >= md->end_subject)
4095 {
4096 SCHECK_PARTIAL();
4097 MRRETURN(MATCH_NOMATCH);
4098 }
4099 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4100 eptr++;
4101 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4102 }
4103 break;
4104
4105 case OP_ALLANY:
4106 for (i = 1; i <= min; i++)
4107 {
4108 if (eptr >= md->end_subject)
4109 {
4110 SCHECK_PARTIAL();
4111 MRRETURN(MATCH_NOMATCH);
4112 }
4113 eptr++;
4114 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4115 }
4116 break;
4117
4118 case OP_ANYBYTE:
4119 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4120 eptr += min;
4121 break;
4122
4123 case OP_ANYNL:
4124 for (i = 1; i <= min; i++)
4125 {
4126 if (eptr >= md->end_subject)
4127 {
4128 SCHECK_PARTIAL();
4129 MRRETURN(MATCH_NOMATCH);
4130 }
4131 GETCHARINC(c, eptr);
4132 switch(c)
4133 {
4134 default: MRRETURN(MATCH_NOMATCH);
4135
4136 case 0x000d:
4137 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4138 break;
4139
4140 case 0x000a:
4141 break;
4142
4143 case 0x000b:
4144 case 0x000c:
4145 case 0x0085:
4146 case 0x2028:
4147 case 0x2029:
4148 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4149 break;
4150 }
4151 }
4152 break;
4153
4154 case OP_NOT_HSPACE:
4155 for (i = 1; i <= min; i++)
4156 {
4157 if (eptr >= md->end_subject)
4158 {
4159 SCHECK_PARTIAL();
4160 MRRETURN(MATCH_NOMATCH);
4161 }
4162 GETCHARINC(c, eptr);
4163 switch(c)
4164 {
4165 default: break;
4166 case 0x09: /* HT */
4167 case 0x20: /* SPACE */
4168 case 0xa0: /* NBSP */
4169 case 0x1680: /* OGHAM SPACE MARK */
4170 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4171 case 0x2000: /* EN QUAD */
4172 case 0x2001: /* EM QUAD */
4173 case 0x2002: /* EN SPACE */
4174 case 0x2003: /* EM SPACE */
4175 case 0x2004: /* THREE-PER-EM SPACE */
4176 case 0x2005: /* FOUR-PER-EM SPACE */
4177 case 0x2006: /* SIX-PER-EM SPACE */
4178 case 0x2007: /* FIGURE SPACE */
4179 case 0x2008: /* PUNCTUATION SPACE */
4180 case 0x2009: /* THIN SPACE */
4181 case 0x200A: /* HAIR SPACE */
4182 case 0x202f: /* NARROW NO-BREAK SPACE */
4183 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4184 case 0x3000: /* IDEOGRAPHIC SPACE */
4185 MRRETURN(MATCH_NOMATCH);
4186 }
4187 }
4188 break;
4189
4190 case OP_HSPACE:
4191 for (i = 1; i <= min; i++)
4192 {
4193 if (eptr >= md->end_subject)
4194 {
4195 SCHECK_PARTIAL();
4196 MRRETURN(MATCH_NOMATCH);
4197 }
4198 GETCHARINC(c, eptr);
4199 switch(c)
4200 {
4201 default: MRRETURN(MATCH_NOMATCH);
4202 case 0x09: /* HT */
4203 case 0x20: /* SPACE */
4204 case 0xa0: /* NBSP */
4205 case 0x1680: /* OGHAM SPACE MARK */
4206 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4207 case 0x2000: /* EN QUAD */
4208 case 0x2001: /* EM QUAD */
4209 case 0x2002: /* EN SPACE */
4210 case 0x2003: /* EM SPACE */
4211 case 0x2004: /* THREE-PER-EM SPACE */
4212 case 0x2005: /* FOUR-PER-EM SPACE */
4213 case 0x2006: /* SIX-PER-EM SPACE */
4214 case 0x2007: /* FIGURE SPACE */
4215 case 0x2008: /* PUNCTUATION SPACE */
4216 case 0x2009: /* THIN SPACE */
4217 case 0x200A: /* HAIR SPACE */
4218 case 0x202f: /* NARROW NO-BREAK SPACE */
4219 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4220 case 0x3000: /* IDEOGRAPHIC SPACE */
4221 break;
4222 }
4223 }
4224 break;
4225
4226 case OP_NOT_VSPACE:
4227 for (i = 1; i <= min; i++)
4228 {
4229 if (eptr >= md->end_subject)
4230 {
4231 SCHECK_PARTIAL();
4232 MRRETURN(MATCH_NOMATCH);
4233 }
4234 GETCHARINC(c, eptr);
4235 switch(c)
4236 {
4237 default: break;
4238 case 0x0a: /* LF */
4239 case 0x0b: /* VT */
4240 case 0x0c: /* FF */
4241 case 0x0d: /* CR */
4242 case 0x85: /* NEL */
4243 case 0x2028: /* LINE SEPARATOR */
4244 case 0x2029: /* PARAGRAPH SEPARATOR */
4245 MRRETURN(MATCH_NOMATCH);
4246 }
4247 }
4248 break;
4249
4250 case OP_VSPACE:
4251 for (i = 1; i <= min; i++)
4252 {
4253 if (eptr >= md->end_subject)
4254 {
4255 SCHECK_PARTIAL();
4256 MRRETURN(MATCH_NOMATCH);
4257 }
4258 GETCHARINC(c, eptr);
4259 switch(c)
4260 {
4261 default: MRRETURN(MATCH_NOMATCH);
4262 case 0x0a: /* LF */
4263 case 0x0b: /* VT */
4264 case 0x0c: /* FF */
4265 case 0x0d: /* CR */
4266 case 0x85: /* NEL */
4267 case 0x2028: /* LINE SEPARATOR */
4268 case 0x2029: /* PARAGRAPH SEPARATOR */
4269 break;
4270 }
4271 }
4272 break;
4273
4274 case OP_NOT_DIGIT:
4275 for (i = 1; i <= min; i++)
4276 {
4277 if (eptr >= md->end_subject)
4278 {
4279 SCHECK_PARTIAL();
4280 MRRETURN(MATCH_NOMATCH);
4281 }
4282 GETCHARINC(c, eptr);
4283 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4284 MRRETURN(MATCH_NOMATCH);
4285 }
4286 break;
4287
4288 case OP_DIGIT:
4289 for (i = 1; i <= min; i++)
4290 {
4291 if (eptr >= md->end_subject)
4292 {
4293 SCHECK_PARTIAL();
4294 MRRETURN(MATCH_NOMATCH);
4295 }
4296 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4297 MRRETURN(MATCH_NOMATCH);
4298 /* No need to skip more bytes - we know it's a 1-byte character */
4299 }
4300 break;
4301
4302 case OP_NOT_WHITESPACE:
4303 for (i = 1; i <= min; i++)
4304 {
4305 if (eptr >= md->end_subject)
4306 {
4307 SCHECK_PARTIAL();
4308 MRRETURN(MATCH_NOMATCH);
4309 }
4310 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4311 MRRETURN(MATCH_NOMATCH);
4312 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4313 }
4314 break;
4315
4316 case OP_WHITESPACE:
4317 for (i = 1; i <= min; i++)
4318 {
4319 if (eptr >= md->end_subject)
4320 {
4321 SCHECK_PARTIAL();
4322 MRRETURN(MATCH_NOMATCH);
4323 }
4324 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4325 MRRETURN(MATCH_NOMATCH);
4326 /* No need to skip more bytes - we know it's a 1-byte character */
4327 }
4328 break;
4329
4330 case OP_NOT_WORDCHAR:
4331 for (i = 1; i <= min; i++)
4332 {
4333 if (eptr >= md->end_subject)
4334 {
4335 SCHECK_PARTIAL();
4336 MRRETURN(MATCH_NOMATCH);
4337 }
4338 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4339 MRRETURN(MATCH_NOMATCH);
4340 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4341 }
4342 break;
4343
4344 case OP_WORDCHAR:
4345 for (i = 1; i <= min; i++)
4346 {
4347 if (eptr >= md->end_subject)
4348 {
4349 SCHECK_PARTIAL();
4350 MRRETURN(MATCH_NOMATCH);
4351 }
4352 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4353 MRRETURN(MATCH_NOMATCH);
4354 /* No need to skip more bytes - we know it's a 1-byte character */
4355 }
4356 break;
4357
4358 default:
4359 RRETURN(PCRE_ERROR_INTERNAL);
4360 } /* End switch(ctype) */
4361
4362 else
4363 #endif /* SUPPORT_UTF8 */
4364
4365 /* Code for the non-UTF-8 case for minimum matching of operators other
4366 than OP_PROP and OP_NOTPROP. */
4367
4368 switch(ctype)
4369 {
4370 case OP_ANY:
4371 for (i = 1; i <= min; i++)
4372 {
4373 if (eptr >= md->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 MRRETURN(MATCH_NOMATCH);
4377 }
4378 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4379 eptr++;
4380 }
4381 break;
4382
4383 case OP_ALLANY:
4384 if (eptr > md->end_subject - min)
4385 {
4386 SCHECK_PARTIAL();
4387 MRRETURN(MATCH_NOMATCH);
4388 }
4389 eptr += min;
4390 break;
4391
4392 case OP_ANYBYTE:
4393 if (eptr > md->end_subject - min)
4394 {
4395 SCHECK_PARTIAL();
4396 MRRETURN(MATCH_NOMATCH);
4397 }
4398 eptr += min;
4399 break;
4400
4401 case OP_ANYNL:
4402 for (i = 1; i <= min; i++)
4403 {
4404 if (eptr >= md->end_subject)
4405 {
4406 SCHECK_PARTIAL();
4407 MRRETURN(MATCH_NOMATCH);
4408 }
4409 switch(*eptr++)
4410 {
4411 default: MRRETURN(MATCH_NOMATCH);
4412
4413 case 0x000d:
4414 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4415 break;
4416
4417 case 0x000a:
4418 break;
4419
4420 case 0x000b:
4421 case 0x000c:
4422 case 0x0085:
4423 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4424 break;
4425 }
4426 }
4427 break;
4428
4429 case OP_NOT_HSPACE:
4430 for (i = 1; i <= min; i++)
4431 {
4432 if (eptr >= md->end_subject)
4433 {
4434 SCHECK_PARTIAL();
4435 MRRETURN(MATCH_NOMATCH);
4436 }
4437 switch(*eptr++)
4438 {
4439 default: break;
4440 case 0x09: /* HT */
4441 case 0x20: /* SPACE */
4442 case 0xa0: /* NBSP */
4443 MRRETURN(MATCH_NOMATCH);
4444 }
4445 }
4446 break;
4447
4448 case OP_HSPACE:
4449 for (i = 1; i <= min; i++)
4450 {
4451 if (eptr >= md->end_subject)
4452 {
4453 SCHECK_PARTIAL();
4454 MRRETURN(MATCH_NOMATCH);
4455 }
4456 switch(*eptr++)
4457 {
4458 default: MRRETURN(MATCH_NOMATCH);
4459 case 0x09: /* HT */
4460 case 0x20: /* SPACE */
4461 case 0xa0: /* NBSP */
4462 break;
4463 }
4464 }
4465 break;
4466
4467 case OP_NOT_VSPACE:
4468 for (i = 1; i <= min; i++)
4469 {
4470 if (eptr >= md->end_subject)
4471 {
4472 SCHECK_PARTIAL();
4473 MRRETURN(MATCH_NOMATCH);
4474 }
4475 switch(*eptr++)
4476 {
4477 default: break;
4478 case 0x0a: /* LF */
4479 case 0x0b: /* VT */
4480 case 0x0c: /* FF */
4481 case 0x0d: /* CR */
4482 case 0x85: /* NEL */
4483 MRRETURN(MATCH_NOMATCH);
4484 }
4485 }
4486 break;
4487
4488 case OP_VSPACE:
4489 for (i = 1; i <= min; i++)
4490 {
4491 if (eptr >= md->end_subject)
4492 {
4493 SCHECK_PARTIAL();
4494 MRRETURN(MATCH_NOMATCH);
4495 }
4496 switch(*eptr++)
4497 {
4498 default: MRRETURN(MATCH_NOMATCH);
4499 case 0x0a: /* LF */
4500 case 0x0b: /* VT */
4501 case 0x0c: /* FF */
4502 case 0x0d: /* CR */
4503 case 0x85: /* NEL */
4504 break;
4505 }
4506 }
4507 break;
4508
4509 case OP_NOT_DIGIT:
4510 for (i = 1; i <= min; i++)
4511 {
4512 if (eptr >= md->end_subject)
4513 {
4514 SCHECK_PARTIAL();
4515 MRRETURN(MATCH_NOMATCH);
4516 }
4517 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4518 }
4519 break;
4520
4521 case OP_DIGIT:
4522 for (i = 1; i <= min; i++)
4523 {
4524 if (eptr >= md->end_subject)
4525 {
4526 SCHECK_PARTIAL();
4527 MRRETURN(MATCH_NOMATCH);
4528 }
4529 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4530 }
4531 break;
4532
4533 case OP_NOT_WHITESPACE:
4534 for (i = 1; i <= min; i++)
4535 {
4536 if (eptr >= md->end_subject)
4537 {
4538 SCHECK_PARTIAL();
4539 MRRETURN(MATCH_NOMATCH);
4540 }
4541 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4542 }
4543 break;
4544
4545 case OP_WHITESPACE:
4546 for (i = 1; i <= min; i++)
4547 {
4548 if (eptr >= md->end_subject)
4549 {
4550 SCHECK_PARTIAL();
4551 MRRETURN(MATCH_NOMATCH);
4552 }
4553 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4554 }
4555 break;
4556
4557 case OP_NOT_WORDCHAR:
4558 for (i = 1; i <= min; i++)
4559 {
4560 if (eptr >= md->end_subject)
4561 {
4562 SCHECK_PARTIAL();
4563 MRRETURN(MATCH_NOMATCH);
4564 }
4565 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4566 MRRETURN(MATCH_NOMATCH);
4567 }
4568 break;
4569
4570 case OP_WORDCHAR:
4571 for (i = 1; i <= min; i++)
4572 {
4573 if (eptr >= md->end_subject)
4574 {
4575 SCHECK_PARTIAL();
4576 MRRETURN(MATCH_NOMATCH);
4577 }
4578 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4579 MRRETURN(MATCH_NOMATCH);
4580 }
4581 break;
4582
4583 default:
4584 RRETURN(PCRE_ERROR_INTERNAL);
4585 }
4586 }
4587
4588 /* If min = max, continue at the same level without recursing */
4589
4590 if (min == max) continue;
4591
4592 /* If minimizing, we have to test the rest of the pattern before each
4593 subsequent match. Again, separate the UTF-8 case for speed, and also
4594 separate the UCP cases. */
4595
4596 if (minimize)
4597 {
4598 #ifdef SUPPORT_UCP
4599 if (prop_type >= 0)
4600 {
4601 switch(prop_type)
4602 {
4603 case PT_ANY:
4604 for (fi = min;; fi++)
4605 {
4606 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4608 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4609 if (eptr >= md->end_subject)
4610 {
4611 SCHECK_PARTIAL();
4612 MRRETURN(MATCH_NOMATCH);
4613 }
4614 GETCHARINCTEST(c, eptr);
4615 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4616 }
4617 /* Control never gets here */
4618
4619 case PT_LAMP:
4620 for (fi = min;; fi++)
4621 {
4622 int chartype;
4623 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4624 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4625 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4626 if (eptr >= md->end_subject)
4627 {
4628 SCHECK_PARTIAL();
4629 MRRETURN(MATCH_NOMATCH);
4630 }
4631 GETCHARINCTEST(c, eptr);
4632 chartype = UCD_CHARTYPE(c);
4633 if ((chartype == ucp_Lu ||
4634 chartype == ucp_Ll ||
4635 chartype == ucp_Lt) == prop_fail_result)
4636 MRRETURN(MATCH_NOMATCH);
4637 }
4638 /* Control never gets here */
4639
4640 case PT_GC:
4641 for (fi = min;; fi++)
4642 {
4643 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4644 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4645 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4646 if (eptr >= md->end_subject)
4647 {
4648 SCHECK_PARTIAL();
4649 MRRETURN(MATCH_NOMATCH);
4650 }
4651 GETCHARINCTEST(c, eptr);
4652 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4653 MRRETURN(MATCH_NOMATCH);
4654 }
4655 /* Control never gets here */
4656
4657 case PT_PC:
4658 for (fi = min;; fi++)
4659 {
4660 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4662 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4663 if (eptr >= md->end_subject)
4664 {
4665 SCHECK_PARTIAL();
4666 MRRETURN(MATCH_NOMATCH);
4667 }
4668 GETCHARINCTEST(c, eptr);
4669 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4670 MRRETURN(MATCH_NOMATCH);
4671 }
4672 /* Control never gets here */
4673
4674 case PT_SC:
4675 for (fi = min;; fi++)
4676 {
4677 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4679 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4680 if (eptr >= md->end_subject)
4681 {
4682 SCHECK_PARTIAL();
4683 MRRETURN(MATCH_NOMATCH);
4684 }
4685 GETCHARINCTEST(c, eptr);
4686 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4687 MRRETURN(MATCH_NOMATCH);
4688 }
4689 /* Control never gets here */
4690
4691 case PT_ALNUM:
4692 for (fi = min;; fi++)
4693 {
4694 int category;
4695 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4696 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4697 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4698 if (eptr >= md->end_subject)
4699 {
4700 SCHECK_PARTIAL();
4701 MRRETURN(MATCH_NOMATCH);
4702 }
4703 GETCHARINCTEST(c, eptr);
4704 category = UCD_CATEGORY(c);
4705 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4706 MRRETURN(MATCH_NOMATCH);
4707 }
4708 /* Control never gets here */
4709
4710 case PT_SPACE: /* Perl space */
4711 for (fi = min;; fi++)
4712 {
4713 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4714 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4715 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4716 if (eptr >= md->end_subject)
4717 {
4718 SCHECK_PARTIAL();
4719 MRRETURN(MATCH_NOMATCH);
4720 }
4721 GETCHARINCTEST(c, eptr);
4722 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4723 c == CHAR_FF || c == CHAR_CR)
4724 == prop_fail_result)
4725 MRRETURN(MATCH_NOMATCH);
4726 }
4727 /* Control never gets here */
4728
4729 case PT_PXSPACE: /* POSIX space */
4730 for (fi = min;; fi++)
4731 {
4732 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4733 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4734 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4735 if (eptr >= md->end_subject)
4736 {
4737 SCHECK_PARTIAL();
4738 MRRETURN(MATCH_NOMATCH);
4739 }
4740 GETCHARINCTEST(c, eptr);
4741 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4742 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4743 == prop_fail_result)
4744 MRRETURN(MATCH_NOMATCH);
4745 }
4746 /* Control never gets here */
4747
4748 case PT_WORD:
4749 for (fi = min;; fi++)
4750 {
4751 int category;
4752 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4754 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4755 if (eptr >= md->end_subject)
4756 {
4757 SCHECK_PARTIAL();
4758 MRRETURN(MATCH_NOMATCH);
4759 }
4760 GETCHARINCTEST(c, eptr);
4761 category = UCD_CATEGORY(c);
4762 if ((category == ucp_L ||
4763 category == ucp_N ||
4764 c == CHAR_UNDERSCORE)
4765 == prop_fail_result)
4766 MRRETURN(MATCH_NOMATCH);
4767 }
4768 /* Control never gets here */
4769
4770 /* This should never occur */
4771
4772 default:
4773 RRETURN(PCRE_ERROR_INTERNAL);
4774 }
4775 }
4776
4777 /* Match extended Unicode sequences. We will get here only if the
4778 support is in the binary; otherwise a compile-time error occurs. */
4779
4780 else if (ctype == OP_EXTUNI)
4781 {
4782 for (fi = min;; fi++)
4783 {
4784 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4786 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4787 if (eptr >= md->end_subject)
4788 {
4789 SCHECK_PARTIAL();
4790 MRRETURN(MATCH_NOMATCH);
4791 }
4792 GETCHARINCTEST(c, eptr);
4793 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4794 while (eptr < md->end_subject)
4795 {
4796 int len = 1;
4797 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4798 if (UCD_CATEGORY(c) != ucp_M) break;
4799 eptr += len;
4800 }
4801 }
4802 }
4803 else
4804 #endif /* SUPPORT_UCP */
4805
4806 #ifdef SUPPORT_UTF8
4807 /* UTF-8 mode */
4808 if (utf8)
4809 {
4810 for (fi = min;; fi++)
4811 {
4812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4814 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4815 if (eptr >= md->end_subject)
4816 {
4817 SCHECK_PARTIAL();
4818 MRRETURN(MATCH_NOMATCH);
4819 }
4820 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4821 MRRETURN(MATCH_NOMATCH);
4822 GETCHARINC(c, eptr);
4823 switch(ctype)
4824 {
4825 case OP_ANY: /* This is the non-NL case */
4826 case OP_ALLANY:
4827 case OP_ANYBYTE:
4828 break;
4829
4830 case OP_ANYNL:
4831 switch(c)
4832 {
4833 default: MRRETURN(MATCH_NOMATCH);
4834 case 0x000d:
4835 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4836 break;
4837 case 0x000a:
4838 break;
4839
4840 case 0x000b:
4841 case 0x000c:
4842 case 0x0085:
4843 case 0x2028:
4844 case 0x2029:
4845 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4846 break;
4847 }
4848 break;
4849
4850 case OP_NOT_HSPACE:
4851 switch(c)
4852 {
4853 default: break;
4854 case 0x09: /* HT */
4855 case 0x20: /* SPACE */
4856 case 0xa0: /* NBSP */
4857 case 0x1680: /* OGHAM SPACE MARK */
4858 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4859 case 0x2000: /* EN QUAD */
4860 case 0x2001: /* EM QUAD */
4861 case 0x2002: /* EN SPACE */
4862 case 0x2003: /* EM SPACE */
4863 case 0x2004: /* THREE-PER-EM SPACE */
4864 case 0x2005: /* FOUR-PER-EM SPACE */
4865 case 0x2006: /* SIX-PER-EM SPACE */
4866 case 0x2007: /* FIGURE SPACE */
4867 case 0x2008: /* PUNCTUATION SPACE */
4868 case 0x2009: /* THIN SPACE */
4869 case 0x200A: /* HAIR SPACE */
4870 case 0x202f: /* NARROW NO-BREAK SPACE */
4871 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4872 case 0x3000: /* IDEOGRAPHIC SPACE */
4873 MRRETURN(MATCH_NOMATCH);
4874 }
4875 break;
4876
4877 case OP_HSPACE:
4878 switch(c)
4879 {
4880 default: MRRETURN(MATCH_NOMATCH);
4881 case 0x09: /* HT */
4882 case 0x20: /* SPACE */
4883 case 0xa0: /* NBSP */
4884 case 0x1680: /* OGHAM SPACE MARK */
4885 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4886 case 0x2000: /* EN QUAD */
4887 case 0x2001: /* EM QUAD */
4888 case 0x2002: /* EN SPACE */
4889 case 0x2003: /* EM SPACE */
4890 case 0x2004: /* THREE-PER-EM SPACE */
4891 case 0x2005: /* FOUR-PER-EM SPACE */
4892 case 0x2006: /* SIX-PER-EM SPACE */
4893 case 0x2007: /* FIGURE SPACE */
4894 case 0x2008: /* PUNCTUATION SPACE */
4895 case 0x2009: /* THIN SPACE */
4896 case 0x200A: /* HAIR SPACE */
4897 case 0x202f: /* NARROW NO-BREAK SPACE */
4898 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4899 case 0x3000: /* IDEOGRAPHIC SPACE */
4900 break;
4901 }
4902 break;
4903
4904 case OP_NOT_VSPACE:
4905 switch(c)
4906 {
4907 default: break;
4908 case 0x0a: /* LF */
4909 case 0x0b: /* VT */
4910 case 0x0c: /* FF */
4911 case 0x0d: /* CR */
4912 case 0x85: /* NEL */
4913 case 0x2028: /* LINE SEPARATOR */
4914 case 0x2029: /* PARAGRAPH SEPARATOR */
4915 MRRETURN(MATCH_NOMATCH);
4916 }
4917 break;
4918
4919 case OP_VSPACE:
4920 switch(c)
4921 {
4922 default: MRRETURN(MATCH_NOMATCH);
4923 case 0x0a: /* LF */
4924 case 0x0b: /* VT */
4925 case 0x0c: /* FF */
4926 case 0x0d: /* CR */
4927 case 0x85: /* NEL */
4928 case 0x2028: /* LINE SEPARATOR */
4929 case 0x2029: /* PARAGRAPH SEPARATOR */
4930 break;
4931 }
4932 break;
4933
4934 case OP_NOT_DIGIT:
4935 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4936 MRRETURN(MATCH_NOMATCH);
4937 break;
4938
4939 case OP_DIGIT:
4940 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4941 MRRETURN(MATCH_NOMATCH);
4942 break;
4943
4944 case OP_NOT_WHITESPACE:
4945 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4946 MRRETURN(MATCH_NOMATCH);
4947 break;
4948
4949 case OP_WHITESPACE:
4950 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4951 MRRETURN(MATCH_NOMATCH);
4952 break;
4953
4954 case OP_NOT_WORDCHAR:
4955 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4956 MRRETURN(MATCH_NOMATCH);
4957 break;
4958
4959 case OP_WORDCHAR:
4960 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4961 MRRETURN(MATCH_NOMATCH);
4962 break;
4963
4964 default:
4965 RRETURN(PCRE_ERROR_INTERNAL);
4966 }
4967 }
4968 }
4969 else
4970 #endif
4971 /* Not UTF-8 mode */
4972 {
4973 for (fi = min;; fi++)
4974 {
4975 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4976 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4977 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4978 if (eptr >= md->end_subject)
4979 {
4980 SCHECK_PARTIAL();
4981 MRRETURN(MATCH_NOMATCH);
4982 }
4983 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4984 MRRETURN(MATCH_NOMATCH);
4985 c = *eptr++;
4986 switch(ctype)
4987 {
4988 case OP_ANY: /* This is the non-NL case */
4989 case OP_ALLANY:
4990 case OP_ANYBYTE:
4991 break;
4992
4993 case OP_ANYNL:
4994 switch(c)
4995 {
4996 default: MRRETURN(MATCH_NOMATCH);
4997 case 0x000d:
4998 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4999 break;
5000
5001 case 0x000a:
5002 break;
5003
5004 case 0x000b:
5005 case 0x000c:
5006 case 0x0085:
5007 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
5008 break;
5009 }
5010 break;
5011
5012 case OP_NOT_HSPACE:
5013 switch(c)
5014 {
5015 default: break;
5016 case 0x09: /* HT */
5017 case 0x20: /* SPACE */
5018 case 0xa0: /* NBSP */
5019 MRRETURN(MATCH_NOMATCH);
5020 }
5021 break;
5022
5023 case OP_HSPACE:
5024 switch(c)
5025 {
5026 default: MRRETURN(MATCH_NOMATCH);
5027 case 0x09: /* HT */
5028 case 0x20: /* SPACE */
5029 case 0xa0: /* NBSP */
5030 break;
5031 }
5032 break;
5033
5034 case OP_NOT_VSPACE:
5035 switch(c)
5036 {
5037 default: break;
5038 case 0x0a: /* LF */
5039 case 0x0b: /* VT */
5040 case 0x0c: /* FF */
5041 case 0x0d: /* CR */
5042 case 0x85: /* NEL */
5043 MRRETURN(MATCH_NOMATCH);
5044 }
5045 break;
5046
5047 case OP_VSPACE:
5048 switch(c)
5049 {
5050 default: MRRETURN(MATCH_NOMATCH);
5051 case 0x0a: /* LF */
5052 case 0x0b: /* VT */
5053 case 0x0c: /* FF */
5054 case 0x0d: /* CR */
5055 case 0x85: /* NEL */
5056 break;
5057 }
5058 break;
5059
5060 case OP_NOT_DIGIT:
5061 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
5062 break;
5063
5064 case OP_DIGIT:
5065 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
5066 break;
5067
5068 case OP_NOT_WHITESPACE:
5069 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
5070 break;
5071
5072 case OP_WHITESPACE:
5073 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
5074 break;
5075
5076 case OP_NOT_WORDCHAR:
5077 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
5078 break;
5079
5080 case OP_WORDCHAR:
5081 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
5082 break;
5083
5084 default:
5085 RRETURN(PCRE_ERROR_INTERNAL);
5086 }
5087 }
5088 }
5089 /* Control never gets here */
5090 }
5091
5092 /* If maximizing, it is worth using inline code for speed, doing the type
5093 test once at the start (i.e. keep it out of the loop). Again, keep the
5094 UTF-8 and UCP stuff separate. */
5095
5096 else
5097 {
5098 pp = eptr; /* Remember where we started */
5099
5100 #ifdef SUPPORT_UCP
5101 if (prop_type >= 0)
5102 {
5103 switch(prop_type)
5104 {
5105 case PT_ANY:
5106 for (i = min; i < max; i++)
5107 {
5108 int len = 1;
5109 if (eptr >= md->end_subject)
5110 {
5111 SCHECK_PARTIAL();
5112 break;
5113 }
5114 GETCHARLENTEST(c, eptr, len);
5115 if (prop_fail_result) break;
5116 eptr+= len;
5117 }
5118 break;
5119
5120 case PT_LAMP:
5121 for (i = min; i < max; i++)
5122 {
5123 int chartype;
5124 int len = 1;
5125 if (eptr >= md->end_subject)
5126 {
5127 SCHECK_PARTIAL();
5128 break;
5129 }
5130 GETCHARLENTEST(c, eptr, len);
5131 chartype = UCD_CHARTYPE(c);
5132 if ((chartype == ucp_Lu ||
5133 chartype == ucp_Ll ||
5134 chartype == ucp_Lt) == prop_fail_result)
5135 break;
5136 eptr+= len;
5137 }
5138 break;
5139
5140 case PT_GC:
5141 for (i = min; i < max; i++)
5142 {
5143 int len = 1;
5144 if (eptr >= md->end_subject)
5145 {
5146 SCHECK_PARTIAL();
5147 break;
5148 }
5149 GETCHARLENTEST(c, eptr, len);
5150 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5151 eptr+= len;
5152 }
5153 break;
5154
5155 case PT_PC:
5156 for (i = min; i < max; i++)
5157 {
5158 int len = 1;
5159 if (eptr >= md->end_subject)
5160 {
5161 SCHECK_PARTIAL();
5162 break;
5163 }
5164 GETCHARLENTEST(c, eptr, len);
5165 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5166 eptr+= len;
5167 }
5168 break;
5169
5170 case PT_SC:
5171 for (i = min; i < max; i++)
5172 {
5173 int len = 1;
5174 if (eptr >= md->end_subject)
5175 {
5176 SCHECK_PARTIAL();
5177 break;
5178 }
5179 GETCHARLENTEST(c, eptr, len);
5180 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5181 eptr+= len;
5182 }
5183 break;
5184
5185 case PT_ALNUM:
5186 for (i = min; i < max; i++)
5187 {
5188 int category;
5189 int len = 1;
5190 if (eptr >= md->end_subject)
5191 {
5192 SCHECK_PARTIAL();
5193 break;
5194 }
5195 GETCHARLENTEST(c, eptr, len);
5196 category = UCD_CATEGORY(c);
5197 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5198 break;
5199 eptr+= len;
5200 }
5201 break;
5202
5203 case PT_SPACE: /* Perl space */
5204 for (i = min; i < max; i++)
5205 {
5206 int len = 1;
5207 if (eptr >= md->end_subject)
5208 {
5209 SCHECK_PARTIAL();
5210 break;
5211 }
5212 GETCHARLENTEST(c, eptr, len);
5213 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5214 c == CHAR_FF || c == CHAR_CR)
5215 == prop_fail_result)
5216 break;
5217 eptr+= len;
5218 }
5219 break;
5220
5221 case PT_PXSPACE: /* POSIX space */
5222 for (i = min; i < max; i++)
5223 {
5224 int len = 1;
5225 if (eptr >= md->end_subject)
5226 {
5227 SCHECK_PARTIAL();
5228 break;
5229 }
5230 GETCHARLENTEST(c, eptr, len);
5231 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5232 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5233 == prop_fail_result)
5234 break;
5235 eptr+= len;
5236 }
5237 break;
5238
5239 case PT_WORD:
5240 for (i = min; i < max; i++)
5241 {
5242 int category;
5243 int len = 1;
5244 if (eptr >= md->end_subject)
5245 {
5246 SCHECK_PARTIAL();
5247 break;
5248 }
5249 GETCHARLENTEST(c, eptr, len);
5250 category = UCD_CATEGORY(c);
5251 if ((category == ucp_L || category == ucp_N ||
5252 c == CHAR_UNDERSCORE) == prop_fail_result)
5253 break;
5254 eptr+= len;
5255 }
5256 break;
5257
5258 default:
5259 RRETURN(PCRE_ERROR_INTERNAL);
5260 }
5261
5262 /* eptr is now past the end of the maximum run */
5263
5264 if (possessive) continue;
5265 for(;;)
5266 {
5267 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5268 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5269 if (eptr-- == pp) break; /* Stop if tried at original pos */
5270 if (utf8) BACKCHAR(eptr);
5271 }
5272 }
5273
5274 /* Match extended Unicode sequences. We will get here only if the
5275 support is in the binary; otherwise a compile-time error occurs. */
5276
5277 else if (ctype == OP_EXTUNI)
5278 {
5279 for (i = min; i < max; i++)
5280 {
5281 int len = 1;
5282 if (eptr >= md->end_subject)
5283 {
5284 SCHECK_PARTIAL();
5285 break;
5286 }
5287 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5288 if (UCD_CATEGORY(c) == ucp_M) break;
5289 eptr += len;
5290 while (eptr < md->end_subject)
5291 {
5292 len = 1;
5293 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5294 if (UCD_CATEGORY(c) != ucp_M) break;
5295 eptr += len;
5296 }
5297 }
5298
5299 /* eptr is now past the end of the maximum run */
5300
5301 if (possessive) continue;
5302
5303 for(;;)
5304 {
5305 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5306 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5307 if (eptr-- == pp) break; /* Stop if tried at original pos */
5308 for (;;) /* Move back over one extended */
5309 {
5310 if (!utf8) c = *eptr; else
5311 {
5312 BACKCHAR(eptr);
5313 GETCHAR(c, eptr);
5314 }
5315 if (UCD_CATEGORY(c) != ucp_M) break;
5316 eptr--;
5317 }
5318 }
5319 }
5320
5321 else
5322 #endif /* SUPPORT_UCP */
5323
5324 #ifdef SUPPORT_UTF8
5325 /* UTF-8 mode */
5326
5327 if (utf8)
5328 {
5329 switch(ctype)
5330 {
5331 case OP_ANY:
5332 if (max < INT_MAX)
5333 {
5334 for (i = min; i < max; i++)
5335 {
5336 if (eptr >= md->end_subject)
5337 {
5338 SCHECK_PARTIAL();
5339 break;
5340 }
5341 if (IS_NEWLINE(eptr)) break;
5342 eptr++;
5343 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5344 }
5345 }
5346
5347 /* Handle unlimited UTF-8 repeat */
5348
5349 else
5350 {
5351 for (i = min; i < max; i++)
5352 {
5353 if (eptr >= md->end_subject)
5354 {
5355 SCHECK_PARTIAL();
5356 break;
5357 }
5358 if (IS_NEWLINE(eptr)) break;
5359 eptr++;
5360 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5361 }
5362 }
5363 break;
5364
5365 case OP_ALLANY:
5366 if (max < INT_MAX)
5367 {
5368 for (i = min; i < max; i++)
5369 {
5370 if (eptr >= md->end_subject)
5371 {
5372 SCHECK_PARTIAL();
5373 break;
5374 }
5375 eptr++;
5376 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5377 }
5378 }
5379 else
5380 {
5381 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5382 SCHECK_PARTIAL();
5383 }
5384 break;
5385
5386 /* The byte case is the same as non-UTF8 */
5387
5388 case OP_ANYBYTE:
5389 c = max - min;
5390 if (c > (unsigned int)(md->end_subject - eptr))
5391 {
5392 eptr = md->end_subject;
5393 SCHECK_PARTIAL();
5394 }
5395 else eptr += c;
5396 break;
5397
5398 case OP_ANYNL:
5399 for (i = min; i < max; i++)
5400 {
5401 int len = 1;
5402 if (eptr >= md->end_subject)
5403 {
5404 SCHECK_PARTIAL();
5405 break;
5406 }
5407 GETCHARLEN(c, eptr, len);
5408 if (c == 0x000d)
5409 {
5410 if (++eptr >= md->end_subject) break;
5411 if (*eptr == 0x000a) eptr++;
5412 }
5413 else
5414 {
5415 if (c != 0x000a &&
5416 (md->bsr_anycrlf ||
5417 (c != 0x000b && c != 0x000c &&
5418 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5419 break;
5420 eptr += len;
5421 }
5422 }
5423 break;
5424
5425 case OP_NOT_HSPACE:
5426 case OP_HSPACE:
5427 for (i = min; i < max; i++)
5428 {
5429 BOOL gotspace;
5430 int len = 1;
5431 if (eptr >= md->end_subject)
5432 {
5433 SCHECK_PARTIAL();
5434 break;
5435 }
5436 GETCHARLEN(c, eptr, len);
5437 switch(c)
5438 {
5439 default: gotspace = FALSE; break;
5440 case 0x09: /* HT */
5441 case 0x20: /* SPACE */
5442 case 0xa0: /* NBSP */
5443 case 0x1680: /* OGHAM SPACE MARK */
5444 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5445 case 0x2000: /* EN QUAD */
5446 case 0x2001: /* EM QUAD */
5447 case 0x2002: /* EN SPACE */
5448 case 0x2003: /* EM SPACE */
5449 case 0x2004: /* THREE-PER-EM SPACE */
5450 case 0x2005: /* FOUR-PER-EM SPACE */
5451 case 0x2006: /* SIX-PER-EM SPACE */
5452 case 0x2007: /* FIGURE SPACE */
5453 case 0x2008: /* PUNCTUATION SPACE */
5454 case 0x2009: /* THIN SPACE */
5455 case 0x200A: /* HAIR SPACE */
5456 case 0x202f: /* NARROW NO-BREAK SPACE */
5457 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5458 case 0x3000: /* IDEOGRAPHIC SPACE */
5459 gotspace = TRUE;
5460 break;
5461 }
5462 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5463 eptr += len;
5464 }
5465 break;
5466
5467 case OP_NOT_VSPACE:
5468 case OP_VSPACE:
5469 for (i = min; i < max; i++)
5470 {
5471 BOOL gotspace;
5472 int len = 1;
5473 if (eptr >= md->end_subject)
5474 {
5475 SCHECK_PARTIAL();
5476 break;
5477 }
5478 GETCHARLEN(c, eptr, len);
5479 switch(c)
5480 {
5481 default: gotspace = FALSE; break;
5482 case 0x0a: /* LF */
5483 case 0x0b: /* VT */
5484 case 0x0c: /* FF */
5485 case 0x0d: /* CR */
5486 case 0x85: /* NEL */
5487 case 0x2028: /* LINE SEPARATOR */
5488 case 0x2029: /* PARAGRAPH SEPARATOR */
5489 gotspace = TRUE;
5490 break;
5491 }
5492 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5493 eptr += len;
5494 }
5495 break;
5496
5497 case OP_NOT_DIGIT:
5498 for (i = min; i < max; i++)
5499 {
5500 int len = 1;
5501 if (eptr >= md->end_subject)
5502 {
5503 SCHECK_PARTIAL();
5504 break;
5505 }
5506 GETCHARLEN(c, eptr, len);
5507 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5508 eptr+= len;
5509 }
5510 break;
5511
5512 case OP_DIGIT:
5513 for (i = min; i < max; i++)
5514 {
5515 int len = 1;
5516 if (eptr >= md->end_subject)
5517 {
5518 SCHECK_PARTIAL();
5519 break;
5520 }
5521 GETCHARLEN(c, eptr, len);
5522 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5523 eptr+= len;
5524 }
5525 break;
5526
5527 case OP_NOT_WHITESPACE:
5528 for (i = min; i < max; i++)
5529 {
5530 int len = 1;
5531 if (eptr >= md->end_subject)
5532 {
5533 SCHECK_PARTIAL();
5534 break;
5535 }
5536 GETCHARLEN(c, eptr, len);
5537 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5538 eptr+= len;
5539 }
5540 break;
5541
5542 case OP_WHITESPACE:
5543 for (i = min; i < max; i++)
5544 {
5545 int len = 1;
5546 if (eptr >= md->end_subject)
5547 {
5548 SCHECK_PARTIAL();
5549 break;
5550 }
5551 GETCHARLEN(c, eptr, len);
5552 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5553 eptr+= len;
5554 }
5555 break;
5556
5557 case OP_NOT_WORDCHAR:
5558 for (i = min; i < max; i++)
5559 {
5560 int len = 1;
5561 if (eptr >= md->end_subject)
5562 {
5563 SCHECK_PARTIAL();
5564 break;
5565 }
5566 GETCHARLEN(c, eptr, len);
5567 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5568 eptr+= len;
5569 }
5570 break;
5571
5572 case OP_WORDCHAR:
5573 for (i = min; i < max; i++)
5574 {
5575 int len = 1;
5576 if (eptr >= md->end_subject)
5577 {
5578 SCHECK_PARTIAL();
5579 break;
5580 }
5581 GETCHARLEN(c, eptr, len);
5582 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5583 eptr+= len;
5584 }
5585 break;
5586
5587 default:
5588 RRETURN(PCRE_ERROR_INTERNAL);
5589 }
5590
5591 /* eptr is now past the end of the maximum run. If possessive, we are
5592 done (no backing up). Otherwise, match at this position; anything other
5593 than no match is immediately returned. For nomatch, back up one
5594 character, unless we are matching \R and the last thing matched was
5595 \r\n, in which case, back up two bytes. */
5596
5597 if (possessive) continue;
5598 for(;;)
5599 {
5600 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5601 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5602 if (eptr-- == pp) break; /* Stop if tried at original pos */
5603 BACKCHAR(eptr);
5604 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5605 eptr[-1] == '\r') eptr--;
5606 }
5607 }
5608 else
5609 #endif /* SUPPORT_UTF8 */
5610
5611 /* Not UTF-8 mode */
5612 {
5613 switch(ctype)
5614 {
5615 case OP_ANY:
5616 for (i = min; i < max; i++)
5617 {
5618 if (eptr >= md->end_subject)
5619 {
5620 SCHECK_PARTIAL();
5621 break;
5622 }
5623 if (IS_NEWLINE(eptr)) break;
5624 eptr++;
5625 }
5626 break;
5627
5628 case OP_ALLANY:
5629 case OP_ANYBYTE:
5630 c = max - min;
5631 if (c > (unsigned int)(md->end_subject - eptr))
5632 {
5633 eptr = md->end_subject;
5634 SCHECK_PARTIAL();
5635 }
5636 else eptr += c;
5637 break;
5638
5639 case OP_ANYNL:
5640 for (i = min; i < max; i++)
5641 {
5642 if (eptr >= md->end_subject)
5643 {
5644 SCHECK_PARTIAL();
5645 break;
5646 }
5647 c = *eptr;
5648 if (c == 0x000d)
5649 {
5650 if (++eptr >= md->end_subject) break;
5651 if (*eptr == 0x000a) eptr++;
5652 }
5653 else
5654 {
5655 if (c != 0x000a &&
5656 (md->bsr_anycrlf ||
5657 (c != 0x000b && c != 0x000c && c != 0x0085)))
5658 break;
5659 eptr++;
5660 }
5661 }
5662 break;
5663
5664 case OP_NOT_HSPACE:
5665 for (i = min; i < max; i++)
5666 {
5667 if (eptr >= md->end_subject)
5668 {
5669 SCHECK_PARTIAL();
5670 break;
5671 }
5672 c = *eptr;
5673 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5674 eptr++;
5675 }
5676 break;
5677
5678 case OP_HSPACE:
5679 for (i = min; i < max; i++)
5680 {
5681 if (eptr >= md->end_subject)
5682 {
5683 SCHECK_PARTIAL();
5684 break;
5685 }
5686 c = *eptr;
5687 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5688 eptr++;
5689 }
5690 break;
5691
5692 case OP_NOT_VSPACE:
5693 for (i = min; i < max; i++)
5694 {
5695 if (eptr >= md->end_subject)
5696 {
5697 SCHECK_PARTIAL();
5698 break;
5699 }
5700 c = *eptr;
5701 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5702 break;
5703 eptr++;
5704 }
5705 break;
5706
5707 case OP_VSPACE:
5708 for (i = min; i < max; i++)
5709 {
5710 if (eptr >= md->end_subject)
5711 {
5712 SCHECK_PARTIAL();
5713 break;
5714 }
5715 c = *eptr;
5716 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5717 break;
5718 eptr++;
5719 }
5720 break;
5721
5722 case OP_NOT_DIGIT:
5723 for (i = min; i < max; i++)
5724 {
5725 if (eptr >= md->end_subject)
5726 {
5727 SCHECK_PARTIAL();
5728 break;
5729 }
5730 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5731 eptr++;
5732 }
5733 break;
5734
5735 case OP_DIGIT:
5736 for (i = min; i < max; i++)
5737 {
5738 if (eptr >= md->end_subject)
5739 {
5740 SCHECK_PARTIAL();
5741 break;
5742 }
5743 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5744 eptr++;
5745 }
5746 break;
5747
5748 case OP_NOT_WHITESPACE:
5749 for (i = min; i < max; i++)
5750 {
5751 if (eptr >= md->end_subject)
5752 {
5753 SCHECK_PARTIAL();
5754 break;
5755 }
5756 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5757 eptr++;
5758 }
5759 break;
5760
5761 case OP_WHITESPACE:
5762 for (i = min; i < max; i++)
5763 {
5764 if (eptr >= md->end_subject)
5765 {
5766 SCHECK_PARTIAL();
5767 break;
5768 }
5769 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5770 eptr++;
5771 }
5772 break;
5773
5774 case OP_NOT_WORDCHAR:
5775 for (i = min; i < max; i++)
5776 {
5777 if (eptr >= md->end_subject)
5778 {
5779 SCHECK_PARTIAL();
5780 break;
5781 }
5782 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5783 eptr++;
5784 }
5785 break;
5786
5787 case OP_WORDCHAR:
5788 for (i = min; i < max; i++)
5789 {
5790 if (eptr >= md->end_subject)
5791 {
5792 SCHECK_PARTIAL();
5793 break;
5794 }
5795 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5796 eptr++;
5797 }
5798 break;
5799
5800 default:
5801 RRETURN(PCRE_ERROR_INTERNAL);
5802 }
5803
5804 /* eptr is now past the end of the maximum run. If possessive, we are
5805 done (no backing up). Otherwise, match at this position; anything other
5806 than no match is immediately returned. For nomatch, back up one
5807 character (byte), unless we are matching \R and the last thing matched
5808 was \r\n, in which case, back up two bytes. */
5809
5810 if (possessive) continue;
5811 while (eptr >= pp)
5812 {
5813 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5815 eptr--;
5816 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5817 eptr[-1] == '\r') eptr--;
5818 }
5819 }
5820
5821 /* Get here if we can't make it match with any permitted repetitions */
5822
5823 MRRETURN(MATCH_NOMATCH);
5824 }
5825 /* Control never gets here */
5826
5827 /* There's been some horrible disaster. Arrival here can only mean there is
5828 something seriously wrong in the code above or the OP_xxx definitions. */
5829
5830 default:
5831 DPRINTF(("Unknown opcode %d\n", *ecode));
5832 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5833 }
5834
5835 /* Do not stick any code in here without much thought; it is assumed
5836 that "continue" in the code above comes out to here to repeat the main
5837 loop. */
5838
5839 } /* End of main loop */
5840 /* Control never reaches here */
5841
5842
5843 /* When compiling to use the heap rather than the stack for recursive calls to
5844 match(), the RRETURN() macro jumps here. The number that is saved in
5845 frame->Xwhere indicates which label we actually want to return to. */
5846
5847 #ifdef NO_RECURSE
5848 #define LBL(val) case val: goto L_RM##val;
5849 HEAP_RETURN:
5850 switch (frame->Xwhere)
5851 {
5852 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5853 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5854 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5855 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5856 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5857 LBL(65) LBL(66)
5858 #ifdef SUPPORT_UTF8
5859 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5860 LBL(32) LBL(34) LBL(42) LBL(46)
5861 #ifdef SUPPORT_UCP
5862 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5863 LBL(59) LBL(60) LBL(61) LBL(62)
5864 #endif /* SUPPORT_UCP */
5865 #endif /* SUPPORT_UTF8 */
5866 default:
5867 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5868 return PCRE_ERROR_INTERNAL;
5869 }
5870 #undef LBL
5871 #endif /* NO_RECURSE */
5872 }
5873
5874
5875 /***************************************************************************
5876 ****************************************************************************
5877 RECURSION IN THE match() FUNCTION
5878
5879 Undefine all the macros that were defined above to handle this. */
5880
/* In NO_RECURSE builds the names below were defined as macros further up in
this file. Remove them again so that nothing after this point sees them. The
list is kept in alphabetical order; the order of #undef directives has no
effect on the result. */

#ifdef NO_RECURSE

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* These two are macros whether or not NO_RECURSE is set, so they are removed
unconditionally. */

#undef fi
#undef fc
5924
5925 /***************************************************************************
5926 ***************************************************************************/
5927
5928
5929
5930 /*************************************************
5931 * Execute a Regular Expression *
5932 *************************************************/
5933
5934 /* This function applies a compiled re to a subject string and picks out
5935 portions of the string if it matches. Two elements in the vector are set for
5936 each substring: the offsets to the start and end of the substring.
5937
5938 Arguments:
5939 argument_re points to the compiled expression
5940 extra_data points to extra data or is NULL
5941 subject points to the subject string
5942 length length of subject string (may contain binary zeros)
5943 start_offset where to start in the subject string
5944 options option bits
5945 offsets points to a vector of ints to be filled in with offsets
5946 offsetcount the number of elements in the vector
5947
5948 Returns: > 0 => success; value is the number of elements filled in
5949 = 0 => success, but offsets is not big enough
5950 -1 => failed to match
5951 < -1 => some kind of unexpected problem
5952 */
5953
5954 #ifdef COMPILE_PCRE8
5955 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5956 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5957 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5958 int offsetcount)
5959 #else
5960 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5961 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
5962 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
5963 int offsetcount)
5964 #endif
5965 {
5966 int rc, ocount, arg_offset_max;
5967 int first_byte = -1;
5968 int req_byte = -1;
5969 int req_byte2 = -1;
5970 int newline;
5971 BOOL using_temporary_offsets = FALSE;
5972 BOOL anchored;
5973 BOOL startline;
5974 BOOL firstline;
5975 BOOL first_byte_caseless = FALSE;
5976 BOOL req_byte_caseless = FALSE;
5977 BOOL utf8;
5978 match_data match_block;
5979 match_data *md = &match_block;
5980 const pcre_uint8 *tables;
5981 const pcre_uint8 *start_bits = NULL;
5982 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
5983 PCRE_PUCHAR end_subject;
5984 PCRE_PUCHAR start_partial = NULL;
5985 PCRE_PUCHAR req_byte_ptr = start_match - 1;
5986
5987 pcre_study_data internal_study;
5988 const pcre_study_data *study;
5989
5990 real_pcre internal_re;
5991 const real_pcre *external_re = (const real_pcre *)argument_re;
5992 const real_pcre *re = external_re;
5993
5994 /* Plausibility checks */
5995
5996 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5997 if (re == NULL || subject == NULL ||
5998 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5999 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6000 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6001
6002 /* These two settings are used in the code for checking a UTF-8 string that
6003 follows immediately afterwards. Other values in the md block are used only
6004 during "normal" pcre_exec() processing, not when the JIT support is in use,
6005 so they are set up later. */
6006
6007 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
6008 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6009 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6010
6011 /* Check a UTF-8 string if required. Pass back the character offset and error
6012 code for an invalid string if a results vector is available. */
6013
6014 #ifdef SUPPORT_UTF8
6015 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
6016 {
6017 int erroroffset;
6018 int errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)subject, length, &erroroffset);
6019 if (errorcode != 0)
6020 {
6021 if (offsetcount >= 2)
6022 {
6023 offsets[0] = erroroffset;
6024 offsets[1] = errorcode;
6025 }
6026 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6027 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6028 }
6029
6030 /* Check that a start_offset points to the start of a UTF-8 character. */
6031 if (start_offset > 0 && start_offset < length &&
6032 (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
6033 return PCRE_ERROR_BADUTF8_OFFSET;
6034 }
6035 #endif
6036
6037 /* If the pattern was successfully studied with JIT support, run the JIT
6038 executable instead of the rest of this function. Most options must be set at
6039 compile time for the JIT code to be usable. Fallback to the normal code path if
6040 an unsupported flag is set. In particular, JIT does not support partial
6041 matching. */
6042
6043 #ifdef SUPPORT_JIT
6044 if (extra_data != NULL
6045 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6046 && extra_data->executable_jit != NULL
6047 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6048 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6049 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6050 return PRIV(jit_exec)(re, extra_data->executable_jit,
6051 (const pcre_uchar *)subject, length, start_offset, options,
6052 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6053 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6054 #endif
6055
6056 /* Carry on with non-JIT matching. This information is for finding all the
6057 numbers associated with a given name, for condition testing. */
6058
6059 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6060 md->name_count = re->name_count;
6061 md->name_entry_size = re->name_entry_size;
6062
6063 /* Fish out the optional data from the extra_data structure, first setting
6064 the default values. */
6065
6066 study = NULL;
6067 md->match_limit = MATCH_LIMIT;
6068 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6069 md->callout_data = NULL;
6070
6071 /* The table pointer is always in native byte order. */
6072
6073 tables = external_re->tables;
6074
6075 if (extra_data != NULL)
6076 {
6077 register unsigned int flags = extra_data->flags;
6078 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6079 study = (const pcre_study_data *)extra_data->study_data;
6080 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6081 md->match_limit = extra_data->match_limit;
6082 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6083 md->match_limit_recursion = extra_data->match_limit_recursion;
6084 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6085 md->callout_data = extra_data->callout_data;
6086 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6087 }
6088
6089 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6090 is a feature that makes it possible to save compiled regex and re-use them
6091 in other programs later. */
6092
6093 if (tables == NULL) tables = PRIV(default_tables);
6094
6095 /* Check that the first field in the block is the magic number. If it is not,
6096 test for a regex that was compiled on a host of opposite endianness. If this is
6097 the case, flipped values are put in internal_re and internal_study if there was
6098 study data too. */
6099
6100 if (re->magic_number != MAGIC_NUMBER)
6101 {
6102 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
6103 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6104 if (study != NULL) study = &internal_study;
6105 }
6106
6107 /* Set up other data */
6108
6109 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6110 startline = (re->flags & PCRE_STARTLINE) != 0;
6111 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6112
6113 /* The code starts after the real_pcre block and the capture name table. */
6114
6115 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6116 re->name_count * re->name_entry_size;
6117
6118 md->start_subject = (PCRE_PUCHAR)subject;
6119 md->start_offset = start_offset;
6120 md->end_subject = md->start_subject + length;
6121 end_subject = md->end_subject;
6122
6123 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6124 md->use_ucp = (re->options & PCRE_UCP) != 0;
6125 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6126
6127 /* Some options are unpacked into BOOL variables in the hope that testing
6128 them will be faster than individual option bits. */
6129
6130 md->notbol = (options & PCRE_NOTBOL) != 0;
6131 md->noteol = (options & PCRE_NOTEOL) != 0;
6132 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6133 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6134
6135 md->hitend = FALSE;
6136 md->mark = NULL; /* In case never set */
6137
6138 md->recursive = NULL; /* No recursion at top level */
6139 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6140
6141 md->lcc = tables + lcc_offset;
6142 md->ctypes = tables + ctypes_offset;
6143
6144 /* Handle different \R options. */
6145
6146 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6147 {
6148 case 0:
6149 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6150 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6151 else
6152 #ifdef BSR_ANYCRLF
6153 md->bsr_anycrlf = TRUE;
6154 #else
6155 md->bsr_anycrlf = FALSE;
6156 #endif
6157 break;
6158
6159 case PCRE_BSR_ANYCRLF:
6160 md->bsr_anycrlf = TRUE;
6161 break;
6162
6163 case PCRE_BSR_UNICODE:
6164 md->bsr_anycrlf = FALSE;
6165 break;
6166
6167 default: return PCRE_ERROR_BADNEWLINE;
6168 }
6169
6170 /* Handle different types of newline. The three bits give eight cases. If
6171 nothing is set at run time, whatever was used at compile time applies. */
6172
6173 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6174 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6175 {
6176 case 0: newline = NEWLINE; break; /* Compile-time default */
6177 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6178 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6179 case PCRE_NEWLINE_CR+
6180 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6181 case PCRE_NEWLINE_ANY: newline = -1; break;
6182 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6183 default: return PCRE_ERROR_BADNEWLINE;
6184 }
6185
6186 if (newline == -2)
6187 {
6188 md->nltype = NLTYPE_ANYCRLF;
6189 }
6190 else if (newline < 0)
6191 {
6192 md->nltype = NLTYPE_ANY;
6193 }
6194 else
6195 {
6196 md->nltype = NLTYPE_FIXED;
6197 if (newline > 255)
6198 {
6199 md->nllen = 2;
6200 md->nl[0] = (newline >> 8) & 255;
6201 md->nl[1] = newline & 255;
6202 }
6203 else
6204 {
6205 md->nllen = 1;
6206 md->nl[0] = newline;
6207 }
6208 }
6209
6210 /* Partial matching was originally supported only for a restricted set of
6211 regexes; from release 8.00 there are no restrictions, but the bits are still
6212 defined (though never set). So there's no harm in leaving this code. */
6213
6214 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6215 return PCRE_ERROR_BADPARTIAL;
6216
6217 /* If the expression has got more back references than the offsets supplied can
6218 hold, we get a temporary chunk of working store to use during the matching.
6219 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6220 of 3. */
6221
6222 ocount = offsetcount - (offsetcount % 3);
6223 arg_offset_max = (2*ocount)/3;
6224
6225 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6226 {
6227 ocount = re->top_backref * 3 + 3;
6228 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6229 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6230 using_temporary_offsets = TRUE;
6231 DPRINTF(("Got memory to hold back references\n"));
6232 }
6233 else md->offset_vector = offsets;
6234
6235 md->offset_end = ocount;
6236 md->offset_max = (2*ocount)/3;
6237 md->offset_overflow = FALSE;
6238 md->capture_last = -1;
6239
6240 /* Reset the working variable associated with each extraction. These should
6241 never be used unless previously set, but they get saved and restored, and so we
6242 initialize them to avoid reading uninitialized locations. Also, unset the
6243 offsets for the matched string. This is really just for tidiness with callouts,
6244 in case they inspect these fields. */
6245
6246 if (md->offset_vector != NULL)
6247 {
6248 register int *iptr = md->offset_vector + ocount;
6249 register int *iend = iptr - re->top_bracket;
6250 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6251 while (--iptr >= iend) *iptr = -1;
6252 md->offset_vector[0] = md->offset_vector[1] = -1;
6253 }
6254
6255 /* Set up the first character to match, if available. The first_byte value is
6256 never set for an anchored regular expression, but the anchoring may be forced
6257 at run time, so we have to test for anchoring. The first char may be unset for
6258 an unanchored pattern, of course. If there's no first char and the pattern was
6259 studied, there may be a bitmap of possible first characters. */
6260
6261 if (!anchored)
6262 {
6263 if ((re->flags & PCRE_FIRSTSET) != 0)
6264 {
6265 first_byte = re->first_byte & 255;
6266 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6267 first_byte = md->lcc[first_byte];
6268 }
6269 else
6270 if (!startline && study != NULL &&
6271 (study->flags & PCRE_STUDY_MAPPED) != 0)
6272 start_bits = study->start_bits;
6273 }
6274
6275 /* For anchored or unanchored matches, there may be a "last known required
6276 character" set. */
6277
6278 if ((re->flags & PCRE_REQCHSET) != 0)
6279 {
6280 req_byte = re->req_byte & 255;
6281 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6282 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6283 }
6284
6285
6286
6287
6288 /* ==========================================================================*/
6289
6290 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6291 the loop runs just once. */
6292
6293 for(;;)
6294 {
6295 PCRE_PUCHAR save_end_subject = end_subject;
6296 PCRE_PUCHAR new_start_match;
6297
6298 /* If firstline is TRUE, the start of the match is constrained to the first
6299 line of a multiline string. That is, the match must be before or at the first
6300 newline. Implement this by temporarily adjusting end_subject so that we stop
6301 scanning at a newline. If the match fails at the newline, later code breaks
6302 this loop. */
6303
6304 if (firstline)
6305 {
6306 PCRE_PUCHAR t = start_match;
6307 #ifdef SUPPORT_UTF8
6308 if (utf8)
6309 {
6310 while (t < md->end_subject && !IS_NEWLINE(t))
6311 {
6312 t++;
6313 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6314 }
6315 }
6316 else
6317 #endif
6318 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6319 end_subject = t;
6320 }
6321
6322 /* There are some optimizations that avoid running the match if a known
6323 starting point is not found, or if a known later character is not present.
6324 However, there is an option that disables these, for testing and for ensuring
6325 that all callouts do actually occur. The option can be set in the regex by
6326 (*NO_START_OPT) or passed in match-time options. */
6327
6328 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6329 {
6330 /* Advance to a unique first byte if there is one. */
6331
6332 if (first_byte >= 0)
6333 {
6334 if (first_byte_caseless)
6335 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6336 start_match++;
6337 else
6338 while (start_match < end_subject && *start_match != first_byte)
6339 start_match++;
6340 }
6341
6342 /* Or to just after a linebreak for a multiline match */
6343
6344 else if (startline)
6345 {
6346 if (start_match > md->start_subject + start_offset)
6347 {
6348 #ifdef SUPPORT_UTF8
6349 if (utf8)
6350 {
6351 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6352 {
6353 start_match++;
6354 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6355 start_match++;
6356 }
6357 }
6358 else
6359 #endif
6360 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6361 start_match++;
6362
6363 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6364 and we are now at a LF, advance the match position by one more character.
6365 */
6366
6367 if (start_match[-1] == CHAR_CR &&
6368 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6369 start_match < end_subject &&
6370 *start_match == CHAR_NL)
6371 start_match++;
6372 }
6373 }
6374
6375 /* Or to a non-unique first byte after study */
6376
6377 else if (start_bits != NULL)
6378 {
6379 while (start_match < end_subject)
6380 {
6381 #ifdef COMPILE_PCRE8
6382 register unsigned int c = *start_match;
6383 #else
6384 register unsigned int c = *start_match & 0xff;
6385 #endif
6386 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6387 {
6388 start_match++;
6389 #ifdef SUPPORT_UTF8
6390 if (utf8)
6391 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6392 start_match++;
6393 #endif
6394 }
6395 else break;
6396 }
6397 }
6398 } /* Starting optimizations */
6399
6400 /* Restore fudged end_subject */
6401
6402 end_subject = save_end_subject;
6403
6404 /* The following two optimizations are disabled for partial matching or if
6405 disabling is explicitly requested. */
6406
6407 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6408 {
6409 /* If the pattern was studied, a minimum subject length may be set. This is
6410 a lower bound; no actual string of that length may actually match the
6411 pattern. Although the value is, strictly, in characters, we treat it as
6412 bytes to avoid spending too much time in this optimization. */
6413
6414 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6415 (pcre_uint32)(end_subject - start_match) < study->minlength)
6416 {
6417 rc = MATCH_NOMATCH;
6418 break;
6419 }
6420
6421 /* If req_byte is set, we know that that character must appear in the
6422 subject for the match to succeed. If the first character is set, req_byte
6423 must be later in the subject; otherwise the test starts at the match point.
6424 This optimization can save a huge amount of backtracking in patterns with
6425 nested unlimited repeats that aren't going to match. Writing separate code
6426 for cased/caseless versions makes it go faster, as does using an
6427 autoincrement and backing off on a match.
6428
6429 HOWEVER: when the subject string is very, very long, searching to its end
6430 can take a long time, and give bad performance on quite ordinary patterns.
6431 This showed up when somebody was matching something like /^\d+C/ on a
6432 32-megabyte string... so we don't do this when the string is sufficiently
6433 long. */
6434
6435 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6436 {
6437 register PCRE_PUCHAR p = start_match + ((first_byte >= 0)? 1 : 0);
6438
6439 /* We don't need to repeat the search if we haven't yet reached the
6440 place we found it at last time. */
6441
6442 if (p > req_byte_ptr)
6443 {
6444 if (req_byte_caseless)
6445 {
6446 while (p < end_subject)
6447 {
6448 register int pp = *p++;
6449 if (pp == req_byte || pp == req_byte2) { p--; break; }
6450 }
6451 }
6452 else
6453 {
6454 while (p < end_subject)
6455 {
6456 if (*p++ == req_byte) { p--; break; }
6457 }
6458 }
6459
6460 /* If we can't find the required character, break the matching loop,
6461 forcing a match failure. */
6462
6463 if (p >= end_subject)
6464 {
6465 rc = MATCH_NOMATCH;
6466 break;
6467 }
6468
6469 /* If we have found the required character, save the point where we
6470 found it, so that we don't search again next time round the loop if
6471 the start hasn't passed this character yet. */
6472
6473 req_byte_ptr = p;
6474 }
6475 }
6476 }
6477
6478 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6479 printf(">>>> Match against: ");
6480 pchars(start_match, end_subject - start_match, TRUE, md);
6481 printf("\n");
6482 #endif
6483
6484 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6485 first starting point for which a partial match was found. */
6486
6487 md->start_match_ptr = start_match;
6488 md->start_used_ptr = start_match;
6489 md->match_call_count = 0;
6490 md->match_function_type = 0;
6491 md->end_offset_top = 0;
6492 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6493 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6494
6495 switch(rc)
6496 {
6497 /* SKIP passes back the next starting point explicitly, but if it is the
6498 same as the match we have just done, treat it as NOMATCH. */
6499
6500 case MATCH_SKIP:
6501 if (md->start_match_ptr != start_match)
6502 {
6503 new_start_match = md->start_match_ptr;
6504 break;
6505 }
6506 /* Fall through */
6507
6508 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6509 the SKIP's arg was not found. We also treat this as NOMATCH. */
6510
6511 case MATCH_SKIP_ARG:
6512 /* Fall through */
6513
6514 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6515 exactly like PRUNE. */
6516
6517 case MATCH_NOMATCH:
6518 case MATCH_PRUNE:
6519 case MATCH_THEN:
6520 new_start_match = start_match + 1;
6521 #ifdef SUPPORT_UTF8
6522 if (utf8)
6523 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6524 new_start_match++;
6525 #endif
6526 break;
6527
6528 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6529
6530 case MATCH_COMMIT:
6531 rc = MATCH_NOMATCH;
6532 goto ENDLOOP;
6533
6534 /* Any other return is either a match, or some kind of error. */
6535
6536 default:
6537 goto ENDLOOP;
6538 }
6539
6540 /* Control reaches here for the various types of "no match at this point"
6541 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6542
6543 rc = MATCH_NOMATCH;
6544
6545 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6546 newline in the subject (though it may continue over the newline). Therefore,
6547 if we have just failed to match, starting at a newline, do not continue. */
6548
6549 if (firstline && IS_NEWLINE(start_match)) break;
6550
6551 /* Advance to new matching position */
6552
6553 start_match = new_start_match;
6554
6555 /* Break the loop if the pattern is anchored or if we have passed the end of
6556 the subject. */
6557
6558 if (anchored || start_match > end_subject) break;
6559
6560 /* If we have just passed a CR and we are now at a LF, and the pattern does
6561 not contain any explicit matches for \r or \n, and the newline option is CRLF
6562 or ANY or ANYCRLF, advance the match position by one more character. */
6563
6564 if (start_match[-1] == CHAR_CR &&
6565 start_match < end_subject &&
6566 *start_match == CHAR_NL &&
6567 (re->flags & PCRE_HASCRORLF) == 0 &&
6568 (md->nltype == NLTYPE_ANY ||
6569 md->nltype == NLTYPE_ANYCRLF ||
6570 md->nllen == 2))
6571 start_match++;
6572
6573 md->mark = NULL; /* Reset for start of next match attempt */
6574 } /* End of for(;;) "bumpalong" loop */
6575
6576 /* ==========================================================================*/
6577
6578 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6579 conditions is true:
6580
6581 (1) The pattern is anchored or the match was failed by (*COMMIT);
6582
6583 (2) We are past the end of the subject;
6584
6585 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6586 this option requests that a match occur at or before the first newline in
6587 the subject.
6588
6589 When we have a match and the offset vector is big enough to deal with any
6590 backreferences, captured substring offsets will already be set up. In the case
6591 where we had to get some local store to hold offsets for backreference
6592 processing, copy those that we can. In this case there need not be overflow if
6593 certain parts of the pattern were not used, even though there are more
6594 capturing parentheses than vector slots. */
6595
6596 ENDLOOP:
6597
6598 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6599 {
6600 if (using_temporary_offsets)
6601 {
6602 if (arg_offset_max >= 4)
6603 {
6604 memcpy(offsets + 2, md->offset_vector + 2,
6605 (arg_offset_max - 2) * sizeof(int));
6606 DPRINTF(("Copied offsets from temporary memory\n"));
6607 }
6608 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6609 DPRINTF(("Freeing temporary memory\n"));
6610 (pcre_free)(md->offset_vector);
6611 }
6612
6613 /* Set the return code to the number of captured strings, or 0 if there were
6614 too many to fit into the vector. */
6615
6616 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6617 0 : md->end_offset_top/2;
6618
6619 /* If there is space in the offset vector, set any unused pairs at the end of
6620 the pattern to -1 for backwards compatibility. It is documented that this
6621 happens. In earlier versions, the whole set of potential capturing offsets
6622 was set to -1 each time round the loop, but this is handled differently now.
6623 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6624 those at the end that need unsetting here. We can't just unset them all at
6625 the start of the whole thing because they may get set in one branch that is
6626 not the final matching branch. */
6627
6628 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6629 {
6630 register int *iptr, *iend;
6631 int resetcount = 2 + re->top_bracket * 2;
6632 if (resetcount > offsetcount) resetcount = ocount;
6633 iptr = offsets + md->end_offset_top;
6634 iend = offsets + resetcount;
6635 while (iptr < iend) *iptr++ = -1;
6636 }
6637
6638 /* If there is space, set up the whole thing as substring 0. The value of
6639 md->start_match_ptr might be modified if \K was encountered on the success
6640 matching path. */
6641
6642 if (offsetcount < 2) rc = 0; else
6643 {
6644 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6645 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6646 }
6647
6648 DPRINTF((">>>> returning %d\n", rc));
6649 goto RETURN_MARK;
6650 }
6651
6652 /* Control gets here if there has been an error, or if the overall match
6653 attempt has failed at all permitted starting positions. */
6654
6655 if (using_temporary_offsets)
6656 {
6657 DPRINTF(("Freeing temporary memory\n"));
6658 (pcre_free)(md->offset_vector);
6659 }
6660
6661 /* For anything other than nomatch or partial match, just return the code. */
6662
6663 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6664 {
6665 DPRINTF((">>>> error: returning %d\n", rc));
6666 return rc;
6667 }
6668
6669 /* Handle partial matches - disable any mark data */
6670
6671 if (start_partial != NULL)
6672 {
6673 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6674 md->mark = NULL;
6675 if (offsetcount > 1)
6676 {
6677 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
6678 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
6679 }
6680 rc = PCRE_ERROR_PARTIAL;
6681 }
6682
6683 /* This is the classic nomatch case */
6684
6685 else
6686 {
6687 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6688 rc = PCRE_ERROR_NOMATCH;
6689 }
6690
6691 /* Return the MARK data if it has been requested. */
6692
6693 RETURN_MARK:
6694
6695 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6696 *(extra_data->mark) = (unsigned char *)(md->mark);
6697 return rc;
6698 }
6699
6700 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5