/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 798 - (show annotations)
Sun Dec 11 18:07:25 2011 UTC (8 years, 7 months ago) by zherczeg
File MIME type: text/plain
File size: 204573 byte(s)
Error occurred while calculating annotation data.
Optimization fixes for ranges contains only a single character
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Convenience macro for a return sequence that occurs many times: it copies
the current markptr into md->mark and then leaves via RRETURN(ra). It expands
to a braced block and relies on md, markptr and RRETURN being available at the
point of use. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
Each table has six entries, and entry i of rep_min pairs with entry i of
rep_max. NOTE(review): the index is presumably an offset from the first of a
run of six repeat opcodes; the code that indexes these tables is not in this
part of the file, so confirm the ordering there. */
100
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if the requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 PCRE_PUCHAR eptr_start = eptr;
159 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF
185 #ifdef SUPPORT_UCP
186 if (md->utf)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 PCRE_PUCHAR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. Each RMATCH call site passes one of these as its
last ("rw") argument. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
274 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
275 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
276 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
277 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
278 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
279 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
280 RM61, RM62, RM63, RM64, RM65, RM66 };
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition.

RMATCH(ra,rb,rc,rd,re,rw) calls match() with ra, rb, rc, rd and re in the
eptr, ecode, offset_top, md and eptrb positions; mstart and markptr are taken
from the calling scope, the depth is passed as rdepth+1, and the result is
left in rrc. RRETURN(ra) returns ra from match(). */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
/* Debugging versions: as the production versions, plus trace output. Note
that this RRETURN expands its argument twice (once in the printf call and once
in the return statement). */
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
/* Production versions: a plain recursive call and a plain return. */
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes.

RMATCH obtains a new heapframe with pcre_stack_malloc (leaving via RRETURN
with PCRE_ERROR_NOMEMORY if that fails), records rw in the current frame's
Xwhere field, fills the new frame from ra, rb, rc, re and the current mstart
and markptr, links it to the current frame, makes it current, and jumps to
HEAP_RECURSE. The label L_<rw> that it defines is the point at which execution
continues afterwards.

RRETURN releases the current frame with pcre_stack_free and steps back to the
previous one. If there is a previous frame, ra is stored in rrc and control
goes to HEAP_RETURN; otherwise ra is returned from match(). */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
336 #define RRETURN(ra)\
337 {\
338 heapframe *oldframe = frame;\
339 frame = oldframe->Xprevframe;\
340 (pcre_stack_free)(oldframe);\
341 if (frame != NULL)\
342 {\
343 rrc = ra;\
344 goto HEAP_RETURN;\
345 }\
346 return ra;\
347 }
348
349
350 /* Structure for remembering the local variables in a private frame. One of
these is obtained with pcre_stack_malloc for each "recursion" made through the
heap version of RMATCH, and released again by RRETURN. Inside match(), #define
aliases of the form "frame->Xname" make each X-prefixed field stand in for the
argument or local variable called "name". */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe; /* Frame to go back to; NULL at the top level */
354
355 /* Function arguments that may change */
356
357 PCRE_PUCHAR Xeptr;
358 const pcre_uchar *Xecode;
359 PCRE_PUCHAR Xmstart;
360 PCRE_PUCHAR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth; /* RMATCH sets this to the previous frame's value + 1 */
364
365 /* Function local variables */
366
367 PCRE_PUCHAR Xcallpat;
368 #ifdef SUPPORT_UTF
369 PCRE_PUCHAR Xcharptr;
370 #endif
371 PCRE_PUCHAR Xdata;
372 PCRE_PUCHAR Xnext;
373 PCRE_PUCHAR Xpp;
374 PCRE_PUCHAR Xprev;
375 PCRE_PUCHAR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 pcre_uchar Xocchars[6];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to: the RMn number that RMATCH stores here before it
switches to the new frame */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
appear several times in the code. When md->partial is non-zero, we set the
"hit end" flag (md->hitend) if the pointer is at or beyond the end of the
subject and also beyond md->start_used_ptr (i.e. something has been matched).
For hard partial matching (md->partial greater than 1), we then return
PCRE_ERROR_PARTIAL immediately. The second one is used when we already know we
are past the end of the subject, so it omits the end-of-subject test.

Both macros use eptr and md from the calling scope, and each expands to a bare
"if" statement, so take care if using one as the body of another "if" that has
an "else". */
435
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
478 PCRE_PUCHAR mstart, const pcre_uchar *markptr, int offset_top,
479 match_data *md, eptrblock *eptrb, unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf; /* Local copy of UTF flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF
589 const pcre_uchar *charptr;
590 #endif
591 const pcre_uchar *callpat;
592 const pcre_uchar *data;
593 const pcre_uchar *next;
594 PCRE_PUCHAR pp;
595 const pcre_uchar *prev;
596 PCRE_PUCHAR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 pcre_uchar occhars[6];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637 #define foc number
638
639 /* These statements are here to stop the compiler complaining about uninitialized
640 variables. */
641
642 #ifdef SUPPORT_UCP
643 prop_value = 0;
644 prop_fail_result = 0;
645 #endif
646
647
648 /* This label is used for tail recursion, which is used in a few cases even
649 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
650 used. Thanks to Ian Taylor for noticing this possibility and sending the
651 original patch. */
652
653 TAIL_RECURSE:
654
655 /* OK, now we can get on with the real code of the function. Recursive calls
656 are specified by the macro RMATCH and RRETURN is used to return. When
657 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
658 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
659 defined). However, RMATCH isn't like a function call because it's quite a
660 complicated macro. It has to be used in one particular way. This shouldn't,
661 however, impact performance when true recursion is being used. */
662
663 #ifdef SUPPORT_UTF
664 utf = md->utf; /* Local copy of the flag */
665 #else
666 utf = FALSE;
667 #endif
668
669 /* First check that we haven't called match() too many times, or that we
670 haven't exceeded the recursive call limit. */
671
672 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
673 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
674
675 /* At the start of a group with an unlimited repeat that may match an empty
676 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
677 done this way to save having to use another function argument, which would take
678 up space on the stack. See also MATCH_CONDASSERT below.
679
680 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
681 such remembered pointers, to be checked when we hit the closing ket, in order
682 to break infinite loops that match no characters. When match() is called in
683 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
684 NOT be used with tail recursion, because the memory block that is used is on
685 the stack, so a new one may be required for each match(). */
686
687 if (md->match_function_type == MATCH_CBEGROUP)
688 {
689 newptrb.epb_saved_eptr = eptr;
690 newptrb.epb_prev = eptrb;
691 eptrb = &newptrb;
692 md->match_function_type = 0;
693 }
694
695 /* Now start processing the opcodes. */
696
697 for (;;)
698 {
699 minimize = possessive = FALSE;
700 op = *ecode;
701
702 switch(op)
703 {
704 case OP_MARK:
705 markptr = ecode + 2;
706 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
707 eptrb, RM55);
708
709 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
710 argument, and we must check whether that argument matches this MARK's
711 argument. It is passed back in md->start_match_ptr (an overloading of that
712 variable). If it does match, we reset that variable to the current subject
713 position and return MATCH_SKIP. Otherwise, pass back the return code
714 unaltered. */
715
716 if (rrc == MATCH_SKIP_ARG &&
717 STRCMP_UC_UC(markptr, md->start_match_ptr) == 0)
718 {
719 md->start_match_ptr = eptr;
720 RRETURN(MATCH_SKIP);
721 }
722
723 if (md->mark == NULL) md->mark = markptr;
724 RRETURN(rrc);
725
726 case OP_FAIL:
727 MRRETURN(MATCH_NOMATCH);
728
729 /* COMMIT overrides PRUNE, SKIP, and THEN */
730
731 case OP_COMMIT:
732 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
733 eptrb, RM52);
734 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
735 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
736 rrc != MATCH_THEN)
737 RRETURN(rrc);
738 MRRETURN(MATCH_COMMIT);
739
740 /* PRUNE overrides THEN */
741
742 case OP_PRUNE:
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
744 eptrb, RM51);
745 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
746 MRRETURN(MATCH_PRUNE);
747
748 case OP_PRUNE_ARG:
749 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
750 eptrb, RM56);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 md->mark = ecode + 2;
753 RRETURN(MATCH_PRUNE);
754
755 /* SKIP overrides PRUNE and THEN */
756
757 case OP_SKIP:
758 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
759 eptrb, RM53);
760 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
761 RRETURN(rrc);
762 md->start_match_ptr = eptr; /* Pass back current position */
763 MRRETURN(MATCH_SKIP);
764
765 case OP_SKIP_ARG:
766 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
767 eptrb, RM57);
768 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
769 RRETURN(rrc);
770
771 /* Pass back the current skip name by overloading md->start_match_ptr and
772 returning the special MATCH_SKIP_ARG return code. This will either be
773 caught by a matching MARK, or get to the top, where it is treated the same
774 as PRUNE. */
775
776 md->start_match_ptr = ecode + 2;
777 RRETURN(MATCH_SKIP_ARG);
778
779 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
780 the branch in which it occurs can be determined. Overload the start of
781 match pointer to do this. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode;
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
792 md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode;
795 md->mark = ecode + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle an atomic group that does not contain any capturing parentheses.
799 This can be handled like an assertion. Prior to 8.13, all atomic groups
800 were handled this way. In 8.13, the code was changed as below for ONCE, so
801 that backups pass through the group and thereby reset captured values.
802 However, this uses a lot more stack, so in 8.20, atomic groups that do not
803 contain any captures generate OP_ONCE_NC, which can be handled in the old,
804 less stack intensive way.
805
806 Check the alternative branches in turn - the matching won't pass the KET
807 for this kind of subpattern. If any one branch matches, we carry on as at
808 the end of a normal bracket, leaving the subject pointer, but resetting
809 the start-of-match value in case it was changed by \K. */
810
811 case OP_ONCE_NC:
812 prev = ecode;
813 saved_eptr = eptr;
814 do
815 {
816 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
817 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
818 {
819 mstart = md->start_match_ptr;
820 markptr = md->mark;
821 break;
822 }
823 if (rrc == MATCH_THEN)
824 {
825 next = ecode + GET(ecode,1);
826 if (md->start_match_ptr < next &&
827 (*ecode == OP_ALT || *next == OP_ALT))
828 rrc = MATCH_NOMATCH;
829 }
830
831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832 ecode += GET(ecode,1);
833 }
834 while (*ecode == OP_ALT);
835
836 /* If hit the end of the group (which could be repeated), fail */
837
838 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
839
840 /* Continue as from after the group, updating the offsets high water
841 mark, since extracts may have been taken. */
842
843 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
844
845 offset_top = md->end_offset_top;
846 eptr = md->end_match_ptr;
847
848 /* For a non-repeating ket, just continue at this level. This also
849 happens for a repeating ket if no characters were matched in the group.
850 This is the forcible breaking of infinite loops as implemented in Perl
851 5.005. */
852
853 if (*ecode == OP_KET || eptr == saved_eptr)
854 {
855 ecode += 1+LINK_SIZE;
856 break;
857 }
858
859 /* The repeating kets try the rest of the pattern or restart from the
860 preceding bracket, in the appropriate order. The second "call" of match()
861 uses tail recursion, to avoid using another stack frame. */
862
863 if (*ecode == OP_KETRMIN)
864 {
865 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
867 ecode = prev;
868 goto TAIL_RECURSE;
869 }
870 else /* OP_KETRMAX */
871 {
872 md->match_function_type = MATCH_CBEGROUP;
873 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
875 ecode += 1 + LINK_SIZE;
876 goto TAIL_RECURSE;
877 }
878 /* Control never gets here */
879
880 /* Handle a capturing bracket, other than those that are possessive with an
881 unlimited repeat. If there is space in the offset vector, save the current
882 subject position in the working slot at the top of the vector. We mustn't
883 change the current values of the data slot, because they may be set from a
884 previous iteration of this group, and be referred to by a reference inside
885 the group. A failure to match might occur after the group has succeeded,
886 if something later on doesn't match. For this reason, we need to restore
887 the working value and also the values of the final offsets, in case they
888 were set by a previous iteration of the same bracket.
889
890 If there isn't enough space in the offset vector, treat this as if it were
891 a non-capturing bracket. Don't worry about setting the flag for the error
892 case here; that is handled in the code for KET. */
893
894 case OP_CBRA:
895 case OP_SCBRA:
896 number = GET2(ecode, 1+LINK_SIZE);
897 offset = number << 1;
898
899 #ifdef PCRE_DEBUG
900 printf("start bracket %d\n", number);
901 printf("subject=");
902 pchars(eptr, 16, TRUE, md);
903 printf("\n");
904 #endif
905
906 if (offset < md->offset_max)
907 {
908 save_offset1 = md->offset_vector[offset];
909 save_offset2 = md->offset_vector[offset+1];
910 save_offset3 = md->offset_vector[md->offset_end - number];
911 save_capture_last = md->capture_last;
912
913 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
914 md->offset_vector[md->offset_end - number] =
915 (int)(eptr - md->start_subject);
916
917 for (;;)
918 {
919 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
920 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
921 eptrb, RM1);
922 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
923
924 /* If we backed up to a THEN, check whether it is within the current
925 branch by comparing the address of the THEN that is passed back with
926 the end of the branch. If it is within the current branch, and the
927 branch is one of two or more alternatives (it either starts or ends
928 with OP_ALT), we have reached the limit of THEN's action, so convert
929 the return code to NOMATCH, which will cause normal backtracking to
930 happen from now on. Otherwise, THEN is passed back to an outer
931 alternative. This implements Perl's treatment of parenthesized groups,
932 where a group not containing | does not affect the current alternative,
933 that is, (X) is NOT the same as (X|(*F)). */
934
935 if (rrc == MATCH_THEN)
936 {
937 next = ecode + GET(ecode,1);
938 if (md->start_match_ptr < next &&
939 (*ecode == OP_ALT || *next == OP_ALT))
940 rrc = MATCH_NOMATCH;
941 }
942
943 /* Anything other than NOMATCH is passed back. */
944
945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
946 md->capture_last = save_capture_last;
947 ecode += GET(ecode, 1);
948 if (*ecode != OP_ALT) break;
949 }
950
951 DPRINTF(("bracket %d failed\n", number));
952 md->offset_vector[offset] = save_offset1;
953 md->offset_vector[offset+1] = save_offset2;
954 md->offset_vector[md->offset_end - number] = save_offset3;
955
956 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
957
958 if (md->mark == NULL) md->mark = markptr;
959 RRETURN(rrc);
960 }
961
962 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
963 as a non-capturing bracket. */
964
965 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
967
968 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
969
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972
973 /* Non-capturing or atomic group, except for possessive with unlimited
974 repeat and ONCE group with no captures. Loop for all the alternatives.
975
976 When we get to the final alternative within the brackets, we used to return
977 the result of a recursive call to match() whatever happened so it was
978 possible to reduce stack usage by turning this into a tail recursion,
979 except in the case of a possibly empty group. However, now that there is
980 the possibility of (*THEN) occurring in the final alternative, this
981 optimization is no longer always possible.
982
983 We can optimize if we know there are no (*THEN)s in the pattern; at present
984 this is the best that can be done.
985
986 MATCH_ONCE is returned when the end of an atomic group is successfully
987 reached, but subsequent matching fails. It passes back up the tree (causing
988 captured values to be reset) until the original atomic group level is
989 reached. This is tested by comparing md->once_target with the start of the
990 group. At this point, the return is converted into MATCH_NOMATCH so that
991 previous backup points can be taken. */
992
993 case OP_ONCE:
994 case OP_BRA:
995 case OP_SBRA:
996 DPRINTF(("start non-capturing bracket\n"));
997
998 for (;;)
999 {
1000 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1001
1002 /* If this is not a possibly empty group, and there are no (*THEN)s in
1003 the pattern, and this is the final alternative, optimize as described
1004 above. */
1005
1006 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1007 {
1008 ecode += PRIV(OP_lengths)[*ecode];
1009 goto TAIL_RECURSE;
1010 }
1011
1012 /* In all other cases, we have to make another call to match(). */
1013
1014 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1015 RM2);
1016
1017 /* See comment in the code for capturing groups above about handling
1018 THEN. */
1019
1020 if (rrc == MATCH_THEN)
1021 {
1022 next = ecode + GET(ecode,1);
1023 if (md->start_match_ptr < next &&
1024 (*ecode == OP_ALT || *next == OP_ALT))
1025 rrc = MATCH_NOMATCH;
1026 }
1027
1028 if (rrc != MATCH_NOMATCH)
1029 {
1030 if (rrc == MATCH_ONCE)
1031 {
1032 const pcre_uchar *scode = ecode;
1033 if (*scode != OP_ONCE) /* If not at start, find it */
1034 {
1035 while (*scode == OP_ALT) scode += GET(scode, 1);
1036 scode -= GET(scode, 1);
1037 }
1038 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1039 }
1040 RRETURN(rrc);
1041 }
1042 ecode += GET(ecode, 1);
1043 if (*ecode != OP_ALT) break;
1044 }
1045
1046 if (md->mark == NULL) md->mark = markptr;
1047 RRETURN(MATCH_NOMATCH);
1048
1049 /* Handle possessive capturing brackets with an unlimited repeat. We come
1050 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1051 handled similarly to the normal case above. However, the matching is
1052 different. The end of these brackets will always be OP_KETRPOS, which
1053 returns MATCH_KETRPOS without going further in the pattern. By this means
1054 we can handle the group by iteration rather than recursion, thereby
1055 reducing the amount of stack needed. */
1056
1057 case OP_CBRAPOS:
1058 case OP_SCBRAPOS:
1059 allow_zero = FALSE;
1060
1061 POSSESSIVE_CAPTURE:
1062 number = GET2(ecode, 1+LINK_SIZE);
1063 offset = number << 1;
1064
1065 #ifdef PCRE_DEBUG
1066 printf("start possessive bracket %d\n", number);
1067 printf("subject=");
1068 pchars(eptr, 16, TRUE, md);
1069 printf("\n");
1070 #endif
1071
1072 if (offset < md->offset_max)
1073 {
1074 matched_once = FALSE;
1075 code_offset = ecode - md->start_code;
1076
1077 save_offset1 = md->offset_vector[offset];
1078 save_offset2 = md->offset_vector[offset+1];
1079 save_offset3 = md->offset_vector[md->offset_end - number];
1080 save_capture_last = md->capture_last;
1081
1082 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1083
1084 /* Each time round the loop, save the current subject position for use
1085 when the group matches. For MATCH_MATCH, the group has matched, so we
1086 restart it with a new subject starting position, remembering that we had
1087 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1088 usual. If we haven't matched any alternatives in any iteration, check to
1089 see if a previous iteration matched. If so, the group has matched;
1090 continue from afterwards. Otherwise it has failed; restore the previous
1091 capture values before returning NOMATCH. */
1092
1093 for (;;)
1094 {
1095 md->offset_vector[md->offset_end - number] =
1096 (int)(eptr - md->start_subject);
1097 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1098 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1099 eptrb, RM63);
1100 if (rrc == MATCH_KETRPOS)
1101 {
1102 offset_top = md->end_offset_top;
1103 eptr = md->end_match_ptr;
1104 ecode = md->start_code + code_offset;
1105 save_capture_last = md->capture_last;
1106 matched_once = TRUE;
1107 continue;
1108 }
1109
1110 /* See comment in the code for capturing groups above about handling
1111 THEN. */
1112
1113 if (rrc == MATCH_THEN)
1114 {
1115 next = ecode + GET(ecode,1);
1116 if (md->start_match_ptr < next &&
1117 (*ecode == OP_ALT || *next == OP_ALT))
1118 rrc = MATCH_NOMATCH;
1119 }
1120
1121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122 md->capture_last = save_capture_last;
1123 ecode += GET(ecode, 1);
1124 if (*ecode != OP_ALT) break;
1125 }
1126
1127 if (!matched_once)
1128 {
1129 md->offset_vector[offset] = save_offset1;
1130 md->offset_vector[offset+1] = save_offset2;
1131 md->offset_vector[md->offset_end - number] = save_offset3;
1132 }
1133
1134 if (md->mark == NULL) md->mark = markptr;
1135 if (allow_zero || matched_once)
1136 {
1137 ecode += 1 + LINK_SIZE;
1138 break;
1139 }
1140
1141 RRETURN(MATCH_NOMATCH);
1142 }
1143
1144 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1145 as a non-capturing bracket. */
1146
1147 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1148 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1149
1150 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1151
1152 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1153 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1154
1155 /* Non-capturing possessive bracket with unlimited repeat. We come here
1156 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1157 without the capturing complication. It is written out separately for speed
1158 and cleanliness. */
1159
1160 case OP_BRAPOS:
1161 case OP_SBRAPOS:
1162 allow_zero = FALSE;
1163
1164 POSSESSIVE_NON_CAPTURE:
1165 matched_once = FALSE;
1166 code_offset = ecode - md->start_code;
1167
1168 for (;;)
1169 {
1170 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1171 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1172 eptrb, RM48);
1173 if (rrc == MATCH_KETRPOS)
1174 {
1175 offset_top = md->end_offset_top;
1176 eptr = md->end_match_ptr;
1177 ecode = md->start_code + code_offset;
1178 matched_once = TRUE;
1179 continue;
1180 }
1181
1182 /* See comment in the code for capturing groups above about handling
1183 THEN. */
1184
1185 if (rrc == MATCH_THEN)
1186 {
1187 next = ecode + GET(ecode,1);
1188 if (md->start_match_ptr < next &&
1189 (*ecode == OP_ALT || *next == OP_ALT))
1190 rrc = MATCH_NOMATCH;
1191 }
1192
1193 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1194 ecode += GET(ecode, 1);
1195 if (*ecode != OP_ALT) break;
1196 }
1197
1198 if (matched_once || allow_zero)
1199 {
1200 ecode += 1 + LINK_SIZE;
1201 break;
1202 }
1203 RRETURN(MATCH_NOMATCH);
1204
1205 /* Control never reaches here. */
1206
1207 /* Conditional group: compilation checked that there are no more than
1208 two branches. If the condition is false, skipping the first branch takes us
1209 past the end if there is only one branch, but that's OK because that is
1210 exactly what going to the ket would do. */
1211
1212 case OP_COND:
1213 case OP_SCOND:
1214 codelink = GET(ecode, 1);
1215
1216 /* Because of the way auto-callout works during compile, a callout item is
1217 inserted between OP_COND and an assertion condition. */
1218
1219 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1220 {
1221 if (pcre_callout != NULL)
1222 {
1223 pcre_callout_block cb;
1224 cb.version = 2; /* Version 1 of the callout block */
1225 cb.callout_number = ecode[LINK_SIZE+2];
1226 cb.offset_vector = md->offset_vector;
1227 cb.subject = (PCRE_SPTR)md->start_subject;
1228 cb.subject_length = (int)(md->end_subject - md->start_subject);
1229 cb.start_match = (int)(mstart - md->start_subject);
1230 cb.current_position = (int)(eptr - md->start_subject);
1231 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1232 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1233 cb.capture_top = offset_top/2;
1234 cb.capture_last = md->capture_last;
1235 cb.callout_data = md->callout_data;
1236 cb.mark = (unsigned char *)markptr;
1237 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1238 if (rrc < 0) RRETURN(rrc);
1239 }
1240 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1241 }
1242
1243 condcode = ecode[LINK_SIZE+1];
1244
1245 /* Now see what the actual condition is */
1246
1247 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1248 {
1249 if (md->recursive == NULL) /* Not recursing => FALSE */
1250 {
1251 condition = FALSE;
1252 ecode += GET(ecode, 1);
1253 }
1254 else
1255 {
1256 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1257 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1258
1259 /* If the test is for recursion into a specific subpattern, and it is
1260 false, but the test was set up by name, scan the table to see if the
1261 name refers to any other numbers, and test them. The condition is true
1262 if any one is set. */
1263
1264 if (!condition && condcode == OP_NRREF)
1265 {
1266 pcre_uchar *slotA = md->name_table;
1267 for (i = 0; i < md->name_count; i++)
1268 {
1269 if (GET2(slotA, 0) == recno) break;
1270 slotA += md->name_entry_size;
1271 }
1272
1273 /* Found a name for the number - there can be only one; duplicate
1274 names for different numbers are allowed, but not vice versa. First
1275 scan down for duplicates. */
1276
1277 if (i < md->name_count)
1278 {
1279 pcre_uchar *slotB = slotA;
1280 while (slotB > md->name_table)
1281 {
1282 slotB -= md->name_entry_size;
1283 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1284 {
1285 condition = GET2(slotB, 0) == md->recursive->group_num;
1286 if (condition) break;
1287 }
1288 else break;
1289 }
1290
1291 /* Scan up for duplicates */
1292
1293 if (!condition)
1294 {
1295 slotB = slotA;
1296 for (i++; i < md->name_count; i++)
1297 {
1298 slotB += md->name_entry_size;
1299 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1300 {
1301 condition = GET2(slotB, 0) == md->recursive->group_num;
1302 if (condition) break;
1303 }
1304 else break;
1305 }
1306 }
1307 }
1308 }
1309
1310 /* Choose branch according to the condition */
1311
1312 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1313 }
1314 }
1315
1316 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1317 {
1318 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1319 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1320
1321 /* If the numbered capture is unset, but the reference was by name,
1322 scan the table to see if the name refers to any other numbers, and test
1323 them. The condition is true if any one is set. This is tediously similar
1324 to the code above, but not close enough to try to amalgamate. */
1325
1326 if (!condition && condcode == OP_NCREF)
1327 {
1328 int refno = offset >> 1;
1329 pcre_uchar *slotA = md->name_table;
1330
1331 for (i = 0; i < md->name_count; i++)
1332 {
1333 if (GET2(slotA, 0) == refno) break;
1334 slotA += md->name_entry_size;
1335 }
1336
1337 /* Found a name for the number - there can be only one; duplicate names
1338 for different numbers are allowed, but not vice versa. First scan down
1339 for duplicates. */
1340
1341 if (i < md->name_count)
1342 {
1343 pcre_uchar *slotB = slotA;
1344 while (slotB > md->name_table)
1345 {
1346 slotB -= md->name_entry_size;
1347 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1348 {
1349 offset = GET2(slotB, 0) << 1;
1350 condition = offset < offset_top &&
1351 md->offset_vector[offset] >= 0;
1352 if (condition) break;
1353 }
1354 else break;
1355 }
1356
1357 /* Scan up for duplicates */
1358
1359 if (!condition)
1360 {
1361 slotB = slotA;
1362 for (i++; i < md->name_count; i++)
1363 {
1364 slotB += md->name_entry_size;
1365 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1366 {
1367 offset = GET2(slotB, 0) << 1;
1368 condition = offset < offset_top &&
1369 md->offset_vector[offset] >= 0;
1370 if (condition) break;
1371 }
1372 else break;
1373 }
1374 }
1375 }
1376 }
1377
1378 /* Choose branch according to the condition */
1379
1380 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1381 }
1382
1383 else if (condcode == OP_DEF) /* DEFINE - always false */
1384 {
1385 condition = FALSE;
1386 ecode += GET(ecode, 1);
1387 }
1388
1389 /* The condition is an assertion. Call match() to evaluate it - setting
1390 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1391 an assertion. */
1392
1393 else
1394 {
1395 md->match_function_type = MATCH_CONDASSERT;
1396 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1397 if (rrc == MATCH_MATCH)
1398 {
1399 if (md->end_offset_top > offset_top)
1400 offset_top = md->end_offset_top; /* Captures may have happened */
1401 condition = TRUE;
1402 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1403 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1404 }
1405
1406 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1407 assertion; it is therefore treated as NOMATCH. */
1408
1409 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1410 {
1411 RRETURN(rrc); /* Need braces because of following else */
1412 }
1413 else
1414 {
1415 condition = FALSE;
1416 ecode += codelink;
1417 }
1418 }
1419
1420 /* We are now at the branch that is to be obeyed. As there is only one, can
1421 use tail recursion to avoid using another stack frame, except when there is
1422 unlimited repeat of a possibly empty group. In the latter case, a recursive
1423 call to match() is always required, unless the second alternative doesn't
1424 exist, in which case we can just plough on. Note that, for compatibility
1425 with Perl, the | in a conditional group is NOT treated as creating two
1426 alternatives. If a THEN is encountered in the branch, it propagates out to
1427 the enclosing alternative (unless nested in a deeper set of alternatives,
1428 of course). */
1429
1430 if (condition || *ecode == OP_ALT)
1431 {
1432 if (op != OP_SCOND)
1433 {
1434 ecode += 1 + LINK_SIZE;
1435 goto TAIL_RECURSE;
1436 }
1437
1438 md->match_function_type = MATCH_CBEGROUP;
1439 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1440 RRETURN(rrc);
1441 }
1442
1443 /* Condition false & no alternative; continue after the group. */
1444
1445 else
1446 {
1447 ecode += 1 + LINK_SIZE;
1448 }
1449 break;
1450
1451
1452 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1453 to close any currently open capturing brackets. */
1454
1455 case OP_CLOSE:
1456 number = GET2(ecode, 1);
1457 offset = number << 1;
1458
1459 #ifdef PCRE_DEBUG
1460 printf("end bracket %d at *ACCEPT", number);
1461 printf("\n");
1462 #endif
1463
1464 md->capture_last = number;
1465 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1466 {
1467 md->offset_vector[offset] =
1468 md->offset_vector[md->offset_end - number];
1469 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1470 if (offset_top <= offset) offset_top = offset + 2;
1471 }
1472 ecode += 1 + IMM2_SIZE;
1473 break;
1474
1475
1476 /* End of the pattern, either real or forced. */
1477
1478 case OP_END:
1479 case OP_ACCEPT:
1480 case OP_ASSERT_ACCEPT:
1481
1482 /* If we have matched an empty string, fail if not in an assertion and not
1483 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1484 is set and we have matched at the start of the subject. In both cases,
1485 backtracking will then try other alternatives, if any. */
1486
1487 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1488 md->recursive == NULL &&
1489 (md->notempty ||
1490 (md->notempty_atstart &&
1491 mstart == md->start_subject + md->start_offset)))
1492 MRRETURN(MATCH_NOMATCH);
1493
1494 /* Otherwise, we have a match. */
1495
1496 md->end_match_ptr = eptr; /* Record where we ended */
1497 md->end_offset_top = offset_top; /* and how many extracts were taken */
1498 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1499
1500 /* For some reason, the macros don't work properly if an expression is
1501 given as the argument to MRRETURN when the heap is in use. */
1502
1503 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1504 MRRETURN(rrc);
1505
1506 /* Assertion brackets. Check the alternative branches in turn - the
1507 matching won't pass the KET for an assertion. If any one branch matches,
1508 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1509 start of each branch to move the current point backwards, so the code at
1510 this level is identical to the lookahead case. When the assertion is part
1511 of a condition, we want to return immediately afterwards. The caller of
1512 this incarnation of the match() function will have set MATCH_CONDASSERT in
1513 md->match_function_type, and one of these opcodes will be the first opcode
1514 that is processed. We use a local variable that is preserved over calls to
1515 match() to remember this case. */
1516
1517 case OP_ASSERT:
1518 case OP_ASSERTBACK:
1519 if (md->match_function_type == MATCH_CONDASSERT)
1520 {
1521 condassert = TRUE;
1522 md->match_function_type = 0;
1523 }
1524 else condassert = FALSE;
1525
1526 do
1527 {
1528 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1529 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1530 {
1531 mstart = md->start_match_ptr; /* In case \K reset it */
1532 markptr = md->mark;
1533 break;
1534 }
1535
1536 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1537 as NOMATCH. */
1538
1539 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1540 ecode += GET(ecode, 1);
1541 }
1542 while (*ecode == OP_ALT);
1543
1544 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1545
1546 /* If checking an assertion for a condition, return MATCH_MATCH. */
1547
1548 if (condassert) RRETURN(MATCH_MATCH);
1549
1550 /* Continue from after the assertion, updating the offsets high water
1551 mark, since extracts may have been taken during the assertion. */
1552
1553 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1554 ecode += 1 + LINK_SIZE;
1555 offset_top = md->end_offset_top;
1556 continue;
1557
1558 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1559 PRUNE, or COMMIT means we must assume failure without checking subsequent
1560 branches. */
1561
1562 case OP_ASSERT_NOT:
1563 case OP_ASSERTBACK_NOT:
1564 if (md->match_function_type == MATCH_CONDASSERT)
1565 {
1566 condassert = TRUE;
1567 md->match_function_type = 0;
1568 }
1569 else condassert = FALSE;
1570
1571 do
1572 {
1573 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1574 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1575 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1576 {
1577 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1578 break;
1579 }
1580
1581 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1582 as NOMATCH. */
1583
1584 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1585 ecode += GET(ecode,1);
1586 }
1587 while (*ecode == OP_ALT);
1588
1589 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1590
1591 ecode += 1 + LINK_SIZE;
1592 continue;
1593
1594 /* Move the subject pointer back. This occurs only at the start of
1595 each branch of a lookbehind assertion. If we are too close to the start to
1596 move back, this match function fails. When working with UTF-8 we move
1597 back a number of characters, not bytes. */
1598
1599 case OP_REVERSE:
1600 #ifdef SUPPORT_UTF
1601 if (utf)
1602 {
1603 i = GET(ecode, 1);
1604 while (i-- > 0)
1605 {
1606 eptr--;
1607 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1608 BACKCHAR(eptr);
1609 }
1610 }
1611 else
1612 #endif
1613
1614 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1615
1616 {
1617 eptr -= GET(ecode, 1);
1618 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1619 }
1620
1621 /* Save the earliest consulted character, then skip to next op code */
1622
1623 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1624 ecode += 1 + LINK_SIZE;
1625 break;
1626
1627 /* The callout item calls an external function, if one is provided, passing
1628 details of the match so far. This is mainly for debugging, though the
1629 function is able to force a failure. */
1630
1631 case OP_CALLOUT:
1632 if (pcre_callout != NULL)
1633 {
1634 pcre_callout_block cb;
1635 cb.version = 2; /* Version 1 of the callout block */
1636 cb.callout_number = ecode[1];
1637 cb.offset_vector = md->offset_vector;
1638 cb.subject = (PCRE_SPTR)md->start_subject;
1639 cb.subject_length = (int)(md->end_subject - md->start_subject);
1640 cb.start_match = (int)(mstart - md->start_subject);
1641 cb.current_position = (int)(eptr - md->start_subject);
1642 cb.pattern_position = GET(ecode, 2);
1643 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1644 cb.capture_top = offset_top/2;
1645 cb.capture_last = md->capture_last;
1646 cb.callout_data = md->callout_data;
1647 cb.mark = (unsigned char *)markptr;
1648 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1649 if (rrc < 0) RRETURN(rrc);
1650 }
1651 ecode += 2 + 2*LINK_SIZE;
1652 break;
1653
1654 /* Recursion either matches the current regex, or some subexpression. The
1655 offset data is the offset to the starting bracket from the start of the
1656 whole pattern. (This is so that it works from duplicated subpatterns.)
1657
1658 The state of the capturing groups is preserved over recursion, and
1659 re-instated afterwards. We don't know how many are started and not yet
1660 finished (offset_top records the completed total) so we just have to save
1661 all the potential data. There may be up to 65535 such values, which is too
1662 large to put on the stack, but using malloc for small numbers seems
1663 expensive. As a compromise, the stack is used when there are no more than
1664 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1665
1666 There are also other values that have to be saved. We use a chained
1667 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1668 for the original version of this logic. It has, however, been hacked around
1669 a lot, so he is not to blame for the current way it works. */
1670
1671 case OP_RECURSE:
1672 {
1673 recursion_info *ri;
1674 int recno;
1675
1676 callpat = md->start_code + GET(ecode, 1);
1677 recno = (callpat == md->start_code)? 0 :
1678 GET2(callpat, 1 + LINK_SIZE);
1679
1680 /* Check for repeating a recursion without advancing the subject pointer.
1681 This should catch convoluted mutual recursions. (Some simple cases are
1682 caught at compile time.) */
1683
1684 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1685 if (recno == ri->group_num && eptr == ri->subject_position)
1686 RRETURN(PCRE_ERROR_RECURSELOOP);
1687
1688 /* Add to "recursing stack" */
1689
1690 new_recursive.group_num = recno;
1691 new_recursive.subject_position = eptr;
1692 new_recursive.prevrec = md->recursive;
1693 md->recursive = &new_recursive;
1694
1695 /* Where to continue from afterwards */
1696
1697 ecode += 1 + LINK_SIZE;
1698
1699 /* Now save the offset data */
1700
1701 new_recursive.saved_max = md->offset_end;
1702 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1703 new_recursive.offset_save = stacksave;
1704 else
1705 {
1706 new_recursive.offset_save =
1707 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1708 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1709 }
1710 memcpy(new_recursive.offset_save, md->offset_vector,
1711 new_recursive.saved_max * sizeof(int));
1712
1713 /* OK, now we can do the recursion. After processing each alternative,
1714 restore the offset data. If there were nested recursions, md->recursive
1715 might be changed, so reset it before looping. */
1716
1717 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1718 cbegroup = (*callpat >= OP_SBRA);
1719 do
1720 {
1721 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1722 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1723 md, eptrb, RM6);
1724 memcpy(md->offset_vector, new_recursive.offset_save,
1725 new_recursive.saved_max * sizeof(int));
1726 md->recursive = new_recursive.prevrec;
1727 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1728 {
1729 DPRINTF(("Recursion matched\n"));
1730 if (new_recursive.offset_save != stacksave)
1731 (pcre_free)(new_recursive.offset_save);
1732
1733 /* Set where we got to in the subject, and reset the start in case
1734 it was changed by \K. This *is* propagated back out of a recursion,
1735 for Perl compatibility. */
1736
1737 eptr = md->end_match_ptr;
1738 mstart = md->start_match_ptr;
1739 goto RECURSION_MATCHED; /* Exit loop; end processing */
1740 }
1741
1742 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1743 as NOMATCH. */
1744
1745 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1746 {
1747 DPRINTF(("Recursion gave error %d\n", rrc));
1748 if (new_recursive.offset_save != stacksave)
1749 (pcre_free)(new_recursive.offset_save);
1750 RRETURN(rrc);
1751 }
1752
1753 md->recursive = &new_recursive;
1754 callpat += GET(callpat, 1);
1755 }
1756 while (*callpat == OP_ALT);
1757
1758 DPRINTF(("Recursion didn't match\n"));
1759 md->recursive = new_recursive.prevrec;
1760 if (new_recursive.offset_save != stacksave)
1761 (pcre_free)(new_recursive.offset_save);
1762 MRRETURN(MATCH_NOMATCH);
1763 }
1764
1765 RECURSION_MATCHED:
1766 break;
1767
1768 /* An alternation is the end of a branch; scan along to find the end of the
1769 bracketed group and go to there. */
1770
1771 case OP_ALT:
1772 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1773 break;
1774
1775 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1776 indicating that it may occur zero times. It may repeat infinitely, or not
1777 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1778 with fixed upper repeat limits are compiled as a number of copies, with the
1779 optional ones preceded by BRAZERO or BRAMINZERO. */
1780
1781 case OP_BRAZERO:
1782 next = ecode + 1;
1783 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1785 do next += GET(next, 1); while (*next == OP_ALT);
1786 ecode = next + 1 + LINK_SIZE;
1787 break;
1788
1789 case OP_BRAMINZERO:
1790 next = ecode + 1;
1791 do next += GET(next, 1); while (*next == OP_ALT);
1792 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1794 ecode++;
1795 break;
1796
1797 case OP_SKIPZERO:
1798 next = ecode+1;
1799 do next += GET(next,1); while (*next == OP_ALT);
1800 ecode = next + 1 + LINK_SIZE;
1801 break;
1802
1803 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1804 here; just jump to the group, with allow_zero set TRUE. */
1805
1806 case OP_BRAPOSZERO:
1807 op = *(++ecode);
1808 allow_zero = TRUE;
1809 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1810 goto POSSESSIVE_NON_CAPTURE;
1811
1812 /* End of a group, repeated or non-repeating. */
1813
1814 case OP_KET:
1815 case OP_KETRMIN:
1816 case OP_KETRMAX:
1817 case OP_KETRPOS:
1818 prev = ecode - GET(ecode, 1);
1819
1820 /* If this was a group that remembered the subject start, in order to break
1821 infinite repeats of empty string matches, retrieve the subject start from
1822 the chain. Otherwise, set it NULL. */
1823
1824 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1825 {
1826 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1827 eptrb = eptrb->epb_prev; /* Backup to previous group */
1828 }
1829 else saved_eptr = NULL;
1830
1831 /* If we are at the end of an assertion group or a non-capturing atomic
1832 group, stop matching and return MATCH_MATCH, but record the current high
1833 water mark for use by positive assertions. We also need to record the match
1834 start in case it was changed by \K. */
1835
1836 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1837 *prev == OP_ONCE_NC)
1838 {
1839 md->end_match_ptr = eptr; /* For ONCE_NC */
1840 md->end_offset_top = offset_top;
1841 md->start_match_ptr = mstart;
1842 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1843 }
1844
1845 /* For capturing groups we have to check the group number back at the start
1846 and if necessary complete handling an extraction by setting the offsets and
1847 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1848 into group 0, so it won't be picked up here. Instead, we catch it when the
1849 OP_END is reached. Other recursion is handled here. We just have to record
1850 the current subject position and start match pointer and give a MATCH
1851 return. */
1852
1853 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1854 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1855 {
1856 number = GET2(prev, 1+LINK_SIZE);
1857 offset = number << 1;
1858
1859 #ifdef PCRE_DEBUG
1860 printf("end bracket %d", number);
1861 printf("\n");
1862 #endif
1863
1864 /* Handle a recursively called group. */
1865
1866 if (md->recursive != NULL && md->recursive->group_num == number)
1867 {
1868 md->end_match_ptr = eptr;
1869 md->start_match_ptr = mstart;
1870 RRETURN(MATCH_MATCH);
1871 }
1872
1873 /* Deal with capturing */
1874
1875 md->capture_last = number;
1876 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1877 {
1878 /* If offset is greater than offset_top, it means that we are
1879 "skipping" a capturing group, and that group's offsets must be marked
1880 unset. In earlier versions of PCRE, all the offsets were unset at the
1881 start of matching, but this doesn't work because atomic groups and
1882 assertions can cause a value to be set that should later be unset.
1883 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1884 part of the atomic group, but this is not on the final matching path,
1885 so must be unset when 2 is set. (If there is no group 2, there is no
1886 problem, because offset_top will then be 2, indicating no capture.) */
1887
1888 if (offset > offset_top)
1889 {
1890 register int *iptr = md->offset_vector + offset_top;
1891 register int *iend = md->offset_vector + offset;
1892 while (iptr < iend) *iptr++ = -1;
1893 }
1894
1895 /* Now make the extraction */
1896
1897 md->offset_vector[offset] =
1898 md->offset_vector[md->offset_end - number];
1899 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1900 if (offset_top <= offset) offset_top = offset + 2;
1901 }
1902 }
1903
1904 /* For an ordinary non-repeating ket, just continue at this level. This
1905 also happens for a repeating ket if no characters were matched in the
1906 group. This is the forcible breaking of infinite loops as implemented in
1907 Perl 5.005. For a non-repeating atomic group that includes captures,
1908 establish a backup point by processing the rest of the pattern at a lower
1909 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1910 original OP_ONCE level, thereby bypassing intermediate backup points, but
1911 resetting any captures that happened along the way. */
1912
1913 if (*ecode == OP_KET || eptr == saved_eptr)
1914 {
1915 if (*prev == OP_ONCE)
1916 {
1917 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1919 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1920 RRETURN(MATCH_ONCE);
1921 }
1922 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1923 break;
1924 }
1925
1926 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1927 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1928 at a time from the outer level, thus saving stack. */
1929
1930 if (*ecode == OP_KETRPOS)
1931 {
1932 md->end_match_ptr = eptr;
1933 md->end_offset_top = offset_top;
1934 RRETURN(MATCH_KETRPOS);
1935 }
1936
1937 /* The normal repeating kets try the rest of the pattern or restart from
1938 the preceding bracket, in the appropriate order. In the second case, we can
1939 use tail recursion to avoid using another stack frame, unless we have an
1940 atomic group or an unlimited repeat of a group that can match an empty
1941 string. */
1942
1943 if (*ecode == OP_KETRMIN)
1944 {
1945 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1947 if (*prev == OP_ONCE)
1948 {
1949 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1951 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1952 RRETURN(MATCH_ONCE);
1953 }
1954 if (*prev >= OP_SBRA) /* Could match an empty string */
1955 {
1956 md->match_function_type = MATCH_CBEGROUP;
1957 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1958 RRETURN(rrc);
1959 }
1960 ecode = prev;
1961 goto TAIL_RECURSE;
1962 }
1963 else /* OP_KETRMAX */
1964 {
1965 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1966 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1967 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1969 if (*prev == OP_ONCE)
1970 {
1971 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1973 md->once_target = prev;
1974 RRETURN(MATCH_ONCE);
1975 }
1976 ecode += 1 + LINK_SIZE;
1977 goto TAIL_RECURSE;
1978 }
1979 /* Control never gets here */
1980
1981 /* Not multiline mode: start of subject assertion, unless notbol. */
1982
1983 case OP_CIRC:
1984 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1985
1986 /* Start of subject assertion */
1987
1988 case OP_SOD:
1989 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1990 ecode++;
1991 break;
1992
1993 /* Multiline mode: start of subject unless notbol, or after any newline. */
1994
1995 case OP_CIRCM:
1996 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1997 if (eptr != md->start_subject &&
1998 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1999 MRRETURN(MATCH_NOMATCH);
2000 ecode++;
2001 break;
2002
2003 /* Start of match assertion */
2004
2005 case OP_SOM:
2006 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
2007 ecode++;
2008 break;
2009
2010 /* Reset the start of match point */
2011
2012 case OP_SET_SOM:
2013 mstart = eptr;
2014 ecode++;
2015 break;
2016
2017 /* Multiline mode: assert before any newline, or before end of subject
2018 unless noteol is set. */
2019
2020 case OP_DOLLM:
2021 if (eptr < md->end_subject)
2022 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
2023 else
2024 {
2025 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2026 SCHECK_PARTIAL();
2027 }
2028 ecode++;
2029 break;
2030
2031 /* Not multiline mode: assert before a terminating newline or before end of
2032 subject unless noteol is set. */
2033
2034 case OP_DOLL:
2035 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2036 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2037
2038 /* ... else fall through for endonly */
2039
2040 /* End of subject assertion (\z) */
2041
2042 case OP_EOD:
2043 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
2044 SCHECK_PARTIAL();
2045 ecode++;
2046 break;
2047
2048 /* End of subject or ending \n assertion (\Z) */
2049
2050 case OP_EODN:
2051 ASSERT_NL_OR_EOS:
2052 if (eptr < md->end_subject &&
2053 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2054 MRRETURN(MATCH_NOMATCH);
2055
2056 /* Either at end of string or \n before end. */
2057
2058 SCHECK_PARTIAL();
2059 ecode++;
2060 break;
2061
2062 /* Word boundary assertions */
2063
2064 case OP_NOT_WORD_BOUNDARY:
2065 case OP_WORD_BOUNDARY:
2066 {
2067
2068 /* Find out if the previous and current characters are "word" characters.
2069 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2070 be "non-word" characters. Remember the earliest consulted character for
2071 partial matching. */
2072
2073 #ifdef SUPPORT_UTF
2074 if (utf)
2075 {
2076 /* Get status of previous character */
2077
2078 if (eptr == md->start_subject) prev_is_word = FALSE; else
2079 {
2080 PCRE_PUCHAR lastptr = eptr - 1;
2081 BACKCHAR(lastptr);
2082 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2083 GETCHAR(c, lastptr);
2084 #ifdef SUPPORT_UCP
2085 if (md->use_ucp)
2086 {
2087 if (c == '_') prev_is_word = TRUE; else
2088 {
2089 int cat = UCD_CATEGORY(c);
2090 prev_is_word = (cat == ucp_L || cat == ucp_N);
2091 }
2092 }
2093 else
2094 #endif
2095 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2096 }
2097
2098 /* Get status of next character */
2099
2100 if (eptr >= md->end_subject)
2101 {
2102 SCHECK_PARTIAL();
2103 cur_is_word = FALSE;
2104 }
2105 else
2106 {
2107 GETCHAR(c, eptr);
2108 #ifdef SUPPORT_UCP
2109 if (md->use_ucp)
2110 {
2111 if (c == '_') cur_is_word = TRUE; else
2112 {
2113 int cat = UCD_CATEGORY(c);
2114 cur_is_word = (cat == ucp_L || cat == ucp_N);
2115 }
2116 }
2117 else
2118 #endif
2119 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2120 }
2121 }
2122 else
2123 #endif
2124
2125 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2126 consistency with the behaviour of \w we do use it in this case. */
2127
2128 {
2129 /* Get status of previous character */
2130
2131 if (eptr == md->start_subject) prev_is_word = FALSE; else
2132 {
2133 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2134 #ifdef SUPPORT_UCP
2135 if (md->use_ucp)
2136 {
2137 c = eptr[-1];
2138 if (c == '_') prev_is_word = TRUE; else
2139 {
2140 int cat = UCD_CATEGORY(c);
2141 prev_is_word = (cat == ucp_L || cat == ucp_N);
2142 }
2143 }
2144 else
2145 #endif
2146 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2147 }
2148
2149 /* Get status of next character */
2150
2151 if (eptr >= md->end_subject)
2152 {
2153 SCHECK_PARTIAL();
2154 cur_is_word = FALSE;
2155 }
2156 else
2157 #ifdef SUPPORT_UCP
2158 if (md->use_ucp)
2159 {
2160 c = *eptr;
2161 if (c == '_') cur_is_word = TRUE; else
2162 {
2163 int cat = UCD_CATEGORY(c);
2164 cur_is_word = (cat == ucp_L || cat == ucp_N);
2165 }
2166 }
2167 else
2168 #endif
2169 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2170 }
2171
2172 /* Now see if the situation is what we want */
2173
2174 if ((*ecode++ == OP_WORD_BOUNDARY)?
2175 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2176 MRRETURN(MATCH_NOMATCH);
2177 }
2178 break;
2179
2180 /* Match a single character type; inline for speed */
2181
2182 case OP_ANY:
2183 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2184 /* Fall through */
2185
2186 case OP_ALLANY:
2187 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2188 { /* not be updated before SCHECK_PARTIAL. */
2189 SCHECK_PARTIAL();
2190 MRRETURN(MATCH_NOMATCH);
2191 }
2192 eptr++;
2193 #ifdef SUPPORT_UTF
2194 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2195 #endif
2196 ecode++;
2197 break;
2198
2199 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2200 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2201
2202 case OP_ANYBYTE:
2203 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2204 { /* not be updated before SCHECK_PARTIAL. */
2205 SCHECK_PARTIAL();
2206 MRRETURN(MATCH_NOMATCH);
2207 }
2208 eptr++;
2209 ecode++;
2210 break;
2211
2212 case OP_NOT_DIGIT:
2213 if (eptr >= md->end_subject)
2214 {
2215 SCHECK_PARTIAL();
2216 MRRETURN(MATCH_NOMATCH);
2217 }
2218 GETCHARINCTEST(c, eptr);
2219 if (
2220 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2221 c < 256 &&
2222 #endif
2223 (md->ctypes[c] & ctype_digit) != 0
2224 )
2225 MRRETURN(MATCH_NOMATCH);
2226 ecode++;
2227 break;
2228
2229 case OP_DIGIT:
2230 if (eptr >= md->end_subject)
2231 {
2232 SCHECK_PARTIAL();
2233 MRRETURN(MATCH_NOMATCH);
2234 }
2235 GETCHARINCTEST(c, eptr);
2236 if (
2237 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2238 c > 255 ||
2239 #endif
2240 (md->ctypes[c] & ctype_digit) == 0
2241 )
2242 MRRETURN(MATCH_NOMATCH);
2243 ecode++;
2244 break;
2245
2246 case OP_NOT_WHITESPACE:
2247 if (eptr >= md->end_subject)
2248 {
2249 SCHECK_PARTIAL();
2250 MRRETURN(MATCH_NOMATCH);
2251 }
2252 GETCHARINCTEST(c, eptr);
2253 if (
2254 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2255 c < 256 &&
2256 #endif
2257 (md->ctypes[c] & ctype_space) != 0
2258 )
2259 MRRETURN(MATCH_NOMATCH);
2260 ecode++;
2261 break;
2262
2263 case OP_WHITESPACE:
2264 if (eptr >= md->end_subject)
2265 {
2266 SCHECK_PARTIAL();
2267 MRRETURN(MATCH_NOMATCH);
2268 }
2269 GETCHARINCTEST(c, eptr);
2270 if (
2271 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2272 c > 255 ||
2273 #endif
2274 (md->ctypes[c] & ctype_space) == 0
2275 )
2276 MRRETURN(MATCH_NOMATCH);
2277 ecode++;
2278 break;
2279
2280 case OP_NOT_WORDCHAR:
2281 if (eptr >= md->end_subject)
2282 {
2283 SCHECK_PARTIAL();
2284 MRRETURN(MATCH_NOMATCH);
2285 }
2286 GETCHARINCTEST(c, eptr);
2287 if (
2288 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2289 c < 256 &&
2290 #endif
2291 (md->ctypes[c] & ctype_word) != 0
2292 )
2293 MRRETURN(MATCH_NOMATCH);
2294 ecode++;
2295 break;
2296
2297 case OP_WORDCHAR:
2298 if (eptr >= md->end_subject)
2299 {
2300 SCHECK_PARTIAL();
2301 MRRETURN(MATCH_NOMATCH);
2302 }
2303 GETCHARINCTEST(c, eptr);
2304 if (
2305 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2306 c > 255 ||
2307 #endif
2308 (md->ctypes[c] & ctype_word) == 0
2309 )
2310 MRRETURN(MATCH_NOMATCH);
2311 ecode++;
2312 break;
2313
2314 case OP_ANYNL:
2315 if (eptr >= md->end_subject)
2316 {
2317 SCHECK_PARTIAL();
2318 MRRETURN(MATCH_NOMATCH);
2319 }
2320 GETCHARINCTEST(c, eptr);
2321 switch(c)
2322 {
2323 default: MRRETURN(MATCH_NOMATCH);
2324
2325 case 0x000d:
2326 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2327 break;
2328
2329 case 0x000a:
2330 break;
2331
2332 case 0x000b:
2333 case 0x000c:
2334 case 0x0085:
2335 case 0x2028:
2336 case 0x2029:
2337 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2338 break;
2339 }
2340 ecode++;
2341 break;
2342
2343 case OP_NOT_HSPACE:
2344 if (eptr >= md->end_subject)
2345 {
2346 SCHECK_PARTIAL();
2347 MRRETURN(MATCH_NOMATCH);
2348 }
2349 GETCHARINCTEST(c, eptr);
2350 switch(c)
2351 {
2352 default: break;
2353 case 0x09: /* HT */
2354 case 0x20: /* SPACE */
2355 case 0xa0: /* NBSP */
2356 case 0x1680: /* OGHAM SPACE MARK */
2357 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2358 case 0x2000: /* EN QUAD */
2359 case 0x2001: /* EM QUAD */
2360 case 0x2002: /* EN SPACE */
2361 case 0x2003: /* EM SPACE */
2362 case 0x2004: /* THREE-PER-EM SPACE */
2363 case 0x2005: /* FOUR-PER-EM SPACE */
2364 case 0x2006: /* SIX-PER-EM SPACE */
2365 case 0x2007: /* FIGURE SPACE */
2366 case 0x2008: /* PUNCTUATION SPACE */
2367 case 0x2009: /* THIN SPACE */
2368 case 0x200A: /* HAIR SPACE */
2369 case 0x202f: /* NARROW NO-BREAK SPACE */
2370 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2371 case 0x3000: /* IDEOGRAPHIC SPACE */
2372 MRRETURN(MATCH_NOMATCH);
2373 }
2374 ecode++;
2375 break;
2376
2377 case OP_HSPACE:
2378 if (eptr >= md->end_subject)
2379 {
2380 SCHECK_PARTIAL();
2381 MRRETURN(MATCH_NOMATCH);
2382 }
2383 GETCHARINCTEST(c, eptr);
2384 switch(c)
2385 {
2386 default: MRRETURN(MATCH_NOMATCH);
2387 case 0x09: /* HT */
2388 case 0x20: /* SPACE */
2389 case 0xa0: /* NBSP */
2390 case 0x1680: /* OGHAM SPACE MARK */
2391 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2392 case 0x2000: /* EN QUAD */
2393 case 0x2001: /* EM QUAD */
2394 case 0x2002: /* EN SPACE */
2395 case 0x2003: /* EM SPACE */
2396 case 0x2004: /* THREE-PER-EM SPACE */
2397 case 0x2005: /* FOUR-PER-EM SPACE */
2398 case 0x2006: /* SIX-PER-EM SPACE */
2399 case 0x2007: /* FIGURE SPACE */
2400 case 0x2008: /* PUNCTUATION SPACE */
2401 case 0x2009: /* THIN SPACE */
2402 case 0x200A: /* HAIR SPACE */
2403 case 0x202f: /* NARROW NO-BREAK SPACE */
2404 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2405 case 0x3000: /* IDEOGRAPHIC SPACE */
2406 break;
2407 }
2408 ecode++;
2409 break;
2410
2411 case OP_NOT_VSPACE:
2412 if (eptr >= md->end_subject)
2413 {
2414 SCHECK_PARTIAL();
2415 MRRETURN(MATCH_NOMATCH);
2416 }
2417 GETCHARINCTEST(c, eptr);
2418 switch(c)
2419 {
2420 default: break;
2421 case 0x0a: /* LF */
2422 case 0x0b: /* VT */
2423 case 0x0c: /* FF */
2424 case 0x0d: /* CR */
2425 case 0x85: /* NEL */
2426 case 0x2028: /* LINE SEPARATOR */
2427 case 0x2029: /* PARAGRAPH SEPARATOR */
2428 MRRETURN(MATCH_NOMATCH);
2429 }
2430 ecode++;
2431 break;
2432
2433 case OP_VSPACE:
2434 if (eptr >= md->end_subject)
2435 {
2436 SCHECK_PARTIAL();
2437 MRRETURN(MATCH_NOMATCH);
2438 }
2439 GETCHARINCTEST(c, eptr);
2440 switch(c)
2441 {
2442 default: MRRETURN(MATCH_NOMATCH);
2443 case 0x0a: /* LF */
2444 case 0x0b: /* VT */
2445 case 0x0c: /* FF */
2446 case 0x0d: /* CR */
2447 case 0x85: /* NEL */
2448 case 0x2028: /* LINE SEPARATOR */
2449 case 0x2029: /* PARAGRAPH SEPARATOR */
2450 break;
2451 }
2452 ecode++;
2453 break;
2454
2455 #ifdef SUPPORT_UCP
2456 /* Check the next character by Unicode property. We will get here only
2457 if the support is in the binary; otherwise a compile-time error occurs. */
2458
2459 case OP_PROP:
2460 case OP_NOTPROP:
2461 if (eptr >= md->end_subject)
2462 {
2463 SCHECK_PARTIAL();
2464 MRRETURN(MATCH_NOMATCH);
2465 }
2466 GETCHARINCTEST(c, eptr);
2467 {
2468 const ucd_record *prop = GET_UCD(c);
2469
2470 switch(ecode[1])
2471 {
2472 case PT_ANY:
2473 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2474 break;
2475
2476 case PT_LAMP:
2477 if ((prop->chartype == ucp_Lu ||
2478 prop->chartype == ucp_Ll ||
2479 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2480 MRRETURN(MATCH_NOMATCH);
2481 break;
2482
2483 case PT_GC:
2484 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2485 MRRETURN(MATCH_NOMATCH);
2486 break;
2487
2488 case PT_PC:
2489 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2490 MRRETURN(MATCH_NOMATCH);
2491 break;
2492
2493 case PT_SC:
2494 if ((ecode[2] != prop->script) == (op == OP_PROP))
2495 MRRETURN(MATCH_NOMATCH);
2496 break;
2497
2498 /* These are specials */
2499
2500 case PT_ALNUM:
2501 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2502 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2503 MRRETURN(MATCH_NOMATCH);
2504 break;
2505
2506 case PT_SPACE: /* Perl space */
2507 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2508 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2509 == (op == OP_NOTPROP))
2510 MRRETURN(MATCH_NOMATCH);
2511 break;
2512
2513 case PT_PXSPACE: /* POSIX space */
2514 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2515 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2516 c == CHAR_FF || c == CHAR_CR)
2517 == (op == OP_NOTPROP))
2518 MRRETURN(MATCH_NOMATCH);
2519 break;
2520
2521 case PT_WORD:
2522 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2523 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2524 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2525 MRRETURN(MATCH_NOMATCH);
2526 break;
2527
2528 /* This should never occur */
2529
2530 default:
2531 RRETURN(PCRE_ERROR_INTERNAL);
2532 }
2533
2534 ecode += 3;
2535 }
2536 break;
2537
2538 /* Match an extended Unicode sequence. We will get here only if the support
2539 is in the binary; otherwise a compile-time error occurs. */
2540
2541 case OP_EXTUNI:
2542 if (eptr >= md->end_subject)
2543 {
2544 SCHECK_PARTIAL();
2545 MRRETURN(MATCH_NOMATCH);
2546 }
2547 GETCHARINCTEST(c, eptr);
2548 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2549 while (eptr < md->end_subject)
2550 {
2551 int len = 1;
2552 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2553 if (UCD_CATEGORY(c) != ucp_M) break;
2554 eptr += len;
2555 }
2556 ecode++;
2557 break;
2558 #endif
2559
2560
2561 /* Match a back reference, possibly repeatedly. Look past the end of the
2562 item to see if there is repeat information following. The code is similar
2563 to that for character classes, but repeated for efficiency. Then obey
2564 similar code to character type repeats - written out again for speed.
2565 However, if the referenced string is the empty string, always treat
2566 it as matched, any number of times (otherwise there could be infinite
2567 loops). */
2568
2569 case OP_REF:
2570 case OP_REFI:
2571 caseless = op == OP_REFI;
2572 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2573 ecode += 1 + IMM2_SIZE;
2574
2575 /* If the reference is unset, there are two possibilities:
2576
2577 (a) In the default, Perl-compatible state, set the length negative;
2578 this ensures that every attempt at a match fails. We can't just fail
2579 here, because of the possibility of quantifiers with zero minima.
2580
2581 (b) If the JavaScript compatibility flag is set, set the length to zero
2582 so that the back reference matches an empty string.
2583
2584 Otherwise, set the length to the length of what was matched by the
2585 referenced subpattern. */
2586
2587 if (offset >= offset_top || md->offset_vector[offset] < 0)
2588 length = (md->jscript_compat)? 0 : -1;
2589 else
2590 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2591
2592 /* Set up for repetition, or handle the non-repeated case */
2593
2594 switch (*ecode)
2595 {
2596 case OP_CRSTAR:
2597 case OP_CRMINSTAR:
2598 case OP_CRPLUS:
2599 case OP_CRMINPLUS:
2600 case OP_CRQUERY:
2601 case OP_CRMINQUERY:
2602 c = *ecode++ - OP_CRSTAR;
2603 minimize = (c & 1) != 0;
2604 min = rep_min[c]; /* Pick up values from tables; */
2605 max = rep_max[c]; /* zero for max => infinity */
2606 if (max == 0) max = INT_MAX;
2607 break;
2608
2609 case OP_CRRANGE:
2610 case OP_CRMINRANGE:
2611 minimize = (*ecode == OP_CRMINRANGE);
2612 min = GET2(ecode, 1);
2613 max = GET2(ecode, 1 + IMM2_SIZE);
2614 if (max == 0) max = INT_MAX;
2615 ecode += 1 + 2 * IMM2_SIZE;
2616 break;
2617
2618 default: /* No repeat follows */
2619 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2620 {
2621 CHECK_PARTIAL();
2622 MRRETURN(MATCH_NOMATCH);
2623 }
2624 eptr += length;
2625 continue; /* With the main loop */
2626 }
2627
2628 /* Handle repeated back references. If the length of the reference is
2629 zero, just continue with the main loop. */
2630
2631 if (length == 0) continue;
2632
2633 /* First, ensure the minimum number of matches are present. We get back
2634 the length of the reference string explicitly rather than passing the
2635 address of eptr, so that eptr can be a register variable. */
2636
2637 for (i = 1; i <= min; i++)
2638 {
2639 int slength;
2640 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2641 {
2642 CHECK_PARTIAL();
2643 MRRETURN(MATCH_NOMATCH);
2644 }
2645 eptr += slength;
2646 }
2647
2648 /* If min = max, continue at the same level without recursion.
2649 They are not both allowed to be zero. */
2650
2651 if (min == max) continue;
2652
2653 /* If minimizing, keep trying and advancing the pointer */
2654
2655 if (minimize)
2656 {
2657 for (fi = min;; fi++)
2658 {
2659 int slength;
2660 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2662 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2663 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2664 {
2665 CHECK_PARTIAL();
2666 MRRETURN(MATCH_NOMATCH);
2667 }
2668 eptr += slength;
2669 }
2670 /* Control never gets here */
2671 }
2672
2673 /* If maximizing, find the longest string and work backwards */
2674
2675 else
2676 {
2677 pp = eptr;
2678 for (i = min; i < max; i++)
2679 {
2680 int slength;
2681 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2682 {
2683 CHECK_PARTIAL();
2684 break;
2685 }
2686 eptr += slength;
2687 }
2688 while (eptr >= pp)
2689 {
2690 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2691 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2692 eptr -= length;
2693 }
2694 MRRETURN(MATCH_NOMATCH);
2695 }
2696 /* Control never gets here */
2697
2698 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2699 used when all the characters in the class have values in the range 0-255,
2700 and either the matching is caseful, or the characters are in the range
2701 0-127 when UTF-8 processing is enabled. The only difference between
2702 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2703 encountered.
2704
2705 First, look past the end of the item to see if there is repeat information
2706 following. Then obey similar code to character type repeats - written out
2707 again for speed. */
2708
2709 case OP_NCLASS:
2710 case OP_CLASS:
2711 {
2712 /* The data variable is saved across frames, so the byte map needs to
2713 be stored there. */
2714 #define BYTE_MAP ((pcre_uint8 *)data)
2715 data = ecode + 1; /* Save for matching */
2716 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2717
2718 switch (*ecode)
2719 {
2720 case OP_CRSTAR:
2721 case OP_CRMINSTAR:
2722 case OP_CRPLUS:
2723 case OP_CRMINPLUS:
2724 case OP_CRQUERY:
2725 case OP_CRMINQUERY:
2726 c = *ecode++ - OP_CRSTAR;
2727 minimize = (c & 1) != 0;
2728 min = rep_min[c]; /* Pick up values from tables; */
2729 max = rep_max[c]; /* zero for max => infinity */
2730 if (max == 0) max = INT_MAX;
2731 break;
2732
2733 case OP_CRRANGE:
2734 case OP_CRMINRANGE:
2735 minimize = (*ecode == OP_CRMINRANGE);
2736 min = GET2(ecode, 1);
2737 max = GET2(ecode, 1 + IMM2_SIZE);
2738 if (max == 0) max = INT_MAX;
2739 ecode += 1 + 2 * IMM2_SIZE;
2740 break;
2741
2742 default: /* No repeat follows */
2743 min = max = 1;
2744 break;
2745 }
2746
2747 /* First, ensure the minimum number of matches are present. */
2748
2749 #ifdef SUPPORT_UTF
2750 if (utf)
2751 {
2752 for (i = 1; i <= min; i++)
2753 {
2754 if (eptr >= md->end_subject)
2755 {
2756 SCHECK_PARTIAL();
2757 MRRETURN(MATCH_NOMATCH);
2758 }
2759 GETCHARINC(c, eptr);
2760 if (c > 255)
2761 {
2762 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2763 }
2764 else
2765 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2766 }
2767 }
2768 else
2769 #endif
2770 /* Not UTF mode */
2771 {
2772 for (i = 1; i <= min; i++)
2773 {
2774 if (eptr >= md->end_subject)
2775 {
2776 SCHECK_PARTIAL();
2777 MRRETURN(MATCH_NOMATCH);
2778 }
2779 c = *eptr++;
2780 #ifndef COMPILE_PCRE8
2781 if (c > 255)
2782 {
2783 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2784 }
2785 else
2786 #endif
2787 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2788 }
2789 }
2790
2791 /* If max == min we can continue with the main loop without the
2792 need to recurse. */
2793
2794 if (min == max) continue;
2795
2796 /* If minimizing, keep testing the rest of the expression and advancing
2797 the pointer while it matches the class. */
2798
2799 if (minimize)
2800 {
2801 #ifdef SUPPORT_UTF
2802 if (utf)
2803 {
2804 for (fi = min;; fi++)
2805 {
2806 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2808 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2809 if (eptr >= md->end_subject)
2810 {
2811 SCHECK_PARTIAL();
2812 MRRETURN(MATCH_NOMATCH);
2813 }
2814 GETCHARINC(c, eptr);
2815 if (c > 255)
2816 {
2817 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2818 }
2819 else
2820 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2821 }
2822 }
2823 else
2824 #endif
2825 /* Not UTF mode */
2826 {
2827 for (fi = min;; fi++)
2828 {
2829 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2831 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2832 if (eptr >= md->end_subject)
2833 {
2834 SCHECK_PARTIAL();
2835 MRRETURN(MATCH_NOMATCH);
2836 }
2837 c = *eptr++;
2838 #ifndef COMPILE_PCRE8
2839 if (c > 255)
2840 {
2841 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2842 }
2843 else
2844 #endif
2845 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2846 }
2847 }
2848 /* Control never gets here */
2849 }
2850
2851 /* If maximizing, find the longest possible run, then work backwards. */
2852
2853 else
2854 {
2855 pp = eptr;
2856
2857 #ifdef SUPPORT_UTF
2858 if (utf)
2859 {
2860 for (i = min; i < max; i++)
2861 {
2862 int len = 1;
2863 if (eptr >= md->end_subject)
2864 {
2865 SCHECK_PARTIAL();
2866 break;
2867 }
2868 GETCHARLEN(c, eptr, len);
2869 if (c > 255)
2870 {
2871 if (op == OP_CLASS) break;
2872 }
2873 else
2874 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2875 eptr += len;
2876 }
2877 for (;;)
2878 {
2879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2881 if (eptr-- == pp) break; /* Stop if tried at original pos */
2882 BACKCHAR(eptr);
2883 }
2884 }
2885 else
2886 #endif
2887 /* Not UTF mode */
2888 {
2889 for (i = min; i < max; i++)
2890 {
2891 if (eptr >= md->end_subject)
2892 {
2893 SCHECK_PARTIAL();
2894 break;
2895 }
2896 c = *eptr;
2897 #ifndef COMPILE_PCRE8
2898 if (c > 255)
2899 {
2900 if (op == OP_CLASS) break;
2901 }
2902 else
2903 #endif
2904 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2905 eptr++;
2906 }
2907 while (eptr >= pp)
2908 {
2909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2911 eptr--;
2912 }
2913 }
2914
2915 MRRETURN(MATCH_NOMATCH);
2916 }
2917 #undef BYTE_MAP
2918 }
2919 /* Control never gets here */
2920
2921
2922 /* Match an extended character class. This opcode is encountered only
2923 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2924 mode, because Unicode properties are supported in non-UTF-8 mode. */
2925
2926 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2927 case OP_XCLASS:
2928 {
2929 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2930 ecode += GET(ecode, 1); /* Advance past the item */
2931
2932 switch (*ecode)
2933 {
2934 case OP_CRSTAR:
2935 case OP_CRMINSTAR:
2936 case OP_CRPLUS:
2937 case OP_CRMINPLUS:
2938 case OP_CRQUERY:
2939 case OP_CRMINQUERY:
2940 c = *ecode++ - OP_CRSTAR;
2941 minimize = (c & 1) != 0;
2942 min = rep_min[c]; /* Pick up values from tables; */
2943 max = rep_max[c]; /* zero for max => infinity */
2944 if (max == 0) max = INT_MAX;
2945 break;
2946
2947 case OP_CRRANGE:
2948 case OP_CRMINRANGE:
2949 minimize = (*ecode == OP_CRMINRANGE);
2950 min = GET2(ecode, 1);
2951 max = GET2(ecode, 1 + IMM2_SIZE);
2952 if (max == 0) max = INT_MAX;
2953 ecode += 1 + 2 * IMM2_SIZE;
2954 break;
2955
2956 default: /* No repeat follows */
2957 min = max = 1;
2958 break;
2959 }
2960
2961 /* First, ensure the minimum number of matches are present. */
2962
2963 for (i = 1; i <= min; i++)
2964 {
2965 if (eptr >= md->end_subject)
2966 {
2967 SCHECK_PARTIAL();
2968 MRRETURN(MATCH_NOMATCH);
2969 }
2970 GETCHARINCTEST(c, eptr);
2971 if (!PRIV(xclass)(c, data, utf)) MRRETURN(MATCH_NOMATCH);
2972 }
2973
2974 /* If max == min we can continue with the main loop without the
2975 need to recurse. */
2976
2977 if (min == max) continue;
2978
2979 /* If minimizing, keep testing the rest of the expression and advancing
2980 the pointer while it matches the class. */
2981
2982 if (minimize)
2983 {
2984 for (fi = min;; fi++)
2985 {
2986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2989 if (eptr >= md->end_subject)
2990 {
2991 SCHECK_PARTIAL();
2992 MRRETURN(MATCH_NOMATCH);
2993 }
2994 GETCHARINCTEST(c, eptr);
2995 if (!PRIV(xclass)(c, data, utf)) MRRETURN(MATCH_NOMATCH);
2996 }
2997 /* Control never gets here */
2998 }
2999
3000 /* If maximizing, find the longest possible run, then work backwards. */
3001
3002 else
3003 {
3004 pp = eptr;
3005 for (i = min; i < max; i++)
3006 {
3007 int len = 1;
3008 if (eptr >= md->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 break;
3012 }
3013 #ifdef SUPPORT_UTF
3014 GETCHARLENTEST(c, eptr, len);
3015 #else
3016 c = *eptr;
3017 #endif
3018 if (!PRIV(xclass)(c, data, utf)) break;
3019 eptr += len;
3020 }
3021 for(;;)
3022 {
3023 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3025 if (eptr-- == pp) break; /* Stop if tried at original pos */
3026 #ifdef SUPPORT_UTF
3027 if (utf) BACKCHAR(eptr);
3028 #endif
3029 }
3030 MRRETURN(MATCH_NOMATCH);
3031 }
3032
3033 /* Control never gets here */
3034 }
3035 #endif /* End of XCLASS */
3036
3037 /* Match a single character, casefully */
3038
3039 case OP_CHAR:
3040 #ifdef SUPPORT_UTF
3041 if (utf)
3042 {
3043 length = 1;
3044 ecode++;
3045 GETCHARLEN(fc, ecode, length);
3046 if (length > md->end_subject - eptr)
3047 {
3048 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3049 MRRETURN(MATCH_NOMATCH);
3050 }
3051 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
3052 }
3053 else
3054 #endif
3055 /* Not UTF mode */
3056 {
3057 if (md->end_subject - eptr < 1)
3058 {
3059 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3060 MRRETURN(MATCH_NOMATCH);
3061 }
3062 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
3063 ecode += 2;
3064 }
3065 break;
3066
3067 /* Match a single character, caselessly */
3068
3069 case OP_CHARI:
3070 #ifdef SUPPORT_UTF
3071 if (utf)
3072 {
3073 length = 1;
3074 ecode++;
3075 GETCHARLEN(fc, ecode, length);
3076
3077 if (length > md->end_subject - eptr)
3078 {
3079 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3080 MRRETURN(MATCH_NOMATCH);
3081 }
3082
3083 /* If the pattern character's value is < 128, we have only one byte, and
3084 can use the fast lookup table. */
3085
3086 if (fc < 128)
3087 {
3088 if (md->lcc[fc]
3089 != TABLE_GET(*eptr, md->lcc, *eptr)) MRRETURN(MATCH_NOMATCH);
3090 ecode++;
3091 eptr++;
3092 }
3093
3094 /* Otherwise we must pick up the subject character */
3095
3096 else
3097 {
3098 unsigned int dc;
3099 GETCHARINC(dc, eptr);
3100 ecode += length;
3101
3102 /* If we have Unicode property support, we can use it to test the other
3103 case of the character, if there is one. */
3104
3105 if (fc != dc)
3106 {
3107 #ifdef SUPPORT_UCP
3108 if (dc != UCD_OTHERCASE(fc))
3109 #endif
3110 MRRETURN(MATCH_NOMATCH);
3111 }
3112 }
3113 }
3114 else
3115 #endif /* SUPPORT_UTF */
3116
3117 /* Not UTF mode */
3118 {
3119 if (md->end_subject - eptr < 1)
3120 {
3121 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3122 MRRETURN(MATCH_NOMATCH);
3123 }
3124 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3125 != TABLE_GET(*eptr, md->lcc, *eptr)) MRRETURN(MATCH_NOMATCH);
3126 eptr++;
3127 ecode += 2;
3128 }
3129 break;
3130
3131 /* Match a single character repeatedly. */
3132
3133 case OP_EXACT:
3134 case OP_EXACTI:
3135 min = max = GET2(ecode, 1);
3136 ecode += 1 + IMM2_SIZE;
3137 goto REPEATCHAR;
3138
3139 case OP_POSUPTO:
3140 case OP_POSUPTOI:
3141 possessive = TRUE;
3142 /* Fall through */
3143
3144 case OP_UPTO:
3145 case OP_UPTOI:
3146 case OP_MINUPTO:
3147 case OP_MINUPTOI:
3148 min = 0;
3149 max = GET2(ecode, 1);
3150 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3151 ecode += 1 + IMM2_SIZE;
3152 goto REPEATCHAR;
3153
3154 case OP_POSSTAR:
3155 case OP_POSSTARI:
3156 possessive = TRUE;
3157 min = 0;
3158 max = INT_MAX;
3159 ecode++;
3160 goto REPEATCHAR;
3161
3162 case OP_POSPLUS:
3163 case OP_POSPLUSI:
3164 possessive = TRUE;
3165 min = 1;
3166 max = INT_MAX;
3167 ecode++;
3168 goto REPEATCHAR;
3169
3170 case OP_POSQUERY:
3171 case OP_POSQUERYI:
3172 possessive = TRUE;
3173 min = 0;
3174 max = 1;
3175 ecode++;
3176 goto REPEATCHAR;
3177
3178 case OP_STAR:
3179 case OP_STARI:
3180 case OP_MINSTAR:
3181 case OP_MINSTARI:
3182 case OP_PLUS:
3183 case OP_PLUSI:
3184 case OP_MINPLUS:
3185 case OP_MINPLUSI:
3186 case OP_QUERY:
3187 case OP_QUERYI:
3188 case OP_MINQUERY:
3189 case OP_MINQUERYI:
3190 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3191 minimize = (c & 1) != 0;
3192 min = rep_min[c]; /* Pick up values from tables; */
3193 max = rep_max[c]; /* zero for max => infinity */
3194 if (max == 0) max = INT_MAX;
3195
3196 /* Common code for all repeated single-character matches. */
3197
3198 REPEATCHAR:
3199 #ifdef SUPPORT_UTF
3200 if (utf)
3201 {
3202 length = 1;
3203 charptr = ecode;
3204 GETCHARLEN(fc, ecode, length);
3205 ecode += length;
3206
3207 /* Handle multibyte character matching specially here. There is
3208 support for caseless matching if UCP support is present. */
3209
3210 if (length > 1)
3211 {
3212 #ifdef SUPPORT_UCP
3213 unsigned int othercase;
3214 if (op >= OP_STARI && /* Caseless */
3215 (othercase = UCD_OTHERCASE(fc)) != fc)
3216 oclength = PRIV(ord2utf)(othercase, occhars);
3217 else oclength = 0;
3218 #endif /* SUPPORT_UCP */
3219
3220 for (i = 1; i <= min; i++)
3221 {
3222 if (eptr <= md->end_subject - length &&
3223 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3224 #ifdef SUPPORT_UCP
3225 else if (oclength > 0 &&
3226 eptr <= md->end_subject - oclength &&
3227 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3228 #endif /* SUPPORT_UCP */
3229 else
3230 {
3231 CHECK_PARTIAL();
3232 MRRETURN(MATCH_NOMATCH);
3233 }
3234 }
3235
3236 if (min == max) continue;
3237
3238 if (minimize)
3239 {
3240 for (fi = min;; fi++)
3241 {
3242 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3244 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3245 if (eptr <= md->end_subject - length &&
3246 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3247 #ifdef SUPPORT_UCP
3248 else if (oclength > 0 &&
3249 eptr <= md->end_subject - oclength &&
3250 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3251 #endif /* SUPPORT_UCP */
3252 else
3253 {
3254 CHECK_PARTIAL();
3255 MRRETURN(MATCH_NOMATCH);
3256 }
3257 }
3258 /* Control never gets here */
3259 }
3260
3261 else /* Maximize */
3262 {
3263 pp = eptr;
3264 for (i = min; i < max; i++)
3265 {
3266 if (eptr <= md->end_subject - length &&
3267 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3268 #ifdef SUPPORT_UCP
3269 else if (oclength > 0 &&
3270 eptr <= md->end_subject - oclength &&
3271 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3272 #endif /* SUPPORT_UCP */
3273 else
3274 {
3275 CHECK_PARTIAL();
3276 break;
3277 }
3278 }
3279
3280 if (possessive) continue;
3281
3282 for(;;)
3283 {
3284 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3285 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3286 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3287 #ifdef SUPPORT_UCP
3288 eptr--;
3289 BACKCHAR(eptr);
3290 #else /* without SUPPORT_UCP */
3291 eptr -= length;
3292 #endif /* SUPPORT_UCP */
3293 }
3294 }
3295 /* Control never gets here */
3296 }
3297
3298 /* If the length of a UTF character is 1, we fall through here, and obey
3299 the code as for non-UTF characters below. In the 8-bit library the value
3300 of fc will then always be < 128; otherwise it may be larger. */
3301 }
3302 else
3303 #endif /* SUPPORT_UTF */
3304 /* When not in UTF-8 mode, load a single-byte character. */
3305 fc = *ecode++;
3306
3307 /* The value of fc at this point is always one character, though we may
3308 or may not be in UTF mode. The code is duplicated for the caseless and
3309 caseful cases, for speed, since matching characters is likely to be quite
3310 common. First, ensure the minimum number of matches are present. If min =
3311 max, continue at the same level without recursing. Otherwise, if
3312 minimizing, keep trying the rest of the expression and advancing one
3313 matching character if failing, up to the maximum. Alternatively, if
3314 maximizing, find the maximum number of characters and work backwards. */
3315
3316 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3317 max, eptr));
3318
3319 if (op >= OP_STARI) /* Caseless */
3320 {
3321 #ifdef COMPILE_PCRE8
3322 /* fc must be < 128 if UTF is enabled. */
3323 foc = md->fcc[fc];
3324 #else
3325 #ifdef SUPPORT_UTF
3326 #ifdef SUPPORT_UCP
3327 if (utf && fc > 127)
3328 foc = UCD_OTHERCASE(fc);
3329 #else
3330 if (utf && fc > 127)
3331 foc = fc;
3332 #endif /* SUPPORT_UCP */
3333 else
3334 #endif /* SUPPORT_UTF */
3335 foc = TABLE_GET(fc, md->fcc, fc);
3336 #endif /* COMPILE_PCRE8 */
3337
3338 for (i = 1; i <= min; i++)
3339 {
3340 if (eptr >= md->end_subject)
3341 {
3342 SCHECK_PARTIAL();
3343 MRRETURN(MATCH_NOMATCH);
3344 }
3345 if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
3346 eptr++;
3347 }
3348 if (min == max) continue;
3349 if (minimize)
3350 {
3351 for (fi = min;; fi++)
3352 {
3353 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3354 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3355 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3356 if (eptr >= md->end_subject)
3357 {
3358 SCHECK_PARTIAL();
3359 MRRETURN(MATCH_NOMATCH);
3360 }
3361 if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
3362 eptr++;
3363 }
3364 /* Control never gets here */
3365 }
3366 else /* Maximize */
3367 {
3368 pp = eptr;
3369 for (i = min; i < max; i++)
3370 {
3371 if (eptr >= md->end_subject)
3372 {
3373 SCHECK_PARTIAL();
3374 break;
3375 }
3376 if (fc != *eptr && foc != *eptr) break;
3377 eptr++;
3378 }
3379
3380 if (possessive) continue;
3381
3382 while (eptr >= pp)
3383 {
3384 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3385 eptr--;
3386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3387 }
3388 MRRETURN(MATCH_NOMATCH);
3389 }
3390 /* Control never gets here */
3391 }
3392
3393 /* Caseful comparisons (includes all multi-byte characters) */
3394
3395 else
3396 {
3397 for (i = 1; i <= min; i++)
3398 {
3399 if (eptr >= md->end_subject)
3400 {
3401 SCHECK_PARTIAL();
3402 MRRETURN(MATCH_NOMATCH);
3403 }
3404 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3405 }
3406
3407 if (min == max) continue;
3408
3409 if (minimize)
3410 {
3411 for (fi = min;; fi++)
3412 {
3413 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3414 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3415 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3416 if (eptr >= md->end_subject)
3417 {
3418 SCHECK_PARTIAL();
3419 MRRETURN(MATCH_NOMATCH);
3420 }
3421 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3422 }
3423 /* Control never gets here */
3424 }
3425 else /* Maximize */
3426 {
3427 pp = eptr;
3428 for (i = min; i < max; i++)
3429 {
3430 if (eptr >= md->end_subject)
3431 {
3432 SCHECK_PARTIAL();
3433 break;
3434 }
3435 if (fc != *eptr) break;
3436 eptr++;
3437 }
3438 if (possessive) continue;
3439
3440 while (eptr >= pp)
3441 {
3442 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3443 eptr--;
3444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445 }
3446 MRRETURN(MATCH_NOMATCH);
3447 }
3448 }
3449 /* Control never gets here */
3450
3451 /* Match a negated single character. The pattern character is read as one
3452 code unit, but the subject character we are checking can be multibyte. */
3453
3454 case OP_NOT:
3455 case OP_NOTI:
3456 if (eptr >= md->end_subject)
3457 {
3458 SCHECK_PARTIAL();
3459 MRRETURN(MATCH_NOMATCH);
3460 }
3461 ecode++;
3462 GETCHARINCTEST(c, eptr);
3463 if (op == OP_NOTI) /* The caseless case */
3464 {
3465 register int ch, och;
3466 ch = *ecode++;
3467 #ifdef COMPILE_PCRE8
3468 /* ch must be < 128 if UTF is enabled. */
3469 och = md->fcc[ch];
3470 #else
3471 #ifdef SUPPORT_UTF
3472 #ifdef SUPPORT_UCP
3473 if (utf && ch > 127)
3474 och = UCD_OTHERCASE(ch);
3475 #else
3476 if (utf && ch > 127)
3477 och = ch;
3478 #endif /* SUPPORT_UCP */
3479 else
3480 #endif /* SUPPORT_UTF */
3481 och = TABLE_GET(ch, md->fcc, ch);
3482 #endif /* COMPILE_PCRE8 */
3483 if (ch == c || och == c) MRRETURN(MATCH_NOMATCH);
3484 }
3485 else /* Caseful */
3486 {
3487 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3488 }
3489 break;
3490
3491 /* Match a negated single one-byte character repeatedly. This is almost a
3492 repeat of the code for a repeated single character, but I haven't found a
3493 nice way of commoning these up that doesn't require a test of the
3494 positive/negative option for each character match. Maybe that wouldn't add
3495 very much to the time taken, but character matching *is* what this is all
3496 about... */
3497
3498 case OP_NOTEXACT:
3499 case OP_NOTEXACTI:
3500 min = max = GET2(ecode, 1);
3501 ecode += 1 + IMM2_SIZE;
3502 goto REPEATNOTCHAR;
3503
3504 case OP_NOTUPTO:
3505 case OP_NOTUPTOI:
3506 case OP_NOTMINUPTO:
3507 case OP_NOTMINUPTOI:
3508 min = 0;
3509 max = GET2(ecode, 1);
3510 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3511 ecode += 1 + IMM2_SIZE;
3512 goto REPEATNOTCHAR;
3513
3514 case OP_NOTPOSSTAR:
3515 case OP_NOTPOSSTARI:
3516 possessive = TRUE;
3517 min = 0;
3518 max = INT_MAX;
3519 ecode++;
3520 goto REPEATNOTCHAR;
3521
3522 case OP_NOTPOSPLUS:
3523 case OP_NOTPOSPLUSI:
3524 possessive = TRUE;
3525 min = 1;
3526 max = INT_MAX;
3527 ecode++;
3528 goto REPEATNOTCHAR;
3529
3530 case OP_NOTPOSQUERY:
3531 case OP_NOTPOSQUERYI:
3532 possessive = TRUE;
3533 min = 0;
3534 max = 1;
3535 ecode++;
3536 goto REPEATNOTCHAR;
3537
3538 case OP_NOTPOSUPTO:
3539 case OP_NOTPOSUPTOI:
3540 possessive = TRUE;
3541 min = 0;
3542 max = GET2(ecode, 1);
3543 ecode += 1 + IMM2_SIZE;
3544 goto REPEATNOTCHAR;
3545
3546 case OP_NOTSTAR:
3547 case OP_NOTSTARI:
3548 case OP_NOTMINSTAR:
3549 case OP_NOTMINSTARI:
3550 case OP_NOTPLUS:
3551 case OP_NOTPLUSI:
3552 case OP_NOTMINPLUS:
3553 case OP_NOTMINPLUSI:
3554 case OP_NOTQUERY:
3555 case OP_NOTQUERYI:
3556 case OP_NOTMINQUERY:
3557 case OP_NOTMINQUERYI:
3558 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3559 minimize = (c & 1) != 0;
3560 min = rep_min[c]; /* Pick up values from tables; */
3561 max = rep_max[c]; /* zero for max => infinity */
3562 if (max == 0) max = INT_MAX;
3563
3564 /* Common code for all repeated single-byte matches. */
3565
3566 REPEATNOTCHAR:
3567 fc = *ecode++;
3568
3569 /* The code is duplicated for the caseless and caseful cases, for speed,
3570 since matching characters is likely to be quite common. First, ensure the
3571 minimum number of matches are present. If min = max, continue at the same
3572 level without recursing. Otherwise, if minimizing, keep trying the rest of
3573 the expression and advancing one matching character if failing, up to the
3574 maximum. Alternatively, if maximizing, find the maximum number of
3575 characters and work backwards. */
3576
3577 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3578 max, eptr));
3579
3580 if (op >= OP_NOTSTARI) /* Caseless */
3581 {
3582 #ifdef COMPILE_PCRE8
3583 /* fc must be < 128 if UTF is enabled. */
3584 foc = md->fcc[fc];
3585 #else
3586 #ifdef SUPPORT_UTF
3587 #ifdef SUPPORT_UCP
3588 if (utf && fc > 127)
3589 foc = UCD_OTHERCASE(fc);
3590 #else
3591 if (utf && fc > 127)
3592 foc = fc;
3593 #endif /* SUPPORT_UCP */
3594 else
3595 #endif /* SUPPORT_UTF */
3596 foc = TABLE_GET(fc, md->fcc, fc);
3597 #endif /* COMPILE_PCRE8 */
3598
3599 #ifdef SUPPORT_UTF
3600 if (utf)
3601 {
3602 register unsigned int d;
3603 for (i = 1; i <= min; i++)
3604 {
3605 if (eptr >= md->end_subject)
3606 {
3607 SCHECK_PARTIAL();
3608 MRRETURN(MATCH_NOMATCH);
3609 }
3610 GETCHARINC(d, eptr);
3611 if (fc == d || foc == d) MRRETURN(MATCH_NOMATCH);
3612 }
3613 }
3614 else
3615 #endif
3616 /* Not UTF mode */
3617 {
3618 for (i = 1; i <= min; i++)
3619 {
3620 if (eptr >= md->end_subject)
3621 {
3622 SCHECK_PARTIAL();
3623 MRRETURN(MATCH_NOMATCH);
3624 }
3625 if (fc == *eptr || foc == *eptr) MRRETURN(MATCH_NOMATCH);
3626 eptr++;
3627 }
3628 }
3629
3630 if (min == max) continue;
3631
3632 if (minimize)
3633 {
3634 #ifdef SUPPORT_UTF
3635 if (utf)
3636 {
3637 register unsigned int d;
3638 for (fi = min;; fi++)
3639 {
3640 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3642 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3643 if (eptr >= md->end_subject)
3644 {
3645 SCHECK_PARTIAL();
3646 MRRETURN(MATCH_NOMATCH);
3647 }
3648 GETCHARINC(d, eptr);
3649 if (fc == d || foc == d) MRRETURN(MATCH_NOMATCH);
3650 }
3651 }
3652 else
3653 #endif
3654 /* Not UTF mode */
3655 {
3656 for (fi = min;; fi++)
3657 {
3658 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3660 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3661 if (eptr >= md->end_subject)
3662 {
3663 SCHECK_PARTIAL();
3664 MRRETURN(MATCH_NOMATCH);
3665 }
3666 if (fc == *eptr || foc == *eptr) MRRETURN(MATCH_NOMATCH);
3667 eptr++;
3668 }
3669 }
3670 /* Control never gets here */
3671 }
3672
3673 /* Maximize case */
3674
3675 else
3676 {
3677 pp = eptr;
3678
3679 #ifdef SUPPORT_UTF
3680 if (utf)
3681 {
3682 register unsigned int d;
3683 for (i = min; i < max; i++)
3684 {
3685 int len = 1;
3686 if (eptr >= md->end_subject)
3687 {
3688 SCHECK_PARTIAL();
3689 break;
3690 }
3691 GETCHARLEN(d, eptr, len);
3692 if (fc == d || foc == d) break;
3693 eptr += len;
3694 }
3695 if (possessive) continue;
3696 for(;;)
3697 {
3698 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3699 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3700 if (eptr-- == pp) break; /* Stop if tried at original pos */
3701 BACKCHAR(eptr);
3702 }
3703 }
3704 else
3705 #endif
3706 /* Not UTF mode */
3707 {
3708 for (i = min; i < max; i++)
3709 {
3710 if (eptr >= md->end_subject)
3711 {
3712 SCHECK_PARTIAL();
3713 break;
3714 }
3715 if (fc == *eptr || foc == *eptr) break;
3716 eptr++;
3717 }
3718 if (possessive) continue;
3719 while (eptr >= pp)
3720 {
3721 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3723 eptr--;
3724 }
3725 }
3726
3727 MRRETURN(MATCH_NOMATCH);
3728 }
3729 /* Control never gets here */
3730 }
3731
3732 /* Caseful comparisons */
3733
3734 else
3735 {
3736 #ifdef SUPPORT_UTF
3737 if (utf)
3738 {
3739 register unsigned int d;
3740 for (i = 1; i <= min; i++)
3741 {
3742 if (eptr >= md->end_subject)
3743 {
3744 SCHECK_PARTIAL();
3745 MRRETURN(MATCH_NOMATCH);
3746 }
3747 GETCHARINC(d, eptr);
3748 if (fc == d) MRRETURN(MATCH_NOMATCH);
3749 }
3750 }
3751 else
3752 #endif
3753 /* Not UTF mode */
3754 {
3755 for (i = 1; i <= min; i++)
3756 {
3757 if (eptr >= md->end_subject)
3758 {
3759 SCHECK_PARTIAL();
3760 MRRETURN(MATCH_NOMATCH);
3761 }
3762 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3763 }
3764 }
3765
3766 if (min == max) continue;
3767
3768 if (minimize)
3769 {
3770 #ifdef SUPPORT_UTF
3771 if (utf)
3772 {
3773 register unsigned int d;
3774 for (fi = min;; fi++)
3775 {
3776 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3777 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3778 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3779 if (eptr >= md->end_subject)
3780 {
3781 SCHECK_PARTIAL();
3782 MRRETURN(MATCH_NOMATCH);
3783 }
3784 GETCHARINC(d, eptr);
3785 if (fc == d) MRRETURN(MATCH_NOMATCH);
3786 }
3787 }
3788 else
3789 #endif
3790 /* Not UTF mode */
3791 {
3792 for (fi = min;; fi++)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 MRRETURN(MATCH_NOMATCH);
3801 }
3802 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3803 }
3804 }
3805 /* Control never gets here */
3806 }
3807
3808 /* Maximize case */
3809
3810 else
3811 {
3812 pp = eptr;
3813
3814 #ifdef SUPPORT_UTF
3815 if (utf)
3816 {
3817 register unsigned int d;
3818 for (i = min; i < max; i++)
3819 {
3820 int len = 1;
3821 if (eptr >= md->end_subject)
3822 {
3823 SCHECK_PARTIAL();
3824 break;
3825 }
3826 GETCHARLEN(d, eptr, len);
3827 if (fc == d) break;
3828 eptr += len;
3829 }
3830 if (possessive) continue;
3831 for(;;)
3832 {
3833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3835 if (eptr-- == pp) break; /* Stop if tried at original pos */
3836 BACKCHAR(eptr);
3837 }
3838 }
3839 else
3840 #endif
3841 /* Not UTF mode */
3842 {
3843 for (i = min; i < max; i++)
3844 {
3845 if (eptr >= md->end_subject)
3846 {
3847 SCHECK_PARTIAL();
3848 break;
3849 }
3850 if (fc == *eptr) break;
3851 eptr++;
3852 }
3853 if (possessive) continue;
3854 while (eptr >= pp)
3855 {
3856 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858 eptr--;
3859 }
3860 }
3861
3862 MRRETURN(MATCH_NOMATCH);
3863 }
3864 }
3865 /* Control never gets here */
3866
3867 /* Match a single character type repeatedly; several different opcodes
3868 share code. This is very similar to the code for single characters, but we
3869 repeat it in the interests of efficiency. */
3870
3871 case OP_TYPEEXACT:
3872 min = max = GET2(ecode, 1);
3873 minimize = TRUE;
3874 ecode += 1 + IMM2_SIZE;
3875 goto REPEATTYPE;
3876
3877 case OP_TYPEUPTO:
3878 case OP_TYPEMINUPTO:
3879 min = 0;
3880 max = GET2(ecode, 1);
3881 minimize = *ecode == OP_TYPEMINUPTO;
3882 ecode += 1 + IMM2_SIZE;
3883 goto REPEATTYPE;
3884
3885 case OP_TYPEPOSSTAR:
3886 possessive = TRUE;
3887 min = 0;
3888 max = INT_MAX;
3889 ecode++;
3890 goto REPEATTYPE;
3891
3892 case OP_TYPEPOSPLUS:
3893 possessive = TRUE;
3894 min = 1;
3895 max = INT_MAX;
3896 ecode++;
3897 goto REPEATTYPE;
3898
3899 case OP_TYPEPOSQUERY:
3900 possessive = TRUE;
3901 min = 0;
3902 max = 1;
3903 ecode++;
3904 goto REPEATTYPE;
3905
3906 case OP_TYPEPOSUPTO:
3907 possessive = TRUE;
3908 min = 0;
3909 max = GET2(ecode, 1);
3910 ecode += 1 + IMM2_SIZE;
3911 goto REPEATTYPE;
3912
3913 case OP_TYPESTAR:
3914 case OP_TYPEMINSTAR:
3915 case OP_TYPEPLUS:
3916 case OP_TYPEMINPLUS:
3917 case OP_TYPEQUERY:
3918 case OP_TYPEMINQUERY:
3919 c = *ecode++ - OP_TYPESTAR;
3920 minimize = (c & 1) != 0;
3921 min = rep_min[c]; /* Pick up values from tables; */
3922 max = rep_max[c]; /* zero for max => infinity */
3923 if (max == 0) max = INT_MAX;
3924
3925 /* Common code for all repeated single character type matches. Note that
3926 in UTF mode, '.' and most other types can match a character of any length;
3927 only OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR need a value below 128. */
3928
3929 REPEATTYPE:
3930 ctype = *ecode++; /* Code for the character type */
3931
3932 #ifdef SUPPORT_UCP
3933 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3934 {
3935 prop_fail_result = ctype == OP_NOTPROP;
3936 prop_type = *ecode++;
3937 prop_value = *ecode++;
3938 }
3939 else prop_type = -1;
3940 #endif
3941
3942 /* First, ensure the minimum number of matches are present. Use inline
3943 code for maximizing the speed, and do the type test once at the start
3944 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3945 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3946 and single-bytes. */
3947
3948 if (min > 0)
3949 {
3950 #ifdef SUPPORT_UCP
3951 if (prop_type >= 0)
3952 {
3953 switch(prop_type)
3954 {
3955 case PT_ANY:
3956 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3957 for (i = 1; i <= min; i++)
3958 {
3959 if (eptr >= md->end_subject)
3960 {
3961 SCHECK_PARTIAL();
3962 MRRETURN(MATCH_NOMATCH);
3963 }
3964 GETCHARINCTEST(c, eptr);
3965 }
3966 break;
3967
3968 case PT_LAMP:
3969 for (i = 1; i <= min; i++)
3970 {
3971 int chartype;
3972 if (eptr >= md->end_subject)
3973 {
3974 SCHECK_PARTIAL();
3975 MRRETURN(MATCH_NOMATCH);
3976 }
3977 GETCHARINCTEST(c, eptr);
3978 chartype = UCD_CHARTYPE(c);
3979 if ((chartype == ucp_Lu ||
3980 chartype == ucp_Ll ||
3981 chartype == ucp_Lt) == prop_fail_result)
3982 MRRETURN(MATCH_NOMATCH);
3983 }
3984 break;
3985
3986 case PT_GC:
3987 for (i = 1; i <= min; i++)
3988 {
3989 if (eptr >= md->end_subject)
3990 {
3991 SCHECK_PARTIAL();
3992 MRRETURN(MATCH_NOMATCH);
3993 }
3994 GETCHARINCTEST(c, eptr);
3995 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3996 MRRETURN(MATCH_NOMATCH);
3997 }
3998 break;
3999
4000 case PT_PC:
4001 for (i = 1; i <= min; i++)
4002 {
4003 if (eptr >= md->end_subject)
4004 {
4005 SCHECK_PARTIAL();
4006 MRRETURN(MATCH_NOMATCH);
4007 }
4008 GETCHARINCTEST(c, eptr);
4009 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4010 MRRETURN(MATCH_NOMATCH);
4011 }
4012 break;
4013
4014 case PT_SC:
4015 for (i = 1; i <= min; i++)
4016 {
4017 if (eptr >= md->end_subject)
4018 {
4019 SCHECK_PARTIAL();
4020 MRRETURN(MATCH_NOMATCH);
4021 }
4022 GETCHARINCTEST(c, eptr);
4023 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4024 MRRETURN(MATCH_NOMATCH);
4025 }
4026 break;
4027
4028 case PT_ALNUM:
4029 for (i = 1; i <= min; i++)
4030 {
4031 int category;
4032 if (eptr >= md->end_subject)
4033 {
4034 SCHECK_PARTIAL();
4035 MRRETURN(MATCH_NOMATCH);
4036 }
4037 GETCHARINCTEST(c, eptr);
4038 category = UCD_CATEGORY(c);
4039 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4040 MRRETURN(MATCH_NOMATCH);
4041 }
4042 break;
4043
4044 case PT_SPACE: /* Perl space */
4045 for (i = 1; i <= min; i++)
4046 {
4047 if (eptr >= md->end_subject)
4048 {
4049 SCHECK_PARTIAL();
4050 MRRETURN(MATCH_NOMATCH);
4051 }
4052 GETCHARINCTEST(c, eptr);
4053 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4054 c == CHAR_FF || c == CHAR_CR)
4055 == prop_fail_result)
4056 MRRETURN(MATCH_NOMATCH);
4057 }
4058 break;
4059
4060 case PT_PXSPACE: /* POSIX space */
4061 for (i = 1; i <= min; i++)
4062 {
4063 if (eptr >= md->end_subject)
4064 {
4065 SCHECK_PARTIAL();
4066 MRRETURN(MATCH_NOMATCH);
4067 }
4068 GETCHARINCTEST(c, eptr);
4069 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4070 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4071 == prop_fail_result)
4072 MRRETURN(MATCH_NOMATCH);
4073 }
4074 break;
4075
4076 case PT_WORD:
4077 for (i = 1; i <= min; i++)
4078 {
4079 int category;
4080 if (eptr >= md->end_subject)
4081 {
4082 SCHECK_PARTIAL();
4083 MRRETURN(MATCH_NOMATCH);
4084 }
4085 GETCHARINCTEST(c, eptr);
4086 category = UCD_CATEGORY(c);
4087 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4088 == prop_fail_result)
4089 MRRETURN(MATCH_NOMATCH);
4090 }
4091 break;
4092
4093 /* This should not occur */
4094
4095 default:
4096 RRETURN(PCRE_ERROR_INTERNAL);
4097 }
4098 }
4099
4100 /* Match extended Unicode sequences. We will get here only if the
4101 support is in the binary; otherwise a compile-time error occurs. */
4102
4103 else if (ctype == OP_EXTUNI)
4104 {
4105 for (i = 1; i <= min; i++)
4106 {
4107 if (eptr >= md->end_subject)
4108 {
4109 SCHECK_PARTIAL();
4110 MRRETURN(MATCH_NOMATCH);
4111 }
4112 GETCHARINCTEST(c, eptr);
4113 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4114 while (eptr < md->end_subject)
4115 {
4116 int len = 1;
4117 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4118 if (UCD_CATEGORY(c) != ucp_M) break;
4119 eptr += len;
4120 }
4121 }
4122 }
4123
4124 else
4125 #endif /* SUPPORT_UCP */
4126
4127 /* Handle all other cases when the coding is UTF-8 */
4128
4129 #ifdef SUPPORT_UTF
4130 if (utf) switch(ctype)
4131 {
4132 case OP_ANY:
4133 for (i = 1; i <= min; i++)
4134 {
4135 if (eptr >= md->end_subject)
4136 {
4137 SCHECK_PARTIAL();
4138 MRRETURN(MATCH_NOMATCH);
4139 }
4140 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4141 eptr++;
4142 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4143 }
4144 break;
4145
4146 case OP_ALLANY:
4147 for (i = 1; i <= min; i++)
4148 {
4149 if (eptr >= md->end_subject)
4150 {
4151 SCHECK_PARTIAL();
4152 MRRETURN(MATCH_NOMATCH);
4153 }
4154 eptr++;
4155 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4156 }
4157 break;
4158
4159 case OP_ANYBYTE:
4160 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4161 eptr += min;
4162 break;
4163
4164 case OP_ANYNL:
4165 for (i = 1; i <= min; i++)
4166 {
4167 if (eptr >= md->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 MRRETURN(MATCH_NOMATCH);
4171 }
4172 GETCHARINC(c, eptr);
4173 switch(c)
4174 {
4175 default: MRRETURN(MATCH_NOMATCH);
4176
4177 case 0x000d:
4178 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4179 break;
4180
4181 case 0x000a:
4182 break;
4183
4184 case 0x000b:
4185 case 0x000c:
4186 case 0x0085:
4187 case 0x2028:
4188 case 0x2029:
4189 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4190 break;
4191 }
4192 }
4193 break;
4194
4195 case OP_NOT_HSPACE:
4196 for (i = 1; i <= min; i++)
4197 {
4198 if (eptr >= md->end_subject)
4199 {
4200 SCHECK_PARTIAL();
4201 MRRETURN(MATCH_NOMATCH);
4202 }
4203 GETCHARINC(c, eptr);
4204 switch(c)
4205 {
4206 default: break;
4207 case 0x09: /* HT */
4208 case 0x20: /* SPACE */
4209 case 0xa0: /* NBSP */
4210 case 0x1680: /* OGHAM SPACE MARK */
4211 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4212 case 0x2000: /* EN QUAD */
4213 case 0x2001: /* EM QUAD */
4214 case 0x2002: /* EN SPACE */
4215 case 0x2003: /* EM SPACE */
4216 case 0x2004: /* THREE-PER-EM SPACE */
4217 case 0x2005: /* FOUR-PER-EM SPACE */
4218 case 0x2006: /* SIX-PER-EM SPACE */
4219 case 0x2007: /* FIGURE SPACE */
4220 case 0x2008: /* PUNCTUATION SPACE */
4221 case 0x2009: /* THIN SPACE */
4222 case 0x200A: /* HAIR SPACE */
4223 case 0x202f: /* NARROW NO-BREAK SPACE */
4224 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4225 case 0x3000: /* IDEOGRAPHIC SPACE */
4226 MRRETURN(MATCH_NOMATCH);
4227 }
4228 }
4229 break;
4230
4231 case OP_HSPACE:
4232 for (i = 1; i <= min; i++)
4233 {
4234 if (eptr >= md->end_subject)
4235 {
4236 SCHECK_PARTIAL();
4237 MRRETURN(MATCH_NOMATCH);
4238 }
4239 GETCHARINC(c, eptr);
4240 switch(c)
4241 {
4242 default: MRRETURN(MATCH_NOMATCH);
4243 case 0x09: /* HT */
4244 case 0x20: /* SPACE */
4245 case 0xa0: /* NBSP */
4246 case 0x1680: /* OGHAM SPACE MARK */
4247 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4248 case 0x2000: /* EN QUAD */
4249 case 0x2001: /* EM QUAD */
4250 case 0x2002: /* EN SPACE */
4251 case 0x2003: /* EM SPACE */
4252 case 0x2004: /* THREE-PER-EM SPACE */
4253 case 0x2005: /* FOUR-PER-EM SPACE */
4254 case 0x2006: /* SIX-PER-EM SPACE */
4255 case 0x2007: /* FIGURE SPACE */
4256 case 0x2008: /* PUNCTUATION SPACE */
4257 case 0x2009: /* THIN SPACE */
4258 case 0x200A: /* HAIR SPACE */
4259 case 0x202f: /* NARROW NO-BREAK SPACE */
4260 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4261 case 0x3000: /* IDEOGRAPHIC SPACE */
4262 break;
4263 }
4264 }
4265 break;
4266
4267 case OP_NOT_VSPACE:
4268 for (i = 1; i <= min; i++)
4269 {
4270 if (eptr >= md->end_subject)
4271 {
4272 SCHECK_PARTIAL();
4273 MRRETURN(MATCH_NOMATCH);
4274 }
4275 GETCHARINC(c, eptr);
4276 switch(c)
4277 {
4278 default: break;
4279 case 0x0a: /* LF */
4280 case 0x0b: /* VT */
4281 case 0x0c: /* FF */
4282 case 0x0d: /* CR */
4283 case 0x85: /* NEL */
4284 case 0x2028: /* LINE SEPARATOR */
4285 case 0x2029: /* PARAGRAPH SEPARATOR */
4286 MRRETURN(MATCH_NOMATCH);
4287 }
4288 }
4289 break;
4290
4291 case OP_VSPACE:
4292 for (i = 1; i <= min; i++)
4293 {
4294 if (eptr >= md->end_subject)
4295 {
4296 SCHECK_PARTIAL();
4297 MRRETURN(MATCH_NOMATCH);
4298 }
4299 GETCHARINC(c, eptr);
4300 switch(c)
4301 {
4302 default: MRRETURN(MATCH_NOMATCH);
4303 case 0x0a: /* LF */
4304 case 0x0b: /* VT */
4305 case 0x0c: /* FF */
4306 case 0x0d: /* CR */
4307 case 0x85: /* NEL */
4308 case 0x2028: /* LINE SEPARATOR */
4309 case 0x2029: /* PARAGRAPH SEPARATOR */
4310 break;
4311 }
4312 }
4313 break;
4314
4315 case OP_NOT_DIGIT:
4316 for (i = 1; i <= min; i++)
4317 {
4318 if (eptr >= md->end_subject)
4319 {
4320 SCHECK_PARTIAL();
4321 MRRETURN(MATCH_NOMATCH);
4322 }
4323 GETCHARINC(c, eptr);
4324 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4325 MRRETURN(MATCH_NOMATCH);
4326 }
4327 break;
4328
4329 case OP_DIGIT:
4330 for (i = 1; i <= min; i++)
4331 {
4332 if (eptr >= md->end_subject)
4333 {
4334 SCHECK_PARTIAL();
4335 MRRETURN(MATCH_NOMATCH);
4336 }
4337 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4338 MRRETURN(MATCH_NOMATCH);
4339 /* No need to skip more bytes - we know it's a 1-byte character */
4340 }
4341 break;
4342
4343 case OP_NOT_WHITESPACE:
4344 for (i = 1; i <= min; i++)
4345 {
4346 if (eptr >= md->end_subject)
4347 {
4348 SCHECK_PARTIAL();
4349 MRRETURN(MATCH_NOMATCH);
4350 }
4351 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4352 MRRETURN(MATCH_NOMATCH);
4353 eptr++;
4354 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4355 }
4356 break;
4357
4358 case OP_WHITESPACE:
4359 for (i = 1; i <= min; i++)
4360 {
4361 if (eptr >= md->end_subject)
4362 {
4363 SCHECK_PARTIAL();
4364 MRRETURN(MATCH_NOMATCH);
4365 }
4366 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4367 MRRETURN(MATCH_NOMATCH);
4368 /* No need to skip more bytes - we know it's a 1-byte character */
4369 }
4370 break;
4371
4372 case OP_NOT_WORDCHAR:
4373 for (i = 1; i <= min; i++)
4374 {
4375 if (eptr >= md->end_subject)
4376 {
4377 SCHECK_PARTIAL();
4378 MRRETURN(MATCH_NOMATCH);
4379 }
4380 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4381 MRRETURN(MATCH_NOMATCH);
4382 eptr++;
4383 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4384 }
4385 break;
4386
4387 case OP_WORDCHAR:
4388 for (i = 1; i <= min; i++)
4389 {
4390 if (eptr >= md->end_subject)
4391 {
4392 SCHECK_PARTIAL();
4393 MRRETURN(MATCH_NOMATCH);
4394 }
4395 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4396 MRRETURN(MATCH_NOMATCH);
4397 /* No need to skip more bytes - we know it's a 1-byte character */
4398 }
4399 break;
4400
4401 default:
4402 RRETURN(PCRE_ERROR_INTERNAL);
4403 } /* End switch(ctype) */
4404
4405 else
4406 #endif /* SUPPORT_UTF */
4407
4408 /* Code for the non-UTF-8 case for minimum matching of operators other
4409 than OP_PROP and OP_NOTPROP. */
4410
4411 switch(ctype)
4412 {
4413 case OP_ANY:
4414 for (i = 1; i <= min; i++)
4415 {
4416 if (eptr >= md->end_subject)
4417 {
4418 SCHECK_PARTIAL();
4419 MRRETURN(MATCH_NOMATCH);
4420 }
4421 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4422 eptr++;
4423 }
4424 break;
4425
4426 case OP_ALLANY:
4427 if (eptr > md->end_subject - min)
4428 {
4429 SCHECK_PARTIAL();
4430 MRRETURN(MATCH_NOMATCH);
4431 }
4432 eptr += min;
4433 break;
4434
4435 case OP_ANYBYTE:
4436 if (eptr > md->end_subject - min)
4437 {
4438 SCHECK_PARTIAL();
4439 MRRETURN(MATCH_NOMATCH);
4440 }
4441 eptr += min;
4442 break;
4443
4444 case OP_ANYNL:
4445 for (i = 1; i <= min; i++)
4446 {
4447 if (eptr >= md->end_subject)
4448 {
4449 SCHECK_PARTIAL();
4450 MRRETURN(MATCH_NOMATCH);
4451 }
4452 switch(*eptr++)
4453 {
4454 default: MRRETURN(MATCH_NOMATCH);
4455
4456 case 0x000d:
4457 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4458 break;
4459
4460 case 0x000a:
4461 break;
4462
4463 case 0x000b:
4464 case 0x000c:
4465 case 0x0085:
4466 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4467 break;
4468 }
4469 }
4470 break;
4471
4472 case OP_NOT_HSPACE:
4473 for (i = 1; i <= min; i++)
4474 {
4475 if (eptr >= md->end_subject)
4476 {
4477 SCHECK_PARTIAL();
4478 MRRETURN(MATCH_NOMATCH);
4479 }
4480 switch(*eptr++)
4481 {
4482 default: break;
4483 case 0x09: /* HT */
4484 case 0x20: /* SPACE */
4485 case 0xa0: /* NBSP */
4486 MRRETURN(MATCH_NOMATCH);
4487 }
4488 }
4489 break;
4490
4491 case OP_HSPACE:
4492 for (i = 1; i <= min; i++)
4493 {
4494 if (eptr >= md->end_subject)
4495 {
4496 SCHECK_PARTIAL();
4497 MRRETURN(MATCH_NOMATCH);
4498 }
4499 switch(*eptr++)
4500 {
4501 default: MRRETURN(MATCH_NOMATCH);
4502 case 0x09: /* HT */
4503 case 0x20: /* SPACE */
4504 case 0xa0: /* NBSP */
4505 break;
4506 }
4507 }
4508 break;
4509
4510 case OP_NOT_VSPACE:
4511 for (i = 1; i <= min; i++)
4512 {
4513 if (eptr >= md->end_subject)
4514 {
4515 SCHECK_PARTIAL();
4516 MRRETURN(MATCH_NOMATCH);
4517 }
4518 switch(*eptr++)
4519 {
4520 default: break;
4521 case 0x0a: /* LF */
4522 case 0x0b: /* VT */
4523 case 0x0c: /* FF */
4524 case 0x0d: /* CR */
4525 case 0x85: /* NEL */
4526 MRRETURN(MATCH_NOMATCH);
4527 }
4528 }
4529 break;
4530
4531 case OP_VSPACE:
4532 for (i = 1; i <= min; i++)
4533 {
4534 if (eptr >= md->end_subject)
4535 {
4536 SCHECK_PARTIAL();
4537 MRRETURN(MATCH_NOMATCH);
4538 }
4539 switch(*eptr++)
4540 {
4541 default: MRRETURN(MATCH_NOMATCH);
4542 case 0x0a: /* LF */
4543 case 0x0b: /* VT */
4544 case 0x0c: /* FF */
4545 case 0x0d: /* CR */
4546 case 0x85: /* NEL */
4547 break;
4548 }
4549 }
4550 break;
4551
4552 case OP_NOT_DIGIT:
4553 for (i = 1; i <= min; i++)
4554 {
4555 if (eptr >= md->end_subject)
4556 {
4557 SCHECK_PARTIAL();
4558 MRRETURN(MATCH_NOMATCH);
4559 }
4560 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4561 }
4562 break;
4563
4564 case OP_DIGIT:
4565 for (i = 1; i <= min; i++)
4566 {
4567 if (eptr >= md->end_subject)
4568 {
4569 SCHECK_PARTIAL();
4570 MRRETURN(MATCH_NOMATCH);
4571 }
4572 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4573 }
4574 break;
4575
4576 case OP_NOT_WHITESPACE:
4577 for (i = 1; i <= min; i++)
4578 {
4579 if (eptr >= md->end_subject)
4580 {
4581 SCHECK_PARTIAL();
4582 MRRETURN(MATCH_NOMATCH);
4583 }
4584 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4585 }
4586 break;
4587
4588 case OP_WHITESPACE:
4589 for (i = 1; i <= min; i++)
4590 {
4591 if (eptr >= md->end_subject)
4592 {
4593 SCHECK_PARTIAL();
4594 MRRETURN(MATCH_NOMATCH);
4595 }
4596 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4597 }
4598 break;
4599
4600 case OP_NOT_WORDCHAR:
4601 for (i = 1; i <= min; i++)
4602 {
4603 if (eptr >= md->end_subject)
4604 {
4605 SCHECK_PARTIAL();
4606 MRRETURN(MATCH_NOMATCH);
4607 }
4608 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4609 MRRETURN(MATCH_NOMATCH);
4610 }
4611 break;
4612
4613 case OP_WORDCHAR:
4614 for (i = 1; i <= min; i++)
4615 {
4616 if (eptr >= md->end_subject)
4617 {
4618 SCHECK_PARTIAL();
4619 MRRETURN(MATCH_NOMATCH);
4620 }
4621 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4622 MRRETURN(MATCH_NOMATCH);
4623 }
4624 break;
4625
4626 default:
4627 RRETURN(PCRE_ERROR_INTERNAL);
4628 }
4629 }
4630
4631 /* If min = max, continue at the same level without recursing */
4632
4633 if (min == max) continue;
4634
4635 /* If minimizing, we have to test the rest of the pattern before each
4636 subsequent match. Again, separate the UTF-8 case for speed, and also
4637 separate the UCP cases. */
4638
4639 if (minimize)
4640 {
4641 #ifdef SUPPORT_UCP
4642 if (prop_type >= 0)
4643 {
4644 switch(prop_type)
4645 {
4646 case PT_ANY:
4647 for (fi = min;; fi++)
4648 {
4649 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4651 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4652 if (eptr >= md->end_subject)
4653 {
4654 SCHECK_PARTIAL();
4655 MRRETURN(MATCH_NOMATCH);
4656 }
4657 GETCHARINCTEST(c, eptr);
4658 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4659 }
4660 /* Control never gets here */
4661
4662 case PT_LAMP:
4663 for (fi = min;; fi++)
4664 {
4665 int chartype;
4666 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4668 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4669 if (eptr >= md->end_subject)
4670 {
4671 SCHECK_PARTIAL();
4672 MRRETURN(MATCH_NOMATCH);
4673 }
4674 GETCHARINCTEST(c, eptr);
4675 chartype = UCD_CHARTYPE(c);
4676 if ((chartype == ucp_Lu ||
4677 chartype == ucp_Ll ||
4678 chartype == ucp_Lt) == prop_fail_result)
4679 MRRETURN(MATCH_NOMATCH);
4680 }
4681 /* Control never gets here */
4682
4683 case PT_GC:
4684 for (fi = min;; fi++)
4685 {
4686 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4688 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4689 if (eptr >= md->end_subject)
4690 {
4691 SCHECK_PARTIAL();
4692 MRRETURN(MATCH_NOMATCH);
4693 }
4694 GETCHARINCTEST(c, eptr);
4695 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4696 MRRETURN(MATCH_NOMATCH);
4697 }
4698 /* Control never gets here */
4699
4700 case PT_PC:
4701 for (fi = min;; fi++)
4702 {
4703 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4704 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4705 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4706 if (eptr >= md->end_subject)
4707 {
4708 SCHECK_PARTIAL();
4709 MRRETURN(MATCH_NOMATCH);
4710 }
4711 GETCHARINCTEST(c, eptr);
4712 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4713 MRRETURN(MATCH_NOMATCH);
4714 }
4715 /* Control never gets here */
4716
4717 case PT_SC:
4718 for (fi = min;; fi++)
4719 {
4720 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4722 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4723 if (eptr >= md->end_subject)
4724 {
4725 SCHECK_PARTIAL();
4726 MRRETURN(MATCH_NOMATCH);
4727 }
4728 GETCHARINCTEST(c, eptr);
4729 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4730 MRRETURN(MATCH_NOMATCH);
4731 }
4732 /* Control never gets here */
4733
4734 case PT_ALNUM:
4735 for (fi = min;; fi++)
4736 {
4737 int category;
4738 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4739 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4740 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4741 if (eptr >= md->end_subject)
4742 {
4743 SCHECK_PARTIAL();
4744 MRRETURN(MATCH_NOMATCH);
4745 }
4746 GETCHARINCTEST(c, eptr);
4747 category = UCD_CATEGORY(c);
4748 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4749 MRRETURN(MATCH_NOMATCH);
4750 }
4751 /* Control never gets here */
4752
4753 case PT_SPACE: /* Perl space */
4754 for (fi = min;; fi++)
4755 {
4756 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4757 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4758 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4759 if (eptr >= md->end_subject)
4760 {
4761 SCHECK_PARTIAL();
4762 MRRETURN(MATCH_NOMATCH);
4763 }
4764 GETCHARINCTEST(c, eptr);
4765 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4766 c == CHAR_FF || c == CHAR_CR)
4767 == prop_fail_result)
4768 MRRETURN(MATCH_NOMATCH);
4769 }
4770 /* Control never gets here */
4771
4772 case PT_PXSPACE: /* POSIX space */
4773 for (fi = min;; fi++)
4774 {
4775 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4776 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4777 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4778 if (eptr >= md->end_subject)
4779 {
4780 SCHECK_PARTIAL();
4781 MRRETURN(MATCH_NOMATCH);
4782 }
4783 GETCHARINCTEST(c, eptr);
4784 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4785 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4786 == prop_fail_result)
4787 MRRETURN(MATCH_NOMATCH);
4788 }
4789 /* Control never gets here */
4790
4791 case PT_WORD:
4792 for (fi = min;; fi++)
4793 {
4794 int category;
4795 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4797 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4798 if (eptr >= md->end_subject)
4799 {
4800 SCHECK_PARTIAL();
4801 MRRETURN(MATCH_NOMATCH);
4802 }
4803 GETCHARINCTEST(c, eptr);
4804 category = UCD_CATEGORY(c);
4805 if ((category == ucp_L ||
4806 category == ucp_N ||
4807 c == CHAR_UNDERSCORE)
4808 == prop_fail_result)
4809 MRRETURN(MATCH_NOMATCH);
4810 }
4811 /* Control never gets here */
4812
4813 /* This should never occur */
4814
4815 default:
4816 RRETURN(PCRE_ERROR_INTERNAL);
4817 }
4818 }
4819
4820 /* Match extended Unicode sequences. We will get here only if the
4821 support is in the binary; otherwise a compile-time error occurs. */
4822
4823 else if (ctype == OP_EXTUNI)
4824 {
4825 for (fi = min;; fi++)
4826 {
4827 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4829 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4830 if (eptr >= md->end_subject)
4831 {
4832 SCHECK_PARTIAL();
4833 MRRETURN(MATCH_NOMATCH);
4834 }
4835 GETCHARINCTEST(c, eptr);
4836 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4837 while (eptr < md->end_subject)
4838 {
4839 int len = 1;
4840 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4841 if (UCD_CATEGORY(c) != ucp_M) break;
4842 eptr += len;
4843 }
4844 }
4845 }
4846 else
4847 #endif /* SUPPORT_UCP */
4848
4849 #ifdef SUPPORT_UTF
4850 if (utf)
4851 {
4852 for (fi = min;; fi++)
4853 {
4854 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4855 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4856 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4857 if (eptr >= md->end_subject)
4858 {
4859 SCHECK_PARTIAL();
4860 MRRETURN(MATCH_NOMATCH);
4861 }
4862 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4863 MRRETURN(MATCH_NOMATCH);
4864 GETCHARINC(c, eptr);
4865 switch(ctype)
4866 {
4867 case OP_ANY: /* This is the non-NL case */
4868 case OP_ALLANY:
4869 case OP_ANYBYTE:
4870 break;
4871
4872 case OP_ANYNL:
4873 switch(c)
4874 {
4875 default: MRRETURN(MATCH_NOMATCH);
4876 case 0x000d:
4877 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4878 break;
4879 case 0x000a:
4880 break;
4881
4882 case 0x000b:
4883 case 0x000c:
4884 case 0x0085:
4885 case 0x2028:
4886 case 0x2029:
4887 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4888 break;
4889 }
4890 break;
4891
4892 case OP_NOT_HSPACE:
4893 switch(c)
4894 {
4895 default: break;
4896 case 0x09: /* HT */
4897 case 0x20: /* SPACE */
4898 case 0xa0: /* NBSP */
4899 case 0x1680: /* OGHAM SPACE MARK */
4900 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4901 case 0x2000: /* EN QUAD */
4902 case 0x2001: /* EM QUAD */
4903 case 0x2002: /* EN SPACE */
4904 case 0x2003: /* EM SPACE */
4905 case 0x2004: /* THREE-PER-EM SPACE */
4906 case 0x2005: /* FOUR-PER-EM SPACE */
4907 case 0x2006: /* SIX-PER-EM SPACE */
4908 case 0x2007: /* FIGURE SPACE */
4909 case 0x2008: /* PUNCTUATION SPACE */
4910 case 0x2009: /* THIN SPACE */
4911 case 0x200A: /* HAIR SPACE */
4912 case 0x202f: /* NARROW NO-BREAK SPACE */
4913 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4914 case 0x3000: /* IDEOGRAPHIC SPACE */
4915 MRRETURN(MATCH_NOMATCH);
4916 }
4917 break;
4918
4919 case OP_HSPACE:
4920 switch(c)
4921 {
4922 default: MRRETURN(MATCH_NOMATCH);
4923 case 0x09: /* HT */
4924 case 0x20: /* SPACE */
4925 case 0xa0: /* NBSP */
4926 case 0x1680: /* OGHAM SPACE MARK */
4927 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4928 case 0x2000: /* EN QUAD */
4929 case 0x2001: /* EM QUAD */
4930 case 0x2002: /* EN SPACE */
4931 case 0x2003: /* EM SPACE */
4932 case 0x2004: /* THREE-PER-EM SPACE */
4933 case 0x2005: /* FOUR-PER-EM SPACE */
4934 case 0x2006: /* SIX-PER-EM SPACE */
4935 case 0x2007: /* FIGURE SPACE */
4936 case 0x2008: /* PUNCTUATION SPACE */
4937 case 0x2009: /* THIN SPACE */
4938 case 0x200A: /* HAIR SPACE */
4939 case 0x202f: /* NARROW NO-BREAK SPACE */
4940 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4941 case 0x3000: /* IDEOGRAPHIC SPACE */
4942 break;
4943 }
4944 break;
4945
4946 case OP_NOT_VSPACE:
4947 switch(c)
4948 {
4949 default: break;
4950 case 0x0a: /* LF */
4951 case 0x0b: /* VT */
4952 case 0x0c: /* FF */
4953 case 0x0d: /* CR */
4954 case 0x85: /* NEL */
4955 case 0x2028: /* LINE SEPARATOR */
4956 case 0x2029: /* PARAGRAPH SEPARATOR */
4957 MRRETURN(MATCH_NOMATCH);
4958 }
4959 break;
4960
4961 case OP_VSPACE:
4962 switch(c)
4963 {
4964 default: MRRETURN(MATCH_NOMATCH);
4965 case 0x0a: /* LF */
4966 case 0x0b: /* VT */
4967 case 0x0c: /* FF */
4968 case 0x0d: /* CR */
4969 case 0x85: /* NEL */
4970 case 0x2028: /* LINE SEPARATOR */
4971 case 0x2029: /* PARAGRAPH SEPARATOR */
4972 break;
4973 }
4974 break;
4975
4976 case OP_NOT_DIGIT:
4977 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4978 MRRETURN(MATCH_NOMATCH);
4979 break;
4980
4981 case OP_DIGIT:
4982 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4983 MRRETURN(MATCH_NOMATCH);
4984 break;
4985
4986 case OP_NOT_WHITESPACE:
4987 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4988 MRRETURN(MATCH_NOMATCH);
4989 break;
4990
4991 case OP_WHITESPACE:
4992 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4993 MRRETURN(MATCH_NOMATCH);
4994 break;
4995
4996 case OP_NOT_WORDCHAR:
4997 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4998 MRRETURN(MATCH_NOMATCH);
4999 break;
5000
5001 case OP_WORDCHAR:
5002 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5003 MRRETURN(MATCH_NOMATCH);
5004 break;
5005
5006 default:
5007 RRETURN(PCRE_ERROR_INTERNAL);
5008 }
5009 }
5010 }
5011 else
5012 #endif
5013 /* Not UTF mode */
5014 {
5015 for (fi = min;; fi++)
5016 {
5017 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5019 if (fi >= max) MRRETURN(MATCH_NOMATCH);
5020 if (eptr >= md->end_subject)
5021 {
5022 SCHECK_PARTIAL();
5023 MRRETURN(MATCH_NOMATCH);
5024 }
5025 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5026 MRRETURN(MATCH_NOMATCH);
5027 c = *eptr++;
5028 switch(ctype)
5029 {
5030 case OP_ANY: /* This is the non-NL case */
5031 case OP_ALLANY:
5032 case OP_ANYBYTE:
5033 break;
5034
5035 case OP_ANYNL:
5036 switch(c)
5037 {
5038 default: MRRETURN(MATCH_NOMATCH);
5039 case 0x000d:
5040 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5041 break;
5042
5043 case 0x000a:
5044 break;
5045
5046 case 0x000b:
5047 case 0x000c:
5048 case 0x0085:
5049 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
5050 break;
5051 }
5052 break;
5053
5054 case OP_NOT_HSPACE:
5055 switch(c)
5056 {
5057 default: break;
5058 case 0x09: /* HT */
5059 case 0x20: /* SPACE */
5060 case 0xa0: /* NBSP */
5061 MRRETURN(MATCH_NOMATCH);
5062 }
5063 break;
5064
5065 case OP_HSPACE:
5066 switch(c)
5067 {
5068 default: MRRETURN(MATCH_NOMATCH);
5069 case 0x09: /* HT */
5070 case 0x20: /* SPACE */
5071 case 0xa0: /* NBSP */
5072 break;
5073 }
5074 break;
5075
5076 case OP_NOT_VSPACE:
5077 switch(c)
5078 {
5079 default: break;
5080 case 0x0a: /* LF */
5081 case 0x0b: /* VT */
5082 case 0x0c: /* FF */
5083 case 0x0d: /* CR */
5084 case 0x85: /* NEL */
5085 MRRETURN(MATCH_NOMATCH);
5086 }
5087 break;
5088
5089 case OP_VSPACE:
5090 switch(c)
5091 {
5092 default: MRRETURN(MATCH_NOMATCH);
5093 case 0x0a: /* LF */
5094 case 0x0b: /* VT */
5095 case 0x0c: /* FF */
5096 case 0x0d: /* CR */
5097 case 0x85: /* NEL */
5098 break;
5099 }
5100 break;
5101
5102 case OP_NOT_DIGIT:
5103 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
5104 break;
5105
5106 case OP_DIGIT:
5107 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
5108 break;
5109
5110 case OP_NOT_WHITESPACE:
5111 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
5112 break;
5113
5114 case OP_WHITESPACE:
5115 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
5116 break;
5117
5118 case OP_NOT_WORDCHAR:
5119 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
5120 break;
5121
5122 case OP_WORDCHAR:
5123 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
5124 break;
5125
5126 default:
5127 RRETURN(PCRE_ERROR_INTERNAL);
5128 }
5129 }
5130 }
5131 /* Control never gets here */
5132 }
5133
5134 /* If maximizing, it is worth using inline code for speed, doing the type
5135 test once at the start (i.e. keep it out of the loop). Again, keep the
5136 UTF-8 and UCP stuff separate. */
5137
5138 else
5139 {
5140 pp = eptr; /* Remember where we started */
5141
5142 #ifdef SUPPORT_UCP
5143 if (prop_type >= 0)
5144 {
5145 switch(prop_type)
5146 {
5147 case PT_ANY:
5148 for (i = min; i < max; i++)
5149 {
5150 int len = 1;
5151 if (eptr >= md->end_subject)
5152 {
5153 SCHECK_PARTIAL();
5154 break;
5155 }
5156 GETCHARLENTEST(c, eptr, len);
5157 if (prop_fail_result) break;
5158 eptr+= len;
5159 }
5160 break;
5161
5162 case PT_LAMP:
5163 for (i = min; i < max; i++)
5164 {
5165 int chartype;
5166 int len = 1;
5167 if (eptr >= md->end_subject)
5168 {
5169 SCHECK_PARTIAL();
5170 break;
5171 }
5172 GETCHARLENTEST(c, eptr, len);
5173 chartype = UCD_CHARTYPE(c);
5174 if ((chartype == ucp_Lu ||
5175 chartype == ucp_Ll ||
5176 chartype == ucp_Lt) == prop_fail_result)
5177 break;
5178 eptr+= len;
5179 }
5180 break;
5181
5182 case PT_GC:
5183 for (i = min; i < max; i++)
5184 {
5185 int len = 1;
5186 if (eptr >= md->end_subject)
5187 {
5188 SCHECK_PARTIAL();
5189 break;
5190 }
5191 GETCHARLENTEST(c, eptr, len);
5192 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5193 eptr+= len;
5194 }
5195 break;
5196
5197 case PT_PC:
5198 for (i = min; i < max; i++)
5199 {
5200 int len = 1;
5201 if (eptr >= md->end_subject)
5202 {
5203 SCHECK_PARTIAL();
5204 break;
5205 }
5206 GETCHARLENTEST(c, eptr, len);
5207 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5208 eptr+= len;
5209 }
5210 break;
5211
5212 case PT_SC:
5213 for (i = min; i < max; i++)
5214 {
5215 int len = 1;
5216 if (eptr >= md->end_subject)
5217 {
5218 SCHECK_PARTIAL();
5219 break;
5220 }
5221 GETCHARLENTEST(c, eptr, len);
5222 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5223 eptr+= len;
5224 }
5225 break;
5226
5227 case PT_ALNUM:
5228 for (i = min; i < max; i++)
5229 {
5230 int category;
5231 int len = 1;
5232 if (eptr >= md->end_subject)
5233 {
5234 SCHECK_PARTIAL();
5235 break;
5236 }
5237 GETCHARLENTEST(c, eptr, len);
5238 category = UCD_CATEGORY(c);
5239 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5240 break;
5241 eptr+= len;
5242 }
5243 break;
5244
5245 case PT_SPACE: /* Perl space */
5246 for (i = min; i < max; i++)
5247 {
5248 int len = 1;
5249 if (eptr >= md->end_subject)
5250 {
5251 SCHECK_PARTIAL();
5252 break;
5253 }
5254 GETCHARLENTEST(c, eptr, len);
5255 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5256 c == CHAR_FF || c == CHAR_CR)
5257 == prop_fail_result)
5258 break;
5259 eptr+= len;
5260 }
5261 break;
5262
5263 case PT_PXSPACE: /* POSIX space */
5264 for (i = min; i < max; i++)
5265 {
5266 int len = 1;
5267 if (eptr >= md->end_subject)
5268 {
5269 SCHECK_PARTIAL();
5270 break;
5271 }
5272 GETCHARLENTEST(c, eptr, len);
5273 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5274 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5275 == prop_fail_result)
5276 break;
5277 eptr+= len;
5278 }
5279 break;
5280
5281 case PT_WORD:
5282 for (i = min; i < max; i++)
5283 {
5284 int category;
5285 int len = 1;
5286 if (eptr >= md->end_subject)
5287 {
5288 SCHECK_PARTIAL();
5289 break;
5290 }
5291 GETCHARLENTEST(c, eptr, len);
5292 category = UCD_CATEGORY(c);
5293 if ((category == ucp_L || category == ucp_N ||
5294 c == CHAR_UNDERSCORE) == prop_fail_result)
5295 break;
5296 eptr+= len;
5297 }
5298 break;
5299
5300 default:
5301 RRETURN(PCRE_ERROR_INTERNAL);
5302 }
5303
5304 /* eptr is now past the end of the maximum run */
5305
5306 if (possessive) continue;
5307 for(;;)
5308 {
5309 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5310 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5311 if (eptr-- == pp) break; /* Stop if tried at original pos */
5312 if (utf) BACKCHAR(eptr);
5313 }
5314 }
5315
5316 /* Match extended Unicode sequences. We will get here only if the
5317 support is in the binary; otherwise a compile-time error occurs. */
5318
5319 else if (ctype == OP_EXTUNI)
5320 {
5321 for (i = min; i < max; i++)
5322 {
5323 int len = 1;
5324 if (eptr >= md->end_subject)
5325 {
5326 SCHECK_PARTIAL();
5327 break;
5328 }
5329 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5330 if (UCD_CATEGORY(c) == ucp_M) break;
5331 eptr += len;
5332 while (eptr < md->end_subject)
5333 {
5334 len = 1;
5335 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5336 if (UCD_CATEGORY(c) != ucp_M) break;
5337 eptr += len;
5338 }
5339 }
5340
5341 /* eptr is now past the end of the maximum run */
5342
5343 if (possessive) continue;
5344
5345 for(;;)
5346 {
5347 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5348 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5349 if (eptr-- == pp) break; /* Stop if tried at original pos */
5350 for (;;) /* Move back over one extended */
5351 {
5352 if (!utf) c = *eptr; else
5353 {
5354 BACKCHAR(eptr);
5355 GETCHAR(c, eptr);
5356 }
5357 if (UCD_CATEGORY(c) != ucp_M) break;
5358 eptr--;
5359 }
5360 }
5361 }
5362
5363 else
5364 #endif /* SUPPORT_UCP */
5365
5366 #ifdef SUPPORT_UTF
5367 if (utf)
5368 {
5369 switch(ctype)
5370 {
5371 case OP_ANY:
5372 if (max < INT_MAX)
5373 {
5374 for (i = min; i < max; i++)
5375 {
5376 if (eptr >= md->end_subject)
5377 {
5378 SCHECK_PARTIAL();
5379 break;
5380 }
5381 if (IS_NEWLINE(eptr)) break;
5382 eptr++;
5383 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5384 }
5385 }
5386
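5387 /* Handle unlimited UTF-8 repeat. Note that this loop is identical to the bounded (max < INT_MAX) loop above, so the two branches behave the same. */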
5388
5389 else
5390 {
5391 for (i = min; i < max; i++)
5392 {
5393 if (eptr >= md->end_subject)
5394 {
5395 SCHECK_PARTIAL();
5396 break;
5397 }
5398 if (IS_NEWLINE(eptr)) break;
5399 eptr++;
5400 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5401 }
5402 }
5403 break;
5404
5405 case OP_ALLANY:
5406 if (max < INT_MAX)
5407 {
5408 for (i = min; i < max; i++)
5409 {
5410 if (eptr >= md->end_subject)
5411 {
5412 SCHECK_PARTIAL();
5413 break;
5414 }
5415 eptr++;
5416 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5417 }
5418 }
5419 else
5420 {
5421 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5422 SCHECK_PARTIAL();
5423 }
5424 break;
5425
5426 /* The byte case is the same as non-UTF8 */
5427
5428 case OP_ANYBYTE:
5429 c = max - min;
5430 if (c > (unsigned int)(md->end_subject - eptr))
5431 {
5432 eptr = md->end_subject;
5433 SCHECK_PARTIAL();
5434 }
5435 else eptr += c;
5436 break;
5437
5438 case OP_ANYNL:
5439 for (i = min; i < max; i++)
5440 {
5441 int len = 1;
5442 if (eptr >= md->end_subject)
5443 {
5444 SCHECK_PARTIAL();
5445 break;
5446 }
5447 GETCHARLEN(c, eptr, len);
5448 if (c == 0x000d)
5449 {
5450 if (++eptr >= md->end_subject) break;
5451 if (*eptr == 0x000a) eptr++;
5452 }
5453 else
5454 {
5455 if (c != 0x000a &&
5456 (md->bsr_anycrlf ||
5457 (c != 0x000b && c != 0x000c &&
5458 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5459 break;
5460 eptr += len;
5461 }
5462 }
5463 break;
5464
5465 case OP_NOT_HSPACE:
5466 case OP_HSPACE:
5467 for (i = min; i < max; i++)
5468 {
5469 BOOL gotspace;
5470 int len = 1;
5471 if (eptr >= md->end_subject)
5472 {
5473 SCHECK_PARTIAL();
5474 break;
5475 }
5476 GETCHARLEN(c, eptr, len);
5477 switch(c)
5478 {
5479 default: gotspace = FALSE; break;
5480 case 0x09: /* HT */
5481 case 0x20: /* SPACE */
5482 case 0xa0: /* NBSP */
5483 case 0x1680: /* OGHAM SPACE MARK */
5484 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5485 case 0x2000: /* EN QUAD */
5486 case 0x2001: /* EM QUAD */
5487 case 0x2002: /* EN SPACE */
5488 case 0x2003: /* EM SPACE */
5489 case 0x2004: /* THREE-PER-EM SPACE */
5490 case 0x2005: /* FOUR-PER-EM SPACE */
5491 case 0x2006: /* SIX-PER-EM SPACE */
5492 case 0x2007: /* FIGURE SPACE */
5493 case 0x2008: /* PUNCTUATION SPACE */
5494 case 0x2009: /* THIN SPACE */
5495 case 0x200A: /* HAIR SPACE */
5496 case 0x202f: /* NARROW NO-BREAK SPACE */
5497 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5498 case 0x3000: /* IDEOGRAPHIC SPACE */
5499 gotspace = TRUE;
5500 break;
5501 }
5502 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5503 eptr += len;
5504 }
5505 break;
5506
5507 case OP_NOT_VSPACE:
5508 case OP_VSPACE:
5509 for (i = min; i < max; i++)
5510 {
5511 BOOL gotspace;
5512 int len = 1;
5513 if (eptr >= md->end_subject)
5514 {
5515 SCHECK_PARTIAL();
5516 break;
5517 }
5518 GETCHARLEN(c, eptr, len);
5519 switch(c)
5520 {
5521 default: gotspace = FALSE; break;
5522 case 0x0a: /* LF */
5523 case 0x0b: /* VT */
5524 case 0x0c: /* FF */
5525 case 0x0d: /* CR */
5526 case 0x85: /* NEL */
5527 case 0x2028: /* LINE SEPARATOR */
5528 case 0x2029: /* PARAGRAPH SEPARATOR */
5529 gotspace = TRUE;
5530 break;
5531 }
5532 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5533 eptr += len;
5534 }
5535 break;
5536
5537 case OP_NOT_DIGIT:
5538 for (i = min; i < max; i++)
5539 {
5540 int len = 1;
5541 if (eptr >= md->end_subject)
5542 {
5543 SCHECK_PARTIAL();
5544 break;
5545 }
5546 GETCHARLEN(c, eptr, len);
5547 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5548 eptr+= len;
5549 }
5550 break;
5551
5552 case OP_DIGIT:
5553 for (i = min; i < max; i++)
5554 {
5555 int len = 1;
5556 if (eptr >= md->end_subject)
5557 {
5558 SCHECK_PARTIAL();
5559 break;
5560 }
5561 GETCHARLEN(c, eptr, len);
5562 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5563 eptr+= len;
5564 }
5565 break;
5566
5567 case OP_NOT_WHITESPACE:
5568 for (i = min; i < max; i++)
5569 {
5570 int len = 1;
5571 if (eptr >= md->end_subject)
5572 {
5573 SCHECK_PARTIAL();
5574 break;
5575 }
5576 GETCHARLEN(c, eptr, len);
5577 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5578 eptr+= len;
5579 }
5580 break;
5581
5582 case OP_WHITESPACE:
5583 for (i = min; i < max; i++)
5584 {
5585 int len = 1;
5586 if (eptr >= md->end_subject)
5587 {
5588 SCHECK_PARTIAL();
5589 break;
5590 }
5591 GETCHARLEN(c, eptr, len);
5592 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5593 eptr+= len;
5594 }
5595 break;
5596
5597 case OP_NOT_WORDCHAR:
5598 for (i = min; i < max; i++)
5599 {
5600 int len = 1;
5601 if (eptr >= md->end_subject)
5602 {
5603 SCHECK_PARTIAL();
5604 break;
5605 }
5606 GETCHARLEN(c, eptr, len);
5607 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5608 eptr+= len;
5609 }
5610 break;
5611
5612 case OP_WORDCHAR:
5613 for (i = min; i < max; i++)
5614 {
5615 int len = 1;
5616 if (eptr >= md->end_subject)
5617 {
5618 SCHECK_PARTIAL();
5619 break;
5620 }
5621 GETCHARLEN(c, eptr, len);
5622 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5623 eptr+= len;
5624 }
5625 break;
5626
5627 default:
5628 RRETURN(PCRE_ERROR_INTERNAL);
5629 }
5630
5631 /* eptr is now past the end of the maximum run. If possessive, we are
5632 done (no backing up). Otherwise, match at this position; anything other
5633 than no match is immediately returned. For nomatch, back up one
5634 character, unless we are matching \R and the last thing matched was
5635 \r\n, in which case, back up two bytes. */
5636
5637 if (possessive) continue;
5638 for(;;)
5639 {
5640 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5642 if (eptr-- == pp) break; /* Stop if tried at original pos */
5643 BACKCHAR(eptr);
5644 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5645 eptr[-1] == '\r') eptr--;
5646 }
5647 }
5648 else
5649 #endif /* SUPPORT_UTF */
5650 /* Not UTF mode */
5651 {
5652 switch(ctype)
5653 {
5654 case OP_ANY:
5655 for (i = min; i < max; i++)
5656 {
5657 if (eptr >= md->end_subject)
5658 {
5659 SCHECK_PARTIAL();
5660 break;
5661 }
5662 if (IS_NEWLINE(eptr)) break;
5663 eptr++;
5664 }
5665 break;
5666
5667 case OP_ALLANY:
5668 case OP_ANYBYTE:
5669 c = max - min;
5670 if (c > (unsigned int)(md->end_subject - eptr))
5671 {
5672 eptr = md->end_subject;
5673 SCHECK_PARTIAL();
5674 }
5675 else eptr += c;
5676 break;
5677
5678 case OP_ANYNL:
5679 for (i = min; i < max; i++)
5680 {
5681 if (eptr >= md->end_subject)
5682 {
5683 SCHECK_PARTIAL();
5684 break;
5685 }
5686 c = *eptr;
5687 if (c == 0x000d)
5688 {
5689 if (++eptr >= md->end_subject) break;
5690 if (*eptr == 0x000a) eptr++;
5691 }
5692 else
5693 {
5694 if (c != 0x000a &&
5695 (md->bsr_anycrlf ||
5696 (c != 0x000b && c != 0x000c && c != 0x0085)))
5697 break;
5698 eptr++;
5699 }
5700 }
5701 break;
5702
5703 case OP_NOT_HSPACE:
5704 for (i = min; i < max; i++)
5705 {
5706 if (eptr >= md->end_subject)
5707 {
5708 SCHECK_PARTIAL();
5709 break;
5710 }
5711 c = *eptr;
5712 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5713 eptr++;
5714 }
5715 break;
5716
5717 case OP_HSPACE:
5718 for (i = min; i < max; i++)
5719 {
5720 if (eptr >= md->end_subject)
5721 {
5722 SCHECK_PARTIAL();
5723 break;
5724 }
5725 c = *eptr;
5726 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5727 eptr++;
5728 }
5729 break;
5730
5731 case OP_NOT_VSPACE:
5732 for (i = min; i < max; i++)
5733 {
5734 if (eptr >= md->end_subject)
5735 {
5736 SCHECK_PARTIAL();
5737 break;
5738 }
5739 c = *eptr;
5740 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5741 break;
5742 eptr++;
5743 }
5744 break;
5745
5746 case OP_VSPACE:
5747 for (i = min; i < max; i++)
5748 {
5749 if (eptr >= md->end_subject)
5750 {
5751 SCHECK_PARTIAL();
5752 break;
5753 }
5754 c = *eptr;
5755 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5756 break;
5757 eptr++;
5758 }
5759 break;
5760
5761 case OP_NOT_DIGIT:
5762 for (i = min; i < max; i++)
5763 {
5764 if (eptr >= md->end_subject)
5765 {
5766 SCHECK_PARTIAL();
5767 break;
5768 }
5769 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5770 eptr++;
5771 }
5772 break;
5773
5774 case OP_DIGIT:
5775 for (i = min; i < max; i++)
5776 {
5777 if (eptr >= md->end_subject)
5778 {
5779 SCHECK_PARTIAL();
5780 break;
5781 }
5782 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5783 eptr++;
5784 }
5785 break;
5786
5787 case OP_NOT_WHITESPACE:
5788 for (i = min; i < max; i++)
5789 {
5790 if (eptr >= md->end_subject)
5791 {
5792 SCHECK_PARTIAL();
5793 break;
5794 }
5795 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5796 eptr++;
5797 }
5798 break;
5799
5800 case OP_WHITESPACE:
5801 for (i = min; i < max; i++)
5802 {
5803 if (eptr >= md->end_subject)
5804 {
5805 SCHECK_PARTIAL();
5806 break;
5807 }
5808 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5809 eptr++;
5810 }
5811 break;
5812
5813 case OP_NOT_WORDCHAR:
5814 for (i = min; i < max; i++)
5815 {
5816 if (eptr >= md->end_subject)
5817 {
5818 SCHECK_PARTIAL();
5819 break;
5820 }
5821 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5822 eptr++;
5823 }
5824 break;
5825
5826 case OP_WORDCHAR:
5827 for (i = min; i < max; i++)
5828 {
5829 if (eptr >= md->end_subject)
5830 {
5831 SCHECK_PARTIAL();
5832 break;
5833 }
5834 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5835 eptr++;
5836 }
5837 break;
5838
5839 default:
5840 RRETURN(PCRE_ERROR_INTERNAL);
5841 }
5842
5843 /* eptr is now past the end of the maximum run. If possessive, we are
5844 done (no backing up). Otherwise, match at this position; anything other
5845 than no match is immediately returned. For nomatch, back up one
5846 character (byte), unless we are matching \R and the last thing matched
5847 was \r\n, in which case, back up two bytes. */
5848
5849 if (possessive) continue;
5850 while (eptr >= pp)
5851 {
5852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5854 eptr--;
5855 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5856 eptr[-1] == '\r') eptr--;
5857 }
5858 }
5859
5860 /* Get here if we can't make it match with any permitted repetitions */
5861
5862 MRRETURN(MATCH_NOMATCH);
5863 }
5864 /* Control never gets here */
5865
5866 /* There's been some horrible disaster. Arrival here can only mean there is
5867 something seriously wrong in the code above or the OP_xxx definitions. */
5868
5869 default:
5870 DPRINTF(("Unknown opcode %d\n", *ecode));
5871 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5872 }
5873
5874 /* Do not stick any code in here without much thought; it is assumed
5875 that "continue" in the code above comes out to here to repeat the main
5876 loop. */
5877
5878 } /* End of main loop */
5879 /* Control never reaches here */
5880
5881
5882 /* When compiling to use the heap rather than the stack for recursive calls to
5883 match(), the RRETURN() macro jumps here. The number that is saved in
5884 frame->Xwhere indicates which label we actually want to return to. */
5885
5886 #ifdef NO_RECURSE
5887 #define LBL(val) case val: goto L_RM##val;
5888 HEAP_RETURN:
5889 switch (frame->Xwhere)
5890 {
5891 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5892 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5893 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5894 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5895 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5896 LBL(65) LBL(66)
5897 #ifdef SUPPORT_UTF
5898 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5899 LBL(32) LBL(34) LBL(42) LBL(46)
5900 #ifdef SUPPORT_UCP
5901 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5902 LBL(59) LBL(60) LBL(61) LBL(62)
5903 #endif /* SUPPORT_UCP */
5904 #endif /* SUPPORT_UTF */
5905 default:
5906 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5907 return PCRE_ERROR_INTERNAL;
5908 }
5909 #undef LBL
5910 #endif /* NO_RECURSE */
5911 }
5912
5913
5914 /***************************************************************************
5915 ****************************************************************************
5916 RECURSION IN THE match() FUNCTION
5917
5918 Undefine all the macros that were defined above to handle this. */
5919
5920 #ifdef NO_RECURSE /* remove the macros that exist only in this build so the names are ordinary identifiers again */
5921 #undef eptr
5922 #undef ecode
5923 #undef mstart
5924 #undef offset_top
5925 #undef eptrb
5926 #undef flags /* reused below as a local variable name */
5927
5928 #undef callpat
5929 #undef charptr
5930 #undef data
5931 #undef next
5932 #undef pp /* reused below as a local variable name */
5933 #undef prev
5934 #undef saved_eptr
5935
5936 #undef new_recursive
5937
5938 #undef cur_is_word
5939 #undef condition
5940 #undef prev_is_word
5941
5942 #undef ctype
5943 #undef length /* also the name of a parameter of the exec function below */
5944 #undef max
5945 #undef min
5946 #undef number
5947 #undef offset
5948 #undef op
5949 #undef save_capture_last
5950 #undef save_offset1
5951 #undef save_offset2
5952 #undef save_offset3
5953 #undef stacksave
5954
5955 #undef newptrb
5956
5957 #endif /* NO_RECURSE */
5958
5959 /* These two are defined as macros in both cases (with and without
5960 NO_RECURSE), so they are undefined unconditionally. */
5961 #undef fc
5962 #undef fi
5963
5964 /***************************************************************************
5965 ***************************************************************************/
5966
5967
5968
5969 /*************************************************
5970 * Execute a Regular Expression *
5971 *************************************************/
5972
5973 /* This function applies a compiled re to a subject string and picks out
5974 portions of the string if it matches. Two elements in the vector are set for
5975 each substring: the offsets to the start and end of the substring.
5976
5977 Arguments:
5978 argument_re points to the compiled expression
5979 extra_data points to extra data or is NULL
5980 subject points to the subject string
5981 length length of subject string (may contain binary zeros)
5982 start_offset where to start in the subject string
5983 options option bits
5984 offsets points to a vector of ints to be filled in with offsets
5985 offsetcount the number of elements in the vector
5986
5987 Returns: > 0 => success; value is the number of elements filled in
5988 = 0 => success, but offsets is not big enough
5989 -1 => failed to match
5990 < -1 => some kind of unexpected problem
5991 */
5992
5993 #ifdef COMPILE_PCRE8
5994 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5995 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5996 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5997 int offsetcount)
5998 #else
5999 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6000 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
6001 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6002 int offsetcount)
6003 #endif
6004 {
6005 int rc, ocount, arg_offset_max;
6006 int newline;
6007 BOOL using_temporary_offsets = FALSE;
6008 BOOL anchored;
6009 BOOL startline;
6010 BOOL firstline;
6011 BOOL utf;
6012 BOOL has_first_char = FALSE;
6013 BOOL has_req_char = FALSE;
6014 pcre_uchar first_char = 0;
6015 pcre_uchar first_char2 = 0;
6016 pcre_uchar req_char = 0;
6017 pcre_uchar req_char2 = 0;
6018 match_data match_block;
6019 match_data *md = &match_block;
6020 const pcre_uint8 *tables;
6021 const pcre_uint8 *start_bits = NULL;
6022 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6023 PCRE_PUCHAR end_subject;
6024 PCRE_PUCHAR start_partial = NULL;
6025 PCRE_PUCHAR req_char_ptr = start_match - 1;
6026
6027 pcre_study_data internal_study;
6028 const pcre_study_data *study;
6029
6030 real_pcre internal_re;
6031 const real_pcre *external_re = (const real_pcre *)argument_re;
6032 const real_pcre *re = external_re;
6033
6034 /* Plausibility checks */
6035
6036 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6037 if (re == NULL || subject == NULL ||
6038 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6039 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6040 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6041
6042 /* These two settings are used in the code for checking a UTF-8 string that
6043 follows immediately afterwards. Other values in the md block are used only
6044 during "normal" pcre_exec() processing, not when the JIT support is in use,
6045 so they are set up later. */
6046
6047 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6048 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6049 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6050 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6051
6052 /* Check a UTF-8 string if required. Pass back the character offset and error
6053 code for an invalid string if a results vector is available. */
6054
6055 #ifdef SUPPORT_UTF
6056 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6057 {
6058 int erroroffset;
6059 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6060 if (errorcode != 0)
6061 {
6062 if (offsetcount >= 2)
6063 {
6064 offsets[0] = erroroffset;
6065 offsets[1] = errorcode;
6066 }
6067 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6068 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6069 }
6070
6071 /* Check that a start_offset points to the start of a UTF character. */
6072 if (start_offset > 0 && start_offset < length &&
6073 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6074 return PCRE_ERROR_BADUTF8_OFFSET;
6075 }
6076 #endif
6077
6078 /* If the pattern was successfully studied with JIT support, run the JIT
6079 executable instead of the rest of this function. Most options must be set at
6080 compile time for the JIT code to be usable. Fallback to the normal code path if
6081 an unsupported flag is set. In particular, JIT does not support partial
6082 matching. */
6083
6084 #ifdef SUPPORT_JIT
6085 if (extra_data != NULL
6086 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6087 && extra_data->executable_jit != NULL
6088 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6089 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6090 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6091 return PRIV(jit_exec)(re, extra_data->executable_jit,
6092 (const pcre_uchar *)subject, length, start_offset, options,
6093 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6094 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6095 #endif
6096
6097 /* Carry on with non-JIT matching. This information is for finding all the
6098 numbers associated with a given name, for condition testing. */
6099
6100 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6101 md->name_count = re->name_count;
6102 md->name_entry_size = re->name_entry_size;
6103
6104 /* Fish out the optional data from the extra_data structure, first setting
6105 the default values. */
6106
6107 study = NULL;
6108 md->match_limit = MATCH_LIMIT;
6109 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6110 md->callout_data = NULL;
6111
6112 /* The table pointer is always in native byte order. */
6113
6114 tables = external_re->tables;
6115
6116 if (extra_data != NULL)
6117 {
6118 register unsigned int flags = extra_data->flags;
6119 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6120 study = (const pcre_study_data *)extra_data->study_data;
6121 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6122 md->match_limit = extra_data->match_limit;
6123 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6124 md->match_limit_recursion = extra_data->match_limit_recursion;
6125 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6126 md->callout_data = extra_data->callout_data;
6127 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6128 }
6129
6130 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6131 is a feature that makes it possible to save compiled regexes and re-use them
6132 in other programs later. */
6133
6134 if (tables == NULL) tables = PRIV(default_tables);
6135
6136 /* Check that the first field in the block is the magic number. If it is not,
6137 test for a regex that was compiled on a host of opposite endianness. If this is
6138 the case, flipped values are put in internal_re and internal_study if there was
6139 study data too. */
6140
6141 if (re->magic_number != MAGIC_NUMBER)
6142 {
6143 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
6144 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6145 if (study != NULL) study = &internal_study;
6146 }
6147 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6148
6149 /* Set up other data */
6150
6151 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6152 startline = (re->flags & PCRE_STARTLINE) != 0;
6153 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6154
6155 /* The code starts after the real_pcre block and the capture name table. */
6156
6157 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6158 re->name_count * re->name_entry_size;
6159
6160 md->start_subject = (PCRE_PUCHAR)subject;
6161 md->start_offset = start_offset;
6162 md->end_subject = md->start_subject + length;
6163 end_subject = md->end_subject;
6164
6165 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6166 md->use_ucp = (re->options & PCRE_UCP) != 0;
6167 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6168
6169 /* Some options are unpacked into BOOL variables in the hope that testing
6170 them will be faster than individual option bits. */
6171
6172 md->notbol = (options & PCRE_NOTBOL) != 0;
6173 md->noteol = (options & PCRE_NOTEOL) != 0;
6174 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6175 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6176
6177 md->hitend = FALSE;
6178 md->mark = NULL; /* In case never set */
6179
6180 md->recursive = NULL; /* No recursion at top level */
6181 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6182
6183 md->lcc = tables + lcc_offset;
6184 md->fcc = tables + fcc_offset;
6185 md->ctypes = tables + ctypes_offset;
6186
6187 /* Handle different \R options. */
6188
6189 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6190 {
6191 case 0:
6192 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6193 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6194 else
6195 #ifdef BSR_ANYCRLF
6196 md->bsr_anycrlf = TRUE;
6197 #else
6198 md->bsr_anycrlf = FALSE;
6199 #endif
6200 break;
6201
6202 case PCRE_BSR_ANYCRLF:
6203 md->bsr_anycrlf = TRUE;
6204 break;
6205
6206 case PCRE_BSR_UNICODE:
6207 md->bsr_anycrlf = FALSE;
6208 break;
6209
6210 default: return PCRE_ERROR_BADNEWLINE;
6211 }
6212
6213 /* Handle different types of newline. The three bits give eight cases. If
6214 nothing is set at run time, whatever was used at compile time applies. */
6215
6216 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6217 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6218 {
6219 case 0: newline = NEWLINE; break; /* Compile-time default */
6220 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6221 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6222 case PCRE_NEWLINE_CR+
6223 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6224 case PCRE_NEWLINE_ANY: newline = -1; break;
6225 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6226 default: return PCRE_ERROR_BADNEWLINE;
6227 }
6228
6229 if (newline == -2)
6230 {
6231 md->nltype = NLTYPE_ANYCRLF;
6232 }
6233 else if (newline < 0)
6234 {
6235 md->nltype = NLTYPE_ANY;
6236 }
6237 else
6238 {
6239 md->nltype = NLTYPE_FIXED;
6240 if (newline > 255)
6241 {
6242 md->nllen = 2;
6243 md->nl[0] = (newline >> 8) & 255;
6244 md->nl[1] = newline & 255;
6245 }
6246 else
6247 {
6248 md->nllen = 1;
6249 md->nl[0] = newline;
6250 }
6251 }
6252
6253 /* Partial matching was originally supported only for a restricted set of
6254 regexes; from release 8.00 there are no restrictions, but the bits are still
6255 defined (though never set). So there's no harm in leaving this code. */
6256
6257 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6258 return PCRE_ERROR_BADPARTIAL;
6259
6260 /* If the expression has got more back references than the offsets supplied can
6261 hold, we get a temporary chunk of working store to use during the matching.
6262 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6263 of 3. */
6264
6265 ocount = offsetcount - (offsetcount % 3);
6266 arg_offset_max = (2*ocount)/3;
6267
6268 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6269 {
6270 ocount = re->top_backref * 3 + 3;
6271 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6272 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6273 using_temporary_offsets = TRUE;
6274 DPRINTF(("Got memory to hold back references\n"));
6275 }
6276 else md->offset_vector = offsets;
6277
6278 md->offset_end = ocount;
6279 md->offset_max = (2*ocount)/3;
6280 md->offset_overflow = FALSE;
6281 md->capture_last = -1;
6282
6283 /* Reset the working variable associated with each extraction. These should
6284 never be used unless previously set, but they get saved and restored, and so we
6285 initialize them to avoid reading uninitialized locations. Also, unset the
6286 offsets for the matched string. This is really just for tidiness with callouts,
6287 in case they inspect these fields. */
6288
6289 if (md->offset_vector != NULL)
6290 {
6291 register int *iptr = md->offset_vector + ocount;
6292 register int *iend = iptr - re->top_bracket;
6293 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6294 while (--iptr >= iend) *iptr = -1;
6295 md->offset_vector[0] = md->offset_vector[1] = -1;
6296 }
6297
6298 /* Set up the first character to match, if available. The first_char value is
6299 never set for an anchored regular expression, but the anchoring may be forced
6300 at run time, so we have to test for anchoring. The first char may be unset for
6301 an unanchored pattern, of course. If there's no first char and the pattern was
6302 studied, there may be a bitmap of possible first characters. */
6303
6304 if (!anchored)
6305 {
6306 if ((re->flags & PCRE_FIRSTSET) != 0)
6307 {
6308 has_first_char = TRUE;
6309 first_char = first_char2 = re->first_char;
6310 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6311 {
6312 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6313 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6314 if (utf && first_char > 127)
6315 first_char2 = UCD_OTHERCASE(first_char);
6316 #endif
6317 }
6318 }
6319 else
6320 if (!startline && study != NULL &&
6321 (study->flags & PCRE_STUDY_MAPPED) != 0)
6322 start_bits = study->start_bits;
6323 }
6324
6325 /* For anchored or unanchored matches, there may be a "last known required
6326 character" set. */
6327
6328 if ((re->flags & PCRE_REQCHSET) != 0)
6329 {
6330 has_req_char = TRUE;
6331 req_char = req_char2 = re->req_char;
6332 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6333 {
6334 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6335 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6336 if (utf && req_char > 127)
6337 req_char2 = UCD_OTHERCASE(req_char);
6338 #endif
6339 }
6340 }
6341
6342
6343 /* ==========================================================================*/
6344
6345 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6346 the loop runs just once. */
6347
6348 for(;;)
6349 {
6350 PCRE_PUCHAR save_end_subject = end_subject;
6351 PCRE_PUCHAR new_start_match;
6352
6353 /* If firstline is TRUE, the start of the match is constrained to the first
6354 line of a multiline string. That is, the match must be before or at the first
6355 newline. Implement this by temporarily adjusting end_subject so that we stop
6356 scanning at a newline. If the match fails at the newline, later code breaks
6357 this loop. */
6358
6359 if (firstline)
6360 {
6361 PCRE_PUCHAR t = start_match;
6362 #ifdef SUPPORT_UTF
6363 if (utf)
6364 {
6365 while (t < md->end_subject && !IS_NEWLINE(t))
6366 {
6367 t++;
6368 ACROSSCHAR(t < end_subject, *t, t++);
6369 }
6370 }
6371 else
6372 #endif
6373 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6374 end_subject = t;
6375 }
6376
6377 /* There are some optimizations that avoid running the match if a known
6378 starting point is not found, or if a known later character is not present.
6379 However, there is an option that disables these, for testing and for ensuring
6380 that all callouts do actually occur. The option can be set in the regex by
6381 (*NO_START_OPT) or passed in match-time options. */
6382
6383 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6384 {
6385 /* Advance to a unique first char if there is one. */
6386
6387 if (has_first_char)
6388 {
6389 if (first_char != first_char2)
6390 while (start_match < end_subject &&
6391 *start_match != first_char && *start_match != first_char2)
6392 start_match++;
6393 else
6394 while (start_match < end_subject && *start_match != first_char)
6395 start_match++;
6396 }
6397
6398 /* Or to just after a linebreak for a multiline match */
6399
6400 else if (startline)
6401 {
6402 if (start_match > md->start_subject + start_offset)
6403 {
6404 #ifdef SUPPORT_UTF
6405 if (utf)
6406 {
6407 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6408 {
6409 start_match++;
6410 ACROSSCHAR(start_match < end_subject, *start_match,
6411 start_match++);
6412 }
6413 }
6414 else
6415 #endif
6416 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6417 start_match++;
6418
6419 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6420 and we are now at a LF, advance the match position by one more character.
6421 */
6422
6423 if (start_match[-1] == CHAR_CR &&
6424 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6425 start_match < end_subject &&
6426 *start_match == CHAR_NL)
6427 start_match++;
6428 }
6429 }
6430
6431 /* Or to a non-unique first byte after study */
6432
6433 else if (start_bits != NULL)
6434 {
6435 while (start_match < end_subject)
6436 {
6437 register unsigned int c = *start_match;
6438 #ifndef COMPILE_PCRE8
6439 if (c > 255) c = 255;
6440 #endif
6441 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6442 {
6443 start_match++;
6444 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6445 /* In non 8-bit mode, the iteration will stop for
6446 characters > 255 at the beginning or not stop at all. */
6447 if (utf)
6448 ACROSSCHAR(start_match < end_subject, *start_match,
6449 start_match++);
6450 #endif
6451 }
6452 else break;
6453 }
6454 }
6455 } /* Starting optimizations */
6456
6457 /* Restore fudged end_subject */
6458
6459 end_subject = save_end_subject;
6460
6461 /* The following two optimizations are disabled for partial matching or if
6462 disabling is explicitly requested. */
6463
6464 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6465 {
6466 /* If the pattern was studied, a minimum subject length may be set. This is
6467 only a lower bound: there need not be any string of that length that matches
6468 the pattern. Although the value is, strictly, in characters, we treat it as
6469 bytes to avoid spending too much time in this optimization. */
6470
6471 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6472 (pcre_uint32)(end_subject - start_match) < study->minlength)
6473 {
6474 rc = MATCH_NOMATCH;
6475 break;
6476 }
6477
6478 /* If req_char is set, we know that that character must appear in the
6479 subject for the match to succeed. If the first character is set, req_char
6480 must be later in the subject; otherwise the test starts at the match point.
6481 This optimization can save a huge amount of backtracking in patterns with
6482 nested unlimited repeats that aren't going to match. Writing separate code
6483 for cased/caseless versions makes it go faster, as does using an
6484 autoincrement and backing off on a match.
6485
6486 HOWEVER: when the subject string is very, very long, searching to its end
6487 can take a long time, and give bad performance on quite ordinary patterns.
6488 This showed up when somebody was matching something like /^\d+C/ on a
6489 32-megabyte string... so we don't do this when the string is sufficiently
6490 long. */
6491
6492 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6493 {
6494 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6495
6496 /* We don't need to repeat the search if we haven't yet reached the
6497 place we found it at last time. */
6498
6499 if (p > req_char_ptr)
6500 {
6501 if (req_char != req_char2)
6502 {
6503 while (p < end_subject)
6504 {
6505 register int pp = *p++;
6506 if (pp == req_char || pp == req_char2) { p--; break; }
6507 }
6508 }
6509 else
6510 {
6511 while (p < end_subject)
6512 {
6513 if (*p++ == req_char) { p--; break; }
6514 }
6515 }
6516
6517 /* If we can't find the required character, break the matching loop,
6518 forcing a match failure. */
6519
6520 if (p >= end_subject)
6521 {
6522 rc = MATCH_NOMATCH;
6523 break;
6524 }
6525
6526 /* If we have found the required character, save the point where we
6527 found it, so that we don't search again next time round the loop if
6528 the start hasn't passed this character yet. */
6529
6530 req_char_ptr = p;
6531 }
6532 }
6533 }
6534
6535 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6536 printf(">>>> Match against: ");
6537 pchars(start_match, end_subject - start_match, TRUE, md);
6538 printf("\n");
6539 #endif
6540
6541 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6542 first starting point for which a partial match was found. */
6543
6544 md->start_match_ptr = start_match;
6545 md->start_used_ptr = start_match;
6546 md->match_call_count = 0;
6547 md->match_function_type = 0;
6548 md->end_offset_top = 0;
6549 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6550 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6551
6552 switch(rc)
6553 {
6554 /* SKIP passes back the next starting point explicitly, but if it is the
6555 same as the match we have just done, treat it as NOMATCH. */
6556
6557 case MATCH_SKIP:
6558 if (md->start_match_ptr != start_match)
6559 {
6560 new_start_match = md->start_match_ptr;
6561 break;
6562 }
6563 /* Fall through */
6564
6565 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6566 the SKIP's arg was not found. We also treat this as NOMATCH. */
6567
6568 case MATCH_SKIP_ARG:
6569 /* Fall through */
6570
6571 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6572 exactly like PRUNE. */
6573
6574 case MATCH_NOMATCH:
6575 case MATCH_PRUNE:
6576 case MATCH_THEN:
6577 new_start_match = start_match + 1;
6578 #ifdef SUPPORT_UTF
6579 if (utf)
6580 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6581 new_start_match++);
6582 #endif
6583 break;
6584
6585 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6586
6587 case MATCH_COMMIT:
6588 rc = MATCH_NOMATCH;
6589 goto ENDLOOP;
6590
6591 /* Any other return is either a match, or some kind of error. */
6592
6593 default:
6594 goto ENDLOOP;
6595 }
6596
6597 /* Control reaches here for the various types of "no match at this point"
6598 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6599
6600 rc = MATCH_NOMATCH;
6601
6602 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6603 newline in the subject (though it may continue over the newline). Therefore,
6604 if we have just failed to match, starting at a newline, do not continue. */
6605
6606 if (firstline && IS_NEWLINE(start_match)) break;
6607
6608 /* Advance to new matching position */
6609
6610 start_match = new_start_match;
6611
6612 /* Break the loop if the pattern is anchored or if we have passed the end of
6613 the subject. */
6614
6615 if (anchored || start_match > end_subject) break;
6616
6617 /* If we have just passed a CR and we are now at a LF, and the pattern does
6618 not contain any explicit matches for \r or \n, and the newline option is CRLF
6619 or ANY or ANYCRLF, advance the match position by one more character. */
6620
6621 if (start_match[-1] == CHAR_CR &&
6622 start_match < end_subject &&
6623 *start_match == CHAR_NL &&
6624 (re->flags & PCRE_HASCRORLF) == 0 &&
6625 (md->nltype == NLTYPE_ANY ||
6626 md->nltype == NLTYPE_ANYCRLF ||
6627 md->nllen == 2))
6628 start_match++;
6629
6630 md->mark = NULL; /* Reset for start of next match attempt */
6631 } /* End of for(;;) "bumpalong" loop */
6632
6633 /* ==========================================================================*/
6634
6635 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6636 conditions is true:
6637
6638 (1) The pattern is anchored or the match was failed by (*COMMIT);
6639
6640 (2) We are past the end of the subject;
6641
6642 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6643 this option requests that a match occur at or before the first newline in
6644 the subject.
6645
6646 When we have a match and the offset vector is big enough to deal with any
6647 backreferences, captured substring offsets will already be set up. In the case
6648 where we had to get some local store to hold offsets for backreference
6649 processing, copy those that we can. In this case there need not be overflow if
6650 certain parts of the pattern were not used, even though there are more
6651 capturing parentheses than vector slots. */
6652
6653 ENDLOOP:
6654
6655 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6656 {
6657 if (using_temporary_offsets)
6658 {
6659 if (arg_offset_max >= 4)
6660 {
6661 memcpy(offsets + 2, md->offset_vector + 2,
6662 (arg_offset_max - 2) * sizeof(int));
6663 DPRINTF(("Copied offsets from temporary memory\n"));
6664 }
6665 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6666 DPRINTF(("Freeing temporary memory\n"));
6667 (pcre_free)(md->offset_vector);
6668 }
6669
6670 /* Set the return code to the number of captured strings, or 0 if there were
6671 too many to fit into the vector. */
6672
6673 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6674 0 : md->end_offset_top/2;
6675
6676 /* If there is space in the offset vector, set any unused pairs at the end of
6677 the pattern to -1 for backwards compatibility. It is documented that this
6678 happens. In earlier versions, the whole set of potential capturing offsets
6679 was set to -1 each time round the loop, but this is handled differently now.
6680 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6681 those at the end that need unsetting here. We can't just unset them all at
6682 the start of the whole thing because they may get set in one branch that is
6683 not the final matching branch. */
6684
6685 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6686 {
6687 register int *iptr, *iend;
6688 int resetcount = 2 + re->top_bracket * 2;
6689 if (resetcount > offsetcount) resetcount = ocount;
6690 iptr = offsets + md->end_offset_top;
6691 iend = offsets + resetcount;
6692 while (iptr < iend) *iptr++ = -1;
6693 }
6694
6695 /* If there is space, set up the whole thing as substring 0. The value of
6696 md->start_match_ptr might have been modified if \K was encountered on the
6697 successful matching path. */
6698
6699 if (offsetcount < 2) rc = 0; else
6700 {
6701 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6702 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6703 }
6704
6705 DPRINTF((">>>> returning %d\n", rc));
6706 goto RETURN_MARK;
6707 }
6708
6709 /* Control gets here if there has been an error, or if the overall match
6710 attempt has failed at all permitted starting positions. */
6711
6712 if (using_temporary_offsets)
6713 {
6714 DPRINTF(("Freeing temporary memory\n"));
6715 (pcre_free)(md->offset_vector);
6716 }
6717
6718 /* For anything other than nomatch or partial match, just return the code. */
6719
6720 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6721 {
6722 DPRINTF((">>>> error: returning %d\n", rc));
6723 return rc;
6724 }
6725
6726 /* Handle partial matches - disable any mark data */
6727
6728 if (start_partial != NULL)
6729 {
6730 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6731 md->mark = NULL;
6732 if (offsetcount > 1)
6733 {
6734 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
6735 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
6736 }
6737 rc = PCRE_ERROR_PARTIAL;
6738 }
6739
6740 /* This is the classic nomatch case */
6741
6742 else
6743 {
6744 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6745 rc = PCRE_ERROR_NOMATCH;
6746 }
6747
6748 /* Return the MARK data if it has been requested. */
6749
6750 RETURN_MARK:
6751
6752 if (extra_data != NULL &