/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 789 - (show annotations)
Wed Dec 7 14:36:26 2011 UTC (7 years, 8 months ago) by zherczeg
File MIME type: text/plain
File size: 204050 byte(s)
UTF16 fixes: iterated character parsing, named references
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* This is a convenience macro for code that occurs many times. */
86
87 #define MRRETURN(ra) \
88 { \
89 md->mark = markptr; \
90 RRETURN(ra); \
91 }
92
93 /* Maximum number of ints of offset to save on the stack for recursive calls.
94 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
95 because the offset vector is always a multiple of 3 long. */
96
97 #define REC_STACK_SAVE_MAX 30
98
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
100
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
103
104
105
106 #ifdef PCRE_DEBUG
107 /*************************************************
108 * Debugging function to print chars *
109 *************************************************/
110
111 /* Print a sequence of chars in printable format, stopping at the end of the
112 subject if the requested.
113
114 Arguments:
115 p points to characters
116 length number to print
117 is_subject TRUE if printing from within md->start_subject
118 md pointer to matching data block, if is_subject is TRUE
119
120 Returns: nothing
121 */
122
123 static void
124 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
125 {
126 unsigned int c;
127 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
128 while (length-- > 0)
129 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
130 }
131 #endif
132
133
134
135 /*************************************************
136 * Match a back-reference *
137 *************************************************/
138
139 /* Normally, if a back reference hasn't been set, the length that is passed is
140 negative, so the match always fails. However, in JavaScript compatibility mode,
141 the length passed is zero. Note that in caseless UTF-8 mode, the number of
142 subject bytes matched may be different to the number of reference bytes.
143
144 Arguments:
145 offset index into the offset vector
146 eptr pointer into the subject
147 length length of reference to be matched (number of bytes)
148 md points to match data block
149 caseless TRUE if caseless
150
151 Returns: < 0 if not matched, otherwise the number of subject bytes matched
152 */
153
154 static int
155 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
156 BOOL caseless)
157 {
158 PCRE_PUCHAR eptr_start = eptr;
159 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
160
161 #ifdef PCRE_DEBUG
162 if (eptr >= md->end_subject)
163 printf("matching subject <null>");
164 else
165 {
166 printf("matching subject ");
167 pchars(eptr, length, TRUE, md);
168 }
169 printf(" against backref ");
170 pchars(p, length, FALSE, md);
171 printf("\n");
172 #endif
173
174 /* Always fail if reference not set (and not JavaScript compatible). */
175
176 if (length < 0) return -1;
177
178 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
179 properly if Unicode properties are supported. Otherwise, we can check only
180 ASCII characters. */
181
182 if (caseless)
183 {
184 #ifdef SUPPORT_UTF
185 #ifdef SUPPORT_UCP
186 if (md->utf)
187 {
188 /* Match characters up to the end of the reference. NOTE: the number of
189 bytes matched may differ, because there are some characters whose upper and
190 lower case versions code as different numbers of bytes. For example, U+023A
191 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
192 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
193 the latter. It is important, therefore, to check the length along the
194 reference, not along the subject (earlier code did this wrong). */
195
196 PCRE_PUCHAR endptr = p + length;
197 while (p < endptr)
198 {
199 int c, d;
200 if (eptr >= md->end_subject) return -1;
201 GETCHARINC(c, eptr);
202 GETCHARINC(d, p);
203 if (c != d && c != UCD_OTHERCASE(d)) return -1;
204 }
205 }
206 else
207 #endif
208 #endif
209
210 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
211 is no UCP support. */
212 {
213 if (eptr + length > md->end_subject) return -1;
214 while (length-- > 0)
215 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
216 }
217 }
218
219 /* In the caseful case, we can just compare the bytes, whether or not we
220 are in UTF-8 mode. */
221
222 else
223 {
224 if (eptr + length > md->end_subject) return -1;
225 while (length-- > 0) if (*p++ != *eptr++) return -1;
226 }
227
228 return eptr - eptr_start;
229 }
230
231
232
233 /***************************************************************************
234 ****************************************************************************
235 RECURSION IN THE match() FUNCTION
236
237 The match() function is highly recursive, though not every recursive call
238 increases the recursive depth. Nevertheless, some regular expressions can cause
239 it to recurse to a great depth. I was writing for Unix, so I just let it call
240 itself recursively. This uses the stack for saving everything that has to be
241 saved for a recursive call. On Unix, the stack can be large, and this works
242 fine.
243
244 It turns out that on some non-Unix-like systems there are problems with
245 programs that use a lot of stack. (This despite the fact that every last chip
246 has oodles of memory these days, and techniques for extending the stack have
247 been known for decades.) So....
248
249 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
250 calls by keeping local variables that need to be preserved in blocks of memory
251 obtained from malloc() instead of on the stack. Macros are used to
252 achieve this so that the actual code doesn't look very different to what it
253 always used to.
254
255 The original heap-recursive code used longjmp(). However, it seems that this
256 can be very slow on some operating systems. Following a suggestion from Stan
257 Switzer, the use of longjmp() has been abolished, at the cost of having to
258 provide a unique number for each call to RMATCH. There is no way of generating
259 a sequence of numbers at compile time in C. I have given them names, to make
260 them stand out more clearly.
261
262 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
263 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
264 tests. Furthermore, not using longjmp() means that local dynamic variables
265 don't have indeterminate values; this has meant that the frame size can be
266 reduced because the result can be "passed back" by straight setting of the
267 variable instead of being passed in the frame.
268 ****************************************************************************
269 ***************************************************************************/
270
271 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
272 below must be updated in sync. */
273
/* One identifier per RMATCH call site, numbered consecutively from 1 to 66.
Each row starts with an explicit value so the numbering is easy to check. */

enum {
  RM1  =  1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
  RM61 = 61, RM62, RM63, RM64, RM65, RM66
};
281
282 /* These versions of the macros use the stack, as normal. There are debugging
283 versions and production versions. Note that the "rw" argument of RMATCH isn't
284 actually used in this definition. */
285
286 #ifndef NO_RECURSE
287 #define REGISTER register
288
289 #ifdef PCRE_DEBUG
290 #define RMATCH(ra,rb,rc,rd,re,rw) \
291 { \
292 printf("match() called in line %d\n", __LINE__); \
293 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
294 printf("to line %d\n", __LINE__); \
295 }
296 #define RRETURN(ra) \
297 { \
298 printf("match() returned %d from line %d ", ra, __LINE__); \
299 return ra; \
300 }
301 #else
302 #define RMATCH(ra,rb,rc,rd,re,rw) \
303 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
304 #define RRETURN(ra) return ra
305 #endif
306
307 #else
308
309
310 /* These versions of the macros manage a private stack on the heap. Note that
311 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
312 argument of match(), which never changes. */
313
314 #define REGISTER
315
316 #define RMATCH(ra,rb,rc,rd,re,rw)\
317 {\
318 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
319 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
320 frame->Xwhere = rw; \
321 newframe->Xeptr = ra;\
322 newframe->Xecode = rb;\
323 newframe->Xmstart = mstart;\
324 newframe->Xmarkptr = markptr;\
325 newframe->Xoffset_top = rc;\
326 newframe->Xeptrb = re;\
327 newframe->Xrdepth = frame->Xrdepth + 1;\
328 newframe->Xprevframe = frame;\
329 frame = newframe;\
330 DPRINTF(("restarting from line %d\n", __LINE__));\
331 goto HEAP_RECURSE;\
332 L_##rw:\
333 DPRINTF(("jumped back to line %d\n", __LINE__));\
334 }
335
/* "Return" from a heap-simulated call of match(). The current frame is
released and its predecessor becomes current. If a predecessor exists, the
result is left in rrc and control goes to HEAP_RETURN; otherwise this was the
top-level frame and a genuine return is made.

match() also uses RRETURN(PCRE_ERROR_NOMEMORY) when the allocation of the very
first frame fails, at which point frame is NULL. The NULL test below ensures
that case is a plain return rather than a dereference of a null pointer. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
348
349
350 /* Structure for remembering the local variables in a private frame */
351
352 typedef struct heapframe {
353 struct heapframe *Xprevframe;
354
355 /* Function arguments that may change */
356
357 PCRE_PUCHAR Xeptr;
358 const pcre_uchar *Xecode;
359 PCRE_PUCHAR Xmstart;
360 PCRE_PUCHAR Xmarkptr;
361 int Xoffset_top;
362 eptrblock *Xeptrb;
363 unsigned int Xrdepth;
364
365 /* Function local variables */
366
367 PCRE_PUCHAR Xcallpat;
368 #ifdef SUPPORT_UTF
369 PCRE_PUCHAR Xcharptr;
370 #endif
371 PCRE_PUCHAR Xdata;
372 PCRE_PUCHAR Xnext;
373 PCRE_PUCHAR Xpp;
374 PCRE_PUCHAR Xprev;
375 PCRE_PUCHAR Xsaved_eptr;
376
377 recursion_info Xnew_recursive;
378
379 BOOL Xcur_is_word;
380 BOOL Xcondition;
381 BOOL Xprev_is_word;
382
383 #ifdef SUPPORT_UCP
384 int Xprop_type;
385 int Xprop_value;
386 int Xprop_fail_result;
387 int Xoclength;
388 pcre_uchar Xocchars[6];
389 #endif
390
391 int Xcodelink;
392 int Xctype;
393 unsigned int Xfc;
394 int Xfi;
395 int Xlength;
396 int Xmax;
397 int Xmin;
398 int Xnumber;
399 int Xoffset;
400 int Xop;
401 int Xsave_capture_last;
402 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
403 int Xstacksave[REC_STACK_SAVE_MAX];
404
405 eptrblock Xnewptrb;
406
407 /* Where to jump back to */
408
409 int Xwhere;
410
411 } heapframe;
412
413 #endif
414
415
416 /***************************************************************************
417 ***************************************************************************/
418
419
420
421 /*************************************************
422 * Match from current position *
423 *************************************************/
424
425 /* This function is called recursively in many circumstances. Whenever it
426 returns a negative (error) response, the outer incarnation must also return the
427 same response. */
428
429 /* These macros pack up tests that are used for partial matching, and which
430 appear several times in the code. We set the "hit end" flag if the pointer is
431 at the end of the subject and also past the start of the subject (i.e.
432 something has been matched). For hard partial matching, we then return
433 immediately. The second one is used when we already know we are past the end of
434 the subject. */
435
436 #define CHECK_PARTIAL()\
437 if (md->partial != 0 && eptr >= md->end_subject && \
438 eptr > md->start_used_ptr) \
439 { \
440 md->hitend = TRUE; \
441 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
442 }
443
444 #define SCHECK_PARTIAL()\
445 if (md->partial != 0 && eptr > md->start_used_ptr) \
446 { \
447 md->hitend = TRUE; \
448 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
449 }
450
451
452 /* Performance note: It might be tempting to extract commonly used fields from
453 the md structure (e.g. utf, end_subject) into individual variables to improve
454 performance. Tests using gcc on a SPARC disproved this; in the first case, it
455 made performance worse.
456
457 Arguments:
458 eptr pointer to current character in subject
459 ecode pointer to current position in compiled code
460 mstart pointer to the current match start position (can be modified
461 by encountering \K)
462 markptr pointer to the most recent MARK name, or NULL
463 offset_top current top pointer
464 md pointer to "static" info for the match
465 eptrb pointer to chain of blocks containing eptr at start of
466 brackets - for testing for empty matches
467 rdepth the recursion depth
468
469 Returns: MATCH_MATCH if matched ) these values are >= 0
470 MATCH_NOMATCH if failed to match )
471 a negative MATCH_xxx value for PRUNE, SKIP, etc
472 a negative PCRE_ERROR_xxx value if aborted by an error condition
473 (e.g. stopped by repeated call or recursion limit)
474 */
475
476 static int
477 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
478 PCRE_PUCHAR mstart, const pcre_uchar *markptr, int offset_top,
479 match_data *md, eptrblock *eptrb, unsigned int rdepth)
480 {
481 /* These variables do not need to be preserved over recursion in this function,
482 so they can be ordinary variables in all cases. Mark some of them with
483 "register" because they are used a lot in loops. */
484
485 register int rrc; /* Returns from recursive calls */
486 register int i; /* Used for loops not involving calls to RMATCH() */
487 register unsigned int c; /* Character values not kept over RMATCH() calls */
488 register BOOL utf; /* Local copy of UTF flag for speed */
489
490 BOOL minimize, possessive; /* Quantifier options */
491 BOOL caseless;
492 int condcode;
493
494 /* When recursion is not being used, all "local" variables that have to be
495 preserved over calls to RMATCH() are part of a "frame" which is obtained from
496 heap storage. Set up the top-level frame here; others are obtained from the
497 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
498
499 #ifdef NO_RECURSE
500 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
501 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
502 frame->Xprevframe = NULL; /* Marks the top level */
503
504 /* Copy in the original argument variables */
505
506 frame->Xeptr = eptr;
507 frame->Xecode = ecode;
508 frame->Xmstart = mstart;
509 frame->Xmarkptr = markptr;
510 frame->Xoffset_top = offset_top;
511 frame->Xeptrb = eptrb;
512 frame->Xrdepth = rdepth;
513
514 /* This is where control jumps back to to effect "recursion" */
515
516 HEAP_RECURSE:
517
518 /* Macros make the argument variables come from the current frame */
519
520 #define eptr frame->Xeptr
521 #define ecode frame->Xecode
522 #define mstart frame->Xmstart
523 #define markptr frame->Xmarkptr
524 #define offset_top frame->Xoffset_top
525 #define eptrb frame->Xeptrb
526 #define rdepth frame->Xrdepth
527
528 /* Ditto for the local variables */
529
530 #ifdef SUPPORT_UTF
531 #define charptr frame->Xcharptr
532 #endif
533 #define callpat frame->Xcallpat
534 #define codelink frame->Xcodelink
535 #define data frame->Xdata
536 #define next frame->Xnext
537 #define pp frame->Xpp
538 #define prev frame->Xprev
539 #define saved_eptr frame->Xsaved_eptr
540
541 #define new_recursive frame->Xnew_recursive
542
543 #define cur_is_word frame->Xcur_is_word
544 #define condition frame->Xcondition
545 #define prev_is_word frame->Xprev_is_word
546
547 #ifdef SUPPORT_UCP
548 #define prop_type frame->Xprop_type
549 #define prop_value frame->Xprop_value
550 #define prop_fail_result frame->Xprop_fail_result
551 #define oclength frame->Xoclength
552 #define occhars frame->Xocchars
553 #endif
554
555 #define ctype frame->Xctype
556 #define fc frame->Xfc
557 #define fi frame->Xfi
558 #define length frame->Xlength
559 #define max frame->Xmax
560 #define min frame->Xmin
561 #define number frame->Xnumber
562 #define offset frame->Xoffset
563 #define op frame->Xop
564 #define save_capture_last frame->Xsave_capture_last
565 #define save_offset1 frame->Xsave_offset1
566 #define save_offset2 frame->Xsave_offset2
567 #define save_offset3 frame->Xsave_offset3
568 #define stacksave frame->Xstacksave
569
570 #define newptrb frame->Xnewptrb
571
572 /* When recursion is being used, local variables are allocated on the stack and
573 get preserved during recursion in the normal way. In this environment, fi and
574 i, and fc and c, can be the same variables. */
575
576 #else /* NO_RECURSE not defined */
577 #define fi i
578 #define fc c
579
580 /* Many of the following variables are used only in small blocks of the code.
581 My normal style of coding would have declared them within each of those blocks.
582 However, in order to accommodate the version of this code that uses an external
583 "stack" implemented on the heap, it is easier to declare them all here, so the
584 declarations can be cut out in a block. The only declarations within blocks
585 below are for variables that do not have to be preserved over a recursive call
586 to RMATCH(). */
587
588 #ifdef SUPPORT_UTF
589 const pcre_uchar *charptr;
590 #endif
591 const pcre_uchar *callpat;
592 const pcre_uchar *data;
593 const pcre_uchar *next;
594 PCRE_PUCHAR pp;
595 const pcre_uchar *prev;
596 PCRE_PUCHAR saved_eptr;
597
598 recursion_info new_recursive;
599
600 BOOL cur_is_word;
601 BOOL condition;
602 BOOL prev_is_word;
603
604 #ifdef SUPPORT_UCP
605 int prop_type;
606 int prop_value;
607 int prop_fail_result;
608 int oclength;
609 pcre_uchar occhars[6];
610 #endif
611
612 int codelink;
613 int ctype;
614 int length;
615 int max;
616 int min;
617 int number;
618 int offset;
619 int op;
620 int save_capture_last;
621 int save_offset1, save_offset2, save_offset3;
622 int stacksave[REC_STACK_SAVE_MAX];
623
624 eptrblock newptrb;
625 #endif /* NO_RECURSE */
626
627 /* To save space on the stack and in the heap frame, I have doubled up on some
628 of the local variables that are used only in localised parts of the code, but
629 still need to be preserved over recursive calls of match(). These macros define
630 the alternative names that are used. */
631
632 #define allow_zero cur_is_word
633 #define cbegroup condition
634 #define code_offset codelink
635 #define condassert condition
636 #define matched_once prev_is_word
637 #define foc number
638
639 /* These statements are here to stop the compiler complaining about uninitialized
640 variables. */
641
642 #ifdef SUPPORT_UCP
643 prop_value = 0;
644 prop_fail_result = 0;
645 #endif
646
647
648 /* This label is used for tail recursion, which is used in a few cases even
649 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
650 used. Thanks to Ian Taylor for noticing this possibility and sending the
651 original patch. */
652
653 TAIL_RECURSE:
654
655 /* OK, now we can get on with the real code of the function. Recursive calls
656 are specified by the macro RMATCH and RRETURN is used to return. When
657 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
658 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
659 defined). However, RMATCH isn't like a function call because it's quite a
660 complicated macro. It has to be used in one particular way. This shouldn't,
661 however, impact performance when true recursion is being used. */
662
663 #ifdef SUPPORT_UTF
664 utf = md->utf; /* Local copy of the flag */
665 #else
666 utf = FALSE;
667 #endif
668
669 /* First check that we haven't called match() too many times, or that we
670 haven't exceeded the recursive call limit. */
671
672 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
673 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
674
675 /* At the start of a group with an unlimited repeat that may match an empty
676 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
677 done this way to save having to use another function argument, which would take
678 up space on the stack. See also MATCH_CONDASSERT below.
679
680 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
681 such remembered pointers, to be checked when we hit the closing ket, in order
682 to break infinite loops that match no characters. When match() is called in
683 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
684 NOT be used with tail recursion, because the memory block that is used is on
685 the stack, so a new one may be required for each match(). */
686
687 if (md->match_function_type == MATCH_CBEGROUP)
688 {
689 newptrb.epb_saved_eptr = eptr;
690 newptrb.epb_prev = eptrb;
691 eptrb = &newptrb;
692 md->match_function_type = 0;
693 }
694
695 /* Now start processing the opcodes. */
696
697 for (;;)
698 {
699 minimize = possessive = FALSE;
700 op = *ecode;
701
702 switch(op)
703 {
704 case OP_MARK:
705 markptr = ecode + 2;
706 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
707 eptrb, RM55);
708
709 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
710 argument, and we must check whether that argument matches this MARK's
711 argument. It is passed back in md->start_match_ptr (an overloading of that
712 variable). If it does match, we reset that variable to the current subject
713 position and return MATCH_SKIP. Otherwise, pass back the return code
714 unaltered. */
715
716 if (rrc == MATCH_SKIP_ARG &&
717 STRCMP_UC_UC(markptr, md->start_match_ptr) == 0)
718 {
719 md->start_match_ptr = eptr;
720 RRETURN(MATCH_SKIP);
721 }
722
723 if (md->mark == NULL) md->mark = markptr;
724 RRETURN(rrc);
725
726 case OP_FAIL:
727 MRRETURN(MATCH_NOMATCH);
728
729 /* COMMIT overrides PRUNE, SKIP, and THEN */
730
731 case OP_COMMIT:
732 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
733 eptrb, RM52);
734 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
735 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
736 rrc != MATCH_THEN)
737 RRETURN(rrc);
738 MRRETURN(MATCH_COMMIT);
739
740 /* PRUNE overrides THEN */
741
742 case OP_PRUNE:
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
744 eptrb, RM51);
745 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
746 MRRETURN(MATCH_PRUNE);
747
748 case OP_PRUNE_ARG:
749 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
750 eptrb, RM56);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 md->mark = ecode + 2;
753 RRETURN(MATCH_PRUNE);
754
755 /* SKIP overrides PRUNE and THEN */
756
757 case OP_SKIP:
758 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
759 eptrb, RM53);
760 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
761 RRETURN(rrc);
762 md->start_match_ptr = eptr; /* Pass back current position */
763 MRRETURN(MATCH_SKIP);
764
765 case OP_SKIP_ARG:
766 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
767 eptrb, RM57);
768 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
769 RRETURN(rrc);
770
771 /* Pass back the current skip name by overloading md->start_match_ptr and
772 returning the special MATCH_SKIP_ARG return code. This will either be
773 caught by a matching MARK, or get to the top, where it is treated the same
774 as PRUNE. */
775
776 md->start_match_ptr = ecode + 2;
777 RRETURN(MATCH_SKIP_ARG);
778
779 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
780 the branch in which it occurs can be determined. Overload the start of
781 match pointer to do this. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode;
788 MRRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
792 md, eptrb, RM58);
793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
794 md->start_match_ptr = ecode;
795 md->mark = ecode + 2;
796 RRETURN(MATCH_THEN);
797
798 /* Handle an atomic group that does not contain any capturing parentheses.
799 This can be handled like an assertion. Prior to 8.13, all atomic groups
800 were handled this way. In 8.13, the code was changed as below for ONCE, so
801 that backups pass through the group and thereby reset captured values.
802 However, this uses a lot more stack, so in 8.20, atomic groups that do not
803 contain any captures generate OP_ONCE_NC, which can be handled in the old,
804 less stack intensive way.
805
806 Check the alternative branches in turn - the matching won't pass the KET
807 for this kind of subpattern. If any one branch matches, we carry on as at
808 the end of a normal bracket, leaving the subject pointer, but resetting
809 the start-of-match value in case it was changed by \K. */
810
811 case OP_ONCE_NC:
812 prev = ecode;
813 saved_eptr = eptr;
814 do
815 {
816 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
817 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
818 {
819 mstart = md->start_match_ptr;
820 markptr = md->mark;
821 break;
822 }
823 if (rrc == MATCH_THEN)
824 {
825 next = ecode + GET(ecode,1);
826 if (md->start_match_ptr < next &&
827 (*ecode == OP_ALT || *next == OP_ALT))
828 rrc = MATCH_NOMATCH;
829 }
830
831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
832 ecode += GET(ecode,1);
833 }
834 while (*ecode == OP_ALT);
835
836 /* If hit the end of the group (which could be repeated), fail */
837
838 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
839
840 /* Continue as from after the group, updating the offsets high water
841 mark, since extracts may have been taken. */
842
843 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
844
845 offset_top = md->end_offset_top;
846 eptr = md->end_match_ptr;
847
848 /* For a non-repeating ket, just continue at this level. This also
849 happens for a repeating ket if no characters were matched in the group.
850 This is the forcible breaking of infinite loops as implemented in Perl
851 5.005. */
852
853 if (*ecode == OP_KET || eptr == saved_eptr)
854 {
855 ecode += 1+LINK_SIZE;
856 break;
857 }
858
859 /* The repeating kets try the rest of the pattern or restart from the
860 preceding bracket, in the appropriate order. The second "call" of match()
861 uses tail recursion, to avoid using another stack frame. */
862
863 if (*ecode == OP_KETRMIN)
864 {
865 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
867 ecode = prev;
868 goto TAIL_RECURSE;
869 }
870 else /* OP_KETRMAX */
871 {
872 md->match_function_type = MATCH_CBEGROUP;
873 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
874 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
875 ecode += 1 + LINK_SIZE;
876 goto TAIL_RECURSE;
877 }
878 /* Control never gets here */
879
880 /* Handle a capturing bracket, other than those that are possessive with an
881 unlimited repeat. If there is space in the offset vector, save the current
882 subject position in the working slot at the top of the vector. We mustn't
883 change the current values of the data slot, because they may be set from a
884 previous iteration of this group, and be referred to by a reference inside
885 the group. A failure to match might occur after the group has succeeded,
886 if something later on doesn't match. For this reason, we need to restore
887 the working value and also the values of the final offsets, in case they
888 were set by a previous iteration of the same bracket.
889
890 If there isn't enough space in the offset vector, treat this as if it were
891 a non-capturing bracket. Don't worry about setting the flag for the error
892 case here; that is handled in the code for KET. */
893
894 case OP_CBRA:
895 case OP_SCBRA:
896 number = GET2(ecode, 1+LINK_SIZE);
897 offset = number << 1;
898
899 #ifdef PCRE_DEBUG
900 printf("start bracket %d\n", number);
901 printf("subject=");
902 pchars(eptr, 16, TRUE, md);
903 printf("\n");
904 #endif
905
906 if (offset < md->offset_max)
907 {
908 save_offset1 = md->offset_vector[offset];
909 save_offset2 = md->offset_vector[offset+1];
910 save_offset3 = md->offset_vector[md->offset_end - number];
911 save_capture_last = md->capture_last;
912
913 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
914 md->offset_vector[md->offset_end - number] =
915 (int)(eptr - md->start_subject);
916
917 for (;;)
918 {
919 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
920 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
921 eptrb, RM1);
922 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
923
924 /* If we backed up to a THEN, check whether it is within the current
925 branch by comparing the address of the THEN that is passed back with
926 the end of the branch. If it is within the current branch, and the
927 branch is one of two or more alternatives (it either starts or ends
928 with OP_ALT), we have reached the limit of THEN's action, so convert
929 the return code to NOMATCH, which will cause normal backtracking to
930 happen from now on. Otherwise, THEN is passed back to an outer
931 alternative. This implements Perl's treatment of parenthesized groups,
932 where a group not containing | does not affect the current alternative,
933 that is, (X) is NOT the same as (X|(*F)). */
934
935 if (rrc == MATCH_THEN)
936 {
937 next = ecode + GET(ecode,1);
938 if (md->start_match_ptr < next &&
939 (*ecode == OP_ALT || *next == OP_ALT))
940 rrc = MATCH_NOMATCH;
941 }
942
943 /* Anything other than NOMATCH is passed back. */
944
945 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
946 md->capture_last = save_capture_last;
947 ecode += GET(ecode, 1);
948 if (*ecode != OP_ALT) break;
949 }
950
951 DPRINTF(("bracket %d failed\n", number));
952 md->offset_vector[offset] = save_offset1;
953 md->offset_vector[offset+1] = save_offset2;
954 md->offset_vector[md->offset_end - number] = save_offset3;
955
956 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
957
958 if (md->mark == NULL) md->mark = markptr;
959 RRETURN(rrc);
960 }
961
962 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
963 as a non-capturing bracket. */
964
965 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
966 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
967
968 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
969
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972
973 /* Non-capturing or atomic group, except for possessive with unlimited
974 repeat and ONCE group with no captures. Loop for all the alternatives.
975
976 When we get to the final alternative within the brackets, we used to return
977 the result of a recursive call to match() whatever happened so it was
978 possible to reduce stack usage by turning this into a tail recursion,
979 except in the case of a possibly empty group. However, now that there is
980 the possibility of (*THEN) occurring in the final alternative, this
981 optimization is no longer always possible.
982
983 We can optimize if we know there are no (*THEN)s in the pattern; at present
984 this is the best that can be done.
985
986 MATCH_ONCE is returned when the end of an atomic group is successfully
987 reached, but subsequent matching fails. It passes back up the tree (causing
988 captured values to be reset) until the original atomic group level is
989 reached. This is tested by comparing md->once_target with the start of the
990 group. At this point, the return is converted into MATCH_NOMATCH so that
991 previous backup points can be taken. */
992
993 case OP_ONCE:
994 case OP_BRA:
995 case OP_SBRA:
996 DPRINTF(("start non-capturing bracket\n"));
997
998 for (;;)
999 {
1000 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1001
1002 /* If this is not a possibly empty group, and there are no (*THEN)s in
1003 the pattern, and this is the final alternative, optimize as described
1004 above. */
1005
1006 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1007 {
1008 ecode += PRIV(OP_lengths)[*ecode];
1009 goto TAIL_RECURSE;
1010 }
1011
1012 /* In all other cases, we have to make another call to match(). */
1013
1014 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1015 RM2);
1016
1017 /* See comment in the code for capturing groups above about handling
1018 THEN. */
1019
1020 if (rrc == MATCH_THEN)
1021 {
1022 next = ecode + GET(ecode,1);
1023 if (md->start_match_ptr < next &&
1024 (*ecode == OP_ALT || *next == OP_ALT))
1025 rrc = MATCH_NOMATCH;
1026 }
1027
1028 if (rrc != MATCH_NOMATCH)
1029 {
1030 if (rrc == MATCH_ONCE)
1031 {
1032 const pcre_uchar *scode = ecode;
1033 if (*scode != OP_ONCE) /* If not at start, find it */
1034 {
1035 while (*scode == OP_ALT) scode += GET(scode, 1);
1036 scode -= GET(scode, 1);
1037 }
1038 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1039 }
1040 RRETURN(rrc);
1041 }
1042 ecode += GET(ecode, 1);
1043 if (*ecode != OP_ALT) break;
1044 }
1045
1046 if (md->mark == NULL) md->mark = markptr;
1047 RRETURN(MATCH_NOMATCH);
1048
1049 /* Handle possessive capturing brackets with an unlimited repeat. We come
1050 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1051 handled similarly to the normal case above. However, the matching is
1052 different. The end of these brackets will always be OP_KETRPOS, which
1053 returns MATCH_KETRPOS without going further in the pattern. By this means
1054 we can handle the group by iteration rather than recursion, thereby
1055 reducing the amount of stack needed. */
1056
1057 case OP_CBRAPOS:
1058 case OP_SCBRAPOS:
1059 allow_zero = FALSE;
1060
1061 POSSESSIVE_CAPTURE:
1062 number = GET2(ecode, 1+LINK_SIZE);
1063 offset = number << 1;
1064
1065 #ifdef PCRE_DEBUG
1066 printf("start possessive bracket %d\n", number);
1067 printf("subject=");
1068 pchars(eptr, 16, TRUE, md);
1069 printf("\n");
1070 #endif
1071
1072 if (offset < md->offset_max)
1073 {
1074 matched_once = FALSE;
1075 code_offset = ecode - md->start_code;
1076
1077 save_offset1 = md->offset_vector[offset];
1078 save_offset2 = md->offset_vector[offset+1];
1079 save_offset3 = md->offset_vector[md->offset_end - number];
1080 save_capture_last = md->capture_last;
1081
1082 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1083
1084 /* Each time round the loop, save the current subject position for use
1085 when the group matches. For MATCH_MATCH, the group has matched, so we
1086 restart it with a new subject starting position, remembering that we had
1087 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1088 usual. If we haven't matched any alternatives in any iteration, check to
1089 see if a previous iteration matched. If so, the group has matched;
1090 continue from afterwards. Otherwise it has failed; restore the previous
1091 capture values before returning NOMATCH. */
1092
1093 for (;;)
1094 {
1095 md->offset_vector[md->offset_end - number] =
1096 (int)(eptr - md->start_subject);
1097 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1098 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1099 eptrb, RM63);
1100 if (rrc == MATCH_KETRPOS)
1101 {
1102 offset_top = md->end_offset_top;
1103 eptr = md->end_match_ptr;
1104 ecode = md->start_code + code_offset;
1105 save_capture_last = md->capture_last;
1106 matched_once = TRUE;
1107 continue;
1108 }
1109
1110 /* See comment in the code for capturing groups above about handling
1111 THEN. */
1112
1113 if (rrc == MATCH_THEN)
1114 {
1115 next = ecode + GET(ecode,1);
1116 if (md->start_match_ptr < next &&
1117 (*ecode == OP_ALT || *next == OP_ALT))
1118 rrc = MATCH_NOMATCH;
1119 }
1120
1121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122 md->capture_last = save_capture_last;
1123 ecode += GET(ecode, 1);
1124 if (*ecode != OP_ALT) break;
1125 }
1126
1127 if (!matched_once)
1128 {
1129 md->offset_vector[offset] = save_offset1;
1130 md->offset_vector[offset+1] = save_offset2;
1131 md->offset_vector[md->offset_end - number] = save_offset3;
1132 }
1133
1134 if (md->mark == NULL) md->mark = markptr;
1135 if (allow_zero || matched_once)
1136 {
1137 ecode += 1 + LINK_SIZE;
1138 break;
1139 }
1140
1141 RRETURN(MATCH_NOMATCH);
1142 }
1143
1144 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1145 as a non-capturing bracket. */
1146
1147 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1148 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1149
1150 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1151
1152 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1153 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1154
1155 /* Non-capturing possessive bracket with unlimited repeat. We come here
1156 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1157 without the capturing complication. It is written out separately for speed
1158 and cleanliness. */
1159
1160 case OP_BRAPOS:
1161 case OP_SBRAPOS:
1162 allow_zero = FALSE;
1163
1164 POSSESSIVE_NON_CAPTURE:
1165 matched_once = FALSE;
1166 code_offset = ecode - md->start_code;
1167
1168 for (;;)
1169 {
1170 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1171 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1172 eptrb, RM48);
1173 if (rrc == MATCH_KETRPOS)
1174 {
1175 offset_top = md->end_offset_top;
1176 eptr = md->end_match_ptr;
1177 ecode = md->start_code + code_offset;
1178 matched_once = TRUE;
1179 continue;
1180 }
1181
1182 /* See comment in the code for capturing groups above about handling
1183 THEN. */
1184
1185 if (rrc == MATCH_THEN)
1186 {
1187 next = ecode + GET(ecode,1);
1188 if (md->start_match_ptr < next &&
1189 (*ecode == OP_ALT || *next == OP_ALT))
1190 rrc = MATCH_NOMATCH;
1191 }
1192
1193 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1194 ecode += GET(ecode, 1);
1195 if (*ecode != OP_ALT) break;
1196 }
1197
1198 if (matched_once || allow_zero)
1199 {
1200 ecode += 1 + LINK_SIZE;
1201 break;
1202 }
1203 RRETURN(MATCH_NOMATCH);
1204
1205 /* Control never reaches here. */
1206
1207 /* Conditional group: compilation checked that there are no more than
1208 two branches. If the condition is false, skipping the first branch takes us
1209 past the end if there is only one branch, but that's OK because that is
1210 exactly what going to the ket would do. */
1211
1212 case OP_COND:
1213 case OP_SCOND:
1214 codelink = GET(ecode, 1);
1215
1216 /* Because of the way auto-callout works during compile, a callout item is
1217 inserted between OP_COND and an assertion condition. */
1218
1219 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1220 {
1221 if (pcre_callout != NULL)
1222 {
1223 pcre_callout_block cb;
1224 cb.version = 2; /* Version 2 of the callout block */
1225 cb.callout_number = ecode[LINK_SIZE+2];
1226 cb.offset_vector = md->offset_vector;
1227 cb.subject = (PCRE_SPTR)md->start_subject;
1228 cb.subject_length = (int)(md->end_subject - md->start_subject);
1229 cb.start_match = (int)(mstart - md->start_subject);
1230 cb.current_position = (int)(eptr - md->start_subject);
1231 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1232 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1233 cb.capture_top = offset_top/2;
1234 cb.capture_last = md->capture_last;
1235 cb.callout_data = md->callout_data;
1236 cb.mark = (unsigned char *)markptr;
1237 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1238 if (rrc < 0) RRETURN(rrc);
1239 }
1240 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1241 }
1242
1243 condcode = ecode[LINK_SIZE+1];
1244
1245 /* Now see what the actual condition is */
1246
1247 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1248 {
1249 if (md->recursive == NULL) /* Not recursing => FALSE */
1250 {
1251 condition = FALSE;
1252 ecode += GET(ecode, 1);
1253 }
1254 else
1255 {
1256 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1257 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1258
1259 /* If the test is for recursion into a specific subpattern, and it is
1260 false, but the test was set up by name, scan the table to see if the
1261 name refers to any other numbers, and test them. The condition is true
1262 if any one is set. */
1263
1264 if (!condition && condcode == OP_NRREF)
1265 {
1266 pcre_uchar *slotA = md->name_table;
1267 for (i = 0; i < md->name_count; i++)
1268 {
1269 if (GET2(slotA, 0) == recno) break;
1270 slotA += md->name_entry_size;
1271 }
1272
1273 /* Found a name for the number - there can be only one; duplicate
1274 names for different numbers are allowed, but not vice versa. First
1275 scan down for duplicates. */
1276
1277 if (i < md->name_count)
1278 {
1279 pcre_uchar *slotB = slotA;
1280 while (slotB > md->name_table)
1281 {
1282 slotB -= md->name_entry_size;
1283 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1284 {
1285 condition = GET2(slotB, 0) == md->recursive->group_num;
1286 if (condition) break;
1287 }
1288 else break;
1289 }
1290
1291 /* Scan up for duplicates */
1292
1293 if (!condition)
1294 {
1295 slotB = slotA;
1296 for (i++; i < md->name_count; i++)
1297 {
1298 slotB += md->name_entry_size;
1299 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1300 {
1301 condition = GET2(slotB, 0) == md->recursive->group_num;
1302 if (condition) break;
1303 }
1304 else break;
1305 }
1306 }
1307 }
1308 }
1309
1310 /* Choose branch according to the condition */
1311
1312 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1313 }
1314 }
1315
1316 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1317 {
1318 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1319 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1320
1321 /* If the numbered capture is unset, but the reference was by name,
1322 scan the table to see if the name refers to any other numbers, and test
1323 them. The condition is true if any one is set. This is tediously similar
1324 to the code above, but not close enough to try to amalgamate. */
1325
1326 if (!condition && condcode == OP_NCREF)
1327 {
1328 int refno = offset >> 1;
1329 pcre_uchar *slotA = md->name_table;
1330
1331 for (i = 0; i < md->name_count; i++)
1332 {
1333 if (GET2(slotA, 0) == refno) break;
1334 slotA += md->name_entry_size;
1335 }
1336
1337 /* Found a name for the number - there can be only one; duplicate names
1338 for different numbers are allowed, but not vice versa. First scan down
1339 for duplicates. */
1340
1341 if (i < md->name_count)
1342 {
1343 pcre_uchar *slotB = slotA;
1344 while (slotB > md->name_table)
1345 {
1346 slotB -= md->name_entry_size;
1347 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1348 {
1349 offset = GET2(slotB, 0) << 1;
1350 condition = offset < offset_top &&
1351 md->offset_vector[offset] >= 0;
1352 if (condition) break;
1353 }
1354 else break;
1355 }
1356
1357 /* Scan up for duplicates */
1358
1359 if (!condition)
1360 {
1361 slotB = slotA;
1362 for (i++; i < md->name_count; i++)
1363 {
1364 slotB += md->name_entry_size;
1365 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1366 {
1367 offset = GET2(slotB, 0) << 1;
1368 condition = offset < offset_top &&
1369 md->offset_vector[offset] >= 0;
1370 if (condition) break;
1371 }
1372 else break;
1373 }
1374 }
1375 }
1376 }
1377
1378 /* Choose branch according to the condition */
1379
1380 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1381 }
1382
1383 else if (condcode == OP_DEF) /* DEFINE - always false */
1384 {
1385 condition = FALSE;
1386 ecode += GET(ecode, 1);
1387 }
1388
1389 /* The condition is an assertion. Call match() to evaluate it - setting
1390 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1391 an assertion. */
1392
1393 else
1394 {
1395 md->match_function_type = MATCH_CONDASSERT;
1396 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1397 if (rrc == MATCH_MATCH)
1398 {
1399 if (md->end_offset_top > offset_top)
1400 offset_top = md->end_offset_top; /* Captures may have happened */
1401 condition = TRUE;
1402 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1403 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1404 }
1405
1406 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1407 assertion; it is therefore treated as NOMATCH. */
1408
1409 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1410 {
1411 RRETURN(rrc); /* Need braces because of following else */
1412 }
1413 else
1414 {
1415 condition = FALSE;
1416 ecode += codelink;
1417 }
1418 }
1419
1420 /* We are now at the branch that is to be obeyed. As there is only one, can
1421 use tail recursion to avoid using another stack frame, except when there is
1422 unlimited repeat of a possibly empty group. In the latter case, a recursive
1423 call to match() is always required, unless the second alternative doesn't
1424 exist, in which case we can just plough on. Note that, for compatibility
1425 with Perl, the | in a conditional group is NOT treated as creating two
1426 alternatives. If a THEN is encountered in the branch, it propagates out to
1427 the enclosing alternative (unless nested in a deeper set of alternatives,
1428 of course). */
1429
1430 if (condition || *ecode == OP_ALT)
1431 {
1432 if (op != OP_SCOND)
1433 {
1434 ecode += 1 + LINK_SIZE;
1435 goto TAIL_RECURSE;
1436 }
1437
1438 md->match_function_type = MATCH_CBEGROUP;
1439 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1440 RRETURN(rrc);
1441 }
1442
1443 /* Condition false & no alternative; continue after the group. */
1444
1445 else
1446 {
1447 ecode += 1 + LINK_SIZE;
1448 }
1449 break;
1450
1451
1452 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1453 to close any currently open capturing brackets. */
1454
1455 case OP_CLOSE:
1456 number = GET2(ecode, 1);
1457 offset = number << 1;
1458
1459 #ifdef PCRE_DEBUG
1460 printf("end bracket %d at *ACCEPT", number);
1461 printf("\n");
1462 #endif
1463
1464 md->capture_last = number;
1465 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1466 {
1467 md->offset_vector[offset] =
1468 md->offset_vector[md->offset_end - number];
1469 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1470 if (offset_top <= offset) offset_top = offset + 2;
1471 }
1472 ecode += 1 + IMM2_SIZE;
1473 break;
1474
1475
1476 /* End of the pattern, either real or forced. */
1477
1478 case OP_END:
1479 case OP_ACCEPT:
1480 case OP_ASSERT_ACCEPT:
1481
1482 /* If we have matched an empty string, fail if not in an assertion and not
1483 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1484 is set and we have matched at the start of the subject. In both cases,
1485 backtracking will then try other alternatives, if any. */
1486
1487 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1488 md->recursive == NULL &&
1489 (md->notempty ||
1490 (md->notempty_atstart &&
1491 mstart == md->start_subject + md->start_offset)))
1492 MRRETURN(MATCH_NOMATCH);
1493
1494 /* Otherwise, we have a match. */
1495
1496 md->end_match_ptr = eptr; /* Record where we ended */
1497 md->end_offset_top = offset_top; /* and how many extracts were taken */
1498 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1499
1500 /* For some reason, the macros don't work properly if an expression is
1501 given as the argument to MRRETURN when the heap is in use. */
1502
1503 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1504 MRRETURN(rrc);
1505
1506 /* Assertion brackets. Check the alternative branches in turn - the
1507 matching won't pass the KET for an assertion. If any one branch matches,
1508 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1509 start of each branch to move the current point backwards, so the code at
1510 this level is identical to the lookahead case. When the assertion is part
1511 of a condition, we want to return immediately afterwards. The caller of
1512 this incarnation of the match() function will have set MATCH_CONDASSERT in
1513 md->match_function_type, and one of these opcodes will be the first opcode
1514 that is processed. We use a local variable that is preserved over calls to
1515 match() to remember this case. */
1516
1517 case OP_ASSERT:
1518 case OP_ASSERTBACK:
1519 if (md->match_function_type == MATCH_CONDASSERT)
1520 {
1521 condassert = TRUE;
1522 md->match_function_type = 0;
1523 }
1524 else condassert = FALSE;
1525
1526 do
1527 {
1528 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1529 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1530 {
1531 mstart = md->start_match_ptr; /* In case \K reset it */
1532 markptr = md->mark;
1533 break;
1534 }
1535
1536 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1537 as NOMATCH. */
1538
1539 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1540 ecode += GET(ecode, 1);
1541 }
1542 while (*ecode == OP_ALT);
1543
1544 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1545
1546 /* If checking an assertion for a condition, return MATCH_MATCH. */
1547
1548 if (condassert) RRETURN(MATCH_MATCH);
1549
1550 /* Continue from after the assertion, updating the offsets high water
1551 mark, since extracts may have been taken during the assertion. */
1552
1553 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1554 ecode += 1 + LINK_SIZE;
1555 offset_top = md->end_offset_top;
1556 continue;
1557
1558 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1559 PRUNE, or COMMIT means we must assume failure without checking subsequent
1560 branches. */
1561
1562 case OP_ASSERT_NOT:
1563 case OP_ASSERTBACK_NOT:
1564 if (md->match_function_type == MATCH_CONDASSERT)
1565 {
1566 condassert = TRUE;
1567 md->match_function_type = 0;
1568 }
1569 else condassert = FALSE;
1570
1571 do
1572 {
1573 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1574 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1575 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1576 {
1577 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1578 break;
1579 }
1580
1581 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1582 as NOMATCH. */
1583
1584 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1585 ecode += GET(ecode,1);
1586 }
1587 while (*ecode == OP_ALT);
1588
1589 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1590
1591 ecode += 1 + LINK_SIZE;
1592 continue;
1593
1594 /* Move the subject pointer back. This occurs only at the start of
1595 each branch of a lookbehind assertion. If we are too close to the start to
1596 move back, this match function fails. When working with UTF-8 we move
1597 back a number of characters, not bytes. */
1598
1599 case OP_REVERSE:
1600 #ifdef SUPPORT_UTF
1601 if (utf)
1602 {
1603 i = GET(ecode, 1);
1604 while (i-- > 0)
1605 {
1606 eptr--;
1607 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1608 BACKCHAR(eptr);
1609 }
1610 }
1611 else
1612 #endif
1613
1614 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1615
1616 {
1617 eptr -= GET(ecode, 1);
1618 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1619 }
1620
1621 /* Save the earliest consulted character, then skip to next op code */
1622
1623 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1624 ecode += 1 + LINK_SIZE;
1625 break;
1626
1627 /* The callout item calls an external function, if one is provided, passing
1628 details of the match so far. This is mainly for debugging, though the
1629 function is able to force a failure. */
1630
1631 case OP_CALLOUT:
1632 if (pcre_callout != NULL)
1633 {
1634 pcre_callout_block cb;
1635 cb.version = 2; /* Version 2 of the callout block */
1636 cb.callout_number = ecode[1];
1637 cb.offset_vector = md->offset_vector;
1638 cb.subject = (PCRE_SPTR)md->start_subject;
1639 cb.subject_length = (int)(md->end_subject - md->start_subject);
1640 cb.start_match = (int)(mstart - md->start_subject);
1641 cb.current_position = (int)(eptr - md->start_subject);
1642 cb.pattern_position = GET(ecode, 2);
1643 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1644 cb.capture_top = offset_top/2;
1645 cb.capture_last = md->capture_last;
1646 cb.callout_data = md->callout_data;
1647 cb.mark = (unsigned char *)markptr;
1648 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1649 if (rrc < 0) RRETURN(rrc);
1650 }
1651 ecode += 2 + 2*LINK_SIZE;
1652 break;
1653
1654 /* Recursion either matches the current regex, or some subexpression. The
1655 offset data is the offset to the starting bracket from the start of the
1656 whole pattern. (This is so that it works from duplicated subpatterns.)
1657
1658 The state of the capturing groups is preserved over recursion, and
1659 re-instated afterwards. We don't know how many are started and not yet
1660 finished (offset_top records the completed total) so we just have to save
1661 all the potential data. There may be up to 65535 such values, which is too
1662 large to put on the stack, but using malloc for small numbers seems
1663 expensive. As a compromise, the stack is used when there are no more than
1664 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1665
1666 There are also other values that have to be saved. We use a chained
1667 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1668 for the original version of this logic. It has, however, been hacked around
1669 a lot, so he is not to blame for the current way it works. */
1670
1671 case OP_RECURSE:
1672 {
1673 recursion_info *ri;
1674 int recno;
1675
1676 callpat = md->start_code + GET(ecode, 1);
1677 recno = (callpat == md->start_code)? 0 :
1678 GET2(callpat, 1 + LINK_SIZE);
1679
1680 /* Check for repeating a recursion without advancing the subject pointer.
1681 This should catch convoluted mutual recursions. (Some simple cases are
1682 caught at compile time.) */
1683
1684 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1685 if (recno == ri->group_num && eptr == ri->subject_position)
1686 RRETURN(PCRE_ERROR_RECURSELOOP);
1687
1688 /* Add to "recursing stack" */
1689
1690 new_recursive.group_num = recno;
1691 new_recursive.subject_position = eptr;
1692 new_recursive.prevrec = md->recursive;
1693 md->recursive = &new_recursive;
1694
1695 /* Where to continue from afterwards */
1696
1697 ecode += 1 + LINK_SIZE;
1698
1699 /* Now save the offset data */
1700
1701 new_recursive.saved_max = md->offset_end;
1702 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1703 new_recursive.offset_save = stacksave;
1704 else
1705 {
1706 new_recursive.offset_save =
1707 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1708 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1709 }
1710 memcpy(new_recursive.offset_save, md->offset_vector,
1711 new_recursive.saved_max * sizeof(int));
1712
1713 /* OK, now we can do the recursion. After processing each alternative,
1714 restore the offset data. If there were nested recursions, md->recursive
1715 might be changed, so reset it before looping. */
1716
1717 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1718 cbegroup = (*callpat >= OP_SBRA);
1719 do
1720 {
1721 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1722 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1723 md, eptrb, RM6);
1724 memcpy(md->offset_vector, new_recursive.offset_save,
1725 new_recursive.saved_max * sizeof(int));
1726 md->recursive = new_recursive.prevrec;
1727 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1728 {
1729 DPRINTF(("Recursion matched\n"));
1730 if (new_recursive.offset_save != stacksave)
1731 (pcre_free)(new_recursive.offset_save);
1732
1733 /* Set where we got to in the subject, and reset the start in case
1734 it was changed by \K. This *is* propagated back out of a recursion,
1735 for Perl compatibility. */
1736
1737 eptr = md->end_match_ptr;
1738 mstart = md->start_match_ptr;
1739 goto RECURSION_MATCHED; /* Exit loop; end processing */
1740 }
1741
1742 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1743 as NOMATCH. */
1744
1745 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1746 {
1747 DPRINTF(("Recursion gave error %d\n", rrc));
1748 if (new_recursive.offset_save != stacksave)
1749 (pcre_free)(new_recursive.offset_save);
1750 RRETURN(rrc);
1751 }
1752
1753 md->recursive = &new_recursive;
1754 callpat += GET(callpat, 1);
1755 }
1756 while (*callpat == OP_ALT);
1757
1758 DPRINTF(("Recursion didn't match\n"));
1759 md->recursive = new_recursive.prevrec;
1760 if (new_recursive.offset_save != stacksave)
1761 (pcre_free)(new_recursive.offset_save);
1762 MRRETURN(MATCH_NOMATCH);
1763 }
1764
1765 RECURSION_MATCHED:
1766 break;
1767
1768 /* An alternation is the end of a branch; scan along to find the end of the
1769 bracketed group and go to there. */
1770
1771 case OP_ALT:
1772 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1773 break;
1774
1775 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1776 indicating that it may occur zero times. It may repeat infinitely, or not
1777 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1778 with fixed upper repeat limits are compiled as a number of copies, with the
1779 optional ones preceded by BRAZERO or BRAMINZERO. */
1780
1781 case OP_BRAZERO:
1782 next = ecode + 1;
1783 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1785 do next += GET(next, 1); while (*next == OP_ALT);
1786 ecode = next + 1 + LINK_SIZE;
1787 break;
1788
1789 case OP_BRAMINZERO:
1790 next = ecode + 1;
1791 do next += GET(next, 1); while (*next == OP_ALT);
1792 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1794 ecode++;
1795 break;
1796
1797 case OP_SKIPZERO:
1798 next = ecode+1;
1799 do next += GET(next,1); while (*next == OP_ALT);
1800 ecode = next + 1 + LINK_SIZE;
1801 break;
1802
1803 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1804 here; just jump to the group, with allow_zero set TRUE. */
1805
1806 case OP_BRAPOSZERO:
1807 op = *(++ecode);
1808 allow_zero = TRUE;
1809 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1810 goto POSSESSIVE_NON_CAPTURE;
1811
1812 /* End of a group, repeated or non-repeating. */
1813
1814 case OP_KET:
1815 case OP_KETRMIN:
1816 case OP_KETRMAX:
1817 case OP_KETRPOS:
1818 prev = ecode - GET(ecode, 1);
1819
1820 /* If this was a group that remembered the subject start, in order to break
1821 infinite repeats of empty string matches, retrieve the subject start from
1822 the chain. Otherwise, set it NULL. */
1823
1824 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1825 {
1826 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1827 eptrb = eptrb->epb_prev; /* Backup to previous group */
1828 }
1829 else saved_eptr = NULL;
1830
1831 /* If we are at the end of an assertion group or a non-capturing atomic
1832 group, stop matching and return MATCH_MATCH, but record the current high
1833 water mark for use by positive assertions. We also need to record the match
1834 start in case it was changed by \K. */
1835
1836 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1837 *prev == OP_ONCE_NC)
1838 {
1839 md->end_match_ptr = eptr; /* For ONCE_NC */
1840 md->end_offset_top = offset_top;
1841 md->start_match_ptr = mstart;
1842 MRRETURN(MATCH_MATCH); /* Sets md->mark */
1843 }
1844
1845 /* For capturing groups we have to check the group number back at the start
1846 and if necessary complete handling an extraction by setting the offsets and
1847 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1848 into group 0, so it won't be picked up here. Instead, we catch it when the
1849 OP_END is reached. Other recursion is handled here. We just have to record
1850 the current subject position and start match pointer and give a MATCH
1851 return. */
1852
1853 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1854 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1855 {
1856 number = GET2(prev, 1+LINK_SIZE);
1857 offset = number << 1;
1858
1859 #ifdef PCRE_DEBUG
1860 printf("end bracket %d", number);
1861 printf("\n");
1862 #endif
1863
1864 /* Handle a recursively called group. */
1865
1866 if (md->recursive != NULL && md->recursive->group_num == number)
1867 {
1868 md->end_match_ptr = eptr;
1869 md->start_match_ptr = mstart;
1870 RRETURN(MATCH_MATCH);
1871 }
1872
1873 /* Deal with capturing */
1874
1875 md->capture_last = number;
1876 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1877 {
1878 /* If offset is greater than offset_top, it means that we are
1879 "skipping" a capturing group, and that group's offsets must be marked
1880 unset. In earlier versions of PCRE, all the offsets were unset at the
1881 start of matching, but this doesn't work because atomic groups and
1882 assertions can cause a value to be set that should later be unset.
1883 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1884 part of the atomic group, but this is not on the final matching path,
1885 so must be unset when 2 is set. (If there is no group 2, there is no
1886 problem, because offset_top will then be 2, indicating no capture.) */
1887
1888 if (offset > offset_top)
1889 {
1890 register int *iptr = md->offset_vector + offset_top;
1891 register int *iend = md->offset_vector + offset;
1892 while (iptr < iend) *iptr++ = -1;
1893 }
1894
1895 /* Now make the extraction */
1896
1897 md->offset_vector[offset] =
1898 md->offset_vector[md->offset_end - number];
1899 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1900 if (offset_top <= offset) offset_top = offset + 2;
1901 }
1902 }
1903
1904 /* For an ordinary non-repeating ket, just continue at this level. This
1905 also happens for a repeating ket if no characters were matched in the
1906 group. This is the forcible breaking of infinite loops as implemented in
1907 Perl 5.005. For a non-repeating atomic group that includes captures,
1908 establish a backup point by processing the rest of the pattern at a lower
1909 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1910 original OP_ONCE level, thereby bypassing intermediate backup points, but
1911 resetting any captures that happened along the way. */
1912
1913 if (*ecode == OP_KET || eptr == saved_eptr)
1914 {
1915 if (*prev == OP_ONCE)
1916 {
1917 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1919 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1920 RRETURN(MATCH_ONCE);
1921 }
1922 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1923 break;
1924 }
1925
1926 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1927 and return MATCH_KETRPOS. This makes it possible to do the repeats one
1928 at a time from the outer level, thus saving stack. */
1929
1930 if (*ecode == OP_KETRPOS)
1931 {
1932 md->end_match_ptr = eptr;
1933 md->end_offset_top = offset_top;
1934 RRETURN(MATCH_KETRPOS);
1935 }
1936
1937 /* The normal repeating kets try the rest of the pattern or restart from
1938 the preceding bracket, in the appropriate order. In the second case, we can
1939 use tail recursion to avoid using another stack frame, unless we have an
1940 atomic group or an unlimited repeat of a group that can match an empty
1941 string. */
1942
1943 if (*ecode == OP_KETRMIN)
1944 {
1945 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1946 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1947 if (*prev == OP_ONCE)
1948 {
1949 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1951 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1952 RRETURN(MATCH_ONCE);
1953 }
1954 if (*prev >= OP_SBRA) /* Could match an empty string */
1955 {
1956 md->match_function_type = MATCH_CBEGROUP;
1957 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1958 RRETURN(rrc);
1959 }
1960 ecode = prev;
1961 goto TAIL_RECURSE;
1962 }
1963 else /* OP_KETRMAX */
1964 {
1965 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1966 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1967 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1969 if (*prev == OP_ONCE)
1970 {
1971 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1972 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1973 md->once_target = prev;
1974 RRETURN(MATCH_ONCE);
1975 }
1976 ecode += 1 + LINK_SIZE;
1977 goto TAIL_RECURSE;
1978 }
1979 /* Control never gets here */
1980
1981 /* Not multiline mode: start of subject assertion, unless notbol. */
1982
1983 case OP_CIRC:
1984 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1985
1986 /* Start of subject assertion */
1987
1988 case OP_SOD:
1989 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1990 ecode++;
1991 break;
1992
1993 /* Multiline mode: start of subject unless notbol, or after any newline. */
1994
1995 case OP_CIRCM:
1996 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1997 if (eptr != md->start_subject &&
1998 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1999 MRRETURN(MATCH_NOMATCH);
2000 ecode++;
2001 break;
2002
2003 /* Start of match assertion */
2004
2005 case OP_SOM:
2006 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
2007 ecode++;
2008 break;
2009
2010 /* Reset the start of match point */
2011
2012 case OP_SET_SOM:
2013 mstart = eptr;
2014 ecode++;
2015 break;
2016
2017 /* Multiline mode: assert before any newline, or before end of subject
2018 unless noteol is set. */
2019
2020 case OP_DOLLM:
2021 if (eptr < md->end_subject)
2022 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
2023 else
2024 {
2025 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2026 SCHECK_PARTIAL();
2027 }
2028 ecode++;
2029 break;
2030
2031 /* Not multiline mode: assert before a terminating newline or before end of
2032 subject unless noteol is set. */
2033
2034 case OP_DOLL:
2035 if (md->noteol) MRRETURN(MATCH_NOMATCH);
2036 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2037
2038 /* ... else fall through for endonly */
2039
2040 /* End of subject assertion (\z) */
2041
2042 case OP_EOD:
2043 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
2044 SCHECK_PARTIAL();
2045 ecode++;
2046 break;
2047
2048 /* End of subject or ending \n assertion (\Z) */
2049
2050 case OP_EODN:
2051 ASSERT_NL_OR_EOS:
2052 if (eptr < md->end_subject &&
2053 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2054 MRRETURN(MATCH_NOMATCH);
2055
2056 /* Either at end of string or \n before end. */
2057
2058 SCHECK_PARTIAL();
2059 ecode++;
2060 break;
2061
2062 /* Word boundary assertions */
2063
2064 case OP_NOT_WORD_BOUNDARY:
2065 case OP_WORD_BOUNDARY:
2066 {
2067
2068 /* Find out if the previous and current characters are "word" characters.
2069 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2070 be "non-word" characters. Remember the earliest consulted character for
2071 partial matching. */
2072
2073 #ifdef SUPPORT_UTF
2074 if (utf)
2075 {
2076 /* Get status of previous character */
2077
2078 if (eptr == md->start_subject) prev_is_word = FALSE; else
2079 {
2080 PCRE_PUCHAR lastptr = eptr - 1;
2081 BACKCHAR(lastptr);
2082 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2083 GETCHAR(c, lastptr);
2084 #ifdef SUPPORT_UCP
2085 if (md->use_ucp)
2086 {
2087 if (c == '_') prev_is_word = TRUE; else
2088 {
2089 int cat = UCD_CATEGORY(c);
2090 prev_is_word = (cat == ucp_L || cat == ucp_N);
2091 }
2092 }
2093 else
2094 #endif
2095 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2096 }
2097
2098 /* Get status of next character */
2099
2100 if (eptr >= md->end_subject)
2101 {
2102 SCHECK_PARTIAL();
2103 cur_is_word = FALSE;
2104 }
2105 else
2106 {
2107 GETCHAR(c, eptr);
2108 #ifdef SUPPORT_UCP
2109 if (md->use_ucp)
2110 {
2111 if (c == '_') cur_is_word = TRUE; else
2112 {
2113 int cat = UCD_CATEGORY(c);
2114 cur_is_word = (cat == ucp_L || cat == ucp_N);
2115 }
2116 }
2117 else
2118 #endif
2119 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2120 }
2121 }
2122 else
2123 #endif
2124
2125 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2126 consistency with the behaviour of \w we do use it in this case. */
2127
2128 {
2129 /* Get status of previous character */
2130
2131 if (eptr == md->start_subject) prev_is_word = FALSE; else
2132 {
2133 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2134 #ifdef SUPPORT_UCP
2135 if (md->use_ucp)
2136 {
2137 c = eptr[-1];
2138 if (c == '_') prev_is_word = TRUE; else
2139 {
2140 int cat = UCD_CATEGORY(c);
2141 prev_is_word = (cat == ucp_L || cat == ucp_N);
2142 }
2143 }
2144 else
2145 #endif
2146 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2147 }
2148
2149 /* Get status of next character */
2150
2151 if (eptr >= md->end_subject)
2152 {
2153 SCHECK_PARTIAL();
2154 cur_is_word = FALSE;
2155 }
2156 else
2157 #ifdef SUPPORT_UCP
2158 if (md->use_ucp)
2159 {
2160 c = *eptr;
2161 if (c == '_') cur_is_word = TRUE; else
2162 {
2163 int cat = UCD_CATEGORY(c);
2164 cur_is_word = (cat == ucp_L || cat == ucp_N);
2165 }
2166 }
2167 else
2168 #endif
2169 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2170 }
2171
2172 /* Now see if the situation is what we want */
2173
2174 if ((*ecode++ == OP_WORD_BOUNDARY)?
2175 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2176 MRRETURN(MATCH_NOMATCH);
2177 }
2178 break;
2179
2180 /* Match a single character type; inline for speed */
2181
2182 case OP_ANY:
2183 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2184 /* Fall through */
2185
2186 case OP_ALLANY:
2187 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2188 { /* not be updated before SCHECK_PARTIAL. */
2189 SCHECK_PARTIAL();
2190 MRRETURN(MATCH_NOMATCH);
2191 }
2192 eptr++;
2193 #ifdef SUPPORT_UTF
2194 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2195 #endif
2196 ecode++;
2197 break;
2198
2199 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2200 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2201
2202 case OP_ANYBYTE:
2203 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2204 { /* not be updated before SCHECK_PARTIAL. */
2205 SCHECK_PARTIAL();
2206 MRRETURN(MATCH_NOMATCH);
2207 }
2208 eptr++;
2209 ecode++;
2210 break;
2211
2212 case OP_NOT_DIGIT:
2213 if (eptr >= md->end_subject)
2214 {
2215 SCHECK_PARTIAL();
2216 MRRETURN(MATCH_NOMATCH);
2217 }
2218 GETCHARINCTEST(c, eptr);
2219 if (
2220 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2221 c < 256 &&
2222 #endif
2223 (md->ctypes[c] & ctype_digit) != 0
2224 )
2225 MRRETURN(MATCH_NOMATCH);
2226 ecode++;
2227 break;
2228
2229 case OP_DIGIT:
2230 if (eptr >= md->end_subject)
2231 {
2232 SCHECK_PARTIAL();
2233 MRRETURN(MATCH_NOMATCH);
2234 }
2235 GETCHARINCTEST(c, eptr);
2236 if (
2237 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2238 c > 255 ||
2239 #endif
2240 (md->ctypes[c] & ctype_digit) == 0
2241 )
2242 MRRETURN(MATCH_NOMATCH);
2243 ecode++;
2244 break;
2245
2246 case OP_NOT_WHITESPACE:
2247 if (eptr >= md->end_subject)
2248 {
2249 SCHECK_PARTIAL();
2250 MRRETURN(MATCH_NOMATCH);
2251 }
2252 GETCHARINCTEST(c, eptr);
2253 if (
2254 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2255 c < 256 &&
2256 #endif
2257 (md->ctypes[c] & ctype_space) != 0
2258 )
2259 MRRETURN(MATCH_NOMATCH);
2260 ecode++;
2261 break;
2262
2263 case OP_WHITESPACE:
2264 if (eptr >= md->end_subject)
2265 {
2266 SCHECK_PARTIAL();
2267 MRRETURN(MATCH_NOMATCH);
2268 }
2269 GETCHARINCTEST(c, eptr);
2270 if (
2271 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2272 c > 255 ||
2273 #endif
2274 (md->ctypes[c] & ctype_space) == 0
2275 )
2276 MRRETURN(MATCH_NOMATCH);
2277 ecode++;
2278 break;
2279
2280 case OP_NOT_WORDCHAR:
2281 if (eptr >= md->end_subject)
2282 {
2283 SCHECK_PARTIAL();
2284 MRRETURN(MATCH_NOMATCH);
2285 }
2286 GETCHARINCTEST(c, eptr);
2287 if (
2288 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2289 c < 256 &&
2290 #endif
2291 (md->ctypes[c] & ctype_word) != 0
2292 )
2293 MRRETURN(MATCH_NOMATCH);
2294 ecode++;
2295 break;
2296
2297 case OP_WORDCHAR:
2298 if (eptr >= md->end_subject)
2299 {
2300 SCHECK_PARTIAL();
2301 MRRETURN(MATCH_NOMATCH);
2302 }
2303 GETCHARINCTEST(c, eptr);
2304 if (
2305 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2306 c > 255 ||
2307 #endif
2308 (md->ctypes[c] & ctype_word) == 0
2309 )
2310 MRRETURN(MATCH_NOMATCH);
2311 ecode++;
2312 break;
2313
2314 case OP_ANYNL:
2315 if (eptr >= md->end_subject)
2316 {
2317 SCHECK_PARTIAL();
2318 MRRETURN(MATCH_NOMATCH);
2319 }
2320 GETCHARINCTEST(c, eptr);
2321 switch(c)
2322 {
2323 default: MRRETURN(MATCH_NOMATCH);
2324
2325 case 0x000d:
2326 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2327 break;
2328
2329 case 0x000a:
2330 break;
2331
2332 case 0x000b:
2333 case 0x000c:
2334 case 0x0085:
2335 case 0x2028:
2336 case 0x2029:
2337 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2338 break;
2339 }
2340 ecode++;
2341 break;
2342
2343 case OP_NOT_HSPACE:
2344 if (eptr >= md->end_subject)
2345 {
2346 SCHECK_PARTIAL();
2347 MRRETURN(MATCH_NOMATCH);
2348 }
2349 GETCHARINCTEST(c, eptr);
2350 switch(c)
2351 {
2352 default: break;
2353 case 0x09: /* HT */
2354 case 0x20: /* SPACE */
2355 case 0xa0: /* NBSP */
2356 case 0x1680: /* OGHAM SPACE MARK */
2357 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2358 case 0x2000: /* EN QUAD */
2359 case 0x2001: /* EM QUAD */
2360 case 0x2002: /* EN SPACE */
2361 case 0x2003: /* EM SPACE */
2362 case 0x2004: /* THREE-PER-EM SPACE */
2363 case 0x2005: /* FOUR-PER-EM SPACE */
2364 case 0x2006: /* SIX-PER-EM SPACE */
2365 case 0x2007: /* FIGURE SPACE */
2366 case 0x2008: /* PUNCTUATION SPACE */
2367 case 0x2009: /* THIN SPACE */
2368 case 0x200A: /* HAIR SPACE */
2369 case 0x202f: /* NARROW NO-BREAK SPACE */
2370 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2371 case 0x3000: /* IDEOGRAPHIC SPACE */
2372 MRRETURN(MATCH_NOMATCH);
2373 }
2374 ecode++;
2375 break;
2376
2377 case OP_HSPACE:
2378 if (eptr >= md->end_subject)
2379 {
2380 SCHECK_PARTIAL();
2381 MRRETURN(MATCH_NOMATCH);
2382 }
2383 GETCHARINCTEST(c, eptr);
2384 switch(c)
2385 {
2386 default: MRRETURN(MATCH_NOMATCH);
2387 case 0x09: /* HT */
2388 case 0x20: /* SPACE */
2389 case 0xa0: /* NBSP */
2390 case 0x1680: /* OGHAM SPACE MARK */
2391 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2392 case 0x2000: /* EN QUAD */
2393 case 0x2001: /* EM QUAD */
2394 case 0x2002: /* EN SPACE */
2395 case 0x2003: /* EM SPACE */
2396 case 0x2004: /* THREE-PER-EM SPACE */
2397 case 0x2005: /* FOUR-PER-EM SPACE */
2398 case 0x2006: /* SIX-PER-EM SPACE */
2399 case 0x2007: /* FIGURE SPACE */
2400 case 0x2008: /* PUNCTUATION SPACE */
2401 case 0x2009: /* THIN SPACE */
2402 case 0x200A: /* HAIR SPACE */
2403 case 0x202f: /* NARROW NO-BREAK SPACE */
2404 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2405 case 0x3000: /* IDEOGRAPHIC SPACE */
2406 break;
2407 }
2408 ecode++;
2409 break;
2410
2411 case OP_NOT_VSPACE:
2412 if (eptr >= md->end_subject)
2413 {
2414 SCHECK_PARTIAL();
2415 MRRETURN(MATCH_NOMATCH);
2416 }
2417 GETCHARINCTEST(c, eptr);
2418 switch(c)
2419 {
2420 default: break;
2421 case 0x0a: /* LF */
2422 case 0x0b: /* VT */
2423 case 0x0c: /* FF */
2424 case 0x0d: /* CR */
2425 case 0x85: /* NEL */
2426 case 0x2028: /* LINE SEPARATOR */
2427 case 0x2029: /* PARAGRAPH SEPARATOR */
2428 MRRETURN(MATCH_NOMATCH);
2429 }
2430 ecode++;
2431 break;
2432
2433 case OP_VSPACE:
2434 if (eptr >= md->end_subject)
2435 {
2436 SCHECK_PARTIAL();
2437 MRRETURN(MATCH_NOMATCH);
2438 }
2439 GETCHARINCTEST(c, eptr);
2440 switch(c)
2441 {
2442 default: MRRETURN(MATCH_NOMATCH);
2443 case 0x0a: /* LF */
2444 case 0x0b: /* VT */
2445 case 0x0c: /* FF */
2446 case 0x0d: /* CR */
2447 case 0x85: /* NEL */
2448 case 0x2028: /* LINE SEPARATOR */
2449 case 0x2029: /* PARAGRAPH SEPARATOR */
2450 break;
2451 }
2452 ecode++;
2453 break;
2454
2455 #ifdef SUPPORT_UCP
2456 /* Check the next character by Unicode property. We will get here only
2457 if the support is in the binary; otherwise a compile-time error occurs. */
2458
2459 case OP_PROP:
2460 case OP_NOTPROP:
2461 if (eptr >= md->end_subject)
2462 {
2463 SCHECK_PARTIAL();
2464 MRRETURN(MATCH_NOMATCH);
2465 }
2466 GETCHARINCTEST(c, eptr);
2467 {
2468 const ucd_record *prop = GET_UCD(c);
2469
2470 switch(ecode[1])
2471 {
2472 case PT_ANY:
2473 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2474 break;
2475
2476 case PT_LAMP:
2477 if ((prop->chartype == ucp_Lu ||
2478 prop->chartype == ucp_Ll ||
2479 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2480 MRRETURN(MATCH_NOMATCH);
2481 break;
2482
2483 case PT_GC:
2484 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2485 MRRETURN(MATCH_NOMATCH);
2486 break;
2487
2488 case PT_PC:
2489 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2490 MRRETURN(MATCH_NOMATCH);
2491 break;
2492
2493 case PT_SC:
2494 if ((ecode[2] != prop->script) == (op == OP_PROP))
2495 MRRETURN(MATCH_NOMATCH);
2496 break;
2497
2498 /* These are specials */
2499
2500 case PT_ALNUM:
2501 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2502 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2503 MRRETURN(MATCH_NOMATCH);
2504 break;
2505
2506 case PT_SPACE: /* Perl space */
2507 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2508 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2509 == (op == OP_NOTPROP))
2510 MRRETURN(MATCH_NOMATCH);
2511 break;
2512
2513 case PT_PXSPACE: /* POSIX space */
2514 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2515 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2516 c == CHAR_FF || c == CHAR_CR)
2517 == (op == OP_NOTPROP))
2518 MRRETURN(MATCH_NOMATCH);
2519 break;
2520
2521 case PT_WORD:
2522 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2523 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2524 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2525 MRRETURN(MATCH_NOMATCH);
2526 break;
2527
2528 /* This should never occur */
2529
2530 default:
2531 RRETURN(PCRE_ERROR_INTERNAL);
2532 }
2533
2534 ecode += 3;
2535 }
2536 break;
2537
2538 /* Match an extended Unicode sequence. We will get here only if the support
2539 is in the binary; otherwise a compile-time error occurs. */
2540
2541 case OP_EXTUNI:
2542 if (eptr >= md->end_subject)
2543 {
2544 SCHECK_PARTIAL();
2545 MRRETURN(MATCH_NOMATCH);
2546 }
2547 GETCHARINCTEST(c, eptr);
2548 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
2549 while (eptr < md->end_subject)
2550 {
2551 int len = 1;
2552 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2553 if (UCD_CATEGORY(c) != ucp_M) break;
2554 eptr += len;
2555 }
2556 ecode++;
2557 break;
2558 #endif
2559
2560
2561 /* Match a back reference, possibly repeatedly. Look past the end of the
2562 item to see if there is repeat information following. The code is similar
2563 to that for character classes, but repeated for efficiency. Then obey
2564 similar code to character type repeats - written out again for speed.
2565 However, if the referenced string is the empty string, always treat
2566 it as matched, any number of times (otherwise there could be infinite
2567 loops). */
2568
2569 case OP_REF:
2570 case OP_REFI:
2571 caseless = op == OP_REFI;
2572 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2573 ecode += 1 + IMM2_SIZE;
2574
2575 /* If the reference is unset, there are two possibilities:
2576
2577 (a) In the default, Perl-compatible state, set the length negative;
2578 this ensures that every attempt at a match fails. We can't just fail
2579 here, because of the possibility of quantifiers with zero minima.
2580
2581 (b) If the JavaScript compatibility flag is set, set the length to zero
2582 so that the back reference matches an empty string.
2583
2584 Otherwise, set the length to the length of what was matched by the
2585 referenced subpattern. */
2586
2587 if (offset >= offset_top || md->offset_vector[offset] < 0)
2588 length = (md->jscript_compat)? 0 : -1;
2589 else
2590 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2591
2592 /* Set up for repetition, or handle the non-repeated case */
2593
2594 switch (*ecode)
2595 {
2596 case OP_CRSTAR:
2597 case OP_CRMINSTAR:
2598 case OP_CRPLUS:
2599 case OP_CRMINPLUS:
2600 case OP_CRQUERY:
2601 case OP_CRMINQUERY:
2602 c = *ecode++ - OP_CRSTAR;
2603 minimize = (c & 1) != 0;
2604 min = rep_min[c]; /* Pick up values from tables; */
2605 max = rep_max[c]; /* zero for max => infinity */
2606 if (max == 0) max = INT_MAX;
2607 break;
2608
2609 case OP_CRRANGE:
2610 case OP_CRMINRANGE:
2611 minimize = (*ecode == OP_CRMINRANGE);
2612 min = GET2(ecode, 1);
2613 max = GET2(ecode, 1 + IMM2_SIZE);
2614 if (max == 0) max = INT_MAX;
2615 ecode += 1 + 2 * IMM2_SIZE;
2616 break;
2617
2618 default: /* No repeat follows */
2619 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2620 {
2621 CHECK_PARTIAL();
2622 MRRETURN(MATCH_NOMATCH);
2623 }
2624 eptr += length;
2625 continue; /* With the main loop */
2626 }
2627
2628 /* Handle repeated back references. If the length of the reference is
2629 zero, just continue with the main loop. */
2630
2631 if (length == 0) continue;
2632
2633 /* First, ensure the minimum number of matches are present. We get back
2634 the length of the reference string explicitly rather than passing the
2635 address of eptr, so that eptr can be a register variable. */
2636
2637 for (i = 1; i <= min; i++)
2638 {
2639 int slength;
2640 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2641 {
2642 CHECK_PARTIAL();
2643 MRRETURN(MATCH_NOMATCH);
2644 }
2645 eptr += slength;
2646 }
2647
2648 /* If min = max, continue at the same level without recursion.
2649 They are not both allowed to be zero. */
2650
2651 if (min == max) continue;
2652
2653 /* If minimizing, keep trying and advancing the pointer */
2654
2655 if (minimize)
2656 {
2657 for (fi = min;; fi++)
2658 {
2659 int slength;
2660 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2662 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2663 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2664 {
2665 CHECK_PARTIAL();
2666 MRRETURN(MATCH_NOMATCH);
2667 }
2668 eptr += slength;
2669 }
2670 /* Control never gets here */
2671 }
2672
2673 /* If maximizing, find the longest string and work backwards */
2674
2675 else
2676 {
2677 pp = eptr;
2678 for (i = min; i < max; i++)
2679 {
2680 int slength;
2681 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2682 {
2683 CHECK_PARTIAL();
2684 break;
2685 }
2686 eptr += slength;
2687 }
2688 while (eptr >= pp)
2689 {
2690 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2691 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2692 eptr -= length;
2693 }
2694 MRRETURN(MATCH_NOMATCH);
2695 }
2696 /* Control never gets here */
2697
2698 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2699 used when all the characters in the class have values in the range 0-255,
2700 and either the matching is caseful, or the characters are in the range
2701 0-127 when UTF-8 processing is enabled. The only difference between
2702 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2703 encountered.
2704
2705 First, look past the end of the item to see if there is repeat information
2706 following. Then obey similar code to character type repeats - written out
2707 again for speed. */
2708
2709 case OP_NCLASS:
2710 case OP_CLASS:
2711 {
2712 /* The data variable is saved across frames, so the byte map needs to
2713 be stored there. */
2714 #define BYTE_MAP ((pcre_uint8 *)data)
2715 data = ecode + 1; /* Save for matching */
2716 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2717
2718 switch (*ecode)
2719 {
2720 case OP_CRSTAR:
2721 case OP_CRMINSTAR:
2722 case OP_CRPLUS:
2723 case OP_CRMINPLUS:
2724 case OP_CRQUERY:
2725 case OP_CRMINQUERY:
2726 c = *ecode++ - OP_CRSTAR;
2727 minimize = (c & 1) != 0;
2728 min = rep_min[c]; /* Pick up values from tables; */
2729 max = rep_max[c]; /* zero for max => infinity */
2730 if (max == 0) max = INT_MAX;
2731 break;
2732
2733 case OP_CRRANGE:
2734 case OP_CRMINRANGE:
2735 minimize = (*ecode == OP_CRMINRANGE);
2736 min = GET2(ecode, 1);
2737 max = GET2(ecode, 1 + IMM2_SIZE);
2738 if (max == 0) max = INT_MAX;
2739 ecode += 1 + 2 * IMM2_SIZE;
2740 break;
2741
2742 default: /* No repeat follows */
2743 min = max = 1;
2744 break;
2745 }
2746
2747 /* First, ensure the minimum number of matches are present. */
2748
2749 #ifdef SUPPORT_UTF
2750 if (utf)
2751 {
2752 for (i = 1; i <= min; i++)
2753 {
2754 if (eptr >= md->end_subject)
2755 {
2756 SCHECK_PARTIAL();
2757 MRRETURN(MATCH_NOMATCH);
2758 }
2759 GETCHARINC(c, eptr);
2760 if (c > 255)
2761 {
2762 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2763 }
2764 else
2765 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2766 }
2767 }
2768 else
2769 #endif
2770 /* Not UTF mode */
2771 {
2772 for (i = 1; i <= min; i++)
2773 {
2774 if (eptr >= md->end_subject)
2775 {
2776 SCHECK_PARTIAL();
2777 MRRETURN(MATCH_NOMATCH);
2778 }
2779 c = *eptr++;
2780 #ifndef COMPILE_PCRE8
2781 if (c > 255)
2782 {
2783 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2784 }
2785 else
2786 #endif
2787 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2788 }
2789 }
2790
2791 /* If max == min we can continue with the main loop without the
2792 need to recurse. */
2793
2794 if (min == max) continue;
2795
2796 /* If minimizing, keep testing the rest of the expression and advancing
2797 the pointer while it matches the class. */
2798
2799 if (minimize)
2800 {
2801 #ifdef SUPPORT_UTF
2802 if (utf)
2803 {
2804 for (fi = min;; fi++)
2805 {
2806 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2808 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2809 if (eptr >= md->end_subject)
2810 {
2811 SCHECK_PARTIAL();
2812 MRRETURN(MATCH_NOMATCH);
2813 }
2814 GETCHARINC(c, eptr);
2815 if (c > 255)
2816 {
2817 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2818 }
2819 else
2820 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2821 }
2822 }
2823 else
2824 #endif
2825 /* Not UTF mode */
2826 {
2827 for (fi = min;; fi++)
2828 {
2829 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2831 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2832 if (eptr >= md->end_subject)
2833 {
2834 SCHECK_PARTIAL();
2835 MRRETURN(MATCH_NOMATCH);
2836 }
2837 c = *eptr++;
2838 #ifndef COMPILE_PCRE8
2839 if (c > 255)
2840 {
2841 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2842 }
2843 else
2844 #endif
2845 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2846 }
2847 }
2848 /* Control never gets here */
2849 }
2850
2851 /* If maximizing, find the longest possible run, then work backwards. */
2852
2853 else
2854 {
2855 pp = eptr;
2856
2857 #ifdef SUPPORT_UTF
2858 if (utf)
2859 {
2860 for (i = min; i < max; i++)
2861 {
2862 int len = 1;
2863 if (eptr >= md->end_subject)
2864 {
2865 SCHECK_PARTIAL();
2866 break;
2867 }
2868 GETCHARLEN(c, eptr, len);
2869 if (c > 255)
2870 {
2871 if (op == OP_CLASS) break;
2872 }
2873 else
2874 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2875 eptr += len;
2876 }
2877 for (;;)
2878 {
2879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2881 if (eptr-- == pp) break; /* Stop if tried at original pos */
2882 BACKCHAR(eptr);
2883 }
2884 }
2885 else
2886 #endif
2887 /* Not UTF mode */
2888 {
2889 for (i = min; i < max; i++)
2890 {
2891 if (eptr >= md->end_subject)
2892 {
2893 SCHECK_PARTIAL();
2894 break;
2895 }
2896 c = *eptr;
2897 #ifndef COMPILE_PCRE8
2898 if (c > 255)
2899 {
2900 if (op == OP_CLASS) break;
2901 }
2902 else
2903 #endif
2904 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2905 eptr++;
2906 }
2907 while (eptr >= pp)
2908 {
2909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2911 eptr--;
2912 }
2913 }
2914
2915 MRRETURN(MATCH_NOMATCH);
2916 }
2917 #undef BYTE_MAP
2918 }
2919 /* Control never gets here */
2920
2921
2922 /* Match an extended character class. This opcode is encountered only
2923 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2924 mode, because Unicode properties are supported in non-UTF-8 mode. */
2925
2926 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2927 case OP_XCLASS:
2928 {
2929 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2930 ecode += GET(ecode, 1); /* Advance past the item */
2931
2932 switch (*ecode)
2933 {
2934 case OP_CRSTAR:
2935 case OP_CRMINSTAR:
2936 case OP_CRPLUS:
2937 case OP_CRMINPLUS:
2938 case OP_CRQUERY:
2939 case OP_CRMINQUERY:
2940 c = *ecode++ - OP_CRSTAR;
2941 minimize = (c & 1) != 0;
2942 min = rep_min[c]; /* Pick up values from tables; */
2943 max = rep_max[c]; /* zero for max => infinity */
2944 if (max == 0) max = INT_MAX;
2945 break;
2946
2947 case OP_CRRANGE:
2948 case OP_CRMINRANGE:
2949 minimize = (*ecode == OP_CRMINRANGE);
2950 min = GET2(ecode, 1);
2951 max = GET2(ecode, 1 + IMM2_SIZE);
2952 if (max == 0) max = INT_MAX;
2953 ecode += 1 + 2 * IMM2_SIZE;
2954 break;
2955
2956 default: /* No repeat follows */
2957 min = max = 1;
2958 break;
2959 }
2960
2961 /* First, ensure the minimum number of matches are present. */
2962
2963 for (i = 1; i <= min; i++)
2964 {
2965 if (eptr >= md->end_subject)
2966 {
2967 SCHECK_PARTIAL();
2968 MRRETURN(MATCH_NOMATCH);
2969 }
2970 GETCHARINCTEST(c, eptr);
2971 if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH);
2972 }
2973
2974 /* If max == min we can continue with the main loop without the
2975 need to recurse. */
2976
2977 if (min == max) continue;
2978
2979 /* If minimizing, keep testing the rest of the expression and advancing
2980 the pointer while it matches the class. */
2981
2982 if (minimize)
2983 {
2984 for (fi = min;; fi++)
2985 {
2986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2989 if (eptr >= md->end_subject)
2990 {
2991 SCHECK_PARTIAL();
2992 MRRETURN(MATCH_NOMATCH);
2993 }
2994 GETCHARINCTEST(c, eptr);
2995 if (!PRIV(xclass)(c, data)) MRRETURN(MATCH_NOMATCH);
2996 }
2997 /* Control never gets here */
2998 }
2999
3000 /* If maximizing, find the longest possible run, then work backwards. */
3001
3002 else
3003 {
3004 pp = eptr;
3005 for (i = min; i < max; i++)
3006 {
3007 int len = 1;
3008 if (eptr >= md->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 break;
3012 }
3013 #ifdef SUPPORT_UTF
3014 GETCHARLENTEST(c, eptr, len);
3015 #else
3016 c = *eptr;
3017 #endif
3018 if (!PRIV(xclass)(c, data)) break;
3019 eptr += len;
3020 }
3021 for(;;)
3022 {
3023 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3025 if (eptr-- == pp) break; /* Stop if tried at original pos */
3026 #ifdef SUPPORT_UTF
3027 if (utf) BACKCHAR(eptr);
3028 #endif
3029 }
3030 MRRETURN(MATCH_NOMATCH);
3031 }
3032
3033 /* Control never gets here */
3034 }
3035 #endif /* End of XCLASS */
3036
3037 /* Match a single character, casefully */
3038
3039 case OP_CHAR:
3040 #ifdef SUPPORT_UTF
3041 if (utf)
3042 {
3043 length = 1;
3044 ecode++;
3045 GETCHARLEN(fc, ecode, length);
3046 if (length > md->end_subject - eptr)
3047 {
3048 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3049 MRRETURN(MATCH_NOMATCH);
3050 }
3051 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
3052 }
3053 else
3054 #endif
3055 /* Not UTF mode */
3056 {
3057 if (md->end_subject - eptr < 1)
3058 {
3059 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3060 MRRETURN(MATCH_NOMATCH);
3061 }
3062 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
3063 ecode += 2;
3064 }
3065 break;
3066
3067 /* Match a single character, caselessly */
3068
3069 case OP_CHARI:
3070 #ifdef SUPPORT_UTF
3071 if (utf)
3072 {
3073 length = 1;
3074 ecode++;
3075 GETCHARLEN(fc, ecode, length);
3076
3077 if (length > md->end_subject - eptr)
3078 {
3079 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3080 MRRETURN(MATCH_NOMATCH);
3081 }
3082
3083 /* If the pattern character's value is < 128, we have only one byte, and
3084 can use the fast lookup table. */
3085
3086 if (fc < 128)
3087 {
3088 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3089 }
3090
3091 /* Otherwise we must pick up the subject character */
3092
3093 else
3094 {
3095 unsigned int dc;
3096 GETCHARINC(dc, eptr);
3097 ecode += length;
3098
3099 /* If we have Unicode property support, we can use it to test the other
3100 case of the character, if there is one. */
3101
3102 if (fc != dc)
3103 {
3104 #ifdef SUPPORT_UCP
3105 if (dc != UCD_OTHERCASE(fc))
3106 #endif
3107 MRRETURN(MATCH_NOMATCH);
3108 }
3109 }
3110 }
3111 else
3112 #endif /* SUPPORT_UTF */
3113
3114 /* Not UTF mode */
3115 {
3116 if (md->end_subject - eptr < 1)
3117 {
3118 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3119 MRRETURN(MATCH_NOMATCH);
3120 }
3121 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3122 != TABLE_GET(*eptr, md->lcc, *eptr)) MRRETURN(MATCH_NOMATCH);
3123 eptr++;
3124 ecode += 2;
3125 }
3126 break;
3127
3128 /* Match a single character repeatedly. */
3129
3130 case OP_EXACT:
3131 case OP_EXACTI:
3132 min = max = GET2(ecode, 1);
3133 ecode += 1 + IMM2_SIZE;
3134 goto REPEATCHAR;
3135
3136 case OP_POSUPTO:
3137 case OP_POSUPTOI:
3138 possessive = TRUE;
3139 /* Fall through */
3140
3141 case OP_UPTO:
3142 case OP_UPTOI:
3143 case OP_MINUPTO:
3144 case OP_MINUPTOI:
3145 min = 0;
3146 max = GET2(ecode, 1);
3147 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3148 ecode += 1 + IMM2_SIZE;
3149 goto REPEATCHAR;
3150
3151 case OP_POSSTAR:
3152 case OP_POSSTARI:
3153 possessive = TRUE;
3154 min = 0;
3155 max = INT_MAX;
3156 ecode++;
3157 goto REPEATCHAR;
3158
3159 case OP_POSPLUS:
3160 case OP_POSPLUSI:
3161 possessive = TRUE;
3162 min = 1;
3163 max = INT_MAX;
3164 ecode++;
3165 goto REPEATCHAR;
3166
3167 case OP_POSQUERY:
3168 case OP_POSQUERYI:
3169 possessive = TRUE;
3170 min = 0;
3171 max = 1;
3172 ecode++;
3173 goto REPEATCHAR;
3174
3175 case OP_STAR:
3176 case OP_STARI:
3177 case OP_MINSTAR:
3178 case OP_MINSTARI:
3179 case OP_PLUS:
3180 case OP_PLUSI:
3181 case OP_MINPLUS:
3182 case OP_MINPLUSI:
3183 case OP_QUERY:
3184 case OP_QUERYI:
3185 case OP_MINQUERY:
3186 case OP_MINQUERYI:
3187 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3188 minimize = (c & 1) != 0;
3189 min = rep_min[c]; /* Pick up values from tables; */
3190 max = rep_max[c]; /* zero for max => infinity */
3191 if (max == 0) max = INT_MAX;
3192
3193 /* Common code for all repeated single-character matches. */
3194
3195 REPEATCHAR:
3196 #ifdef SUPPORT_UTF
3197 if (utf)
3198 {
3199 length = 1;
3200 charptr = ecode;
3201 GETCHARLEN(fc, ecode, length);
3202 ecode += length;
3203
3204 /* Handle multibyte character matching specially here. There is
3205 support for caseless matching if UCP support is present. */
3206
3207 if (length > 1)
3208 {
3209 #ifdef SUPPORT_UCP
3210 unsigned int othercase;
3211 if (op >= OP_STARI && /* Caseless */
3212 (othercase = UCD_OTHERCASE(fc)) != fc)
3213 oclength = PRIV(ord2utf)(othercase, occhars);
3214 else oclength = 0;
3215 #endif /* SUPPORT_UCP */
3216
3217 for (i = 1; i <= min; i++)
3218 {
3219 if (eptr <= md->end_subject - length &&
3220 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3221 #ifdef SUPPORT_UCP
3222 else if (oclength > 0 &&
3223 eptr <= md->end_subject - oclength &&
3224 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3225 #endif /* SUPPORT_UCP */
3226 else
3227 {
3228 CHECK_PARTIAL();
3229 MRRETURN(MATCH_NOMATCH);
3230 }
3231 }
3232
3233 if (min == max) continue;
3234
3235 if (minimize)
3236 {
3237 for (fi = min;; fi++)
3238 {
3239 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3240 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3241 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3242 if (eptr <= md->end_subject - length &&
3243 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3244 #ifdef SUPPORT_UCP
3245 else if (oclength > 0 &&
3246 eptr <= md->end_subject - oclength &&
3247 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3248 #endif /* SUPPORT_UCP */
3249 else
3250 {
3251 CHECK_PARTIAL();
3252 MRRETURN(MATCH_NOMATCH);
3253 }
3254 }
3255 /* Control never gets here */
3256 }
3257
3258 else /* Maximize */
3259 {
3260 pp = eptr;
3261 for (i = min; i < max; i++)
3262 {
3263 if (eptr <= md->end_subject - length &&
3264 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3265 #ifdef SUPPORT_UCP
3266 else if (oclength > 0 &&
3267 eptr <= md->end_subject - oclength &&
3268 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3269 #endif /* SUPPORT_UCP */
3270 else
3271 {
3272 CHECK_PARTIAL();
3273 break;
3274 }
3275 }
3276
3277 if (possessive) continue;
3278
3279 for(;;)
3280 {
3281 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3282 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3283 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3284 #ifdef SUPPORT_UCP
3285 eptr--;
3286 BACKCHAR(eptr);
3287 #else /* without SUPPORT_UCP */
3288 eptr -= length;
3289 #endif /* SUPPORT_UCP */
3290 }
3291 }
3292 /* Control never gets here */
3293 }
3294
3295 /* If the length of a UTF-8 character is 1, we fall through here, and
3296 obey the code as for non-UTF-8 characters below, though in this case the
3297 value of fc will always be < 128. */
3298 }
3299 else
3300 #endif /* SUPPORT_UTF */
3301 /* When not in UTF-8 mode, load a single-byte character. */
3302 fc = *ecode++;
3303
3304 /* The value of fc at this point is always one character, though we may
3305 or may not be in UTF mode. The code is duplicated for the caseless and
3306 caseful cases, for speed, since matching characters is likely to be quite
3307 common. First, ensure the minimum number of matches are present. If min =
3308 max, continue at the same level without recursing. Otherwise, if
3309 minimizing, keep trying the rest of the expression and advancing one
3310 matching character if failing, up to the maximum. Alternatively, if
3311 maximizing, find the maximum number of characters and work backwards. */
3312
3313 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3314 max, eptr));
3315
3316 if (op >= OP_STARI) /* Caseless */
3317 {
3318 #ifdef COMPILE_PCRE8
3319 /* fc must be < 128 */
3320 foc = md->fcc[fc];
3321 #else
3322 #ifdef SUPPORT_UTF
3323 #ifdef SUPPORT_UCP
3324 if (utf && fc > 127)
3325 foc = UCD_OTHERCASE(fc);
3326 #else
3327 if (utf && fc > 127)
3328 foc = fc;
3329 #endif /* SUPPORT_UCP */
3330 else
3331 #endif /* SUPPORT_UTF */
3332 foc = TABLE_GET(fc, md->fcc, fc);
3333 #endif /* COMPILE_PCRE8 */
3334
3335 for (i = 1; i <= min; i++)
3336 {
3337 if (eptr >= md->end_subject)
3338 {
3339 SCHECK_PARTIAL();
3340 MRRETURN(MATCH_NOMATCH);
3341 }
3342 if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
3343 eptr++;
3344 }
3345 if (min == max) continue;
3346 if (minimize)
3347 {
3348 for (fi = min;; fi++)
3349 {
3350 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3351 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3352 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3353 if (eptr >= md->end_subject)
3354 {
3355 SCHECK_PARTIAL();
3356 MRRETURN(MATCH_NOMATCH);
3357 }
3358 if (fc != *eptr && foc != *eptr) MRRETURN(MATCH_NOMATCH);
3359 eptr++;
3360 }
3361 /* Control never gets here */
3362 }
3363 else /* Maximize */
3364 {
3365 pp = eptr;
3366 for (i = min; i < max; i++)
3367 {
3368 if (eptr >= md->end_subject)
3369 {
3370 SCHECK_PARTIAL();
3371 break;
3372 }
3373 if (fc != *eptr && foc != *eptr) break;
3374 eptr++;
3375 }
3376
3377 if (possessive) continue;
3378
3379 while (eptr >= pp)
3380 {
3381 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3382 eptr--;
3383 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3384 }
3385 MRRETURN(MATCH_NOMATCH);
3386 }
3387 /* Control never gets here */
3388 }
3389
3390 /* Caseful comparisons (includes all multi-byte characters) */
3391
3392 else
3393 {
3394 for (i = 1; i <= min; i++)
3395 {
3396 if (eptr >= md->end_subject)
3397 {
3398 SCHECK_PARTIAL();
3399 MRRETURN(MATCH_NOMATCH);
3400 }
3401 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3402 }
3403
3404 if (min == max) continue;
3405
3406 if (minimize)
3407 {
3408 for (fi = min;; fi++)
3409 {
3410 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3411 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3412 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3413 if (eptr >= md->end_subject)
3414 {
3415 SCHECK_PARTIAL();
3416 MRRETURN(MATCH_NOMATCH);
3417 }
3418 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3419 }
3420 /* Control never gets here */
3421 }
3422 else /* Maximize */
3423 {
3424 pp = eptr;
3425 for (i = min; i < max; i++)
3426 {
3427 if (eptr >= md->end_subject)
3428 {
3429 SCHECK_PARTIAL();
3430 break;
3431 }
3432 if (fc != *eptr) break;
3433 eptr++;
3434 }
3435 if (possessive) continue;
3436
3437 while (eptr >= pp)
3438 {
3439 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3440 eptr--;
3441 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3442 }
3443 MRRETURN(MATCH_NOMATCH);
3444 }
3445 }
3446 /* Control never gets here */
3447
3448 /* Match a negated single one-byte character. The character we are
3449 checking can be multibyte. */
3450
3451 case OP_NOT:
3452 case OP_NOTI:
3453 if (eptr >= md->end_subject)
3454 {
3455 SCHECK_PARTIAL();
3456 MRRETURN(MATCH_NOMATCH);
3457 }
3458 ecode++;
3459 GETCHARINCTEST(c, eptr);
3460 if (op == OP_NOTI) /* The caseless case */
3461 {
3462 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
3463 if (c < 256)
3464 #endif
3465 c = md->lcc[c];
3466 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3467 }
3468 else /* Caseful */
3469 {
3470 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3471 }
3472 break;
3473
3474 /* Match a negated single one-byte character repeatedly. This is almost a
3475 repeat of the code for a repeated single character, but I haven't found a
3476 nice way of commoning these up that doesn't require a test of the
3477 positive/negative option for each character match. Maybe that wouldn't add
3478 very much to the time taken, but character matching *is* what this is all
3479 about... */
3480
3481 case OP_NOTEXACT:
3482 case OP_NOTEXACTI:
3483 min = max = GET2(ecode, 1);
3484 ecode += 1 + IMM2_SIZE;
3485 goto REPEATNOTCHAR;
3486
3487 case OP_NOTUPTO:
3488 case OP_NOTUPTOI:
3489 case OP_NOTMINUPTO:
3490 case OP_NOTMINUPTOI:
3491 min = 0;
3492 max = GET2(ecode, 1);
3493 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3494 ecode += 1 + IMM2_SIZE;
3495 goto REPEATNOTCHAR;
3496
3497 case OP_NOTPOSSTAR:
3498 case OP_NOTPOSSTARI:
3499 possessive = TRUE;
3500 min = 0;
3501 max = INT_MAX;
3502 ecode++;
3503 goto REPEATNOTCHAR;
3504
3505 case OP_NOTPOSPLUS:
3506 case OP_NOTPOSPLUSI:
3507 possessive = TRUE;
3508 min = 1;
3509 max = INT_MAX;
3510 ecode++;
3511 goto REPEATNOTCHAR;
3512
3513 case OP_NOTPOSQUERY:
3514 case OP_NOTPOSQUERYI:
3515 possessive = TRUE;
3516 min = 0;
3517 max = 1;
3518 ecode++;
3519 goto REPEATNOTCHAR;
3520
3521 case OP_NOTPOSUPTO:
3522 case OP_NOTPOSUPTOI:
3523 possessive = TRUE;
3524 min = 0;
3525 max = GET2(ecode, 1);
3526 ecode += 1 + IMM2_SIZE;
3527 goto REPEATNOTCHAR;
3528
3529 case OP_NOTSTAR:
3530 case OP_NOTSTARI:
3531 case OP_NOTMINSTAR:
3532 case OP_NOTMINSTARI:
3533 case OP_NOTPLUS:
3534 case OP_NOTPLUSI:
3535 case OP_NOTMINPLUS:
3536 case OP_NOTMINPLUSI:
3537 case OP_NOTQUERY:
3538 case OP_NOTQUERYI:
3539 case OP_NOTMINQUERY:
3540 case OP_NOTMINQUERYI:
3541 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3542 minimize = (c & 1) != 0;
3543 min = rep_min[c]; /* Pick up values from tables; */
3544 max = rep_max[c]; /* zero for max => infinity */
3545 if (max == 0) max = INT_MAX;
3546
3547 /* Common code for all repeated single-byte matches. */
3548
3549 REPEATNOTCHAR:
3550 fc = *ecode++;
3551
3552 /* The code is duplicated for the caseless and caseful cases, for speed,
3553 since matching characters is likely to be quite common. First, ensure the
3554 minimum number of matches are present. If min = max, continue at the same
3555 level without recursing. Otherwise, if minimizing, keep trying the rest of
3556 the expression and advancing one matching character if failing, up to the
3557 maximum. Alternatively, if maximizing, find the maximum number of
3558 characters and work backwards. */
3559
3560 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3561 max, eptr));
3562
3563 if (op >= OP_NOTSTARI) /* Caseless */
3564 {
3565 fc = TABLE_GET(fc, md->lcc, fc);
3566
3567 #ifdef SUPPORT_UTF
3568 if (utf)
3569 {
3570 register unsigned int d;
3571 for (i = 1; i <= min; i++)
3572 {
3573 if (eptr >= md->end_subject)
3574 {
3575 SCHECK_PARTIAL();
3576 MRRETURN(MATCH_NOMATCH);
3577 }
3578 GETCHARINC(d, eptr);
3579 if (d < 256) d = md->lcc[d];
3580 if (fc == d) MRRETURN(MATCH_NOMATCH);
3581 }
3582 }
3583 else
3584 #endif
3585 /* Not UTF mode */
3586 {
3587 for (i = 1; i <= min; i++)
3588 {
3589 if (eptr >= md->end_subject)
3590 {
3591 SCHECK_PARTIAL();
3592 MRRETURN(MATCH_NOMATCH);
3593 }
3594 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3595 }
3596 }
3597
3598 if (min == max) continue;
3599
3600 if (minimize)
3601 {
3602 #ifdef SUPPORT_UTF
3603 if (utf)
3604 {
3605 register unsigned int d;
3606 for (fi = min;; fi++)
3607 {
3608 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3609 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3610 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3611 if (eptr >= md->end_subject)
3612 {
3613 SCHECK_PARTIAL();
3614 MRRETURN(MATCH_NOMATCH);
3615 }
3616 GETCHARINC(d, eptr);
3617 if (d < 256) d = md->lcc[d];
3618 if (fc == d) MRRETURN(MATCH_NOMATCH);
3619 }
3620 }
3621 else
3622 #endif
3623 /* Not UTF mode */
3624 {
3625 for (fi = min;; fi++)
3626 {
3627 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3628 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3629 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3630 if (eptr >= md->end_subject)
3631 {
3632 SCHECK_PARTIAL();
3633 MRRETURN(MATCH_NOMATCH);
3634 }
3635 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3636 }
3637 }
3638 /* Control never gets here */
3639 }
3640
3641 /* Maximize case */
3642
3643 else
3644 {
3645 pp = eptr;
3646
3647 #ifdef SUPPORT_UTF
3648 if (utf)
3649 {
3650 register unsigned int d;
3651 for (i = min; i < max; i++)
3652 {
3653 int len = 1;
3654 if (eptr >= md->end_subject)
3655 {
3656 SCHECK_PARTIAL();
3657 break;
3658 }
3659 GETCHARLEN(d, eptr, len);
3660 if (d < 256) d = md->lcc[d];
3661 if (fc == d) break;
3662 eptr += len;
3663 }
3664 if (possessive) continue;
3665 for(;;)
3666 {
3667 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3668 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3669 if (eptr-- == pp) break; /* Stop if tried at original pos */
3670 BACKCHAR(eptr);
3671 }
3672 }
3673 else
3674 #endif
3675 /* Not UTF mode */
3676 {
3677 for (i = min; i < max; i++)
3678 {
3679 if (eptr >= md->end_subject)
3680 {
3681 SCHECK_PARTIAL();
3682 break;
3683 }
3684 if (fc == md->lcc[*eptr]) break;
3685 eptr++;
3686 }
3687 if (possessive) continue;
3688 while (eptr >= pp)
3689 {
3690 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3691 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3692 eptr--;
3693 }
3694 }
3695
3696 MRRETURN(MATCH_NOMATCH);
3697 }
3698 /* Control never gets here */
3699 }
3700
3701 /* Caseful comparisons */
3702
3703 else
3704 {
3705 #ifdef SUPPORT_UTF
3706 if (utf)
3707 {
3708 register unsigned int d;
3709 for (i = 1; i <= min; i++)
3710 {
3711 if (eptr >= md->end_subject)
3712 {
3713 SCHECK_PARTIAL();
3714 MRRETURN(MATCH_NOMATCH);
3715 }
3716 GETCHARINC(d, eptr);
3717 if (fc == d) MRRETURN(MATCH_NOMATCH);
3718 }
3719 }
3720 else
3721 #endif
3722 /* Not UTF mode */
3723 {
3724 for (i = 1; i <= min; i++)
3725 {
3726 if (eptr >= md->end_subject)
3727 {
3728 SCHECK_PARTIAL();
3729 MRRETURN(MATCH_NOMATCH);
3730 }
3731 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3732 }
3733 }
3734
3735 if (min == max) continue;
3736
3737 if (minimize)
3738 {
3739 #ifdef SUPPORT_UTF
3740 if (utf)
3741 {
3742 register unsigned int d;
3743 for (fi = min;; fi++)
3744 {
3745 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3746 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3747 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3748 if (eptr >= md->end_subject)
3749 {
3750 SCHECK_PARTIAL();
3751 MRRETURN(MATCH_NOMATCH);
3752 }
3753 GETCHARINC(d, eptr);
3754 if (fc == d) MRRETURN(MATCH_NOMATCH);
3755 }
3756 }
3757 else
3758 #endif
3759 /* Not UTF mode */
3760 {
3761 for (fi = min;; fi++)
3762 {
3763 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3764 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3765 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3766 if (eptr >= md->end_subject)
3767 {
3768 SCHECK_PARTIAL();
3769 MRRETURN(MATCH_NOMATCH);
3770 }
3771 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3772 }
3773 }
3774 /* Control never gets here */
3775 }
3776
3777 /* Maximize case */
3778
3779 else
3780 {
3781 pp = eptr;
3782
3783 #ifdef SUPPORT_UTF
3784 if (utf)
3785 {
3786 register unsigned int d;
3787 for (i = min; i < max; i++)
3788 {
3789 int len = 1;
3790 if (eptr >= md->end_subject)
3791 {
3792 SCHECK_PARTIAL();
3793 break;
3794 }
3795 GETCHARLEN(d, eptr, len);
3796 if (fc == d) break;
3797 eptr += len;
3798 }
3799 if (possessive) continue;
3800 for(;;)
3801 {
3802 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3804 if (eptr-- == pp) break; /* Stop if tried at original pos */
3805 BACKCHAR(eptr);
3806 }
3807 }
3808 else
3809 #endif
3810 /* Not UTF mode */
3811 {
3812 for (i = min; i < max; i++)
3813 {
3814 if (eptr >= md->end_subject)
3815 {
3816 SCHECK_PARTIAL();
3817 break;
3818 }
3819 if (fc == *eptr) break;
3820 eptr++;
3821 }
3822 if (possessive) continue;
3823 while (eptr >= pp)
3824 {
3825 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3827 eptr--;
3828 }
3829 }
3830
3831 MRRETURN(MATCH_NOMATCH);
3832 }
3833 }
3834 /* Control never gets here */
3835
3836 /* Match a single character type repeatedly; several different opcodes
3837 share code. This is very similar to the code for single characters, but we
3838 repeat it in the interests of efficiency. */
3839
3840 case OP_TYPEEXACT:
3841 min = max = GET2(ecode, 1);
3842 minimize = TRUE;
3843 ecode += 1 + IMM2_SIZE;
3844 goto REPEATTYPE;
3845
3846 case OP_TYPEUPTO:
3847 case OP_TYPEMINUPTO:
3848 min = 0;
3849 max = GET2(ecode, 1);
3850 minimize = *ecode == OP_TYPEMINUPTO;
3851 ecode += 1 + IMM2_SIZE;
3852 goto REPEATTYPE;
3853
3854 case OP_TYPEPOSSTAR:
3855 possessive = TRUE;
3856 min = 0;
3857 max = INT_MAX;
3858 ecode++;
3859 goto REPEATTYPE;
3860
3861 case OP_TYPEPOSPLUS:
3862 possessive = TRUE;
3863 min = 1;
3864 max = INT_MAX;
3865 ecode++;
3866 goto REPEATTYPE;
3867
3868 case OP_TYPEPOSQUERY:
3869 possessive = TRUE;
3870 min = 0;
3871 max = 1;
3872 ecode++;
3873 goto REPEATTYPE;
3874
3875 case OP_TYPEPOSUPTO:
3876 possessive = TRUE;
3877 min = 0;
3878 max = GET2(ecode, 1);
3879 ecode += 1 + IMM2_SIZE;
3880 goto REPEATTYPE;
3881
3882 case OP_TYPESTAR:
3883 case OP_TYPEMINSTAR:
3884 case OP_TYPEPLUS:
3885 case OP_TYPEMINPLUS:
3886 case OP_TYPEQUERY:
3887 case OP_TYPEMINQUERY:
3888 c = *ecode++ - OP_TYPESTAR;
3889 minimize = (c & 1) != 0;
3890 min = rep_min[c]; /* Pick up values from tables; */
3891 max = rep_max[c]; /* zero for max => infinity */
3892 if (max == 0) max = INT_MAX;
3893
3894 /* Common code for all repeated single character type matches. Note that
3895 in UTF-8 mode, '.' matches a character of any length, but for the other
3896 character types, the valid characters are all one-byte long. */
3897
3898 REPEATTYPE:
3899 ctype = *ecode++; /* Code for the character type */
3900
3901 #ifdef SUPPORT_UCP
3902 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3903 {
3904 prop_fail_result = ctype == OP_NOTPROP;
3905 prop_type = *ecode++;
3906 prop_value = *ecode++;
3907 }
3908 else prop_type = -1;
3909 #endif
3910
3911 /* First, ensure the minimum number of matches are present. Use inline
3912 code for maximizing the speed, and do the type test once at the start
3913 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3914 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3915 and single-bytes. */
3916
3917 if (min > 0)
3918 {
3919 #ifdef SUPPORT_UCP
3920 if (prop_type >= 0)
3921 {
3922 switch(prop_type)
3923 {
3924 case PT_ANY:
3925 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3926 for (i = 1; i <= min; i++)
3927 {
3928 if (eptr >= md->end_subject)
3929 {
3930 SCHECK_PARTIAL();
3931 MRRETURN(MATCH_NOMATCH);
3932 }
3933 GETCHARINCTEST(c, eptr);
3934 }
3935 break;
3936
3937 case PT_LAMP:
3938 for (i = 1; i <= min; i++)
3939 {
3940 int chartype;
3941 if (eptr >= md->end_subject)
3942 {
3943 SCHECK_PARTIAL();
3944 MRRETURN(MATCH_NOMATCH);
3945 }
3946 GETCHARINCTEST(c, eptr);
3947 chartype = UCD_CHARTYPE(c);
3948 if ((chartype == ucp_Lu ||
3949 chartype == ucp_Ll ||
3950 chartype == ucp_Lt) == prop_fail_result)
3951 MRRETURN(MATCH_NOMATCH);
3952 }
3953 break;
3954
3955 case PT_GC:
3956 for (i = 1; i <= min; i++)
3957 {
3958 if (eptr >= md->end_subject)
3959 {
3960 SCHECK_PARTIAL();
3961 MRRETURN(MATCH_NOMATCH);
3962 }
3963 GETCHARINCTEST(c, eptr);
3964 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3965 MRRETURN(MATCH_NOMATCH);
3966 }
3967 break;
3968
3969 case PT_PC:
3970 for (i = 1; i <= min; i++)
3971 {
3972 if (eptr >= md->end_subject)
3973 {
3974 SCHECK_PARTIAL();
3975 MRRETURN(MATCH_NOMATCH);
3976 }
3977 GETCHARINCTEST(c, eptr);
3978 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
3979 MRRETURN(MATCH_NOMATCH);
3980 }
3981 break;
3982
3983 case PT_SC:
3984 for (i = 1; i <= min; i++)
3985 {
3986 if (eptr >= md->end_subject)
3987 {
3988 SCHECK_PARTIAL();
3989 MRRETURN(MATCH_NOMATCH);
3990 }
3991 GETCHARINCTEST(c, eptr);
3992 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
3993 MRRETURN(MATCH_NOMATCH);
3994 }
3995 break;
3996
3997 case PT_ALNUM:
3998 for (i = 1; i <= min; i++)
3999 {
4000 int category;
4001 if (eptr >= md->end_subject)
4002 {
4003 SCHECK_PARTIAL();
4004 MRRETURN(MATCH_NOMATCH);
4005 }
4006 GETCHARINCTEST(c, eptr);
4007 category = UCD_CATEGORY(c);
4008 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4009 MRRETURN(MATCH_NOMATCH);
4010 }
4011 break;
4012
4013 case PT_SPACE: /* Perl space */
4014 for (i = 1; i <= min; i++)
4015 {
4016 if (eptr >= md->end_subject)
4017 {
4018 SCHECK_PARTIAL();
4019 MRRETURN(MATCH_NOMATCH);
4020 }
4021 GETCHARINCTEST(c, eptr);
4022 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4023 c == CHAR_FF || c == CHAR_CR)
4024 == prop_fail_result)
4025 MRRETURN(MATCH_NOMATCH);
4026 }
4027 break;
4028
4029 case PT_PXSPACE: /* POSIX space */
4030 for (i = 1; i <= min; i++)
4031 {
4032 if (eptr >= md->end_subject)
4033 {
4034 SCHECK_PARTIAL();
4035 MRRETURN(MATCH_NOMATCH);
4036 }
4037 GETCHARINCTEST(c, eptr);
4038 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4039 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4040 == prop_fail_result)
4041 MRRETURN(MATCH_NOMATCH);
4042 }
4043 break;
4044
4045 case PT_WORD:
4046 for (i = 1; i <= min; i++)
4047 {
4048 int category;
4049 if (eptr >= md->end_subject)
4050 {
4051 SCHECK_PARTIAL();
4052 MRRETURN(MATCH_NOMATCH);
4053 }
4054 GETCHARINCTEST(c, eptr);
4055 category = UCD_CATEGORY(c);
4056 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4057 == prop_fail_result)
4058 MRRETURN(MATCH_NOMATCH);
4059 }
4060 break;
4061
4062 /* This should not occur */
4063
4064 default:
4065 RRETURN(PCRE_ERROR_INTERNAL);
4066 }
4067 }
4068
4069 /* Match extended Unicode sequences. We will get here only if the
4070 support is in the binary; otherwise a compile-time error occurs. */
4071
4072 else if (ctype == OP_EXTUNI)
4073 {
4074 for (i = 1; i <= min; i++)
4075 {
4076 if (eptr >= md->end_subject)
4077 {
4078 SCHECK_PARTIAL();
4079 MRRETURN(MATCH_NOMATCH);
4080 }
4081 GETCHARINCTEST(c, eptr);
4082 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4083 while (eptr < md->end_subject)
4084 {
4085 int len = 1;
4086 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4087 if (UCD_CATEGORY(c) != ucp_M) break;
4088 eptr += len;
4089 }
4090 }
4091 }
4092
4093 else
4094 #endif /* SUPPORT_UCP */
4095
4096 /* Handle all other cases when the coding is UTF-8 */
4097
4098 #ifdef SUPPORT_UTF
4099 if (utf) switch(ctype)
4100 {
4101 case OP_ANY:
4102 for (i = 1; i <= min; i++)
4103 {
4104 if (eptr >= md->end_subject)
4105 {
4106 SCHECK_PARTIAL();
4107 MRRETURN(MATCH_NOMATCH);
4108 }
4109 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4110 eptr++;
4111 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4112 }
4113 break;
4114
4115 case OP_ALLANY:
4116 for (i = 1; i <= min; i++)
4117 {
4118 if (eptr >= md->end_subject)
4119 {
4120 SCHECK_PARTIAL();
4121 MRRETURN(MATCH_NOMATCH);
4122 }
4123 eptr++;
4124 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4125 }
4126 break;
4127
4128 case OP_ANYBYTE:
4129 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4130 eptr += min;
4131 break;
4132
4133 case OP_ANYNL:
4134 for (i = 1; i <= min; i++)
4135 {
4136 if (eptr >= md->end_subject)
4137 {
4138 SCHECK_PARTIAL();
4139 MRRETURN(MATCH_NOMATCH);
4140 }
4141 GETCHARINC(c, eptr);
4142 switch(c)
4143 {
4144 default: MRRETURN(MATCH_NOMATCH);
4145
4146 case 0x000d:
4147 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4148 break;
4149
4150 case 0x000a:
4151 break;
4152
4153 case 0x000b:
4154 case 0x000c:
4155 case 0x0085:
4156 case 0x2028:
4157 case 0x2029:
4158 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4159 break;
4160 }
4161 }
4162 break;
4163
4164 case OP_NOT_HSPACE:
4165 for (i = 1; i <= min; i++)
4166 {
4167 if (eptr >= md->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 MRRETURN(MATCH_NOMATCH);
4171 }
4172 GETCHARINC(c, eptr);
4173 switch(c)
4174 {
4175 default: break;
4176 case 0x09: /* HT */
4177 case 0x20: /* SPACE */
4178 case 0xa0: /* NBSP */
4179 case 0x1680: /* OGHAM SPACE MARK */
4180 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4181 case 0x2000: /* EN QUAD */
4182 case 0x2001: /* EM QUAD */
4183 case 0x2002: /* EN SPACE */
4184 case 0x2003: /* EM SPACE */
4185 case 0x2004: /* THREE-PER-EM SPACE */
4186 case 0x2005: /* FOUR-PER-EM SPACE */
4187 case 0x2006: /* SIX-PER-EM SPACE */
4188 case 0x2007: /* FIGURE SPACE */
4189 case 0x2008: /* PUNCTUATION SPACE */
4190 case 0x2009: /* THIN SPACE */
4191 case 0x200A: /* HAIR SPACE */
4192 case 0x202f: /* NARROW NO-BREAK SPACE */
4193 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4194 case 0x3000: /* IDEOGRAPHIC SPACE */
4195 MRRETURN(MATCH_NOMATCH);
4196 }
4197 }
4198 break;
4199
4200 case OP_HSPACE:
4201 for (i = 1; i <= min; i++)
4202 {
4203 if (eptr >= md->end_subject)
4204 {
4205 SCHECK_PARTIAL();
4206 MRRETURN(MATCH_NOMATCH);
4207 }
4208 GETCHARINC(c, eptr);
4209 switch(c)
4210 {
4211 default: MRRETURN(MATCH_NOMATCH);
4212 case 0x09: /* HT */
4213 case 0x20: /* SPACE */
4214 case 0xa0: /* NBSP */
4215 case 0x1680: /* OGHAM SPACE MARK */
4216 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4217 case 0x2000: /* EN QUAD */
4218 case 0x2001: /* EM QUAD */
4219 case 0x2002: /* EN SPACE */
4220 case 0x2003: /* EM SPACE */
4221 case 0x2004: /* THREE-PER-EM SPACE */
4222 case 0x2005: /* FOUR-PER-EM SPACE */
4223 case 0x2006: /* SIX-PER-EM SPACE */
4224 case 0x2007: /* FIGURE SPACE */
4225 case 0x2008: /* PUNCTUATION SPACE */
4226 case 0x2009: /* THIN SPACE */
4227 case 0x200A: /* HAIR SPACE */
4228 case 0x202f: /* NARROW NO-BREAK SPACE */
4229 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4230 case 0x3000: /* IDEOGRAPHIC SPACE */
4231 break;
4232 }
4233 }
4234 break;
4235
4236 case OP_NOT_VSPACE:
4237 for (i = 1; i <= min; i++)
4238 {
4239 if (eptr >= md->end_subject)
4240 {
4241 SCHECK_PARTIAL();
4242 MRRETURN(MATCH_NOMATCH);
4243 }
4244 GETCHARINC(c, eptr);
4245 switch(c)
4246 {
4247 default: break;
4248 case 0x0a: /* LF */
4249 case 0x0b: /* VT */
4250 case 0x0c: /* FF */
4251 case 0x0d: /* CR */
4252 case 0x85: /* NEL */
4253 case 0x2028: /* LINE SEPARATOR */
4254 case 0x2029: /* PARAGRAPH SEPARATOR */
4255 MRRETURN(MATCH_NOMATCH);
4256 }
4257 }
4258 break;
4259
4260 case OP_VSPACE:
4261 for (i = 1; i <= min; i++)
4262 {
4263 if (eptr >= md->end_subject)
4264 {
4265 SCHECK_PARTIAL();
4266 MRRETURN(MATCH_NOMATCH);
4267 }
4268 GETCHARINC(c, eptr);
4269 switch(c)
4270 {
4271 default: MRRETURN(MATCH_NOMATCH);
4272 case 0x0a: /* LF */
4273 case 0x0b: /* VT */
4274 case 0x0c: /* FF */
4275 case 0x0d: /* CR */
4276 case 0x85: /* NEL */
4277 case 0x2028: /* LINE SEPARATOR */
4278 case 0x2029: /* PARAGRAPH SEPARATOR */
4279 break;
4280 }
4281 }
4282 break;
4283
4284 case OP_NOT_DIGIT:
4285 for (i = 1; i <= min; i++)
4286 {
4287 if (eptr >= md->end_subject)
4288 {
4289 SCHECK_PARTIAL();
4290 MRRETURN(MATCH_NOMATCH);
4291 }
4292 GETCHARINC(c, eptr);
4293 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4294 MRRETURN(MATCH_NOMATCH);
4295 }
4296 break;
4297
4298 case OP_DIGIT:
4299 for (i = 1; i <= min; i++)
4300 {
4301 if (eptr >= md->end_subject)
4302 {
4303 SCHECK_PARTIAL();
4304 MRRETURN(MATCH_NOMATCH);
4305 }
4306 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4307 MRRETURN(MATCH_NOMATCH);
4308 /* No need to skip more bytes - we know it's a 1-byte character */
4309 }
4310 break;
4311
4312 case OP_NOT_WHITESPACE:
4313 for (i = 1; i <= min; i++)
4314 {
4315 if (eptr >= md->end_subject)
4316 {
4317 SCHECK_PARTIAL();
4318 MRRETURN(MATCH_NOMATCH);
4319 }
4320 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4321 MRRETURN(MATCH_NOMATCH);
4322 eptr++;
4323 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4324 }
4325 break;
4326
4327 case OP_WHITESPACE:
4328 for (i = 1; i <= min; i++)
4329 {
4330 if (eptr >= md->end_subject)
4331 {
4332 SCHECK_PARTIAL();
4333 MRRETURN(MATCH_NOMATCH);
4334 }
4335 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4336 MRRETURN(MATCH_NOMATCH);
4337 /* No need to skip more bytes - we know it's a 1-byte character */
4338 }
4339 break;
4340
4341 case OP_NOT_WORDCHAR:
4342 for (i = 1; i <= min; i++)
4343 {
4344 if (eptr >= md->end_subject)
4345 {
4346 SCHECK_PARTIAL();
4347 MRRETURN(MATCH_NOMATCH);
4348 }
4349 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4350 MRRETURN(MATCH_NOMATCH);
4351 eptr++;
4352 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4353 }
4354 break;
4355
4356 case OP_WORDCHAR:
4357 for (i = 1; i <= min; i++)
4358 {
4359 if (eptr >= md->end_subject)
4360 {
4361 SCHECK_PARTIAL();
4362 MRRETURN(MATCH_NOMATCH);
4363 }
4364 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4365 MRRETURN(MATCH_NOMATCH);
4366 /* No need to skip more bytes - we know it's a 1-byte character */
4367 }
4368 break;
4369
4370 default:
4371 RRETURN(PCRE_ERROR_INTERNAL);
4372 } /* End switch(ctype) */
4373
4374 else
4375 #endif /* SUPPORT_UTF */
4376
4377 /* Code for the non-UTF-8 case for minimum matching of operators other
4378 than OP_PROP and OP_NOTPROP. */
4379
4380 switch(ctype)
4381 {
4382 case OP_ANY:
4383 for (i = 1; i <= min; i++)
4384 {
4385 if (eptr >= md->end_subject)
4386 {
4387 SCHECK_PARTIAL();
4388 MRRETURN(MATCH_NOMATCH);
4389 }
4390 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4391 eptr++;
4392 }
4393 break;
4394
4395 case OP_ALLANY:
4396 if (eptr > md->end_subject - min)
4397 {
4398 SCHECK_PARTIAL();
4399 MRRETURN(MATCH_NOMATCH);
4400 }
4401 eptr += min;
4402 break;
4403
4404 case OP_ANYBYTE:
4405 if (eptr > md->end_subject - min)
4406 {
4407 SCHECK_PARTIAL();
4408 MRRETURN(MATCH_NOMATCH);
4409 }
4410 eptr += min;
4411 break;
4412
4413 case OP_ANYNL:
4414 for (i = 1; i <= min; i++)
4415 {
4416 if (eptr >= md->end_subject)
4417 {
4418 SCHECK_PARTIAL();
4419 MRRETURN(MATCH_NOMATCH);
4420 }
4421 switch(*eptr++)
4422 {
4423 default: MRRETURN(MATCH_NOMATCH);
4424
4425 case 0x000d:
4426 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4427 break;
4428
4429 case 0x000a:
4430 break;
4431
4432 case 0x000b:
4433 case 0x000c:
4434 case 0x0085:
4435 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4436 break;
4437 }
4438 }
4439 break;
4440
4441 case OP_NOT_HSPACE:
4442 for (i = 1; i <= min; i++)
4443 {
4444 if (eptr >= md->end_subject)
4445 {
4446 SCHECK_PARTIAL();
4447 MRRETURN(MATCH_NOMATCH);
4448 }
4449 switch(*eptr++)
4450 {
4451 default: break;
4452 case 0x09: /* HT */
4453 case 0x20: /* SPACE */
4454 case 0xa0: /* NBSP */
4455 MRRETURN(MATCH_NOMATCH);
4456 }
4457 }
4458 break;
4459
4460 case OP_HSPACE:
4461 for (i = 1; i <= min; i++)
4462 {
4463 if (eptr >= md->end_subject)
4464 {
4465 SCHECK_PARTIAL();
4466 MRRETURN(MATCH_NOMATCH);
4467 }
4468 switch(*eptr++)
4469 {
4470 default: MRRETURN(MATCH_NOMATCH);
4471 case 0x09: /* HT */
4472 case 0x20: /* SPACE */
4473 case 0xa0: /* NBSP */
4474 break;
4475 }
4476 }
4477 break;
4478
4479 case OP_NOT_VSPACE:
4480 for (i = 1; i <= min; i++)
4481 {
4482 if (eptr >= md->end_subject)
4483 {
4484 SCHECK_PARTIAL();
4485 MRRETURN(MATCH_NOMATCH);
4486 }
4487 switch(*eptr++)
4488 {
4489 default: break;
4490 case 0x0a: /* LF */
4491 case 0x0b: /* VT */
4492 case 0x0c: /* FF */
4493 case 0x0d: /* CR */
4494 case 0x85: /* NEL */
4495 MRRETURN(MATCH_NOMATCH);
4496 }
4497 }
4498 break;
4499
4500 case OP_VSPACE:
4501 for (i = 1; i <= min; i++)
4502 {
4503 if (eptr >= md->end_subject)
4504 {
4505 SCHECK_PARTIAL();
4506 MRRETURN(MATCH_NOMATCH);
4507 }
4508 switch(*eptr++)
4509 {
4510 default: MRRETURN(MATCH_NOMATCH);
4511 case 0x0a: /* LF */
4512 case 0x0b: /* VT */
4513 case 0x0c: /* FF */
4514 case 0x0d: /* CR */
4515 case 0x85: /* NEL */
4516 break;
4517 }
4518 }
4519 break;
4520
4521 case OP_NOT_DIGIT:
4522 for (i = 1; i <= min; i++)
4523 {
4524 if (eptr >= md->end_subject)
4525 {
4526 SCHECK_PARTIAL();
4527 MRRETURN(MATCH_NOMATCH);
4528 }
4529 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4530 }
4531 break;
4532
4533 case OP_DIGIT:
4534 for (i = 1; i <= min; i++)
4535 {
4536 if (eptr >= md->end_subject)
4537 {
4538 SCHECK_PARTIAL();
4539 MRRETURN(MATCH_NOMATCH);
4540 }
4541 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4542 }
4543 break;
4544
4545 case OP_NOT_WHITESPACE:
4546 for (i = 1; i <= min; i++)
4547 {
4548 if (eptr >= md->end_subject)
4549 {
4550 SCHECK_PARTIAL();
4551 MRRETURN(MATCH_NOMATCH);
4552 }
4553 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4554 }
4555 break;
4556
4557 case OP_WHITESPACE:
4558 for (i = 1; i <= min; i++)
4559 {
4560 if (eptr >= md->end_subject)
4561 {
4562 SCHECK_PARTIAL();
4563 MRRETURN(MATCH_NOMATCH);
4564 }
4565 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4566 }
4567 break;
4568
4569 case OP_NOT_WORDCHAR:
4570 for (i = 1; i <= min; i++)
4571 {
4572 if (eptr >= md->end_subject)
4573 {
4574 SCHECK_PARTIAL();
4575 MRRETURN(MATCH_NOMATCH);
4576 }
4577 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4578 MRRETURN(MATCH_NOMATCH);
4579 }
4580 break;
4581
4582 case OP_WORDCHAR:
4583 for (i = 1; i <= min; i++)
4584 {
4585 if (eptr >= md->end_subject)
4586 {
4587 SCHECK_PARTIAL();
4588 MRRETURN(MATCH_NOMATCH);
4589 }
4590 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4591 MRRETURN(MATCH_NOMATCH);
4592 }
4593 break;
4594
4595 default:
4596 RRETURN(PCRE_ERROR_INTERNAL);
4597 }
4598 }
4599
4600 /* If min = max, continue at the same level without recursing */
4601
4602 if (min == max) continue;
4603
4604 /* If minimizing, we have to test the rest of the pattern before each
4605 subsequent match. Again, separate the UTF case for speed, and also
4606 separate the UCP cases. */
4607
4608 if (minimize)
4609 {
4610 #ifdef SUPPORT_UCP
4611 if (prop_type >= 0)
4612 {
4613 switch(prop_type)
4614 {
4615 case PT_ANY:
4616 for (fi = min;; fi++)
4617 {
4618 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4620 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4621 if (eptr >= md->end_subject)
4622 {
4623 SCHECK_PARTIAL();
4624 MRRETURN(MATCH_NOMATCH);
4625 }
4626 GETCHARINCTEST(c, eptr);
4627 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4628 }
4629 /* Control never gets here */
4630
4631 case PT_LAMP:
4632 for (fi = min;; fi++)
4633 {
4634 int chartype;
4635 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4637 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4638 if (eptr >= md->end_subject)
4639 {
4640 SCHECK_PARTIAL();
4641 MRRETURN(MATCH_NOMATCH);
4642 }
4643 GETCHARINCTEST(c, eptr);
4644 chartype = UCD_CHARTYPE(c);
4645 if ((chartype == ucp_Lu ||
4646 chartype == ucp_Ll ||
4647 chartype == ucp_Lt) == prop_fail_result)
4648 MRRETURN(MATCH_NOMATCH);
4649 }
4650 /* Control never gets here */
4651
4652 case PT_GC:
4653 for (fi = min;; fi++)
4654 {
4655 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4656 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4657 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4658 if (eptr >= md->end_subject)
4659 {
4660 SCHECK_PARTIAL();
4661 MRRETURN(MATCH_NOMATCH);
4662 }
4663 GETCHARINCTEST(c, eptr);
4664 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4665 MRRETURN(MATCH_NOMATCH);
4666 }
4667 /* Control never gets here */
4668
4669 case PT_PC:
4670 for (fi = min;; fi++)
4671 {
4672 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4673 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4674 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4675 if (eptr >= md->end_subject)
4676 {
4677 SCHECK_PARTIAL();
4678 MRRETURN(MATCH_NOMATCH);
4679 }
4680 GETCHARINCTEST(c, eptr);
4681 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4682 MRRETURN(MATCH_NOMATCH);
4683 }
4684 /* Control never gets here */
4685
4686 case PT_SC:
4687 for (fi = min;; fi++)
4688 {
4689 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4690 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4691 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4692 if (eptr >= md->end_subject)
4693 {
4694 SCHECK_PARTIAL();
4695 MRRETURN(MATCH_NOMATCH);
4696 }
4697 GETCHARINCTEST(c, eptr);
4698 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4699 MRRETURN(MATCH_NOMATCH);
4700 }
4701 /* Control never gets here */
4702
4703 case PT_ALNUM:
4704 for (fi = min;; fi++)
4705 {
4706 int category;
4707 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4709 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4710 if (eptr >= md->end_subject)
4711 {
4712 SCHECK_PARTIAL();
4713 MRRETURN(MATCH_NOMATCH);
4714 }
4715 GETCHARINCTEST(c, eptr);
4716 category = UCD_CATEGORY(c);
4717 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4718 MRRETURN(MATCH_NOMATCH);
4719 }
4720 /* Control never gets here */
4721
4722 case PT_SPACE: /* Perl space */
4723 for (fi = min;; fi++)
4724 {
4725 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4726 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4727 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4728 if (eptr >= md->end_subject)
4729 {
4730 SCHECK_PARTIAL();
4731 MRRETURN(MATCH_NOMATCH);
4732 }
4733 GETCHARINCTEST(c, eptr);
4734 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4735 c == CHAR_FF || c == CHAR_CR)
4736 == prop_fail_result)
4737 MRRETURN(MATCH_NOMATCH);
4738 }
4739 /* Control never gets here */
4740
4741 case PT_PXSPACE: /* POSIX space */
4742 for (fi = min;; fi++)
4743 {
4744 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4745 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4746 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4747 if (eptr >= md->end_subject)
4748 {
4749 SCHECK_PARTIAL();
4750 MRRETURN(MATCH_NOMATCH);
4751 }
4752 GETCHARINCTEST(c, eptr);
4753 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4754 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4755 == prop_fail_result)
4756 MRRETURN(MATCH_NOMATCH);
4757 }
4758 /* Control never gets here */
4759
4760 case PT_WORD:
4761 for (fi = min;; fi++)
4762 {
4763 int category;
4764 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4765 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4766 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4767 if (eptr >= md->end_subject)
4768 {
4769 SCHECK_PARTIAL();
4770 MRRETURN(MATCH_NOMATCH);
4771 }
4772 GETCHARINCTEST(c, eptr);
4773 category = UCD_CATEGORY(c);
4774 if ((category == ucp_L ||
4775 category == ucp_N ||
4776 c == CHAR_UNDERSCORE)
4777 == prop_fail_result)
4778 MRRETURN(MATCH_NOMATCH);
4779 }
4780 /* Control never gets here */
4781
4782 /* This should never occur */
4783
4784 default:
4785 RRETURN(PCRE_ERROR_INTERNAL);
4786 }
4787 }
4788
4789 /* Match extended Unicode sequences. We will get here only if the
4790 support is in the binary; otherwise a compile-time error occurs. */
4791
4792 else if (ctype == OP_EXTUNI)
4793 {
4794 for (fi = min;; fi++)
4795 {
4796 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4798 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4799 if (eptr >= md->end_subject)
4800 {
4801 SCHECK_PARTIAL();
4802 MRRETURN(MATCH_NOMATCH);
4803 }
4804 GETCHARINCTEST(c, eptr);
4805 if (UCD_CATEGORY(c) == ucp_M) MRRETURN(MATCH_NOMATCH);
4806 while (eptr < md->end_subject)
4807 {
4808 int len = 1;
4809 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4810 if (UCD_CATEGORY(c) != ucp_M) break;
4811 eptr += len;
4812 }
4813 }
4814 }
4815 else
4816 #endif /* SUPPORT_UCP */
4817
4818 #ifdef SUPPORT_UTF
4819 if (utf)
4820 {
4821 for (fi = min;; fi++)
4822 {
4823 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4825 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4826 if (eptr >= md->end_subject)
4827 {
4828 SCHECK_PARTIAL();
4829 MRRETURN(MATCH_NOMATCH);
4830 }
4831 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4832 MRRETURN(MATCH_NOMATCH);
4833 GETCHARINC(c, eptr);
4834 switch(ctype)
4835 {
4836 case OP_ANY: /* This is the non-NL case */
4837 case OP_ALLANY:
4838 case OP_ANYBYTE:
4839 break;
4840
4841 case OP_ANYNL:
4842 switch(c)
4843 {
4844 default: MRRETURN(MATCH_NOMATCH);
4845 case 0x000d:
4846 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4847 break;
4848 case 0x000a:
4849 break;
4850
4851 case 0x000b:
4852 case 0x000c:
4853 case 0x0085:
4854 case 0x2028:
4855 case 0x2029:
4856 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4857 break;
4858 }
4859 break;
4860
4861 case OP_NOT_HSPACE:
4862 switch(c)
4863 {
4864 default: break;
4865 case 0x09: /* HT */
4866 case 0x20: /* SPACE */
4867 case 0xa0: /* NBSP */
4868 case 0x1680: /* OGHAM SPACE MARK */
4869 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4870 case 0x2000: /* EN QUAD */
4871 case 0x2001: /* EM QUAD */
4872 case 0x2002: /* EN SPACE */
4873 case 0x2003: /* EM SPACE */
4874 case 0x2004: /* THREE-PER-EM SPACE */
4875 case 0x2005: /* FOUR-PER-EM SPACE */
4876 case 0x2006: /* SIX-PER-EM SPACE */
4877 case 0x2007: /* FIGURE SPACE */
4878 case 0x2008: /* PUNCTUATION SPACE */
4879 case 0x2009: /* THIN SPACE */
4880 case 0x200A: /* HAIR SPACE */
4881 case 0x202f: /* NARROW NO-BREAK SPACE */
4882 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4883 case 0x3000: /* IDEOGRAPHIC SPACE */
4884 MRRETURN(MATCH_NOMATCH);
4885 }
4886 break;
4887
4888 case OP_HSPACE:
4889 switch(c)
4890 {
4891 default: MRRETURN(MATCH_NOMATCH);
4892 case 0x09: /* HT */
4893 case 0x20: /* SPACE */
4894 case 0xa0: /* NBSP */
4895 case 0x1680: /* OGHAM SPACE MARK */
4896 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4897 case 0x2000: /* EN QUAD */
4898 case 0x2001: /* EM QUAD */
4899 case 0x2002: /* EN SPACE */
4900 case 0x2003: /* EM SPACE */
4901 case 0x2004: /* THREE-PER-EM SPACE */
4902 case 0x2005: /* FOUR-PER-EM SPACE */
4903 case 0x2006: /* SIX-PER-EM SPACE */
4904 case 0x2007: /* FIGURE SPACE */
4905 case 0x2008: /* PUNCTUATION SPACE */
4906 case 0x2009: /* THIN SPACE */
4907 case 0x200A: /* HAIR SPACE */
4908 case 0x202f: /* NARROW NO-BREAK SPACE */
4909 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4910 case 0x3000: /* IDEOGRAPHIC SPACE */
4911 break;
4912 }
4913 break;
4914
4915 case OP_NOT_VSPACE:
4916 switch(c)
4917 {
4918 default: break;
4919 case 0x0a: /* LF */
4920 case 0x0b: /* VT */
4921 case 0x0c: /* FF */
4922 case 0x0d: /* CR */
4923 case 0x85: /* NEL */
4924 case 0x2028: /* LINE SEPARATOR */
4925 case 0x2029: /* PARAGRAPH SEPARATOR */
4926 MRRETURN(MATCH_NOMATCH);
4927 }
4928 break;
4929
4930 case OP_VSPACE:
4931 switch(c)
4932 {
4933 default: MRRETURN(MATCH_NOMATCH);
4934 case 0x0a: /* LF */
4935 case 0x0b: /* VT */
4936 case 0x0c: /* FF */
4937 case 0x0d: /* CR */
4938 case 0x85: /* NEL */
4939 case 0x2028: /* LINE SEPARATOR */
4940 case 0x2029: /* PARAGRAPH SEPARATOR */
4941 break;
4942 }
4943 break;
4944
4945 case OP_NOT_DIGIT:
4946 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4947 MRRETURN(MATCH_NOMATCH);
4948 break;
4949
4950 case OP_DIGIT:
4951 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4952 MRRETURN(MATCH_NOMATCH);
4953 break;
4954
4955 case OP_NOT_WHITESPACE:
4956 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4957 MRRETURN(MATCH_NOMATCH);
4958 break;
4959
4960 case OP_WHITESPACE:
4961 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4962 MRRETURN(MATCH_NOMATCH);
4963 break;
4964
4965 case OP_NOT_WORDCHAR:
4966 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4967 MRRETURN(MATCH_NOMATCH);
4968 break;
4969
4970 case OP_WORDCHAR:
4971 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4972 MRRETURN(MATCH_NOMATCH);
4973 break;
4974
4975 default:
4976 RRETURN(PCRE_ERROR_INTERNAL);
4977 }
4978 }
4979 }
4980 else
4981 #endif
4982 /* Not UTF mode */
4983 {
4984 for (fi = min;; fi++)
4985 {
4986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4988 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4989 if (eptr >= md->end_subject)
4990 {
4991 SCHECK_PARTIAL();
4992 MRRETURN(MATCH_NOMATCH);
4993 }
4994 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4995 MRRETURN(MATCH_NOMATCH);
4996 c = *eptr++;
4997 switch(ctype)
4998 {
4999 case OP_ANY: /* This is the non-NL case */
5000 case OP_ALLANY:
5001 case OP_ANYBYTE:
5002 break;
5003
5004 case OP_ANYNL:
5005 switch(c)
5006 {
5007 default: MRRETURN(MATCH_NOMATCH);
5008 case 0x000d:
5009 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5010 break;
5011
5012 case 0x000a:
5013 break;
5014
5015 case 0x000b:
5016 case 0x000c:
5017 case 0x0085:
5018 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
5019 break;
5020 }
5021 break;
5022
5023 case OP_NOT_HSPACE:
5024 switch(c)
5025 {
5026 default: break;
5027 case 0x09: /* HT */
5028 case 0x20: /* SPACE */
5029 case 0xa0: /* NBSP */
5030 MRRETURN(MATCH_NOMATCH);
5031 }
5032 break;
5033
5034 case OP_HSPACE:
5035 switch(c)
5036 {
5037 default: MRRETURN(MATCH_NOMATCH);
5038 case 0x09: /* HT */
5039 case 0x20: /* SPACE */
5040 case 0xa0: /* NBSP */
5041 break;
5042 }
5043 break;
5044
5045 case OP_NOT_VSPACE:
5046 switch(c)
5047 {
5048 default: break;
5049 case 0x0a: /* LF */
5050 case 0x0b: /* VT */
5051 case 0x0c: /* FF */
5052 case 0x0d: /* CR */
5053 case 0x85: /* NEL */
5054 MRRETURN(MATCH_NOMATCH);
5055 }
5056 break;
5057
5058 case OP_VSPACE:
5059 switch(c)
5060 {
5061 default: MRRETURN(MATCH_NOMATCH);
5062 case 0x0a: /* LF */
5063 case 0x0b: /* VT */
5064 case 0x0c: /* FF */
5065 case 0x0d: /* CR */
5066 case 0x85: /* NEL */
5067 break;
5068 }
5069 break;
5070
5071 case OP_NOT_DIGIT:
5072 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
5073 break;
5074
5075 case OP_DIGIT:
5076 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
5077 break;
5078
5079 case OP_NOT_WHITESPACE:
5080 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
5081 break;
5082
5083 case OP_WHITESPACE:
5084 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
5085 break;
5086
5087 case OP_NOT_WORDCHAR:
5088 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
5089 break;
5090
5091 case OP_WORDCHAR:
5092 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
5093 break;
5094
5095 default:
5096 RRETURN(PCRE_ERROR_INTERNAL);
5097 }
5098 }
5099 }
5100 /* Control never gets here */
5101 }
5102
5103 /* If maximizing, it is worth using inline code for speed, doing the type
5104 test once at the start (i.e. keep it out of the loop). Again, keep the
5105 UTF and UCP code separate. */
5106
5107 else
5108 {
5109 pp = eptr; /* Remember where we started */
5110
5111 #ifdef SUPPORT_UCP
5112 if (prop_type >= 0)
5113 {
5114 switch(prop_type)
5115 {
5116 case PT_ANY:
5117 for (i = min; i < max; i++)
5118 {
5119 int len = 1;
5120 if (eptr >= md->end_subject)
5121 {
5122 SCHECK_PARTIAL();
5123 break;
5124 }
5125 GETCHARLENTEST(c, eptr, len);
5126 if (prop_fail_result) break;
5127 eptr+= len;
5128 }
5129 break;
5130
5131 case PT_LAMP:
5132 for (i = min; i < max; i++)
5133 {
5134 int chartype;
5135 int len = 1;
5136 if (eptr >= md->end_subject)
5137 {
5138 SCHECK_PARTIAL();
5139 break;
5140 }
5141 GETCHARLENTEST(c, eptr, len);
5142 chartype = UCD_CHARTYPE(c);
5143 if ((chartype == ucp_Lu ||
5144 chartype == ucp_Ll ||
5145 chartype == ucp_Lt) == prop_fail_result)
5146 break;
5147 eptr+= len;
5148 }
5149 break;
5150
5151 case PT_GC:
5152 for (i = min; i < max; i++)
5153 {
5154 int len = 1;
5155 if (eptr >= md->end_subject)
5156 {
5157 SCHECK_PARTIAL();
5158 break;
5159 }
5160 GETCHARLENTEST(c, eptr, len);
5161 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5162 eptr+= len;
5163 }
5164 break;
5165
5166 case PT_PC:
5167 for (i = min; i < max; i++)
5168 {
5169 int len = 1;
5170 if (eptr >= md->end_subject)
5171 {
5172 SCHECK_PARTIAL();
5173 break;
5174 }
5175 GETCHARLENTEST(c, eptr, len);
5176 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5177 eptr+= len;
5178 }
5179 break;
5180
5181 case PT_SC:
5182 for (i = min; i < max; i++)
5183 {
5184 int len = 1;
5185 if (eptr >= md->end_subject)
5186 {
5187 SCHECK_PARTIAL();
5188 break;
5189 }
5190 GETCHARLENTEST(c, eptr, len);
5191 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5192 eptr+= len;
5193 }
5194 break;
5195
5196 case PT_ALNUM:
5197 for (i = min; i < max; i++)
5198 {
5199 int category;
5200 int len = 1;
5201 if (eptr >= md->end_subject)
5202 {
5203 SCHECK_PARTIAL();
5204 break;
5205 }
5206 GETCHARLENTEST(c, eptr, len);
5207 category = UCD_CATEGORY(c);
5208 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5209 break;
5210 eptr+= len;
5211 }
5212 break;
5213
5214 case PT_SPACE: /* Perl space */
5215 for (i = min; i < max; i++)
5216 {
5217 int len = 1;
5218 if (eptr >= md->end_subject)
5219 {
5220 SCHECK_PARTIAL();
5221 break;
5222 }
5223 GETCHARLENTEST(c, eptr, len);
5224 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5225 c == CHAR_FF || c == CHAR_CR)
5226 == prop_fail_result)
5227 break;
5228 eptr+= len;
5229 }
5230 break;
5231
5232 case PT_PXSPACE: /* POSIX space */
5233 for (i = min; i < max; i++)
5234 {
5235 int len = 1;
5236 if (eptr >= md->end_subject)
5237 {
5238 SCHECK_PARTIAL();
5239 break;
5240 }
5241 GETCHARLENTEST(c, eptr, len);
5242 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5243 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5244 == prop_fail_result)
5245 break;
5246 eptr+= len;
5247 }
5248 break;
5249
5250 case PT_WORD:
5251 for (i = min; i < max; i++)
5252 {
5253 int category;
5254 int len = 1;
5255 if (eptr >= md->end_subject)
5256 {
5257 SCHECK_PARTIAL();
5258 break;
5259 }
5260 GETCHARLENTEST(c, eptr, len);
5261 category = UCD_CATEGORY(c);
5262 if ((category == ucp_L || category == ucp_N ||
5263 c == CHAR_UNDERSCORE) == prop_fail_result)
5264 break;
5265 eptr+= len;
5266 }
5267 break;
5268
5269 default:
5270 RRETURN(PCRE_ERROR_INTERNAL);
5271 }
5272
5273 /* eptr is now past the end of the maximum run */
5274
5275 if (possessive) continue;
5276 for(;;)
5277 {
5278 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5280 if (eptr-- == pp) break; /* Stop if tried at original pos */
5281 if (utf) BACKCHAR(eptr);
5282 }
5283 }
5284
5285 /* Match extended Unicode sequences. We will get here only if the
5286 support is in the binary; otherwise a compile-time error occurs. */
5287
5288 else if (ctype == OP_EXTUNI)
5289 {
5290 for (i = min; i < max; i++)
5291 {
5292 int len = 1;
5293 if (eptr >= md->end_subject)
5294 {
5295 SCHECK_PARTIAL();
5296 break;
5297 }
5298 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5299 if (UCD_CATEGORY(c) == ucp_M) break;
5300 eptr += len;
5301 while (eptr < md->end_subject)
5302 {
5303 len = 1;
5304 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5305 if (UCD_CATEGORY(c) != ucp_M) break;
5306 eptr += len;
5307 }
5308 }
5309
5310 /* eptr is now past the end of the maximum run */
5311
5312 if (possessive) continue;
5313
5314 for(;;)
5315 {
5316 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5317 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5318 if (eptr-- == pp) break; /* Stop if tried at original pos */
5319 for (;;) /* Move back over one extended */
5320 {
5321 if (!utf) c = *eptr; else
5322 {
5323 BACKCHAR(eptr);
5324 GETCHAR(c, eptr);
5325 }
5326 if (UCD_CATEGORY(c) != ucp_M) break;
5327 eptr--;
5328 }
5329 }
5330 }
5331
5332 else
5333 #endif /* SUPPORT_UCP */
5334
5335 #ifdef SUPPORT_UTF
5336 if (utf)
5337 {
5338 switch(ctype)
5339 {
5340 case OP_ANY:
5341 if (max < INT_MAX)
5342 {
5343 for (i = min; i < max; i++)
5344 {
5345 if (eptr >= md->end_subject)
5346 {
5347 SCHECK_PARTIAL();
5348 break;
5349 }
5350 if (IS_NEWLINE(eptr)) break;
5351 eptr++;
5352 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5353 }
5354 }
5355
5356 /* Handle unlimited UTF-8 repeat */
5357
5358 else
5359 {
5360 for (i = min; i < max; i++)
5361 {
5362 if (eptr >= md->end_subject)
5363 {
5364 SCHECK_PARTIAL();
5365 break;
5366 }
5367 if (IS_NEWLINE(eptr)) break;
5368 eptr++;
5369 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5370 }
5371 }
5372 break;
5373
5374 case OP_ALLANY:
5375 if (max < INT_MAX)
5376 {
5377 for (i = min; i < max; i++)
5378 {
5379 if (eptr >= md->end_subject)
5380 {
5381 SCHECK_PARTIAL();
5382 break;
5383 }
5384 eptr++;
5385 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5386 }
5387 }
5388 else
5389 {
5390 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5391 SCHECK_PARTIAL();
5392 }
5393 break;
5394
5395 /* The byte case is the same as non-UTF8 */
5396
5397 case OP_ANYBYTE:
5398 c = max - min;
5399 if (c > (unsigned int)(md->end_subject - eptr))
5400 {
5401 eptr = md->end_subject;
5402 SCHECK_PARTIAL();
5403 }
5404 else eptr += c;
5405 break;
5406
5407 case OP_ANYNL:
5408 for (i = min; i < max; i++)
5409 {
5410 int len = 1;
5411 if (eptr >= md->end_subject)
5412 {
5413 SCHECK_PARTIAL();
5414 break;
5415 }
5416 GETCHARLEN(c, eptr, len);
5417 if (c == 0x000d)
5418 {
5419 if (++eptr >= md->end_subject) break;
5420 if (*eptr == 0x000a) eptr++;
5421 }
5422 else
5423 {
5424 if (c != 0x000a &&
5425 (md->bsr_anycrlf ||
5426 (c != 0x000b && c != 0x000c &&
5427 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5428 break;
5429 eptr += len;
5430 }
5431 }
5432 break;
5433
5434 case OP_NOT_HSPACE:
5435 case OP_HSPACE:
5436 for (i = min; i < max; i++)
5437 {
5438 BOOL gotspace;
5439 int len = 1;
5440 if (eptr >= md->end_subject)
5441 {
5442 SCHECK_PARTIAL();
5443 break;
5444 }
5445 GETCHARLEN(c, eptr, len);
5446 switch(c)
5447 {
5448 default: gotspace = FALSE; break;
5449 case 0x09: /* HT */
5450 case 0x20: /* SPACE */
5451 case 0xa0: /* NBSP */
5452 case 0x1680: /* OGHAM SPACE MARK */
5453 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5454 case 0x2000: /* EN QUAD */
5455 case 0x2001: /* EM QUAD */
5456 case 0x2002: /* EN SPACE */
5457 case 0x2003: /* EM SPACE */
5458 case 0x2004: /* THREE-PER-EM SPACE */
5459 case 0x2005: /* FOUR-PER-EM SPACE */
5460 case 0x2006: /* SIX-PER-EM SPACE */
5461 case 0x2007: /* FIGURE SPACE */
5462 case 0x2008: /* PUNCTUATION SPACE */
5463 case 0x2009: /* THIN SPACE */
5464 case 0x200A: /* HAIR SPACE */
5465 case 0x202f: /* NARROW NO-BREAK SPACE */
5466 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5467 case 0x3000: /* IDEOGRAPHIC SPACE */
5468 gotspace = TRUE;
5469 break;
5470 }
5471 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5472 eptr += len;
5473 }
5474 break;
5475
5476 case OP_NOT_VSPACE:
5477 case OP_VSPACE:
5478 for (i = min; i < max; i++)
5479 {
5480 BOOL gotspace;
5481 int len = 1;
5482 if (eptr >= md->end_subject)
5483 {
5484 SCHECK_PARTIAL();
5485 break;
5486 }
5487 GETCHARLEN(c, eptr, len);
5488 switch(c)
5489 {
5490 default: gotspace = FALSE; break;
5491 case 0x0a: /* LF */
5492 case 0x0b: /* VT */
5493 case 0x0c: /* FF */
5494 case 0x0d: /* CR */
5495 case 0x85: /* NEL */
5496 case 0x2028: /* LINE SEPARATOR */
5497 case 0x2029: /* PARAGRAPH SEPARATOR */
5498 gotspace = TRUE;
5499 break;
5500 }
5501 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5502 eptr += len;
5503 }
5504 break;
5505
5506 case OP_NOT_DIGIT:
5507 for (i = min; i < max; i++)
5508 {
5509 int len = 1;
5510 if (eptr >= md->end_subject)
5511 {
5512 SCHECK_PARTIAL();
5513 break;
5514 }
5515 GETCHARLEN(c, eptr, len);
5516 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5517 eptr+= len;
5518 }
5519 break;
5520
5521 case OP_DIGIT:
5522 for (i = min; i < max; i++)
5523 {
5524 int len = 1;
5525 if (eptr >= md->end_subject)
5526 {
5527 SCHECK_PARTIAL();
5528 break;
5529 }
5530 GETCHARLEN(c, eptr, len);
5531 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5532 eptr+= len;
5533 }
5534 break;
5535
5536 case OP_NOT_WHITESPACE:
5537 for (i = min; i < max; i++)
5538 {
5539 int len = 1;
5540 if (eptr >= md->end_subject)
5541 {
5542 SCHECK_PARTIAL();
5543 break;
5544 }
5545 GETCHARLEN(c, eptr, len);
5546 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5547 eptr+= len;
5548 }
5549 break;
5550
5551 case OP_WHITESPACE:
5552 for (i = min; i < max; i++)
5553 {
5554 int len = 1;
5555 if (eptr >= md->end_subject)
5556 {
5557 SCHECK_PARTIAL();
5558 break;
5559 }
5560 GETCHARLEN(c, eptr, len);
5561 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5562 eptr+= len;
5563 }
5564 break;
5565
5566 case OP_NOT_WORDCHAR:
5567 for (i = min; i < max; i++)
5568 {
5569 int len = 1;
5570 if (eptr >= md->end_subject)
5571 {
5572 SCHECK_PARTIAL();
5573 break;
5574 }
5575 GETCHARLEN(c, eptr, len);
5576 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5577 eptr+= len;
5578 }
5579 break;
5580
5581 case OP_WORDCHAR:
5582 for (i = min; i < max; i++)
5583 {
5584 int len = 1;
5585 if (eptr >= md->end_subject)
5586 {
5587 SCHECK_PARTIAL();
5588 break;
5589 }
5590 GETCHARLEN(c, eptr, len);
5591 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5592 eptr+= len;
5593 }
5594 break;
5595
5596 default:
5597 RRETURN(PCRE_ERROR_INTERNAL);
5598 }
5599
5600 /* eptr is now past the end of the maximum run. If possessive, we are
5601 done (no backing up). Otherwise, match at this position; anything other
5602 than no match is immediately returned. For nomatch, back up one
5603 character, unless we are matching \R and the last thing matched was
5604 \r\n, in which case, back up over the whole \r\n pair. */
5605
5606 if (possessive) continue;
5607 for(;;)
5608 {
5609 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5610 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5611 if (eptr-- == pp) break; /* Stop if tried at original pos */
5612 BACKCHAR(eptr);
5613 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5614 eptr[-1] == '\r') eptr--;
5615 }
5616 }
5617 else
5618 #endif /* SUPPORT_UTF */
5619 /* Not UTF mode */
5620 {
5621 switch(ctype)
5622 {
5623 case OP_ANY:
5624 for (i = min; i < max; i++)
5625 {
5626 if (eptr >= md->end_subject)
5627 {
5628 SCHECK_PARTIAL();
5629 break;
5630 }
5631 if (IS_NEWLINE(eptr)) break;
5632 eptr++;
5633 }
5634 break;
5635
5636 case OP_ALLANY:
5637 case OP_ANYBYTE:
5638 c = max - min;
5639 if (c > (unsigned int)(md->end_subject - eptr))
5640 {
5641 eptr = md->end_subject;
5642 SCHECK_PARTIAL();
5643 }
5644 else eptr += c;
5645 break;
5646
5647 case OP_ANYNL:
5648 for (i = min; i < max; i++)
5649 {
5650 if (eptr >= md->end_subject)
5651 {
5652 SCHECK_PARTIAL();
5653 break;
5654 }
5655 c = *eptr;
5656 if (c == 0x000d)
5657 {
5658 if (++eptr >= md->end_subject) break;
5659 if (*eptr == 0x000a) eptr++;
5660 }
5661 else
5662 {
5663 if (c != 0x000a &&
5664 (md->bsr_anycrlf ||
5665 (c != 0x000b && c != 0x000c && c != 0x0085)))
5666 break;
5667 eptr++;
5668 }
5669 }
5670 break;
5671
5672 case OP_NOT_HSPACE:
5673 for (i = min; i < max; i++)
5674 {
5675 if (eptr >= md->end_subject)
5676 {
5677 SCHECK_PARTIAL();
5678 break;
5679 }
5680 c = *eptr;
5681 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5682 eptr++;
5683 }
5684 break;
5685
5686 case OP_HSPACE:
5687 for (i = min; i < max; i++)
5688 {
5689 if (eptr >= md->end_subject)
5690 {
5691 SCHECK_PARTIAL();
5692 break;
5693 }
5694 c = *eptr;
5695 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5696 eptr++;
5697 }
5698 break;
5699
5700 case OP_NOT_VSPACE:
5701 for (i = min; i < max; i++)
5702 {
5703 if (eptr >= md->end_subject)
5704 {
5705 SCHECK_PARTIAL();
5706 break;
5707 }
5708 c = *eptr;
5709 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5710 break;
5711 eptr++;
5712 }
5713 break;
5714
5715 case OP_VSPACE:
5716 for (i = min; i < max; i++)
5717 {
5718 if (eptr >= md->end_subject)
5719 {
5720 SCHECK_PARTIAL();
5721 break;
5722 }
5723 c = *eptr;
5724 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5725 break;
5726 eptr++;
5727 }
5728 break;
5729
5730 case OP_NOT_DIGIT:
5731 for (i = min; i < max; i++)
5732 {
5733 if (eptr >= md->end_subject)
5734 {
5735 SCHECK_PARTIAL();
5736 break;
5737 }
5738 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5739 eptr++;
5740 }
5741 break;
5742
5743 case OP_DIGIT:
5744 for (i = min; i < max; i++)
5745 {
5746 if (eptr >= md->end_subject)
5747 {
5748 SCHECK_PARTIAL();
5749 break;
5750 }
5751 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5752 eptr++;
5753 }
5754 break;
5755
5756 case OP_NOT_WHITESPACE:
5757 for (i = min; i < max; i++)
5758 {
5759 if (eptr >= md->end_subject)
5760 {
5761 SCHECK_PARTIAL();
5762 break;
5763 }
5764 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5765 eptr++;
5766 }
5767 break;
5768
5769 case OP_WHITESPACE:
5770 for (i = min; i < max; i++)
5771 {
5772 if (eptr >= md->end_subject)
5773 {
5774 SCHECK_PARTIAL();
5775 break;
5776 }
5777 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5778 eptr++;
5779 }
5780 break;
5781
5782 case OP_NOT_WORDCHAR:
5783 for (i = min; i < max; i++)
5784 {
5785 if (eptr >= md->end_subject)
5786 {
5787 SCHECK_PARTIAL();
5788 break;
5789 }
5790 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5791 eptr++;
5792 }
5793 break;
5794
5795 case OP_WORDCHAR:
5796 for (i = min; i < max; i++)
5797 {
5798 if (eptr >= md->end_subject)
5799 {
5800 SCHECK_PARTIAL();
5801 break;
5802 }
5803 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5804 eptr++;
5805 }
5806 break;
5807
5808 default:
5809 RRETURN(PCRE_ERROR_INTERNAL);
5810 }
5811
5812 /* eptr is now past the end of the maximum run. If possessive, we are
5813 done (no backing up). Otherwise, match at this position; anything other
5814 than no match is immediately returned. For nomatch, back up one
5815 character, unless we are matching \R and the last thing matched
5816 was \r\n, in which case, back up over the whole \r\n pair. */
5817
5818 if (possessive) continue;
5819 while (eptr >= pp)
5820 {
5821 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5822 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5823 eptr--;
5824 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5825 eptr[-1] == '\r') eptr--;
5826 }
5827 }
5828
5829 /* Get here if we can't make it match with any permitted repetitions */
5830
5831 MRRETURN(MATCH_NOMATCH);
5832 }
5833 /* Control never gets here */
5834
5835 /* There's been some horrible disaster. Arrival here can only mean there is
5836 something seriously wrong in the code above or the OP_xxx definitions. */
5837
5838 default:
5839 DPRINTF(("Unknown opcode %d\n", *ecode));
5840 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5841 }
5842
5843 /* Do not stick any code in here without much thought; it is assumed
5844 that "continue" in the code above comes out to here to repeat the main
5845 loop. */
5846
5847 } /* End of main loop */
5848 /* Control never reaches here */
5849
5850
5851 /* When compiling to use the heap rather than the stack for recursive calls to
5852 match(), the RRETURN() macro jumps here. The number that is saved in
5853 frame->Xwhere indicates which label we actually want to return to. */
5854
5855 #ifdef NO_RECURSE
5856 #define LBL(val) case val: goto L_RM##val;
5857 HEAP_RETURN:
5858 switch (frame->Xwhere)
5859 {
5860 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5861 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5862 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5863 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5864 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5865 LBL(65) LBL(66)
5866 #ifdef SUPPORT_UTF
5867 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5868 LBL(32) LBL(34) LBL(42) LBL(46)
5869 #ifdef SUPPORT_UCP
5870 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5871 LBL(59) LBL(60) LBL(61) LBL(62)
5872 #endif /* SUPPORT_UCP */
5873 #endif /* SUPPORT_UTF */
5874 default:
5875 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5876 return PCRE_ERROR_INTERNAL;
5877 }
5878 #undef LBL
5879 #endif /* NO_RECURSE */
5880 }
5881
5882
5883 /***************************************************************************
5884 ****************************************************************************
5885 RECURSION IN THE match() FUNCTION
5886
5887 Undefine all the macros that were defined above to handle this. */
5888
5889 #ifdef NO_RECURSE /* heap-frame build: the names below were macros used by match() */
5890 #undef eptr
5891 #undef ecode
5892 #undef mstart
5893 #undef offset_top
5894 #undef eptrb
5895 #undef flags /* pcre_exec() below declares an ordinary local called flags */
5896
5897 #undef callpat
5898 #undef charptr
5899 #undef data
5900 #undef next
5901 #undef pp /* pcre_exec() below declares an ordinary local called pp */
5902 #undef prev
5903 #undef saved_eptr
5904
5905 #undef new_recursive
5906
5907 #undef cur_is_word
5908 #undef condition
5909 #undef prev_is_word
5910
5911 #undef ctype
5912 #undef length /* pcre_exec() below has a parameter called length */
5913 #undef max
5914 #undef min
5915 #undef number
5916 #undef offset
5917 #undef op
5918 #undef save_capture_last
5919 #undef save_offset1
5920 #undef save_offset2
5921 #undef save_offset3
5922 #undef stacksave
5923
5924 #undef newptrb
5925
5926 #endif /* NO_RECURSE */
5927
5928 /* These two are defined as macros in both cases (with and without
NO_RECURSE), so they are undefined unconditionally, outside the #ifdef. */
5929
5930 #undef fc
5931 #undef fi
5932
5933 /***************************************************************************
5934 ***************************************************************************/
5935
5936
5937
5938 /*************************************************
5939 * Execute a Regular Expression *
5940 *************************************************/
5941
5942 /* This function applies a compiled re to a subject string and picks out
5943 portions of the string if it matches. Two elements in the vector are set for
5944 each substring: the offsets to the start and end of the substring.
5945
5946 Arguments:
5947 argument_re points to the compiled expression
5948 extra_data points to extra data or is NULL
5949 subject points to the subject string
5950 length length of subject string (may contain binary zeros)
5951 start_offset where to start in the subject string
5952 options option bits
5953 offsets points to a vector of ints to be filled in with offsets
5954 offsetcount the number of elements in the vector
5955
5956 Returns: > 0 => success; value is the number of elements filled in
5957 = 0 => success, but offsets is not big enough
5958 -1 => failed to match
5959 < -1 => some kind of unexpected problem
5960 */
5961
5962 #ifdef COMPILE_PCRE8
5963 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5964 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5965 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5966 int offsetcount)
5967 #else
5968 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5969 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
5970 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
5971 int offsetcount)
5972 #endif
5973 {
5974 int rc, ocount, arg_offset_max;
5975 int newline;
5976 BOOL using_temporary_offsets = FALSE;
5977 BOOL anchored;
5978 BOOL startline;
5979 BOOL firstline;
5980 BOOL utf;
5981 BOOL has_first_char = FALSE;
5982 BOOL has_req_char = FALSE;
5983 pcre_uchar first_char = 0;
5984 pcre_uchar first_char2 = 0;
5985 pcre_uchar req_char = 0;
5986 pcre_uchar req_char2 = 0;
5987 match_data match_block;
5988 match_data *md = &match_block;
5989 const pcre_uint8 *tables;
5990 const pcre_uint8 *start_bits = NULL;
5991 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
5992 PCRE_PUCHAR end_subject;
5993 PCRE_PUCHAR start_partial = NULL;
5994 PCRE_PUCHAR req_char_ptr = start_match - 1;
5995
5996 pcre_study_data internal_study;
5997 const pcre_study_data *study;
5998
5999 real_pcre internal_re;
6000 const real_pcre *external_re = (const real_pcre *)argument_re;
6001 const real_pcre *re = external_re;
6002
6003 /* Plausibility checks */
6004
6005 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6006 if (re == NULL || subject == NULL ||
6007 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6008 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6009 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6010
6011 /* These two settings are used in the code for checking a UTF-8 string that
6012 follows immediately afterwards. Other values in the md block are used only
6013 during "normal" pcre_exec() processing, not when the JIT support is in use,
6014 so they are set up later. */
6015
6016 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6017 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6018 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6019 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6020
6021 /* Check a UTF-8 string if required. Pass back the character offset and error
6022 code for an invalid string if a results vector is available. */
6023
6024 #ifdef SUPPORT_UTF
6025 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6026 {
6027 int erroroffset;
6028 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6029 if (errorcode != 0)
6030 {
6031 if (offsetcount >= 2)
6032 {
6033 offsets[0] = erroroffset;
6034 offsets[1] = errorcode;
6035 }
6036 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6037 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6038 }
6039
6040 /* Check that a start_offset points to the start of a UTF character. */
6041 #ifdef COMPILE_PCRE8
6042 if (start_offset > 0 && start_offset < length &&
6043 (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
6044 return PCRE_ERROR_BADUTF8_OFFSET;
6045 #else
6046 #ifdef COMPILE_PCRE16
6047 if (start_offset > 0 && start_offset < length &&
6048 (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00)
6049 return PCRE_ERROR_BADUTF8_OFFSET;
6050 #endif /* COMPILE_PCRE16 */
6051 #endif /* COMPILE_PCRE8 */
6052 }
6053 #endif
6054
6055 /* If the pattern was successfully studied with JIT support, run the JIT
6056 executable instead of the rest of this function. Most options must be set at
6057 compile time for the JIT code to be usable. Fallback to the normal code path if
6058 an unsupported flag is set. In particular, JIT does not support partial
6059 matching. */
6060
6061 #ifdef SUPPORT_JIT
6062 if (extra_data != NULL
6063 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6064 && extra_data->executable_jit != NULL
6065 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6066 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6067 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6068 return PRIV(jit_exec)(re, extra_data->executable_jit,
6069 (const pcre_uchar *)subject, length, start_offset, options,
6070 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6071 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6072 #endif
6073
6074 /* Carry on with non-JIT matching. This information is for finding all the
6075 numbers associated with a given name, for condition testing. */
6076
6077 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6078 md->name_count = re->name_count;
6079 md->name_entry_size = re->name_entry_size;
6080
6081 /* Fish out the optional data from the extra_data structure, first setting
6082 the default values. */
6083
6084 study = NULL;
6085 md->match_limit = MATCH_LIMIT;
6086 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6087 md->callout_data = NULL;
6088
6089 /* The table pointer is always in native byte order. */
6090
6091 tables = external_re->tables;
6092
6093 if (extra_data != NULL)
6094 {
6095 register unsigned int flags = extra_data->flags;
6096 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6097 study = (const pcre_study_data *)extra_data->study_data;
6098 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6099 md->match_limit = extra_data->match_limit;
6100 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6101 md->match_limit_recursion = extra_data->match_limit_recursion;
6102 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6103 md->callout_data = extra_data->callout_data;
6104 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6105 }
6106
6107 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6108 is a feature that makes it possible to save compiled regex and re-use them
6109 in other programs later. */
6110
6111 if (tables == NULL) tables = PRIV(default_tables);
6112
6113 /* Check that the first field in the block is the magic number. If it is not,
6114 test for a regex that was compiled on a host of opposite endianness. If this is
6115 the case, flipped values are put in internal_re and internal_study if there was
6116 study data too. */
6117
6118 if (re->magic_number != MAGIC_NUMBER)
6119 {
6120 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
6121 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6122 if (study != NULL) study = &internal_study;
6123 }
6124
6125 /* Set up other data */
6126
6127 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6128 startline = (re->flags & PCRE_STARTLINE) != 0;
6129 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6130
6131 /* The code starts after the real_pcre block and the capture name table. */
6132
6133 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6134 re->name_count * re->name_entry_size;
6135
6136 md->start_subject = (PCRE_PUCHAR)subject;
6137 md->start_offset = start_offset;
6138 md->end_subject = md->start_subject + length;
6139 end_subject = md->end_subject;
6140
6141 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6142 md->use_ucp = (re->options & PCRE_UCP) != 0;
6143 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6144
6145 /* Some options are unpacked into BOOL variables in the hope that testing
6146 them will be faster than individual option bits. */
6147
6148 md->notbol = (options & PCRE_NOTBOL) != 0;
6149 md->noteol = (options & PCRE_NOTEOL) != 0;
6150 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6151 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6152
6153 md->hitend = FALSE;
6154 md->mark = NULL; /* In case never set */
6155
6156 md->recursive = NULL; /* No recursion at top level */
6157 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6158
6159 md->lcc = tables + lcc_offset;
6160 md->fcc = tables + fcc_offset;
6161 md->ctypes = tables + ctypes_offset;
6162
6163 /* Handle different \R options. */
6164
6165 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6166 {
6167 case 0:
6168 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6169 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6170 else
6171 #ifdef BSR_ANYCRLF
6172 md->bsr_anycrlf = TRUE;
6173 #else
6174 md->bsr_anycrlf = FALSE;
6175 #endif
6176 break;
6177
6178 case PCRE_BSR_ANYCRLF:
6179 md->bsr_anycrlf = TRUE;
6180 break;
6181
6182 case PCRE_BSR_UNICODE:
6183 md->bsr_anycrlf = FALSE;
6184 break;
6185
6186 default: return PCRE_ERROR_BADNEWLINE;
6187 }
6188
6189 /* Handle different types of newline. The three bits give eight cases. If
6190 nothing is set at run time, whatever was used at compile time applies. */
6191
6192 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6193 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6194 {
6195 case 0: newline = NEWLINE; break; /* Compile-time default */
6196 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6197 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6198 case PCRE_NEWLINE_CR+
6199 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6200 case PCRE_NEWLINE_ANY: newline = -1; break;
6201 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6202 default: return PCRE_ERROR_BADNEWLINE;
6203 }
6204
6205 if (newline == -2)
6206 {
6207 md->nltype = NLTYPE_ANYCRLF;
6208 }
6209 else if (newline < 0)
6210 {
6211 md->nltype = NLTYPE_ANY;
6212 }
6213 else
6214 {
6215 md->nltype = NLTYPE_FIXED;
6216 if (newline > 255)
6217 {
6218 md->nllen = 2;
6219 md->nl[0] = (newline >> 8) & 255;
6220 md->nl[1] = newline & 255;
6221 }
6222 else
6223 {
6224 md->nllen = 1;
6225 md->nl[0] = newline;
6226 }
6227 }
6228
6229 /* Partial matching was originally supported only for a restricted set of
6230 regexes; from release 8.00 there are no restrictions, but the bits are still
6231 defined (though never set). So there's no harm in leaving this code. */
6232
6233 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6234 return PCRE_ERROR_BADPARTIAL;
6235
6236 /* If the expression has got more back references than the offsets supplied can
6237 hold, we get a temporary chunk of working store to use during the matching.
6238 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6239 of 3. */
6240
6241 ocount = offsetcount - (offsetcount % 3);
6242 arg_offset_max = (2*ocount)/3;
6243
6244 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6245 {
6246 ocount = re->top_backref * 3 + 3;
6247 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6248 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6249 using_temporary_offsets = TRUE;
6250 DPRINTF(("Got memory to hold back references\n"));
6251 }
6252 else md->offset_vector = offsets;
6253
6254 md->offset_end = ocount;
6255 md->offset_max = (2*ocount)/3;
6256 md->offset_overflow = FALSE;
6257 md->capture_last = -1;
6258
6259 /* Reset the working variable associated with each extraction. These should
6260 never be used unless previously set, but they get saved and restored, and so we
6261 initialize them to avoid reading uninitialized locations. Also, unset the
6262 offsets for the matched string. This is really just for tidiness with callouts,
6263 in case they inspect these fields. */
6264
6265 if (md->offset_vector != NULL)
6266 {
6267 register int *iptr = md->offset_vector + ocount;
6268 register int *iend = iptr - re->top_bracket;
6269 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6270 while (--iptr >= iend) *iptr = -1;
6271 md->offset_vector[0] = md->offset_vector[1] = -1;
6272 }
6273
6274 /* Set up the first character to match, if available. The first_char value is
6275 never set for an anchored regular expression, but the anchoring may be forced
6276 at run time, so we have to test for anchoring. The first char may be unset for
6277 an unanchored pattern, of course. If there's no first char and the pattern was
6278 studied, there may be a bitmap of possible first characters. */
6279
6280 if (!anchored)
6281 {
6282 if ((re->flags & PCRE_FIRSTSET) != 0)
6283 {
6284 has_first_char = TRUE;
6285 first_char = first_char2 = re->first_char;
6286 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6287 {
6288 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6289 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6290 if (utf && first_char > 127)
6291 first_char2 = UCD_OTHERCASE(first_char);
6292 #endif
6293 }
6294 }
6295 else
6296 if (!startline && study != NULL &&
6297 (study->flags & PCRE_STUDY_MAPPED) != 0)
6298 start_bits = study->start_bits;
6299 }
6300
6301 /* For anchored or unanchored matches, there may be a "last known required
6302 character" set. */
6303
6304 if ((re->flags & PCRE_REQCHSET) != 0)
6305 {
6306 has_req_char = TRUE;
6307 req_char = req_char2 = re->req_char;
6308 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6309 {
6310 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6311 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6312 if (utf && req_char > 127)
6313 req_char2 = UCD_OTHERCASE(req_char);
6314 #endif
6315 }
6316 }
6317
6318
6319 /* ==========================================================================*/
6320
6321 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6322 the loop runs just once. */
6323
6324 for(;;)
6325 {
6326 PCRE_PUCHAR save_end_subject = end_subject;
6327 PCRE_PUCHAR new_start_match;
6328
6329 /* If firstline is TRUE, the start of the match is constrained to the first
6330 line of a multiline string. That is, the match must be before or at the first
6331 newline. Implement this by temporarily adjusting end_subject so that we stop
6332 scanning at a newline. If the match fails at the newline, later code breaks
6333 this loop. */
6334
6335 if (firstline)
6336 {
6337 PCRE_PUCHAR t = start_match;
6338 #ifdef SUPPORT_UTF
6339 if (utf)
6340 {
6341 while (t < md->end_subject && !IS_NEWLINE(t))
6342 {
6343 t++;
6344 ACROSSCHAR(t < end_subject, *t, t++);
6345 }
6346 }
6347 else
6348 #endif
6349 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6350 end_subject = t;
6351 }
6352
6353 /* There are some optimizations that avoid running the match if a known
6354 starting point is not found, or if a known later character is not present.
6355 However, there is an option that disables these, for testing and for ensuring
6356 that all callouts do actually occur. The option can be set in the regex by
6357 (*NO_START_OPT) or passed in match-time options. */
6358
6359 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6360 {
6361 /* Advance to a unique first char if there is one. */
6362
6363 if (has_first_char)
6364 {
6365 if (first_char != first_char2)
6366 while (start_match < end_subject &&
6367 *start_match != first_char && *start_match != first_char2)
6368 start_match++;
6369 else
6370 while (start_match < end_subject && *start_match != first_char)
6371 start_match++;
6372 }
6373
6374 /* Or to just after a linebreak for a multiline match */
6375
6376 else if (startline)
6377 {
6378 if (start_match > md->start_subject + start_offset)
6379 {
6380 #ifdef SUPPORT_UTF
6381 if (utf)
6382 {
6383 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6384 {
6385 start_match++;
6386 ACROSSCHAR(start_match < end_subject, *start_match,
6387 start_match++);
6388 }
6389 }
6390 else
6391 #endif
6392 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6393 start_match++;
6394
6395 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6396 and we are now at a LF, advance the match position by one more character.
6397 */
6398
6399 if (start_match[-1] == CHAR_CR &&
6400 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6401 start_match < end_subject &&
6402 *start_match == CHAR_NL)
6403 start_match++;
6404 }
6405 }
6406
6407 /* Or to a non-unique first byte after study */
6408
6409 else if (start_bits != NULL)
6410 {
6411 while (start_match < end_subject)
6412 {
6413 register unsigned int c = *start_match;
6414 #ifndef COMPILE_PCRE8
6415 if (c > 255) c = 255;
6416 #endif
6417 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6418 {
6419 start_match++;
6420 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6421 /* In non 8-bit mode, the iteration will stop for
6422 characters > 255 at the beginning or not stop at all. */
6423 if (utf)
6424 ACROSSCHAR(start_match < end_subject, *start_match,
6425 start_match++);
6426 #endif
6427 }
6428 else break;
6429 }
6430 }
6431 } /* Starting optimizations */
6432
6433 /* Restore fudged end_subject */
6434
6435 end_subject = save_end_subject;
6436
6437 /* The following two optimizations are disabled for partial matching or if
6438 disabling is explicitly requested. */
6439
6440 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6441 {
6442 /* If the pattern was studied, a minimum subject length may be set. This is
6443 a lower bound; no actual string of that length may actually match the
6444 pattern. Although the value is, strictly, in characters, we treat it as
6445 bytes to avoid spending too much time in this optimization. */
6446
6447 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6448 (pcre_uint32)(end_subject - start_match) < study->minlength)
6449 {
6450 rc = MATCH_NOMATCH;
6451 break;
6452 }
6453
6454 /* If req_char is set, we know that that character must appear in the
6455 subject for the match to succeed. If the first character is set, req_char
6456 must be later in the subject; otherwise the test starts at the match point.
6457 This optimization can save a huge amount of backtracking in patterns with
6458 nested unlimited repeats that aren't going to match. Writing separate code
6459 for cased/caseless versions makes it go faster, as does using an
6460 autoincrement and backing off on a match.
6461
6462 HOWEVER: when the subject string is very, very long, searching to its end
6463 can take a long time, and give bad performance on quite ordinary patterns.
6464 This showed up when somebody was matching something like /^\d+C/ on a
6465 32-megabyte string... so we don't do this when the string is sufficiently
6466 long. */
6467
6468 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6469 {
6470 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6471
6472 /* We don't need to repeat the search if we haven't yet reached the
6473 place we found it at last time. */
6474
6475 if (p > req_char_ptr)
6476 {
6477 if (req_char != req_char2)
6478 {
6479 while (p < end_subject)
6480 {
6481 register int pp = *p++;
6482 if (pp == req_char || pp == req_char2) { p--; break; }
6483 }
6484 }
6485 else
6486 {
6487 while (p < end_subject)
6488 {
6489 if (*p++ == req_char) { p--; break; }
6490 }
6491 }
6492
6493 /* If we can't find the required character, break the matching loop,
6494 forcing a match failure. */
6495
6496 if (p >= end_subject)
6497 {
6498 rc = MATCH_NOMATCH;
6499 break;
6500 }
6501
6502 /* If we have found the required character, save the point where we
6503 found it, so that we don't search again next time round the loop if
6504 the start hasn't passed this character yet. */
6505
6506 req_char_ptr = p;
6507 }
6508 }
6509 }
6510
6511 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6512 printf(">>>> Match against: ");
6513 pchars(start_match, end_subject - start_match, TRUE, md);
6514 printf("\n");
6515 #endif
6516
6517 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6518 first starting point for which a partial match was found. */
6519
6520 md->start_match_ptr = start_match;
6521 md->start_used_ptr = start_match;
6522 md->match_call_count = 0;
6523 md->match_function_type = 0;
6524 md->end_offset_top = 0;
6525 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6526 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6527
6528 switch(rc)
6529 {
6530 /* SKIP passes back the next starting point explicitly, but if it is the
6531 same as the match we have just done, treat it as NOMATCH. */
6532
6533 case MATCH_SKIP:
6534 if (md->start_match_ptr != start_match)
6535 {
6536 new_start_match = md->start_match_ptr;
6537 break;
6538 }
6539 /* Fall through */
6540
6541 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6542 the SKIP's arg was not found. We also treat this as NOMATCH. */
6543
6544 case MATCH_SKIP_ARG:
6545 /* Fall through */
6546
6547 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6548 exactly like PRUNE. */
6549
6550 case MATCH_NOMATCH:
6551 case MATCH_PRUNE:
6552 case MATCH_THEN:
6553 new_start_match = start_match + 1;
6554 #ifdef SUPPORT_UTF
6555 if (utf)
6556 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6557 new_start_match++);
6558 #endif
6559 break;
6560
6561 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6562
6563 case MATCH_COMMIT:
6564 rc = MATCH_NOMATCH;
6565 goto ENDLOOP;
6566
6567 /* Any other return is either a match, or some kind of error. */
6568
6569 default:
6570 goto ENDLOOP;
6571 }
6572
6573 /* Control reaches here for the various types of "no match at this point"
6574 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6575
6576 rc = MATCH_NOMATCH;
6577
6578 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6579 newline in the subject (though it may continue over the newline). Therefore,
6580 if we have just failed to match, starting at a newline, do not continue. */
6581
6582 if (firstline && IS_NEWLINE(start_match)) break;
6583
6584 /* Advance to new matching position */
6585
6586 start_match = new_start_match;
6587
6588 /* Break the loop if the pattern is anchored or if we have passed the end of
6589 the subject. */
6590
6591 if (anchored || start_match > end_subject) break;
6592
6593 /* If we have just passed a CR and we are now at a LF, and the pattern does
6594 not contain any explicit matches for \r or \n, and the newline option is CRLF
6595 or ANY or ANYCRLF, advance the match position by one more character. */
6596
6597 if (start_match[-1] == CHAR_CR &&
6598 start_match < end_subject &&
6599 *start_match == CHAR_NL &&
6600 (re->flags & PCRE_HASCRORLF) == 0 &&
6601 (md->nltype == NLTYPE_ANY ||
6602 md->nltype == NLTYPE_ANYCRLF ||
6603 md->nllen == 2))
6604 start_match++;
6605
6606 md->mark = NULL; /* Reset for start of next match attempt */
6607 } /* End of for(;;) "bumpalong" loop */
6608
6609 /* ==========================================================================*/
6610
6611 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6612 conditions is true:
6613
6614 (1) The pattern is anchored or the match was failed by (*COMMIT);
6615
6616 (2) We are past the end of the subject;
6617
6618 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6619 this option requests that a match occur at or before the first newline in
6620 the subject.
6621
6622 When we have a match and the offset vector is big enough to deal with any
6623 backreferences, captured substring offsets will already be set up. In the case
6624 where we had to get some local store to hold offsets for backreference
6625 processing, copy those that we can. In this case there need not be overflow if
6626 certain parts of the pattern were not used, even though there are more
6627 capturing parentheses than vector slots. */
6628
6629 ENDLOOP:
6630
6631 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6632 {
6633 if (using_temporary_offsets)
6634 {
6635 if (arg_offset_max >= 4)
6636 {
6637 memcpy(offsets + 2, md->offset_vector + 2,
6638 (arg_offset_max - 2) * sizeof(int));
6639 DPRINTF(("Copied offsets from temporary memory\n"));
6640 }
6641 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6642 DPRINTF(("Freeing temporary memory\n"));
6643 (pcre_free)(md->offset_vector);
6644 }
6645
6646 /* Set the return code to the number of captured strings, or 0 if there were
6647 too many to fit into the vector. */
6648
6649 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6650 0 : md->end_offset_top/2;
6651
6652 /* If there is space in the offset vector, set any unused pairs at the end of
6653 the pattern to -1 for backwards compatibility. It is documented that this
6654 happens. In earlier versions, the whole set of potential capturing offsets
6655 was set to -1 each time round the loop, but this is handled differently now.
6656 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6657 those at the end that need unsetting here. We can't just unset them all at
6658 the start of the whole thing because they may get set in one branch that is
6659 not the final matching branch. */
6660
6661 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6662 {
6663 register int *iptr, *iend;
6664 int resetcount = 2 + re->top_bracket * 2;
6665 if (resetcount > offsetcount) resetcount = ocount;
6666 iptr = offsets + md->end_offset_top;
6667 iend = offsets + resetcount;
6668 while (iptr < iend) *iptr++ = -1;
6669 }
6670
6671 /* If there is space, set up the whole thing as substring 0. The value of
6672 md->start_match_ptr might be modified if \K was encountered on the success
6673 matching path. */
6674
6675 if (offsetcount < 2) rc = 0; else
6676 {
6677 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6678 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6679 }
6680
6681 DPRINTF((">>>> returning %d\n", rc));
6682 goto RETURN_MARK;
6683 }
6684
6685 /* Control gets here if there has been an error, or if the overall match
6686 attempt has failed at all permitted starting positions. */
6687
6688 if (using_temporary_offsets)
6689 {
6690 DPRINTF(("Freeing temporary memory\n"));
6691 (pcre_free)(md->offset_vector);
6692 }
6693
6694 /* For anything other than nomatch or partial match, just return the code. */
6695
6696 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6697 {
6698 DPRINTF((">>>> error: returning %d\n", rc));
6699 return rc;
6700 }
6701
6702 /* Handle partial matches - disable any mark data */
6703
6704 if (start_partial != NULL)
6705 {
6706 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6707 md->mark = NULL;
6708 if (offsetcount > 1)
6709 {
6710 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
6711 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
6712 }
6713 rc = PCRE_ERROR_PARTIAL;
6714 }
6715
6716 /* This is the classic nomatch case */
6717
6718 else
6719 {
6720 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6721 rc = PCRE_ERROR_NOMATCH;
6722 }
6723
6724 /* Return the MARK data if it has been requested. */
6725
6726 RETURN_MARK:
6727
6728 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6729 *(extra_data->mark) = (unsigned char *)(md->mark);
6730 return rc;
6731 }
6732
6733 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5