/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 804 - (show annotations)
Wed Dec 14 11:18:01 2011 UTC (8 years, 3 months ago) by zherczeg
File MIME type: text/plain
File size: 205935 byte(s)
PUBL macro added, single char optimization is fixed, MAX_255 checks are added, pcre_jit_test now copy the default tables to help valgrind
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999) /* success variant; the mark handling treats it like MATCH_MATCH */
77 #define MATCH_COMMIT (-998) /* returned by OP_COMMIT when what follows it fails to match */
78 #define MATCH_KETRPOS (-997) /* not used in the part of the file seen here - TODO document at its use */
79 #define MATCH_ONCE (-996) /* seen when backing up through an atomic group */
80 #define MATCH_PRUNE (-995) /* returned by OP_PRUNE and OP_PRUNE_ARG on failure */
81 #define MATCH_SKIP (-994) /* returned by OP_SKIP; md->start_match_ptr holds the subject position */
82 #define MATCH_SKIP_ARG (-993) /* returned by OP_SKIP_ARG; md->start_match_ptr holds the skip name */
83 #define MATCH_THEN (-992) /* returned by OP_THEN and OP_THEN_ARG; md->start_match_ptr holds the opcode address */
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* minimum count for each common repeat */
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* maximum count; 0 stands for no upper limit */
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {   /* Debug printer; md is only dereferenced when is_subject is TRUE */
118 unsigned int c;   /* widened code unit, so the range test below is meaningful */
119 if (is_subject && length > md->end_subject - p) length = (int)(md->end_subject - p);
120 while (length-- > 0)   /* isprint() is undefined above UCHAR_MAX, so test the range first */
121   if ((c = *(p++)) < 256 && isprint(c)) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148   BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;   /* kept to work out how much subject was used */
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155   printf("matching subject <null>");
156 else
157   {
158   printf("matching subject ");
159   pchars(eptr, length, TRUE, md);
160   }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can fold only
172 the characters that the lcc table covers. */
173
174 if (caseless)
175   {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178   if (md->utf)
179     {
180     /* Match characters up to the end of the reference. NOTE: the number of
181     bytes matched may differ, because there are some characters whose upper and
182     lower case versions code as different numbers of bytes. For example, U+023A
183     (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184     a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185     the latter. It is important, therefore, to check the length along the
186     reference, not along the subject (earlier code did this wrong). */
187
188     PCRE_PUCHAR endptr = p + length;
189     while (p < endptr)
190       {
191       int c, d;
192       if (eptr >= md->end_subject) return -1;
193       GETCHARINC(c, eptr);
194       GETCHARINC(d, p);
195       if (c != d && c != UCD_OTHERCASE(d)) return -1;
196       }
197     }
198   else
199 #endif
200 #endif
201   /* Without UCP support (or outside UTF mode) fold case through md->lcc. A code
202   unit above 255 (possible with 16-bit units) is compared unfolded, because
203   the table is assumed to have only 256 entries - TODO confirm. */
204     {
205     if (length > md->end_subject - eptr) return -1;   /* reference longer than what is left */
206     while (length-- > 0)
207       {
208       unsigned int a = *p++, b = *eptr++;
209       if ((a < 256 ? md->lcc[a] : a) != (b < 256 ? md->lcc[b] : b)) return -1;
210       }
211     }
212   }
213 /* In the caseful case, just compare the code units, in UTF mode or not. */
214 else
215   {
216   if (length > md->end_subject - eptr) return -1;   /* avoids forming eptr + length past the end */
217   while (length-- > 0) if (*p++ != *eptr++) return -1;
218   }
219
220 return (int)(eptr - eptr_start);
221 }
222
223
224
225 /***************************************************************************
226 ****************************************************************************
227 RECURSION IN THE match() FUNCTION
228
229 The match() function is highly recursive, though not every recursive call
230 increases the recursive depth. Nevertheless, some regular expressions can cause
231 it to recurse to a great depth. I was writing for Unix, so I just let it call
232 itself recursively. This uses the stack for saving everything that has to be
233 saved for a recursive call. On Unix, the stack can be large, and this works
234 fine.
235
236 It turns out that on some non-Unix-like systems there are problems with
237 programs that use a lot of stack. (This despite the fact that every last chip
238 has oodles of memory these days, and techniques for extending the stack have
239 been known for decades.) So....
240
241 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
242 calls by keeping local variables that need to be preserved in blocks of memory
243 obtained from malloc() instead of on the stack. Macros are used to
244 achieve this so that the actual code doesn't look very different to what it
245 always used to.
246
247 The original heap-recursive code used longjmp(). However, it seems that this
248 can be very slow on some operating systems. Following a suggestion from Stan
249 Switzer, the use of longjmp() has been abolished, at the cost of having to
250 provide a unique number for each call to RMATCH. There is no way of generating
251 a sequence of numbers at compile time in C. I have given them names, to make
252 them stand out more clearly.
253
254 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
255 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
256 tests. Furthermore, not using longjmp() means that local dynamic variables
257 don't have indeterminate values; this has meant that the frame size can be
258 reduced because the result can be "passed back" by straight setting of the
259 variable instead of being passed in the frame.
260 ****************************************************************************
261 ***************************************************************************/
262
263 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
264 below must be updated in sync. */
265
266 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
267 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
268 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
269 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
270 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
271 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
272 RM61, RM62, RM63, RM64, RM65, RM66 }; /* unique numbers, counting from 1, one for each call to RMATCH */
273
274 /* These versions of the macros use the stack, as normal. There are debugging
275 versions and production versions. Note that the "rw" argument of RMATCH isn't
276 actually used in this definition. */
277
278 #ifndef NO_RECURSE
279 #define REGISTER register
280
281 #ifdef PCRE_DEBUG /* tracing versions: report each call and each return with printf */
282 #define RMATCH(ra,rb,rc,rd,re,rw) \
283 { \
284 printf("match() called in line %d\n", __LINE__); \
285 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); /* result is left in rrc at the expansion site */ \
286 printf("to line %d\n", __LINE__); \
287 }
288 #define RRETURN(ra) \
289 { \
290 printf("match() returned %d from line %d ", ra, __LINE__); \
291 return ra; \
292 }
293 #else /* production versions: a plain recursive call and a plain return */
294 #define RMATCH(ra,rb,rc,rd,re,rw) \
295 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1) /* mstart and rdepth come from the expansion site, not the arguments */
296 #define RRETURN(ra) return ra
297 #endif
298
299 #else
300
301
302 /* These versions of the macros manage a private stack on the heap. Note that
303 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
304 argument of match(), which never changes. */
305
306 #define REGISTER
307
308 #define RMATCH(ra,rb,rc,rd,re,rw)\
309 {\
310 heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
311 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
312 frame->Xwhere = rw; /* the calling frame remembers which RMATCH to resume at */ \
313 newframe->Xeptr = ra; /* the new frame receives the "call" arguments */\
314 newframe->Xecode = rb;\
315 newframe->Xmstart = mstart;\
316 newframe->Xoffset_top = rc;\
317 newframe->Xeptrb = re;\
318 newframe->Xrdepth = frame->Xrdepth + 1;\
319 newframe->Xprevframe = frame; /* link back to the calling frame */\
320 frame = newframe;\
321 DPRINTF(("restarting from line %d\n", __LINE__));\
322 goto HEAP_RECURSE; /* run match() again with the new frame current */\
323 L_##rw: /* presumably reached from the HEAP_RETURN code, which is not visible here */\
324 DPRINTF(("jumped back to line %d\n", __LINE__));\
325 }
326
327 #define RRETURN(ra) /* drop the current heap frame, then resume the caller or really return */\
328   {\
329   heapframe *oldframe = frame;   /* the frame being abandoned */\
330   if (oldframe == NULL) return ra;   /* no frame at all: the top-level allocation failed */\
331   frame = oldframe->Xprevframe; (PUBL(stack_free))(oldframe);\
332   if (frame != NULL)   /* a calling frame exists: hand it the result in rrc */\
333     {\
334     rrc = ra;\
335     goto HEAP_RETURN;\
336     }\
337   return ra;   /* the top-level frame was released: leave match() */\
338   }
339
340
341 /* Structure for remembering the local variables in a private frame */
342
343 typedef struct heapframe { /* one frame for each simulated match() call when NO_RECURSE is defined */
344 struct heapframe *Xprevframe; /* the calling frame; NULL marks the top level */
345
346 /* Function arguments that may change (RMATCH fills these in for a new frame) */
347
348 PCRE_PUCHAR Xeptr;
349 const pcre_uchar *Xecode;
350 PCRE_PUCHAR Xmstart;
351 int Xoffset_top;
352 eptrblock *Xeptrb;
353 unsigned int Xrdepth; /* RMATCH sets this to the calling frame's depth plus one */
354
355 /* Function local variables (match() reaches them through macros named without the X) */
356
357 PCRE_PUCHAR Xcallpat;
358 #ifdef SUPPORT_UTF
359 PCRE_PUCHAR Xcharptr;
360 #endif
361 PCRE_PUCHAR Xdata;
362 PCRE_PUCHAR Xnext;
363 PCRE_PUCHAR Xpp;
364 PCRE_PUCHAR Xprev;
365 PCRE_PUCHAR Xsaved_eptr;
366
367 recursion_info Xnew_recursive;
368
369 BOOL Xcur_is_word; /* also used under the name allow_zero */
370 BOOL Xcondition; /* also used under the names cbegroup and condassert */
371 BOOL Xprev_is_word; /* also used under the name matched_once */
372
373 #ifdef SUPPORT_UCP
374 int Xprop_type;
375 int Xprop_value;
376 int Xprop_fail_result;
377 int Xoclength;
378 pcre_uchar Xocchars[6];
379 #endif
380
381 int Xcodelink; /* also used under the name code_offset */
382 int Xctype;
383 unsigned int Xfc;
384 int Xfi;
385 int Xlength;
386 int Xmax;
387 int Xmin;
388 int Xnumber; /* also used under the name foc */
389 int Xoffset;
390 int Xop;
391 int Xsave_capture_last;
392 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
393 int Xstacksave[REC_STACK_SAVE_MAX]; /* room for REC_STACK_SAVE_MAX saved offsets */
394
395 eptrblock Xnewptrb;
396
397 /* Where to jump back to (the RM number that RMATCH stores in the calling frame) */
398
399 int Xwhere;
400
401 } heapframe;
402
403 #endif
404
405
406 /***************************************************************************
407 ***************************************************************************/
408
409
410
411 /*************************************************
412 * Match from current position *
413 *************************************************/
414
415 /* This function is called recursively in many circumstances. Whenever it
416 returns a negative (error) response, the outer incarnation must also return the
417 same response. */
418
419 /* These macros pack up tests that are used for partial matching, and which
420 appear several times in the code. We set the "hit end" flag if the pointer is
421 at the end of the subject and also past the start of the subject (i.e.
422 something has been matched). For hard partial matching, we then return
423 immediately. The second one is used when we already know we are past the end of
424 the subject. */
425
426 #define CHECK_PARTIAL()\
427 if (md->partial != 0 && eptr >= md->end_subject && \
428 eptr > md->start_used_ptr) \
429 { \
430 md->hitend = TRUE; /* note that matching ran into the end of the subject */ \
431 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* hard partial matching: return at once */ \
432 }
433
434 #define SCHECK_PARTIAL()\
435 if (md->partial != 0 && eptr > md->start_used_ptr) /* no end-of-subject test: the caller already knows */ \
436 { \
437 md->hitend = TRUE; \
438 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
439 }
440
441
442 /* Performance note: It might be tempting to extract commonly used fields from
443 the md structure (e.g. utf, end_subject) into individual variables to improve
444 performance. Tests using gcc on a SPARC disproved this; in the first case, it
445 made performance worse.
446
447 Arguments:
448 eptr pointer to current character in subject
449 ecode pointer to current position in compiled code
450 mstart pointer to the current match start position (can be modified
451 by encountering \K)
452 offset_top current top pointer
453 md pointer to "static" info for the match
454 eptrb pointer to chain of blocks containing eptr at start of
455 brackets - for testing for empty matches
456 rdepth the recursion depth
457
458 Returns: MATCH_MATCH if matched ) these values are >= 0
459 MATCH_NOMATCH if failed to match )
460 a negative MATCH_xxx value for PRUNE, SKIP, etc
461 a negative PCRE_ERROR_xxx value if aborted by an error condition
462 (e.g. stopped by repeated call or recursion limit)
463 */
464
465 static int
466 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
467 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
468 unsigned int rdepth)
469 {
470 /* These variables do not need to be preserved over recursion in this function,
471 so they can be ordinary variables in all cases. Mark some of them with
472 "register" because they are used a lot in loops. */
473
474 register int rrc; /* Returns from recursive calls */
475 register int i; /* Used for loops not involving calls to RMATCH() */
476 register unsigned int c; /* Character values not kept over RMATCH() calls */
477 register BOOL utf; /* Local copy of UTF flag for speed */
478
479 BOOL minimize, possessive; /* Quantifier options */
480 BOOL caseless;
481 int condcode;
482
483 /* When recursion is not being used, all "local" variables that have to be
484 preserved over calls to RMATCH() are part of a "frame" which is obtained from
485 heap storage. Set up the top-level frame here; others are obtained from the
486 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
487
488 #ifdef NO_RECURSE
489 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
490 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
491 frame->Xprevframe = NULL; /* Marks the top level */
492
493 /* Copy in the original argument variables */
494
495 frame->Xeptr = eptr;
496 frame->Xecode = ecode;
497 frame->Xmstart = mstart;
498 frame->Xoffset_top = offset_top;
499 frame->Xeptrb = eptrb;
500 frame->Xrdepth = rdepth;
501
502 /* This is where control jumps back to to effect "recursion" */
503
504 HEAP_RECURSE:
505
506 /* Macros make the argument variables come from the current frame */
507
508 #define eptr frame->Xeptr
509 #define ecode frame->Xecode
510 #define mstart frame->Xmstart
511 #define offset_top frame->Xoffset_top
512 #define eptrb frame->Xeptrb
513 #define rdepth frame->Xrdepth
514
515 /* Ditto for the local variables */
516
517 #ifdef SUPPORT_UTF
518 #define charptr frame->Xcharptr
519 #endif
520 #define callpat frame->Xcallpat
521 #define codelink frame->Xcodelink
522 #define data frame->Xdata
523 #define next frame->Xnext
524 #define pp frame->Xpp
525 #define prev frame->Xprev
526 #define saved_eptr frame->Xsaved_eptr
527
528 #define new_recursive frame->Xnew_recursive
529
530 #define cur_is_word frame->Xcur_is_word
531 #define condition frame->Xcondition
532 #define prev_is_word frame->Xprev_is_word
533
534 #ifdef SUPPORT_UCP
535 #define prop_type frame->Xprop_type
536 #define prop_value frame->Xprop_value
537 #define prop_fail_result frame->Xprop_fail_result
538 #define oclength frame->Xoclength
539 #define occhars frame->Xocchars
540 #endif
541
542 #define ctype frame->Xctype
543 #define fc frame->Xfc
544 #define fi frame->Xfi
545 #define length frame->Xlength
546 #define max frame->Xmax
547 #define min frame->Xmin
548 #define number frame->Xnumber
549 #define offset frame->Xoffset
550 #define op frame->Xop
551 #define save_capture_last frame->Xsave_capture_last
552 #define save_offset1 frame->Xsave_offset1
553 #define save_offset2 frame->Xsave_offset2
554 #define save_offset3 frame->Xsave_offset3
555 #define stacksave frame->Xstacksave
556
557 #define newptrb frame->Xnewptrb
558
559 /* When recursion is being used, local variables are allocated on the stack and
560 get preserved during recursion in the normal way. In this environment, fi and
561 i, and fc and c, can be the same variables. */
562
563 #else /* NO_RECURSE not defined */
564 #define fi i
565 #define fc c
566
567 /* Many of the following variables are used only in small blocks of the code.
568 My normal style of coding would have declared them within each of those blocks.
569 However, in order to accommodate the version of this code that uses an external
570 "stack" implemented on the heap, it is easier to declare them all here, so the
571 declarations can be cut out in a block. The only declarations within blocks
572 below are for variables that do not have to be preserved over a recursive call
573 to RMATCH(). */
574
575 #ifdef SUPPORT_UTF
576 const pcre_uchar *charptr;
577 #endif
578 const pcre_uchar *callpat;
579 const pcre_uchar *data;
580 const pcre_uchar *next;
581 PCRE_PUCHAR pp;
582 const pcre_uchar *prev;
583 PCRE_PUCHAR saved_eptr;
584
585 recursion_info new_recursive;
586
587 BOOL cur_is_word;
588 BOOL condition;
589 BOOL prev_is_word;
590
591 #ifdef SUPPORT_UCP
592 int prop_type;
593 int prop_value;
594 int prop_fail_result;
595 int oclength;
596 pcre_uchar occhars[6];
597 #endif
598
599 int codelink;
600 int ctype;
601 int length;
602 int max;
603 int min;
604 int number;
605 int offset;
606 int op;
607 int save_capture_last;
608 int save_offset1, save_offset2, save_offset3;
609 int stacksave[REC_STACK_SAVE_MAX];
610
611 eptrblock newptrb;
612 #endif /* NO_RECURSE */
613
614 /* To save space on the stack and in the heap frame, I have doubled up on some
615 of the local variables that are used only in localised parts of the code, but
616 still need to be preserved over recursive calls of match(). These macros define
617 the alternative names that are used. */
618
619 #define allow_zero cur_is_word
620 #define cbegroup condition
621 #define code_offset codelink
622 #define condassert condition
623 #define matched_once prev_is_word
624 #define foc number
625
626 /* These statements are here to stop the compiler complaining about unitialized
627 variables. */
628
629 #ifdef SUPPORT_UCP
630 prop_value = 0;
631 prop_fail_result = 0;
632 #endif
633
634
635 /* This label is used for tail recursion, which is used in a few cases even
636 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
637 used. Thanks to Ian Taylor for noticing this possibility and sending the
638 original patch. */
639
640 TAIL_RECURSE:
641
642 /* OK, now we can get on with the real code of the function. Recursive calls
643 are specified by the macro RMATCH and RRETURN is used to return. When
644 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
645 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
646 defined). However, RMATCH isn't like a function call because it's quite a
647 complicated macro. It has to be used in one particular way. This shouldn't,
648 however, impact performance when true recursion is being used. */
649
650 #ifdef SUPPORT_UTF
651 utf = md->utf; /* Local copy of the flag */
652 #else
653 utf = FALSE;
654 #endif
655
656 /* First check that we haven't called match() too many times, or that we
657 haven't exceeded the recursive call limit. */
658
659 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
660 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
661
662 /* At the start of a group with an unlimited repeat that may match an empty
663 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
664 done this way to save having to use another function argument, which would take
665 up space on the stack. See also MATCH_CONDASSERT below.
666
667 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
668 such remembered pointers, to be checked when we hit the closing ket, in order
669 to break infinite loops that match no characters. When match() is called in
670 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
671 NOT be used with tail recursion, because the memory block that is used is on
672 the stack, so a new one may be required for each match(). */
673
674 if (md->match_function_type == MATCH_CBEGROUP)
675 {
676 newptrb.epb_saved_eptr = eptr;
677 newptrb.epb_prev = eptrb;
678 eptrb = &newptrb;
679 md->match_function_type = 0;
680 }
681
682 /* Now start processing the opcodes. */
683
684 for (;;)
685 {
686 minimize = possessive = FALSE;
687 op = *ecode;
688
689 switch(op)
690 {
691 case OP_MARK:
692 md->nomatch_mark = ecode + 2;
693 md->mark = NULL; /* In case previously set by assertion */
694 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
695 eptrb, RM55);
696 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
697 md->mark == NULL) md->mark = ecode + 2;
698
699 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
700 argument, and we must check whether that argument matches this MARK's
701 argument. It is passed back in md->start_match_ptr (an overloading of that
702 variable). If it does match, we reset that variable to the current subject
703 position and return MATCH_SKIP. Otherwise, pass back the return code
704 unaltered. */
705
706 else if (rrc == MATCH_SKIP_ARG &&
707 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
708 {
709 md->start_match_ptr = eptr;
710 RRETURN(MATCH_SKIP);
711 }
712 RRETURN(rrc);
713
714 case OP_FAIL:
715 RRETURN(MATCH_NOMATCH);
716
717 /* COMMIT overrides PRUNE, SKIP, and THEN */
718
719 case OP_COMMIT:
720 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
721 eptrb, RM52);
722 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
723 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
724 rrc != MATCH_THEN)
725 RRETURN(rrc);
726 RRETURN(MATCH_COMMIT);
727
728 /* PRUNE overrides THEN */
729
730 case OP_PRUNE:
731 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
732 eptrb, RM51);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
734 RRETURN(MATCH_PRUNE);
735
736 case OP_PRUNE_ARG:
737 md->nomatch_mark = ecode + 2;
738 md->mark = NULL; /* In case previously set by assertion */
739 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
740 eptrb, RM56);
741 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
742 md->mark == NULL) md->mark = ecode + 2;
743 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
744 RRETURN(MATCH_PRUNE);
745
746 /* SKIP overrides PRUNE and THEN */
747
748 case OP_SKIP:
749 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
750 eptrb, RM53);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
752 RRETURN(rrc);
753 md->start_match_ptr = eptr; /* Pass back current position */
754 RRETURN(MATCH_SKIP);
755
756 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
757 nomatch_mark. There is a flag that disables this opcode when re-matching a
758 pattern that ended with a SKIP for which there was not a matching MARK. */
759
760 case OP_SKIP_ARG:
761 if (md->ignore_skip_arg)
762 {
763 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
764 break;
765 }
766 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
767 eptrb, RM57);
768 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
769 RRETURN(rrc);
770
771 /* Pass back the current skip name by overloading md->start_match_ptr and
772 returning the special MATCH_SKIP_ARG return code. This will either be
773 caught by a matching MARK, or get to the top, where it causes a rematch
774 with the md->ignore_skip_arg flag set. */
775
776 md->start_match_ptr = ecode + 2;
777 RRETURN(MATCH_SKIP_ARG);
778
779 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
780 the branch in which it occurs can be determined. Overload the start of
781 match pointer to do this. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode;
788 RRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 md->nomatch_mark = ecode + 2;
792 md->mark = NULL; /* In case previously set by assertion */
793 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
794 md, eptrb, RM58);
795 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
796 md->mark == NULL) md->mark = ecode + 2;
797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
798 md->start_match_ptr = ecode;
799 RRETURN(MATCH_THEN);
800
801 /* Handle an atomic group that does not contain any capturing parentheses.
802 This can be handled like an assertion. Prior to 8.13, all atomic groups
803 were handled this way. In 8.13, the code was changed as below for ONCE, so
804 that backups pass through the group and thereby reset captured values.
805 However, this uses a lot more stack, so in 8.20, atomic groups that do not
806 contain any captures generate OP_ONCE_NC, which can be handled in the old,
807 less stack intensive way.
808
809 Check the alternative branches in turn - the matching won't pass the KET
810 for this kind of subpattern. If any one branch matches, we carry on as at
811 the end of a normal bracket, leaving the subject pointer, but resetting
812 the start-of-match value in case it was changed by \K. */
813
814 case OP_ONCE_NC:
815 prev = ecode;
816 saved_eptr = eptr;
817 do
818 {
819 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
820 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
821 {
822 mstart = md->start_match_ptr;
823 break;
824 }
825 if (rrc == MATCH_THEN)
826 {
827 next = ecode + GET(ecode,1);
828 if (md->start_match_ptr < next &&
829 (*ecode == OP_ALT || *next == OP_ALT))
830 rrc = MATCH_NOMATCH;
831 }
832
833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
834 ecode += GET(ecode,1);
835 }
836 while (*ecode == OP_ALT);
837
838 /* If hit the end of the group (which could be repeated), fail */
839
840 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
841
842 /* Continue as from after the group, updating the offsets high water
843 mark, since extracts may have been taken. */
844
845 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
846
847 offset_top = md->end_offset_top;
848 eptr = md->end_match_ptr;
849
850 /* For a non-repeating ket, just continue at this level. This also
851 happens for a repeating ket if no characters were matched in the group.
852 This is the forcible breaking of infinite loops as implemented in Perl
853 5.005. */
854
855 if (*ecode == OP_KET || eptr == saved_eptr)
856 {
857 ecode += 1+LINK_SIZE;
858 break;
859 }
860
861 /* The repeating kets try the rest of the pattern or restart from the
862 preceding bracket, in the appropriate order. The second "call" of match()
863 uses tail recursion, to avoid using another stack frame. */
864
865 if (*ecode == OP_KETRMIN)
866 {
867 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
869 ecode = prev;
870 goto TAIL_RECURSE;
871 }
872 else /* OP_KETRMAX */
873 {
874 md->match_function_type = MATCH_CBEGROUP;
875 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
877 ecode += 1 + LINK_SIZE;
878 goto TAIL_RECURSE;
879 }
880 /* Control never gets here */
881
882 /* Handle a capturing bracket, other than those that are possessive with an
883 unlimited repeat. If there is space in the offset vector, save the current
884 subject position in the working slot at the top of the vector. We mustn't
885 change the current values of the data slot, because they may be set from a
886 previous iteration of this group, and be referred to by a reference inside
887 the group. A failure to match might occur after the group has succeeded,
888 if something later on doesn't match. For this reason, we need to restore
889 the working value and also the values of the final offsets, in case they
890 were set by a previous iteration of the same bracket.
891
892 If there isn't enough space in the offset vector, treat this as if it were
893 a non-capturing bracket. Don't worry about setting the flag for the error
894 case here; that is handled in the code for KET. */
895
896 case OP_CBRA:
897 case OP_SCBRA:
898 number = GET2(ecode, 1+LINK_SIZE);
899 offset = number << 1;
900
901 #ifdef PCRE_DEBUG
902 printf("start bracket %d\n", number);
903 printf("subject=");
904 pchars(eptr, 16, TRUE, md);
905 printf("\n");
906 #endif
907
908 if (offset < md->offset_max)
909 {
910 save_offset1 = md->offset_vector[offset];
911 save_offset2 = md->offset_vector[offset+1];
912 save_offset3 = md->offset_vector[md->offset_end - number];
913 save_capture_last = md->capture_last;
914
915 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
916 md->offset_vector[md->offset_end - number] =
917 (int)(eptr - md->start_subject);
918
919 for (;;)
920 {
921 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
922 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
923 eptrb, RM1);
924 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
925
926 /* If we backed up to a THEN, check whether it is within the current
927 branch by comparing the address of the THEN that is passed back with
928 the end of the branch. If it is within the current branch, and the
929 branch is one of two or more alternatives (it either starts or ends
930 with OP_ALT), we have reached the limit of THEN's action, so convert
931 the return code to NOMATCH, which will cause normal backtracking to
932 happen from now on. Otherwise, THEN is passed back to an outer
933 alternative. This implements Perl's treatment of parenthesized groups,
934 where a group not containing | does not affect the current alternative,
935 that is, (X) is NOT the same as (X|(*F)). */
936
937 if (rrc == MATCH_THEN)
938 {
939 next = ecode + GET(ecode,1);
940 if (md->start_match_ptr < next &&
941 (*ecode == OP_ALT || *next == OP_ALT))
942 rrc = MATCH_NOMATCH;
943 }
944
945 /* Anything other than NOMATCH is passed back. */
946
947 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
948 md->capture_last = save_capture_last;
949 ecode += GET(ecode, 1);
950 if (*ecode != OP_ALT) break;
951 }
952
953 DPRINTF(("bracket %d failed\n", number));
954 md->offset_vector[offset] = save_offset1;
955 md->offset_vector[offset+1] = save_offset2;
956 md->offset_vector[md->offset_end - number] = save_offset3;
957
958 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
959
960 RRETURN(rrc);
961 }
962
963 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
964 as a non-capturing bracket. */
965
966 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
967 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
968
969 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
970
971 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
973
974 /* Non-capturing or atomic group, except for possessive with unlimited
975 repeat and ONCE group with no captures. Loop for all the alternatives.
976
977 When we get to the final alternative within the brackets, we used to return
978 the result of a recursive call to match() whatever happened so it was
979 possible to reduce stack usage by turning this into a tail recursion,
980 except in the case of a possibly empty group. However, now that there is
981 the possibility of (*THEN) occurring in the final alternative, this
982 optimization is no longer always possible.
983
984 We can optimize if we know there are no (*THEN)s in the pattern; at present
985 this is the best that can be done.
986
987 MATCH_ONCE is returned when the end of an atomic group is successfully
988 reached, but subsequent matching fails. It passes back up the tree (causing
989 captured values to be reset) until the original atomic group level is
990 reached. This is tested by comparing md->once_target with the start of the
991 group. At this point, the return is converted into MATCH_NOMATCH so that
992 previous backup points can be taken. */
993
994 case OP_ONCE:
995 case OP_BRA:
996 case OP_SBRA:
997 DPRINTF(("start non-capturing bracket\n"));
998
999 for (;;)
1000 {
1001 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1002
1003 /* If this is not a possibly empty group, and there are no (*THEN)s in
1004 the pattern, and this is the final alternative, optimize as described
1005 above. */
1006
1007 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1008 {
1009 ecode += PRIV(OP_lengths)[*ecode];
1010 goto TAIL_RECURSE;
1011 }
1012
1013 /* In all other cases, we have to make another call to match(). */
1014
1015 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1016 RM2);
1017
1018 /* See comment in the code for capturing groups above about handling
1019 THEN. */
1020
1021 if (rrc == MATCH_THEN)
1022 {
1023 next = ecode + GET(ecode,1);
1024 if (md->start_match_ptr < next &&
1025 (*ecode == OP_ALT || *next == OP_ALT))
1026 rrc = MATCH_NOMATCH;
1027 }
1028
1029 if (rrc != MATCH_NOMATCH)
1030 {
1031 if (rrc == MATCH_ONCE)
1032 {
1033 const pcre_uchar *scode = ecode;
1034 if (*scode != OP_ONCE) /* If not at start, find it */
1035 {
1036 while (*scode == OP_ALT) scode += GET(scode, 1);
1037 scode -= GET(scode, 1);
1038 }
1039 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1040 }
1041 RRETURN(rrc);
1042 }
1043 ecode += GET(ecode, 1);
1044 if (*ecode != OP_ALT) break;
1045 }
1046
1047 RRETURN(MATCH_NOMATCH);
1048
1049 /* Handle possessive capturing brackets with an unlimited repeat. We come
1050 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1051 handled similarly to the normal case above. However, the matching is
1052 different. The end of these brackets will always be OP_KETRPOS, which
1053 returns MATCH_KETRPOS without going further in the pattern. By this means
1054 we can handle the group by iteration rather than recursion, thereby
1055 reducing the amount of stack needed. */
1056
1057 case OP_CBRAPOS:
1058 case OP_SCBRAPOS:
1059 allow_zero = FALSE;
1060
1061 POSSESSIVE_CAPTURE:
1062 number = GET2(ecode, 1+LINK_SIZE);
1063 offset = number << 1;
1064
1065 #ifdef PCRE_DEBUG
1066 printf("start possessive bracket %d\n", number);
1067 printf("subject=");
1068 pchars(eptr, 16, TRUE, md);
1069 printf("\n");
1070 #endif
1071
1072 if (offset < md->offset_max)
1073 {
1074 matched_once = FALSE;
1075 code_offset = (int)(ecode - md->start_code);
1076
1077 save_offset1 = md->offset_vector[offset];
1078 save_offset2 = md->offset_vector[offset+1];
1079 save_offset3 = md->offset_vector[md->offset_end - number];
1080 save_capture_last = md->capture_last;
1081
1082 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1083
1084 /* Each time round the loop, save the current subject position for use
1085 when the group matches. For MATCH_MATCH, the group has matched, so we
1086 restart it with a new subject starting position, remembering that we had
1087 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1088 usual. If we haven't matched any alternatives in any iteration, check to
1089 see if a previous iteration matched. If so, the group has matched;
1090 continue from afterwards. Otherwise it has failed; restore the previous
1091 capture values before returning NOMATCH. */
1092
1093 for (;;)
1094 {
1095 md->offset_vector[md->offset_end - number] =
1096 (int)(eptr - md->start_subject);
1097 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1098 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1099 eptrb, RM63);
1100 if (rrc == MATCH_KETRPOS)
1101 {
1102 offset_top = md->end_offset_top;
1103 eptr = md->end_match_ptr;
1104 ecode = md->start_code + code_offset;
1105 save_capture_last = md->capture_last;
1106 matched_once = TRUE;
1107 continue;
1108 }
1109
1110 /* See comment in the code for capturing groups above about handling
1111 THEN. */
1112
1113 if (rrc == MATCH_THEN)
1114 {
1115 next = ecode + GET(ecode,1);
1116 if (md->start_match_ptr < next &&
1117 (*ecode == OP_ALT || *next == OP_ALT))
1118 rrc = MATCH_NOMATCH;
1119 }
1120
1121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122 md->capture_last = save_capture_last;
1123 ecode += GET(ecode, 1);
1124 if (*ecode != OP_ALT) break;
1125 }
1126
1127 if (!matched_once)
1128 {
1129 md->offset_vector[offset] = save_offset1;
1130 md->offset_vector[offset+1] = save_offset2;
1131 md->offset_vector[md->offset_end - number] = save_offset3;
1132 }
1133
1134 if (allow_zero || matched_once)
1135 {
1136 ecode += 1 + LINK_SIZE;
1137 break;
1138 }
1139
1140 RRETURN(MATCH_NOMATCH);
1141 }
1142
1143 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1144 as a non-capturing bracket. */
1145
1146 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1147 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1148
1149 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1150
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1153
1154 /* Non-capturing possessive bracket with unlimited repeat. We come here
1155 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1156 without the capturing complication. It is written out separately for speed
1157 and cleanliness. */
1158
1159 case OP_BRAPOS:
1160 case OP_SBRAPOS:
1161 allow_zero = FALSE;
1162
1163 POSSESSIVE_NON_CAPTURE:
1164 matched_once = FALSE;
1165 code_offset = (int)(ecode - md->start_code);
1166
1167 for (;;)
1168 {
1169 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1170 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1171 eptrb, RM48);
1172 if (rrc == MATCH_KETRPOS)
1173 {
1174 offset_top = md->end_offset_top;
1175 eptr = md->end_match_ptr;
1176 ecode = md->start_code + code_offset;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 ecode += GET(ecode, 1);
1194 if (*ecode != OP_ALT) break;
1195 }
1196
1197 if (matched_once || allow_zero)
1198 {
1199 ecode += 1 + LINK_SIZE;
1200 break;
1201 }
1202 RRETURN(MATCH_NOMATCH);
1203
1204 /* Control never reaches here. */
1205
1206 /* Conditional group: compilation checked that there are no more than
1207 two branches. If the condition is false, skipping the first branch takes us
1208 past the end if there is only one branch, but that's OK because that is
1209 exactly what going to the ket would do. */
1210
1211 case OP_COND:
1212 case OP_SCOND:
1213 codelink = GET(ecode, 1);
1214
1215 /* Because of the way auto-callout works during compile, a callout item is
1216 inserted between OP_COND and an assertion condition. */
1217
1218 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1219 {
1220 if (PUBL(callout) != NULL)
1221 {
1222 pcre_callout_block cb;
1223 cb.version = 2; /* Version 2 of the callout block */
1224 cb.callout_number = ecode[LINK_SIZE+2];
1225 cb.offset_vector = md->offset_vector;
1226 cb.subject = (PCRE_SPTR)md->start_subject;
1227 cb.subject_length = (int)(md->end_subject - md->start_subject);
1228 cb.start_match = (int)(mstart - md->start_subject);
1229 cb.current_position = (int)(eptr - md->start_subject);
1230 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1231 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1232 cb.capture_top = offset_top/2;
1233 cb.capture_last = md->capture_last;
1234 cb.callout_data = md->callout_data;
1235 cb.mark = md->nomatch_mark;
1236 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1237 if (rrc < 0) RRETURN(rrc);
1238 }
1239 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1240 }
1241
1242 condcode = ecode[LINK_SIZE+1];
1243
1244 /* Now see what the actual condition is */
1245
1246 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1247 {
1248 if (md->recursive == NULL) /* Not recursing => FALSE */
1249 {
1250 condition = FALSE;
1251 ecode += GET(ecode, 1);
1252 }
1253 else
1254 {
1255 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1256 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1257
1258 /* If the test is for recursion into a specific subpattern, and it is
1259 false, but the test was set up by name, scan the table to see if the
1260 name refers to any other numbers, and test them. The condition is true
1261 if any one is set. */
1262
1263 if (!condition && condcode == OP_NRREF)
1264 {
1265 pcre_uchar *slotA = md->name_table;
1266 for (i = 0; i < md->name_count; i++)
1267 {
1268 if (GET2(slotA, 0) == recno) break;
1269 slotA += md->name_entry_size;
1270 }
1271
1272 /* Found a name for the number - there can be only one; duplicate
1273 names for different numbers are allowed, but not vice versa. First
1274 scan down for duplicates. */
1275
1276 if (i < md->name_count)
1277 {
1278 pcre_uchar *slotB = slotA;
1279 while (slotB > md->name_table)
1280 {
1281 slotB -= md->name_entry_size;
1282 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1283 {
1284 condition = GET2(slotB, 0) == md->recursive->group_num;
1285 if (condition) break;
1286 }
1287 else break;
1288 }
1289
1290 /* Scan up for duplicates */
1291
1292 if (!condition)
1293 {
1294 slotB = slotA;
1295 for (i++; i < md->name_count; i++)
1296 {
1297 slotB += md->name_entry_size;
1298 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1299 {
1300 condition = GET2(slotB, 0) == md->recursive->group_num;
1301 if (condition) break;
1302 }
1303 else break;
1304 }
1305 }
1306 }
1307 }
1308
1309 /* Choose branch according to the condition */
1310
1311 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1312 }
1313 }
1314
1315 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1316 {
1317 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1318 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1319
1320 /* If the numbered capture is unset, but the reference was by name,
1321 scan the table to see if the name refers to any other numbers, and test
1322 them. The condition is true if any one is set. This is tediously similar
1323 to the code above, but not close enough to try to amalgamate. */
1324
1325 if (!condition && condcode == OP_NCREF)
1326 {
1327 int refno = offset >> 1;
1328 pcre_uchar *slotA = md->name_table;
1329
1330 for (i = 0; i < md->name_count; i++)
1331 {
1332 if (GET2(slotA, 0) == refno) break;
1333 slotA += md->name_entry_size;
1334 }
1335
1336 /* Found a name for the number - there can be only one; duplicate names
1337 for different numbers are allowed, but not vice versa. First scan down
1338 for duplicates. */
1339
1340 if (i < md->name_count)
1341 {
1342 pcre_uchar *slotB = slotA;
1343 while (slotB > md->name_table)
1344 {
1345 slotB -= md->name_entry_size;
1346 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1347 {
1348 offset = GET2(slotB, 0) << 1;
1349 condition = offset < offset_top &&
1350 md->offset_vector[offset] >= 0;
1351 if (condition) break;
1352 }
1353 else break;
1354 }
1355
1356 /* Scan up for duplicates */
1357
1358 if (!condition)
1359 {
1360 slotB = slotA;
1361 for (i++; i < md->name_count; i++)
1362 {
1363 slotB += md->name_entry_size;
1364 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1365 {
1366 offset = GET2(slotB, 0) << 1;
1367 condition = offset < offset_top &&
1368 md->offset_vector[offset] >= 0;
1369 if (condition) break;
1370 }
1371 else break;
1372 }
1373 }
1374 }
1375 }
1376
1377 /* Choose branch according to the condition */
1378
1379 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1380 }
1381
1382 else if (condcode == OP_DEF) /* DEFINE - always false */
1383 {
1384 condition = FALSE;
1385 ecode += GET(ecode, 1);
1386 }
1387
1388 /* The condition is an assertion. Call match() to evaluate it - setting
1389 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1390 an assertion. */
1391
1392 else
1393 {
1394 md->match_function_type = MATCH_CONDASSERT;
1395 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1396 if (rrc == MATCH_MATCH)
1397 {
1398 if (md->end_offset_top > offset_top)
1399 offset_top = md->end_offset_top; /* Captures may have happened */
1400 condition = TRUE;
1401 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1402 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1403 }
1404
1405 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1406 assertion; it is therefore treated as NOMATCH. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 else
1413 {
1414 condition = FALSE;
1415 ecode += codelink;
1416 }
1417 }
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, can
1420 use tail recursion to avoid using another stack frame, except when there is
1421 unlimited repeat of a possibly empty group. In the latter case, a recursive
1422 call to match() is always required, unless the second alternative doesn't
1423 exist, in which case we can just plough on. Note that, for compatibility
1424 with Perl, the | in a conditional group is NOT treated as creating two
1425 alternatives. If a THEN is encountered in the branch, it propagates out to
1426 the enclosing alternative (unless nested in a deeper set of alternatives,
1427 of course). */
1428
1429 if (condition || *ecode == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 ecode += 1 + LINK_SIZE;
1434 goto TAIL_RECURSE;
1435 }
1436
1437 md->match_function_type = MATCH_CBEGROUP;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1439 RRETURN(rrc);
1440 }
1441
1442 /* Condition false & no alternative; continue after the group. */
1443
1444 else
1445 {
1446 ecode += 1 + LINK_SIZE;
1447 }
1448 break;
1449
1450
1451 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1452 to close any currently open capturing brackets. */
1453
1454 case OP_CLOSE:
1455 number = GET2(ecode, 1);
1456 offset = number << 1;
1457
1458 #ifdef PCRE_DEBUG
1459 printf("end bracket %d at *ACCEPT", number);
1460 printf("\n");
1461 #endif
1462
1463 md->capture_last = number;
1464 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1465 {
1466 md->offset_vector[offset] =
1467 md->offset_vector[md->offset_end - number];
1468 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1469 if (offset_top <= offset) offset_top = offset + 2;
1470 }
1471 ecode += 1 + IMM2_SIZE;
1472 break;
1473
1474
1475 /* End of the pattern, either real or forced. */
1476
1477 case OP_END:
1478 case OP_ACCEPT:
1479 case OP_ASSERT_ACCEPT:
1480
1481 /* If we have matched an empty string, fail if not in an assertion and not
1482 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1483 is set and we have matched at the start of the subject. In both cases,
1484 backtracking will then try other alternatives, if any. */
1485
1486 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1487 md->recursive == NULL &&
1488 (md->notempty ||
1489 (md->notempty_atstart &&
1490 mstart == md->start_subject + md->start_offset)))
1491 RRETURN(MATCH_NOMATCH);
1492
1493 /* Otherwise, we have a match. */
1494
1495 md->end_match_ptr = eptr; /* Record where we ended */
1496 md->end_offset_top = offset_top; /* and how many extracts were taken */
1497 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1498
1499 /* For some reason, the macros don't work properly if an expression is
1500 given as the argument to RRETURN when the heap is in use. */
1501
1502 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1503 RRETURN(rrc);
1504
1505 /* Assertion brackets. Check the alternative branches in turn - the
1506 matching won't pass the KET for an assertion. If any one branch matches,
1507 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1508 start of each branch to move the current point backwards, so the code at
1509 this level is identical to the lookahead case. When the assertion is part
1510 of a condition, we want to return immediately afterwards. The caller of
1511 this incarnation of the match() function will have set MATCH_CONDASSERT in
1512 md->match_function_type, and one of these opcodes will be the first opcode
1513 that is processed. We use a local variable that is preserved over calls to
1514 match() to remember this case. */
1515
1516 case OP_ASSERT:
1517 case OP_ASSERTBACK:
1518 if (md->match_function_type == MATCH_CONDASSERT)
1519 {
1520 condassert = TRUE;
1521 md->match_function_type = 0;
1522 }
1523 else condassert = FALSE;
1524
1525 do
1526 {
1527 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1528 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1529 {
1530 mstart = md->start_match_ptr; /* In case \K reset it */
1531 break;
1532 }
1533
1534 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1535 as NOMATCH. */
1536
1537 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1538 ecode += GET(ecode, 1);
1539 }
1540 while (*ecode == OP_ALT);
1541
1542 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1543
1544 /* If checking an assertion for a condition, return MATCH_MATCH. */
1545
1546 if (condassert) RRETURN(MATCH_MATCH);
1547
1548 /* Continue from after the assertion, updating the offsets high water
1549 mark, since extracts may have been taken during the assertion. */
1550
1551 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1552 ecode += 1 + LINK_SIZE;
1553 offset_top = md->end_offset_top;
1554 continue;
1555
1556 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1557 PRUNE, or COMMIT means we must assume failure without checking subsequent
1558 branches. */
1559
1560 case OP_ASSERT_NOT:
1561 case OP_ASSERTBACK_NOT:
1562 if (md->match_function_type == MATCH_CONDASSERT)
1563 {
1564 condassert = TRUE;
1565 md->match_function_type = 0;
1566 }
1567 else condassert = FALSE;
1568
1569 do
1570 {
1571 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1572 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1573 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1574 {
1575 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1576 break;
1577 }
1578
1579 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1580 as NOMATCH. */
1581
1582 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1583 ecode += GET(ecode,1);
1584 }
1585 while (*ecode == OP_ALT);
1586
1587 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1588
1589 ecode += 1 + LINK_SIZE;
1590 continue;
1591
1592 /* Move the subject pointer back. This occurs only at the start of
1593 each branch of a lookbehind assertion. If we are too close to the start to
1594 move back, this match function fails. When working with UTF-8 we move
1595 back a number of characters, not bytes. */
1596
1597 case OP_REVERSE:
1598 #ifdef SUPPORT_UTF
1599 if (utf)
1600 {
1601 i = GET(ecode, 1);
1602 while (i-- > 0)
1603 {
1604 eptr--;
1605 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1606 BACKCHAR(eptr);
1607 }
1608 }
1609 else
1610 #endif
1611
1612 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1613
1614 {
1615 eptr -= GET(ecode, 1);
1616 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1617 }
1618
1619 /* Save the earliest consulted character, then skip to next op code */
1620
1621 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1622 ecode += 1 + LINK_SIZE;
1623 break;
1624
1625 /* The callout item calls an external function, if one is provided, passing
1626 details of the match so far. This is mainly for debugging, though the
1627 function is able to force a failure. */
1628
1629 case OP_CALLOUT:
1630 if (PUBL(callout) != NULL)
1631 {
1632 pcre_callout_block cb;
1633 cb.version = 2; /* Version 2 of the callout block */
1634 cb.callout_number = ecode[1];
1635 cb.offset_vector = md->offset_vector;
1636 cb.subject = (PCRE_SPTR)md->start_subject;
1637 cb.subject_length = (int)(md->end_subject - md->start_subject);
1638 cb.start_match = (int)(mstart - md->start_subject);
1639 cb.current_position = (int)(eptr - md->start_subject);
1640 cb.pattern_position = GET(ecode, 2);
1641 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1642 cb.capture_top = offset_top/2;
1643 cb.capture_last = md->capture_last;
1644 cb.callout_data = md->callout_data;
1645 cb.mark = md->nomatch_mark;
1646 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1647 if (rrc < 0) RRETURN(rrc);
1648 }
1649 ecode += 2 + 2*LINK_SIZE;
1650 break;
1651
1652 /* Recursion either matches the current regex, or some subexpression. The
1653 offset data is the offset to the starting bracket from the start of the
1654 whole pattern. (This is so that it works from duplicated subpatterns.)
1655
1656 The state of the capturing groups is preserved over recursion, and
1657 re-instated afterwards. We don't know how many are started and not yet
1658 finished (offset_top records the completed total) so we just have to save
1659 all the potential data. There may be up to 65535 such values, which is too
1660 large to put on the stack, but using malloc for small numbers seems
1661 expensive. As a compromise, the stack is used when there are no more than
1662 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1663
1664 There are also other values that have to be saved. We use a chained
1665 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1666 for the original version of this logic. It has, however, been hacked around
1667 a lot, so he is not to blame for the current way it works. */
1668
1669 case OP_RECURSE:
1670 {
1671 recursion_info *ri;
1672 int recno;
1673
1674 callpat = md->start_code + GET(ecode, 1);
1675 recno = (callpat == md->start_code)? 0 :
1676 GET2(callpat, 1 + LINK_SIZE);
1677
1678 /* Check for repeating a recursion without advancing the subject pointer.
1679 This should catch convoluted mutual recursions. (Some simple cases are
1680 caught at compile time.) */
1681
1682 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1683 if (recno == ri->group_num && eptr == ri->subject_position)
1684 RRETURN(PCRE_ERROR_RECURSELOOP);
1685
1686 /* Add to "recursing stack" */
1687
1688 new_recursive.group_num = recno;
1689 new_recursive.subject_position = eptr;
1690 new_recursive.prevrec = md->recursive;
1691 md->recursive = &new_recursive;
1692
1693 /* Where to continue from afterwards */
1694
1695 ecode += 1 + LINK_SIZE;
1696
1697 /* Now save the offset data */
1698
1699 new_recursive.saved_max = md->offset_end;
1700 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1701 new_recursive.offset_save = stacksave;
1702 else
1703 {
1704 new_recursive.offset_save =
1705 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1706 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1707 }
1708 memcpy(new_recursive.offset_save, md->offset_vector,
1709 new_recursive.saved_max * sizeof(int));
1710
1711 /* OK, now we can do the recursion. After processing each alternative,
1712 restore the offset data. If there were nested recursions, md->recursive
1713 might be changed, so reset it before looping. */
1714
1715 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1716 cbegroup = (*callpat >= OP_SBRA);
1717 do
1718 {
1719 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1720 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1721 md, eptrb, RM6);
1722 memcpy(md->offset_vector, new_recursive.offset_save,
1723 new_recursive.saved_max * sizeof(int));
1724 md->recursive = new_recursive.prevrec;
1725 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1726 {
1727 DPRINTF(("Recursion matched\n"));
1728 if (new_recursive.offset_save != stacksave)
1729 (PUBL(free))(new_recursive.offset_save);
1730
1731 /* Set where we got to in the subject, and reset the start in case
1732 it was changed by \K. This *is* propagated back out of a recursion,
1733 for Perl compatibility. */
1734
1735 eptr = md->end_match_ptr;
1736 mstart = md->start_match_ptr;
1737 goto RECURSION_MATCHED; /* Exit loop; end processing */
1738 }
1739
1740 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1741 as NOMATCH. */
1742
1743 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1744 {
1745 DPRINTF(("Recursion gave error %d\n", rrc));
1746 if (new_recursive.offset_save != stacksave)
1747 (PUBL(free))(new_recursive.offset_save);
1748 RRETURN(rrc);
1749 }
1750
1751 md->recursive = &new_recursive;
1752 callpat += GET(callpat, 1);
1753 }
1754 while (*callpat == OP_ALT);
1755
1756 DPRINTF(("Recursion didn't match\n"));
1757 md->recursive = new_recursive.prevrec;
1758 if (new_recursive.offset_save != stacksave)
1759 (PUBL(free))(new_recursive.offset_save);
1760 RRETURN(MATCH_NOMATCH);
1761 }
1762
1763 RECURSION_MATCHED:
1764 break;
1765
1766 /* An alternation is the end of a branch; scan along to find the end of the
1767 bracketed group and go to there. */
1768
1769 case OP_ALT:
1770 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1771 break;
1772
1773 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1774 indicating that it may occur zero times. It may repeat infinitely, or not
1775 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1776 with fixed upper repeat limits are compiled as a number of copies, with the
1777 optional ones preceded by BRAZERO or BRAMINZERO. */
1778
1779 case OP_BRAZERO:
1780 next = ecode + 1;
1781 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1783 do next += GET(next, 1); while (*next == OP_ALT);
1784 ecode = next + 1 + LINK_SIZE;
1785 break;
1786
1787 case OP_BRAMINZERO:
1788 next = ecode + 1;
1789 do next += GET(next, 1); while (*next == OP_ALT);
1790 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1792 ecode++;
1793 break;
1794
1795 case OP_SKIPZERO:
1796 next = ecode+1;
1797 do next += GET(next,1); while (*next == OP_ALT);
1798 ecode = next + 1 + LINK_SIZE;
1799 break;
1800
1801 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1802 here; just jump to the group, with allow_zero set TRUE. */
1803
1804 case OP_BRAPOSZERO:
1805 op = *(++ecode);
1806 allow_zero = TRUE;
1807 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1808 goto POSSESSIVE_NON_CAPTURE;
1809
1810 /* End of a group, repeated or non-repeating. */
1811
1812 case OP_KET:
1813 case OP_KETRMIN:
1814 case OP_KETRMAX:
1815 case OP_KETRPOS:
1816 prev = ecode - GET(ecode, 1);
1817
1818 /* If this was a group that remembered the subject start, in order to break
1819 infinite repeats of empty string matches, retrieve the subject start from
1820 the chain. Otherwise, set it NULL. */
1821
1822 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1823 {
1824 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1825 eptrb = eptrb->epb_prev; /* Backup to previous group */
1826 }
1827 else saved_eptr = NULL;
1828
1829 /* If we are at the end of an assertion group or a non-capturing atomic
1830 group, stop matching and return MATCH_MATCH, but record the current high
1831 water mark for use by positive assertions. We also need to record the match
1832 start in case it was changed by \K. */
1833
1834 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1835 *prev == OP_ONCE_NC)
1836 {
1837 md->end_match_ptr = eptr; /* For ONCE_NC */
1838 md->end_offset_top = offset_top;
1839 md->start_match_ptr = mstart;
1840 RRETURN(MATCH_MATCH); /* Sets md->mark */
1841 }
1842
1843 /* For capturing groups we have to check the group number back at the start
1844 and if necessary complete handling an extraction by setting the offsets and
1845 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1846 into group 0, so it won't be picked up here. Instead, we catch it when the
1847 OP_END is reached. Other recursion is handled here. We just have to record
1848 the current subject position and start match pointer and give a MATCH
1849 return. */
1850
1851 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1852 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1853 {
1854 number = GET2(prev, 1+LINK_SIZE);
1855 offset = number << 1;
1856
1857 #ifdef PCRE_DEBUG
1858 printf("end bracket %d", number);
1859 printf("\n");
1860 #endif
1861
1862 /* Handle a recursively called group. */
1863
1864 if (md->recursive != NULL && md->recursive->group_num == number)
1865 {
1866 md->end_match_ptr = eptr;
1867 md->start_match_ptr = mstart;
1868 RRETURN(MATCH_MATCH);
1869 }
1870
1871 /* Deal with capturing */
1872
1873 md->capture_last = number;
1874 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1875 {
1876 /* If offset is greater than offset_top, it means that we are
1877 "skipping" a capturing group, and that group's offsets must be marked
1878 unset. In earlier versions of PCRE, all the offsets were unset at the
1879 start of matching, but this doesn't work because atomic groups and
1880 assertions can cause a value to be set that should later be unset.
1881 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1882 part of the atomic group, but this is not on the final matching path,
1883 so must be unset when 2 is set. (If there is no group 2, there is no
1884 problem, because offset_top will then be 2, indicating no capture.) */
1885
1886 if (offset > offset_top)
1887 {
1888 register int *iptr = md->offset_vector + offset_top;
1889 register int *iend = md->offset_vector + offset;
1890 while (iptr < iend) *iptr++ = -1;
1891 }
1892
1893 /* Now make the extraction */
1894
1895 md->offset_vector[offset] =
1896 md->offset_vector[md->offset_end - number];
1897 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1898 if (offset_top <= offset) offset_top = offset + 2;
1899 }
1900 }
1901
1902 /* For an ordinary non-repeating ket, just continue at this level. This
1903 also happens for a repeating ket if no characters were matched in the
1904 group. This is the forcible breaking of infinite loops as implemented in
1905 Perl 5.005. For a non-repeating atomic group that includes captures,
1906 establish a backup point by processing the rest of the pattern at a lower
1907 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1908 original OP_ONCE level, thereby bypassing intermediate backup points, but
1909 resetting any captures that happened along the way. */
1910
1911 if (*ecode == OP_KET || eptr == saved_eptr)
1912 {
1913 if (*prev == OP_ONCE)
1914 {
1915 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1917 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1918 RRETURN(MATCH_ONCE);
1919 }
1920 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1921 break;
1922 }
1923
1924 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1925 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1926 at a time from the outer level, thus saving stack. */
1927
1928 if (*ecode == OP_KETRPOS)
1929 {
1930 md->end_match_ptr = eptr;
1931 md->end_offset_top = offset_top;
1932 RRETURN(MATCH_KETRPOS);
1933 }
1934
1935 /* The normal repeating kets try the rest of the pattern or restart from
1936 the preceding bracket, in the appropriate order. In the second case, we can
1937 use tail recursion to avoid using another stack frame, unless we have an
1938 atomic group or an unlimited repeat of a group that can match an empty
1939 string. */
1940
1941 if (*ecode == OP_KETRMIN)
1942 {
1943 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1945 if (*prev == OP_ONCE)
1946 {
1947 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1949 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1950 RRETURN(MATCH_ONCE);
1951 }
1952 if (*prev >= OP_SBRA) /* Could match an empty string */
1953 {
1954 md->match_function_type = MATCH_CBEGROUP;
1955 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1956 RRETURN(rrc);
1957 }
1958 ecode = prev;
1959 goto TAIL_RECURSE;
1960 }
1961 else /* OP_KETRMAX */
1962 {
1963 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1964 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1965 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1967 if (*prev == OP_ONCE)
1968 {
1969 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971 md->once_target = prev;
1972 RRETURN(MATCH_ONCE);
1973 }
1974 ecode += 1 + LINK_SIZE;
1975 goto TAIL_RECURSE;
1976 }
1977 /* Control never gets here */
1978
1979 /* Not multiline mode: start of subject assertion, unless notbol. */
1980
1981 case OP_CIRC:
1982 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1983
1984 /* Start of subject assertion */
1985
1986 case OP_SOD:
1987 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1988 ecode++;
1989 break;
1990
1991 /* Multiline mode: start of subject unless notbol, or after any newline. */
1992
1993 case OP_CIRCM:
1994 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1995 if (eptr != md->start_subject &&
1996 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1997 RRETURN(MATCH_NOMATCH);
1998 ecode++;
1999 break;
2000
2001 /* Start of match assertion */
2002
2003 case OP_SOM:
2004 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2005 ecode++;
2006 break;
2007
2008 /* Reset the start of match point */
2009
2010 case OP_SET_SOM:
2011 mstart = eptr;
2012 ecode++;
2013 break;
2014
2015 /* Multiline mode: assert before any newline, or before end of subject
2016 unless noteol is set. */
2017
2018 case OP_DOLLM:
2019 if (eptr < md->end_subject)
2020 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2021 else
2022 {
2023 if (md->noteol) RRETURN(MATCH_NOMATCH);
2024 SCHECK_PARTIAL();
2025 }
2026 ecode++;
2027 break;
2028
2029 /* Not multiline mode: assert before a terminating newline or before end of
2030 subject unless noteol is set. */
2031
2032 case OP_DOLL:
2033 if (md->noteol) RRETURN(MATCH_NOMATCH);
2034 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2035
2036 /* ... else fall through for endonly */
2037
2038 /* End of subject assertion (\z) */
2039
2040 case OP_EOD:
2041 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2042 SCHECK_PARTIAL();
2043 ecode++;
2044 break;
2045
2046 /* End of subject or ending \n assertion (\Z) */
2047
2048 case OP_EODN:
2049 ASSERT_NL_OR_EOS:
2050 if (eptr < md->end_subject &&
2051 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2052 RRETURN(MATCH_NOMATCH);
2053
2054 /* Either at end of string or \n before end. */
2055
2056 SCHECK_PARTIAL();
2057 ecode++;
2058 break;
2059
2060 /* Word boundary assertions */
2061
2062 case OP_NOT_WORD_BOUNDARY:
2063 case OP_WORD_BOUNDARY:
2064 {
2065
2066 /* Find out if the previous and current characters are "word" characters.
2067 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2068 be "non-word" characters. Remember the earliest consulted character for
2069 partial matching. */
2070
2071 #ifdef SUPPORT_UTF
2072 if (utf)
2073 {
2074 /* Get status of previous character */
2075
2076 if (eptr == md->start_subject) prev_is_word = FALSE; else
2077 {
2078 PCRE_PUCHAR lastptr = eptr - 1;
2079 BACKCHAR(lastptr);
2080 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2081 GETCHAR(c, lastptr);
2082 #ifdef SUPPORT_UCP
2083 if (md->use_ucp)
2084 {
2085 if (c == '_') prev_is_word = TRUE; else
2086 {
2087 int cat = UCD_CATEGORY(c);
2088 prev_is_word = (cat == ucp_L || cat == ucp_N);
2089 }
2090 }
2091 else
2092 #endif
2093 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2094 }
2095
2096 /* Get status of next character */
2097
2098 if (eptr >= md->end_subject)
2099 {
2100 SCHECK_PARTIAL();
2101 cur_is_word = FALSE;
2102 }
2103 else
2104 {
2105 GETCHAR(c, eptr);
2106 #ifdef SUPPORT_UCP
2107 if (md->use_ucp)
2108 {
2109 if (c == '_') cur_is_word = TRUE; else
2110 {
2111 int cat = UCD_CATEGORY(c);
2112 cur_is_word = (cat == ucp_L || cat == ucp_N);
2113 }
2114 }
2115 else
2116 #endif
2117 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2118 }
2119 }
2120 else
2121 #endif
2122
2123 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2124 consistency with the behaviour of \w we do use it in this case. */
2125
2126 {
2127 /* Get status of previous character */
2128
2129 if (eptr == md->start_subject) prev_is_word = FALSE; else
2130 {
2131 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2132 #ifdef SUPPORT_UCP
2133 if (md->use_ucp)
2134 {
2135 c = eptr[-1];
2136 if (c == '_') prev_is_word = TRUE; else
2137 {
2138 int cat = UCD_CATEGORY(c);
2139 prev_is_word = (cat == ucp_L || cat == ucp_N);
2140 }
2141 }
2142 else
2143 #endif
2144 prev_is_word = MAX_255(eptr[-1])
2145 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2146 }
2147
2148 /* Get status of next character */
2149
2150 if (eptr >= md->end_subject)
2151 {
2152 SCHECK_PARTIAL();
2153 cur_is_word = FALSE;
2154 }
2155 else
2156 #ifdef SUPPORT_UCP
2157 if (md->use_ucp)
2158 {
2159 c = *eptr;
2160 if (c == '_') cur_is_word = TRUE; else
2161 {
2162 int cat = UCD_CATEGORY(c);
2163 cur_is_word = (cat == ucp_L || cat == ucp_N);
2164 }
2165 }
2166 else
2167 #endif
2168 cur_is_word = MAX_255(*eptr)
2169 && ((md->ctypes[*eptr] & ctype_word) != 0);
2170 }
2171
2172 /* Now see if the situation is what we want */
2173
2174 if ((*ecode++ == OP_WORD_BOUNDARY)?
2175 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2176 RRETURN(MATCH_NOMATCH);
2177 }
2178 break;
2179
2180 /* Match a single character type; inline for speed */
2181
2182 case OP_ANY:
2183 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2184 /* Fall through */
2185
2186 case OP_ALLANY:
2187 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2188 { /* not be updated before SCHECK_PARTIAL. */
2189 SCHECK_PARTIAL();
2190 RRETURN(MATCH_NOMATCH);
2191 }
2192 eptr++;
2193 #ifdef SUPPORT_UTF
2194 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2195 #endif
2196 ecode++;
2197 break;
2198
2199 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2200 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2201
2202 case OP_ANYBYTE:
2203 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2204 { /* not be updated before SCHECK_PARTIAL. */
2205 SCHECK_PARTIAL();
2206 RRETURN(MATCH_NOMATCH);
2207 }
2208 eptr++;
2209 ecode++;
2210 break;
2211
2212 case OP_NOT_DIGIT:
2213 if (eptr >= md->end_subject)
2214 {
2215 SCHECK_PARTIAL();
2216 RRETURN(MATCH_NOMATCH);
2217 }
2218 GETCHARINCTEST(c, eptr);
2219 if (
2220 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2221 c < 256 &&
2222 #endif
2223 (md->ctypes[c] & ctype_digit) != 0
2224 )
2225 RRETURN(MATCH_NOMATCH);
2226 ecode++;
2227 break;
2228
2229 case OP_DIGIT:
2230 if (eptr >= md->end_subject)
2231 {
2232 SCHECK_PARTIAL();
2233 RRETURN(MATCH_NOMATCH);
2234 }
2235 GETCHARINCTEST(c, eptr);
2236 if (
2237 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2238 c > 255 ||
2239 #endif
2240 (md->ctypes[c] & ctype_digit) == 0
2241 )
2242 RRETURN(MATCH_NOMATCH);
2243 ecode++;
2244 break;
2245
2246 case OP_NOT_WHITESPACE:
2247 if (eptr >= md->end_subject)
2248 {
2249 SCHECK_PARTIAL();
2250 RRETURN(MATCH_NOMATCH);
2251 }
2252 GETCHARINCTEST(c, eptr);
2253 if (
2254 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2255 c < 256 &&
2256 #endif
2257 (md->ctypes[c] & ctype_space) != 0
2258 )
2259 RRETURN(MATCH_NOMATCH);
2260 ecode++;
2261 break;
2262
2263 case OP_WHITESPACE:
2264 if (eptr >= md->end_subject)
2265 {
2266 SCHECK_PARTIAL();
2267 RRETURN(MATCH_NOMATCH);
2268 }
2269 GETCHARINCTEST(c, eptr);
2270 if (
2271 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2272 c > 255 ||
2273 #endif
2274 (md->ctypes[c] & ctype_space) == 0
2275 )
2276 RRETURN(MATCH_NOMATCH);
2277 ecode++;
2278 break;
2279
2280 case OP_NOT_WORDCHAR:
2281 if (eptr >= md->end_subject)
2282 {
2283 SCHECK_PARTIAL();
2284 RRETURN(MATCH_NOMATCH);
2285 }
2286 GETCHARINCTEST(c, eptr);
2287 if (
2288 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2289 c < 256 &&
2290 #endif
2291 (md->ctypes[c] & ctype_word) != 0
2292 )
2293 RRETURN(MATCH_NOMATCH);
2294 ecode++;
2295 break;
2296
2297 case OP_WORDCHAR:
2298 if (eptr >= md->end_subject)
2299 {
2300 SCHECK_PARTIAL();
2301 RRETURN(MATCH_NOMATCH);
2302 }
2303 GETCHARINCTEST(c, eptr);
2304 if (
2305 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2306 c > 255 ||
2307 #endif
2308 (md->ctypes[c] & ctype_word) == 0
2309 )
2310 RRETURN(MATCH_NOMATCH);
2311 ecode++;
2312 break;
2313
2314 case OP_ANYNL:
2315 if (eptr >= md->end_subject)
2316 {
2317 SCHECK_PARTIAL();
2318 RRETURN(MATCH_NOMATCH);
2319 }
2320 GETCHARINCTEST(c, eptr);
2321 switch(c)
2322 {
2323 default: RRETURN(MATCH_NOMATCH);
2324
2325 case 0x000d:
2326 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2327 break;
2328
2329 case 0x000a:
2330 break;
2331
2332 case 0x000b:
2333 case 0x000c:
2334 case 0x0085:
2335 case 0x2028:
2336 case 0x2029:
2337 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2338 break;
2339 }
2340 ecode++;
2341 break;
2342
2343 case OP_NOT_HSPACE:
2344 if (eptr >= md->end_subject)
2345 {
2346 SCHECK_PARTIAL();
2347 RRETURN(MATCH_NOMATCH);
2348 }
2349 GETCHARINCTEST(c, eptr);
2350 switch(c)
2351 {
2352 default: break;
2353 case 0x09: /* HT */
2354 case 0x20: /* SPACE */
2355 case 0xa0: /* NBSP */
2356 case 0x1680: /* OGHAM SPACE MARK */
2357 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2358 case 0x2000: /* EN QUAD */
2359 case 0x2001: /* EM QUAD */
2360 case 0x2002: /* EN SPACE */
2361 case 0x2003: /* EM SPACE */
2362 case 0x2004: /* THREE-PER-EM SPACE */
2363 case 0x2005: /* FOUR-PER-EM SPACE */
2364 case 0x2006: /* SIX-PER-EM SPACE */
2365 case 0x2007: /* FIGURE SPACE */
2366 case 0x2008: /* PUNCTUATION SPACE */
2367 case 0x2009: /* THIN SPACE */
2368 case 0x200A: /* HAIR SPACE */
2369 case 0x202f: /* NARROW NO-BREAK SPACE */
2370 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2371 case 0x3000: /* IDEOGRAPHIC SPACE */
2372 RRETURN(MATCH_NOMATCH);
2373 }
2374 ecode++;
2375 break;
2376
2377 case OP_HSPACE:
2378 if (eptr >= md->end_subject)
2379 {
2380 SCHECK_PARTIAL();
2381 RRETURN(MATCH_NOMATCH);
2382 }
2383 GETCHARINCTEST(c, eptr);
2384 switch(c)
2385 {
2386 default: RRETURN(MATCH_NOMATCH);
2387 case 0x09: /* HT */
2388 case 0x20: /* SPACE */
2389 case 0xa0: /* NBSP */
2390 case 0x1680: /* OGHAM SPACE MARK */
2391 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2392 case 0x2000: /* EN QUAD */
2393 case 0x2001: /* EM QUAD */
2394 case 0x2002: /* EN SPACE */
2395 case 0x2003: /* EM SPACE */
2396 case 0x2004: /* THREE-PER-EM SPACE */
2397 case 0x2005: /* FOUR-PER-EM SPACE */
2398 case 0x2006: /* SIX-PER-EM SPACE */
2399 case 0x2007: /* FIGURE SPACE */
2400 case 0x2008: /* PUNCTUATION SPACE */
2401 case 0x2009: /* THIN SPACE */
2402 case 0x200A: /* HAIR SPACE */
2403 case 0x202f: /* NARROW NO-BREAK SPACE */
2404 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2405 case 0x3000: /* IDEOGRAPHIC SPACE */
2406 break;
2407 }
2408 ecode++;
2409 break;
2410
2411 case OP_NOT_VSPACE:
2412 if (eptr >= md->end_subject)
2413 {
2414 SCHECK_PARTIAL();
2415 RRETURN(MATCH_NOMATCH);
2416 }
2417 GETCHARINCTEST(c, eptr);
2418 switch(c)
2419 {
2420 default: break;
2421 case 0x0a: /* LF */
2422 case 0x0b: /* VT */
2423 case 0x0c: /* FF */
2424 case 0x0d: /* CR */
2425 case 0x85: /* NEL */
2426 case 0x2028: /* LINE SEPARATOR */
2427 case 0x2029: /* PARAGRAPH SEPARATOR */
2428 RRETURN(MATCH_NOMATCH);
2429 }
2430 ecode++;
2431 break;
2432
2433 case OP_VSPACE:
2434 if (eptr >= md->end_subject)
2435 {
2436 SCHECK_PARTIAL();
2437 RRETURN(MATCH_NOMATCH);
2438 }
2439 GETCHARINCTEST(c, eptr);
2440 switch(c)
2441 {
2442 default: RRETURN(MATCH_NOMATCH);
2443 case 0x0a: /* LF */
2444 case 0x0b: /* VT */
2445 case 0x0c: /* FF */
2446 case 0x0d: /* CR */
2447 case 0x85: /* NEL */
2448 case 0x2028: /* LINE SEPARATOR */
2449 case 0x2029: /* PARAGRAPH SEPARATOR */
2450 break;
2451 }
2452 ecode++;
2453 break;
2454
2455 #ifdef SUPPORT_UCP
2456 /* Check the next character by Unicode property. We will get here only
2457 if the support is in the binary; otherwise a compile-time error occurs. */
2458
2459 case OP_PROP:
2460 case OP_NOTPROP:
2461 if (eptr >= md->end_subject)
2462 {
2463 SCHECK_PARTIAL();
2464 RRETURN(MATCH_NOMATCH);
2465 }
2466 GETCHARINCTEST(c, eptr);
2467 {
2468 const ucd_record *prop = GET_UCD(c);
2469
2470 switch(ecode[1])
2471 {
2472 case PT_ANY:
2473 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2474 break;
2475
2476 case PT_LAMP:
2477 if ((prop->chartype == ucp_Lu ||
2478 prop->chartype == ucp_Ll ||
2479 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2480 RRETURN(MATCH_NOMATCH);
2481 break;
2482
2483 case PT_GC:
2484 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2485 RRETURN(MATCH_NOMATCH);
2486 break;
2487
2488 case PT_PC:
2489 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2490 RRETURN(MATCH_NOMATCH);
2491 break;
2492
2493 case PT_SC:
2494 if ((ecode[2] != prop->script) == (op == OP_PROP))
2495 RRETURN(MATCH_NOMATCH);
2496 break;
2497
2498 /* These are specials */
2499
2500 case PT_ALNUM:
2501 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2502 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2503 RRETURN(MATCH_NOMATCH);
2504 break;
2505
2506 case PT_SPACE: /* Perl space */
2507 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2508 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2509 == (op == OP_NOTPROP))
2510 RRETURN(MATCH_NOMATCH);
2511 break;
2512
2513 case PT_PXSPACE: /* POSIX space */
2514 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2515 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2516 c == CHAR_FF || c == CHAR_CR)
2517 == (op == OP_NOTPROP))
2518 RRETURN(MATCH_NOMATCH);
2519 break;
2520
2521 case PT_WORD:
2522 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2523 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2524 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2525 RRETURN(MATCH_NOMATCH);
2526 break;
2527
2528 /* This should never occur */
2529
2530 default:
2531 RRETURN(PCRE_ERROR_INTERNAL);
2532 }
2533
2534 ecode += 3;
2535 }
2536 break;
2537
2538 /* Match an extended Unicode sequence. We will get here only if the support
2539 is in the binary; otherwise a compile-time error occurs. */
2540
2541 case OP_EXTUNI:
2542 if (eptr >= md->end_subject)
2543 {
2544 SCHECK_PARTIAL();
2545 RRETURN(MATCH_NOMATCH);
2546 }
2547 GETCHARINCTEST(c, eptr);
2548 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2549 while (eptr < md->end_subject)
2550 {
2551 int len = 1;
2552 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2553 if (UCD_CATEGORY(c) != ucp_M) break;
2554 eptr += len;
2555 }
2556 ecode++;
2557 break;
2558 #endif
2559
2560
2561 /* Match a back reference, possibly repeatedly. Look past the end of the
2562 item to see if there is repeat information following. The code is similar
2563 to that for character classes, but repeated for efficiency. Then obey
2564 similar code to character type repeats - written out again for speed.
2565 However, if the referenced string is the empty string, always treat
2566 it as matched, any number of times (otherwise there could be infinite
2567 loops). */
2568
2569 case OP_REF:
2570 case OP_REFI:
2571 caseless = op == OP_REFI;
2572 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2573 ecode += 1 + IMM2_SIZE;
2574
2575 /* If the reference is unset, there are two possibilities:
2576
2577 (a) In the default, Perl-compatible state, set the length negative;
2578 this ensures that every attempt at a match fails. We can't just fail
2579 here, because of the possibility of quantifiers with zero minima.
2580
2581 (b) If the JavaScript compatibility flag is set, set the length to zero
2582 so that the back reference matches an empty string.
2583
2584 Otherwise, set the length to the length of what was matched by the
2585 referenced subpattern. */
2586
2587 if (offset >= offset_top || md->offset_vector[offset] < 0)
2588 length = (md->jscript_compat)? 0 : -1;
2589 else
2590 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2591
2592 /* Set up for repetition, or handle the non-repeated case */
2593
2594 switch (*ecode)
2595 {
2596 case OP_CRSTAR:
2597 case OP_CRMINSTAR:
2598 case OP_CRPLUS:
2599 case OP_CRMINPLUS:
2600 case OP_CRQUERY:
2601 case OP_CRMINQUERY:
2602 c = *ecode++ - OP_CRSTAR;
2603 minimize = (c & 1) != 0;
2604 min = rep_min[c]; /* Pick up values from tables; */
2605 max = rep_max[c]; /* zero for max => infinity */
2606 if (max == 0) max = INT_MAX;
2607 break;
2608
2609 case OP_CRRANGE:
2610 case OP_CRMINRANGE:
2611 minimize = (*ecode == OP_CRMINRANGE);
2612 min = GET2(ecode, 1);
2613 max = GET2(ecode, 1 + IMM2_SIZE);
2614 if (max == 0) max = INT_MAX;
2615 ecode += 1 + 2 * IMM2_SIZE;
2616 break;
2617
2618 default: /* No repeat follows */
2619 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2620 {
2621 CHECK_PARTIAL();
2622 RRETURN(MATCH_NOMATCH);
2623 }
2624 eptr += length;
2625 continue; /* With the main loop */
2626 }
2627
2628 /* Handle repeated back references. If the length of the reference is
2629 zero, just continue with the main loop. */
2630
2631 if (length == 0) continue;
2632
2633 /* First, ensure the minimum number of matches are present. We get back
2634 the length of the reference string explicitly rather than passing the
2635 address of eptr, so that eptr can be a register variable. */
2636
2637 for (i = 1; i <= min; i++)
2638 {
2639 int slength;
2640 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2641 {
2642 CHECK_PARTIAL();
2643 RRETURN(MATCH_NOMATCH);
2644 }
2645 eptr += slength;
2646 }
2647
2648 /* If min = max, continue at the same level without recursion.
2649 They are not both allowed to be zero. */
2650
2651 if (min == max) continue;
2652
2653 /* If minimizing, keep trying and advancing the pointer */
2654
2655 if (minimize)
2656 {
2657 for (fi = min;; fi++)
2658 {
2659 int slength;
2660 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2662 if (fi >= max) RRETURN(MATCH_NOMATCH);
2663 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2664 {
2665 CHECK_PARTIAL();
2666 RRETURN(MATCH_NOMATCH);
2667 }
2668 eptr += slength;
2669 }
2670 /* Control never gets here */
2671 }
2672
2673 /* If maximizing, find the longest string and work backwards */
2674
2675 else
2676 {
2677 pp = eptr;
2678 for (i = min; i < max; i++)
2679 {
2680 int slength;
2681 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2682 {
2683 CHECK_PARTIAL();
2684 break;
2685 }
2686 eptr += slength;
2687 }
2688 while (eptr >= pp)
2689 {
2690 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2691 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2692 eptr -= length;
2693 }
2694 RRETURN(MATCH_NOMATCH);
2695 }
2696 /* Control never gets here */
2697
2698 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2699 used when all the characters in the class have values in the range 0-255,
2700 and either the matching is caseful, or the characters are in the range
2701 0-127 when UTF-8 processing is enabled. The only difference between
2702 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2703 encountered.
2704
2705 First, look past the end of the item to see if there is repeat information
2706 following. Then obey similar code to character type repeats - written out
2707 again for speed. */
2708
2709 case OP_NCLASS:
2710 case OP_CLASS:
2711 {
2712 /* The data variable is saved across frames, so the byte map needs to
2713 be stored there. */
2714 #define BYTE_MAP ((pcre_uint8 *)data)
2715 data = ecode + 1; /* Save for matching */
2716 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2717
2718 switch (*ecode)
2719 {
2720 case OP_CRSTAR:
2721 case OP_CRMINSTAR:
2722 case OP_CRPLUS:
2723 case OP_CRMINPLUS:
2724 case OP_CRQUERY:
2725 case OP_CRMINQUERY:
2726 c = *ecode++ - OP_CRSTAR;
2727 minimize = (c & 1) != 0;
2728 min = rep_min[c]; /* Pick up values from tables; */
2729 max = rep_max[c]; /* zero for max => infinity */
2730 if (max == 0) max = INT_MAX;
2731 break;
2732
2733 case OP_CRRANGE:
2734 case OP_CRMINRANGE:
2735 minimize = (*ecode == OP_CRMINRANGE);
2736 min = GET2(ecode, 1);
2737 max = GET2(ecode, 1 + IMM2_SIZE);
2738 if (max == 0) max = INT_MAX;
2739 ecode += 1 + 2 * IMM2_SIZE;
2740 break;
2741
2742 default: /* No repeat follows */
2743 min = max = 1;
2744 break;
2745 }
2746
2747 /* First, ensure the minimum number of matches are present. */
2748
2749 #ifdef SUPPORT_UTF
2750 if (utf)
2751 {
2752 for (i = 1; i <= min; i++)
2753 {
2754 if (eptr >= md->end_subject)
2755 {
2756 SCHECK_PARTIAL();
2757 RRETURN(MATCH_NOMATCH);
2758 }
2759 GETCHARINC(c, eptr);
2760 if (c > 255)
2761 {
2762 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2763 }
2764 else
2765 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2766 }
2767 }
2768 else
2769 #endif
2770 /* Not UTF mode */
2771 {
2772 for (i = 1; i <= min; i++)
2773 {
2774 if (eptr >= md->end_subject)
2775 {
2776 SCHECK_PARTIAL();
2777 RRETURN(MATCH_NOMATCH);
2778 }
2779 c = *eptr++;
2780 #ifndef COMPILE_PCRE8
2781 if (c > 255)
2782 {
2783 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2784 }
2785 else
2786 #endif
2787 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2788 }
2789 }
2790
2791 /* If max == min we can continue with the main loop without the
2792 need to recurse. */
2793
2794 if (min == max) continue;
2795
2796 /* If minimizing, keep testing the rest of the expression and advancing
2797 the pointer while it matches the class. */
2798
2799 if (minimize)
2800 {
2801 #ifdef SUPPORT_UTF
2802 if (utf)
2803 {
2804 for (fi = min;; fi++)
2805 {
2806 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2808 if (fi >= max) RRETURN(MATCH_NOMATCH);
2809 if (eptr >= md->end_subject)
2810 {
2811 SCHECK_PARTIAL();
2812 RRETURN(MATCH_NOMATCH);
2813 }
2814 GETCHARINC(c, eptr);
2815 if (c > 255)
2816 {
2817 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2818 }
2819 else
2820 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2821 }
2822 }
2823 else
2824 #endif
2825 /* Not UTF mode */
2826 {
2827 for (fi = min;; fi++)
2828 {
2829 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2831 if (fi >= max) RRETURN(MATCH_NOMATCH);
2832 if (eptr >= md->end_subject)
2833 {
2834 SCHECK_PARTIAL();
2835 RRETURN(MATCH_NOMATCH);
2836 }
2837 c = *eptr++;
2838 #ifndef COMPILE_PCRE8
2839 if (c > 255)
2840 {
2841 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2842 }
2843 else
2844 #endif
2845 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2846 }
2847 }
2848 /* Control never gets here */
2849 }
2850
2851 /* If maximizing, find the longest possible run, then work backwards. */
2852
2853 else
2854 {
2855 pp = eptr;
2856
2857 #ifdef SUPPORT_UTF
2858 if (utf)
2859 {
2860 for (i = min; i < max; i++)
2861 {
2862 int len = 1;
2863 if (eptr >= md->end_subject)
2864 {
2865 SCHECK_PARTIAL();
2866 break;
2867 }
2868 GETCHARLEN(c, eptr, len);
2869 if (c > 255)
2870 {
2871 if (op == OP_CLASS) break;
2872 }
2873 else
2874 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2875 eptr += len;
2876 }
2877 for (;;)
2878 {
2879 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2881 if (eptr-- == pp) break; /* Stop if tried at original pos */
2882 BACKCHAR(eptr);
2883 }
2884 }
2885 else
2886 #endif
2887 /* Not UTF mode */
2888 {
2889 for (i = min; i < max; i++)
2890 {
2891 if (eptr >= md->end_subject)
2892 {
2893 SCHECK_PARTIAL();
2894 break;
2895 }
2896 c = *eptr;
2897 #ifndef COMPILE_PCRE8
2898 if (c > 255)
2899 {
2900 if (op == OP_CLASS) break;
2901 }
2902 else
2903 #endif
2904 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2905 eptr++;
2906 }
2907 while (eptr >= pp)
2908 {
2909 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2910 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2911 eptr--;
2912 }
2913 }
2914
2915 RRETURN(MATCH_NOMATCH);
2916 }
2917 #undef BYTE_MAP
2918 }
2919 /* Control never gets here */
2920
2921
2922 /* Match an extended character class. This opcode is encountered only
2923 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2924 mode, because Unicode properties are supported in non-UTF-8 mode. */
2925
2926 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2927 case OP_XCLASS:
2928 {
2929 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2930 ecode += GET(ecode, 1); /* Advance past the item */
2931
2932 switch (*ecode)
2933 {
2934 case OP_CRSTAR:
2935 case OP_CRMINSTAR:
2936 case OP_CRPLUS:
2937 case OP_CRMINPLUS:
2938 case OP_CRQUERY:
2939 case OP_CRMINQUERY:
2940 c = *ecode++ - OP_CRSTAR;
2941 minimize = (c & 1) != 0;
2942 min = rep_min[c]; /* Pick up values from tables; */
2943 max = rep_max[c]; /* zero for max => infinity */
2944 if (max == 0) max = INT_MAX;
2945 break;
2946
2947 case OP_CRRANGE:
2948 case OP_CRMINRANGE:
2949 minimize = (*ecode == OP_CRMINRANGE);
2950 min = GET2(ecode, 1);
2951 max = GET2(ecode, 1 + IMM2_SIZE);
2952 if (max == 0) max = INT_MAX;
2953 ecode += 1 + 2 * IMM2_SIZE;
2954 break;
2955
2956 default: /* No repeat follows */
2957 min = max = 1;
2958 break;
2959 }
2960
2961 /* First, ensure the minimum number of matches are present. */
2962
2963 for (i = 1; i <= min; i++)
2964 {
2965 if (eptr >= md->end_subject)
2966 {
2967 SCHECK_PARTIAL();
2968 RRETURN(MATCH_NOMATCH);
2969 }
2970 GETCHARINCTEST(c, eptr);
2971 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2972 }
2973
2974 /* If max == min we can continue with the main loop without the
2975 need to recurse. */
2976
2977 if (min == max) continue;
2978
2979 /* If minimizing, keep testing the rest of the expression and advancing
2980 the pointer while it matches the class. */
2981
2982 if (minimize)
2983 {
2984 for (fi = min;; fi++)
2985 {
2986 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2987 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2988 if (fi >= max) RRETURN(MATCH_NOMATCH);
2989 if (eptr >= md->end_subject)
2990 {
2991 SCHECK_PARTIAL();
2992 RRETURN(MATCH_NOMATCH);
2993 }
2994 GETCHARINCTEST(c, eptr);
2995 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2996 }
2997 /* Control never gets here */
2998 }
2999
3000 /* If maximizing, find the longest possible run, then work backwards. */
3001
3002 else
3003 {
3004 pp = eptr;
3005 for (i = min; i < max; i++)
3006 {
3007 int len = 1;
3008 if (eptr >= md->end_subject)
3009 {
3010 SCHECK_PARTIAL();
3011 break;
3012 }
3013 #ifdef SUPPORT_UTF
3014 GETCHARLENTEST(c, eptr, len);
3015 #else
3016 c = *eptr;
3017 #endif
3018 if (!PRIV(xclass)(c, data, utf)) break;
3019 eptr += len;
3020 }
3021 for(;;)
3022 {
3023 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3025 if (eptr-- == pp) break; /* Stop if tried at original pos */
3026 #ifdef SUPPORT_UTF
3027 if (utf) BACKCHAR(eptr);
3028 #endif
3029 }
3030 RRETURN(MATCH_NOMATCH);
3031 }
3032
3033 /* Control never gets here */
3034 }
3035 #endif /* End of XCLASS */
3036
3037 /* Match a single character, casefully */
3038
3039 case OP_CHAR:
3040 #ifdef SUPPORT_UTF
3041 if (utf)
3042 {
3043 length = 1;
3044 ecode++;
3045 GETCHARLEN(fc, ecode, length);
3046 if (length > md->end_subject - eptr)
3047 {
3048 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3049 RRETURN(MATCH_NOMATCH);
3050 }
3051 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3052 }
3053 else
3054 #endif
3055 /* Not UTF mode */
3056 {
3057 if (md->end_subject - eptr < 1)
3058 {
3059 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3060 RRETURN(MATCH_NOMATCH);
3061 }
3062 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3063 ecode += 2;
3064 }
3065 break;
3066
3067 /* Match a single character, caselessly. If we are at the end of the
3068 subject, give up immediately. */
3069
3070 case OP_CHARI:
3071 if (eptr >= md->end_subject)
3072 {
3073 SCHECK_PARTIAL();
3074 RRETURN(MATCH_NOMATCH);
3075 }
3076
3077 #ifdef SUPPORT_UTF
3078 if (utf)
3079 {
3080 length = 1;
3081 ecode++;
3082 GETCHARLEN(fc, ecode, length);
3083
3084 /* If the pattern character's value is < 128, we have only one byte, and
3085 we know that its other case must also be one byte long, so we can use the
3086 fast lookup table. We know that there is at least one byte left in the
3087 subject. */
3088
3089 if (fc < 128)
3090 {
3091 if (md->lcc[fc]
3092 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3093 ecode++;
3094 eptr++;
3095 }
3096
3097 /* Otherwise we must pick up the subject character. Note that we cannot
3098 use the value of "length" to check for sufficient bytes left, because the
3099 other case of the character may have more or fewer bytes. */
3100
3101 else
3102 {
3103 unsigned int dc;
3104 GETCHARINC(dc, eptr);
3105 ecode += length;
3106
3107 /* If we have Unicode property support, we can use it to test the other
3108 case of the character, if there is one. */
3109
3110 if (fc != dc)
3111 {
3112 #ifdef SUPPORT_UCP
3113 if (dc != UCD_OTHERCASE(fc))
3114 #endif
3115 RRETURN(MATCH_NOMATCH);
3116 }
3117 }
3118 }
3119 else
3120 #endif /* SUPPORT_UTF */
3121
3122 /* Not UTF mode */
3123 {
3124 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3125 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3126 eptr++;
3127 ecode += 2;
3128 }
3129 break;
3130
3131 /* Match a single character repeatedly. */
3132
3133 case OP_EXACT:
3134 case OP_EXACTI:
3135 min = max = GET2(ecode, 1);
3136 ecode += 1 + IMM2_SIZE;
3137 goto REPEATCHAR;
3138
3139 case OP_POSUPTO:
3140 case OP_POSUPTOI:
3141 possessive = TRUE;
3142 /* Fall through */
3143
3144 case OP_UPTO:
3145 case OP_UPTOI:
3146 case OP_MINUPTO:
3147 case OP_MINUPTOI:
3148 min = 0;
3149 max = GET2(ecode, 1);
3150 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3151 ecode += 1 + IMM2_SIZE;
3152 goto REPEATCHAR;
3153
3154 case OP_POSSTAR:
3155 case OP_POSSTARI:
3156 possessive = TRUE;
3157 min = 0;
3158 max = INT_MAX;
3159 ecode++;
3160 goto REPEATCHAR;
3161
3162 case OP_POSPLUS:
3163 case OP_POSPLUSI:
3164 possessive = TRUE;
3165 min = 1;
3166 max = INT_MAX;
3167 ecode++;
3168 goto REPEATCHAR;
3169
3170 case OP_POSQUERY:
3171 case OP_POSQUERYI:
3172 possessive = TRUE;
3173 min = 0;
3174 max = 1;
3175 ecode++;
3176 goto REPEATCHAR;
3177
3178 case OP_STAR:
3179 case OP_STARI:
3180 case OP_MINSTAR:
3181 case OP_MINSTARI:
3182 case OP_PLUS:
3183 case OP_PLUSI:
3184 case OP_MINPLUS:
3185 case OP_MINPLUSI:
3186 case OP_QUERY:
3187 case OP_QUERYI:
3188 case OP_MINQUERY:
3189 case OP_MINQUERYI:
3190 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3191 minimize = (c & 1) != 0;
3192 min = rep_min[c]; /* Pick up values from tables; */
3193 max = rep_max[c]; /* zero for max => infinity */
3194 if (max == 0) max = INT_MAX;
3195
3196 /* Common code for all repeated single-character matches. */
3197
3198 REPEATCHAR:
3199 #ifdef SUPPORT_UTF
3200 if (utf)
3201 {
3202 length = 1;
3203 charptr = ecode;
3204 GETCHARLEN(fc, ecode, length);
3205 ecode += length;
3206
3207 /* Handle multibyte character matching specially here. There is
3208 support for caseless matching if UCP support is present. */
3209
3210 if (length > 1)
3211 {
3212 #ifdef SUPPORT_UCP
3213 unsigned int othercase;
3214 if (op >= OP_STARI && /* Caseless */
3215 (othercase = UCD_OTHERCASE(fc)) != fc)
3216 oclength = PRIV(ord2utf)(othercase, occhars);
3217 else oclength = 0;
3218 #endif /* SUPPORT_UCP */
3219
3220 for (i = 1; i <= min; i++)
3221 {
3222 if (eptr <= md->end_subject - length &&
3223 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3224 #ifdef SUPPORT_UCP
3225 else if (oclength > 0 &&
3226 eptr <= md->end_subject - oclength &&
3227 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3228 #endif /* SUPPORT_UCP */
3229 else
3230 {
3231 CHECK_PARTIAL();
3232 RRETURN(MATCH_NOMATCH);
3233 }
3234 }
3235
3236 if (min == max) continue;
3237
3238 if (minimize)
3239 {
3240 for (fi = min;; fi++)
3241 {
3242 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3243 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3244 if (fi >= max) RRETURN(MATCH_NOMATCH);
3245 if (eptr <= md->end_subject - length &&
3246 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3247 #ifdef SUPPORT_UCP
3248 else if (oclength > 0 &&
3249 eptr <= md->end_subject - oclength &&
3250 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3251 #endif /* SUPPORT_UCP */
3252 else
3253 {
3254 CHECK_PARTIAL();
3255 RRETURN(MATCH_NOMATCH);
3256 }
3257 }
3258 /* Control never gets here */
3259 }
3260
3261 else /* Maximize */
3262 {
3263 pp = eptr;
3264 for (i = min; i < max; i++)
3265 {
3266 if (eptr <= md->end_subject - length &&
3267 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3268 #ifdef SUPPORT_UCP
3269 else if (oclength > 0 &&
3270 eptr <= md->end_subject - oclength &&
3271 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3272 #endif /* SUPPORT_UCP */
3273 else
3274 {
3275 CHECK_PARTIAL();
3276 break;
3277 }
3278 }
3279
3280 if (possessive) continue;
3281
3282 for(;;)
3283 {
3284 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3285 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3286 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3287 #ifdef SUPPORT_UCP
3288 eptr--;
3289 BACKCHAR(eptr);
3290 #else /* without SUPPORT_UCP */
3291 eptr -= length;
3292 #endif /* SUPPORT_UCP */
3293 }
3294 }
3295 /* Control never gets here */
3296 }
3297
3298 /* If the length of a UTF-8 character is 1, we fall through here, and
3299 obey the code as for non-UTF-8 characters below, though in this case the
3300 value of fc will always be < 128. */
3301 }
3302 else
3303 #endif /* SUPPORT_UTF */
3304 /* When not in UTF-8 mode, load a single-byte character. */
3305 fc = *ecode++;
3306
3307 /* The value of fc at this point is always one character, though we may
3308 or may not be in UTF mode. The code is duplicated for the caseless and
3309 caseful cases, for speed, since matching characters is likely to be quite
3310 common. First, ensure the minimum number of matches are present. If min =
3311 max, continue at the same level without recursing. Otherwise, if
3312 minimizing, keep trying the rest of the expression and advancing one
3313 matching character if failing, up to the maximum. Alternatively, if
3314 maximizing, find the maximum number of characters and work backwards. */
3315
3316 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3317 max, eptr));
3318
3319 if (op >= OP_STARI) /* Caseless */
3320 {
3321 #ifdef COMPILE_PCRE8
3322 /* fc must be < 128 if UTF is enabled. */
3323 foc = md->fcc[fc];
3324 #else
3325 #ifdef SUPPORT_UTF
3326 #ifdef SUPPORT_UCP
3327 if (utf && fc > 127)
3328 foc = UCD_OTHERCASE(fc);
3329 #else
3330 if (utf && fc > 127)
3331 foc = fc;
3332 #endif /* SUPPORT_UCP */
3333 else
3334 #endif /* SUPPORT_UTF */
3335 foc = TABLE_GET(fc, md->fcc, fc);
3336 #endif /* COMPILE_PCRE8 */
3337
3338 for (i = 1; i <= min; i++)
3339 {
3340 if (eptr >= md->end_subject)
3341 {
3342 SCHECK_PARTIAL();
3343 RRETURN(MATCH_NOMATCH);
3344 }
3345 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3346 eptr++;
3347 }
3348 if (min == max) continue;
3349 if (minimize)
3350 {
3351 for (fi = min;; fi++)
3352 {
3353 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3354 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3355 if (fi >= max) RRETURN(MATCH_NOMATCH);
3356 if (eptr >= md->end_subject)
3357 {
3358 SCHECK_PARTIAL();
3359 RRETURN(MATCH_NOMATCH);
3360 }
3361 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3362 eptr++;
3363 }
3364 /* Control never gets here */
3365 }
3366 else /* Maximize */
3367 {
3368 pp = eptr;
3369 for (i = min; i < max; i++)
3370 {
3371 if (eptr >= md->end_subject)
3372 {
3373 SCHECK_PARTIAL();
3374 break;
3375 }
3376 if (fc != *eptr && foc != *eptr) break;
3377 eptr++;
3378 }
3379
3380 if (possessive) continue;
3381
3382 while (eptr >= pp)
3383 {
3384 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3385 eptr--;
3386 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3387 }
3388 RRETURN(MATCH_NOMATCH);
3389 }
3390 /* Control never gets here */
3391 }
3392
3393 /* Caseful comparisons (includes all multi-byte characters) */
3394
3395 else
3396 {
3397 for (i = 1; i <= min; i++)
3398 {
3399 if (eptr >= md->end_subject)
3400 {
3401 SCHECK_PARTIAL();
3402 RRETURN(MATCH_NOMATCH);
3403 }
3404 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3405 }
3406
3407 if (min == max) continue;
3408
3409 if (minimize)
3410 {
3411 for (fi = min;; fi++)
3412 {
3413 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3414 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3415 if (fi >= max) RRETURN(MATCH_NOMATCH);
3416 if (eptr >= md->end_subject)
3417 {
3418 SCHECK_PARTIAL();
3419 RRETURN(MATCH_NOMATCH);
3420 }
3421 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3422 }
3423 /* Control never gets here */
3424 }
3425 else /* Maximize */
3426 {
3427 pp = eptr;
3428 for (i = min; i < max; i++)
3429 {
3430 if (eptr >= md->end_subject)
3431 {
3432 SCHECK_PARTIAL();
3433 break;
3434 }
3435 if (fc != *eptr) break;
3436 eptr++;
3437 }
3438 if (possessive) continue;
3439
3440 while (eptr >= pp)
3441 {
3442 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3443 eptr--;
3444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3445 }
3446 RRETURN(MATCH_NOMATCH);
3447 }
3448 }
3449 /* Control never gets here */
3450
3451 /* Match a negated single one-byte pattern character. The subject character
3452 that we are checking against it can be multibyte. */
3453
3454 case OP_NOT:
3455 case OP_NOTI:
3456 if (eptr >= md->end_subject)
3457 {
3458 SCHECK_PARTIAL();
3459 RRETURN(MATCH_NOMATCH);
3460 }
3461 ecode++;
3462 GETCHARINCTEST(c, eptr);
3463 if (op == OP_NOTI) /* The caseless case */
3464 {
3465 register int ch, och;
3466 ch = *ecode++;
3467 #ifdef COMPILE_PCRE8
3468 /* ch must be < 128 if UTF is enabled. */
3469 och = md->fcc[ch];
3470 #else
3471 #ifdef SUPPORT_UTF
3472 #ifdef SUPPORT_UCP
3473 if (utf && ch > 127)
3474 och = UCD_OTHERCASE(ch);
3475 #else
3476 if (utf && ch > 127)
3477 och = ch;
3478 #endif /* SUPPORT_UCP */
3479 else
3480 #endif /* SUPPORT_UTF */
3481 och = TABLE_GET(ch, md->fcc, ch);
3482 #endif /* COMPILE_PCRE8 */
3483 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3484 }
3485 else /* Caseful */
3486 {
3487 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3488 }
3489 break;
3490
3491 /* Match a negated single one-byte character repeatedly. This is almost a
3492 repeat of the code for a repeated single character, but I haven't found a
3493 nice way of commoning these up that doesn't require a test of the
3494 positive/negative option for each character match. Maybe that wouldn't add
3495 very much to the time taken, but character matching *is* what this is all
3496 about... */
3497
3498 case OP_NOTEXACT:
3499 case OP_NOTEXACTI:
3500 min = max = GET2(ecode, 1);
3501 ecode += 1 + IMM2_SIZE;
3502 goto REPEATNOTCHAR;
3503
3504 case OP_NOTUPTO:
3505 case OP_NOTUPTOI:
3506 case OP_NOTMINUPTO:
3507 case OP_NOTMINUPTOI:
3508 min = 0;
3509 max = GET2(ecode, 1);
3510 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3511 ecode += 1 + IMM2_SIZE;
3512 goto REPEATNOTCHAR;
3513
3514 case OP_NOTPOSSTAR:
3515 case OP_NOTPOSSTARI:
3516 possessive = TRUE;
3517 min = 0;
3518 max = INT_MAX;
3519 ecode++;
3520 goto REPEATNOTCHAR;
3521
3522 case OP_NOTPOSPLUS:
3523 case OP_NOTPOSPLUSI:
3524 possessive = TRUE;
3525 min = 1;
3526 max = INT_MAX;
3527 ecode++;
3528 goto REPEATNOTCHAR;
3529
3530 case OP_NOTPOSQUERY:
3531 case OP_NOTPOSQUERYI:
3532 possessive = TRUE;
3533 min = 0;
3534 max = 1;
3535 ecode++;
3536 goto REPEATNOTCHAR;
3537
3538 case OP_NOTPOSUPTO:
3539 case OP_NOTPOSUPTOI:
3540 possessive = TRUE;
3541 min = 0;
3542 max = GET2(ecode, 1);
3543 ecode += 1 + IMM2_SIZE;
3544 goto REPEATNOTCHAR;
3545
3546 case OP_NOTSTAR:
3547 case OP_NOTSTARI:
3548 case OP_NOTMINSTAR:
3549 case OP_NOTMINSTARI:
3550 case OP_NOTPLUS:
3551 case OP_NOTPLUSI:
3552 case OP_NOTMINPLUS:
3553 case OP_NOTMINPLUSI:
3554 case OP_NOTQUERY:
3555 case OP_NOTQUERYI:
3556 case OP_NOTMINQUERY:
3557 case OP_NOTMINQUERYI:
3558 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3559 minimize = (c & 1) != 0;
3560 min = rep_min[c]; /* Pick up values from tables; */
3561 max = rep_max[c]; /* zero for max => infinity */
3562 if (max == 0) max = INT_MAX;
3563
3564 /* Common code for all repeated negated single-byte matches. */
3565
3566 REPEATNOTCHAR:
3567 fc = *ecode++;
3568
3569 /* The code is duplicated for the caseless and caseful cases, for speed,
3570 since matching characters is likely to be quite common. First, ensure the
3571 minimum number of matches are present. If min = max, continue at the same
3572 level without recursing. Otherwise, if minimizing, keep trying the rest of
3573 the expression and advancing one matching character if failing, up to the
3574 maximum. Alternatively, if maximizing, find the maximum number of
3575 characters and work backwards. */
3576
3577 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3578 max, eptr));
3579
3580 if (op >= OP_NOTSTARI) /* Caseless */
3581 {
3582 #ifdef COMPILE_PCRE8
3583 /* fc must be < 128 if UTF is enabled. */
3584 foc = md->fcc[fc];
3585 #else
3586 #ifdef SUPPORT_UTF
3587 #ifdef SUPPORT_UCP
3588 if (utf && fc > 127)
3589 foc = UCD_OTHERCASE(fc);
3590 #else
3591 if (utf && fc > 127)
3592 foc = fc;
3593 #endif /* SUPPORT_UCP */
3594 else
3595 #endif /* SUPPORT_UTF */
3596 foc = TABLE_GET(fc, md->fcc, fc);
3597 #endif /* COMPILE_PCRE8 */
3598
3599 #ifdef SUPPORT_UTF
3600 if (utf)
3601 {
3602 register unsigned int d;
3603 for (i = 1; i <= min; i++)
3604 {
3605 if (eptr >= md->end_subject)
3606 {
3607 SCHECK_PARTIAL();
3608 RRETURN(MATCH_NOMATCH);
3609 }
3610 GETCHARINC(d, eptr);
3611 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3612 }
3613 }
3614 else
3615 #endif
3616 /* Not UTF mode */
3617 {
3618 for (i = 1; i <= min; i++)
3619 {
3620 if (eptr >= md->end_subject)
3621 {
3622 SCHECK_PARTIAL();
3623 RRETURN(MATCH_NOMATCH);
3624 }
3625 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3626 eptr++;
3627 }
3628 }
3629
3630 if (min == max) continue;
3631
3632 if (minimize)
3633 {
3634 #ifdef SUPPORT_UTF
3635 if (utf)
3636 {
3637 register unsigned int d;
3638 for (fi = min;; fi++)
3639 {
3640 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3641 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3642 if (fi >= max) RRETURN(MATCH_NOMATCH);
3643 if (eptr >= md->end_subject)
3644 {
3645 SCHECK_PARTIAL();
3646 RRETURN(MATCH_NOMATCH);
3647 }
3648 GETCHARINC(d, eptr);
3649 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3650 }
3651 }
3652 else
3653 #endif
3654 /* Not UTF mode */
3655 {
3656 for (fi = min;; fi++)
3657 {
3658 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3660 if (fi >= max) RRETURN(MATCH_NOMATCH);
3661 if (eptr >= md->end_subject)
3662 {
3663 SCHECK_PARTIAL();
3664 RRETURN(MATCH_NOMATCH);
3665 }
3666 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3667 eptr++;
3668 }
3669 }
3670 /* Control never gets here */
3671 }
3672
3673 /* Maximize case */
3674
3675 else
3676 {
3677 pp = eptr;
3678
3679 #ifdef SUPPORT_UTF
3680 if (utf)
3681 {
3682 register unsigned int d;
3683 for (i = min; i < max; i++)
3684 {
3685 int len = 1;
3686 if (eptr >= md->end_subject)
3687 {
3688 SCHECK_PARTIAL();
3689 break;
3690 }
3691 GETCHARLEN(d, eptr, len);
3692 if (fc == d || foc == d) break;
3693 eptr += len;
3694 }
3695 if (possessive) continue;
3696 for(;;)
3697 {
3698 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3699 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3700 if (eptr-- == pp) break; /* Stop if tried at original pos */
3701 BACKCHAR(eptr);
3702 }
3703 }
3704 else
3705 #endif
3706 /* Not UTF mode */
3707 {
3708 for (i = min; i < max; i++)
3709 {
3710 if (eptr >= md->end_subject)
3711 {
3712 SCHECK_PARTIAL();
3713 break;
3714 }
3715 if (fc == *eptr || foc == *eptr) break;
3716 eptr++;
3717 }
3718 if (possessive) continue;
3719 while (eptr >= pp)
3720 {
3721 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3723 eptr--;
3724 }
3725 }
3726
3727 RRETURN(MATCH_NOMATCH);
3728 }
3729 /* Control never gets here */
3730 }
3731
3732 /* Caseful comparisons */
3733
3734 else
3735 {
3736 #ifdef SUPPORT_UTF
3737 if (utf)
3738 {
3739 register unsigned int d;
3740 for (i = 1; i <= min; i++)
3741 {
3742 if (eptr >= md->end_subject)
3743 {
3744 SCHECK_PARTIAL();
3745 RRETURN(MATCH_NOMATCH);
3746 }
3747 GETCHARINC(d, eptr);
3748 if (fc == d) RRETURN(MATCH_NOMATCH);
3749 }
3750 }
3751 else
3752 #endif
3753 /* Not UTF mode */
3754 {
3755 for (i = 1; i <= min; i++)
3756 {
3757 if (eptr >= md->end_subject)
3758 {
3759 SCHECK_PARTIAL();
3760 RRETURN(MATCH_NOMATCH);
3761 }
3762 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3763 }
3764 }
3765
3766 if (min == max) continue;
3767
3768 if (minimize)
3769 {
3770 #ifdef SUPPORT_UTF
3771 if (utf)
3772 {
3773 register unsigned int d;
3774 for (fi = min;; fi++)
3775 {
3776 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3777 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3778 if (fi >= max) RRETURN(MATCH_NOMATCH);
3779 if (eptr >= md->end_subject)
3780 {
3781 SCHECK_PARTIAL();
3782 RRETURN(MATCH_NOMATCH);
3783 }
3784 GETCHARINC(d, eptr);
3785 if (fc == d) RRETURN(MATCH_NOMATCH);
3786 }
3787 }
3788 else
3789 #endif
3790 /* Not UTF mode */
3791 {
3792 for (fi = min;; fi++)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 if (fi >= max) RRETURN(MATCH_NOMATCH);
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 RRETURN(MATCH_NOMATCH);
3801 }
3802 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3803 }
3804 }
3805 /* Control never gets here */
3806 }
3807
3808 /* Maximize case */
3809
3810 else
3811 {
3812 pp = eptr;
3813
3814 #ifdef SUPPORT_UTF
3815 if (utf)
3816 {
3817 register unsigned int d;
3818 for (i = min; i < max; i++)
3819 {
3820 int len = 1;
3821 if (eptr >= md->end_subject)
3822 {
3823 SCHECK_PARTIAL();
3824 break;
3825 }
3826 GETCHARLEN(d, eptr, len);
3827 if (fc == d) break;
3828 eptr += len;
3829 }
3830 if (possessive) continue;
3831 for(;;)
3832 {
3833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3835 if (eptr-- == pp) break; /* Stop if tried at original pos */
3836 BACKCHAR(eptr);
3837 }
3838 }
3839 else
3840 #endif
3841 /* Not UTF mode */
3842 {
3843 for (i = min; i < max; i++)
3844 {
3845 if (eptr >= md->end_subject)
3846 {
3847 SCHECK_PARTIAL();
3848 break;
3849 }
3850 if (fc == *eptr) break;
3851 eptr++;
3852 }
3853 if (possessive) continue;
3854 while (eptr >= pp)
3855 {
3856 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858 eptr--;
3859 }
3860 }
3861
3862 RRETURN(MATCH_NOMATCH);
3863 }
3864 }
3865 /* Control never gets here */
3866
3867 /* Match a single character type repeatedly; several different opcodes
3868 share code. This is very similar to the code for single characters, but we
3869 repeat it in the interests of efficiency. */
3870
3871 case OP_TYPEEXACT:
3872 min = max = GET2(ecode, 1);
3873 minimize = TRUE;
3874 ecode += 1 + IMM2_SIZE;
3875 goto REPEATTYPE;
3876
3877 case OP_TYPEUPTO:
3878 case OP_TYPEMINUPTO:
3879 min = 0;
3880 max = GET2(ecode, 1);
3881 minimize = *ecode == OP_TYPEMINUPTO;
3882 ecode += 1 + IMM2_SIZE;
3883 goto REPEATTYPE;
3884
3885 case OP_TYPEPOSSTAR:
3886 possessive = TRUE;
3887 min = 0;
3888 max = INT_MAX;
3889 ecode++;
3890 goto REPEATTYPE;
3891
3892 case OP_TYPEPOSPLUS:
3893 possessive = TRUE;
3894 min = 1;
3895 max = INT_MAX;
3896 ecode++;
3897 goto REPEATTYPE;
3898
3899 case OP_TYPEPOSQUERY:
3900 possessive = TRUE;
3901 min = 0;
3902 max = 1;
3903 ecode++;
3904 goto REPEATTYPE;
3905
3906 case OP_TYPEPOSUPTO:
3907 possessive = TRUE;
3908 min = 0;
3909 max = GET2(ecode, 1);
3910 ecode += 1 + IMM2_SIZE;
3911 goto REPEATTYPE;
3912
3913 case OP_TYPESTAR:
3914 case OP_TYPEMINSTAR:
3915 case OP_TYPEPLUS:
3916 case OP_TYPEMINPLUS:
3917 case OP_TYPEQUERY:
3918 case OP_TYPEMINQUERY:
3919 c = *ecode++ - OP_TYPESTAR;
3920 minimize = (c & 1) != 0;
3921 min = rep_min[c]; /* Pick up values from tables; */
3922 max = rep_max[c]; /* zero for max => infinity */
3923 if (max == 0) max = INT_MAX;
3924
3925 /* Common code for all repeated single character type matches. Note that
3926 in UTF-8 mode, '.' matches a character of any length, but for the other
3927 character types, the valid characters are all one-byte long. */
3928
3929 REPEATTYPE:
3930 ctype = *ecode++; /* Code for the character type */
3931
3932 #ifdef SUPPORT_UCP
3933 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3934 {
3935 prop_fail_result = ctype == OP_NOTPROP;
3936 prop_type = *ecode++;
3937 prop_value = *ecode++;
3938 }
3939 else prop_type = -1;
3940 #endif
3941
3942 /* First, ensure the minimum number of matches are present. Use inline
3943 code for maximizing the speed, and do the type test once at the start
3944 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3945 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3946 and single-bytes. */
3947
3948 if (min > 0)
3949 {
3950 #ifdef SUPPORT_UCP
3951 if (prop_type >= 0)
3952 {
3953 switch(prop_type)
3954 {
3955 case PT_ANY:
3956 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3957 for (i = 1; i <= min; i++)
3958 {
3959 if (eptr >= md->end_subject)
3960 {
3961 SCHECK_PARTIAL();
3962 RRETURN(MATCH_NOMATCH);
3963 }
3964 GETCHARINCTEST(c, eptr);
3965 }
3966 break;
3967
3968 case PT_LAMP:
3969 for (i = 1; i <= min; i++)
3970 {
3971 int chartype;
3972 if (eptr >= md->end_subject)
3973 {
3974 SCHECK_PARTIAL();
3975 RRETURN(MATCH_NOMATCH);
3976 }
3977 GETCHARINCTEST(c, eptr);
3978 chartype = UCD_CHARTYPE(c);
3979 if ((chartype == ucp_Lu ||
3980 chartype == ucp_Ll ||
3981 chartype == ucp_Lt) == prop_fail_result)
3982 RRETURN(MATCH_NOMATCH);
3983 }
3984 break;
3985
3986 case PT_GC:
3987 for (i = 1; i <= min; i++)
3988 {
3989 if (eptr >= md->end_subject)
3990 {
3991 SCHECK_PARTIAL();
3992 RRETURN(MATCH_NOMATCH);
3993 }
3994 GETCHARINCTEST(c, eptr);
3995 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3996 RRETURN(MATCH_NOMATCH);
3997 }
3998 break;
3999
4000 case PT_PC:
4001 for (i = 1; i <= min; i++)
4002 {
4003 if (eptr >= md->end_subject)
4004 {
4005 SCHECK_PARTIAL();
4006 RRETURN(MATCH_NOMATCH);
4007 }
4008 GETCHARINCTEST(c, eptr);
4009 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4010 RRETURN(MATCH_NOMATCH);
4011 }
4012 break;
4013
4014 case PT_SC:
4015 for (i = 1; i <= min; i++)
4016 {
4017 if (eptr >= md->end_subject)
4018 {
4019 SCHECK_PARTIAL();
4020 RRETURN(MATCH_NOMATCH);
4021 }
4022 GETCHARINCTEST(c, eptr);
4023 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4024 RRETURN(MATCH_NOMATCH);
4025 }
4026 break;
4027
4028 case PT_ALNUM:
4029 for (i = 1; i <= min; i++)
4030 {
4031 int category;
4032 if (eptr >= md->end_subject)
4033 {
4034 SCHECK_PARTIAL();
4035 RRETURN(MATCH_NOMATCH);
4036 }
4037 GETCHARINCTEST(c, eptr);
4038 category = UCD_CATEGORY(c);
4039 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4040 RRETURN(MATCH_NOMATCH);
4041 }
4042 break;
4043
4044 case PT_SPACE: /* Perl space */
4045 for (i = 1; i <= min; i++)
4046 {
4047 if (eptr >= md->end_subject)
4048 {
4049 SCHECK_PARTIAL();
4050 RRETURN(MATCH_NOMATCH);
4051 }
4052 GETCHARINCTEST(c, eptr);
4053 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4054 c == CHAR_FF || c == CHAR_CR)
4055 == prop_fail_result)
4056 RRETURN(MATCH_NOMATCH);
4057 }
4058 break;
4059
4060 case PT_PXSPACE: /* POSIX space */
4061 for (i = 1; i <= min; i++)
4062 {
4063 if (eptr >= md->end_subject)
4064 {
4065 SCHECK_PARTIAL();
4066 RRETURN(MATCH_NOMATCH);
4067 }
4068 GETCHARINCTEST(c, eptr);
4069 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4070 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4071 == prop_fail_result)
4072 RRETURN(MATCH_NOMATCH);
4073 }
4074 break;
4075
4076 case PT_WORD:
4077 for (i = 1; i <= min; i++)
4078 {
4079 int category;
4080 if (eptr >= md->end_subject)
4081 {
4082 SCHECK_PARTIAL();
4083 RRETURN(MATCH_NOMATCH);
4084 }
4085 GETCHARINCTEST(c, eptr);
4086 category = UCD_CATEGORY(c);
4087 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4088 == prop_fail_result)
4089 RRETURN(MATCH_NOMATCH);
4090 }
4091 break;
4092
4093 /* This should not occur */
4094
4095 default:
4096 RRETURN(PCRE_ERROR_INTERNAL);
4097 }
4098 }
4099
4100 /* Match extended Unicode sequences. We will get here only if the
4101 support is in the binary; otherwise a compile-time error occurs. */
4102
4103 else if (ctype == OP_EXTUNI)
4104 {
4105 for (i = 1; i <= min; i++)
4106 {
4107 if (eptr >= md->end_subject)
4108 {
4109 SCHECK_PARTIAL();
4110 RRETURN(MATCH_NOMATCH);
4111 }
4112 GETCHARINCTEST(c, eptr);
4113 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4114 while (eptr < md->end_subject)
4115 {
4116 int len = 1;
4117 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4118 if (UCD_CATEGORY(c) != ucp_M) break;
4119 eptr += len;
4120 }
4121 }
4122 }
4123
4124 else
4125 #endif /* SUPPORT_UCP */
4126
4127 /* Handle all other cases when the coding is UTF-8 */
4128
4129 #ifdef SUPPORT_UTF
4130 if (utf) switch(ctype)
4131 {
4132 case OP_ANY:
4133 for (i = 1; i <= min; i++)
4134 {
4135 if (eptr >= md->end_subject)
4136 {
4137 SCHECK_PARTIAL();
4138 RRETURN(MATCH_NOMATCH);
4139 }
4140 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4141 eptr++;
4142 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4143 }
4144 break;
4145
4146 case OP_ALLANY:
4147 for (i = 1; i <= min; i++)
4148 {
4149 if (eptr >= md->end_subject)
4150 {
4151 SCHECK_PARTIAL();
4152 RRETURN(MATCH_NOMATCH);
4153 }
4154 eptr++;
4155 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4156 }
4157 break;
4158
4159 case OP_ANYBYTE:
4160 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4161 eptr += min;
4162 break;
4163
4164 case OP_ANYNL:
4165 for (i = 1; i <= min; i++)
4166 {
4167 if (eptr >= md->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 RRETURN(MATCH_NOMATCH);
4171 }
4172 GETCHARINC(c, eptr);
4173 switch(c)
4174 {
4175 default: RRETURN(MATCH_NOMATCH);
4176
4177 case 0x000d:
4178 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4179 break;
4180
4181 case 0x000a:
4182 break;
4183
4184 case 0x000b:
4185 case 0x000c:
4186 case 0x0085:
4187 case 0x2028:
4188 case 0x2029:
4189 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4190 break;
4191 }
4192 }
4193 break;
4194
4195 case OP_NOT_HSPACE:
4196 for (i = 1; i <= min; i++)
4197 {
4198 if (eptr >= md->end_subject)
4199 {
4200 SCHECK_PARTIAL();
4201 RRETURN(MATCH_NOMATCH);
4202 }
4203 GETCHARINC(c, eptr);
4204 switch(c)
4205 {
4206 default: break;
4207 case 0x09: /* HT */
4208 case 0x20: /* SPACE */
4209 case 0xa0: /* NBSP */
4210 case 0x1680: /* OGHAM SPACE MARK */
4211 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4212 case 0x2000: /* EN QUAD */
4213 case 0x2001: /* EM QUAD */
4214 case 0x2002: /* EN SPACE */
4215 case 0x2003: /* EM SPACE */
4216 case 0x2004: /* THREE-PER-EM SPACE */
4217 case 0x2005: /* FOUR-PER-EM SPACE */
4218 case 0x2006: /* SIX-PER-EM SPACE */
4219 case 0x2007: /* FIGURE SPACE */
4220 case 0x2008: /* PUNCTUATION SPACE */
4221 case 0x2009: /* THIN SPACE */
4222 case 0x200A: /* HAIR SPACE */
4223 case 0x202f: /* NARROW NO-BREAK SPACE */
4224 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4225 case 0x3000: /* IDEOGRAPHIC SPACE */
4226 RRETURN(MATCH_NOMATCH);
4227 }
4228 }
4229 break;
4230
4231 case OP_HSPACE:
4232 for (i = 1; i <= min; i++)
4233 {
4234 if (eptr >= md->end_subject)
4235 {
4236 SCHECK_PARTIAL();
4237 RRETURN(MATCH_NOMATCH);
4238 }
4239 GETCHARINC(c, eptr);
4240 switch(c)
4241 {
4242 default: RRETURN(MATCH_NOMATCH);
4243 case 0x09: /* HT */
4244 case 0x20: /* SPACE */
4245 case 0xa0: /* NBSP */
4246 case 0x1680: /* OGHAM SPACE MARK */
4247 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4248 case 0x2000: /* EN QUAD */
4249 case 0x2001: /* EM QUAD */
4250 case 0x2002: /* EN SPACE */
4251 case 0x2003: /* EM SPACE */
4252 case 0x2004: /* THREE-PER-EM SPACE */
4253 case 0x2005: /* FOUR-PER-EM SPACE */
4254 case 0x2006: /* SIX-PER-EM SPACE */
4255 case 0x2007: /* FIGURE SPACE */
4256 case 0x2008: /* PUNCTUATION SPACE */
4257 case 0x2009: /* THIN SPACE */
4258 case 0x200A: /* HAIR SPACE */
4259 case 0x202f: /* NARROW NO-BREAK SPACE */
4260 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4261 case 0x3000: /* IDEOGRAPHIC SPACE */
4262 break;
4263 }
4264 }
4265 break;
4266
4267 case OP_NOT_VSPACE:
4268 for (i = 1; i <= min; i++)
4269 {
4270 if (eptr >= md->end_subject)
4271 {
4272 SCHECK_PARTIAL();
4273 RRETURN(MATCH_NOMATCH);
4274 }
4275 GETCHARINC(c, eptr);
4276 switch(c)
4277 {
4278 default: break;
4279 case 0x0a: /* LF */
4280 case 0x0b: /* VT */
4281 case 0x0c: /* FF */
4282 case 0x0d: /* CR */
4283 case 0x85: /* NEL */
4284 case 0x2028: /* LINE SEPARATOR */
4285 case 0x2029: /* PARAGRAPH SEPARATOR */
4286 RRETURN(MATCH_NOMATCH);
4287 }
4288 }
4289 break;
4290
4291 case OP_VSPACE:
4292 for (i = 1; i <= min; i++)
4293 {
4294 if (eptr >= md->end_subject)
4295 {
4296 SCHECK_PARTIAL();
4297 RRETURN(MATCH_NOMATCH);
4298 }
4299 GETCHARINC(c, eptr);
4300 switch(c)
4301 {
4302 default: RRETURN(MATCH_NOMATCH);
4303 case 0x0a: /* LF */
4304 case 0x0b: /* VT */
4305 case 0x0c: /* FF */
4306 case 0x0d: /* CR */
4307 case 0x85: /* NEL */
4308 case 0x2028: /* LINE SEPARATOR */
4309 case 0x2029: /* PARAGRAPH SEPARATOR */
4310 break;
4311 }
4312 }
4313 break;
4314
4315 case OP_NOT_DIGIT:
4316 for (i = 1; i <= min; i++)
4317 {
4318 if (eptr >= md->end_subject)
4319 {
4320 SCHECK_PARTIAL();
4321 RRETURN(MATCH_NOMATCH);
4322 }
4323 GETCHARINC(c, eptr);
4324 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4325 RRETURN(MATCH_NOMATCH);
4326 }
4327 break;
4328
4329 case OP_DIGIT:
4330 for (i = 1; i <= min; i++)
4331 {
4332 if (eptr >= md->end_subject)
4333 {
4334 SCHECK_PARTIAL();
4335 RRETURN(MATCH_NOMATCH);
4336 }
4337 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4338 RRETURN(MATCH_NOMATCH);
4339 eptr++;
4340 /* No need to skip more bytes - we know it's a 1-byte character */
4341 }
4342 break;
4343
4344 case OP_NOT_WHITESPACE:
4345 for (i = 1; i <= min; i++)
4346 {
4347 if (eptr >= md->end_subject)
4348 {
4349 SCHECK_PARTIAL();
4350 RRETURN(MATCH_NOMATCH);
4351 }
4352 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4353 RRETURN(MATCH_NOMATCH);
4354 eptr++;
4355 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4356 }
4357 break;
4358
4359 case OP_WHITESPACE:
4360 for (i = 1; i <= min; i++)
4361 {
4362 if (eptr >= md->end_subject)
4363 {
4364 SCHECK_PARTIAL();
4365 RRETURN(MATCH_NOMATCH);
4366 }
4367 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4368 RRETURN(MATCH_NOMATCH);
4369 eptr++;
4370 /* No need to skip more bytes - we know it's a 1-byte character */
4371 }
4372 break;
4373
4374 case OP_NOT_WORDCHAR:
4375 for (i = 1; i <= min; i++)
4376 {
4377 if (eptr >= md->end_subject)
4378 {
4379 SCHECK_PARTIAL();
4380 RRETURN(MATCH_NOMATCH);
4381 }
4382 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4383 RRETURN(MATCH_NOMATCH);
4384 eptr++;
4385 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4386 }
4387 break;
4388
4389 case OP_WORDCHAR:
4390 for (i = 1; i <= min; i++)
4391 {
4392 if (eptr >= md->end_subject)
4393 {
4394 SCHECK_PARTIAL();
4395 RRETURN(MATCH_NOMATCH);
4396 }
4397 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4398 RRETURN(MATCH_NOMATCH);
4399 eptr++;
4400 /* No need to skip more bytes - we know it's a 1-byte character */
4401 }
4402 break;
4403
4404 default:
4405 RRETURN(PCRE_ERROR_INTERNAL);
4406 } /* End switch(ctype) */
4407
4408 else
4409 #endif /* SUPPORT_UTF */
4410
4411 /* Code for the non-UTF-8 case for minimum matching of operators other
4412 than OP_PROP and OP_NOTPROP. */
4413
4414 switch(ctype)
4415 {
4416 case OP_ANY:
4417 for (i = 1; i <= min; i++)
4418 {
4419 if (eptr >= md->end_subject)
4420 {
4421 SCHECK_PARTIAL();
4422 RRETURN(MATCH_NOMATCH);
4423 }
4424 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4425 eptr++;
4426 }
4427 break;
4428
4429 case OP_ALLANY:
4430 if (eptr > md->end_subject - min)
4431 {
4432 SCHECK_PARTIAL();
4433 RRETURN(MATCH_NOMATCH);
4434 }
4435 eptr += min;
4436 break;
4437
4438 case OP_ANYBYTE:
4439 if (eptr > md->end_subject - min)
4440 {
4441 SCHECK_PARTIAL();
4442 RRETURN(MATCH_NOMATCH);
4443 }
4444 eptr += min;
4445 break;
4446
4447 case OP_ANYNL:
4448 for (i = 1; i <= min; i++)
4449 {
4450 if (eptr >= md->end_subject)
4451 {
4452 SCHECK_PARTIAL();
4453 RRETURN(MATCH_NOMATCH);
4454 }
4455 switch(*eptr++)
4456 {
4457 default: RRETURN(MATCH_NOMATCH);
4458
4459 case 0x000d:
4460 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4461 break;
4462
4463 case 0x000a:
4464 break;
4465
4466 case 0x000b:
4467 case 0x000c:
4468 case 0x0085:
4469 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4470 break;
4471 }
4472 }
4473 break;
4474
4475 case OP_NOT_HSPACE:
4476 for (i = 1; i <= min; i++)
4477 {
4478 if (eptr >= md->end_subject)
4479 {
4480 SCHECK_PARTIAL();
4481 RRETURN(MATCH_NOMATCH);
4482 }
4483 switch(*eptr++)
4484 {
4485 default: break;
4486 case 0x09: /* HT */
4487 case 0x20: /* SPACE */
4488 case 0xa0: /* NBSP */
4489 RRETURN(MATCH_NOMATCH);
4490 }
4491 }
4492 break;
4493
4494 case OP_HSPACE:
4495 for (i = 1; i <= min; i++)
4496 {
4497 if (eptr >= md->end_subject)
4498 {
4499 SCHECK_PARTIAL();
4500 RRETURN(MATCH_NOMATCH);
4501 }
4502 switch(*eptr++)
4503 {
4504 default: RRETURN(MATCH_NOMATCH);
4505 case 0x09: /* HT */
4506 case 0x20: /* SPACE */
4507 case 0xa0: /* NBSP */
4508 break;
4509 }
4510 }
4511 break;
4512
4513 case OP_NOT_VSPACE:
4514 for (i = 1; i <= min; i++)
4515 {
4516 if (eptr >= md->end_subject)
4517 {
4518 SCHECK_PARTIAL();
4519 RRETURN(MATCH_NOMATCH);
4520 }
4521 switch(*eptr++)
4522 {
4523 default: break;
4524 case 0x0a: /* LF */
4525 case 0x0b: /* VT */
4526 case 0x0c: /* FF */
4527 case 0x0d: /* CR */
4528 case 0x85: /* NEL */
4529 RRETURN(MATCH_NOMATCH);
4530 }
4531 }
4532 break;
4533
4534 case OP_VSPACE:
4535 for (i = 1; i <= min; i++)
4536 {
4537 if (eptr >= md->end_subject)
4538 {
4539 SCHECK_PARTIAL();
4540 RRETURN(MATCH_NOMATCH);
4541 }
4542 switch(*eptr++)
4543 {
4544 default: RRETURN(MATCH_NOMATCH);
4545 case 0x0a: /* LF */
4546 case 0x0b: /* VT */
4547 case 0x0c: /* FF */
4548 case 0x0d: /* CR */
4549 case 0x85: /* NEL */
4550 break;
4551 }
4552 }
4553 break;
4554
4555 case OP_NOT_DIGIT:
4556 for (i = 1; i <= min; i++)
4557 {
4558 if (eptr >= md->end_subject)
4559 {
4560 SCHECK_PARTIAL();
4561 RRETURN(MATCH_NOMATCH);
4562 }
4563 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4564 RRETURN(MATCH_NOMATCH);
4565 eptr++;
4566 }
4567 break;
4568
4569 case OP_DIGIT:
4570 for (i = 1; i <= min; i++)
4571 {
4572 if (eptr >= md->end_subject)
4573 {
4574 SCHECK_PARTIAL();
4575 RRETURN(MATCH_NOMATCH);
4576 }
4577 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4578 RRETURN(MATCH_NOMATCH);
4579 eptr++;
4580 }
4581 break;
4582
4583 case OP_NOT_WHITESPACE:
4584 for (i = 1; i <= min; i++)
4585 {
4586 if (eptr >= md->end_subject)
4587 {
4588 SCHECK_PARTIAL();
4589 RRETURN(MATCH_NOMATCH);
4590 }
4591 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4592 RRETURN(MATCH_NOMATCH);
4593 eptr++;
4594 }
4595 break;
4596
4597 case OP_WHITESPACE:
4598 for (i = 1; i <= min; i++)
4599 {
4600 if (eptr >= md->end_subject)
4601 {
4602 SCHECK_PARTIAL();
4603 RRETURN(MATCH_NOMATCH);
4604 }
4605 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4606 RRETURN(MATCH_NOMATCH);
4607 eptr++;
4608 }
4609 break;
4610
4611 case OP_NOT_WORDCHAR:
4612 for (i = 1; i <= min; i++)
4613 {
4614 if (eptr >= md->end_subject)
4615 {
4616 SCHECK_PARTIAL();
4617 RRETURN(MATCH_NOMATCH);
4618 }
4619 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4620 RRETURN(MATCH_NOMATCH);
4621 eptr++;
4622 }
4623 break;
4624
4625 case OP_WORDCHAR:
4626 for (i = 1; i <= min; i++)
4627 {
4628 if (eptr >= md->end_subject)
4629 {
4630 SCHECK_PARTIAL();
4631 RRETURN(MATCH_NOMATCH);
4632 }
4633 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4634 RRETURN(MATCH_NOMATCH);
4635 eptr++;
4636 }
4637 break;
4638
4639 default:
4640 RRETURN(PCRE_ERROR_INTERNAL);
4641 }
4642 }
4643
4644 /* If min = max, continue at the same level without recursing */
4645
4646 if (min == max) continue;
4647
4648 /* If minimizing, we have to test the rest of the pattern before each
4649 subsequent match. Again, separate the UTF-8 case for speed, and also
4650 separate the UCP cases. */
4651
4652 if (minimize)
4653 {
4654 #ifdef SUPPORT_UCP
4655 if (prop_type >= 0)
4656 {
4657 switch(prop_type)
4658 {
4659 case PT_ANY:
4660 for (fi = min;; fi++)
4661 {
4662 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4663 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4664 if (fi >= max) RRETURN(MATCH_NOMATCH);
4665 if (eptr >= md->end_subject)
4666 {
4667 SCHECK_PARTIAL();
4668 RRETURN(MATCH_NOMATCH);
4669 }
4670 GETCHARINCTEST(c, eptr);
4671 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4672 }
4673 /* Control never gets here */
4674
4675 case PT_LAMP:
4676 for (fi = min;; fi++)
4677 {
4678 int chartype;
4679 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4681 if (fi >= max) RRETURN(MATCH_NOMATCH);
4682 if (eptr >= md->end_subject)
4683 {
4684 SCHECK_PARTIAL();
4685 RRETURN(MATCH_NOMATCH);
4686 }
4687 GETCHARINCTEST(c, eptr);
4688 chartype = UCD_CHARTYPE(c);
4689 if ((chartype == ucp_Lu ||
4690 chartype == ucp_Ll ||
4691 chartype == ucp_Lt) == prop_fail_result)
4692 RRETURN(MATCH_NOMATCH);
4693 }
4694 /* Control never gets here */
4695
4696 case PT_GC:
4697 for (fi = min;; fi++)
4698 {
4699 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4700 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4701 if (fi >= max) RRETURN(MATCH_NOMATCH);
4702 if (eptr >= md->end_subject)
4703 {
4704 SCHECK_PARTIAL();
4705 RRETURN(MATCH_NOMATCH);
4706 }
4707 GETCHARINCTEST(c, eptr);
4708 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4709 RRETURN(MATCH_NOMATCH);
4710 }
4711 /* Control never gets here */
4712
4713 case PT_PC:
4714 for (fi = min;; fi++)
4715 {
4716 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4717 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4718 if (fi >= max) RRETURN(MATCH_NOMATCH);
4719 if (eptr >= md->end_subject)
4720 {
4721 SCHECK_PARTIAL();
4722 RRETURN(MATCH_NOMATCH);
4723 }
4724 GETCHARINCTEST(c, eptr);
4725 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4726 RRETURN(MATCH_NOMATCH);
4727 }
4728 /* Control never gets here */
4729
4730 case PT_SC:
4731 for (fi = min;; fi++)
4732 {
4733 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4734 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4735 if (fi >= max) RRETURN(MATCH_NOMATCH);
4736 if (eptr >= md->end_subject)
4737 {
4738 SCHECK_PARTIAL();
4739 RRETURN(MATCH_NOMATCH);
4740 }
4741 GETCHARINCTEST(c, eptr);
4742 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4743 RRETURN(MATCH_NOMATCH);
4744 }
4745 /* Control never gets here */
4746
4747 case PT_ALNUM:
4748 for (fi = min;; fi++)
4749 {
4750 int category;
4751 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4753 if (fi >= max) RRETURN(MATCH_NOMATCH);
4754 if (eptr >= md->end_subject)
4755 {
4756 SCHECK_PARTIAL();
4757 RRETURN(MATCH_NOMATCH);
4758 }
4759 GETCHARINCTEST(c, eptr);
4760 category = UCD_CATEGORY(c);
4761 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4762 RRETURN(MATCH_NOMATCH);
4763 }
4764 /* Control never gets here */
4765
4766 case PT_SPACE: /* Perl space */
4767 for (fi = min;; fi++)
4768 {
4769 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4770 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4771 if (fi >= max) RRETURN(MATCH_NOMATCH);
4772 if (eptr >= md->end_subject)
4773 {
4774 SCHECK_PARTIAL();
4775 RRETURN(MATCH_NOMATCH);
4776 }
4777 GETCHARINCTEST(c, eptr);
4778 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4779 c == CHAR_FF || c == CHAR_CR)
4780 == prop_fail_result)
4781 RRETURN(MATCH_NOMATCH);
4782 }
4783 /* Control never gets here */
4784
4785 case PT_PXSPACE: /* POSIX space */
4786 for (fi = min;; fi++)
4787 {
4788 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4789 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4790 if (fi >= max) RRETURN(MATCH_NOMATCH);
4791 if (eptr >= md->end_subject)
4792 {
4793 SCHECK_PARTIAL();
4794 RRETURN(MATCH_NOMATCH);
4795 }
4796 GETCHARINCTEST(c, eptr);
4797 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4798 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4799 == prop_fail_result)
4800 RRETURN(MATCH_NOMATCH);
4801 }
4802 /* Control never gets here */
4803
4804 case PT_WORD:
4805 for (fi = min;; fi++)
4806 {
4807 int category;
4808 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4810 if (fi >= max) RRETURN(MATCH_NOMATCH);
4811 if (eptr >= md->end_subject)
4812 {
4813 SCHECK_PARTIAL();
4814 RRETURN(MATCH_NOMATCH);
4815 }
4816 GETCHARINCTEST(c, eptr);
4817 category = UCD_CATEGORY(c);
4818 if ((category == ucp_L ||
4819 category == ucp_N ||
4820 c == CHAR_UNDERSCORE)
4821 == prop_fail_result)
4822 RRETURN(MATCH_NOMATCH);
4823 }
4824 /* Control never gets here */
4825
4826 /* This should never occur */
4827
4828 default:
4829 RRETURN(PCRE_ERROR_INTERNAL);
4830 }
4831 }
4832
4833 /* Match extended Unicode sequences. We will get here only if the
4834 support is in the binary; otherwise a compile-time error occurs. */
4835
4836 else if (ctype == OP_EXTUNI)
4837 {
4838 for (fi = min;; fi++)
4839 {
4840 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4842 if (fi >= max) RRETURN(MATCH_NOMATCH);
4843 if (eptr >= md->end_subject)
4844 {
4845 SCHECK_PARTIAL();
4846 RRETURN(MATCH_NOMATCH);
4847 }
4848 GETCHARINCTEST(c, eptr);
4849 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4850 while (eptr < md->end_subject)
4851 {
4852 int len = 1;
4853 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4854 if (UCD_CATEGORY(c) != ucp_M) break;
4855 eptr += len;
4856 }
4857 }
4858 }
4859 else
4860 #endif /* SUPPORT_UCP */
4861
4862 #ifdef SUPPORT_UTF
4863 if (utf)
4864 {
4865 for (fi = min;; fi++)
4866 {
4867 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4869 if (fi >= max) RRETURN(MATCH_NOMATCH);
4870 if (eptr >= md->end_subject)
4871 {
4872 SCHECK_PARTIAL();
4873 RRETURN(MATCH_NOMATCH);
4874 }
4875 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4876 RRETURN(MATCH_NOMATCH);
4877 GETCHARINC(c, eptr);
4878 switch(ctype)
4879 {
4880 case OP_ANY: /* This is the non-NL case */
4881 case OP_ALLANY:
4882 case OP_ANYBYTE:
4883 break;
4884
4885 case OP_ANYNL:
4886 switch(c)
4887 {
4888 default: RRETURN(MATCH_NOMATCH);
4889 case 0x000d:
4890 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4891 break;
4892 case 0x000a:
4893 break;
4894
4895 case 0x000b:
4896 case 0x000c:
4897 case 0x0085:
4898 case 0x2028:
4899 case 0x2029:
4900 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4901 break;
4902 }
4903 break;
4904
4905 case OP_NOT_HSPACE:
4906 switch(c)
4907 {
4908 default: break;
4909 case 0x09: /* HT */
4910 case 0x20: /* SPACE */
4911 case 0xa0: /* NBSP */
4912 case 0x1680: /* OGHAM SPACE MARK */
4913 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4914 case 0x2000: /* EN QUAD */
4915 case 0x2001: /* EM QUAD */
4916 case 0x2002: /* EN SPACE */
4917 case 0x2003: /* EM SPACE */
4918 case 0x2004: /* THREE-PER-EM SPACE */
4919 case 0x2005: /* FOUR-PER-EM SPACE */
4920 case 0x2006: /* SIX-PER-EM SPACE */
4921 case 0x2007: /* FIGURE SPACE */
4922 case 0x2008: /* PUNCTUATION SPACE */
4923 case 0x2009: /* THIN SPACE */
4924 case 0x200A: /* HAIR SPACE */
4925 case 0x202f: /* NARROW NO-BREAK SPACE */
4926 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4927 case 0x3000: /* IDEOGRAPHIC SPACE */
4928 RRETURN(MATCH_NOMATCH);
4929 }
4930 break;
4931
4932 case OP_HSPACE:
4933 switch(c)
4934 {
4935 default: RRETURN(MATCH_NOMATCH);
4936 case 0x09: /* HT */
4937 case 0x20: /* SPACE */
4938 case 0xa0: /* NBSP */
4939 case 0x1680: /* OGHAM SPACE MARK */
4940 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4941 case 0x2000: /* EN QUAD */
4942 case 0x2001: /* EM QUAD */
4943 case 0x2002: /* EN SPACE */
4944 case 0x2003: /* EM SPACE */
4945 case 0x2004: /* THREE-PER-EM SPACE */
4946 case 0x2005: /* FOUR-PER-EM SPACE */
4947 case 0x2006: /* SIX-PER-EM SPACE */
4948 case 0x2007: /* FIGURE SPACE */
4949 case 0x2008: /* PUNCTUATION SPACE */
4950 case 0x2009: /* THIN SPACE */
4951 case 0x200A: /* HAIR SPACE */
4952 case 0x202f: /* NARROW NO-BREAK SPACE */
4953 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4954 case 0x3000: /* IDEOGRAPHIC SPACE */
4955 break;
4956 }
4957 break;
4958
4959 case OP_NOT_VSPACE:
4960 switch(c)
4961 {
4962 default: break;
4963 case 0x0a: /* LF */
4964 case 0x0b: /* VT */
4965 case 0x0c: /* FF */
4966 case 0x0d: /* CR */
4967 case 0x85: /* NEL */
4968 case 0x2028: /* LINE SEPARATOR */
4969 case 0x2029: /* PARAGRAPH SEPARATOR */
4970 RRETURN(MATCH_NOMATCH);
4971 }
4972 break;
4973
4974 case OP_VSPACE:
4975 switch(c)
4976 {
4977 default: RRETURN(MATCH_NOMATCH);
4978 case 0x0a: /* LF */
4979 case 0x0b: /* VT */
4980 case 0x0c: /* FF */
4981 case 0x0d: /* CR */
4982 case 0x85: /* NEL */
4983 case 0x2028: /* LINE SEPARATOR */
4984 case 0x2029: /* PARAGRAPH SEPARATOR */
4985 break;
4986 }
4987 break;
4988
4989 case OP_NOT_DIGIT:
4990 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4991 RRETURN(MATCH_NOMATCH);
4992 break;
4993
4994 case OP_DIGIT:
4995 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4996 RRETURN(MATCH_NOMATCH);
4997 break;
4998
4999 case OP_NOT_WHITESPACE:
5000 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5001 RRETURN(MATCH_NOMATCH);
5002 break;
5003
5004 case OP_WHITESPACE:
5005 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5006 RRETURN(MATCH_NOMATCH);
5007 break;
5008
5009 case OP_NOT_WORDCHAR:
5010 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5011 RRETURN(MATCH_NOMATCH);
5012 break;
5013
5014 case OP_WORDCHAR:
5015 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5016 RRETURN(MATCH_NOMATCH);
5017 break;
5018
5019 default:
5020 RRETURN(PCRE_ERROR_INTERNAL);
5021 }
5022 }
5023 }
5024 else
5025 #endif
5026 /* Not UTF mode */
5027 {
5028 for (fi = min;; fi++)
5029 {
5030 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5031 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5032 if (fi >= max) RRETURN(MATCH_NOMATCH);
5033 if (eptr >= md->end_subject)
5034 {
5035 SCHECK_PARTIAL();
5036 RRETURN(MATCH_NOMATCH);
5037 }
5038 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5039 RRETURN(MATCH_NOMATCH);
5040 c = *eptr++;
5041 switch(ctype)
5042 {
5043 case OP_ANY: /* This is the non-NL case */
5044 case OP_ALLANY:
5045 case OP_ANYBYTE:
5046 break;
5047
5048 case OP_ANYNL:
5049 switch(c)
5050 {
5051 default: RRETURN(MATCH_NOMATCH);
5052 case 0x000d:
5053 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5054 break;
5055
5056 case 0x000a:
5057 break;
5058
5059 case 0x000b:
5060 case 0x000c:
5061 case 0x0085:
5062 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5063 break;
5064 }
5065 break;
5066
5067 case OP_NOT_HSPACE:
5068 switch(c)
5069 {
5070 default: break;
5071 case 0x09: /* HT */
5072 case 0x20: /* SPACE */
5073 case 0xa0: /* NBSP */
5074 RRETURN(MATCH_NOMATCH);
5075 }
5076 break;
5077
5078 case OP_HSPACE:
5079 switch(c)
5080 {
5081 default: RRETURN(MATCH_NOMATCH);
5082 case 0x09: /* HT */
5083 case 0x20: /* SPACE */
5084 case 0xa0: /* NBSP */
5085 break;
5086 }
5087 break;
5088
5089 case OP_NOT_VSPACE:
5090 switch(c)
5091 {
5092 default: break;
5093 case 0x0a: /* LF */
5094 case 0x0b: /* VT */
5095 case 0x0c: /* FF */
5096 case 0x0d: /* CR */
5097 case 0x85: /* NEL */
5098 RRETURN(MATCH_NOMATCH);
5099 }
5100 break;
5101
5102 case OP_VSPACE:
5103 switch(c)
5104 {
5105 default: RRETURN(MATCH_NOMATCH);
5106 case 0x0a: /* LF */
5107 case 0x0b: /* VT */
5108 case 0x0c: /* FF */
5109 case 0x0d: /* CR */
5110 case 0x85: /* NEL */
5111 break;
5112 }
5113 break;
5114
5115 case OP_NOT_DIGIT:
5116 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5117 break;
5118
5119 case OP_DIGIT:
5120 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5121 break;
5122
5123 case OP_NOT_WHITESPACE:
5124 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5125 break;
5126
5127 case OP_WHITESPACE:
5128 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5129 break;
5130
5131 case OP_NOT_WORDCHAR:
5132 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5133 break;
5134
5135 case OP_WORDCHAR:
5136 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5137 break;
5138
5139 default:
5140 RRETURN(PCRE_ERROR_INTERNAL);
5141 }
5142 }
5143 }
5144 /* Control never gets here */
5145 }
5146
5147 /* If maximizing, it is worth using inline code for speed, doing the type
5148 test once at the start (i.e. keep it out of the loop). Again, keep the
5149 UTF-8 and UCP stuff separate. */
5150
5151 else
5152 {
5153 pp = eptr; /* Remember where we started */
5154
5155 #ifdef SUPPORT_UCP
5156 if (prop_type >= 0)
5157 {
5158 switch(prop_type)
5159 {
5160 case PT_ANY:
5161 for (i = min; i < max; i++)
5162 {
5163 int len = 1;
5164 if (eptr >= md->end_subject)
5165 {
5166 SCHECK_PARTIAL();
5167 break;
5168 }
5169 GETCHARLENTEST(c, eptr, len);
5170 if (prop_fail_result) break;
5171 eptr+= len;
5172 }
5173 break;
5174
5175 case PT_LAMP:
5176 for (i = min; i < max; i++)
5177 {
5178 int chartype;
5179 int len = 1;
5180 if (eptr >= md->end_subject)
5181 {
5182 SCHECK_PARTIAL();
5183 break;
5184 }
5185 GETCHARLENTEST(c, eptr, len);
5186 chartype = UCD_CHARTYPE(c);
5187 if ((chartype == ucp_Lu ||
5188 chartype == ucp_Ll ||
5189 chartype == ucp_Lt) == prop_fail_result)
5190 break;
5191 eptr+= len;
5192 }
5193 break;
5194
5195 case PT_GC:
5196 for (i = min; i < max; i++)
5197 {
5198 int len = 1;
5199 if (eptr >= md->end_subject)
5200 {
5201 SCHECK_PARTIAL();
5202 break;
5203 }
5204 GETCHARLENTEST(c, eptr, len);
5205 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5206 eptr+= len;
5207 }
5208 break;
5209
5210 case PT_PC:
5211 for (i = min; i < max; i++)
5212 {
5213 int len = 1;
5214 if (eptr >= md->end_subject)
5215 {
5216 SCHECK_PARTIAL();
5217 break;
5218 }
5219 GETCHARLENTEST(c, eptr, len);
5220 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5221 eptr+= len;
5222 }
5223 break;
5224
5225 case PT_SC:
5226 for (i = min; i < max; i++)
5227 {
5228 int len = 1;
5229 if (eptr >= md->end_subject)
5230 {
5231 SCHECK_PARTIAL();
5232 break;
5233 }
5234 GETCHARLENTEST(c, eptr, len);
5235 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5236 eptr+= len;
5237 }
5238 break;
5239
5240 case PT_ALNUM:
5241 for (i = min; i < max; i++)
5242 {
5243 int category;
5244 int len = 1;
5245 if (eptr >= md->end_subject)
5246 {
5247 SCHECK_PARTIAL();
5248 break;
5249 }
5250 GETCHARLENTEST(c, eptr, len);
5251 category = UCD_CATEGORY(c);
5252 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5253 break;
5254 eptr+= len;
5255 }
5256 break;
5257
5258 case PT_SPACE: /* Perl space */
5259 for (i = min; i < max; i++)
5260 {
5261 int len = 1;
5262 if (eptr >= md->end_subject)
5263 {
5264 SCHECK_PARTIAL();
5265 break;
5266 }
5267 GETCHARLENTEST(c, eptr, len);
5268 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5269 c == CHAR_FF || c == CHAR_CR)
5270 == prop_fail_result)
5271 break;
5272 eptr+= len;
5273 }
5274 break;
5275
5276 case PT_PXSPACE: /* POSIX space */
5277 for (i = min; i < max; i++)
5278 {
5279 int len = 1;
5280 if (eptr >= md->end_subject)
5281 {
5282 SCHECK_PARTIAL();
5283 break;
5284 }
5285 GETCHARLENTEST(c, eptr, len);
5286 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5287 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5288 == prop_fail_result)
5289 break;
5290 eptr+= len;
5291 }
5292 break;
5293
5294 case PT_WORD:
5295 for (i = min; i < max; i++)
5296 {
5297 int category;
5298 int len = 1;
5299 if (eptr >= md->end_subject)
5300 {
5301 SCHECK_PARTIAL();
5302 break;
5303 }
5304 GETCHARLENTEST(c, eptr, len);
5305 category = UCD_CATEGORY(c);
5306 if ((category == ucp_L || category == ucp_N ||
5307 c == CHAR_UNDERSCORE) == prop_fail_result)
5308 break;
5309 eptr+= len;
5310 }
5311 break;
5312
5313 default:
5314 RRETURN(PCRE_ERROR_INTERNAL);
5315 }
5316
5317 /* eptr is now past the end of the maximum run */
5318
5319 if (possessive) continue;
5320 for(;;)
5321 {
5322 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5323 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5324 if (eptr-- == pp) break; /* Stop if tried at original pos */
5325 if (utf) BACKCHAR(eptr);
5326 }
5327 }
5328
5329 /* Match extended Unicode sequences. We will get here only if the
5330 support is in the binary; otherwise a compile-time error occurs. */
5331
5332 else if (ctype == OP_EXTUNI)
5333 {
5334 for (i = min; i < max; i++)
5335 {
5336 int len = 1;
5337 if (eptr >= md->end_subject)
5338 {
5339 SCHECK_PARTIAL();
5340 break;
5341 }
5342 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5343 if (UCD_CATEGORY(c) == ucp_M) break;
5344 eptr += len;
5345 while (eptr < md->end_subject)
5346 {
5347 len = 1;
5348 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5349 if (UCD_CATEGORY(c) != ucp_M) break;
5350 eptr += len;
5351 }
5352 }
5353
5354 /* eptr is now past the end of the maximum run */
5355
5356 if (possessive) continue;
5357
5358 for(;;)
5359 {
5360 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5362 if (eptr-- == pp) break; /* Stop if tried at original pos */
5363 for (;;) /* Move back over one extended */
5364 {
5365 if (!utf) c = *eptr; else
5366 {
5367 BACKCHAR(eptr);
5368 GETCHAR(c, eptr);
5369 }
5370 if (UCD_CATEGORY(c) != ucp_M) break;
5371 eptr--;
5372 }
5373 }
5374 }
5375
5376 else
5377 #endif /* SUPPORT_UCP */
5378
5379 #ifdef SUPPORT_UTF
5380 if (utf)
5381 {
5382 switch(ctype)
5383 {
5384 case OP_ANY:
5385 if (max < INT_MAX)
5386 {
5387 for (i = min; i < max; i++)
5388 {
5389 if (eptr >= md->end_subject)
5390 {
5391 SCHECK_PARTIAL();
5392 break;
5393 }
5394 if (IS_NEWLINE(eptr)) break;
5395 eptr++;
5396 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5397 }
5398 }
5399
5400 /* Handle unlimited UTF-8 repeat */
5401
5402 else
5403 {
5404 for (i = min; i < max; i++)
5405 {
5406 if (eptr >= md->end_subject)
5407 {
5408 SCHECK_PARTIAL();
5409 break;
5410 }
5411 if (IS_NEWLINE(eptr)) break;
5412 eptr++;
5413 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5414 }
5415 }
5416 break;
5417
5418 case OP_ALLANY:
5419 if (max < INT_MAX)
5420 {
5421 for (i = min; i < max; i++)
5422 {
5423 if (eptr >= md->end_subject)
5424 {
5425 SCHECK_PARTIAL();
5426 break;
5427 }
5428 eptr++;
5429 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5430 }
5431 }
5432 else
5433 {
5434 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5435 SCHECK_PARTIAL();
5436 }
5437 break;
5438
5439 /* The byte case is the same as non-UTF8 */
5440
5441 case OP_ANYBYTE:
5442 c = max - min;
5443 if (c > (unsigned int)(md->end_subject - eptr))
5444 {
5445 eptr = md->end_subject;
5446 SCHECK_PARTIAL();
5447 }
5448 else eptr += c;
5449 break;
5450
5451 case OP_ANYNL:
5452 for (i = min; i < max; i++)
5453 {
5454 int len = 1;
5455 if (eptr >= md->end_subject)
5456 {
5457 SCHECK_PARTIAL();
5458 break;
5459 }
5460 GETCHARLEN(c, eptr, len);
5461 if (c == 0x000d)
5462 {
5463 if (++eptr >= md->end_subject) break;
5464 if (*eptr == 0x000a) eptr++;
5465 }
5466 else
5467 {
5468 if (c != 0x000a &&
5469 (md->bsr_anycrlf ||
5470 (c != 0x000b && c != 0x000c &&
5471 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5472 break;
5473 eptr += len;
5474 }
5475 }
5476 break;
5477
5478 case OP_NOT_HSPACE:
5479 case OP_HSPACE:
5480 for (i = min; i < max; i++)
5481 {
5482 BOOL gotspace;
5483 int len = 1;
5484 if (eptr >= md->end_subject)
5485 {
5486 SCHECK_PARTIAL();
5487 break;
5488 }
5489 GETCHARLEN(c, eptr, len);
5490 switch(c)
5491 {
5492 default: gotspace = FALSE; break;
5493 case 0x09: /* HT */
5494 case 0x20: /* SPACE */
5495 case 0xa0: /* NBSP */
5496 case 0x1680: /* OGHAM SPACE MARK */
5497 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5498 case 0x2000: /* EN QUAD */
5499 case 0x2001: /* EM QUAD */
5500 case 0x2002: /* EN SPACE */
5501 case 0x2003: /* EM SPACE */
5502 case 0x2004: /* THREE-PER-EM SPACE */
5503 case 0x2005: /* FOUR-PER-EM SPACE */
5504 case 0x2006: /* SIX-PER-EM SPACE */
5505 case 0x2007: /* FIGURE SPACE */
5506 case 0x2008: /* PUNCTUATION SPACE */
5507 case 0x2009: /* THIN SPACE */
5508 case 0x200A: /* HAIR SPACE */
5509 case 0x202f: /* NARROW NO-BREAK SPACE */
5510 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5511 case 0x3000: /* IDEOGRAPHIC SPACE */
5512 gotspace = TRUE;
5513 break;
5514 }
5515 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5516 eptr += len;
5517 }
5518 break;
5519
5520 case OP_NOT_VSPACE:
5521 case OP_VSPACE:
5522 for (i = min; i < max; i++)
5523 {
5524 BOOL gotspace;
5525 int len = 1;
5526 if (eptr >= md->end_subject)
5527 {
5528 SCHECK_PARTIAL();
5529 break;
5530 }
5531 GETCHARLEN(c, eptr, len);
5532 switch(c)
5533 {
5534 default: gotspace = FALSE; break;
5535 case 0x0a: /* LF */
5536 case 0x0b: /* VT */
5537 case 0x0c: /* FF */
5538 case 0x0d: /* CR */
5539 case 0x85: /* NEL */
5540 case 0x2028: /* LINE SEPARATOR */
5541 case 0x2029: /* PARAGRAPH SEPARATOR */
5542 gotspace = TRUE;
5543 break;
5544 }
5545 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5546 eptr += len;
5547 }
5548 break;
5549
5550 case OP_NOT_DIGIT:
5551 for (i = min; i < max; i++)
5552 {
5553 int len = 1;
5554 if (eptr >= md->end_subject)
5555 {
5556 SCHECK_PARTIAL();
5557 break;
5558 }
5559 GETCHARLEN(c, eptr, len);
5560 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5561 eptr+= len;
5562 }
5563 break;
5564
5565 case OP_DIGIT:
5566 for (i = min; i < max; i++)
5567 {
5568 int len = 1;
5569 if (eptr >= md->end_subject)
5570 {
5571 SCHECK_PARTIAL();
5572 break;
5573 }
5574 GETCHARLEN(c, eptr, len);
5575 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5576 eptr+= len;
5577 }
5578 break;
5579
5580 case OP_NOT_WHITESPACE:
5581 for (i = min; i < max; i++)
5582 {
5583 int len = 1;
5584 if (eptr >= md->end_subject)
5585 {
5586 SCHECK_PARTIAL();
5587 break;
5588 }
5589 GETCHARLEN(c, eptr, len);
5590 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5591 eptr+= len;
5592 }
5593 break;
5594
5595 case OP_WHITESPACE:
5596 for (i = min; i < max; i++)
5597 {
5598 int len = 1;
5599 if (eptr >= md->end_subject)
5600 {
5601 SCHECK_PARTIAL();
5602 break;
5603 }
5604 GETCHARLEN(c, eptr, len);
5605 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5606 eptr+= len;
5607 }
5608 break;
5609
5610 case OP_NOT_WORDCHAR:
5611 for (i = min; i < max; i++)
5612 {
5613 int len = 1;
5614 if (eptr >= md->end_subject)
5615 {
5616 SCHECK_PARTIAL();
5617 break;
5618 }
5619 GETCHARLEN(c, eptr, len);
5620 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5621 eptr+= len;
5622 }
5623 break;
5624
5625 case OP_WORDCHAR:
5626 for (i = min; i < max; i++)
5627 {
5628 int len = 1;
5629 if (eptr >= md->end_subject)
5630 {
5631 SCHECK_PARTIAL();
5632 break;
5633 }
5634 GETCHARLEN(c, eptr, len);
5635 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5636 eptr+= len;
5637 }
5638 break;
5639
5640 default:
5641 RRETURN(PCRE_ERROR_INTERNAL);
5642 }
5643
5644 /* eptr is now past the end of the maximum run. If possessive, we are
5645 done (no backing up). Otherwise, match at this position; anything other
5646 than no match is immediately returned. For nomatch, back up one
5647 character, unless we are matching \R and the last thing matched was
5648 \r\n, in which case, back up two bytes. */
5649
5650 if (possessive) continue;
5651 for(;;)
5652 {
5653 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5654 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5655 if (eptr-- == pp) break; /* Stop if tried at original pos */
5656 BACKCHAR(eptr);
5657 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5658 eptr[-1] == '\r') eptr--;
5659 }
5660 }
5661 else
5662 #endif /* SUPPORT_UTF */
5663 /* Not UTF mode */
5664 {
5665 switch(ctype)
5666 {
5667 case OP_ANY:
5668 for (i = min; i < max; i++)
5669 {
5670 if (eptr >= md->end_subject)
5671 {
5672 SCHECK_PARTIAL();
5673 break;
5674 }
5675 if (IS_NEWLINE(eptr)) break;
5676 eptr++;
5677 }
5678 break;
5679
5680 case OP_ALLANY:
5681 case OP_ANYBYTE:
5682 c = max - min;
5683 if (c > (unsigned int)(md->end_subject - eptr))
5684 {
5685 eptr = md->end_subject;
5686 SCHECK_PARTIAL();
5687 }
5688 else eptr += c;
5689 break;
5690
5691 case OP_ANYNL:
5692 for (i = min; i < max; i++)
5693 {
5694 if (eptr >= md->end_subject)
5695 {
5696 SCHECK_PARTIAL();
5697 break;
5698 }
5699 c = *eptr;
5700 if (c == 0x000d)
5701 {
5702 if (++eptr >= md->end_subject) break;
5703 if (*eptr == 0x000a) eptr++;
5704 }
5705 else
5706 {
5707 if (c != 0x000a &&
5708 (md->bsr_anycrlf ||
5709 (c != 0x000b && c != 0x000c && c != 0x0085)))
5710 break;
5711 eptr++;
5712 }
5713 }
5714 break;
5715
5716 case OP_NOT_HSPACE:
5717 for (i = min; i < max; i++)
5718 {
5719 if (eptr >= md->end_subject)
5720 {
5721 SCHECK_PARTIAL();
5722 break;
5723 }
5724 c = *eptr;
5725 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5726 eptr++;
5727 }
5728 break;
5729
5730 case OP_HSPACE:
5731 for (i = min; i < max; i++)
5732 {
5733 if (eptr >= md->end_subject)
5734 {
5735 SCHECK_PARTIAL();
5736 break;
5737 }
5738 c = *eptr;
5739 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5740 eptr++;
5741 }
5742 break;
5743
5744 case OP_NOT_VSPACE:
5745 for (i = min; i < max; i++)
5746 {
5747 if (eptr >= md->end_subject)
5748 {
5749 SCHECK_PARTIAL();
5750 break;
5751 }
5752 c = *eptr;
5753 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5754 break;
5755 eptr++;
5756 }
5757 break;
5758
5759 case OP_VSPACE:
5760 for (i = min; i < max; i++)
5761 {
5762 if (eptr >= md->end_subject)
5763 {
5764 SCHECK_PARTIAL();
5765 break;
5766 }
5767 c = *eptr;
5768 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5769 break;
5770 eptr++;
5771 }
5772 break;
5773
5774 case OP_NOT_DIGIT:
5775 for (i = min; i < max; i++)
5776 {
5777 if (eptr >= md->end_subject)
5778 {
5779 SCHECK_PARTIAL();
5780 break;
5781 }
5782 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5783 eptr++;
5784 }
5785 break;
5786
5787 case OP_DIGIT:
5788 for (i = min; i < max; i++)
5789 {
5790 if (eptr >= md->end_subject)
5791 {
5792 SCHECK_PARTIAL();
5793 break;
5794 }
5795 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5796 eptr++;
5797 }
5798 break;
5799
5800 case OP_NOT_WHITESPACE:
5801 for (i = min; i < max; i++)
5802 {
5803 if (eptr >= md->end_subject)
5804 {
5805 SCHECK_PARTIAL();
5806 break;
5807 }
5808 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5809 eptr++;
5810 }
5811 break;
5812
5813 case OP_WHITESPACE:
5814 for (i = min; i < max; i++)
5815 {
5816 if (eptr >= md->end_subject)
5817 {
5818 SCHECK_PARTIAL();
5819 break;
5820 }
5821 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5822 eptr++;
5823 }
5824 break;
5825
5826 case OP_NOT_WORDCHAR:
5827 for (i = min; i < max; i++)
5828 {
5829 if (eptr >= md->end_subject)
5830 {
5831 SCHECK_PARTIAL();
5832 break;
5833 }
5834 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5835 eptr++;
5836 }
5837 break;
5838
5839 case OP_WORDCHAR:
5840 for (i = min; i < max; i++)
5841 {
5842 if (eptr >= md->end_subject)
5843 {
5844 SCHECK_PARTIAL();
5845 break;
5846 }
5847 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
5848 eptr++;
5849 }
5850 break;
5851
5852 default:
5853 RRETURN(PCRE_ERROR_INTERNAL);
5854 }
5855
5856 /* eptr is now past the end of the maximum run. If possessive, we are
5857 done (no backing up). Otherwise, match at this position; anything other
5858 than no match is immediately returned. For nomatch, back up one
5859 character (byte), unless we are matching \R and the last thing matched
5860 was \r\n, in which case, back up two bytes. */
5861
5862 if (possessive) continue;
5863 while (eptr >= pp)
5864 {
5865 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5866 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5867 eptr--;
5868 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5869 eptr[-1] == '\r') eptr--;
5870 }
5871 }
5872
5873 /* Get here if we can't make it match with any permitted repetitions */
5874
5875 RRETURN(MATCH_NOMATCH);
5876 }
5877 /* Control never gets here */
5878
5879 /* There's been some horrible disaster. Arrival here can only mean there is
5880 something seriously wrong in the code above or the OP_xxx definitions. */
5881
5882 default:
5883 DPRINTF(("Unknown opcode %d\n", *ecode));
5884 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5885 }
5886
5887 /* Do not stick any code in here without much thought; it is assumed
5888 that "continue" in the code above comes out to here to repeat the main
5889 loop. */
5890
5891 } /* End of main loop */
5892 /* Control never reaches here */
5893
5894
5895 /* When compiling to use the heap rather than the stack for recursive calls to
5896 match(), the RRETURN() macro jumps here. The number that is saved in
5897 frame->Xwhere indicates which label we actually want to return to. */
5898
5899 #ifdef NO_RECURSE
5900 #define LBL(val) case val: goto L_RM##val;
5901 HEAP_RETURN:
5902 switch (frame->Xwhere)
5903 {
5904 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5905 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5906 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5907 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5908 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5909 LBL(65) LBL(66)
5910 #ifdef SUPPORT_UTF
5911 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5912 LBL(32) LBL(34) LBL(42) LBL(46)
5913 #ifdef SUPPORT_UCP
5914 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5915 LBL(59) LBL(60) LBL(61) LBL(62)
5916 #endif /* SUPPORT_UCP */
5917 #endif /* SUPPORT_UTF */
5918 default:
5919 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5920 return PCRE_ERROR_INTERNAL;
5921 }
5922 #undef LBL
5923 #endif /* NO_RECURSE */
5924 }
5925
5926
5927 /***************************************************************************
5928 ****************************************************************************
5929 RECURSION IN THE match() FUNCTION
5930
5931 Undefine all the macros that were defined above to handle this. */
5932
5933 #ifdef NO_RECURSE /* The names below are macros only in the NO_RECURSE build */
5934 #undef eptr
5935 #undef ecode
5936 #undef mstart
5937 #undef offset_top
5938 #undef eptrb
5939 #undef flags
5940 /* Without these #undefs, identifiers such as length, flags and pp used in pcre_exec() below would be macro-expanded */
5941 #undef callpat
5942 #undef charptr
5943 #undef data
5944 #undef next
5945 #undef pp
5946 #undef prev
5947 #undef saved_eptr
5948 /* NOTE(review): the blank-line grouping presumably mirrors the matching #define groups earlier in the file - confirm before reordering */
5949 #undef new_recursive
5950
5951 #undef cur_is_word
5952 #undef condition
5953 #undef prev_is_word
5954
5955 #undef ctype
5956 #undef length
5957 #undef max
5958 #undef min
5959 #undef number
5960 #undef offset
5961 #undef op
5962 #undef save_capture_last
5963 #undef save_offset1
5964 #undef save_offset2
5965 #undef save_offset3
5966 #undef stacksave
5967
5968 #undef newptrb
5969
5970 #endif /* NO_RECURSE */
5971
5972 /* These two are defined as macros in both cases, so they are undefined unconditionally */
5973
5974 #undef fc
5975 #undef fi
5976
5977 /***************************************************************************
5978 ***************************************************************************/
5979
5980
5981
5982 /*************************************************
5983 * Execute a Regular Expression *
5984 *************************************************/
5985
5986 /* This function applies a compiled re to a subject string and picks out
5987 portions of the string if it matches. Two elements in the vector are set for
5988 each substring: the offsets to the start and end of the substring.
5989
5990 Arguments:
5991 argument_re points to the compiled expression
5992 extra_data points to extra data or is NULL
5993 subject points to the subject string
5994 length length of subject string (may contain binary zeros)
5995 start_offset where to start in the subject string
5996 options option bits
5997 offsets points to a vector of ints to be filled in with offsets
5998 offsetcount the number of elements in the vector
5999
6000 Returns: > 0 => success; value is the number of elements filled in
6001 = 0 => success, but offsets is not big enough
6002 -1 => failed to match
6003 < -1 => some kind of unexpected problem
6004 */
6005
6006 #ifdef COMPILE_PCRE8
6007 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6008 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6009 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6010 int offsetcount)
6011 #else
6012 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6013 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
6014 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6015 int offsetcount)
6016 #endif
6017 {
6018 int rc, ocount, arg_offset_max;
6019 int newline;
6020 BOOL using_temporary_offsets = FALSE;
6021 BOOL anchored;
6022 BOOL startline;
6023 BOOL firstline;
6024 BOOL utf;
6025 BOOL has_first_char = FALSE;
6026 BOOL has_req_char = FALSE;
6027 pcre_uchar first_char = 0;
6028 pcre_uchar first_char2 = 0;
6029 pcre_uchar req_char = 0;
6030 pcre_uchar req_char2 = 0;
6031 match_data match_block;
6032 match_data *md = &match_block;
6033 const pcre_uint8 *tables;
6034 const pcre_uint8 *start_bits = NULL;
6035 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6036 PCRE_PUCHAR end_subject;
6037 PCRE_PUCHAR start_partial = NULL;
6038 PCRE_PUCHAR req_char_ptr = start_match - 1;
6039
6040 pcre_study_data internal_study;
6041 const pcre_study_data *study;
6042
6043 real_pcre internal_re;
6044 const real_pcre *external_re = (const real_pcre *)argument_re;
6045 const real_pcre *re = external_re;
6046
6047 /* Plausibility checks */
6048
6049 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6050 if (re == NULL || subject == NULL ||
6051 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6052 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6053 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6054
6055 /* These two settings are used in the code for checking a UTF-8 string that
6056 follows immediately afterwards. Other values in the md block are used only
6057 during "normal" pcre_exec() processing, not when the JIT support is in use,
6058 so they are set up later. */
6059
6060 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6061 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6062 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6063 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6064
6065 /* Check a UTF-8 string if required. Pass back the character offset and error
6066 code for an invalid string if a results vector is available. */
6067
6068 #ifdef SUPPORT_UTF
6069 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6070 {
6071 int erroroffset;
6072 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6073 if (errorcode != 0)
6074 {
6075 if (offsetcount >= 2)
6076 {
6077 offsets[0] = erroroffset;
6078 offsets[1] = errorcode;
6079 }
6080 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6081 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6082 }
6083
6084 /* Check that a start_offset points to the start of a UTF character. */
6085 if (start_offset > 0 && start_offset < length &&
6086 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6087 return PCRE_ERROR_BADUTF8_OFFSET;
6088 }
6089 #endif
6090
6091 /* If the pattern was successfully studied with JIT support, run the JIT
6092 executable instead of the rest of this function. Most options must be set at
6093 compile time for the JIT code to be usable. Fall back to the normal code path if
6094 an unsupported flag is set. In particular, JIT does not support partial
6095 matching. */
6096
6097 #ifdef SUPPORT_JIT
6098 if (extra_data != NULL
6099 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6100 && extra_data->executable_jit != NULL
6101 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6102 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6103 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6104 return PRIV(jit_exec)(re, extra_data->executable_jit,
6105 (const pcre_uchar *)subject, length, start_offset, options,
6106 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6107 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6108 #endif
6109
6110 /* Carry on with non-JIT matching. This information is for finding all the
6111 numbers associated with a given name, for condition testing. */
6112
6113 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6114 md->name_count = re->name_count;
6115 md->name_entry_size = re->name_entry_size;
6116
6117 /* Fish out the optional data from the extra_data structure, first setting
6118 the default values. */
6119
6120 study = NULL;
6121 md->match_limit = MATCH_LIMIT;
6122 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6123 md->callout_data = NULL;
6124
6125 /* The table pointer is always in native byte order. */
6126
6127 tables = external_re->tables;
6128
6129 if (extra_data != NULL)
6130 {
6131 register unsigned int flags = extra_data->flags;
6132 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6133 study = (const pcre_study_data *)extra_data->study_data;
6134 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6135 md->match_limit = extra_data->match_limit;
6136 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6137 md->match_limit_recursion = extra_data->match_limit_recursion;
6138 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6139 md->callout_data = extra_data->callout_data;
6140 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6141 }
6142
6143 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6144 is a feature that makes it possible to save compiled regexes and re-use them
6145 in other programs later. */
6146
6147 if (tables == NULL) tables = PRIV(default_tables);
6148
6149 /* Check that the first field in the block is the magic number. If it is not,
6150 test for a regex that was compiled on a host of opposite endianness. If this is
6151 the case, flipped values are put in internal_re and internal_study if there was
6152 study data too. */
6153
6154 if (re->magic_number != MAGIC_NUMBER)
6155 {
6156 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
6157 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6158 if (study != NULL) study = &internal_study;
6159 }
6160 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6161
6162 /* Set up other data */
6163
6164 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6165 startline = (re->flags & PCRE_STARTLINE) != 0;
6166 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6167
6168 /* The code starts after the real_pcre block and the capture name table. */
6169
6170 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6171 re->name_count * re->name_entry_size;
6172
6173 md->start_subject = (PCRE_PUCHAR)subject;
6174 md->start_offset = start_offset;
6175 md->end_subject = md->start_subject + length;
6176 end_subject = md->end_subject;
6177
6178 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6179 md->use_ucp = (re->options & PCRE_UCP) != 0;
6180 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6181 md->ignore_skip_arg = FALSE;
6182
6183 /* Some options are unpacked into BOOL variables in the hope that testing
6184 them will be faster than individual option bits. */
6185
6186 md->notbol = (options & PCRE_NOTBOL) != 0;
6187 md->noteol = (options & PCRE_NOTEOL) != 0;
6188 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6189 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6190
6191 md->hitend = FALSE;
6192 md->mark = md->nomatch_mark = NULL; /* In case never set */
6193
6194 md->recursive = NULL; /* No recursion at top level */
6195 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6196
6197 md->lcc = tables + lcc_offset;
6198 md->fcc = tables + fcc_offset;
6199 md->ctypes = tables + ctypes_offset;
6200
6201 /* Handle different \R options. */
6202
6203 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6204 {
6205 case 0:
6206 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6207 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6208 else
6209 #ifdef BSR_ANYCRLF
6210 md->bsr_anycrlf = TRUE;
6211 #else
6212 md->bsr_anycrlf = FALSE;
6213 #endif
6214 break;
6215
6216 case PCRE_BSR_ANYCRLF:
6217 md->bsr_anycrlf = TRUE;
6218 break;
6219
6220 case PCRE_BSR_UNICODE:
6221 md->bsr_anycrlf = FALSE;
6222 break;
6223
6224 default: return PCRE_ERROR_BADNEWLINE;
6225 }
6226
6227 /* Handle different types of newline. The three bits give eight cases. If
6228 nothing is set at run time, whatever was used at compile time applies. */
6229
6230 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6231 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6232 {
6233 case 0: newline = NEWLINE; break; /* Compile-time default */
6234 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6235 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6236 case PCRE_NEWLINE_CR+
6237 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6238 case PCRE_NEWLINE_ANY: newline = -1; break;
6239 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6240 default: return PCRE_ERROR_BADNEWLINE;
6241 }
6242
6243 if (newline == -2)
6244 {
6245 md->nltype = NLTYPE_ANYCRLF;
6246 }
6247 else if (newline < 0)
6248 {
6249 md->nltype = NLTYPE_ANY;
6250 }
6251 else
6252 {
6253 md->nltype = NLTYPE_FIXED;
6254 if (newline > 255)
6255 {
6256 md->nllen = 2;
6257 md->nl[0] = (newline >> 8) & 255;
6258 md->nl[1] = newline & 255;
6259 }
6260 else
6261 {
6262 md->nllen = 1;
6263 md->nl[0] = newline;
6264 }
6265 }
6266
6267 /* Partial matching was originally supported only for a restricted set of
6268 regexes; from release 8.00 there are no restrictions, but the bits are still
6269 defined (though never set). So there's no harm in leaving this code. */
6270
6271 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6272 return PCRE_ERROR_BADPARTIAL;
6273
6274 /* If the expression has got more back references than the offsets supplied can
6275 hold, we get a temporary chunk of working store to use during the matching.
6276 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6277 of 3. */
6278
6279 ocount = offsetcount - (offsetcount % 3);
6280 arg_offset_max = (2*ocount)/3;
6281
6282 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6283 {
6284 ocount = re->top_backref * 3 + 3;
6285 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6286 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6287 using_temporary_offsets = TRUE;
6288 DPRINTF(("Got memory to hold back references\n"));
6289 }
6290 else md->offset_vector = offsets;
6291
6292 md->offset_end = ocount;
6293 md->offset_max = (2*ocount)/3;
6294 md->offset_overflow = FALSE;
6295 md->capture_last = -1;
6296
6297 /* Reset the working variable associated with each extraction. These should
6298 never be used unless previously set, but they get saved and restored, and so we
6299 initialize them to avoid reading uninitialized locations. Also, unset the
6300 offsets for the matched string. This is really just for tidiness with callouts,
6301 in case they inspect these fields. */
6302
6303 if (md->offset_vector != NULL)
6304 {
6305 register int *iptr = md->offset_vector + ocount;
6306 register int *iend = iptr - re->top_bracket;
6307 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6308 while (--iptr >= iend) *iptr = -1;
6309 md->offset_vector[0] = md->offset_vector[1] = -1;
6310 }
6311
6312 /* Set up the first character to match, if available. The first_char value is
6313 never set for an anchored regular expression, but the anchoring may be forced
6314 at run time, so we have to test for anchoring. The first char may be unset for
6315 an unanchored pattern, of course. If there's no first char and the pattern was
6316 studied, there may be a bitmap of possible first characters. */
6317
6318 if (!anchored)
6319 {
6320 if ((re->flags & PCRE_FIRSTSET) != 0)
6321 {
6322 has_first_char = TRUE;
6323 first_char = first_char2 = re->first_char;
6324 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6325 {
6326 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6327 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6328 if (utf && first_char > 127)
6329 first_char2 = UCD_OTHERCASE(first_char);
6330 #endif
6331 }
6332 }
6333 else
6334 if (!startline && study != NULL &&
6335 (study->flags & PCRE_STUDY_MAPPED) != 0)
6336 start_bits = study->start_bits;
6337 }
6338
6339 /* For anchored or unanchored matches, there may be a "last known required
6340 character" set. */
6341
6342 if ((re->flags & PCRE_REQCHSET) != 0)
6343 {
6344 has_req_char = TRUE;
6345 req_char = req_char2 = re->req_char;
6346 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6347 {
6348 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6349 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6350 if (utf && req_char > 127)
6351 req_char2 = UCD_OTHERCASE(req_char);
6352 #endif
6353 }
6354 }
6355
6356
6357 /* ==========================================================================*/
6358
6359 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6360 the loop runs just once. */
6361
6362 for(;;)
6363 {
6364 PCRE_PUCHAR save_end_subject = end_subject;
6365 PCRE_PUCHAR new_start_match;
6366
6367 /* If firstline is TRUE, the start of the match is constrained to the first
6368 line of a multiline string. That is, the match must be before or at the first
6369 newline. Implement this by temporarily adjusting end_subject so that we stop
6370 scanning at a newline. If the match fails at the newline, later code breaks
6371 this loop. */
6372
6373 if (firstline)
6374 {
6375 PCRE_PUCHAR t = start_match;
6376 #ifdef SUPPORT_UTF
6377 if (utf)
6378 {
6379 while (t < md->end_subject && !IS_NEWLINE(t))
6380 {
6381 t++;
6382 ACROSSCHAR(t < end_subject, *t, t++);
6383 }
6384 }
6385 else
6386 #endif
6387 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6388 end_subject = t;
6389 }
6390
6391 /* There are some optimizations that avoid running the match if a known
6392 starting point is not found, or if a known later character is not present.
6393 However, there is an option that disables these, for testing and for ensuring
6394 that all callouts do actually occur. The option can be set in the regex by
6395 (*NO_START_OPT) or passed in match-time options. */
6396
6397 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6398 {
6399 /* Advance to a unique first char if there is one. */
6400
6401 if (has_first_char)
6402 {
6403 if (first_char != first_char2)
6404 while (start_match < end_subject &&
6405 *start_match != first_char && *start_match != first_char2)
6406 start_match++;
6407 else
6408 while (start_match < end_subject && *start_match != first_char)
6409 start_match++;
6410 }
6411
6412 /* Or to just after a linebreak for a multiline match */
6413
6414 else if (startline)
6415 {
6416 if (start_match > md->start_subject + start_offset)
6417 {
6418 #ifdef SUPPORT_UTF
6419 if (utf)
6420 {
6421 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6422 {
6423 start_match++;
6424 ACROSSCHAR(start_match < end_subject, *start_match,
6425 start_match++);
6426 }
6427 }
6428 else
6429 #endif
6430 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6431 start_match++;
6432
6433 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6434 and we are now at a LF, advance the match position by one more character.
6435 */
6436
6437 if (start_match[-1] == CHAR_CR &&
6438 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6439 start_match < end_subject &&
6440 *start_match == CHAR_NL)
6441 start_match++;
6442 }
6443 }
6444
6445 /* Or to a non-unique first byte after study */
6446
6447 else if (start_bits != NULL)
6448 {
6449 while (start_match < end_subject)
6450 {
6451 register unsigned int c = *start_match;
6452 #ifndef COMPILE_PCRE8
6453 if (c > 255) c = 255;
6454 #endif
6455 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6456 {
6457 start_match++;
6458 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6459 /* In non 8-bit mode, the iteration will stop for
6460 characters > 255 at the beginning or not stop at all. */
6461 if (utf)
6462 ACROSSCHAR(start_match < end_subject, *start_match,
6463 start_match++);
6464 #endif
6465 }
6466 else break;
6467 }
6468 }
6469 } /* Starting optimizations */
6470
6471 /* Restore fudged end_subject */
6472
6473 end_subject = save_end_subject;
6474
6475 /* The following two optimizations are disabled for partial matching or if
6476 disabling is explicitly requested. */
6477
6478 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6479 {
6480 /* If the pattern was studied, a minimum subject length may be set. This is
6481 a lower bound; no actual string of that length may actually match the
6482 pattern. Although the value is, strictly, in characters, we treat it as
6483 bytes to avoid spending too much time in this optimization. */
6484
6485 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6486 (pcre_uint32)(end_subject - start_match) < study->minlength)
6487 {
6488 rc = MATCH_NOMATCH;
6489 break;
6490 }
6491
6492 /* If req_char is set, we know that that character must appear in the
6493 subject for the match to succeed. If the first character is set, req_char
6494 must be later in the subject; otherwise the test starts at the match point.
6495 This optimization can save a huge amount of backtracking in patterns with
6496 nested unlimited repeats that aren't going to match. Writing separate code
6497 for cased/caseless versions makes it go faster, as does using an
6498 autoincrement and backing off on a match.
6499
6500 HOWEVER: when the subject string is very, very long, searching to its end
6501 can take a long time, and give bad performance on quite ordinary patterns.
6502 This showed up when somebody was matching something like /^\d+C/ on a
6503 32-megabyte string... so we don't do this when the string is sufficiently
6504 long. */
6505
6506 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6507 {
6508 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6509
6510 /* We don't need to repeat the search if we haven't yet reached the
6511 place we found it at last time. */
6512
6513 if (p > req_char_ptr)
6514 {
6515 if (req_char != req_char2)
6516 {
6517 while (p < end_subject)
6518 {
6519 register int pp = *p++;
6520 if (pp == req_char || pp == req_char2) { p--; break; }
6521 }
6522 }
6523 else
6524 {
6525 while (p < end_subject)
6526 {
6527 if (*p++ == req_char) { p--; break; }
6528 }
6529 }
6530
6531 /* If we can't find the required character, break the matching loop,
6532 forcing a match failure. */
6533
6534 if (p >= end_subject)
6535 {
6536 rc = MATCH_NOMATCH;
6537 break;
6538 }
6539
6540 /* If we have found the required character, save the point where we
6541 found it, so that we don't search again next time round the loop if
6542 the start hasn't passed this character yet. */
6543
6544 req_char_ptr = p;
6545 }
6546 }
6547 }
6548
6549 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6550 printf(">>>> Match against: ");
6551 pchars(start_match, end_subject - start_match, TRUE, md);
6552 printf("\n");
6553 #endif
6554
6555 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6556 first starting point for which a partial match was found. */
6557
6558 md->start_match_ptr = start_match;
6559 md->start_used_ptr = start_match;
6560 md->match_call_count = 0;
6561 md->match_function_type = 0;
6562 md->end_offset_top = 0;
6563 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6564 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6565
6566 switch(rc)
6567 {
6568 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6569 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6570 entirely. The only way we can do that is to re-do the match at the same
6571 point, with a flag to force SKIP with an argument to be ignored. Just
6572 treating this case as NOMATCH does not work because it does not check other
6573 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6574
6575 case MATCH_SKIP_ARG:
6576 new_start_match = start_match;
6577 md->ignore_skip_arg = TRUE;
6578 break;
6579
6580 /* SKIP passes back the next starting point explicitly, but if it is the
6581 same as the match we have just done, treat it as NOMATCH. */
6582
6583 case MATCH_SKIP:
6584 if (md->start_match_ptr != start_match)
6585 {
6586 new_start_match = md->start_match_ptr;
6587 break;
6588 }
6589 /* Fall through */
6590
6591 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6592 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6593
6594 case MATCH_NOMATCH:
6595 case MATCH_PRUNE:
6596 case MATCH_THEN:
6597 md->ignore_skip_arg = FALSE;
6598 new_start_match = start_match + 1;
6599 #ifdef SUPPORT_UTF
6600 if (utf)
6601 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6602 new_start_match++);
6603 #endif
6604 break;
6605
6606 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6607
6608 case MATCH_COMMIT:
6609 rc = MATCH_NOMATCH;
6610 goto ENDLOOP;
6611
6612 /* Any other return is either a match, or some kind of error. */
6613
6614 default:
6615 goto ENDLOOP;
6616 }
6617
6618 /* Control reaches here for the various types of "no match at this point"
6619 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6620
6621 rc = MATCH_NOMATCH;
6622
6623 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6624 newline in the subject (though it may continue over the newline). Therefore,
6625 if we have just failed to match, starting at a newline, do not continue. */
6626
6627 if (firstline && IS_NEWLINE(start_match)) break;
6628
6629 /* Advance to new matching position */
6630
6631 start_match = new_start_match;
6632
6633 /* Break the loop if the pattern is anchored or if we have passed the end of
6634 the subject. */
6635
6636 if (anchored || start_match > end_subject) break;
6637
6638 /* If we have just passed a CR and we are now at a LF, and the pattern does
6639 not contain any explicit matches for \r or \n, and the newline option is CRLF
6640 or ANY or ANYCRLF, advance the match position by one more character. */
6641
6642 if (start_match[-1] == CHAR_CR &&
6643 start_match < end_subject &&
6644 *start_match == CHAR_NL &&
6645 (re->flags & PCRE_HASCRORLF) == 0 &&
6646 (md->nltype == NLTYPE_ANY ||
6647 md->nltype == NLTYPE_ANYCRLF ||
6648 md->nllen == 2))
6649 start_match++;
6650
6651 md->mark = NULL; /* Reset for start of next match attempt */
6652 } /* End of for(;;) "bumpalong" loop */
6653
6654 /* ==========================================================================*/
6655
6656 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6657 conditions is true:
6658
6659 (1) The pattern is anchored or the match was failed by (*COMMIT);
6660
6661 (2) We are past the end of the subject;
6662
6663 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6664 this option requests that a match occur at or before the first newline in
6665 the subject.
6666
6667 When we have a match and the offset vector is big enough to deal with any
6668 backreferences, captured substring offsets will already be set up. In the case
6669 where we had to get some local store to hold offsets for backreference
6670 processing, copy those that we can. In this case there need not be overflow if
6671 certain parts of the pattern were not used, even though there are more
6672 capturing parentheses than vector slots. */
6673
6674 ENDLOOP:
6675
6676 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6677 {
6678 if (using_temporary_offsets)
6679 {
6680 if (arg_offset_max >= 4)
6681 {
6682 memcpy(offsets + 2, md->offset_vector + 2,
6683 (arg_offset_max - 2) * sizeof(int));
6684 DPRINTF(("Copied offsets from temporary memory\n"));
6685 }
6686 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6687 DPRINTF(("Freeing temporary memory\n"));
6688 (PUBL(free))(md->offset_vector);
6689 }
6690
6691 /* Set the return code to the number of captured strings, or 0 if there were
6692 too many to fit into the vector. */
6693
6694 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6695 0 : md->end_offset_top/2;
6696
6697 /* If there is space in the offset vector, set any unused pairs at the end of
6698 the pattern to -1 for backwards compatibility. It is documented that this
6699 happens. In earlier versions, the whole set of potential capturing offsets
6700 was set to -1 each time round the loop, but this is handled differently now.
6701 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6702 those at the end that need unsetting here. We can't just unset them all at
6703 the start of the whole thing because they may get set in one branch that is
6704 not the final matching branch. */
6705
6706 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6707 {
6708 register int *iptr, *iend;
6709 int resetcount = 2 + re->top_bracket * 2;
6710 if (resetcount > offsetcount) resetcount = ocount;
6711 iptr = offsets + md->end_offset_top;
6712 iend = offsets + resetcount;
6713 while (iptr < iend) *iptr++ = -1;
6714 }
6715
6716 /* If there is space, set up the whole thing as substring 0. The value of
6717 md->start_match_ptr might be modified if \K was encountered on the successful
6718 matching path. */
6719
6720 if (offsetcount < 2) rc = 0; else
6721 {
6722 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6723 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6724 }
6725
6726 /* Return MARK data if requested */
6727
6728 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6729 *(extra_data->mark) = (unsigned char *)(md->mark);
6730 DPRINTF((">>>> returning %d\n", rc));
6731 return rc;
6732 }
6733
6734 /* Control gets here if there has been an error, or if the overall match
6735 attempt has failed at all permitted starting positions. */
6736
6737 if (using_temporary_offsets)
6738 {
6739 DPRINTF(("Freeing temporary memory\n"));
6740 (PUBL(free))(md->offset_vector);
6741 }
6742
6743 /* For anything other than nomatch or partial match, just return the code. */
6744
6745 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6746 {
6747