/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 852 - (show annotations)
Thu Jan 5 19:18:12 2012 UTC (3 years, 6 months ago) by zherczeg
File MIME type: text/plain
File size: 211468 byte(s)
Error occurred while calculating annotation data.
Add pcre16 prefix to 16 bit structs
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998)
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996)
80 #define MATCH_PRUNE (-995)
81 #define MATCH_SKIP (-994)
82 #define MATCH_SKIP_ARG (-993)
83 #define MATCH_THEN (-992)
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 {
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 if (eptr + length > md->end_subject) return -1;
221 while (length-- > 0) if (*p++ != *eptr++) return -1;
222 }
223
224 return (int)(eptr - eptr_start);
225 }
226
227
228
229 /***************************************************************************
230 ****************************************************************************
231 RECURSION IN THE match() FUNCTION
232
233 The match() function is highly recursive, though not every recursive call
234 increases the recursive depth. Nevertheless, some regular expressions can cause
235 it to recurse to a great depth. I was writing for Unix, so I just let it call
236 itself recursively. This uses the stack for saving everything that has to be
237 saved for a recursive call. On Unix, the stack can be large, and this works
238 fine.
239
240 It turns out that on some non-Unix-like systems there are problems with
241 programs that use a lot of stack. (This despite the fact that every last chip
242 has oodles of memory these days, and techniques for extending the stack have
243 been known for decades.) So....
244
245 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246 calls by keeping local variables that need to be preserved in blocks of memory
247 obtained from malloc() instead of on the stack. Macros are used to
248 achieve this so that the actual code doesn't look very different to what it
249 always used to.
250
251 The original heap-recursive code used longjmp(). However, it seems that this
252 can be very slow on some operating systems. Following a suggestion from Stan
253 Switzer, the use of longjmp() has been abolished, at the cost of having to
254 provide a unique number for each call to RMATCH. There is no way of generating
255 a sequence of numbers at compile time in C. I have given them names, to make
256 them stand out more clearly.
257
258 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 tests. Furthermore, not using longjmp() means that local dynamic variables
261 don't have indeterminate values; this has meant that the frame size can be
262 reduced because the result can be "passed back" by straight setting of the
263 variable instead of being passed in the frame.
264 ****************************************************************************
265 ***************************************************************************/
266
267 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268 below must be updated in sync. */
269
270 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
271 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
272 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
273 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
274 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
275 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
276 RM61, RM62, RM63, RM64, RM65, RM66 };
277
278 /* These versions of the macros use the stack, as normal. There are debugging
279 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 actually used in this definition. */
281
282 #ifndef NO_RECURSE
283 #define REGISTER register
284
285 #ifdef PCRE_DEBUG
286 #define RMATCH(ra,rb,rc,rd,re,rw) \
287 { \
288 printf("match() called in line %d\n", __LINE__); \
289 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
290 printf("to line %d\n", __LINE__); \
291 }
292 #define RRETURN(ra) \
293 { \
294 printf("match() returned %d from line %d ", ra, __LINE__); \
295 return ra; \
296 }
297 #else
298 #define RMATCH(ra,rb,rc,rd,re,rw) \
299 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
300 #define RRETURN(ra) return ra
301 #endif
302
303 #else
304
305
306 /* These versions of the macros manage a private stack on the heap. Note that
307 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308 argument of match(), which never changes. */
309
/* In heap mode the "register" hint is dropped from match()'s parameters. */

#define REGISTER

/* RMATCH: simulate a recursive call of match(). A new heapframe is obtained,
the resume point "rw" is recorded in the CURRENT frame, the new frame is
loaded with the would-be arguments (ra = eptr, rb = ecode, rc = offset_top,
re = eptrb; mstart is copied from the current frame) and linked to the current
one, and control jumps to HEAP_RECURSE. The label L_<rw> is where execution
continues after the simulated call returns; every use must therefore pass a
distinct rw. On allocation failure the current frame is unwound with
PCRE_ERROR_NOMEMORY. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN: simulate a return from match(). The current frame is released and
the previous one becomes current; if there is one, the result is handed back
in rrc and control goes to HEAP_RETURN, otherwise this is the top level and
the function really returns.

match() also invokes RRETURN when the allocation of the top-level frame has
failed, i.e. with frame == NULL. The frame is therefore only dereferenced and
freed when it exists, so that case returns the error code instead of reading
through a null pointer. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (PUBL(stack_free))(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
343
344
345 /* Structure for remembering the local variables in a private frame */
346
347 typedef struct heapframe {
348 struct heapframe *Xprevframe;
349
350 /* Function arguments that may change */
351
352 PCRE_PUCHAR Xeptr;
353 const pcre_uchar *Xecode;
354 PCRE_PUCHAR Xmstart;
355 int Xoffset_top;
356 eptrblock *Xeptrb;
357 unsigned int Xrdepth;
358
359 /* Function local variables */
360
361 PCRE_PUCHAR Xcallpat;
362 #ifdef SUPPORT_UTF
363 PCRE_PUCHAR Xcharptr;
364 #endif
365 PCRE_PUCHAR Xdata;
366 PCRE_PUCHAR Xnext;
367 PCRE_PUCHAR Xpp;
368 PCRE_PUCHAR Xprev;
369 PCRE_PUCHAR Xsaved_eptr;
370
371 recursion_info Xnew_recursive;
372
373 BOOL Xcur_is_word;
374 BOOL Xcondition;
375 BOOL Xprev_is_word;
376
377 #ifdef SUPPORT_UCP
378 int Xprop_type;
379 int Xprop_value;
380 int Xprop_fail_result;
381 int Xoclength;
382 pcre_uchar Xocchars[6];
383 #endif
384
385 int Xcodelink;
386 int Xctype;
387 unsigned int Xfc;
388 int Xfi;
389 int Xlength;
390 int Xmax;
391 int Xmin;
392 int Xnumber;
393 int Xoffset;
394 int Xop;
395 int Xsave_capture_last;
396 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397 int Xstacksave[REC_STACK_SAVE_MAX];
398
399 eptrblock Xnewptrb;
400
401 /* Where to jump back to */
402
403 int Xwhere;
404
405 } heapframe;
406
407 #endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414
415 /*************************************************
416 * Match from current position *
417 *************************************************/
418
419 /* This function is called recursively in many circumstances. Whenever it
420 returns a negative (error) response, the outer incarnation must also return the
421 same response. */
422
423 /* These macros pack up tests that are used for partial matching, and which
424 appear several times in the code. We set the "hit end" flag if the pointer is
425 at the end of the subject and also past the start of the subject (i.e.
426 something has been matched). For hard partial matching, we then return
427 immediately. The second one is used when we already know we are past the end of
428 the subject. */
429
430 #define CHECK_PARTIAL()\
431 if (md->partial != 0 && eptr >= md->end_subject && \
432 eptr > md->start_used_ptr) \
433 { \
434 md->hitend = TRUE; \
435 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
436 }
437
438 #define SCHECK_PARTIAL()\
439 if (md->partial != 0 && eptr > md->start_used_ptr) \
440 { \
441 md->hitend = TRUE; \
442 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
443 }
444
445
446 /* Performance note: It might be tempting to extract commonly used fields from
447 the md structure (e.g. utf, end_subject) into individual variables to improve
448 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449 made performance worse.
450
451 Arguments:
452 eptr pointer to current character in subject
453 ecode pointer to current position in compiled code
454 mstart pointer to the current match start position (can be modified
455 by encountering \K)
456 offset_top current top pointer
457 md pointer to "static" info for the match
458 eptrb pointer to chain of blocks containing eptr at start of
459 brackets - for testing for empty matches
460 rdepth the recursion depth
461
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
467 */
468
469 static int
470 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 unsigned int rdepth)
473 {
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
477
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf; /* Local copy of UTF flag for speed */
482
483 BOOL minimize, possessive; /* Quantifier options */
484 BOOL caseless;
485 int condcode;
486
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame" which is obtained from
489 heap storage. Set up the top-level frame here; others are obtained from the
490 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491
492 #ifdef NO_RECURSE
493 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
494 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 frame->Xprevframe = NULL; /* Marks the top level */
496
497 /* Copy in the original argument variables */
498
499 frame->Xeptr = eptr;
500 frame->Xecode = ecode;
501 frame->Xmstart = mstart;
502 frame->Xoffset_top = offset_top;
503 frame->Xeptrb = eptrb;
504 frame->Xrdepth = rdepth;
505
506 /* This is where control jumps back to in order to effect "recursion" */
507
508 HEAP_RECURSE:
509
510 /* Macros make the argument variables come from the current frame */
511
512 #define eptr frame->Xeptr
513 #define ecode frame->Xecode
514 #define mstart frame->Xmstart
515 #define offset_top frame->Xoffset_top
516 #define eptrb frame->Xeptrb
517 #define rdepth frame->Xrdepth
518
519 /* Ditto for the local variables */
520
521 #ifdef SUPPORT_UTF
522 #define charptr frame->Xcharptr
523 #endif
524 #define callpat frame->Xcallpat
525 #define codelink frame->Xcodelink
526 #define data frame->Xdata
527 #define next frame->Xnext
528 #define pp frame->Xpp
529 #define prev frame->Xprev
530 #define saved_eptr frame->Xsaved_eptr
531
532 #define new_recursive frame->Xnew_recursive
533
534 #define cur_is_word frame->Xcur_is_word
535 #define condition frame->Xcondition
536 #define prev_is_word frame->Xprev_is_word
537
538 #ifdef SUPPORT_UCP
539 #define prop_type frame->Xprop_type
540 #define prop_value frame->Xprop_value
541 #define prop_fail_result frame->Xprop_fail_result
542 #define oclength frame->Xoclength
543 #define occhars frame->Xocchars
544 #endif
545
546 #define ctype frame->Xctype
547 #define fc frame->Xfc
548 #define fi frame->Xfi
549 #define length frame->Xlength
550 #define max frame->Xmax
551 #define min frame->Xmin
552 #define number frame->Xnumber
553 #define offset frame->Xoffset
554 #define op frame->Xop
555 #define save_capture_last frame->Xsave_capture_last
556 #define save_offset1 frame->Xsave_offset1
557 #define save_offset2 frame->Xsave_offset2
558 #define save_offset3 frame->Xsave_offset3
559 #define stacksave frame->Xstacksave
560
561 #define newptrb frame->Xnewptrb
562
563 /* When recursion is being used, local variables are allocated on the stack and
564 get preserved during recursion in the normal way. In this environment, fi and
565 i, and fc and c, can be the same variables. */
566
567 #else /* NO_RECURSE not defined */
568 #define fi i
569 #define fc c
570
571 /* Many of the following variables are used only in small blocks of the code.
572 My normal style of coding would have declared them within each of those blocks.
573 However, in order to accommodate the version of this code that uses an external
574 "stack" implemented on the heap, it is easier to declare them all here, so the
575 declarations can be cut out in a block. The only declarations within blocks
576 below are for variables that do not have to be preserved over a recursive call
577 to RMATCH(). */
578
579 #ifdef SUPPORT_UTF
580 const pcre_uchar *charptr;
581 #endif
582 const pcre_uchar *callpat;
583 const pcre_uchar *data;
584 const pcre_uchar *next;
585 PCRE_PUCHAR pp;
586 const pcre_uchar *prev;
587 PCRE_PUCHAR saved_eptr;
588
589 recursion_info new_recursive;
590
591 BOOL cur_is_word;
592 BOOL condition;
593 BOOL prev_is_word;
594
595 #ifdef SUPPORT_UCP
596 int prop_type;
597 int prop_value;
598 int prop_fail_result;
599 int oclength;
600 pcre_uchar occhars[6];
601 #endif
602
603 int codelink;
604 int ctype;
605 int length;
606 int max;
607 int min;
608 int number;
609 int offset;
610 int op;
611 int save_capture_last;
612 int save_offset1, save_offset2, save_offset3;
613 int stacksave[REC_STACK_SAVE_MAX];
614
615 eptrblock newptrb;
616 #endif /* NO_RECURSE */
617
618 /* To save space on the stack and in the heap frame, I have doubled up on some
619 of the local variables that are used only in localised parts of the code, but
620 still need to be preserved over recursive calls of match(). These macros define
621 the alternative names that are used. */
622
623 #define allow_zero cur_is_word
624 #define cbegroup condition
625 #define code_offset codelink
626 #define condassert condition
627 #define matched_once prev_is_word
628 #define foc number
629
630 /* These statements are here to stop the compiler complaining about
631 uninitialized variables. */
632
633 #ifdef SUPPORT_UCP
634 prop_value = 0;
635 prop_fail_result = 0;
636 #endif
637
638
639 /* This label is used for tail recursion, which is used in a few cases even
640 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
641 used. Thanks to Ian Taylor for noticing this possibility and sending the
642 original patch. */
643
644 TAIL_RECURSE:
645
646 /* OK, now we can get on with the real code of the function. Recursive calls
647 are specified by the macro RMATCH and RRETURN is used to return. When
648 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
649 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
650 defined). However, RMATCH isn't like a function call because it's quite a
651 complicated macro. It has to be used in one particular way. This shouldn't,
652 however, impact performance when true recursion is being used. */
653
654 #ifdef SUPPORT_UTF
655 utf = md->utf; /* Local copy of the flag */
656 #else
657 utf = FALSE;
658 #endif
659
660 /* First check that we haven't called match() too many times, or that we
661 haven't exceeded the recursive call limit. */
662
663 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
664 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
665
666 /* At the start of a group with an unlimited repeat that may match an empty
667 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
668 done this way to save having to use another function argument, which would take
669 up space on the stack. See also MATCH_CONDASSERT below.
670
671 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
672 such remembered pointers, to be checked when we hit the closing ket, in order
673 to break infinite loops that match no characters. When match() is called in
674 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
675 NOT be used with tail recursion, because the memory block that is used is on
676 the stack, so a new one may be required for each match(). */
677
678 if (md->match_function_type == MATCH_CBEGROUP)
679 {
680 newptrb.epb_saved_eptr = eptr;
681 newptrb.epb_prev = eptrb;
682 eptrb = &newptrb;
683 md->match_function_type = 0;
684 }
685
686 /* Now start processing the opcodes. */
687
688 for (;;)
689 {
690 minimize = possessive = FALSE;
691 op = *ecode;
692
693 switch(op)
694 {
695 case OP_MARK:
696 md->nomatch_mark = ecode + 2;
697 md->mark = NULL; /* In case previously set by assertion */
698 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
699 eptrb, RM55);
700 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
701 md->mark == NULL) md->mark = ecode + 2;
702
703 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
704 argument, and we must check whether that argument matches this MARK's
705 argument. It is passed back in md->start_match_ptr (an overloading of that
706 variable). If it does match, we reset that variable to the current subject
707 position and return MATCH_SKIP. Otherwise, pass back the return code
708 unaltered. */
709
710 else if (rrc == MATCH_SKIP_ARG &&
711 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
712 {
713 md->start_match_ptr = eptr;
714 RRETURN(MATCH_SKIP);
715 }
716 RRETURN(rrc);
717
718 case OP_FAIL:
719 RRETURN(MATCH_NOMATCH);
720
721 /* COMMIT overrides PRUNE, SKIP, and THEN */
722
723 case OP_COMMIT:
724 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
725 eptrb, RM52);
726 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
727 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
728 rrc != MATCH_THEN)
729 RRETURN(rrc);
730 RRETURN(MATCH_COMMIT);
731
732 /* PRUNE overrides THEN */
733
734 case OP_PRUNE:
735 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
736 eptrb, RM51);
737 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
738 RRETURN(MATCH_PRUNE);
739
740 case OP_PRUNE_ARG:
741 md->nomatch_mark = ecode + 2;
742 md->mark = NULL; /* In case previously set by assertion */
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 eptrb, RM56);
745 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746 md->mark == NULL) md->mark = ecode + 2;
747 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
748 RRETURN(MATCH_PRUNE);
749
750 /* SKIP overrides PRUNE and THEN */
751
752 case OP_SKIP:
753 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 eptrb, RM53);
755 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
756 RRETURN(rrc);
757 md->start_match_ptr = eptr; /* Pass back current position */
758 RRETURN(MATCH_SKIP);
759
760 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
761 nomatch_mark. There is a flag that disables this opcode when re-matching a
762 pattern that ended with a SKIP for which there was not a matching MARK. */
763
764 case OP_SKIP_ARG:
765 if (md->ignore_skip_arg)
766 {
767 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
768 break;
769 }
770 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
771 eptrb, RM57);
772 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
773 RRETURN(rrc);
774
775 /* Pass back the current skip name by overloading md->start_match_ptr and
776 returning the special MATCH_SKIP_ARG return code. This will either be
777 caught by a matching MARK, or get to the top, where it causes a rematch
778 with the md->ignore_skip_arg flag set. */
779
780 md->start_match_ptr = ecode + 2;
781 RRETURN(MATCH_SKIP_ARG);
782
783 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
784 the branch in which it occurs can be determined. Overload the start of
785 match pointer to do this. */
786
787 case OP_THEN:
788 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
789 eptrb, RM54);
790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
791 md->start_match_ptr = ecode;
792 RRETURN(MATCH_THEN);
793
794 case OP_THEN_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
798 md, eptrb, RM58);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode;
803 RRETURN(MATCH_THEN);
804
805 /* Handle an atomic group that does not contain any capturing parentheses.
806 This can be handled like an assertion. Prior to 8.13, all atomic groups
807 were handled this way. In 8.13, the code was changed as below for ONCE, so
808 that backups pass through the group and thereby reset captured values.
809 However, this uses a lot more stack, so in 8.20, atomic groups that do not
810 contain any captures generate OP_ONCE_NC, which can be handled in the old,
811 less stack intensive way.
812
813 Check the alternative branches in turn - the matching won't pass the KET
814 for this kind of subpattern. If any one branch matches, we carry on as at
815 the end of a normal bracket, leaving the subject pointer, but resetting
816 the start-of-match value in case it was changed by \K. */
817
818 case OP_ONCE_NC:
819 prev = ecode;
820 saved_eptr = eptr;
821 do
822 {
823 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
824 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
825 {
826 mstart = md->start_match_ptr;
827 break;
828 }
829 if (rrc == MATCH_THEN)
830 {
831 next = ecode + GET(ecode,1);
832 if (md->start_match_ptr < next &&
833 (*ecode == OP_ALT || *next == OP_ALT))
834 rrc = MATCH_NOMATCH;
835 }
836
837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
838 ecode += GET(ecode,1);
839 }
840 while (*ecode == OP_ALT);
841
842 /* If hit the end of the group (which could be repeated), fail */
843
844 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
845
846 /* Continue as from after the group, updating the offsets high water
847 mark, since extracts may have been taken. */
848
849 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
850
851 offset_top = md->end_offset_top;
852 eptr = md->end_match_ptr;
853
854 /* For a non-repeating ket, just continue at this level. This also
855 happens for a repeating ket if no characters were matched in the group.
856 This is the forcible breaking of infinite loops as implemented in Perl
857 5.005. */
858
859 if (*ecode == OP_KET || eptr == saved_eptr)
860 {
861 ecode += 1+LINK_SIZE;
862 break;
863 }
864
865 /* The repeating kets try the rest of the pattern or restart from the
866 preceding bracket, in the appropriate order. The second "call" of match()
867 uses tail recursion, to avoid using another stack frame. */
868
869 if (*ecode == OP_KETRMIN)
870 {
871 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873 ecode = prev;
874 goto TAIL_RECURSE;
875 }
876 else /* OP_KETRMAX */
877 {
878 md->match_function_type = MATCH_CBEGROUP;
879 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
881 ecode += 1 + LINK_SIZE;
882 goto TAIL_RECURSE;
883 }
884 /* Control never gets here */
885
886 /* Handle a capturing bracket, other than those that are possessive with an
887 unlimited repeat. If there is space in the offset vector, save the current
888 subject position in the working slot at the top of the vector. We mustn't
889 change the current values of the data slot, because they may be set from a
890 previous iteration of this group, and be referred to by a reference inside
891 the group. A failure to match might occur after the group has succeeded,
892 if something later on doesn't match. For this reason, we need to restore
893 the working value and also the values of the final offsets, in case they
894 were set by a previous iteration of the same bracket.
895
896 If there isn't enough space in the offset vector, treat this as if it were
897 a non-capturing bracket. Don't worry about setting the flag for the error
898 case here; that is handled in the code for KET. */
899
900 case OP_CBRA:
901 case OP_SCBRA:
902 number = GET2(ecode, 1+LINK_SIZE);
903 offset = number << 1;
904
905 #ifdef PCRE_DEBUG
906 printf("start bracket %d\n", number);
907 printf("subject=");
908 pchars(eptr, 16, TRUE, md);
909 printf("\n");
910 #endif
911
912 if (offset < md->offset_max)
913 {
914 save_offset1 = md->offset_vector[offset];
915 save_offset2 = md->offset_vector[offset+1];
916 save_offset3 = md->offset_vector[md->offset_end - number];
917 save_capture_last = md->capture_last;
918
919 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
920 md->offset_vector[md->offset_end - number] =
921 (int)(eptr - md->start_subject);
922
923 for (;;)
924 {
925 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
926 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
927 eptrb, RM1);
928 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
929
930 /* If we backed up to a THEN, check whether it is within the current
931 branch by comparing the address of the THEN that is passed back with
932 the end of the branch. If it is within the current branch, and the
933 branch is one of two or more alternatives (it either starts or ends
934 with OP_ALT), we have reached the limit of THEN's action, so convert
935 the return code to NOMATCH, which will cause normal backtracking to
936 happen from now on. Otherwise, THEN is passed back to an outer
937 alternative. This implements Perl's treatment of parenthesized groups,
938 where a group not containing | does not affect the current alternative,
939 that is, (X) is NOT the same as (X|(*F)). */
940
941 if (rrc == MATCH_THEN)
942 {
943 next = ecode + GET(ecode,1);
944 if (md->start_match_ptr < next &&
945 (*ecode == OP_ALT || *next == OP_ALT))
946 rrc = MATCH_NOMATCH;
947 }
948
949 /* Anything other than NOMATCH is passed back. */
950
951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
952 md->capture_last = save_capture_last;
953 ecode += GET(ecode, 1);
954 if (*ecode != OP_ALT) break;
955 }
956
957 DPRINTF(("bracket %d failed\n", number));
958 md->offset_vector[offset] = save_offset1;
959 md->offset_vector[offset+1] = save_offset2;
960 md->offset_vector[md->offset_end - number] = save_offset3;
961
962 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
963
964 RRETURN(rrc);
965 }
966
967 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
968 as a non-capturing bracket. */
969
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972
973 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
974
975 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
976 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
977
978 /* Non-capturing or atomic group, except for possessive with unlimited
979 repeat and ONCE group with no captures. Loop for all the alternatives.
980
981 When we get to the final alternative within the brackets, we used to return
982 the result of a recursive call to match() whatever happened so it was
983 possible to reduce stack usage by turning this into a tail recursion,
984 except in the case of a possibly empty group. However, now that there is
985 the possibility of (*THEN) occurring in the final alternative, this
986 optimization is no longer always possible.
987
988 We can optimize if we know there are no (*THEN)s in the pattern; at present
989 this is the best that can be done.
990
991 MATCH_ONCE is returned when the end of an atomic group is successfully
992 reached, but subsequent matching fails. It passes back up the tree (causing
993 captured values to be reset) until the original atomic group level is
994 reached. This is tested by comparing md->once_target with the start of the
995 group. At this point, the return is converted into MATCH_NOMATCH so that
996 previous backup points can be taken. */
997
998 case OP_ONCE:
999 case OP_BRA:
1000 case OP_SBRA:
1001 DPRINTF(("start non-capturing bracket\n"));
1002
1003 for (;;)
1004 {
1005 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1006
1007 /* If this is not a possibly empty group, and there are no (*THEN)s in
1008 the pattern, and this is the final alternative, optimize as described
1009 above. */
1010
1011 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1012 {
1013 ecode += PRIV(OP_lengths)[*ecode];
1014 goto TAIL_RECURSE;
1015 }
1016
1017 /* In all other cases, we have to make another call to match(). */
1018
1019 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1020 RM2);
1021
1022 /* See comment in the code for capturing groups above about handling
1023 THEN. */
1024
1025 if (rrc == MATCH_THEN)
1026 {
1027 next = ecode + GET(ecode,1);
1028 if (md->start_match_ptr < next &&
1029 (*ecode == OP_ALT || *next == OP_ALT))
1030 rrc = MATCH_NOMATCH;
1031 }
1032
1033 if (rrc != MATCH_NOMATCH)
1034 {
1035 if (rrc == MATCH_ONCE)
1036 {
1037 const pcre_uchar *scode = ecode;
1038 if (*scode != OP_ONCE) /* If not at start, find it */
1039 {
1040 while (*scode == OP_ALT) scode += GET(scode, 1);
1041 scode -= GET(scode, 1);
1042 }
1043 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1044 }
1045 RRETURN(rrc);
1046 }
1047 ecode += GET(ecode, 1);
1048 if (*ecode != OP_ALT) break;
1049 }
1050
1051 RRETURN(MATCH_NOMATCH);
1052
1053 /* Handle possessive capturing brackets with an unlimited repeat. We come
1054 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1055 handled similarly to the normal case above. However, the matching is
1056 different. The end of these brackets will always be OP_KETRPOS, which
1057 returns MATCH_KETRPOS without going further in the pattern. By this means
1058 we can handle the group by iteration rather than recursion, thereby
1059 reducing the amount of stack needed. */
1060
1061 case OP_CBRAPOS:
1062 case OP_SCBRAPOS:
1063 allow_zero = FALSE;
1064
1065 POSSESSIVE_CAPTURE:
1066 number = GET2(ecode, 1+LINK_SIZE);
1067 offset = number << 1;
1068
1069 #ifdef PCRE_DEBUG
1070 printf("start possessive bracket %d\n", number);
1071 printf("subject=");
1072 pchars(eptr, 16, TRUE, md);
1073 printf("\n");
1074 #endif
1075
1076 if (offset < md->offset_max)
1077 {
1078 matched_once = FALSE;
1079 code_offset = (int)(ecode - md->start_code);
1080
1081 save_offset1 = md->offset_vector[offset];
1082 save_offset2 = md->offset_vector[offset+1];
1083 save_offset3 = md->offset_vector[md->offset_end - number];
1084 save_capture_last = md->capture_last;
1085
1086 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1087
1088 /* Each time round the loop, save the current subject position for use
1089 when the group matches. For MATCH_MATCH, the group has matched, so we
1090 restart it with a new subject starting position, remembering that we had
1091 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1092 usual. If we haven't matched any alternatives in any iteration, check to
1093 see if a previous iteration matched. If so, the group has matched;
1094 continue from afterwards. Otherwise it has failed; restore the previous
1095 capture values before returning NOMATCH. */
1096
1097 for (;;)
1098 {
1099 md->offset_vector[md->offset_end - number] =
1100 (int)(eptr - md->start_subject);
1101 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1102 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1103 eptrb, RM63);
1104 if (rrc == MATCH_KETRPOS)
1105 {
1106 offset_top = md->end_offset_top;
1107 eptr = md->end_match_ptr;
1108 ecode = md->start_code + code_offset;
1109 save_capture_last = md->capture_last;
1110 matched_once = TRUE;
1111 continue;
1112 }
1113
1114 /* See comment in the code for capturing groups above about handling
1115 THEN. */
1116
1117 if (rrc == MATCH_THEN)
1118 {
1119 next = ecode + GET(ecode,1);
1120 if (md->start_match_ptr < next &&
1121 (*ecode == OP_ALT || *next == OP_ALT))
1122 rrc = MATCH_NOMATCH;
1123 }
1124
1125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1126 md->capture_last = save_capture_last;
1127 ecode += GET(ecode, 1);
1128 if (*ecode != OP_ALT) break;
1129 }
1130
1131 if (!matched_once)
1132 {
1133 md->offset_vector[offset] = save_offset1;
1134 md->offset_vector[offset+1] = save_offset2;
1135 md->offset_vector[md->offset_end - number] = save_offset3;
1136 }
1137
1138 if (allow_zero || matched_once)
1139 {
1140 ecode += 1 + LINK_SIZE;
1141 break;
1142 }
1143
1144 RRETURN(MATCH_NOMATCH);
1145 }
1146
1147 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1148 as a non-capturing bracket. */
1149
1150 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152
1153 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1154
1155 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1156 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1157
1158 /* Non-capturing possessive bracket with unlimited repeat. We come here
1159 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1160 without the capturing complication. It is written out separately for speed
1161 and cleanliness. */
1162
1163 case OP_BRAPOS:
1164 case OP_SBRAPOS:
1165 allow_zero = FALSE;
1166
1167 POSSESSIVE_NON_CAPTURE:
1168 matched_once = FALSE;
1169 code_offset = (int)(ecode - md->start_code);
1170
1171 for (;;)
1172 {
1173 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1174 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1175 eptrb, RM48);
1176 if (rrc == MATCH_KETRPOS)
1177 {
1178 offset_top = md->end_offset_top;
1179 eptr = md->end_match_ptr;
1180 ecode = md->start_code + code_offset;
1181 matched_once = TRUE;
1182 continue;
1183 }
1184
1185 /* See comment in the code for capturing groups above about handling
1186 THEN. */
1187
1188 if (rrc == MATCH_THEN)
1189 {
1190 next = ecode + GET(ecode,1);
1191 if (md->start_match_ptr < next &&
1192 (*ecode == OP_ALT || *next == OP_ALT))
1193 rrc = MATCH_NOMATCH;
1194 }
1195
1196 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197 ecode += GET(ecode, 1);
1198 if (*ecode != OP_ALT) break;
1199 }
1200
1201 if (matched_once || allow_zero)
1202 {
1203 ecode += 1 + LINK_SIZE;
1204 break;
1205 }
1206 RRETURN(MATCH_NOMATCH);
1207
1208 /* Control never reaches here. */
1209
1210 /* Conditional group: compilation checked that there are no more than
1211 two branches. If the condition is false, skipping the first branch takes us
1212 past the end if there is only one branch, but that's OK because that is
1213 exactly what going to the ket would do. */
1214
1215 case OP_COND:
1216 case OP_SCOND:
1217 codelink = GET(ecode, 1);
1218
1219 /* Because of the way auto-callout works during compile, a callout item is
1220 inserted between OP_COND and an assertion condition. */
1221
1222 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1223 {
1224 if (PUBL(callout) != NULL)
1225 {
1226 PUBL(callout_block) cb;
1227 cb.version = 2; /* Version 1 of the callout block */
1228 cb.callout_number = ecode[LINK_SIZE+2];
1229 cb.offset_vector = md->offset_vector;
1230 #ifdef COMPILE_PCRE8
1231 cb.subject = (PCRE_SPTR)md->start_subject;
1232 #else
1233 cb.subject = (PCRE_SPTR16)md->start_subject;
1234 #endif
1235 cb.subject_length = (int)(md->end_subject - md->start_subject);
1236 cb.start_match = (int)(mstart - md->start_subject);
1237 cb.current_position = (int)(eptr - md->start_subject);
1238 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1239 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1240 cb.capture_top = offset_top/2;
1241 cb.capture_last = md->capture_last;
1242 cb.callout_data = md->callout_data;
1243 cb.mark = md->nomatch_mark;
1244 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1245 if (rrc < 0) RRETURN(rrc);
1246 }
1247 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1248 }
1249
1250 condcode = ecode[LINK_SIZE+1];
1251
1252 /* Now see what the actual condition is */
1253
1254 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1255 {
1256 if (md->recursive == NULL) /* Not recursing => FALSE */
1257 {
1258 condition = FALSE;
1259 ecode += GET(ecode, 1);
1260 }
1261 else
1262 {
1263 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1264 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1265
1266 /* If the test is for recursion into a specific subpattern, and it is
1267 false, but the test was set up by name, scan the table to see if the
1268 name refers to any other numbers, and test them. The condition is true
1269 if any one is set. */
1270
1271 if (!condition && condcode == OP_NRREF)
1272 {
1273 pcre_uchar *slotA = md->name_table;
1274 for (i = 0; i < md->name_count; i++)
1275 {
1276 if (GET2(slotA, 0) == recno) break;
1277 slotA += md->name_entry_size;
1278 }
1279
1280 /* Found a name for the number - there can be only one; duplicate
1281 names for different numbers are allowed, but not vice versa. First
1282 scan down for duplicates. */
1283
1284 if (i < md->name_count)
1285 {
1286 pcre_uchar *slotB = slotA;
1287 while (slotB > md->name_table)
1288 {
1289 slotB -= md->name_entry_size;
1290 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1291 {
1292 condition = GET2(slotB, 0) == md->recursive->group_num;
1293 if (condition) break;
1294 }
1295 else break;
1296 }
1297
1298 /* Scan up for duplicates */
1299
1300 if (!condition)
1301 {
1302 slotB = slotA;
1303 for (i++; i < md->name_count; i++)
1304 {
1305 slotB += md->name_entry_size;
1306 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1307 {
1308 condition = GET2(slotB, 0) == md->recursive->group_num;
1309 if (condition) break;
1310 }
1311 else break;
1312 }
1313 }
1314 }
1315 }
1316
1317 /* Choose branch according to the condition */
1318
1319 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1320 }
1321 }
1322
1323 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1324 {
1325 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1326 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1327
1328 /* If the numbered capture is unset, but the reference was by name,
1329 scan the table to see if the name refers to any other numbers, and test
1330 them. The condition is true if any one is set. This is tediously similar
1331 to the code above, but not close enough to try to amalgamate. */
1332
1333 if (!condition && condcode == OP_NCREF)
1334 {
1335 int refno = offset >> 1;
1336 pcre_uchar *slotA = md->name_table;
1337
1338 for (i = 0; i < md->name_count; i++)
1339 {
1340 if (GET2(slotA, 0) == refno) break;
1341 slotA += md->name_entry_size;
1342 }
1343
1344 /* Found a name for the number - there can be only one; duplicate names
1345 for different numbers are allowed, but not vice versa. First scan down
1346 for duplicates. */
1347
1348 if (i < md->name_count)
1349 {
1350 pcre_uchar *slotB = slotA;
1351 while (slotB > md->name_table)
1352 {
1353 slotB -= md->name_entry_size;
1354 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1355 {
1356 offset = GET2(slotB, 0) << 1;
1357 condition = offset < offset_top &&
1358 md->offset_vector[offset] >= 0;
1359 if (condition) break;
1360 }
1361 else break;
1362 }
1363
1364 /* Scan up for duplicates */
1365
1366 if (!condition)
1367 {
1368 slotB = slotA;
1369 for (i++; i < md->name_count; i++)
1370 {
1371 slotB += md->name_entry_size;
1372 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1373 {
1374 offset = GET2(slotB, 0) << 1;
1375 condition = offset < offset_top &&
1376 md->offset_vector[offset] >= 0;
1377 if (condition) break;
1378 }
1379 else break;
1380 }
1381 }
1382 }
1383 }
1384
1385 /* Choose branch according to the condition */
1386
1387 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1388 }
1389
1390 else if (condcode == OP_DEF) /* DEFINE - always false */
1391 {
1392 condition = FALSE;
1393 ecode += GET(ecode, 1);
1394 }
1395
1396 /* The condition is an assertion. Call match() to evaluate it - setting
1397 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1398 an assertion. */
1399
1400 else
1401 {
1402 md->match_function_type = MATCH_CONDASSERT;
1403 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1404 if (rrc == MATCH_MATCH)
1405 {
1406 if (md->end_offset_top > offset_top)
1407 offset_top = md->end_offset_top; /* Captures may have happened */
1408 condition = TRUE;
1409 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1410 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1411 }
1412
1413 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1414 assertion; it is therefore treated as NOMATCH. */
1415
1416 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1417 {
1418 RRETURN(rrc); /* Need braces because of following else */
1419 }
1420 else
1421 {
1422 condition = FALSE;
1423 ecode += codelink;
1424 }
1425 }
1426
1427 /* We are now at the branch that is to be obeyed. As there is only one, can
1428 use tail recursion to avoid using another stack frame, except when there is
1429 unlimited repeat of a possibly empty group. In the latter case, a recursive
1430 call to match() is always required, unless the second alternative doesn't
1431 exist, in which case we can just plough on. Note that, for compatibility
1432 with Perl, the | in a conditional group is NOT treated as creating two
1433 alternatives. If a THEN is encountered in the branch, it propagates out to
1434 the enclosing alternative (unless nested in a deeper set of alternatives,
1435 of course). */
1436
1437 if (condition || *ecode == OP_ALT)
1438 {
1439 if (op != OP_SCOND)
1440 {
1441 ecode += 1 + LINK_SIZE;
1442 goto TAIL_RECURSE;
1443 }
1444
1445 md->match_function_type = MATCH_CBEGROUP;
1446 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1447 RRETURN(rrc);
1448 }
1449
1450 /* Condition false & no alternative; continue after the group. */
1451
1452 else
1453 {
1454 ecode += 1 + LINK_SIZE;
1455 }
1456 break;
1457
1458
1459 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1460 to close any currently open capturing brackets. */
1461
1462 case OP_CLOSE:
1463 number = GET2(ecode, 1);
1464 offset = number << 1;
1465
1466 #ifdef PCRE_DEBUG
1467 printf("end bracket %d at *ACCEPT", number);
1468 printf("\n");
1469 #endif
1470
1471 md->capture_last = number;
1472 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1473 {
1474 md->offset_vector[offset] =
1475 md->offset_vector[md->offset_end - number];
1476 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1477 if (offset_top <= offset) offset_top = offset + 2;
1478 }
1479 ecode += 1 + IMM2_SIZE;
1480 break;
1481
1482
1483 /* End of the pattern, either real or forced. */
1484
1485 case OP_END:
1486 case OP_ACCEPT:
1487 case OP_ASSERT_ACCEPT:
1488
1489 /* If we have matched an empty string, fail if not in an assertion and not
1490 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1491 is set and we have matched at the start of the subject. In both cases,
1492 backtracking will then try other alternatives, if any. */
1493
1494 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1495 md->recursive == NULL &&
1496 (md->notempty ||
1497 (md->notempty_atstart &&
1498 mstart == md->start_subject + md->start_offset)))
1499 RRETURN(MATCH_NOMATCH);
1500
1501 /* Otherwise, we have a match. */
1502
1503 md->end_match_ptr = eptr; /* Record where we ended */
1504 md->end_offset_top = offset_top; /* and how many extracts were taken */
1505 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1506
1507 /* For some reason, the macros don't work properly if an expression is
1508 given as the argument to RRETURN when the heap is in use. */
1509
1510 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1511 RRETURN(rrc);
1512
1513 /* Assertion brackets. Check the alternative branches in turn - the
1514 matching won't pass the KET for an assertion. If any one branch matches,
1515 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1516 start of each branch to move the current point backwards, so the code at
1517 this level is identical to the lookahead case. When the assertion is part
1518 of a condition, we want to return immediately afterwards. The caller of
1519 this incarnation of the match() function will have set MATCH_CONDASSERT in
1520 md->match_function_type, and one of these opcodes will be the first opcode
1521 that is processed. We use a local variable that is preserved over calls to
1522 match() to remember this case. */
1523
1524 case OP_ASSERT:
1525 case OP_ASSERTBACK:
1526 if (md->match_function_type == MATCH_CONDASSERT)
1527 {
1528 condassert = TRUE;
1529 md->match_function_type = 0;
1530 }
1531 else condassert = FALSE;
1532
1533 do
1534 {
1535 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1536 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1537 {
1538 mstart = md->start_match_ptr; /* In case \K reset it */
1539 break;
1540 }
1541
1542 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1543 as NOMATCH. */
1544
1545 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1546 ecode += GET(ecode, 1);
1547 }
1548 while (*ecode == OP_ALT);
1549
1550 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1551
1552 /* If checking an assertion for a condition, return MATCH_MATCH. */
1553
1554 if (condassert) RRETURN(MATCH_MATCH);
1555
1556 /* Continue from after the assertion, updating the offsets high water
1557 mark, since extracts may have been taken during the assertion. */
1558
1559 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1560 ecode += 1 + LINK_SIZE;
1561 offset_top = md->end_offset_top;
1562 continue;
1563
1564 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1565 PRUNE, or COMMIT means we must assume failure without checking subsequent
1566 branches. */
1567
1568 case OP_ASSERT_NOT:
1569 case OP_ASSERTBACK_NOT:
1570 if (md->match_function_type == MATCH_CONDASSERT)
1571 {
1572 condassert = TRUE;
1573 md->match_function_type = 0;
1574 }
1575 else condassert = FALSE;
1576
1577 do
1578 {
1579 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1580 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1581 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1582 {
1583 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1584 break;
1585 }
1586
1587 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1588 as NOMATCH. */
1589
1590 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1591 ecode += GET(ecode,1);
1592 }
1593 while (*ecode == OP_ALT);
1594
1595 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1596
1597 ecode += 1 + LINK_SIZE;
1598 continue;
1599
1600 /* Move the subject pointer back. This occurs only at the start of
1601 each branch of a lookbehind assertion. If we are too close to the start to
1602 move back, this match function fails. When working with UTF-8 we move
1603 back a number of characters, not bytes. */
1604
1605 case OP_REVERSE:
1606 #ifdef SUPPORT_UTF
1607 if (utf)
1608 {
1609 i = GET(ecode, 1);
1610 while (i-- > 0)
1611 {
1612 eptr--;
1613 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1614 BACKCHAR(eptr);
1615 }
1616 }
1617 else
1618 #endif
1619
1620 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1621
1622 {
1623 eptr -= GET(ecode, 1);
1624 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1625 }
1626
1627 /* Save the earliest consulted character, then skip to next op code */
1628
1629 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1630 ecode += 1 + LINK_SIZE;
1631 break;
1632
1633 /* The callout item calls an external function, if one is provided, passing
1634 details of the match so far. This is mainly for debugging, though the
1635 function is able to force a failure. */
1636
1637 case OP_CALLOUT:
1638 if (PUBL(callout) != NULL)
1639 {
1640 PUBL(callout_block) cb;
1641 cb.version = 2; /* Version 1 of the callout block */
1642 cb.callout_number = ecode[1];
1643 cb.offset_vector = md->offset_vector;
1644 #ifdef COMPILE_PCRE8
1645 cb.subject = (PCRE_SPTR)md->start_subject;
1646 #else
1647 cb.subject = (PCRE_SPTR16)md->start_subject;
1648 #endif
1649 cb.subject_length = (int)(md->end_subject - md->start_subject);
1650 cb.start_match = (int)(mstart - md->start_subject);
1651 cb.current_position = (int)(eptr - md->start_subject);
1652 cb.pattern_position = GET(ecode, 2);
1653 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1654 cb.capture_top = offset_top/2;
1655 cb.capture_last = md->capture_last;
1656 cb.callout_data = md->callout_data;
1657 cb.mark = md->nomatch_mark;
1658 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1659 if (rrc < 0) RRETURN(rrc);
1660 }
1661 ecode += 2 + 2*LINK_SIZE;
1662 break;
1663
1664 /* Recursion either matches the current regex, or some subexpression. The
1665 offset data is the offset to the starting bracket from the start of the
1666 whole pattern. (This is so that it works from duplicated subpatterns.)
1667
1668 The state of the capturing groups is preserved over recursion, and
1669 re-instated afterwards. We don't know how many are started and not yet
1670 finished (offset_top records the completed total) so we just have to save
1671 all the potential data. There may be up to 65535 such values, which is too
1672 large to put on the stack, but using malloc for small numbers seems
1673 expensive. As a compromise, the stack is used when there are no more than
1674 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1675
1676 There are also other values that have to be saved. We use a chained
1677 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1678 for the original version of this logic. It has, however, been hacked around
1679 a lot, so he is not to blame for the current way it works. */
1680
1681 case OP_RECURSE:
1682 {
1683 recursion_info *ri;
1684 int recno;
1685
1686 callpat = md->start_code + GET(ecode, 1);
1687 recno = (callpat == md->start_code)? 0 :
1688 GET2(callpat, 1 + LINK_SIZE);
1689
1690 /* Check for repeating a recursion without advancing the subject pointer.
1691 This should catch convoluted mutual recursions. (Some simple cases are
1692 caught at compile time.) */
1693
1694 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1695 if (recno == ri->group_num && eptr == ri->subject_position)
1696 RRETURN(PCRE_ERROR_RECURSELOOP);
1697
1698 /* Add to "recursing stack" */
1699
1700 new_recursive.group_num = recno;
1701 new_recursive.subject_position = eptr;
1702 new_recursive.prevrec = md->recursive;
1703 md->recursive = &new_recursive;
1704
1705 /* Where to continue from afterwards */
1706
1707 ecode += 1 + LINK_SIZE;
1708
1709 /* Now save the offset data */
1710
1711 new_recursive.saved_max = md->offset_end;
1712 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1713 new_recursive.offset_save = stacksave;
1714 else
1715 {
1716 new_recursive.offset_save =
1717 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1718 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1719 }
1720 memcpy(new_recursive.offset_save, md->offset_vector,
1721 new_recursive.saved_max * sizeof(int));
1722
1723 /* OK, now we can do the recursion. After processing each alternative,
1724 restore the offset data. If there were nested recursions, md->recursive
1725 might be changed, so reset it before looping. */
1726
1727 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1728 cbegroup = (*callpat >= OP_SBRA);
1729 do
1730 {
1731 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1732 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1733 md, eptrb, RM6);
1734 memcpy(md->offset_vector, new_recursive.offset_save,
1735 new_recursive.saved_max * sizeof(int));
1736 md->recursive = new_recursive.prevrec;
1737 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1738 {
1739 DPRINTF(("Recursion matched\n"));
1740 if (new_recursive.offset_save != stacksave)
1741 (PUBL(free))(new_recursive.offset_save);
1742
1743 /* Set where we got to in the subject, and reset the start in case
1744 it was changed by \K. This *is* propagated back out of a recursion,
1745 for Perl compatibility. */
1746
1747 eptr = md->end_match_ptr;
1748 mstart = md->start_match_ptr;
1749 goto RECURSION_MATCHED; /* Exit loop; end processing */
1750 }
1751
1752 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1753 as NOMATCH. */
1754
1755 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1756 {
1757 DPRINTF(("Recursion gave error %d\n", rrc));
1758 if (new_recursive.offset_save != stacksave)
1759 (PUBL(free))(new_recursive.offset_save);
1760 RRETURN(rrc);
1761 }
1762
1763 md->recursive = &new_recursive;
1764 callpat += GET(callpat, 1);
1765 }
1766 while (*callpat == OP_ALT);
1767
1768 DPRINTF(("Recursion didn't match\n"));
1769 md->recursive = new_recursive.prevrec;
1770 if (new_recursive.offset_save != stacksave)
1771 (PUBL(free))(new_recursive.offset_save);
1772 RRETURN(MATCH_NOMATCH);
1773 }
1774
1775 RECURSION_MATCHED:
1776 break;
1777
1778 /* An alternation is the end of a branch; scan along to find the end of the
1779 bracketed group and go to there. */
1780
1781 case OP_ALT:
1782 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1783 break;
1784
1785 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1786 indicating that it may occur zero times. It may repeat infinitely, or not
1787 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1788 with fixed upper repeat limits are compiled as a number of copies, with the
1789 optional ones preceded by BRAZERO or BRAMINZERO. */
1790
1791 case OP_BRAZERO:
1792 next = ecode + 1;
1793 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1795 do next += GET(next, 1); while (*next == OP_ALT);
1796 ecode = next + 1 + LINK_SIZE;
1797 break;
1798
1799 case OP_BRAMINZERO:
1800 next = ecode + 1;
1801 do next += GET(next, 1); while (*next == OP_ALT);
1802 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1804 ecode++;
1805 break;
1806
1807 case OP_SKIPZERO:
1808 next = ecode+1;
1809 do next += GET(next,1); while (*next == OP_ALT);
1810 ecode = next + 1 + LINK_SIZE;
1811 break;
1812
1813 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1814 here; just jump to the group, with allow_zero set TRUE. */
1815
1816 case OP_BRAPOSZERO:
1817 op = *(++ecode);
1818 allow_zero = TRUE;
1819 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1820 goto POSSESSIVE_NON_CAPTURE;
1821
1822 /* End of a group, repeated or non-repeating. */
1823
1824 case OP_KET:
1825 case OP_KETRMIN:
1826 case OP_KETRMAX:
1827 case OP_KETRPOS:
1828 prev = ecode - GET(ecode, 1);
1829
1830 /* If this was a group that remembered the subject start, in order to break
1831 infinite repeats of empty string matches, retrieve the subject start from
1832 the chain. Otherwise, set it NULL. */
1833
1834 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1835 {
1836 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1837 eptrb = eptrb->epb_prev; /* Backup to previous group */
1838 }
1839 else saved_eptr = NULL;
1840
1841 /* If we are at the end of an assertion group or a non-capturing atomic
1842 group, stop matching and return MATCH_MATCH, but record the current high
1843 water mark for use by positive assertions. We also need to record the match
1844 start in case it was changed by \K. */
1845
1846 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1847 *prev == OP_ONCE_NC)
1848 {
1849 md->end_match_ptr = eptr; /* For ONCE_NC */
1850 md->end_offset_top = offset_top;
1851 md->start_match_ptr = mstart;
1852 RRETURN(MATCH_MATCH); /* Sets md->mark */
1853 }
1854
1855 /* For capturing groups we have to check the group number back at the start
1856 and if necessary complete handling an extraction by setting the offsets and
1857 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1858 into group 0, so it won't be picked up here. Instead, we catch it when the
1859 OP_END is reached. Other recursion is handled here. We just have to record
1860 the current subject position and start match pointer and give a MATCH
1861 return. */
1862
1863 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1864 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1865 {
1866 number = GET2(prev, 1+LINK_SIZE);
1867 offset = number << 1;
1868
1869 #ifdef PCRE_DEBUG
1870 printf("end bracket %d", number);
1871 printf("\n");
1872 #endif
1873
1874 /* Handle a recursively called group. */
1875
1876 if (md->recursive != NULL && md->recursive->group_num == number)
1877 {
1878 md->end_match_ptr = eptr;
1879 md->start_match_ptr = mstart;
1880 RRETURN(MATCH_MATCH);
1881 }
1882
1883 /* Deal with capturing */
1884
1885 md->capture_last = number;
1886 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1887 {
1888 /* If offset is greater than offset_top, it means that we are
1889 "skipping" a capturing group, and that group's offsets must be marked
1890 unset. In earlier versions of PCRE, all the offsets were unset at the
1891 start of matching, but this doesn't work because atomic groups and
1892 assertions can cause a value to be set that should later be unset.
1893 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1894 part of the atomic group, but this is not on the final matching path,
1895 so must be unset when 2 is set. (If there is no group 2, there is no
1896 problem, because offset_top will then be 2, indicating no capture.) */
1897
1898 if (offset > offset_top)
1899 {
1900 register int *iptr = md->offset_vector + offset_top;
1901 register int *iend = md->offset_vector + offset;
1902 while (iptr < iend) *iptr++ = -1;
1903 }
1904
1905 /* Now make the extraction */
1906
1907 md->offset_vector[offset] =
1908 md->offset_vector[md->offset_end - number];
1909 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1910 if (offset_top <= offset) offset_top = offset + 2;
1911 }
1912 }
1913
1914 /* For an ordinary non-repeating ket, just continue at this level. This
1915 also happens for a repeating ket if no characters were matched in the
1916 group. This is the forcible breaking of infinite loops as implemented in
1917 Perl 5.005. For a non-repeating atomic group that includes captures,
1918 establish a backup point by processing the rest of the pattern at a lower
1919 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1920 original OP_ONCE level, thereby bypassing intermediate backup points, but
1921 resetting any captures that happened along the way. */
1922
1923 if (*ecode == OP_KET || eptr == saved_eptr)
1924 {
1925 if (*prev == OP_ONCE)
1926 {
1927 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1928 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1929 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1930 RRETURN(MATCH_ONCE);
1931 }
1932 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1933 break;
1934 }
1935
1936 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1937 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1938 at a time from the outer level, thus saving stack. */
1939
1940 if (*ecode == OP_KETRPOS)
1941 {
1942 md->end_match_ptr = eptr;
1943 md->end_offset_top = offset_top;
1944 RRETURN(MATCH_KETRPOS);
1945 }
1946
1947 /* The normal repeating kets try the rest of the pattern or restart from
1948 the preceding bracket, in the appropriate order. In the second case, we can
1949 use tail recursion to avoid using another stack frame, unless we have an
1950 atomic group or an unlimited repeat of a group that can match an empty
1951 string. */
1952
1953 if (*ecode == OP_KETRMIN)
1954 {
1955 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1956 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1957 if (*prev == OP_ONCE)
1958 {
1959 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1961 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1962 RRETURN(MATCH_ONCE);
1963 }
1964 if (*prev >= OP_SBRA) /* Could match an empty string */
1965 {
1966 md->match_function_type = MATCH_CBEGROUP;
1967 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1968 RRETURN(rrc);
1969 }
1970 ecode = prev;
1971 goto TAIL_RECURSE;
1972 }
1973 else /* OP_KETRMAX */
1974 {
1975 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1976 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1977 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1979 if (*prev == OP_ONCE)
1980 {
1981 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983 md->once_target = prev;
1984 RRETURN(MATCH_ONCE);
1985 }
1986 ecode += 1 + LINK_SIZE;
1987 goto TAIL_RECURSE;
1988 }
1989 /* Control never gets here */
1990
1991 /* Not multiline mode: start of subject assertion, unless notbol. */
1992
1993 case OP_CIRC:
1994 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1995
1996 /* Start of subject assertion */
1997
1998 case OP_SOD:
1999 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2000 ecode++;
2001 break;
2002
2003 /* Multiline mode: start of subject unless notbol, or after any newline. */
2004
2005 case OP_CIRCM:
2006 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2007 if (eptr != md->start_subject &&
2008 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2009 RRETURN(MATCH_NOMATCH);
2010 ecode++;
2011 break;
2012
2013 /* Start of match assertion */
2014
2015 case OP_SOM:
2016 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2017 ecode++;
2018 break;
2019
2020 /* Reset the start of match point */
2021
2022 case OP_SET_SOM:
2023 mstart = eptr;
2024 ecode++;
2025 break;
2026
2027 /* Multiline mode: assert before any newline, or before end of subject
2028 unless noteol is set. */
2029
2030 case OP_DOLLM:
2031 if (eptr < md->end_subject)
2032 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2033 else
2034 {
2035 if (md->noteol) RRETURN(MATCH_NOMATCH);
2036 SCHECK_PARTIAL();
2037 }
2038 ecode++;
2039 break;
2040
2041 /* Not multiline mode: assert before a terminating newline or before end of
2042 subject unless noteol is set. */
2043
2044 case OP_DOLL:
2045 if (md->noteol) RRETURN(MATCH_NOMATCH);
2046 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2047
2048 /* ... else fall through for endonly */
2049
2050 /* End of subject assertion (\z) */
2051
2052 case OP_EOD:
2053 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2054 SCHECK_PARTIAL();
2055 ecode++;
2056 break;
2057
2058 /* End of subject or ending \n assertion (\Z) */
2059
2060 case OP_EODN:
2061 ASSERT_NL_OR_EOS:
2062 if (eptr < md->end_subject &&
2063 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2064 RRETURN(MATCH_NOMATCH);
2065
2066 /* Either at end of string or \n before end. */
2067
2068 SCHECK_PARTIAL();
2069 ecode++;
2070 break;
2071
2072 /* Word boundary assertions */
2073
2074 case OP_NOT_WORD_BOUNDARY:
2075 case OP_WORD_BOUNDARY:
2076 {
2077
2078 /* Find out if the previous and current characters are "word" characters.
2079 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2080 be "non-word" characters. Remember the earliest consulted character for
2081 partial matching. */
2082
2083 #ifdef SUPPORT_UTF
2084 if (utf)
2085 {
2086 /* Get status of previous character */
2087
2088 if (eptr == md->start_subject) prev_is_word = FALSE; else
2089 {
2090 PCRE_PUCHAR lastptr = eptr - 1;
2091 BACKCHAR(lastptr);
2092 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2093 GETCHAR(c, lastptr);
2094 #ifdef SUPPORT_UCP
2095 if (md->use_ucp)
2096 {
2097 if (c == '_') prev_is_word = TRUE; else
2098 {
2099 int cat = UCD_CATEGORY(c);
2100 prev_is_word = (cat == ucp_L || cat == ucp_N);
2101 }
2102 }
2103 else
2104 #endif
2105 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2106 }
2107
2108 /* Get status of next character */
2109
2110 if (eptr >= md->end_subject)
2111 {
2112 SCHECK_PARTIAL();
2113 cur_is_word = FALSE;
2114 }
2115 else
2116 {
2117 GETCHAR(c, eptr);
2118 #ifdef SUPPORT_UCP
2119 if (md->use_ucp)
2120 {
2121 if (c == '_') cur_is_word = TRUE; else
2122 {
2123 int cat = UCD_CATEGORY(c);
2124 cur_is_word = (cat == ucp_L || cat == ucp_N);
2125 }
2126 }
2127 else
2128 #endif
2129 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2130 }
2131 }
2132 else
2133 #endif
2134
2135 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2136 consistency with the behaviour of \w we do use it in this case. */
2137
2138 {
2139 /* Get status of previous character */
2140
2141 if (eptr == md->start_subject) prev_is_word = FALSE; else
2142 {
2143 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2144 #ifdef SUPPORT_UCP
2145 if (md->use_ucp)
2146 {
2147 c = eptr[-1];
2148 if (c == '_') prev_is_word = TRUE; else
2149 {
2150 int cat = UCD_CATEGORY(c);
2151 prev_is_word = (cat == ucp_L || cat == ucp_N);
2152 }
2153 }
2154 else
2155 #endif
2156 prev_is_word = MAX_255(eptr[-1])
2157 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2158 }
2159
2160 /* Get status of next character */
2161
2162 if (eptr >= md->end_subject)
2163 {
2164 SCHECK_PARTIAL();
2165 cur_is_word = FALSE;
2166 }
2167 else
2168 #ifdef SUPPORT_UCP
2169 if (md->use_ucp)
2170 {
2171 c = *eptr;
2172 if (c == '_') cur_is_word = TRUE; else
2173 {
2174 int cat = UCD_CATEGORY(c);
2175 cur_is_word = (cat == ucp_L || cat == ucp_N);
2176 }
2177 }
2178 else
2179 #endif
2180 cur_is_word = MAX_255(*eptr)
2181 && ((md->ctypes[*eptr] & ctype_word) != 0);
2182 }
2183
2184 /* Now see if the situation is what we want */
2185
2186 if ((*ecode++ == OP_WORD_BOUNDARY)?
2187 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2188 RRETURN(MATCH_NOMATCH);
2189 }
2190 break;
2191
2192 /* Match a single character type; inline for speed */
2193
2194 case OP_ANY:
2195 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2196 /* Fall through */
2197
2198 case OP_ALLANY:
2199 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2200 { /* not be updated before SCHECK_PARTIAL. */
2201 SCHECK_PARTIAL();
2202 RRETURN(MATCH_NOMATCH);
2203 }
2204 eptr++;
2205 #ifdef SUPPORT_UTF
2206 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2207 #endif
2208 ecode++;
2209 break;
2210
2211 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2212 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2213
2214 case OP_ANYBYTE:
2215 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2216 { /* not be updated before SCHECK_PARTIAL. */
2217 SCHECK_PARTIAL();
2218 RRETURN(MATCH_NOMATCH);
2219 }
2220 eptr++;
2221 ecode++;
2222 break;
2223
2224 case OP_NOT_DIGIT:
2225 if (eptr >= md->end_subject)
2226 {
2227 SCHECK_PARTIAL();
2228 RRETURN(MATCH_NOMATCH);
2229 }
2230 GETCHARINCTEST(c, eptr);
2231 if (
2232 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2233 c < 256 &&
2234 #endif
2235 (md->ctypes[c] & ctype_digit) != 0
2236 )
2237 RRETURN(MATCH_NOMATCH);
2238 ecode++;
2239 break;
2240
2241 case OP_DIGIT:
2242 if (eptr >= md->end_subject)
2243 {
2244 SCHECK_PARTIAL();
2245 RRETURN(MATCH_NOMATCH);
2246 }
2247 GETCHARINCTEST(c, eptr);
2248 if (
2249 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2250 c > 255 ||
2251 #endif
2252 (md->ctypes[c] & ctype_digit) == 0
2253 )
2254 RRETURN(MATCH_NOMATCH);
2255 ecode++;
2256 break;
2257
2258 case OP_NOT_WHITESPACE:
2259 if (eptr >= md->end_subject)
2260 {
2261 SCHECK_PARTIAL();
2262 RRETURN(MATCH_NOMATCH);
2263 }
2264 GETCHARINCTEST(c, eptr);
2265 if (
2266 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2267 c < 256 &&
2268 #endif
2269 (md->ctypes[c] & ctype_space) != 0
2270 )
2271 RRETURN(MATCH_NOMATCH);
2272 ecode++;
2273 break;
2274
2275 case OP_WHITESPACE:
2276 if (eptr >= md->end_subject)
2277 {
2278 SCHECK_PARTIAL();
2279 RRETURN(MATCH_NOMATCH);
2280 }
2281 GETCHARINCTEST(c, eptr);
2282 if (
2283 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2284 c > 255 ||
2285 #endif
2286 (md->ctypes[c] & ctype_space) == 0
2287 )
2288 RRETURN(MATCH_NOMATCH);
2289 ecode++;
2290 break;
2291
2292 case OP_NOT_WORDCHAR:
2293 if (eptr >= md->end_subject)
2294 {
2295 SCHECK_PARTIAL();
2296 RRETURN(MATCH_NOMATCH);
2297 }
2298 GETCHARINCTEST(c, eptr);
2299 if (
2300 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2301 c < 256 &&
2302 #endif
2303 (md->ctypes[c] & ctype_word) != 0
2304 )
2305 RRETURN(MATCH_NOMATCH);
2306 ecode++;
2307 break;
2308
2309 case OP_WORDCHAR:
2310 if (eptr >= md->end_subject)
2311 {
2312 SCHECK_PARTIAL();
2313 RRETURN(MATCH_NOMATCH);
2314 }
2315 GETCHARINCTEST(c, eptr);
2316 if (
2317 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2318 c > 255 ||
2319 #endif
2320 (md->ctypes[c] & ctype_word) == 0
2321 )
2322 RRETURN(MATCH_NOMATCH);
2323 ecode++;
2324 break;
2325
2326 case OP_ANYNL:
2327 if (eptr >= md->end_subject)
2328 {
2329 SCHECK_PARTIAL();
2330 RRETURN(MATCH_NOMATCH);
2331 }
2332 GETCHARINCTEST(c, eptr);
2333 switch(c)
2334 {
2335 default: RRETURN(MATCH_NOMATCH);
2336
2337 case 0x000d:
2338 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2339 break;
2340
2341 case 0x000a:
2342 break;
2343
2344 case 0x000b:
2345 case 0x000c:
2346 case 0x0085:
2347 case 0x2028:
2348 case 0x2029:
2349 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2350 break;
2351 }
2352 ecode++;
2353 break;
2354
2355 case OP_NOT_HSPACE:
2356 if (eptr >= md->end_subject)
2357 {
2358 SCHECK_PARTIAL();
2359 RRETURN(MATCH_NOMATCH);
2360 }
2361 GETCHARINCTEST(c, eptr);
2362 switch(c)
2363 {
2364 default: break;
2365 case 0x09: /* HT */
2366 case 0x20: /* SPACE */
2367 case 0xa0: /* NBSP */
2368 case 0x1680: /* OGHAM SPACE MARK */
2369 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2370 case 0x2000: /* EN QUAD */
2371 case 0x2001: /* EM QUAD */
2372 case 0x2002: /* EN SPACE */
2373 case 0x2003: /* EM SPACE */
2374 case 0x2004: /* THREE-PER-EM SPACE */
2375 case 0x2005: /* FOUR-PER-EM SPACE */
2376 case 0x2006: /* SIX-PER-EM SPACE */
2377 case 0x2007: /* FIGURE SPACE */
2378 case 0x2008: /* PUNCTUATION SPACE */
2379 case 0x2009: /* THIN SPACE */
2380 case 0x200A: /* HAIR SPACE */
2381 case 0x202f: /* NARROW NO-BREAK SPACE */
2382 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2383 case 0x3000: /* IDEOGRAPHIC SPACE */
2384 RRETURN(MATCH_NOMATCH);
2385 }
2386 ecode++;
2387 break;
2388
2389 case OP_HSPACE:
2390 if (eptr >= md->end_subject)
2391 {
2392 SCHECK_PARTIAL();
2393 RRETURN(MATCH_NOMATCH);
2394 }
2395 GETCHARINCTEST(c, eptr);
2396 switch(c)
2397 {
2398 default: RRETURN(MATCH_NOMATCH);
2399 case 0x09: /* HT */
2400 case 0x20: /* SPACE */
2401 case 0xa0: /* NBSP */
2402 case 0x1680: /* OGHAM SPACE MARK */
2403 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2404 case 0x2000: /* EN QUAD */
2405 case 0x2001: /* EM QUAD */
2406 case 0x2002: /* EN SPACE */
2407 case 0x2003: /* EM SPACE */
2408 case 0x2004: /* THREE-PER-EM SPACE */
2409 case 0x2005: /* FOUR-PER-EM SPACE */
2410 case 0x2006: /* SIX-PER-EM SPACE */
2411 case 0x2007: /* FIGURE SPACE */
2412 case 0x2008: /* PUNCTUATION SPACE */
2413 case 0x2009: /* THIN SPACE */
2414 case 0x200A: /* HAIR SPACE */
2415 case 0x202f: /* NARROW NO-BREAK SPACE */
2416 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2417 case 0x3000: /* IDEOGRAPHIC SPACE */
2418 break;
2419 }
2420 ecode++;
2421 break;
2422
2423 case OP_NOT_VSPACE:
2424 if (eptr >= md->end_subject)
2425 {
2426 SCHECK_PARTIAL();
2427 RRETURN(MATCH_NOMATCH);
2428 }
2429 GETCHARINCTEST(c, eptr);
2430 switch(c)
2431 {
2432 default: break;
2433 case 0x0a: /* LF */
2434 case 0x0b: /* VT */
2435 case 0x0c: /* FF */
2436 case 0x0d: /* CR */
2437 case 0x85: /* NEL */
2438 case 0x2028: /* LINE SEPARATOR */
2439 case 0x2029: /* PARAGRAPH SEPARATOR */
2440 RRETURN(MATCH_NOMATCH);
2441 }
2442 ecode++;
2443 break;
2444
2445 case OP_VSPACE:
2446 if (eptr >= md->end_subject)
2447 {
2448 SCHECK_PARTIAL();
2449 RRETURN(MATCH_NOMATCH);
2450 }
2451 GETCHARINCTEST(c, eptr);
2452 switch(c)
2453 {
2454 default: RRETURN(MATCH_NOMATCH);
2455 case 0x0a: /* LF */
2456 case 0x0b: /* VT */
2457 case 0x0c: /* FF */
2458 case 0x0d: /* CR */
2459 case 0x85: /* NEL */
2460 case 0x2028: /* LINE SEPARATOR */
2461 case 0x2029: /* PARAGRAPH SEPARATOR */
2462 break;
2463 }
2464 ecode++;
2465 break;
2466
2467 #ifdef SUPPORT_UCP
2468 /* Check the next character by Unicode property. We will get here only
2469 if the support is in the binary; otherwise a compile-time error occurs. */
2470
2471 case OP_PROP:
2472 case OP_NOTPROP:
2473 if (eptr >= md->end_subject)
2474 {
2475 SCHECK_PARTIAL();
2476 RRETURN(MATCH_NOMATCH);
2477 }
2478 GETCHARINCTEST(c, eptr);
2479 {
2480 const ucd_record *prop = GET_UCD(c);
2481
2482 switch(ecode[1])
2483 {
2484 case PT_ANY:
2485 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2486 break;
2487
2488 case PT_LAMP:
2489 if ((prop->chartype == ucp_Lu ||
2490 prop->chartype == ucp_Ll ||
2491 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2492 RRETURN(MATCH_NOMATCH);
2493 break;
2494
2495 case PT_GC:
2496 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2497 RRETURN(MATCH_NOMATCH);
2498 break;
2499
2500 case PT_PC:
2501 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2502 RRETURN(MATCH_NOMATCH);
2503 break;
2504
2505 case PT_SC:
2506 if ((ecode[2] != prop->script) == (op == OP_PROP))
2507 RRETURN(MATCH_NOMATCH);
2508 break;
2509
2510 /* These are specials */
2511
2512 case PT_ALNUM:
2513 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2514 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2515 RRETURN(MATCH_NOMATCH);
2516 break;
2517
2518 case PT_SPACE: /* Perl space */
2519 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2520 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2521 == (op == OP_NOTPROP))
2522 RRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 case PT_PXSPACE: /* POSIX space */
2526 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2527 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2528 c == CHAR_FF || c == CHAR_CR)
2529 == (op == OP_NOTPROP))
2530 RRETURN(MATCH_NOMATCH);
2531 break;
2532
2533 case PT_WORD:
2534 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2535 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2536 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2537 RRETURN(MATCH_NOMATCH);
2538 break;
2539
2540 /* This should never occur */
2541
2542 default:
2543 RRETURN(PCRE_ERROR_INTERNAL);
2544 }
2545
2546 ecode += 3;
2547 }
2548 break;
2549
2550 /* Match an extended Unicode sequence. We will get here only if the support
2551 is in the binary; otherwise a compile-time error occurs. */
2552
2553 case OP_EXTUNI:
2554 if (eptr >= md->end_subject)
2555 {
2556 SCHECK_PARTIAL();
2557 RRETURN(MATCH_NOMATCH);
2558 }
2559 GETCHARINCTEST(c, eptr);
2560 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2561 while (eptr < md->end_subject)
2562 {
2563 int len = 1;
2564 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2565 if (UCD_CATEGORY(c) != ucp_M) break;
2566 eptr += len;
2567 }
2568 ecode++;
2569 break;
2570 #endif
2571
2572
2573 /* Match a back reference, possibly repeatedly. Look past the end of the
2574 item to see if there is repeat information following. The code is similar
2575 to that for character classes, but repeated for efficiency. Then obey
2576 similar code to character type repeats - written out again for speed.
2577 However, if the referenced string is the empty string, always treat
2578 it as matched, any number of times (otherwise there could be infinite
2579 loops). */
2580
2581 case OP_REF:
2582 case OP_REFI:
2583 caseless = op == OP_REFI;
2584 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2585 ecode += 1 + IMM2_SIZE;
2586
2587 /* If the reference is unset, there are two possibilities:
2588
2589 (a) In the default, Perl-compatible state, set the length negative;
2590 this ensures that every attempt at a match fails. We can't just fail
2591 here, because of the possibility of quantifiers with zero minima.
2592
2593 (b) If the JavaScript compatibility flag is set, set the length to zero
2594 so that the back reference matches an empty string.
2595
2596 Otherwise, set the length to the length of what was matched by the
2597 referenced subpattern. */
2598
2599 if (offset >= offset_top || md->offset_vector[offset] < 0)
2600 length = (md->jscript_compat)? 0 : -1;
2601 else
2602 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2603
2604 /* Set up for repetition, or handle the non-repeated case */
2605
2606 switch (*ecode)
2607 {
2608 case OP_CRSTAR:
2609 case OP_CRMINSTAR:
2610 case OP_CRPLUS:
2611 case OP_CRMINPLUS:
2612 case OP_CRQUERY:
2613 case OP_CRMINQUERY:
2614 c = *ecode++ - OP_CRSTAR;
2615 minimize = (c & 1) != 0;
2616 min = rep_min[c]; /* Pick up values from tables; */
2617 max = rep_max[c]; /* zero for max => infinity */
2618 if (max == 0) max = INT_MAX;
2619 break;
2620
2621 case OP_CRRANGE:
2622 case OP_CRMINRANGE:
2623 minimize = (*ecode == OP_CRMINRANGE);
2624 min = GET2(ecode, 1);
2625 max = GET2(ecode, 1 + IMM2_SIZE);
2626 if (max == 0) max = INT_MAX;
2627 ecode += 1 + 2 * IMM2_SIZE;
2628 break;
2629
2630 default: /* No repeat follows */
2631 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2632 {
2633 CHECK_PARTIAL();
2634 RRETURN(MATCH_NOMATCH);
2635 }
2636 eptr += length;
2637 continue; /* With the main loop */
2638 }
2639
2640 /* Handle repeated back references. If the length of the reference is
2641 zero, just continue with the main loop. If the length is negative, it
2642 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2643 zero, we can continue at the same level without recursion. For any other
2644 minimum, carrying on will result in NOMATCH. */
2645
2646 if (length == 0) continue;
2647 if (length < 0 && min == 0) continue;
2648
2649 /* First, ensure the minimum number of matches are present. We get back
2650 the length of the reference string explicitly rather than passing the
2651 address of eptr, so that eptr can be a register variable. */
2652
2653 for (i = 1; i <= min; i++)
2654 {
2655 int slength;
2656 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2657 {
2658 CHECK_PARTIAL();
2659 RRETURN(MATCH_NOMATCH);
2660 }
2661 eptr += slength;
2662 }
2663
2664 /* If min = max, continue at the same level without recursion.
2665 They are not both allowed to be zero. */
2666
2667 if (min == max) continue;
2668
2669 /* If minimizing, keep trying and advancing the pointer */
2670
2671 if (minimize)
2672 {
2673 for (fi = min;; fi++)
2674 {
2675 int slength;
2676 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2678 if (fi >= max) RRETURN(MATCH_NOMATCH);
2679 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2680 {
2681 CHECK_PARTIAL();
2682 RRETURN(MATCH_NOMATCH);
2683 }
2684 eptr += slength;
2685 }
2686 /* Control never gets here */
2687 }
2688
2689 /* If maximizing, find the longest string and work backwards */
2690
2691 else
2692 {
2693 pp = eptr;
2694 for (i = min; i < max; i++)
2695 {
2696 int slength;
2697 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2698 {
2699 CHECK_PARTIAL();
2700 break;
2701 }
2702 eptr += slength;
2703 }
2704 while (eptr >= pp)
2705 {
2706 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2708 eptr -= length;
2709 }
2710 RRETURN(MATCH_NOMATCH);
2711 }
2712 /* Control never gets here */
2713
2714 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2715 used when all the characters in the class have values in the range 0-255,
2716 and either the matching is caseful, or the characters are in the range
2717 0-127 when UTF-8 processing is enabled. The only difference between
2718 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2719 encountered.
2720
2721 First, look past the end of the item to see if there is repeat information
2722 following. Then obey similar code to character type repeats - written out
2723 again for speed. */
2724
2725 case OP_NCLASS:
2726 case OP_CLASS:
2727 {
2728 /* The data variable is saved across frames, so the byte map needs to
2729 be stored there. */
2730 #define BYTE_MAP ((pcre_uint8 *)data)
2731 data = ecode + 1; /* Save for matching */
2732 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2733
2734 switch (*ecode)
2735 {
2736 case OP_CRSTAR:
2737 case OP_CRMINSTAR:
2738 case OP_CRPLUS:
2739 case OP_CRMINPLUS:
2740 case OP_CRQUERY:
2741 case OP_CRMINQUERY:
2742 c = *ecode++ - OP_CRSTAR;
2743 minimize = (c & 1) != 0;
2744 min = rep_min[c]; /* Pick up values from tables; */
2745 max = rep_max[c]; /* zero for max => infinity */
2746 if (max == 0) max = INT_MAX;
2747 break;
2748
2749 case OP_CRRANGE:
2750 case OP_CRMINRANGE:
2751 minimize = (*ecode == OP_CRMINRANGE);
2752 min = GET2(ecode, 1);
2753 max = GET2(ecode, 1 + IMM2_SIZE);
2754 if (max == 0) max = INT_MAX;
2755 ecode += 1 + 2 * IMM2_SIZE;
2756 break;
2757
2758 default: /* No repeat follows */
2759 min = max = 1;
2760 break;
2761 }
2762
2763 /* First, ensure the minimum number of matches are present. */
2764
2765 #ifdef SUPPORT_UTF
2766 if (utf)
2767 {
2768 for (i = 1; i <= min; i++)
2769 {
2770 if (eptr >= md->end_subject)
2771 {
2772 SCHECK_PARTIAL();
2773 RRETURN(MATCH_NOMATCH);
2774 }
2775 GETCHARINC(c, eptr);
2776 if (c > 255)
2777 {
2778 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2779 }
2780 else
2781 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2782 }
2783 }
2784 else
2785 #endif
2786 /* Not UTF mode */
2787 {
2788 for (i = 1; i <= min; i++)
2789 {
2790 if (eptr >= md->end_subject)
2791 {
2792 SCHECK_PARTIAL();
2793 RRETURN(MATCH_NOMATCH);
2794 }
2795 c = *eptr++;
2796 #ifndef COMPILE_PCRE8
2797 if (c > 255)
2798 {
2799 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2800 }
2801 else
2802 #endif
2803 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2804 }
2805 }
2806
2807 /* If max == min we can continue with the main loop without the
2808 need to recurse. */
2809
2810 if (min == max) continue;
2811
2812 /* If minimizing, keep testing the rest of the expression and advancing
2813 the pointer while it matches the class. */
2814
2815 if (minimize)
2816 {
2817 #ifdef SUPPORT_UTF
2818 if (utf)
2819 {
2820 for (fi = min;; fi++)
2821 {
2822 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2823 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2824 if (fi >= max) RRETURN(MATCH_NOMATCH);
2825 if (eptr >= md->end_subject)
2826 {
2827 SCHECK_PARTIAL();
2828 RRETURN(MATCH_NOMATCH);
2829 }
2830 GETCHARINC(c, eptr);
2831 if (c > 255)
2832 {
2833 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2834 }
2835 else
2836 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2837 }
2838 }
2839 else
2840 #endif
2841 /* Not UTF mode */
2842 {
2843 for (fi = min;; fi++)
2844 {
2845 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2847 if (fi >= max) RRETURN(MATCH_NOMATCH);
2848 if (eptr >= md->end_subject)
2849 {
2850 SCHECK_PARTIAL();
2851 RRETURN(MATCH_NOMATCH);
2852 }
2853 c = *eptr++;
2854 #ifndef COMPILE_PCRE8
2855 if (c > 255)
2856 {
2857 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2858 }
2859 else
2860 #endif
2861 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2862 }
2863 }
2864 /* Control never gets here */
2865 }
2866
2867 /* If maximizing, find the longest possible run, then work backwards. */
2868
2869 else
2870 {
2871 pp = eptr;
2872
2873 #ifdef SUPPORT_UTF
2874 if (utf)
2875 {
2876 for (i = min; i < max; i++)
2877 {
2878 int len = 1;
2879 if (eptr >= md->end_subject)
2880 {
2881 SCHECK_PARTIAL();
2882 break;
2883 }
2884 GETCHARLEN(c, eptr, len);
2885 if (c > 255)
2886 {
2887 if (op == OP_CLASS) break;
2888 }
2889 else
2890 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2891 eptr += len;
2892 }
2893 for (;;)
2894 {
2895 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2896 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2897 if (eptr-- == pp) break; /* Stop if tried at original pos */
2898 BACKCHAR(eptr);
2899 }
2900 }
2901 else
2902 #endif
2903 /* Not UTF mode */
2904 {
2905 for (i = min; i < max; i++)
2906 {
2907 if (eptr >= md->end_subject)
2908 {
2909 SCHECK_PARTIAL();
2910 break;
2911 }
2912 c = *eptr;
2913 #ifndef COMPILE_PCRE8
2914 if (c > 255)
2915 {
2916 if (op == OP_CLASS) break;
2917 }
2918 else
2919 #endif
2920 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2921 eptr++;
2922 }
2923 while (eptr >= pp)
2924 {
2925 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2927 eptr--;
2928 }
2929 }
2930
2931 RRETURN(MATCH_NOMATCH);
2932 }
2933 #undef BYTE_MAP
2934 }
2935 /* Control never gets here */
2936
2937
2938 /* Match an extended character class. This opcode is encountered only
2939 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2940 mode, because Unicode properties are supported in non-UTF-8 mode. */
2941
2942 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2943 case OP_XCLASS:
2944 {
2945 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2946 ecode += GET(ecode, 1); /* Advance past the item */
2947
2948 switch (*ecode)
2949 {
2950 case OP_CRSTAR:
2951 case OP_CRMINSTAR:
2952 case OP_CRPLUS:
2953 case OP_CRMINPLUS:
2954 case OP_CRQUERY:
2955 case OP_CRMINQUERY:
2956 c = *ecode++ - OP_CRSTAR;
2957 minimize = (c & 1) != 0;
2958 min = rep_min[c]; /* Pick up values from tables; */
2959 max = rep_max[c]; /* zero for max => infinity */
2960 if (max == 0) max = INT_MAX;
2961 break;
2962
2963 case OP_CRRANGE:
2964 case OP_CRMINRANGE:
2965 minimize = (*ecode == OP_CRMINRANGE);
2966 min = GET2(ecode, 1);
2967 max = GET2(ecode, 1 + IMM2_SIZE);
2968 if (max == 0) max = INT_MAX;
2969 ecode += 1 + 2 * IMM2_SIZE;
2970 break;
2971
2972 default: /* No repeat follows */
2973 min = max = 1;
2974 break;
2975 }
2976
2977 /* First, ensure the minimum number of matches are present. */
2978
2979 for (i = 1; i <= min; i++)
2980 {
2981 if (eptr >= md->end_subject)
2982 {
2983 SCHECK_PARTIAL();
2984 RRETURN(MATCH_NOMATCH);
2985 }
2986 GETCHARINCTEST(c, eptr);
2987 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2988 }
2989
2990 /* If max == min we can continue with the main loop without the
2991 need to recurse. */
2992
2993 if (min == max) continue;
2994
2995 /* If minimizing, keep testing the rest of the expression and advancing
2996 the pointer while it matches the class. */
2997
2998 if (minimize)
2999 {
3000 for (fi = min;; fi++)
3001 {
3002 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3004 if (fi >= max) RRETURN(MATCH_NOMATCH);
3005 if (eptr >= md->end_subject)
3006 {
3007 SCHECK_PARTIAL();
3008 RRETURN(MATCH_NOMATCH);
3009 }
3010 GETCHARINCTEST(c, eptr);
3011 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3012 }
3013 /* Control never gets here */
3014 }
3015
3016 /* If maximizing, find the longest possible run, then work backwards. */
3017
3018 else
3019 {
3020 pp = eptr;
3021 for (i = min; i < max; i++)
3022 {
3023 int len = 1;
3024 if (eptr >= md->end_subject)
3025 {
3026 SCHECK_PARTIAL();
3027 break;
3028 }
3029 #ifdef SUPPORT_UTF
3030 GETCHARLENTEST(c, eptr, len);
3031 #else
3032 c = *eptr;
3033 #endif
3034 if (!PRIV(xclass)(c, data, utf)) break;
3035 eptr += len;
3036 }
3037 for(;;)
3038 {
3039 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3041 if (eptr-- == pp) break; /* Stop if tried at original pos */
3042 #ifdef SUPPORT_UTF
3043 if (utf) BACKCHAR(eptr);
3044 #endif
3045 }
3046 RRETURN(MATCH_NOMATCH);
3047 }
3048
3049 /* Control never gets here */
3050 }
3051 #endif /* End of XCLASS */
3052
3053 /* Match a single character, casefully */
3054
3055 case OP_CHAR:
3056 #ifdef SUPPORT_UTF
3057 if (utf)
3058 {
3059 length = 1;
3060 ecode++;
3061 GETCHARLEN(fc, ecode, length);
3062 if (length > md->end_subject - eptr)
3063 {
3064 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3065 RRETURN(MATCH_NOMATCH);
3066 }
3067 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3068 }
3069 else
3070 #endif
3071 /* Not UTF mode */
3072 {
3073 if (md->end_subject - eptr < 1)
3074 {
3075 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3076 RRETURN(MATCH_NOMATCH);
3077 }
3078 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3079 ecode += 2;
3080 }
3081 break;
3082
3083 /* Match a single character, caselessly. If we are at the end of the
3084 subject, give up immediately. */
3085
3086 case OP_CHARI:
3087 if (eptr >= md->end_subject)
3088 {
3089 SCHECK_PARTIAL();
3090 RRETURN(MATCH_NOMATCH);
3091 }
3092
3093 #ifdef SUPPORT_UTF
3094 if (utf)
3095 {
3096 length = 1;
3097 ecode++;
3098 GETCHARLEN(fc, ecode, length);
3099
3100 /* If the pattern character's value is < 128, we have only one byte, and
3101 we know that its other case must also be one byte long, so we can use the
3102 fast lookup table. We know that there is at least one byte left in the
3103 subject. */
3104
3105 if (fc < 128)
3106 {
3107 if (md->lcc[fc]
3108 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3109 ecode++;
3110 eptr++;
3111 }
3112
3113 /* Otherwise we must pick up the subject character. Note that we cannot
3114 use the value of "length" to check for sufficient bytes left, because the
3115 other case of the character may have more or fewer bytes. */
3116
3117 else
3118 {
3119 unsigned int dc;
3120 GETCHARINC(dc, eptr);
3121 ecode += length;
3122
3123 /* If we have Unicode property support, we can use it to test the other
3124 case of the character, if there is one. */
3125
3126 if (fc != dc)
3127 {
3128 #ifdef SUPPORT_UCP
3129 if (dc != UCD_OTHERCASE(fc))
3130 #endif
3131 RRETURN(MATCH_NOMATCH);
3132 }
3133 }
3134 }
3135 else
3136 #endif /* SUPPORT_UTF */
3137
3138 /* Not UTF mode */
3139 {
3140 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3141 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3142 eptr++;
3143 ecode += 2;
3144 }
3145 break;
3146
3147 /* Match a single character repeatedly. */
3148
3149 case OP_EXACT:
3150 case OP_EXACTI:
3151 min = max = GET2(ecode, 1);
3152 ecode += 1 + IMM2_SIZE;
3153 goto REPEATCHAR;
3154
3155 case OP_POSUPTO:
3156 case OP_POSUPTOI:
3157 possessive = TRUE;
3158 /* Fall through */
3159
3160 case OP_UPTO:
3161 case OP_UPTOI:
3162 case OP_MINUPTO:
3163 case OP_MINUPTOI:
3164 min = 0;
3165 max = GET2(ecode, 1);
3166 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3167 ecode += 1 + IMM2_SIZE;
3168 goto REPEATCHAR;
3169
3170 case OP_POSSTAR:
3171 case OP_POSSTARI:
3172 possessive = TRUE;
3173 min = 0;
3174 max = INT_MAX;
3175 ecode++;
3176 goto REPEATCHAR;
3177
3178 case OP_POSPLUS:
3179 case OP_POSPLUSI:
3180 possessive = TRUE;
3181 min = 1;
3182 max = INT_MAX;
3183 ecode++;
3184 goto REPEATCHAR;
3185
3186 case OP_POSQUERY:
3187 case OP_POSQUERYI:
3188 possessive = TRUE;
3189 min = 0;
3190 max = 1;
3191 ecode++;
3192 goto REPEATCHAR;
3193
3194 case OP_STAR:
3195 case OP_STARI:
3196 case OP_MINSTAR:
3197 case OP_MINSTARI:
3198 case OP_PLUS:
3199 case OP_PLUSI:
3200 case OP_MINPLUS:
3201 case OP_MINPLUSI:
3202 case OP_QUERY:
3203 case OP_QUERYI:
3204 case OP_MINQUERY:
3205 case OP_MINQUERYI:
3206 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3207 minimize = (c & 1) != 0;
3208 min = rep_min[c]; /* Pick up values from tables; */
3209 max = rep_max[c]; /* zero for max => infinity */
3210 if (max == 0) max = INT_MAX;
3211
3212 /* Common code for all repeated single-character matches. */
3213
3214 REPEATCHAR:
3215 #ifdef SUPPORT_UTF
3216 if (utf)
3217 {
3218 length = 1;
3219 charptr = ecode;
3220 GETCHARLEN(fc, ecode, length);
3221 ecode += length;
3222
3223 /* Handle multibyte character matching specially here. There is
3224 support for caseless matching if UCP support is present. */
3225
3226 if (length > 1)
3227 {
3228 #ifdef SUPPORT_UCP
3229 unsigned int othercase;
3230 if (op >= OP_STARI && /* Caseless */
3231 (othercase = UCD_OTHERCASE(fc)) != fc)
3232 oclength = PRIV(ord2utf)(othercase, occhars);
3233 else oclength = 0;
3234 #endif /* SUPPORT_UCP */
3235
3236 for (i = 1; i <= min; i++)
3237 {
3238 if (eptr <= md->end_subject - length &&
3239 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3240 #ifdef SUPPORT_UCP
3241 else if (oclength > 0 &&
3242 eptr <= md->end_subject - oclength &&
3243 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3244 #endif /* SUPPORT_UCP */
3245 else
3246 {
3247 CHECK_PARTIAL();
3248 RRETURN(MATCH_NOMATCH);
3249 }
3250 }
3251
3252 if (min == max) continue;
3253
3254 if (minimize)
3255 {
3256 for (fi = min;; fi++)
3257 {
3258 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3259 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260 if (fi >= max) RRETURN(MATCH_NOMATCH);
3261 if (eptr <= md->end_subject - length &&
3262 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3263 #ifdef SUPPORT_UCP
3264 else if (oclength > 0 &&
3265 eptr <= md->end_subject - oclength &&
3266 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3267 #endif /* SUPPORT_UCP */
3268 else
3269 {
3270 CHECK_PARTIAL();
3271 RRETURN(MATCH_NOMATCH);
3272 }
3273 }
3274 /* Control never gets here */
3275 }
3276
3277 else /* Maximize */
3278 {
3279 pp = eptr;
3280 for (i = min; i < max; i++)
3281 {
3282 if (eptr <= md->end_subject - length &&
3283 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3284 #ifdef SUPPORT_UCP
3285 else if (oclength > 0 &&
3286 eptr <= md->end_subject - oclength &&
3287 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3288 #endif /* SUPPORT_UCP */
3289 else
3290 {
3291 CHECK_PARTIAL();
3292 break;
3293 }
3294 }
3295
3296 if (possessive) continue;
3297
3298 for(;;)
3299 {
3300 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3301 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3302 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3303 #ifdef SUPPORT_UCP
3304 eptr--;
3305 BACKCHAR(eptr);
3306 #else /* without SUPPORT_UCP */
3307 eptr -= length;
3308 #endif /* SUPPORT_UCP */
3309 }
3310 }
3311 /* Control never gets here */
3312 }
3313
3314 /* If the length of a UTF-8 character is 1, we fall through here, and
3315 obey the code as for non-UTF-8 characters below, though in this case the
3316 value of fc will always be < 128. */
3317 }
3318 else
3319 #endif /* SUPPORT_UTF */
3320 /* When not in UTF-8 mode, load a single-byte character. */
3321 fc = *ecode++;
3322
3323 /* The value of fc at this point is always one character, though we may
3324 or may not be in UTF mode. The code is duplicated for the caseless and
3325 caseful cases, for speed, since matching characters is likely to be quite
3326 common. First, ensure the minimum number of matches are present. If min =
3327 max, continue at the same level without recursing. Otherwise, if
3328 minimizing, keep trying the rest of the expression and advancing one
3329 matching character if failing, up to the maximum. Alternatively, if
3330 maximizing, find the maximum number of characters and work backwards. */
3331
3332 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3333 max, eptr));
3334
3335 if (op >= OP_STARI) /* Caseless */
3336 {
3337 #ifdef COMPILE_PCRE8
3338 /* fc must be < 128 if UTF is enabled. */
3339 foc = md->fcc[fc];
3340 #else
3341 #ifdef SUPPORT_UTF
3342 #ifdef SUPPORT_UCP
3343 if (utf && fc > 127)
3344 foc = UCD_OTHERCASE(fc);
3345 #else
3346 if (utf && fc > 127)
3347 foc = fc;
3348 #endif /* SUPPORT_UCP */
3349 else
3350 #endif /* SUPPORT_UTF */
3351 foc = TABLE_GET(fc, md->fcc, fc);
3352 #endif /* COMPILE_PCRE8 */
3353
3354 for (i = 1; i <= min; i++)
3355 {
3356 if (eptr >= md->end_subject)
3357 {
3358 SCHECK_PARTIAL();
3359 RRETURN(MATCH_NOMATCH);
3360 }
3361 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3362 eptr++;
3363 }
3364 if (min == max) continue;
3365 if (minimize)
3366 {
3367 for (fi = min;; fi++)
3368 {
3369 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3370 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3371 if (fi >= max) RRETURN(MATCH_NOMATCH);
3372 if (eptr >= md->end_subject)
3373 {
3374 SCHECK_PARTIAL();
3375 RRETURN(MATCH_NOMATCH);
3376 }
3377 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3378 eptr++;
3379 }
3380 /* Control never gets here */
3381 }
3382 else /* Maximize */
3383 {
3384 pp = eptr;
3385 for (i = min; i < max; i++)
3386 {
3387 if (eptr >= md->end_subject)
3388 {
3389 SCHECK_PARTIAL();
3390 break;
3391 }
3392 if (fc != *eptr && foc != *eptr) break;
3393 eptr++;
3394 }
3395
3396 if (possessive) continue;
3397
3398 while (eptr >= pp)
3399 {
3400 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3401 eptr--;
3402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403 }
3404 RRETURN(MATCH_NOMATCH);
3405 }
3406 /* Control never gets here */
3407 }
3408
3409 /* Caseful comparisons (includes all multi-byte characters) */
3410
3411 else
3412 {
3413 for (i = 1; i <= min; i++)
3414 {
3415 if (eptr >= md->end_subject)
3416 {
3417 SCHECK_PARTIAL();
3418 RRETURN(MATCH_NOMATCH);
3419 }
3420 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3421 }
3422
3423 if (min == max) continue;
3424
3425 if (minimize)
3426 {
3427 for (fi = min;; fi++)
3428 {
3429 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 if (fi >= max) RRETURN(MATCH_NOMATCH);
3432 if (eptr >= md->end_subject)
3433 {
3434 SCHECK_PARTIAL();
3435 RRETURN(MATCH_NOMATCH);
3436 }
3437 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3438 }
3439 /* Control never gets here */
3440 }
3441 else /* Maximize */
3442 {
3443 pp = eptr;
3444 for (i = min; i < max; i++)
3445 {
3446 if (eptr >= md->end_subject)
3447 {
3448 SCHECK_PARTIAL();
3449 break;
3450 }
3451 if (fc != *eptr) break;
3452 eptr++;
3453 }
3454 if (possessive) continue;
3455
3456 while (eptr >= pp)
3457 {
3458 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3459 eptr--;
3460 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3461 }
3462 RRETURN(MATCH_NOMATCH);
3463 }
3464 }
3465 /* Control never gets here */
3466
3467 /* Match a negated single one-byte character. The character we are
3468 checking can be multibyte. */
3469
3470 case OP_NOT:
3471 case OP_NOTI:
3472 if (eptr >= md->end_subject)
3473 {
3474 SCHECK_PARTIAL();
3475 RRETURN(MATCH_NOMATCH);
3476 }
3477 ecode++;
3478 GETCHARINCTEST(c, eptr);
3479 if (op == OP_NOTI) /* The caseless case */
3480 {
3481 register int ch, och;
3482 ch = *ecode++;
3483 #ifdef COMPILE_PCRE8
3484 /* ch must be < 128 if UTF is enabled. */
3485 och = md->fcc[ch];
3486 #else
3487 #ifdef SUPPORT_UTF
3488 #ifdef SUPPORT_UCP
3489 if (utf && ch > 127)
3490 och = UCD_OTHERCASE(ch);
3491 #else
3492 if (utf && ch > 127)
3493 och = ch;
3494 #endif /* SUPPORT_UCP */
3495 else
3496 #endif /* SUPPORT_UTF */
3497 och = TABLE_GET(ch, md->fcc, ch);
3498 #endif /* COMPILE_PCRE8 */
3499 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3500 }
3501 else /* Caseful */
3502 {
3503 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3504 }
3505 break;
3506
3507 /* Match a negated single one-byte character repeatedly. This is almost a
3508 repeat of the code for a repeated single character, but I haven't found a
3509 nice way of commoning these up that doesn't require a test of the
3510 positive/negative option for each character match. Maybe that wouldn't add
3511 very much to the time taken, but character matching *is* what this is all
3512 about... */
3513
3514 case OP_NOTEXACT:
3515 case OP_NOTEXACTI:
3516 min = max = GET2(ecode, 1);
3517 ecode += 1 + IMM2_SIZE;
3518 goto REPEATNOTCHAR;
3519
3520 case OP_NOTUPTO:
3521 case OP_NOTUPTOI:
3522 case OP_NOTMINUPTO:
3523 case OP_NOTMINUPTOI:
3524 min = 0;
3525 max = GET2(ecode, 1);
3526 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3527 ecode += 1 + IMM2_SIZE;
3528 goto REPEATNOTCHAR;
3529
3530 case OP_NOTPOSSTAR:
3531 case OP_NOTPOSSTARI:
3532 possessive = TRUE;
3533 min = 0;
3534 max = INT_MAX;
3535 ecode++;
3536 goto REPEATNOTCHAR;
3537
3538 case OP_NOTPOSPLUS:
3539 case OP_NOTPOSPLUSI:
3540 possessive = TRUE;
3541 min = 1;
3542 max = INT_MAX;
3543 ecode++;
3544 goto REPEATNOTCHAR;
3545
3546 case OP_NOTPOSQUERY:
3547 case OP_NOTPOSQUERYI:
3548 possessive = TRUE;
3549 min = 0;
3550 max = 1;
3551 ecode++;
3552 goto REPEATNOTCHAR;
3553
3554 case OP_NOTPOSUPTO:
3555 case OP_NOTPOSUPTOI:
3556 possessive = TRUE;
3557 min = 0;
3558 max = GET2(ecode, 1);
3559 ecode += 1 + IMM2_SIZE;
3560 goto REPEATNOTCHAR;
3561
3562 case OP_NOTSTAR:
3563 case OP_NOTSTARI:
3564 case OP_NOTMINSTAR:
3565 case OP_NOTMINSTARI:
3566 case OP_NOTPLUS:
3567 case OP_NOTPLUSI:
3568 case OP_NOTMINPLUS:
3569 case OP_NOTMINPLUSI:
3570 case OP_NOTQUERY:
3571 case OP_NOTQUERYI:
3572 case OP_NOTMINQUERY:
3573 case OP_NOTMINQUERYI:
3574 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3575 minimize = (c & 1) != 0;
3576 min = rep_min[c]; /* Pick up values from tables; */
3577 max = rep_max[c]; /* zero for max => infinity */
3578 if (max == 0) max = INT_MAX;
3579
3580 /* Common code for all repeated single-byte matches. */
3581
3582 REPEATNOTCHAR:
3583 fc = *ecode++;
3584
3585 /* The code is duplicated for the caseless and caseful cases, for speed,
3586 since matching characters is likely to be quite common. First, ensure the
3587 minimum number of matches are present. If min = max, continue at the same
3588 level without recursing. Otherwise, if minimizing, keep trying the rest of
3589 the expression and advancing one matching character if failing, up to the
3590 maximum. Alternatively, if maximizing, find the maximum number of
3591 characters and work backwards. */
3592
3593 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3594 max, eptr));
3595
3596 if (op >= OP_NOTSTARI) /* Caseless */
3597 {
3598 #ifdef COMPILE_PCRE8
3599 /* fc must be < 128 if UTF is enabled. */
3600 foc = md->fcc[fc];
3601 #else
3602 #ifdef SUPPORT_UTF
3603 #ifdef SUPPORT_UCP
3604 if (utf && fc > 127)
3605 foc = UCD_OTHERCASE(fc);
3606 #else
3607 if (utf && fc > 127)
3608 foc = fc;
3609 #endif /* SUPPORT_UCP */
3610 else
3611 #endif /* SUPPORT_UTF */
3612 foc = TABLE_GET(fc, md->fcc, fc);
3613 #endif /* COMPILE_PCRE8 */
3614
3615 #ifdef SUPPORT_UTF
3616 if (utf)
3617 {
3618 register unsigned int d;
3619 for (i = 1; i <= min; i++)
3620 {
3621 if (eptr >= md->end_subject)
3622 {
3623 SCHECK_PARTIAL();
3624 RRETURN(MATCH_NOMATCH);
3625 }
3626 GETCHARINC(d, eptr);
3627 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3628 }
3629 }
3630 else
3631 #endif
3632 /* Not UTF mode */
3633 {
3634 for (i = 1; i <= min; i++)
3635 {
3636 if (eptr >= md->end_subject)
3637 {
3638 SCHECK_PARTIAL();
3639 RRETURN(MATCH_NOMATCH);
3640 }
3641 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3642 eptr++;
3643 }
3644 }
3645
3646 if (min == max) continue;
3647
3648 if (minimize)
3649 {
3650 #ifdef SUPPORT_UTF
3651 if (utf)
3652 {
3653 register unsigned int d;
3654 for (fi = min;; fi++)
3655 {
3656 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3658 if (fi >= max) RRETURN(MATCH_NOMATCH);
3659 if (eptr >= md->end_subject)
3660 {
3661 SCHECK_PARTIAL();
3662 RRETURN(MATCH_NOMATCH);
3663 }
3664 GETCHARINC(d, eptr);
3665 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3666 }
3667 }
3668 else
3669 #endif
3670 /* Not UTF mode */
3671 {
3672 for (fi = min;; fi++)
3673 {
3674 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3676 if (fi >= max) RRETURN(MATCH_NOMATCH);
3677 if (eptr >= md->end_subject)
3678 {
3679 SCHECK_PARTIAL();
3680 RRETURN(MATCH_NOMATCH);
3681 }
3682 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3683 eptr++;
3684 }
3685 }
3686 /* Control never gets here */
3687 }
3688
3689 /* Maximize case */
3690
3691 else
3692 {
3693 pp = eptr;
3694
3695 #ifdef SUPPORT_UTF
3696 if (utf)
3697 {
3698 register unsigned int d;
3699 for (i = min; i < max; i++)
3700 {
3701 int len = 1;
3702 if (eptr >= md->end_subject)
3703 {
3704 SCHECK_PARTIAL();
3705 break;
3706 }
3707 GETCHARLEN(d, eptr, len);
3708 if (fc == d || foc == d) break;
3709 eptr += len;
3710 }
3711 if (possessive) continue;
3712 for(;;)
3713 {
3714 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3715 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3716 if (eptr-- == pp) break; /* Stop if tried at original pos */
3717 BACKCHAR(eptr);
3718 }
3719 }
3720 else
3721 #endif
3722 /* Not UTF mode */
3723 {
3724 for (i = min; i < max; i++)
3725 {
3726 if (eptr >= md->end_subject)
3727 {
3728 SCHECK_PARTIAL();
3729 break;
3730 }
3731 if (fc == *eptr || foc == *eptr) break;
3732 eptr++;
3733 }
3734 if (possessive) continue;
3735 while (eptr >= pp)
3736 {
3737 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3738 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3739 eptr--;
3740 }
3741 }
3742
3743 RRETURN(MATCH_NOMATCH);
3744 }
3745 /* Control never gets here */
3746 }
3747
3748 /* Caseful comparisons */
3749
3750 else
3751 {
3752 #ifdef SUPPORT_UTF
3753 if (utf)
3754 {
3755 register unsigned int d;
3756 for (i = 1; i <= min; i++)
3757 {
3758 if (eptr >= md->end_subject)
3759 {
3760 SCHECK_PARTIAL();
3761 RRETURN(MATCH_NOMATCH);
3762 }
3763 GETCHARINC(d, eptr);
3764 if (fc == d) RRETURN(MATCH_NOMATCH);
3765 }
3766 }
3767 else
3768 #endif
3769 /* Not UTF mode */
3770 {
3771 for (i = 1; i <= min; i++)
3772 {
3773 if (eptr >= md->end_subject)
3774 {
3775 SCHECK_PARTIAL();
3776 RRETURN(MATCH_NOMATCH);
3777 }
3778 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3779 }
3780 }
3781
3782 if (min == max) continue;
3783
3784 if (minimize)
3785 {
3786 #ifdef SUPPORT_UTF
3787 if (utf)
3788 {
3789 register unsigned int d;
3790 for (fi = min;; fi++)
3791 {
3792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3794 if (fi >= max) RRETURN(MATCH_NOMATCH);
3795 if (eptr >= md->end_subject)
3796 {
3797 SCHECK_PARTIAL();
3798 RRETURN(MATCH_NOMATCH);
3799 }
3800 GETCHARINC(d, eptr);
3801 if (fc == d) RRETURN(MATCH_NOMATCH);
3802 }
3803 }
3804 else
3805 #endif
3806 /* Not UTF mode */
3807 {
3808 for (fi = min;; fi++)
3809 {
3810 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3812 if (fi >= max) RRETURN(MATCH_NOMATCH);
3813 if (eptr >= md->end_subject)
3814 {
3815 SCHECK_PARTIAL();
3816 RRETURN(MATCH_NOMATCH);
3817 }
3818 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3819 }
3820 }
3821 /* Control never gets here */
3822 }
3823
3824 /* Maximize case */
3825
3826 else
3827 {
3828 pp = eptr;
3829
3830 #ifdef SUPPORT_UTF
3831 if (utf)
3832 {
3833 register unsigned int d;
3834 for (i = min; i < max; i++)
3835 {
3836 int len = 1;
3837 if (eptr >= md->end_subject)
3838 {
3839 SCHECK_PARTIAL();
3840 break;
3841 }
3842 GETCHARLEN(d, eptr, len);
3843 if (fc == d) break;
3844 eptr += len;
3845 }
3846 if (possessive) continue;
3847 for(;;)
3848 {
3849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3851 if (eptr-- == pp) break; /* Stop if tried at original pos */
3852 BACKCHAR(eptr);
3853 }
3854 }
3855 else
3856 #endif
3857 /* Not UTF mode */
3858 {
3859 for (i = min; i < max; i++)
3860 {
3861 if (eptr >= md->end_subject)
3862 {
3863 SCHECK_PARTIAL();
3864 break;
3865 }
3866 if (fc == *eptr) break;
3867 eptr++;
3868 }
3869 if (possessive) continue;
3870 while (eptr >= pp)
3871 {
3872 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3874 eptr--;
3875 }
3876 }
3877
3878 RRETURN(MATCH_NOMATCH);
3879 }
3880 }
3881 /* Control never gets here */
3882
3883 /* Match a single character type repeatedly; several different opcodes
3884 share code. This is very similar to the code for single characters, but we
3885 repeat it in the interests of efficiency. */
3886
3887 case OP_TYPEEXACT:
3888 min = max = GET2(ecode, 1);
3889 minimize = TRUE;
3890 ecode += 1 + IMM2_SIZE;
3891 goto REPEATTYPE;
3892
3893 case OP_TYPEUPTO:
3894 case OP_TYPEMINUPTO:
3895 min = 0;
3896 max = GET2(ecode, 1);
3897 minimize = *ecode == OP_TYPEMINUPTO;
3898 ecode += 1 + IMM2_SIZE;
3899 goto REPEATTYPE;
3900
3901 case OP_TYPEPOSSTAR:
3902 possessive = TRUE;
3903 min = 0;
3904 max = INT_MAX;
3905 ecode++;
3906 goto REPEATTYPE;
3907
3908 case OP_TYPEPOSPLUS:
3909 possessive = TRUE;
3910 min = 1;
3911 max = INT_MAX;
3912 ecode++;
3913 goto REPEATTYPE;
3914
3915 case OP_TYPEPOSQUERY:
3916 possessive = TRUE;
3917 min = 0;
3918 max = 1;
3919 ecode++;
3920 goto REPEATTYPE;
3921
3922 case OP_TYPEPOSUPTO:
3923 possessive = TRUE;
3924 min = 0;
3925 max = GET2(ecode, 1);
3926 ecode += 1 + IMM2_SIZE;
3927 goto REPEATTYPE;
3928
3929 case OP_TYPESTAR:
3930 case OP_TYPEMINSTAR:
3931 case OP_TYPEPLUS:
3932 case OP_TYPEMINPLUS:
3933 case OP_TYPEQUERY:
3934 case OP_TYPEMINQUERY:
3935 c = *ecode++ - OP_TYPESTAR;
3936 minimize = (c & 1) != 0;
3937 min = rep_min[c]; /* Pick up values from tables; */
3938 max = rep_max[c]; /* zero for max => infinity */
3939 if (max == 0) max = INT_MAX;
3940
3941 /* Common code for all repeated single character type matches. Note that
3942 in UTF-8 mode, '.' matches a character of any length, but for the other
3943 character types, the valid characters are all one-byte long. */
3944
3945 REPEATTYPE:
3946 ctype = *ecode++; /* Code for the character type */
3947
3948 #ifdef SUPPORT_UCP
3949 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3950 {
3951 prop_fail_result = ctype == OP_NOTPROP;
3952 prop_type = *ecode++;
3953 prop_value = *ecode++;
3954 }
3955 else prop_type = -1;
3956 #endif
3957
3958 /* First, ensure the minimum number of matches are present. Use inline
3959 code for maximizing the speed, and do the type test once at the start
3960 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3961 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3962 and single-bytes. */
3963
3964 if (min > 0)
3965 {
3966 #ifdef SUPPORT_UCP
3967 if (prop_type >= 0)
3968 {
3969 switch(prop_type)
3970 {
3971 case PT_ANY:
3972 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3973 for (i = 1; i <= min; i++)
3974 {
3975 if (eptr >= md->end_subject)
3976 {
3977 SCHECK_PARTIAL();
3978 RRETURN(MATCH_NOMATCH);
3979 }
3980 GETCHARINCTEST(c, eptr);
3981 }
3982 break;
3983
3984 case PT_LAMP:
3985 for (i = 1; i <= min; i++)
3986 {
3987 int chartype;
3988 if (eptr >= md->end_subject)
3989 {
3990 SCHECK_PARTIAL();
3991 RRETURN(MATCH_NOMATCH);
3992 }
3993 GETCHARINCTEST(c, eptr);
3994 chartype = UCD_CHARTYPE(c);
3995 if ((chartype == ucp_Lu ||
3996 chartype == ucp_Ll ||
3997 chartype == ucp_Lt) == prop_fail_result)
3998 RRETURN(MATCH_NOMATCH);
3999 }
4000 break;
4001
4002 case PT_GC:
4003 for (i = 1; i <= min; i++)
4004 {
4005 if (eptr >= md->end_subject)
4006 {
4007 SCHECK_PARTIAL();
4008 RRETURN(MATCH_NOMATCH);
4009 }
4010 GETCHARINCTEST(c, eptr);
4011 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4012 RRETURN(MATCH_NOMATCH);
4013 }
4014 break;
4015
4016 case PT_PC:
4017 for (i = 1; i <= min; i++)
4018 {
4019 if (eptr >= md->end_subject)
4020 {
4021 SCHECK_PARTIAL();
4022 RRETURN(MATCH_NOMATCH);
4023 }
4024 GETCHARINCTEST(c, eptr);
4025 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4026 RRETURN(MATCH_NOMATCH);
4027 }
4028 break;
4029
4030 case PT_SC:
4031 for (i = 1; i <= min; i++)
4032 {
4033 if (eptr >= md->end_subject)
4034 {
4035 SCHECK_PARTIAL();
4036 RRETURN(MATCH_NOMATCH);
4037 }
4038 GETCHARINCTEST(c, eptr);
4039 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4040 RRETURN(MATCH_NOMATCH);
4041 }
4042 break;
4043
4044 case PT_ALNUM:
4045 for (i = 1; i <= min; i++)
4046 {
4047 int category;
4048 if (eptr >= md->end_subject)
4049 {
4050 SCHECK_PARTIAL();
4051 RRETURN(MATCH_NOMATCH);
4052 }
4053 GETCHARINCTEST(c, eptr);
4054 category = UCD_CATEGORY(c);
4055 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4056 RRETURN(MATCH_NOMATCH);
4057 }
4058 break;
4059
4060 case PT_SPACE: /* Perl space */
4061 for (i = 1; i <= min; i++)
4062 {
4063 if (eptr >= md->end_subject)
4064 {
4065 SCHECK_PARTIAL();
4066 RRETURN(MATCH_NOMATCH);
4067 }
4068 GETCHARINCTEST(c, eptr);
4069 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4070 c == CHAR_FF || c == CHAR_CR)
4071 == prop_fail_result)
4072 RRETURN(MATCH_NOMATCH);
4073 }
4074 break;
4075
4076 case PT_PXSPACE: /* POSIX space */
4077 for (i = 1; i <= min; i++)
4078 {
4079 if (eptr >= md->end_subject)
4080 {
4081 SCHECK_PARTIAL();
4082 RRETURN(MATCH_NOMATCH);
4083 }
4084 GETCHARINCTEST(c, eptr);
4085 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4086 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4087 == prop_fail_result)
4088 RRETURN(MATCH_NOMATCH);
4089 }
4090 break;
4091
4092 case PT_WORD:
4093 for (i = 1; i <= min; i++)
4094 {
4095 int category;
4096 if (eptr >= md->end_subject)
4097 {
4098 SCHECK_PARTIAL();
4099 RRETURN(MATCH_NOMATCH);
4100 }
4101 GETCHARINCTEST(c, eptr);
4102 category = UCD_CATEGORY(c);
4103 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4104 == prop_fail_result)
4105 RRETURN(MATCH_NOMATCH);
4106 }
4107 break;
4108
4109 /* This should not occur */
4110
4111 default:
4112 RRETURN(PCRE_ERROR_INTERNAL);
4113 }
4114 }
4115
4116 /* Match extended Unicode sequences. We will get here only if the
4117 support is in the binary; otherwise a compile-time error occurs. */
4118
4119 else if (ctype == OP_EXTUNI)
4120 {
4121 for (i = 1; i <= min; i++)
4122 {
4123 if (eptr >= md->end_subject)
4124 {
4125 SCHECK_PARTIAL();
4126 RRETURN(MATCH_NOMATCH);
4127 }
4128 GETCHARINCTEST(c, eptr);
4129 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4130 while (eptr < md->end_subject)
4131 {
4132 int len = 1;
4133 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4134 if (UCD_CATEGORY(c) != ucp_M) break;
4135 eptr += len;
4136 }
4137 }
4138 }
4139
4140 else
4141 #endif /* SUPPORT_UCP */
4142
4143 /* Handle all other cases when the coding is UTF-8 */
4144
4145 #ifdef SUPPORT_UTF
4146 if (utf) switch(ctype)
4147 {
4148 case OP_ANY:
4149 for (i = 1; i <= min; i++)
4150 {
4151 if (eptr >= md->end_subject)
4152 {
4153 SCHECK_PARTIAL();
4154 RRETURN(MATCH_NOMATCH);
4155 }
4156 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4157 eptr++;
4158 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4159 }
4160 break;
4161
4162 case OP_ALLANY:
4163 for (i = 1; i <= min; i++)
4164 {
4165 if (eptr >= md->end_subject)
4166 {
4167 SCHECK_PARTIAL();
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 eptr++;
4171 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4172 }
4173 break;
4174
4175 case OP_ANYBYTE:
4176 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4177 eptr += min;
4178 break;
4179
4180 case OP_ANYNL:
4181 for (i = 1; i <= min; i++)
4182 {
4183 if (eptr >= md->end_subject)
4184 {
4185 SCHECK_PARTIAL();
4186 RRETURN(MATCH_NOMATCH);
4187 }
4188 GETCHARINC(c, eptr);
4189 switch(c)
4190 {
4191 default: RRETURN(MATCH_NOMATCH);
4192
4193 case 0x000d:
4194 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4195 break;
4196
4197 case 0x000a:
4198 break;
4199
4200 case 0x000b:
4201 case 0x000c:
4202 case 0x0085:
4203 case 0x2028:
4204 case 0x2029:
4205 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4206 break;
4207 }
4208 }
4209 break;
4210
4211 case OP_NOT_HSPACE:
4212 for (i = 1; i <= min; i++)
4213 {
4214 if (eptr >= md->end_subject)
4215 {
4216 SCHECK_PARTIAL();
4217 RRETURN(MATCH_NOMATCH);
4218 }
4219 GETCHARINC(c, eptr);
4220 switch(c)
4221 {
4222 default: break;
4223 case 0x09: /* HT */
4224 case 0x20: /* SPACE */
4225 case 0xa0: /* NBSP */
4226 case 0x1680: /* OGHAM SPACE MARK */
4227 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4228 case 0x2000: /* EN QUAD */
4229 case 0x2001: /* EM QUAD */
4230 case 0x2002: /* EN SPACE */
4231 case 0x2003: /* EM SPACE */
4232 case 0x2004: /* THREE-PER-EM SPACE */
4233 case 0x2005: /* FOUR-PER-EM SPACE */
4234 case 0x2006: /* SIX-PER-EM SPACE */
4235 case 0x2007: /* FIGURE SPACE */
4236 case 0x2008: /* PUNCTUATION SPACE */
4237 case 0x2009: /* THIN SPACE */
4238 case 0x200A: /* HAIR SPACE */
4239 case 0x202f: /* NARROW NO-BREAK SPACE */
4240 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4241 case 0x3000: /* IDEOGRAPHIC SPACE */
4242 RRETURN(MATCH_NOMATCH);
4243 }
4244 }
4245 break;
4246
4247 case OP_HSPACE:
4248 for (i = 1; i <= min; i++)
4249 {
4250 if (eptr >= md->end_subject)
4251 {
4252 SCHECK_PARTIAL();
4253 RRETURN(MATCH_NOMATCH);
4254 }
4255 GETCHARINC(c, eptr);
4256 switch(c)
4257 {
4258 default: RRETURN(MATCH_NOMATCH);
4259 case 0x09: /* HT */
4260 case 0x20: /* SPACE */
4261 case 0xa0: /* NBSP */
4262 case 0x1680: /* OGHAM SPACE MARK */
4263 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4264 case 0x2000: /* EN QUAD */
4265 case 0x2001: /* EM QUAD */
4266 case 0x2002: /* EN SPACE */
4267 case 0x2003: /* EM SPACE */
4268 case 0x2004: /* THREE-PER-EM SPACE */
4269 case 0x2005: /* FOUR-PER-EM SPACE */
4270 case 0x2006: /* SIX-PER-EM SPACE */
4271 case 0x2007: /* FIGURE SPACE */
4272 case 0x2008: /* PUNCTUATION SPACE */
4273 case 0x2009: /* THIN SPACE */
4274 case 0x200A: /* HAIR SPACE */
4275 case 0x202f: /* NARROW NO-BREAK SPACE */
4276 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4277 case 0x3000: /* IDEOGRAPHIC SPACE */
4278 break;
4279 }
4280 }
4281 break;
4282
4283 case OP_NOT_VSPACE:
4284 for (i = 1; i <= min; i++)
4285 {
4286 if (eptr >= md->end_subject)
4287 {
4288 SCHECK_PARTIAL();
4289 RRETURN(MATCH_NOMATCH);
4290 }
4291 GETCHARINC(c, eptr);
4292 switch(c)
4293 {
4294 default: break;
4295 case 0x0a: /* LF */
4296 case 0x0b: /* VT */
4297 case 0x0c: /* FF */
4298 case 0x0d: /* CR */
4299 case 0x85: /* NEL */
4300 case 0x2028: /* LINE SEPARATOR */
4301 case 0x2029: /* PARAGRAPH SEPARATOR */
4302 RRETURN(MATCH_NOMATCH);
4303 }
4304 }
4305 break;
4306
4307 case OP_VSPACE:
4308 for (i = 1; i <= min; i++)
4309 {
4310 if (eptr >= md->end_subject)
4311 {
4312 SCHECK_PARTIAL();
4313 RRETURN(MATCH_NOMATCH);
4314 }
4315 GETCHARINC(c, eptr);
4316 switch(c)
4317 {
4318 default: RRETURN(MATCH_NOMATCH);
4319 case 0x0a: /* LF */
4320 case 0x0b: /* VT */
4321 case 0x0c: /* FF */
4322 case 0x0d: /* CR */
4323 case 0x85: /* NEL */
4324 case 0x2028: /* LINE SEPARATOR */
4325 case 0x2029: /* PARAGRAPH SEPARATOR */
4326 break;
4327 }
4328 }
4329 break;
4330
4331 case OP_NOT_DIGIT:
4332 for (i = 1; i <= min; i++)
4333 {
4334 if (eptr >= md->end_subject)
4335 {
4336 SCHECK_PARTIAL();
4337 RRETURN(MATCH_NOMATCH);
4338 }
4339 GETCHARINC(c, eptr);
4340 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4341 RRETURN(MATCH_NOMATCH);
4342 }
4343 break;
4344
4345 case OP_DIGIT:
4346 for (i = 1; i <= min; i++)
4347 {
4348 if (eptr >= md->end_subject)
4349 {
4350 SCHECK_PARTIAL();
4351 RRETURN(MATCH_NOMATCH);
4352 }
4353 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4354 RRETURN(MATCH_NOMATCH);
4355 eptr++;
4356 /* No need to skip more bytes - we know it's a 1-byte character */
4357 }
4358 break;
4359
4360 case OP_NOT_WHITESPACE:
4361 for (i = 1; i <= min; i++)
4362 {
4363 if (eptr >= md->end_subject)
4364 {
4365 SCHECK_PARTIAL();
4366 RRETURN(MATCH_NOMATCH);
4367 }
4368 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4369 RRETURN(MATCH_NOMATCH);
4370 eptr++;
4371 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4372 }
4373 break;
4374
4375 case OP_WHITESPACE:
4376 for (i = 1; i <= min; i++)
4377 {
4378 if (eptr >= md->end_subject)
4379 {
4380 SCHECK_PARTIAL();
4381 RRETURN(MATCH_NOMATCH);
4382 }
4383 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4384 RRETURN(MATCH_NOMATCH);
4385 eptr++;
4386 /* No need to skip more bytes - we know it's a 1-byte character */
4387 }
4388 break;
4389
4390 case OP_NOT_WORDCHAR:
4391 for (i = 1; i <= min; i++)
4392 {
4393 if (eptr >= md->end_subject)
4394 {
4395 SCHECK_PARTIAL();
4396 RRETURN(MATCH_NOMATCH);
4397 }
4398 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4399 RRETURN(MATCH_NOMATCH);
4400 eptr++;
4401 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4402 }
4403 break;
4404
4405 case OP_WORDCHAR:
4406 for (i = 1; i <= min; i++)
4407 {
4408 if (eptr >= md->end_subject)
4409 {
4410 SCHECK_PARTIAL();
4411 RRETURN(MATCH_NOMATCH);
4412 }
4413 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4414 RRETURN(MATCH_NOMATCH);
4415 eptr++;
4416 /* No need to skip more bytes - we know it's a 1-byte character */
4417 }
4418 break;
4419
4420 default:
4421 RRETURN(PCRE_ERROR_INTERNAL);
4422 } /* End switch(ctype) */
4423
4424 else
4425 #endif /* SUPPORT_UTF */
4426
4427 /* Code for the non-UTF case (8-bit or 16-bit code units) for minimum
4428 matching of operators other than OP_PROP and OP_NOTPROP. */
4429
4430 switch(ctype)
4431 {
4432 case OP_ANY:
4433 for (i = 1; i <= min; i++)
4434 {
4435 if (eptr >= md->end_subject)
4436 {
4437 SCHECK_PARTIAL();
4438 RRETURN(MATCH_NOMATCH);
4439 }
4440 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4441 eptr++;
4442 }
4443 break;
4444
4445 case OP_ALLANY:
4446 if (eptr > md->end_subject - min)
4447 {
4448 SCHECK_PARTIAL();
4449 RRETURN(MATCH_NOMATCH);
4450 }
4451 eptr += min;
4452 break;
4453
4454 case OP_ANYBYTE:
4455 if (eptr > md->end_subject - min)
4456 {
4457 SCHECK_PARTIAL();
4458 RRETURN(MATCH_NOMATCH);
4459 }
4460 eptr += min;
4461 break;
4462
4463 case OP_ANYNL:
4464 for (i = 1; i <= min; i++)
4465 {
4466 if (eptr >= md->end_subject)
4467 {
4468 SCHECK_PARTIAL();
4469 RRETURN(MATCH_NOMATCH);
4470 }
4471 switch(*eptr++)
4472 {
4473 default: RRETURN(MATCH_NOMATCH);
4474
4475 case 0x000d:
4476 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4477 break;
4478
4479 case 0x000a:
4480 break;
4481
4482 case 0x000b:
4483 case 0x000c:
4484 case 0x0085:
4485 #ifdef COMPILE_PCRE16
4486 case 0x2028:
4487 case 0x2029:
4488 #endif
4489 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4490 break;
4491 }
4492 }
4493 break;
4494
4495 case OP_NOT_HSPACE:
4496 for (i = 1; i <= min; i++)
4497 {
4498 if (eptr >= md->end_subject)
4499 {
4500 SCHECK_PARTIAL();
4501 RRETURN(MATCH_NOMATCH);
4502 }
4503 switch(*eptr++)
4504 {
4505 default: break;
4506 case 0x09: /* HT */
4507 case 0x20: /* SPACE */
4508 case 0xa0: /* NBSP */
4509 #ifdef COMPILE_PCRE16
4510 case 0x1680: /* OGHAM SPACE MARK */
4511 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4512 case 0x2000: /* EN QUAD */
4513 case 0x2001: /* EM QUAD */
4514 case 0x2002: /* EN SPACE */
4515 case 0x2003: /* EM SPACE */
4516 case 0x2004: /* THREE-PER-EM SPACE */
4517 case 0x2005: /* FOUR-PER-EM SPACE */
4518 case 0x2006: /* SIX-PER-EM SPACE */
4519 case 0x2007: /* FIGURE SPACE */
4520 case 0x2008: /* PUNCTUATION SPACE */
4521 case 0x2009: /* THIN SPACE */
4522 case 0x200A: /* HAIR SPACE */
4523 case 0x202f: /* NARROW NO-BREAK SPACE */
4524 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4525 case 0x3000: /* IDEOGRAPHIC SPACE */
4526 #endif
4527 RRETURN(MATCH_NOMATCH);
4528 }
4529 }
4530 break;
4531
4532 case OP_HSPACE:
4533 for (i = 1; i <= min; i++)
4534 {
4535 if (eptr >= md->end_subject)
4536 {
4537 SCHECK_PARTIAL();
4538 RRETURN(MATCH_NOMATCH);
4539 }
4540 switch(*eptr++)
4541 {
4542 default: RRETURN(MATCH_NOMATCH);
4543 case 0x09: /* HT */
4544 case 0x20: /* SPACE */
4545 case 0xa0: /* NBSP */
4546 #ifdef COMPILE_PCRE16
4547 case 0x1680: /* OGHAM SPACE MARK */
4548 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4549 case 0x2000: /* EN QUAD */
4550 case 0x2001: /* EM QUAD */
4551 case 0x2002: /* EN SPACE */
4552 case 0x2003: /* EM SPACE */
4553 case 0x2004: /* THREE-PER-EM SPACE */
4554 case 0x2005: /* FOUR-PER-EM SPACE */
4555 case 0x2006: /* SIX-PER-EM SPACE */
4556 case 0x2007: /* FIGURE SPACE */
4557 case 0x2008: /* PUNCTUATION SPACE */
4558 case 0x2009: /* THIN SPACE */
4559 case 0x200A: /* HAIR SPACE */
4560 case 0x202f: /* NARROW NO-BREAK SPACE */
4561 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4562 case 0x3000: /* IDEOGRAPHIC SPACE */
4563 #endif
4564 break;
4565 }
4566 }
4567 break;
4568
4569 case OP_NOT_VSPACE:
4570 for (i = 1; i <= min; i++)
4571 {
4572 if (eptr >= md->end_subject)
4573 {
4574 SCHECK_PARTIAL();
4575 RRETURN(MATCH_NOMATCH);
4576 }
4577 switch(*eptr++)
4578 {
4579 default: break;
4580 case 0x0a: /* LF */
4581 case 0x0b: /* VT */
4582 case 0x0c: /* FF */
4583 case 0x0d: /* CR */
4584 case 0x85: /* NEL */
4585 #ifdef COMPILE_PCRE16
4586 case 0x2028: /* LINE SEPARATOR */
4587 case 0x2029: /* PARAGRAPH SEPARATOR */
4588 #endif
4589 RRETURN(MATCH_NOMATCH);
4590 }
4591 }
4592 break;
4593
4594 case OP_VSPACE:
4595 for (i = 1; i <= min; i++)
4596 {
4597 if (eptr >= md->end_subject)
4598 {
4599 SCHECK_PARTIAL();
4600 RRETURN(MATCH_NOMATCH);
4601 }
4602 switch(*eptr++)
4603 {
4604 default: RRETURN(MATCH_NOMATCH);
4605 case 0x0a: /* LF */
4606 case 0x0b: /* VT */
4607 case 0x0c: /* FF */
4608 case 0x0d: /* CR */
4609 case 0x85: /* NEL */
4610 #ifdef COMPILE_PCRE16
4611 case 0x2028: /* LINE SEPARATOR */
4612 case 0x2029: /* PARAGRAPH SEPARATOR */
4613 #endif
4614 break;
4615 }
4616 }
4617 break;
4618
4619 case OP_NOT_DIGIT:
4620 for (i = 1; i <= min; i++)
4621 {
4622 if (eptr >= md->end_subject)
4623 {
4624 SCHECK_PARTIAL();
4625 RRETURN(MATCH_NOMATCH);
4626 }
4627 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4628 RRETURN(MATCH_NOMATCH);
4629 eptr++;
4630 }
4631 break;
4632
4633 case OP_DIGIT:
4634 for (i = 1; i <= min; i++)
4635 {
4636 if (eptr >= md->end_subject)
4637 {
4638 SCHECK_PARTIAL();
4639 RRETURN(MATCH_NOMATCH);
4640 }
4641 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4642 RRETURN(MATCH_NOMATCH);
4643 eptr++;
4644 }
4645 break;
4646
4647 case OP_NOT_WHITESPACE:
4648 for (i = 1; i <= min; i++)
4649 {
4650 if (eptr >= md->end_subject)
4651 {
4652 SCHECK_PARTIAL();
4653 RRETURN(MATCH_NOMATCH);
4654 }
4655 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4656 RRETURN(MATCH_NOMATCH);
4657 eptr++;
4658 }
4659 break;
4660
4661 case OP_WHITESPACE:
4662 for (i = 1; i <= min; i++)
4663 {
4664 if (eptr >= md->end_subject)
4665 {
4666 SCHECK_PARTIAL();
4667 RRETURN(MATCH_NOMATCH);
4668 }
4669 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4670 RRETURN(MATCH_NOMATCH);
4671 eptr++;
4672 }
4673 break;
4674
4675 case OP_NOT_WORDCHAR:
4676 for (i = 1; i <= min; i++)
4677 {
4678 if (eptr >= md->end_subject)
4679 {
4680 SCHECK_PARTIAL();
4681 RRETURN(MATCH_NOMATCH);
4682 }
4683 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4684 RRETURN(MATCH_NOMATCH);
4685 eptr++;
4686 }
4687 break;
4688
4689 case OP_WORDCHAR:
4690 for (i = 1; i <= min; i++)
4691 {
4692 if (eptr >= md->end_subject)
4693 {
4694 SCHECK_PARTIAL();
4695 RRETURN(MATCH_NOMATCH);
4696 }
4697 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4698 RRETURN(MATCH_NOMATCH);
4699 eptr++;
4700 }
4701 break;
4702
4703 default:
4704 RRETURN(PCRE_ERROR_INTERNAL);
4705 }
4706 }
4707
4708 /* If min = max, continue at the same level without recursing */
4709
4710 if (min == max) continue;
4711
4712 /* If minimizing, we have to test the rest of the pattern before each
4713 subsequent match. Again, separate the UTF-8 case for speed, and also
4714 separate the UCP cases. */
4715
4716 if (minimize)
4717 {
4718 #ifdef SUPPORT_UCP
4719 if (prop_type >= 0)
4720 {
4721 switch(prop_type)
4722 {
4723 case PT_ANY:
4724 for (fi = min;; fi++)
4725 {
4726 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4727 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4728 if (fi >= max) RRETURN(MATCH_NOMATCH);
4729 if (eptr >= md->end_subject)
4730 {
4731 SCHECK_PARTIAL();
4732 RRETURN(MATCH_NOMATCH);
4733 }
4734 GETCHARINCTEST(c, eptr);
4735 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4736 }
4737 /* Control never gets here */
4738
4739 case PT_LAMP:
4740 for (fi = min;; fi++)
4741 {
4742 int chartype;
4743 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4744 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4745 if (fi >= max) RRETURN(MATCH_NOMATCH);
4746 if (eptr >= md->end_subject)
4747 {
4748 SCHECK_PARTIAL();
4749 RRETURN(MATCH_NOMATCH);
4750 }
4751 GETCHARINCTEST(c, eptr);
4752 chartype = UCD_CHARTYPE(c);
4753 if ((chartype == ucp_Lu ||
4754 chartype == ucp_Ll ||
4755 chartype == ucp_Lt) == prop_fail_result)
4756 RRETURN(MATCH_NOMATCH);
4757 }
4758 /* Control never gets here */
4759
4760 case PT_GC:
4761 for (fi = min;; fi++)
4762 {
4763 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4764 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4765 if (fi >= max) RRETURN(MATCH_NOMATCH);
4766 if (eptr >= md->end_subject)
4767 {
4768 SCHECK_PARTIAL();
4769 RRETURN(MATCH_NOMATCH);
4770 }
4771 GETCHARINCTEST(c, eptr);
4772 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4773 RRETURN(MATCH_NOMATCH);
4774 }
4775 /* Control never gets here */
4776
4777 case PT_PC:
4778 for (fi = min;; fi++)
4779 {
4780 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4781 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4782 if (fi >= max) RRETURN(MATCH_NOMATCH);
4783 if (eptr >= md->end_subject)
4784 {
4785 SCHECK_PARTIAL();
4786 RRETURN(MATCH_NOMATCH);
4787 }
4788 GETCHARINCTEST(c, eptr);
4789 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4790 RRETURN(MATCH_NOMATCH);
4791 }
4792 /* Control never gets here */
4793
4794 case PT_SC:
4795 for (fi = min;; fi++)
4796 {
4797 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4799 if (fi >= max) RRETURN(MATCH_NOMATCH);
4800 if (eptr >= md->end_subject)
4801 {
4802 SCHECK_PARTIAL();
4803 RRETURN(MATCH_NOMATCH);
4804 }
4805 GETCHARINCTEST(c, eptr);
4806 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4807 RRETURN(MATCH_NOMATCH);
4808 }
4809 /* Control never gets here */
4810
4811 case PT_ALNUM:
4812 for (fi = min;; fi++)
4813 {
4814 int category;
4815 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4817 if (fi >= max) RRETURN(MATCH_NOMATCH);
4818 if (eptr >= md->end_subject)
4819 {
4820 SCHECK_PARTIAL();
4821 RRETURN(MATCH_NOMATCH);
4822 }
4823 GETCHARINCTEST(c, eptr);
4824 category = UCD_CATEGORY(c);
4825 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4826 RRETURN(MATCH_NOMATCH);
4827 }
4828 /* Control never gets here */
4829
4830 case PT_SPACE: /* Perl space */
4831 for (fi = min;; fi++)
4832 {
4833 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4835 if (fi >= max) RRETURN(MATCH_NOMATCH);
4836 if (eptr >= md->end_subject)
4837 {
4838 SCHECK_PARTIAL();
4839 RRETURN(MATCH_NOMATCH);
4840 }
4841 GETCHARINCTEST(c, eptr);
4842 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4843 c == CHAR_FF || c == CHAR_CR)
4844 == prop_fail_result)
4845 RRETURN(MATCH_NOMATCH);
4846 }
4847 /* Control never gets here */
4848
4849 case PT_PXSPACE: /* POSIX space */
4850 for (fi = min;; fi++)
4851 {
4852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4854 if (fi >= max) RRETURN(MATCH_NOMATCH);
4855 if (eptr >= md->end_subject)
4856 {
4857 SCHECK_PARTIAL();
4858 RRETURN(MATCH_NOMATCH);
4859 }
4860 GETCHARINCTEST(c, eptr);
4861 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4862 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4863 == prop_fail_result)
4864 RRETURN(MATCH_NOMATCH);
4865 }
4866 /* Control never gets here */
4867
4868 case PT_WORD:
4869 for (fi = min;; fi++)
4870 {
4871 int category;
4872 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4874 if (fi >= max) RRETURN(MATCH_NOMATCH);
4875 if (eptr >= md->end_subject)
4876 {
4877 SCHECK_PARTIAL();
4878 RRETURN(MATCH_NOMATCH);
4879 }
4880 GETCHARINCTEST(c, eptr);
4881 category = UCD_CATEGORY(c);
4882 if ((category == ucp_L ||
4883 category == ucp_N ||
4884 c == CHAR_UNDERSCORE)
4885 == prop_fail_result)
4886 RRETURN(MATCH_NOMATCH);
4887 }
4888 /* Control never gets here */
4889
4890 /* This should never occur */
4891
4892 default:
4893 RRETURN(PCRE_ERROR_INTERNAL);
4894 }
4895 }
4896
4897 /* Match extended Unicode sequences. We will get here only if the
4898 support is in the binary; otherwise a compile-time error occurs. */
4899
4900 else if (ctype == OP_EXTUNI)
4901 {
4902 for (fi = min;; fi++)
4903 {
4904 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4905 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4906 if (fi >= max) RRETURN(MATCH_NOMATCH);
4907 if (eptr >= md->end_subject)
4908 {
4909 SCHECK_PARTIAL();
4910 RRETURN(MATCH_NOMATCH);
4911 }
4912 GETCHARINCTEST(c, eptr);
4913 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4914 while (eptr < md->end_subject)
4915 {
4916 int len = 1;
4917 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4918 if (UCD_CATEGORY(c) != ucp_M) break;
4919 eptr += len;
4920 }
4921 }
4922 }
4923 else
4924 #endif /* SUPPORT_UCP */
4925
4926 #ifdef SUPPORT_UTF
4927 if (utf)
4928 {
4929 for (fi = min;; fi++)
4930 {
4931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4933 if (fi >= max) RRETURN(MATCH_NOMATCH);
4934 if (eptr >= md->end_subject)
4935 {
4936 SCHECK_PARTIAL();
4937 RRETURN(MATCH_NOMATCH);
4938 }
4939 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4940 RRETURN(MATCH_NOMATCH);
4941 GETCHARINC(c, eptr);
4942 switch(ctype)
4943 {
4944 case OP_ANY: /* This is the non-NL case */
4945 case OP_ALLANY:
4946 case OP_ANYBYTE:
4947 break;
4948
4949 case OP_ANYNL:
4950 switch(c)
4951 {
4952 default: RRETURN(MATCH_NOMATCH);
4953 case 0x000d:
4954 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4955 break;
4956 case 0x000a:
4957 break;
4958
4959 case 0x000b:
4960 case 0x000c:
4961 case 0x0085:
4962 case 0x2028:
4963 case 0x2029:
4964 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4965 break;
4966 }
4967 break;
4968
4969 case OP_NOT_HSPACE:
4970 switch(c)
4971 {
4972 default: break;
4973 case 0x09: /* HT */
4974 case 0x20: /* SPACE */
4975 case 0xa0: /* NBSP */
4976 case 0x1680: /* OGHAM SPACE MARK */
4977 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4978 case 0x2000: /* EN QUAD */
4979 case 0x2001: /* EM QUAD */
4980 case 0x2002: /* EN SPACE */
4981 case 0x2003: /* EM SPACE */
4982 case 0x2004: /* THREE-PER-EM SPACE */
4983 case 0x2005: /* FOUR-PER-EM SPACE */
4984 case 0x2006: /* SIX-PER-EM SPACE */
4985 case 0x2007: /* FIGURE SPACE */
4986 case 0x2008: /* PUNCTUATION SPACE */
4987 case 0x2009: /* THIN SPACE */
4988 case 0x200A: /* HAIR SPACE */
4989 case 0x202f: /* NARROW NO-BREAK SPACE */
4990 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4991 case 0x3000: /* IDEOGRAPHIC SPACE */
4992 RRETURN(MATCH_NOMATCH);
4993 }
4994 break;
4995
4996 case OP_HSPACE:
4997 switch(c)
4998 {
4999 default: RRETURN(MATCH_NOMATCH);
5000 case 0x09: /* HT */
5001 case 0x20: /* SPACE */
5002 case 0xa0: /* NBSP */
5003 case 0x1680: /* OGHAM SPACE MARK */
5004 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5005 case 0x2000: /* EN QUAD */
5006 case 0x2001: /* EM QUAD */
5007 case 0x2002: /* EN SPACE */
5008 case 0x2003: /* EM SPACE */
5009 case 0x2004: /* THREE-PER-EM SPACE */
5010 case 0x2005: /* FOUR-PER-EM SPACE */
5011 case 0x2006: /* SIX-PER-EM SPACE */
5012 case 0x2007: /* FIGURE SPACE */
5013 case 0x2008: /* PUNCTUATION SPACE */
5014 case 0x2009: /* THIN SPACE */
5015 case 0x200A: /* HAIR SPACE */
5016 case 0x202f: /* NARROW NO-BREAK SPACE */
5017 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5018 case 0x3000: /* IDEOGRAPHIC SPACE */
5019 break;
5020 }
5021 break;
5022
5023 case OP_NOT_VSPACE:
5024 switch(c)
5025 {
5026 default: break;
5027 case 0x0a: /* LF */
5028 case 0x0b: /* VT */
5029 case 0x0c: /* FF */
5030 case 0x0d: /* CR */
5031 case 0x85: /* NEL */
5032 case 0x2028: /* LINE SEPARATOR */
5033 case 0x2029: /* PARAGRAPH SEPARATOR */
5034 RRETURN(MATCH_NOMATCH);
5035 }
5036 break;
5037
5038 case OP_VSPACE:
5039 switch(c)
5040 {
5041 default: RRETURN(MATCH_NOMATCH);
5042 case 0x0a: /* LF */
5043 case 0x0b: /* VT */
5044 case 0x0c: /* FF */
5045 case 0x0d: /* CR */
5046 case 0x85: /* NEL */
5047 case 0x2028: /* LINE SEPARATOR */
5048 case 0x2029: /* PARAGRAPH SEPARATOR */
5049 break;
5050 }
5051 break;
5052
5053 case OP_NOT_DIGIT:
5054 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5055 RRETURN(MATCH_NOMATCH);
5056 break;
5057
5058 case OP_DIGIT:
5059 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5060 RRETURN(MATCH_NOMATCH);
5061 break;
5062
5063 case OP_NOT_WHITESPACE:
5064 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5065 RRETURN(MATCH_NOMATCH);
5066 break;
5067
5068 case OP_WHITESPACE:
5069 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5070 RRETURN(MATCH_NOMATCH);
5071 break;
5072
5073 case OP_NOT_WORDCHAR:
5074 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5075 RRETURN(MATCH_NOMATCH);
5076 break;
5077
5078 case OP_WORDCHAR:
5079 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5080 RRETURN(MATCH_NOMATCH);
5081 break;
5082
5083 default:
5084 RRETURN(PCRE_ERROR_INTERNAL);
5085 }
5086 }
5087 }
5088 else
5089 #endif
5090 /* Not UTF mode */
5091 {
5092 for (fi = min;; fi++)
5093 {
5094 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5096 if (fi >= max) RRETURN(MATCH_NOMATCH);
5097 if (eptr >= md->end_subject)
5098 {
5099 SCHECK_PARTIAL();
5100 RRETURN(MATCH_NOMATCH);
5101 }
5102 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5103 RRETURN(MATCH_NOMATCH);
5104 c = *eptr++;
5105 switch(ctype)
5106 {
5107 case OP_ANY: /* This is the non-NL case */
5108 case OP_ALLANY:
5109 case OP_ANYBYTE:
5110 break;
5111
5112 case OP_ANYNL:
5113 switch(c)
5114 {
5115 default: RRETURN(MATCH_NOMATCH);
5116 case 0x000d:
5117 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5118 break;
5119
5120 case 0x000a:
5121 break;
5122
5123 case 0x000b:
5124 case 0x000c:
5125 case 0x0085:
5126 #ifdef COMPILE_PCRE16
5127 case 0x2028:
5128 case 0x2029:
5129 #endif
5130 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5131 break;
5132 }
5133 break;
5134
5135 case OP_NOT_HSPACE:
5136 switch(c)
5137 {
5138 default: break;
5139 case 0x09: /* HT */
5140 case 0x20: /* SPACE */
5141 case 0xa0: /* NBSP */
5142 #ifdef COMPILE_PCRE16
5143 case 0x1680: /* OGHAM SPACE MARK */
5144 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5145 case 0x2000: /* EN QUAD */
5146 case 0x2001: /* EM QUAD */
5147 case 0x2002: /* EN SPACE */
5148 case 0x2003: /* EM SPACE */
5149 case 0x2004: /* THREE-PER-EM SPACE */
5150 case 0x2005: /* FOUR-PER-EM SPACE */
5151 case 0x2006: /* SIX-PER-EM SPACE */
5152 case 0x2007: /* FIGURE SPACE */
5153 case 0x2008: /* PUNCTUATION SPACE */
5154 case 0x2009: /* THIN SPACE */
5155 case 0x200A: /* HAIR SPACE */
5156 case 0x202f: /* NARROW NO-BREAK SPACE */
5157 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5158 case 0x3000: /* IDEOGRAPHIC SPACE */
5159 #endif
5160 RRETURN(MATCH_NOMATCH);
5161 }
5162 break;
5163
5164 case OP_HSPACE:
5165 switch(c)
5166 {
5167 default: RRETURN(MATCH_NOMATCH);
5168 case 0x09: /* HT */
5169 case 0x20: /* SPACE */
5170 case 0xa0: /* NBSP */
5171 #ifdef COMPILE_PCRE16
5172 case 0x1680: /* OGHAM SPACE MARK */
5173 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5174 case 0x2000: /* EN QUAD */
5175 case 0x2001: /* EM QUAD */
5176 case 0x2002: /* EN SPACE */
5177 case 0x2003: /* EM SPACE */
5178 case 0x2004: /* THREE-PER-EM SPACE */
5179 case 0x2005: /* FOUR-PER-EM SPACE */
5180 case 0x2006: /* SIX-PER-EM SPACE */
5181 case 0x2007: /* FIGURE SPACE */
5182 case 0x2008: /* PUNCTUATION SPACE */
5183 case 0x2009: /* THIN SPACE */
5184 case 0x200A: /* HAIR SPACE */
5185 case 0x202f: /* NARROW NO-BREAK SPACE */
5186 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5187 case 0x3000: /* IDEOGRAPHIC SPACE */
5188 #endif
5189 break;
5190 }
5191 break;
5192
5193 case OP_NOT_VSPACE:
5194 switch(c)
5195 {
5196 default: break;
5197 case 0x0a: /* LF */
5198 case 0x0b: /* VT */
5199 case 0x0c: /* FF */
5200 case 0x0d: /* CR */
5201 case 0x85: /* NEL */
5202 #ifdef COMPILE_PCRE16
5203 case 0x2028: /* LINE SEPARATOR */
5204 case 0x2029: /* PARAGRAPH SEPARATOR */
5205 #endif
5206 RRETURN(MATCH_NOMATCH);
5207 }
5208 break;
5209
5210 case OP_VSPACE:
5211 switch(c)
5212 {
5213 default: RRETURN(MATCH_NOMATCH);
5214 case 0x0a: /* LF */
5215 case 0x0b: /* VT */
5216 case 0x0c: /* FF */
5217 case 0x0d: /* CR */
5218 case 0x85: /* NEL */
5219 #ifdef COMPILE_PCRE16
5220 case 0x2028: /* LINE SEPARATOR */
5221 case 0x2029: /* PARAGRAPH SEPARATOR */
5222 #endif
5223 break;
5224 }
5225 break;
5226
5227 case OP_NOT_DIGIT:
5228 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5229 break;
5230
5231 case OP_DIGIT:
5232 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5233 break;
5234
5235 case OP_NOT_WHITESPACE:
5236 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5237 break;
5238
5239 case OP_WHITESPACE:
5240 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5241 break;
5242
5243 case OP_NOT_WORDCHAR:
5244 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5245 break;
5246
5247 case OP_WORDCHAR:
5248 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5249 break;
5250
5251 default:
5252 RRETURN(PCRE_ERROR_INTERNAL);
5253 }
5254 }
5255 }
5256 /* Control never gets here */
5257 }
5258
5259 /* If maximizing, it is worth using inline code for speed, doing the type
5260 test once at the start (i.e. keep it out of the loop). Again, keep the
5261 UTF-8 and UCP stuff separate. */
5262
5263 else
5264 {
5265 pp = eptr; /* Remember where we started */
5266
5267 #ifdef SUPPORT_UCP
5268 if (prop_type >= 0)
5269 {
5270 switch(prop_type)
5271 {
5272 case PT_ANY:
5273 for (i = min; i < max; i++)
5274 {
5275 int len = 1;
5276 if (eptr >= md->end_subject)
5277 {
5278 SCHECK_PARTIAL();
5279 break;
5280 }
5281 GETCHARLENTEST(c, eptr, len);
5282 if (prop_fail_result) break;
5283 eptr+= len;
5284 }
5285 break;
5286
5287 case PT_LAMP:
5288 for (i = min; i < max; i++)
5289 {
5290 int chartype;
5291 int len = 1;
5292 if (eptr >= md->end_subject)
5293 {
5294 SCHECK_PARTIAL();
5295 break;
5296 }
5297 GETCHARLENTEST(c, eptr, len);
5298 chartype = UCD_CHARTYPE(c);
5299 if ((chartype == ucp_Lu ||
5300 chartype == ucp_Ll ||
5301 chartype == ucp_Lt) == prop_fail_result)
5302 break;
5303 eptr+= len;
5304 }
5305 break;
5306
5307 case PT_GC:
5308 for (i = min; i < max; i++)
5309 {
5310 int len = 1;
5311 if (eptr >= md->end_subject)
5312 {
5313 SCHECK_PARTIAL();
5314 break;
5315 }
5316 GETCHARLENTEST(c, eptr, len);
5317 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5318 eptr+= len;
5319 }
5320 break;
5321
5322 case PT_PC:
5323 for (i = min; i < max; i++)
5324 {
5325 int len = 1;
5326 if (eptr >= md->end_subject)
5327 {
5328 SCHECK_PARTIAL();
5329 break;
5330 }
5331 GETCHARLENTEST(c, eptr, len);
5332 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5333 eptr+= len;
5334 }
5335 break;
5336
5337 case PT_SC:
5338 for (i = min; i < max; i++)
5339 {
5340 int len = 1;
5341 if (eptr >= md->end_subject)
5342 {
5343 SCHECK_PARTIAL();
5344 break;
5345 }
5346 GETCHARLENTEST(c, eptr, len);
5347 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5348 eptr+= len;
5349 }
5350 break;
5351
5352 case PT_ALNUM:
5353 for (i = min; i < max; i++)
5354 {
5355 int category;
5356 int len = 1;
5357 if (eptr >= md->end_subject)
5358 {
5359 SCHECK_PARTIAL();
5360 break;
5361 }
5362 GETCHARLENTEST(c, eptr, len);
5363 category = UCD_CATEGORY(c);
5364 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5365 break;
5366 eptr+= len;
5367 }
5368 break;
5369
5370 case PT_SPACE: /* Perl space */
5371 for (i = min; i < max; i++)
5372 {
5373 int len = 1;
5374 if (eptr >= md->end_subject)
5375 {
5376 SCHECK_PARTIAL();
5377 break;
5378 }
5379 GETCHARLENTEST(c, eptr, len);
5380 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5381 c == CHAR_FF || c == CHAR_CR)
5382 == prop_fail_result)
5383 break;
5384 eptr+= len;
5385 }
5386 break;
5387
5388 case PT_PXSPACE: /* POSIX space */
5389 for (i = min; i < max; i++)
5390 {
5391 int len = 1;
5392 if (eptr >= md->end_subject)
5393 {
5394 SCHECK_PARTIAL();
5395 break;
5396 }
5397 GETCHARLENTEST(c, eptr, len);
5398 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5399 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5400 == prop_fail_result)
5401 break;
5402 eptr+= len;
5403 }
5404 break;
5405
5406 case PT_WORD:
5407 for (i = min; i < max; i++)
5408 {
5409 int category;
5410 int len = 1;
5411 if (eptr >= md->end_subject)
5412 {
5413 SCHECK_PARTIAL();
5414 break;
5415 }
5416 GETCHARLENTEST(c, eptr, len);
5417 category = UCD_CATEGORY(c);
5418 if ((category == ucp_L || category == ucp_N ||
5419 c == CHAR_UNDERSCORE) == prop_fail_result)
5420 break;
5421 eptr+= len;
5422 }
5423 break;
5424
5425 default:
5426 RRETURN(PCRE_ERROR_INTERNAL);
5427 }
5428
5429 /* eptr is now past the end of the maximum run */
5430
5431 if (possessive) continue;
5432 for(;;)
5433 {
5434 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5435 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5436 if (eptr-- == pp) break; /* Stop if tried at original pos */
5437 if (utf) BACKCHAR(eptr);
5438 }
5439 }
5440
5441 /* Match extended Unicode sequences. We will get here only if the
5442 support is in the binary; otherwise a compile-time error occurs. */
5443
5444 else if (ctype == OP_EXTUNI)
5445 {
5446 for (i = min; i < max; i++)
5447 {
5448 int len = 1;
5449 if (eptr >= md->end_subject)
5450 {
5451 SCHECK_PARTIAL();
5452 break;
5453 }
5454 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5455 if (UCD_CATEGORY(c) == ucp_M) break;
5456 eptr += len;
5457 while (eptr < md->end_subject)
5458 {
5459 len = 1;
5460 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5461 if (UCD_CATEGORY(c) != ucp_M) break;
5462 eptr += len;
5463 }
5464 }
5465
5466 /* eptr is now past the end of the maximum run */
5467
5468 if (possessive) continue;
5469
5470 for(;;)
5471 {
5472 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5473 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5474 if (eptr-- == pp) break; /* Stop if tried at original pos */
5475 for (;;) /* Move back over one extended */
5476 {
5477 if (!utf) c = *eptr; else
5478 {
5479 BACKCHAR(eptr);
5480 GETCHAR(c, eptr);
5481 }
5482 if (UCD_CATEGORY(c) != ucp_M) break;
5483 eptr--;
5484 }
5485 }
5486 }
5487
5488 else
5489 #endif /* SUPPORT_UCP */
5490
5491 #ifdef SUPPORT_UTF
5492 if (utf)
5493 {
5494 switch(ctype)
5495 {
5496 case OP_ANY:
5497 if (max < INT_MAX)
5498 {
5499 for (i = min; i < max; i++)
5500 {
5501 if (eptr >= md->end_subject)
5502 {
5503 SCHECK_PARTIAL();
5504 break;
5505 }
5506 if (IS_NEWLINE(eptr)) break;
5507 eptr++;
5508 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5509 }
5510 }
5511
5512 /* Handle unlimited UTF-8 repeat */
5513
5514 else
5515 {
5516 for (i = min; i < max; i++)
5517 {
5518 if (eptr >= md->end_subject)
5519 {
5520 SCHECK_PARTIAL();
5521 break;
5522 }
5523 if (IS_NEWLINE(eptr)) break;
5524 eptr++;
5525 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5526 }
5527 }
5528 break;
5529
5530 case OP_ALLANY:
5531 if (max < INT_MAX)
5532 {
5533 for (i = min; i < max; i++)
5534 {
5535 if (eptr >= md->end_subject)
5536 {
5537 SCHECK_PARTIAL();
5538 break;
5539 }
5540 eptr++;
5541 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5542 }
5543 }
5544 else
5545 {
5546 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5547 SCHECK_PARTIAL();
5548 }
5549 break;
5550
5551 /* The byte case is the same as non-UTF8 */
5552
5553 case OP_ANYBYTE:
5554 c = max - min;
5555 if (c > (unsigned int)(md->end_subject - eptr))
5556 {
5557 eptr = md->end_subject;
5558 SCHECK_PARTIAL();
5559 }
5560 else eptr += c;
5561 break;
5562
5563 case OP_ANYNL:
5564 for (i = min; i < max; i++)
5565 {
5566 int len = 1;
5567 if (eptr >= md->end_subject)
5568 {
5569 SCHECK_PARTIAL();
5570 break;
5571 }
5572 GETCHARLEN(c, eptr, len);
5573 if (c == 0x000d)
5574 {
5575 if (++eptr >= md->end_subject) break;
5576 if (*eptr == 0x000a) eptr++;
5577 }
5578 else
5579 {
5580 if (c != 0x000a &&
5581 (md->bsr_anycrlf ||
5582 (c != 0x000b && c != 0x000c &&
5583 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5584 break;
5585 eptr += len;
5586 }
5587 }
5588 break;
5589
5590 case OP_NOT_HSPACE:
5591 case OP_HSPACE:
5592 for (i = min; i < max; i++)
5593 {
5594 BOOL gotspace;
5595 int len = 1;
5596 if (eptr >= md->end_subject)
5597 {
5598 SCHECK_PARTIAL();
5599 break;
5600 }
5601 GETCHARLEN(c, eptr, len);
5602 switch(c)
5603 {
5604 default: gotspace = FALSE; break;
5605 case 0x09: /* HT */
5606 case 0x20: /* SPACE */
5607 case 0xa0: /* NBSP */
5608 case 0x1680: /* OGHAM SPACE MARK */
5609 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5610 case 0x2000: /* EN QUAD */
5611 case 0x2001: /* EM QUAD */
5612 case 0x2002: /* EN SPACE */
5613 case 0x2003: /* EM SPACE */
5614 case 0x2004: /* THREE-PER-EM SPACE */
5615 case 0x2005: /* FOUR-PER-EM SPACE */
5616 case 0x2006: /* SIX-PER-EM SPACE */
5617 case 0x2007: /* FIGURE SPACE */
5618 case 0x2008: /* PUNCTUATION SPACE */
5619 case 0x2009: /* THIN SPACE */
5620 case 0x200A: /* HAIR SPACE */
5621 case 0x202f: /* NARROW NO-BREAK SPACE */
5622 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5623 case 0x3000: /* IDEOGRAPHIC SPACE */
5624 gotspace = TRUE;
5625 break;
5626 }
5627 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5628 eptr += len;
5629 }
5630 break;
5631
5632 case OP_NOT_VSPACE:
5633 case OP_VSPACE:
5634 for (i = min; i < max; i++)
5635 {
5636 BOOL gotspace;
5637 int len = 1;
5638 if (eptr >= md->end_subject)
5639 {
5640 SCHECK_PARTIAL();
5641 break;
5642 }
5643 GETCHARLEN(c, eptr, len);
5644 switch(c)
5645 {
5646 default: gotspace = FALSE; break;
5647 case 0x0a: /* LF */
5648 case 0x0b: /* VT */
5649 case 0x0c: /* FF */
5650 case 0x0d: /* CR */
5651 case 0x85: /* NEL */
5652 case 0x2028: /* LINE SEPARATOR */
5653 case 0x2029: /* PARAGRAPH SEPARATOR */
5654 gotspace = TRUE;
5655 break;
5656 }
5657 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5658 eptr += len;
5659 }
5660 break;
5661
5662 case OP_NOT_DIGIT:
5663 for (i = min; i < max; i++)
5664 {
5665 int len = 1;
5666 if (eptr >= md->end_subject)
5667 {
5668 SCHECK_PARTIAL();
5669 break;
5670 }
5671 GETCHARLEN(c, eptr, len);
5672 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5673 eptr+= len;
5674 }
5675 break;
5676
5677 case OP_DIGIT:
5678 for (i = min; i < max; i++)
5679 {
5680 int len = 1;
5681 if (eptr >= md->end_subject)
5682 {
5683 SCHECK_PARTIAL();
5684 break;
5685 }
5686 GETCHARLEN(c, eptr, len);
5687 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5688 eptr+= len;
5689 }
5690 break;
5691
5692 case OP_NOT_WHITESPACE:
5693 for (i = min; i < max; i++)
5694 {
5695 int len = 1;
5696 if (eptr >= md->end_subject)
5697 {
5698 SCHECK_PARTIAL();
5699 break;
5700 }
5701 GETCHARLEN(c, eptr, len);
5702 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5703 eptr+= len;
5704 }
5705 break;
5706
5707 case OP_WHITESPACE:
5708 for (i = min; i < max; i++)
5709 {
5710 int len = 1;
5711 if (eptr >= md->end_subject)
5712 {
5713 SCHECK_PARTIAL();
5714 break;
5715 }
5716 GETCHARLEN(c, eptr, len);
5717 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5718 eptr+= len;
5719 }
5720 break;
5721
5722 case OP_NOT_WORDCHAR:
5723 for (i = min; i < max; i++)
5724 {
5725 int len = 1;
5726 if (eptr >= md->end_subject)
5727 {
5728 SCHECK_PARTIAL();
5729 break;
5730 }
5731 GETCHARLEN(c, eptr, len);
5732 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5733 eptr+= len;
5734 }
5735 break;
5736
5737 case OP_WORDCHAR:
5738 for (i = min; i < max; i++)
5739 {
5740 int len = 1;
5741 if (eptr >= md->end_subject)
5742 {
5743 SCHECK_PARTIAL();
5744 break;
5745 }
5746 GETCHARLEN(c, eptr, len);
5747 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5748 eptr+= len;
5749 }
5750 break;
5751
5752 default:
5753 RRETURN(PCRE_ERROR_INTERNAL);
5754 }
5755
5756 /* eptr is now past the end of the maximum run. If possessive, we are
5757 done (no backing up). Otherwise, match at this position; anything other
5758 than no match is immediately returned. For nomatch, back up one
5759 character, unless we are matching \R and the last thing matched was
5760 \r\n, in which case, back up two bytes. */
5761
5762 if (possessive) continue;
5763 for(;;)
5764 {
5765 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5766 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5767 if (eptr-- == pp) break; /* Stop if tried at original pos */
5768 BACKCHAR(eptr);
5769 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5770 eptr[-1] == '\r') eptr--;
5771 }
5772 }
5773 else
5774 #endif /* SUPPORT_UTF */
5775 /* Not UTF mode */
5776 {
5777 switch(ctype)
5778 {
5779 case OP_ANY:
5780 for (i = min; i < max; i++)
5781 {
5782 if (eptr >= md->end_subject)
5783 {
5784 SCHECK_PARTIAL();
5785 break;
5786 }
5787 if (IS_NEWLINE(eptr)) break;
5788 eptr++;
5789 }
5790 break;
5791
5792 case OP_ALLANY:
5793 case OP_ANYBYTE:
5794 c = max - min;
5795 if (c > (unsigned int)(md->end_subject - eptr))
5796 {
5797 eptr = md->end_subject;
5798 SCHECK_PARTIAL();
5799 }
5800 else eptr += c;
5801 break;
5802
5803 case OP_ANYNL:
5804 for (i = min; i < max; i++)
5805 {
5806 if (eptr >= md->end_subject)
5807 {
5808 SCHECK_PARTIAL();
5809 break;
5810 }
5811 c = *eptr;
5812 if (c == 0x000d)
5813 {
5814 if (++eptr >= md->end_subject) break;
5815 if (*eptr == 0x000a) eptr++;
5816 }
5817 else
5818 {
5819 if (c != 0x000a && (md->bsr_anycrlf ||
5820 (c != 0x000b && c != 0x000c && c != 0x0085
5821 #ifdef COMPILE_PCRE16
5822 && c != 0x2028 && c != 0x2029
5823 #endif
5824 ))) break;
5825 eptr++;
5826 }
5827 }
5828 break;
5829
5830 case OP_NOT_HSPACE:
5831 for (i = min; i < max; i++)
5832 {
5833 if (eptr >= md->end_subject)
5834 {
5835 SCHECK_PARTIAL();
5836 break;
5837 }
5838 c = *eptr;
5839 if (c == 0x09 || c == 0x20 || c == 0xa0
5840 #ifdef COMPILE_PCRE16
5841 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
5842 || c == 0x202f || c == 0x205f || c == 0x3000
5843 #endif
5844 ) break;
5845 eptr++;
5846 }
5847 break;
5848
5849 case OP_HSPACE:
5850 for (i = min; i < max; i++)
5851 {
5852 if (eptr >= md->end_subject)
5853 {
5854 SCHECK_PARTIAL();
5855 break;
5856 }
5857 c = *eptr;
5858 if (c != 0x09 && c != 0x20 && c != 0xa0
5859 #ifdef COMPILE_PCRE16
5860 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
5861 && c != 0x202f && c != 0x205f && c != 0x3000
5862 #endif
5863 ) break;
5864 eptr++;
5865 }
5866 break;
5867
5868 case OP_NOT_VSPACE:
5869 for (i = min; i < max; i++)
5870 {
5871 if (eptr >= md->end_subject)
5872 {
5873 SCHECK_PARTIAL();
5874 break;
5875 }
5876 c = *eptr;
5877 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
5878 #ifdef COMPILE_PCRE16
5879 || c == 0x2028 || c == 0x2029
5880 #endif
5881 ) break;
5882 eptr++;
5883 }
5884 break;
5885
5886 case OP_VSPACE:
5887 for (i = min; i < max; i++)
5888 {
5889 if (eptr >= md->end_subject)
5890 {
5891 SCHECK_PARTIAL();
5892 break;
5893 }
5894 c = *eptr;
5895 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
5896 #ifdef COMPILE_PCRE16
5897 && c != 0x2028 && c != 0x2029
5898 #endif
5899 ) break;
5900 eptr++;
5901 }
5902 break;
5903
5904 case OP_NOT_DIGIT:
5905 for (i = min; i < max; i++)
5906 {
5907 if (eptr >= md->end_subject)
5908 {
5909 SCHECK_PARTIAL();
5910 break;
5911 }
5912 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5913 eptr++;
5914 }
5915 break;
5916
5917 case OP_DIGIT:
5918 for (i = min; i < max; i++)
5919 {
5920 if (eptr >= md->end_subject)
5921 {
5922 SCHECK_PARTIAL();
5923 break;
5924 }
5925 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5926 eptr++;
5927 }
5928 break;
5929
5930 case OP_NOT_WHITESPACE:
5931 for (i = min; i < max; i++)
5932 {
5933 if (eptr >= md->end_subject)
5934 {
5935 SCHECK_PARTIAL();
5936 break;
5937 }
5938 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5939 eptr++;
5940 }
5941 break;
5942
5943 case OP_WHITESPACE:
5944 for (i = min; i < max; i++)
5945 {
5946 if (eptr >= md->end_subject)
5947 {
5948 SCHECK_PARTIAL();
5949 break;
5950 }
5951 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5952 eptr++;
5953 }
5954 break;
5955
5956 case OP_NOT_WORDCHAR:
5957 for (i = min; i < max; i++)
5958 {
5959 if (eptr >= md->end_subject)
5960 {
5961 SCHECK_PARTIAL();
5962 break;
5963 }
5964 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5965 eptr++;
5966 }
5967 break;
5968
5969 case OP_WORDCHAR:
5970 for (i = min; i < max; i++)
5971 {
5972 if (eptr >= md->end_subject)
5973 {
5974 SCHECK_PARTIAL();
5975 break;
5976 }
5977 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
5978 eptr++;
5979 }
5980 break;
5981
5982 default:
5983 RRETURN(PCRE_ERROR_INTERNAL);
5984 }
5985
5986 /* eptr is now past the end of the maximum run. If possessive, we are
5987 done (no backing up). Otherwise, match at this position; anything other
5988 than no match is immediately returned. For nomatch, back up one
5989 character (byte), unless we are matching \R and the last thing matched
5990 was \r\n, in which case, back up two bytes. */
5991
5992 if (possessive) continue;
5993 while (eptr >= pp)
5994 {
5995 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5997 eptr--;
5998 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5999 eptr[-1] == '\r') eptr--;
6000 }
6001 }
6002
6003 /* Get here if we can't make it match with any permitted repetitions */
6004
6005 RRETURN(MATCH_NOMATCH);
6006 }
6007 /* Control never gets here */
6008
6009 /* There's been some horrible disaster. Arrival here can only mean there is
6010 something seriously wrong in the code above or the OP_xxx definitions. */
6011
6012 default:
6013 DPRINTF(("Unknown opcode %d\n", *ecode));
6014 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6015 }
6016
6017 /* Do not stick any code in here without much thought; it is assumed
6018 that "continue" in the code above comes out to here to repeat the main
6019 loop. */
6020
6021 } /* End of main loop */
6022 /* Control never reaches here */
6023
6024
6025 /* When compiling to use the heap rather than the stack for recursive calls to
6026 match(), the RRETURN() macro jumps here. The number that is saved in
6027 frame->Xwhere indicates which label we actually want to return to. */
6028
6029 #ifdef NO_RECURSE
6030 #define LBL(val) case val: goto L_RM##val;
6031 HEAP_RETURN:
6032 switch (frame->Xwhere)
6033 {
6034 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6035 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6036 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6037 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6038 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6039 LBL(65) LBL(66)
6040 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6041 LBL(21)
6042 #endif
6043 #ifdef SUPPORT_UTF
6044 LBL(16) LBL(18) LBL(20)
6045 LBL(22) LBL(23) LBL(28) LBL(30)
6046 LBL(32) LBL(34) LBL(42) LBL(46)
6047 #ifdef SUPPORT_UCP
6048 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6049 LBL(59) LBL(60) LBL(61) LBL(62)
6050 #endif /* SUPPORT_UCP */
6051 #endif /* SUPPORT_UTF */
6052 default:
6053 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6054
6055 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6056
6057 return PCRE_ERROR_INTERNAL;
6058 }
6059 #undef LBL
6060 #endif /* NO_RECURSE */
6061 }
6062
6063
6064 /***************************************************************************
6065 ****************************************************************************
6066 RECURSION IN THE match() FUNCTION
6067
6068 Undefine all the macros that were defined above to handle this. */
6069
6070 #ifdef NO_RECURSE /* heap-rather-than-stack build: undo the macros defined above */
6071 #undef eptr
6072 #undef ecode
6073 #undef mstart
6074 #undef offset_top
6075 #undef eptrb
6076 #undef flags
6077
6078 #undef callpat
6079 #undef charptr
6080 #undef data
6081 #undef next
6082 #undef pp
6083 #undef prev
6084 #undef saved_eptr
6085
6086 #undef new_recursive
6087
6088 #undef cur_is_word
6089 #undef condition
6090 #undef prev_is_word
6091
6092 #undef ctype
6093 #undef length
6094 #undef max
6095 #undef min
6096 #undef number
6097 #undef offset
6098 #undef op
6099 #undef save_capture_last
6100 #undef save_offset1
6101 #undef save_offset2
6102 #undef save_offset3
6103 #undef stacksave
6104
6105 #undef newptrb
6106
6107 #endif /* NO_RECURSE */
6108
6109 /* These two are defined as macros in both cases (with or without NO_RECURSE) */
6110
6111 #undef fc
6112 #undef fi
6113
6114 /***************************************************************************
6115 ***************************************************************************/
6116
6117
6118
6119 /*************************************************
6120 * Execute a Regular Expression *
6121 *************************************************/
6122
6123 /* This function applies a compiled re to a subject string and picks out
6124 portions of the string if it matches. Two elements in the vector are set for
6125 each substring: the offsets to the start and end of the substring.
6126
6127 Arguments:
6128 argument_re points to the compiled expression
6129 extra_data points to extra data or is NULL
6130 subject points to the subject string
6131 length length of subject string (may contain binary zeros)
6132 start_offset where to start in the subject string
6133 options option bits
6134 offsets points to a vector of ints to be filled in with offsets
6135 offsetcount the number of elements in the vector
6136
6137 Returns: > 0 => success; value is the number of elements filled in
6138 = 0 => success, but offsets is not big enough
6139 -1 => failed to match
6140 < -1 => some kind of unexpected problem
6141 */
6142
6143 #ifdef COMPILE_PCRE8
6144 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6145 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6146 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6147 int offsetcount)
6148 #else
6149 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6150 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6151 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6152 int offsetcount)
6153 #endif
6154 {
6155 int rc, ocount, arg_offset_max;
6156 int newline;
6157 BOOL using_temporary_offsets = FALSE;
6158 BOOL anchored;
6159 BOOL startline;
6160 BOOL firstline;
6161 BOOL utf;
6162 BOOL has_first_char = FALSE;
6163 BOOL has_req_char = FALSE;
6164 pcre_uchar first_char = 0;
6165 pcre_uchar first_char2 = 0;
6166 pcre_uchar req_char = 0;
6167 pcre_uchar req_char2 = 0;
6168 match_data match_block;
6169 match_data *md = &match_block;
6170 const pcre_uint8 *tables;
6171 const pcre_uint8 *start_bits = NULL;
6172 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6173 PCRE_PUCHAR end_subject;
6174 PCRE_PUCHAR start_partial = NULL;
6175 PCRE_PUCHAR req_char_ptr = start_match - 1;
6176
6177 const pcre_study_data *study;
6178 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6179
6180 /* Plausibility checks */
6181
6182 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6183 if (re == NULL || subject == NULL ||
6184 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6185 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6186 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6187
6188 /* These two settings are used in the code for checking a UTF-8 string that
6189 follows immediately afterwards. Other values in the md block are used only
6190 during "normal" pcre_exec() processing, not when the JIT support is in use,
6191 so they are set up later. */
6192
6193 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6194 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6195 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6196 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6197
6198 /* Check a UTF-8 string if required. Pass back the character offset and error
6199 code for an invalid string if a results vector is available. */
6200
6201 #ifdef SUPPORT_UTF
6202 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6203 {
6204 int erroroffset;
6205 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6206 if (errorcode != 0)
6207 {
6208 if (offsetcount >= 2)
6209 {
6210 offsets[0] = erroroffset;
6211 offsets[1] = errorcode;
6212 }
6213 #ifdef COMPILE_PCRE16
6214 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6215 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6216 #else
6217 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6218 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6219 #endif
6220 }
6221
6222 /* Check that a start_offset points to the start of a UTF character. */
6223 if (start_offset > 0 && start_offset < length &&
6224 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6225 return PCRE_ERROR_BADUTF8_OFFSET;
6226 }
6227 #endif
6228
6229 /* If the pattern was successfully studied with JIT support, run the JIT
6230 executable instead of the rest of this function. Most options must be set at
6231 compile time for the JIT code to be usable. Fall back to the normal code path if
6232 an unsupported flag is set. In particular, JIT does not support partial
6233 matching. */
6234
6235 #ifdef SUPPORT_JIT
6236 if (extra_data != NULL
6237 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6238 && extra_data->executable_jit != NULL
6239 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6240 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6241 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6242 return PRIV(jit_exec)(re, extra_data->executable_jit,
6243 (const pcre_uchar *)subject, length, start_offset, options,
6244 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6245 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6246 #endif
6247
6248 /* Carry on with non-JIT matching. This information is for finding all the
6249 numbers associated with a given name, for condition testing. */
6250
6251 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6252 md->name_count = re->name_count;
6253 md->name_entry_size = re->name_entry_size;
6254
6255 /* Fish out the optional data from the extra_data structure, first setting
6256 the default values. */
6257
6258 study = NULL;
6259 md->match_limit = MATCH_LIMIT;
6260 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6261 md->callout_data = NULL;
6262
6263 /* The table pointer is always in native byte order. */
6264
6265 tables = re->tables;
6266
6267 if (extra_data != NULL)
6268 {
6269 register unsigned int flags = extra_data->flags;
6270 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6271 study = (const pcre_study_data *)extra_data->study_data;
6272 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6273 md->match_limit = extra_data->match_limit;
6274 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6275 md->match_limit_recursion = extra_data->match_limit_recursion;
6276 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6277 md->callout_data = extra_data->callout_data;
6278 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6279 }
6280
6281 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6282 is a feature that makes it possible to save compiled regex and re-use them
6283 in other programs later. */
6284
6285 if (tables == NULL) tables = PRIV(default_tables);
6286
6287 /* Check that the first field in the block is the magic number. If it is not,
6288 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6289 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6290 means that the pattern is likely compiled with different endianness. */
6291
6292 if (re->magic_number != MAGIC_NUMBER)
6293 return re->magic_number == REVERSED_MAGIC_NUMBER?
6294 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6295 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6296
6297 /* Set up other data */
6298
6299 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6300 startline = (re->flags & PCRE_STARTLINE) != 0;
6301 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6302
6303 /* The code starts after the real_pcre block and the capture name table. */
6304
6305 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6306 re->name_count * re->name_entry_size;
6307
6308 md->start_subject = (PCRE_PUCHAR)subject;
6309 md->start_offset = start_offset;
6310 md->end_subject = md->start_subject + length;
6311 end_subject = md->end_subject;
6312
6313 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6314 md->use_ucp = (re->options & PCRE_UCP) != 0;
6315 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6316 md->ignore_skip_arg = FALSE;
6317
6318 /* Some options are unpacked into BOOL variables in the hope that testing
6319 them will be faster than individual option bits. */
6320
6321 md->notbol = (options & PCRE_NOTBOL) != 0;
6322 md->noteol = (options & PCRE_NOTEOL) != 0;
6323 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6324 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6325
6326 md->hitend = FALSE;
6327 md->mark = md->nomatch_mark = NULL; /* In case never set */
6328
6329 md->recursive = NULL; /* No recursion at top level */
6330 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6331
6332 md->lcc = tables + lcc_offset;
6333 md->fcc = tables + fcc_offset;
6334 md->ctypes = tables + ctypes_offset;
6335
6336 /* Handle different \R options. */
6337
6338 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6339 {
6340 case 0:
6341 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6342 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6343 else
6344 #ifdef BSR_ANYCRLF
6345 md->bsr_anycrlf = TRUE;
6346 #else
6347 md->bsr_anycrlf = FALSE;
6348 #endif
6349 break;
6350
6351 case PCRE_BSR_ANYCRLF:
6352 md->bsr_anycrlf = TRUE;
6353 break;
6354
6355 case PCRE_BSR_UNICODE:
6356 md->bsr_anycrlf = FALSE;
6357 break;
6358
6359 default: return PCRE_ERROR_BADNEWLINE;
6360 }
6361
6362 /* Handle different types of newline. The three bits give eight cases. If
6363 nothing is set at run time, whatever was used at compile time applies. */
6364
6365 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6366 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6367 {
6368 case 0: newline = NEWLINE; break; /* Compile-time default */
6369 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6370 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6371 case PCRE_NEWLINE_CR+
6372 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6373 case PCRE_NEWLINE_ANY: newline = -1; break;
6374 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6375 default: return PCRE_ERROR_BADNEWLINE;
6376 }
6377
6378 if (newline == -2)
6379 {
6380 md->nltype = NLTYPE_ANYCRLF;
6381 }
6382 else if (newline < 0)
6383 {
6384 md->nltype = NLTYPE_ANY;
6385 }
6386 else
6387 {
6388 md->nltype = NLTYPE_FIXED;
6389 if (newline > 255)
6390 {
6391 md->nllen = 2;
6392 md->nl[0] = (newline >> 8) & 255;
6393 md->nl[1] = newline & 255;
6394 }
6395 else
6396 {
6397 md->nllen = 1;
6398 md->nl[0] = newline;
6399 }
6400 }
6401
6402 /* Partial matching was originally supported only for a restricted set of
6403 regexes; from release 8.00 there are no restrictions, but the bits are still
6404 defined (though never set). So there's no harm in leaving this code. */
6405
6406 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6407 return PCRE_ERROR_BADPARTIAL;
6408
6409 /* If the expression has got more back references than the offsets supplied can
6410 hold, we get a temporary chunk of working store to use during the matching.
6411 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6412 of 3. */
6413
6414 ocount = offsetcount - (offsetcount % 3);
6415 arg_offset_max = (2*ocount)/3;
6416
6417 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6418 {
6419 ocount = re->top_backref * 3 + 3;
6420 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6421 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6422 using_temporary_offsets = TRUE;
6423 DPRINTF(("Got memory to hold back references\n"));
6424 }
6425 else md->offset_vector = offsets;
6426
6427 md->offset_end = ocount;
6428 md->offset_max = (2*ocount)/3;
6429 md->offset_overflow = FALSE;
6430 md->capture_last = -1;
6431
6432 /* Reset the working variable associated with each extraction. These should
6433 never be used unless previously set, but they get saved and restored, and so we
6434 initialize them to avoid reading uninitialized locations. Also, unset the
6435 offsets for the matched string. This is really just for tidiness with callouts,
6436 in case they inspect these fields. */
6437
6438 if (md->offset_vector != NULL)
6439 {
6440 register int *iptr = md->offset_vector + ocount;
6441 register int *iend = iptr - re->top_bracket;
6442 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6443 while (--iptr >= iend) *iptr = -1;
6444 md->offset_vector[0] = md->offset_vector[1] = -1;
6445 }
6446
6447 /* Set up the first character to match, if available. The first_char value is
6448 never set for an anchored regular expression, but the anchoring may be forced
6449 at run time, so we have to test for anchoring. The first char may be unset for
6450 an unanchored pattern, of course. If there's no first char and the pattern was
6451 studied, there may be a bitmap of possible first characters. */
6452
6453 if (!anchored)
6454 {
6455 if ((re->flags & PCRE_FIRSTSET) != 0)
6456 {
6457 has_first_char = TRUE;
6458 first_char = first_char2 = re->first_char;
6459 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6460 {
6461 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6462 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6463 if (utf && first_char > 127)
6464 first_char2 = UCD_OTHERCASE(first_char);
6465 #endif
6466 }
6467 }
6468 else
6469 if (!startline && study != NULL &&
6470 (study->flags & PCRE_STUDY_MAPPED) != 0)
6471 start_bits = study->start_bits;
6472 }
6473
6474 /* For anchored or unanchored matches, there may be a "last known required
6475 character" set. */
6476
6477 if ((re->flags & PCRE_REQCHSET) != 0)
6478 {
6479 has_req_char = TRUE;
6480 req_char = req_char2 = re->req_char;
6481 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6482 {
6483 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6484 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6485 if (utf && req_char > 127)
6486 req_char2 = UCD_OTHERCASE(req_char);
6487 #endif
6488 }
6489 }
6490
6491
6492 /* ==========================================================================*/
6493
6494 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6495 the loop runs just once. */
6496
6497 for(;;)
6498 {
6499 PCRE_PUCHAR save_end_subject = end_subject;
6500 PCRE_PUCHAR new_start_match;
6501
6502 /* If firstline is TRUE, the start of the match is constrained to the first
6503 line of a multiline string. That is, the match must be before or at the first
6504 newline. Implement this by temporarily adjusting end_subject so that we stop
6505 scanning at a newline. If the match fails at the newline, later code breaks
6506 this loop. */
6507
6508 if (firstline)
6509 {
6510 PCRE_PUCHAR t = start_match;
6511 #ifdef SUPPORT_UTF
6512 if (utf)
6513 {
6514 while (t < md->end_subject && !IS_NEWLINE(t))
6515 {
6516 t++;
6517 ACROSSCHAR(t < end_subject, *t, t++);
6518 }
6519 }
6520 else
6521 #endif
6522 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6523 end_subject = t;
6524 }
6525
6526 /* There are some optimizations that avoid running the match if a known
6527 starting point is not found, or if a known later character is not present.
6528 However, there is an option that disables these, for testing and for ensuring
6529 that all callouts do actually occur. The option can be set in the regex by
6530 (*NO_START_OPT) or passed in match-time options. */
6531
6532 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6533 {
6534 /* Advance to a unique first char if there is one. */
6535
6536 if (has_first_char)
6537 {
6538 if (first_char != first_char2)
6539 while (start_match < end_subject &&
6540 *start_match != first_char && *start_match != first_char2)
6541 start_match++;
6542 else
6543 while (start_match < end_subject && *start_match != first_char)
6544 start_match++;
6545 }
6546
6547 /* Or to just after a linebreak for a multiline match */
6548
6549 else if (startline)
6550 {
6551 if (start_match > md->start_subject + start_offset)
6552 {
6553 #ifdef SUPPORT_UTF
6554 if (utf)
6555 {
6556 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6557 {
6558 start_match++;
6559 ACROSSCHAR(start_match < end_subject, *start_match,
6560 start_match++);
6561 }
6562 }
6563 else
6564 #endif
6565 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6566 start_match++;
6567
6568 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6569 and we are now at a LF, advance the match position by one more character.
6570 */
6571
6572 if (start_match[-1] == CHAR_CR &&
6573 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6574 start_match < end_subject &&
6575 *start_match == CHAR_NL)
6576 start_match++;
6577 }
6578 }
6579
6580 /* Or to a non-unique first byte after study */
6581
6582 else if (start_bits != NULL)
6583 {
6584 while (start_match < end_subject)
6585 {
6586 register unsigned int c = *start_match;
6587 #ifndef COMPILE_PCRE8
6588 if (c > 255) c = 255;
6589 #endif
6590 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6591 {
6592 start_match++;
6593 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6594 /* In non 8-bit mode, the iteration will stop for
6595 characters > 255 at the beginning or not stop at all. */
6596 if (utf)
6597 ACROSSCHAR(start_match < end_subject, *start_match,
6598 start_match++);
6599 #endif
6600 }
6601 else break;
6602 }
6603 }
6604 } /* Starting optimizations */
6605
6606 /* Restore fudged end_subject */
6607
6608 end_subject = save_end_subject;
6609
6610 /* The following two optimizations are disabled for partial matching or if
6611 disabling is explicitly requested. */
6612
6613 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6614 {
6615 /* If the pattern was studied, a minimum subject length may be set. This is
6616 a lower bound; there may be no string of that length that actually matches the
6617 pattern. Although the value is, strictly, in characters, we treat it as
6618 bytes to avoid spending too much time in this optimization. */
6619
6620 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6621 (pcre_uint32)(end_subject - start_match) < study->minlength)
6622 {
6623 rc = MATCH_NOMATCH;
6624 break;
6625 }
6626
6627 /* If req_char is set, we know that that character must appear in the
6628 subject for the match to succeed. If the first character is set, req_char
6629 must be later in the subject; otherwise the test starts at the match point.
6630 This optimization can save a huge amount of backtracking in patterns with
6631 nested unlimited repeats that aren't going to match. Writing separate code
6632 for cased/caseless versions makes it go faster, as does using an
6633 autoincrement and backing off on a match.
6634
6635 HOWEVER: when the subject string is very, very long, searching to its end
6636 can take a long time, and give bad performance on quite ordinary patterns.
6637 This showed up when somebody was matching something like /^\d+C/ on a
6638 32-megabyte string... so we don't do this when the string is sufficiently
6639 long. */
6640
6641 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6642 {
6643 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6644
6645 /* We don't need to repeat the search if we haven't yet reached the
6646 place we found it at last time. */
6647
6648 if (p > req_char_ptr)
6649 {
6650 if (req_char != req_char2)
6651 {
6652 while (p < end_subject)
6653 {
6654 register int pp = *p++;
6655 if (pp == req_char || pp == req_char2) { p--; break; }
6656 }
6657 }
6658 else
6659 {
6660 while (p < end_subject)
6661 {
6662 if (*p++ == req_char) { p--; break; }
6663 }
6664 }
6665
6666 /* If we can't find the required character, break the matching loop,
6667 forcing a match failure. */
6668
6669 if (p >= end_subject)
6670 {
6671 rc = MATCH_NOMATCH;
6672 break;
6673 }
6674
6675 /* If we have found the required character, save the point where we
6676 found it, so that we don't search again next time round the loop if
6677 the start hasn't passed this character yet. */
6678
6679 req_char_ptr = p;
6680 }
6681 }
6682 }
6683
6684 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6685 printf(">>>> Match against: ");
6686 pchars(start_match, end_subject - start_match, TRUE, md);
6687 printf("\n");
6688 #endif
6689
6690 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6691 first starting point for which a partial match was found. */
6692
6693 md->start_match_ptr = start_match;
6694 md->start_used_ptr = start_match;
6695 md->match_call_count = 0;
6696 md->match_function_type = 0;
6697 md->end_offset_top = 0;
6698 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6699 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6700
6701 switch(rc)
6702 {
6703 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6704