/[pcre]/code/branches/pcre16/pcre_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 801 - (show annotations)
Mon Dec 12 16:23:37 2011 UTC (8 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 205361 byte(s)
Error occurred while calculating annotation data.
Merge changes from trunk r755 to r800 into the 16-bit branch.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. The handlers for the backtracking
verbs in match() return the code of the same name (OP_COMMIT returns
MATCH_COMMIT, OP_PRUNE returns MATCH_PRUNE, and so on) once the rest of the
pattern has failed to match. */

#define MATCH_ACCEPT       (-999)  /* Treated like MATCH_MATCH when setting md->mark */
#define MATCH_COMMIT       (-998)  /* Overrides PRUNE, SKIP, and THEN */
#define MATCH_KETRPOS      (-997)
#define MATCH_ONCE         (-996)  /* Backing up through an atomic group */
#define MATCH_PRUNE        (-995)  /* Overrides THEN */
#define MATCH_SKIP         (-994)  /* Overrides PRUNE and THEN; position is in md->start_match_ptr */
#define MATCH_SKIP_ARG     (-993)  /* SKIP with a name; name is in md->start_match_ptr */
#define MATCH_THEN         (-992)  /* Address of the THEN opcode is in md->start_match_ptr */

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* Min and max values for the common repeats; for the maxima, 0 => infinity.
NOTE(review): the two tables are presumably indexed by the same repeat-opcode
offset, in the order *, *?, +, +?, ?, ?? - confirm against the users further
down in match(). */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
208 }
209 }
210
211 /* In the caseful case, we can just compare the bytes, whether or not we
212 are in UTF-8 mode. */
213
214 else
215 {
216 if (eptr + length > md->end_subject) return -1;
217 while (length-- > 0) if (*p++ != *eptr++) return -1;
218 }
219
220 return (int)(eptr - eptr_start);
221 }
222
223
224
225 /***************************************************************************
226 ****************************************************************************
227 RECURSION IN THE match() FUNCTION
228
229 The match() function is highly recursive, though not every recursive call
230 increases the recursive depth. Nevertheless, some regular expressions can cause
231 it to recurse to a great depth. I was writing for Unix, so I just let it call
232 itself recursively. This uses the stack for saving everything that has to be
233 saved for a recursive call. On Unix, the stack can be large, and this works
234 fine.
235
236 It turns out that on some non-Unix-like systems there are problems with
237 programs that use a lot of stack. (This despite the fact that every last chip
238 has oodles of memory these days, and techniques for extending the stack have
239 been known for decades.) So....
240
241 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
242 calls by keeping local variables that need to be preserved in blocks of memory
243 obtained from malloc() instead of on the stack. Macros are used to
244 achieve this so that the actual code doesn't look very different to what it
245 always used to.
246
247 The original heap-recursive code used longjmp(). However, it seems that this
248 can be very slow on some operating systems. Following a suggestion from Stan
249 Switzer, the use of longjmp() has been abolished, at the cost of having to
250 provide a unique number for each call to RMATCH. There is no way of generating
251 a sequence of numbers at compile time in C. I have given them names, to make
252 them stand out more clearly.
253
254 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
255 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
256 tests. Furthermore, not using longjmp() means that local dynamic variables
257 don't have indeterminate values; this has meant that the frame size can be
258 reduced because the result can be "passed back" by straight setting of the
259 variable instead of being passed in the frame.
260 ****************************************************************************
261 ***************************************************************************/
262
263 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
264 below must be updated in sync. */
265
/* One distinct, non-zero identifier per RMATCH call site; the heap version of
RMATCH stores it in the frame and pastes it into a label name (L_RMnn). */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
       RM61,  RM62, RM63, RM64, RM65, RM66 };
273
274 /* These versions of the macros use the stack, as normal. There are debugging
275 versions and production versions. Note that the "rw" argument of RMATCH isn't
276 actually used in this definition. */
277
278 #ifndef NO_RECURSE
/* True-recursion versions. REGISTER expands to the "register" keyword for the
parameters of match(). RMATCH calls match() one level deeper and leaves the
result in rrc; it relies on mstart, rdepth and rrc being in scope at the call
site. RRETURN is a plain return. The debugging variants additionally trace the
call and the return on stdout; note that the debugging RRETURN evaluates its
argument twice. */

#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
#define RRETURN(ra) return ra
#endif
298
299 #else
300
301
302 /* These versions of the macros manage a private stack on the heap. Note that
303 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
304 argument of match(), which never changes. */
305
/* Heap-frame versions. REGISTER expands to nothing here.

RMATCH obtains a new frame with pcre_stack_malloc, records the resumption
point (rw) in the current frame, fills the new frame with the arguments of the
simulated call, links it to the current frame, makes it current and jumps to
HEAP_RECURSE. The label L_<rw> that it defines is where execution continues
after the simulated call. If the allocation fails it leaves via
RRETURN(PCRE_ERROR_NOMEMORY). */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
326
/* Heap-frame version of RRETURN: release the current frame and make its
predecessor current. If a predecessor exists, pass the result back in rrc and
jump to HEAP_RETURN; otherwise this was the top-level frame and the result is
returned from match() itself.

The current frame may be NULL: match() invokes RRETURN(PCRE_ERROR_NOMEMORY)
when the allocation of its very first frame fails. In that case there is
nothing to unlink or free, and the error is simply returned. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
339
340
341 /* Structure for remembering the local variables in a private frame */
342
343 typedef struct heapframe {
344 struct heapframe *Xprevframe;
345
346 /* Function arguments that may change */
347
348 PCRE_PUCHAR Xeptr;
349 const pcre_uchar *Xecode;
350 PCRE_PUCHAR Xmstart;
351 int Xoffset_top;
352 eptrblock *Xeptrb;
353 unsigned int Xrdepth;
354
355 /* Function local variables */
356
357 PCRE_PUCHAR Xcallpat;
358 #ifdef SUPPORT_UTF
359 PCRE_PUCHAR Xcharptr;
360 #endif
361 PCRE_PUCHAR Xdata;
362 PCRE_PUCHAR Xnext;
363 PCRE_PUCHAR Xpp;
364 PCRE_PUCHAR Xprev;
365 PCRE_PUCHAR Xsaved_eptr;
366
367 recursion_info Xnew_recursive;
368
369 BOOL Xcur_is_word;
370 BOOL Xcondition;
371 BOOL Xprev_is_word;
372
373 #ifdef SUPPORT_UCP
374 int Xprop_type;
375 int Xprop_value;
376 int Xprop_fail_result;
377 int Xoclength;
378 pcre_uchar Xocchars[6];
379 #endif
380
381 int Xcodelink;
382 int Xctype;
383 unsigned int Xfc;
384 int Xfi;
385 int Xlength;
386 int Xmax;
387 int Xmin;
388 int Xnumber;
389 int Xoffset;
390 int Xop;
391 int Xsave_capture_last;
392 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
393 int Xstacksave[REC_STACK_SAVE_MAX];
394
395 eptrblock Xnewptrb;
396
397 /* Where to jump back to */
398
399 int Xwhere;
400
401 } heapframe;
402
403 #endif
404
405
406 /***************************************************************************
407 ***************************************************************************/
408
409
410
411 /*************************************************
412 * Match from current position *
413 *************************************************/
414
415 /* This function is called recursively in many circumstances. Whenever it
416 returns a negative (error) response, the outer incarnation must also return the
417 same response. */
418
419 /* These macros pack up tests that are used for partial matching, and which
420 appear several times in the code. We set the "hit end" flag if the pointer is
421 at the end of the subject and also past the start of the subject (i.e.
422 something has been matched). For hard partial matching, we then return
423 immediately. The second one is used when we already know we are past the end of
424 the subject. */
425
/* CHECK_PARTIAL: when partial matching is enabled and eptr is at or beyond the
end of the subject and also beyond md->start_used_ptr, record that the end was
hit; for md->partial > 1 leave match() at once with PCRE_ERROR_PARTIAL.

SCHECK_PARTIAL: the same without the end-of-subject test, for call sites that
have already established it.

Both expand to a bare "if" statement, so they must not be used as the
unbraced body of an if that has an else. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
440
441
442 /* Performance note: It might be tempting to extract commonly used fields from
443 the md structure (e.g. utf, end_subject) into individual variables to improve
444 performance. Tests using gcc on a SPARC disproved this; in the first case, it
445 made performance worse.
446
447 Arguments:
448 eptr pointer to current character in subject
449 ecode pointer to current position in compiled code
450 mstart pointer to the current match start position (can be modified
451 by encountering \K)
452 offset_top current top pointer
453 md pointer to "static" info for the match
454 eptrb pointer to chain of blocks containing eptr at start of
455 brackets - for testing for empty matches
456 rdepth the recursion depth
457
458 Returns: MATCH_MATCH if matched ) these values are >= 0
459 MATCH_NOMATCH if failed to match )
460 a negative MATCH_xxx value for PRUNE, SKIP, etc
461 a negative PCRE_ERROR_xxx value if aborted by an error condition
462 (e.g. stopped by repeated call or recursion limit)
463 */
464
465 static int
466 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
467 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
468 unsigned int rdepth)
469 {
470 /* These variables do not need to be preserved over recursion in this function,
471 so they can be ordinary variables in all cases. Mark some of them with
472 "register" because they are used a lot in loops. */
473
474 register int rrc; /* Returns from recursive calls */
475 register int i; /* Used for loops not involving calls to RMATCH() */
476 register unsigned int c; /* Character values not kept over RMATCH() calls */
477 register BOOL utf; /* Local copy of UTF flag for speed */
478
479 BOOL minimize, possessive; /* Quantifier options */
480 BOOL caseless;
481 int condcode;
482
483 /* When recursion is not being used, all "local" variables that have to be
484 preserved over calls to RMATCH() are part of a "frame" which is obtained from
485 heap storage. Set up the top-level frame here; others are obtained from the
486 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
487
488 #ifdef NO_RECURSE
489 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
490 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
491 frame->Xprevframe = NULL; /* Marks the top level */
492
493 /* Copy in the original argument variables */
494
495 frame->Xeptr = eptr;
496 frame->Xecode = ecode;
497 frame->Xmstart = mstart;
498 frame->Xoffset_top = offset_top;
499 frame->Xeptrb = eptrb;
500 frame->Xrdepth = rdepth;
501
502 /* This is where control jumps back to to effect "recursion" */
503
504 HEAP_RECURSE:
505
506 /* Macros make the argument variables come from the current frame */
507
508 #define eptr frame->Xeptr
509 #define ecode frame->Xecode
510 #define mstart frame->Xmstart
511 #define offset_top frame->Xoffset_top
512 #define eptrb frame->Xeptrb
513 #define rdepth frame->Xrdepth
514
515 /* Ditto for the local variables */
516
517 #ifdef SUPPORT_UTF
518 #define charptr frame->Xcharptr
519 #endif
520 #define callpat frame->Xcallpat
521 #define codelink frame->Xcodelink
522 #define data frame->Xdata
523 #define next frame->Xnext
524 #define pp frame->Xpp
525 #define prev frame->Xprev
526 #define saved_eptr frame->Xsaved_eptr
527
528 #define new_recursive frame->Xnew_recursive
529
530 #define cur_is_word frame->Xcur_is_word
531 #define condition frame->Xcondition
532 #define prev_is_word frame->Xprev_is_word
533
534 #ifdef SUPPORT_UCP
535 #define prop_type frame->Xprop_type
536 #define prop_value frame->Xprop_value
537 #define prop_fail_result frame->Xprop_fail_result
538 #define oclength frame->Xoclength
539 #define occhars frame->Xocchars
540 #endif
541
542 #define ctype frame->Xctype
543 #define fc frame->Xfc
544 #define fi frame->Xfi
545 #define length frame->Xlength
546 #define max frame->Xmax
547 #define min frame->Xmin
548 #define number frame->Xnumber
549 #define offset frame->Xoffset
550 #define op frame->Xop
551 #define save_capture_last frame->Xsave_capture_last
552 #define save_offset1 frame->Xsave_offset1
553 #define save_offset2 frame->Xsave_offset2
554 #define save_offset3 frame->Xsave_offset3
555 #define stacksave frame->Xstacksave
556
557 #define newptrb frame->Xnewptrb
558
559 /* When recursion is being used, local variables are allocated on the stack and
560 get preserved during recursion in the normal way. In this environment, fi and
561 i, and fc and c, can be the same variables. */
562
563 #else /* NO_RECURSE not defined */
564 #define fi i
565 #define fc c
566
567 /* Many of the following variables are used only in small blocks of the code.
568 My normal style of coding would have declared them within each of those blocks.
569 However, in order to accommodate the version of this code that uses an external
570 "stack" implemented on the heap, it is easier to declare them all here, so the
571 declarations can be cut out in a block. The only declarations within blocks
572 below are for variables that do not have to be preserved over a recursive call
573 to RMATCH(). */
574
575 #ifdef SUPPORT_UTF
576 const pcre_uchar *charptr;
577 #endif
578 const pcre_uchar *callpat;
579 const pcre_uchar *data;
580 const pcre_uchar *next;
581 PCRE_PUCHAR pp;
582 const pcre_uchar *prev;
583 PCRE_PUCHAR saved_eptr;
584
585 recursion_info new_recursive;
586
587 BOOL cur_is_word;
588 BOOL condition;
589 BOOL prev_is_word;
590
591 #ifdef SUPPORT_UCP
592 int prop_type;
593 int prop_value;
594 int prop_fail_result;
595 int oclength;
596 pcre_uchar occhars[6];
597 #endif
598
599 int codelink;
600 int ctype;
601 int length;
602 int max;
603 int min;
604 int number;
605 int offset;
606 int op;
607 int save_capture_last;
608 int save_offset1, save_offset2, save_offset3;
609 int stacksave[REC_STACK_SAVE_MAX];
610
611 eptrblock newptrb;
612 #endif /* NO_RECURSE */
613
614 /* To save space on the stack and in the heap frame, I have doubled up on some
615 of the local variables that are used only in localised parts of the code, but
616 still need to be preserved over recursive calls of match(). These macros define
617 the alternative names that are used. */
618
619 #define allow_zero cur_is_word
620 #define cbegroup condition
621 #define code_offset codelink
622 #define condassert condition
623 #define matched_once prev_is_word
624 #define foc number
625
626 /* These statements are here to stop the compiler complaining about uninitialized
627 variables. */
628
629 #ifdef SUPPORT_UCP
630 prop_value = 0;
631 prop_fail_result = 0;
632 #endif
633
634
635 /* This label is used for tail recursion, which is used in a few cases even
636 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
637 used. Thanks to Ian Taylor for noticing this possibility and sending the
638 original patch. */
639
640 TAIL_RECURSE:
641
642 /* OK, now we can get on with the real code of the function. Recursive calls
643 are specified by the macro RMATCH and RRETURN is used to return. When
644 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
645 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
646 defined). However, RMATCH isn't like a function call because it's quite a
647 complicated macro. It has to be used in one particular way. This shouldn't,
648 however, impact performance when true recursion is being used. */
649
650 #ifdef SUPPORT_UTF
651 utf = md->utf; /* Local copy of the flag */
652 #else
653 utf = FALSE;
654 #endif
655
656 /* First check that we haven't called match() too many times, or that we
657 haven't exceeded the recursive call limit. */
658
659 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
660 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
661
662 /* At the start of a group with an unlimited repeat that may match an empty
663 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
664 done this way to save having to use another function argument, which would take
665 up space on the stack. See also MATCH_CONDASSERT below.
666
667 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
668 such remembered pointers, to be checked when we hit the closing ket, in order
669 to break infinite loops that match no characters. When match() is called in
670 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
671 NOT be used with tail recursion, because the memory block that is used is on
672 the stack, so a new one may be required for each match(). */
673
674 if (md->match_function_type == MATCH_CBEGROUP)
675 {
676 newptrb.epb_saved_eptr = eptr;
677 newptrb.epb_prev = eptrb;
678 eptrb = &newptrb;
679 md->match_function_type = 0;
680 }
681
682 /* Now start processing the opcodes. */
683
684 for (;;)
685 {
686 minimize = possessive = FALSE;
687 op = *ecode;
688
689 switch(op)
690 {
691 case OP_MARK:
692 md->nomatch_mark = ecode + 2;
693 md->mark = NULL; /* In case previously set by assertion */
694 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
695 eptrb, RM55);
696 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
697 md->mark == NULL) md->mark = ecode + 2;
698
699 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
700 argument, and we must check whether that argument matches this MARK's
701 argument. It is passed back in md->start_match_ptr (an overloading of that
702 variable). If it does match, we reset that variable to the current subject
703 position and return MATCH_SKIP. Otherwise, pass back the return code
704 unaltered. */
705
706 else if (rrc == MATCH_SKIP_ARG &&
707 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
708 {
709 md->start_match_ptr = eptr;
710 RRETURN(MATCH_SKIP);
711 }
712 RRETURN(rrc);
713
714 case OP_FAIL:
715 RRETURN(MATCH_NOMATCH);
716
717 /* COMMIT overrides PRUNE, SKIP, and THEN */
718
719 case OP_COMMIT:
720 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
721 eptrb, RM52);
722 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
723 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
724 rrc != MATCH_THEN)
725 RRETURN(rrc);
726 RRETURN(MATCH_COMMIT);
727
728 /* PRUNE overrides THEN */
729
730 case OP_PRUNE:
731 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
732 eptrb, RM51);
733 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
734 RRETURN(MATCH_PRUNE);
735
736 case OP_PRUNE_ARG:
737 md->nomatch_mark = ecode + 2;
738 md->mark = NULL; /* In case previously set by assertion */
739 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
740 eptrb, RM56);
741 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
742 md->mark == NULL) md->mark = ecode + 2;
743 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
744 RRETURN(MATCH_PRUNE);
745
746 /* SKIP overrides PRUNE and THEN */
747
748 case OP_SKIP:
749 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
750 eptrb, RM53);
751 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
752 RRETURN(rrc);
753 md->start_match_ptr = eptr; /* Pass back current position */
754 RRETURN(MATCH_SKIP);
755
756 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
757 nomatch_mark. There is a flag that disables this opcode when re-matching a
758 pattern that ended with a SKIP for which there was not a matching MARK. */
759
760 case OP_SKIP_ARG:
761 if (md->ignore_skip_arg)
762 {
763 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
764 break;
765 }
766 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
767 eptrb, RM57);
768 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
769 RRETURN(rrc);
770
771 /* Pass back the current skip name by overloading md->start_match_ptr and
772 returning the special MATCH_SKIP_ARG return code. This will either be
773 caught by a matching MARK, or get to the top, where it causes a rematch
774 with the md->ignore_skip_arg flag set. */
775
776 md->start_match_ptr = ecode + 2;
777 RRETURN(MATCH_SKIP_ARG);
778
779 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
780 the branch in which it occurs can be determined. Overload the start of
781 match pointer to do this. */
782
783 case OP_THEN:
784 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
785 eptrb, RM54);
786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
787 md->start_match_ptr = ecode;
788 RRETURN(MATCH_THEN);
789
790 case OP_THEN_ARG:
791 md->nomatch_mark = ecode + 2;
792 md->mark = NULL; /* In case previously set by assertion */
793 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
794 md, eptrb, RM58);
795 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
796 md->mark == NULL) md->mark = ecode + 2;
797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
798 md->start_match_ptr = ecode;
799 RRETURN(MATCH_THEN);
800
801 /* Handle an atomic group that does not contain any capturing parentheses.
802 This can be handled like an assertion. Prior to 8.13, all atomic groups
803 were handled this way. In 8.13, the code was changed as below for ONCE, so
804 that backups pass through the group and thereby reset captured values.
805 However, this uses a lot more stack, so in 8.20, atomic groups that do not
806 contain any captures generate OP_ONCE_NC, which can be handled in the old,
807 less stack intensive way.
808
809 Check the alternative branches in turn - the matching won't pass the KET
810 for this kind of subpattern. If any one branch matches, we carry on as at
811 the end of a normal bracket, leaving the subject pointer, but resetting
812 the start-of-match value in case it was changed by \K. */
813
814 case OP_ONCE_NC:
815 prev = ecode;
816 saved_eptr = eptr;
817 do
818 {
819 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
820 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
821 {
822 mstart = md->start_match_ptr;
823 break;
824 }
825 if (rrc == MATCH_THEN)
826 {
827 next = ecode + GET(ecode,1);
828 if (md->start_match_ptr < next &&
829 (*ecode == OP_ALT || *next == OP_ALT))
830 rrc = MATCH_NOMATCH;
831 }
832
833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
834 ecode += GET(ecode,1);
835 }
836 while (*ecode == OP_ALT);
837
838 /* If hit the end of the group (which could be repeated), fail */
839
840 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
841
842 /* Continue as from after the group, updating the offsets high water
843 mark, since extracts may have been taken. */
844
845 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
846
847 offset_top = md->end_offset_top;
848 eptr = md->end_match_ptr;
849
850 /* For a non-repeating ket, just continue at this level. This also
851 happens for a repeating ket if no characters were matched in the group.
852 This is the forcible breaking of infinite loops as implemented in Perl
853 5.005. */
854
855 if (*ecode == OP_KET || eptr == saved_eptr)
856 {
857 ecode += 1+LINK_SIZE;
858 break;
859 }
860
861 /* The repeating kets try the rest of the pattern or restart from the
862 preceding bracket, in the appropriate order. The second "call" of match()
863 uses tail recursion, to avoid using another stack frame. */
864
865 if (*ecode == OP_KETRMIN)
866 {
867 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
869 ecode = prev;
870 goto TAIL_RECURSE;
871 }
872 else /* OP_KETRMAX */
873 {
874 md->match_function_type = MATCH_CBEGROUP;
875 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
877 ecode += 1 + LINK_SIZE;
878 goto TAIL_RECURSE;
879 }
880 /* Control never gets here */
881
882 /* Handle a capturing bracket, other than those that are possessive with an
883 unlimited repeat. If there is space in the offset vector, save the current
884 subject position in the working slot at the top of the vector. We mustn't
885 change the current values of the data slot, because they may be set from a
886 previous iteration of this group, and be referred to by a reference inside
887 the group. A failure to match might occur after the group has succeeded,
888 if something later on doesn't match. For this reason, we need to restore
889 the working value and also the values of the final offsets, in case they
890 were set by a previous iteration of the same bracket.
891
892 If there isn't enough space in the offset vector, treat this as if it were
893 a non-capturing bracket. Don't worry about setting the flag for the error
894 case here; that is handled in the code for KET. */
895
896 case OP_CBRA:
897 case OP_SCBRA:
898 number = GET2(ecode, 1+LINK_SIZE);
899 offset = number << 1;
900
901 #ifdef PCRE_DEBUG
902 printf("start bracket %d\n", number);
903 printf("subject=");
904 pchars(eptr, 16, TRUE, md);
905 printf("\n");
906 #endif
907
908 if (offset < md->offset_max)
909 {
910 save_offset1 = md->offset_vector[offset];
911 save_offset2 = md->offset_vector[offset+1];
912 save_offset3 = md->offset_vector[md->offset_end - number];
913 save_capture_last = md->capture_last;
914
915 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
916 md->offset_vector[md->offset_end - number] =
917 (int)(eptr - md->start_subject);
918
919 for (;;)
920 {
921 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
922 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
923 eptrb, RM1);
924 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
925
926 /* If we backed up to a THEN, check whether it is within the current
927 branch by comparing the address of the THEN that is passed back with
928 the end of the branch. If it is within the current branch, and the
929 branch is one of two or more alternatives (it either starts or ends
930 with OP_ALT), we have reached the limit of THEN's action, so convert
931 the return code to NOMATCH, which will cause normal backtracking to
932 happen from now on. Otherwise, THEN is passed back to an outer
933 alternative. This implements Perl's treatment of parenthesized groups,
934 where a group not containing | does not affect the current alternative,
935 that is, (X) is NOT the same as (X|(*F)). */
936
937 if (rrc == MATCH_THEN)
938 {
939 next = ecode + GET(ecode,1);
940 if (md->start_match_ptr < next &&
941 (*ecode == OP_ALT || *next == OP_ALT))
942 rrc = MATCH_NOMATCH;
943 }
944
945 /* Anything other than NOMATCH is passed back. */
946
947 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
948 md->capture_last = save_capture_last;
949 ecode += GET(ecode, 1);
950 if (*ecode != OP_ALT) break;
951 }
952
953 DPRINTF(("bracket %d failed\n", number));
954 md->offset_vector[offset] = save_offset1;
955 md->offset_vector[offset+1] = save_offset2;
956 md->offset_vector[md->offset_end - number] = save_offset3;
957
958 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
959
960 RRETURN(rrc);
961 }
962
963 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
964 as a non-capturing bracket. */
965
966 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
967 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
968
969 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
970
971 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
973
974 /* Non-capturing or atomic group, except for possessive with unlimited
975 repeat and ONCE group with no captures. Loop for all the alternatives.
976
977 When we get to the final alternative within the brackets, we used to return
978 the result of a recursive call to match() whatever happened so it was
979 possible to reduce stack usage by turning this into a tail recursion,
980 except in the case of a possibly empty group. However, now that there is
981 the possibility of (*THEN) occurring in the final alternative, this
982 optimization is no longer always possible.
983
984 We can optimize if we know there are no (*THEN)s in the pattern; at present
985 this is the best that can be done.
986
987 MATCH_ONCE is returned when the end of an atomic group is successfully
988 reached, but subsequent matching fails. It passes back up the tree (causing
989 captured values to be reset) until the original atomic group level is
990 reached. This is tested by comparing md->once_target with the start of the
991 group. At this point, the return is converted into MATCH_NOMATCH so that
992 previous backup points can be taken. */
993
994 case OP_ONCE:
995 case OP_BRA:
996 case OP_SBRA:
997 DPRINTF(("start non-capturing bracket\n"));
998
999 for (;;)
1000 {
1001 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1002
1003 /* If this is not a possibly empty group, and there are no (*THEN)s in
1004 the pattern, and this is the final alternative, optimize as described
1005 above. */
1006
1007 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1008 {
1009 ecode += PRIV(OP_lengths)[*ecode];
1010 goto TAIL_RECURSE;
1011 }
1012
1013 /* In all other cases, we have to make another call to match(). */
1014
1015 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1016 RM2);
1017
1018 /* See comment in the code for capturing groups above about handling
1019 THEN. */
1020
1021 if (rrc == MATCH_THEN)
1022 {
1023 next = ecode + GET(ecode,1);
1024 if (md->start_match_ptr < next &&
1025 (*ecode == OP_ALT || *next == OP_ALT))
1026 rrc = MATCH_NOMATCH;
1027 }
1028
1029 if (rrc != MATCH_NOMATCH)
1030 {
1031 if (rrc == MATCH_ONCE)
1032 {
1033 const pcre_uchar *scode = ecode;
1034 if (*scode != OP_ONCE) /* If not at start, find it */
1035 {
1036 while (*scode == OP_ALT) scode += GET(scode, 1);
1037 scode -= GET(scode, 1);
1038 }
1039 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1040 }
1041 RRETURN(rrc);
1042 }
1043 ecode += GET(ecode, 1);
1044 if (*ecode != OP_ALT) break;
1045 }
1046
1047 RRETURN(MATCH_NOMATCH);
1048
1049 /* Handle possessive capturing brackets with an unlimited repeat. We come
1050 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1051 handled similarly to the normal case above. However, the matching is
1052 different. The end of these brackets will always be OP_KETRPOS, which
1053 returns MATCH_KETRPOS without going further in the pattern. By this means
1054 we can handle the group by iteration rather than recursion, thereby
1055 reducing the amount of stack needed. */
1056
1057 case OP_CBRAPOS:
1058 case OP_SCBRAPOS:
1059 allow_zero = FALSE;
1060
1061 POSSESSIVE_CAPTURE:
1062 number = GET2(ecode, 1+LINK_SIZE);
1063 offset = number << 1;
1064
1065 #ifdef PCRE_DEBUG
1066 printf("start possessive bracket %d\n", number);
1067 printf("subject=");
1068 pchars(eptr, 16, TRUE, md);
1069 printf("\n");
1070 #endif
1071
1072 if (offset < md->offset_max)
1073 {
1074 matched_once = FALSE;
1075 code_offset = (int)(ecode - md->start_code);
1076
1077 save_offset1 = md->offset_vector[offset];
1078 save_offset2 = md->offset_vector[offset+1];
1079 save_offset3 = md->offset_vector[md->offset_end - number];
1080 save_capture_last = md->capture_last;
1081
1082 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1083
1084 /* Each time round the loop, save the current subject position for use
1085 when the group matches. For MATCH_MATCH, the group has matched, so we
1086 restart it with a new subject starting position, remembering that we had
1087 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1088 usual. If we haven't matched any alternatives in any iteration, check to
1089 see if a previous iteration matched. If so, the group has matched;
1090 continue from afterwards. Otherwise it has failed; restore the previous
1091 capture values before returning NOMATCH. */
1092
1093 for (;;)
1094 {
1095 md->offset_vector[md->offset_end - number] =
1096 (int)(eptr - md->start_subject);
1097 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1098 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1099 eptrb, RM63);
1100 if (rrc == MATCH_KETRPOS)
1101 {
1102 offset_top = md->end_offset_top;
1103 eptr = md->end_match_ptr;
1104 ecode = md->start_code + code_offset;
1105 save_capture_last = md->capture_last;
1106 matched_once = TRUE;
1107 continue;
1108 }
1109
1110 /* See comment in the code for capturing groups above about handling
1111 THEN. */
1112
1113 if (rrc == MATCH_THEN)
1114 {
1115 next = ecode + GET(ecode,1);
1116 if (md->start_match_ptr < next &&
1117 (*ecode == OP_ALT || *next == OP_ALT))
1118 rrc = MATCH_NOMATCH;
1119 }
1120
1121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122 md->capture_last = save_capture_last;
1123 ecode += GET(ecode, 1);
1124 if (*ecode != OP_ALT) break;
1125 }
1126
1127 if (!matched_once)
1128 {
1129 md->offset_vector[offset] = save_offset1;
1130 md->offset_vector[offset+1] = save_offset2;
1131 md->offset_vector[md->offset_end - number] = save_offset3;
1132 }
1133
1134 if (allow_zero || matched_once)
1135 {
1136 ecode += 1 + LINK_SIZE;
1137 break;
1138 }
1139
1140 RRETURN(MATCH_NOMATCH);
1141 }
1142
1143 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1144 as a non-capturing bracket. */
1145
1146 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1147 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1148
1149 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1150
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1153
1154 /* Non-capturing possessive bracket with unlimited repeat. We come here
1155 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1156 without the capturing complication. It is written out separately for speed
1157 and cleanliness. */
1158
1159 case OP_BRAPOS:
1160 case OP_SBRAPOS:
1161 allow_zero = FALSE;
1162
1163 POSSESSIVE_NON_CAPTURE:
1164 matched_once = FALSE;
1165 code_offset = (int)(ecode - md->start_code);
1166
1167 for (;;)
1168 {
1169 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1170 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1171 eptrb, RM48);
1172 if (rrc == MATCH_KETRPOS)
1173 {
1174 offset_top = md->end_offset_top;
1175 eptr = md->end_match_ptr;
1176 ecode = md->start_code + code_offset;
1177 matched_once = TRUE;
1178 continue;
1179 }
1180
1181 /* See comment in the code for capturing groups above about handling
1182 THEN. */
1183
1184 if (rrc == MATCH_THEN)
1185 {
1186 next = ecode + GET(ecode,1);
1187 if (md->start_match_ptr < next &&
1188 (*ecode == OP_ALT || *next == OP_ALT))
1189 rrc = MATCH_NOMATCH;
1190 }
1191
1192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1193 ecode += GET(ecode, 1);
1194 if (*ecode != OP_ALT) break;
1195 }
1196
1197 if (matched_once || allow_zero)
1198 {
1199 ecode += 1 + LINK_SIZE;
1200 break;
1201 }
1202 RRETURN(MATCH_NOMATCH);
1203
1204 /* Control never reaches here. */
1205
1206 /* Conditional group: compilation checked that there are no more than
1207 two branches. If the condition is false, skipping the first branch takes us
1208 past the end if there is only one branch, but that's OK because that is
1209 exactly what going to the ket would do. */
1210
1211 case OP_COND:
1212 case OP_SCOND:
1213 codelink = GET(ecode, 1);
1214
1215 /* Because of the way auto-callout works during compile, a callout item is
1216 inserted between OP_COND and an assertion condition. */
1217
1218 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1219 {
1220 if (pcre_callout != NULL)
1221 {
1222 pcre_callout_block cb;
1223 cb.version = 2; /* Version 2 of the callout block */
1224 cb.callout_number = ecode[LINK_SIZE+2];
1225 cb.offset_vector = md->offset_vector;
1226 cb.subject = (PCRE_SPTR)md->start_subject;
1227 cb.subject_length = (int)(md->end_subject - md->start_subject);
1228 cb.start_match = (int)(mstart - md->start_subject);
1229 cb.current_position = (int)(eptr - md->start_subject);
1230 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1231 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1232 cb.capture_top = offset_top/2;
1233 cb.capture_last = md->capture_last;
1234 cb.callout_data = md->callout_data;
1235 cb.mark = md->nomatch_mark;
1236 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1237 if (rrc < 0) RRETURN(rrc);
1238 }
1239 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1240 }
1241
1242 condcode = ecode[LINK_SIZE+1];
1243
1244 /* Now see what the actual condition is */
1245
1246 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1247 {
1248 if (md->recursive == NULL) /* Not recursing => FALSE */
1249 {
1250 condition = FALSE;
1251 ecode += GET(ecode, 1);
1252 }
1253 else
1254 {
1255 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1256 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1257
1258 /* If the test is for recursion into a specific subpattern, and it is
1259 false, but the test was set up by name, scan the table to see if the
1260 name refers to any other numbers, and test them. The condition is true
1261 if any one is set. */
1262
1263 if (!condition && condcode == OP_NRREF)
1264 {
1265 pcre_uchar *slotA = md->name_table;
1266 for (i = 0; i < md->name_count; i++)
1267 {
1268 if (GET2(slotA, 0) == recno) break;
1269 slotA += md->name_entry_size;
1270 }
1271
1272 /* Found a name for the number - there can be only one; duplicate
1273 names for different numbers are allowed, but not vice versa. First
1274 scan down for duplicates. */
1275
1276 if (i < md->name_count)
1277 {
1278 pcre_uchar *slotB = slotA;
1279 while (slotB > md->name_table)
1280 {
1281 slotB -= md->name_entry_size;
1282 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1283 {
1284 condition = GET2(slotB, 0) == md->recursive->group_num;
1285 if (condition) break;
1286 }
1287 else break;
1288 }
1289
1290 /* Scan up for duplicates */
1291
1292 if (!condition)
1293 {
1294 slotB = slotA;
1295 for (i++; i < md->name_count; i++)
1296 {
1297 slotB += md->name_entry_size;
1298 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1299 {
1300 condition = GET2(slotB, 0) == md->recursive->group_num;
1301 if (condition) break;
1302 }
1303 else break;
1304 }
1305 }
1306 }
1307 }
1308
1309 /* Choose branch according to the condition */
1310
1311 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1312 }
1313 }
1314
1315 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1316 {
1317 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1318 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1319
1320 /* If the numbered capture is unset, but the reference was by name,
1321 scan the table to see if the name refers to any other numbers, and test
1322 them. The condition is true if any one is set. This is tediously similar
1323 to the code above, but not close enough to try to amalgamate. */
1324
1325 if (!condition && condcode == OP_NCREF)
1326 {
1327 int refno = offset >> 1;
1328 pcre_uchar *slotA = md->name_table;
1329
1330 for (i = 0; i < md->name_count; i++)
1331 {
1332 if (GET2(slotA, 0) == refno) break;
1333 slotA += md->name_entry_size;
1334 }
1335
1336 /* Found a name for the number - there can be only one; duplicate names
1337 for different numbers are allowed, but not vice versa. First scan down
1338 for duplicates. */
1339
1340 if (i < md->name_count)
1341 {
1342 pcre_uchar *slotB = slotA;
1343 while (slotB > md->name_table)
1344 {
1345 slotB -= md->name_entry_size;
1346 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1347 {
1348 offset = GET2(slotB, 0) << 1;
1349 condition = offset < offset_top &&
1350 md->offset_vector[offset] >= 0;
1351 if (condition) break;
1352 }
1353 else break;
1354 }
1355
1356 /* Scan up for duplicates */
1357
1358 if (!condition)
1359 {
1360 slotB = slotA;
1361 for (i++; i < md->name_count; i++)
1362 {
1363 slotB += md->name_entry_size;
1364 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1365 {
1366 offset = GET2(slotB, 0) << 1;
1367 condition = offset < offset_top &&
1368 md->offset_vector[offset] >= 0;
1369 if (condition) break;
1370 }
1371 else break;
1372 }
1373 }
1374 }
1375 }
1376
1377 /* Choose branch according to the condition */
1378
1379 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1380 }
1381
1382 else if (condcode == OP_DEF) /* DEFINE - always false */
1383 {
1384 condition = FALSE;
1385 ecode += GET(ecode, 1);
1386 }
1387
1388 /* The condition is an assertion. Call match() to evaluate it - setting
1389 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1390 an assertion. */
1391
1392 else
1393 {
1394 md->match_function_type = MATCH_CONDASSERT;
1395 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1396 if (rrc == MATCH_MATCH)
1397 {
1398 if (md->end_offset_top > offset_top)
1399 offset_top = md->end_offset_top; /* Captures may have happened */
1400 condition = TRUE;
1401 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1402 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1403 }
1404
1405 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1406 assertion; it is therefore treated as NOMATCH. */
1407
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1409 {
1410 RRETURN(rrc); /* Need braces because of following else */
1411 }
1412 else
1413 {
1414 condition = FALSE;
1415 ecode += codelink;
1416 }
1417 }
1418
1419 /* We are now at the branch that is to be obeyed. As there is only one, can
1420 use tail recursion to avoid using another stack frame, except when there is
1421 unlimited repeat of a possibly empty group. In the latter case, a recursive
1422 call to match() is always required, unless the second alternative doesn't
1423 exist, in which case we can just plough on. Note that, for compatibility
1424 with Perl, the | in a conditional group is NOT treated as creating two
1425 alternatives. If a THEN is encountered in the branch, it propagates out to
1426 the enclosing alternative (unless nested in a deeper set of alternatives,
1427 of course). */
1428
1429 if (condition || *ecode == OP_ALT)
1430 {
1431 if (op != OP_SCOND)
1432 {
1433 ecode += 1 + LINK_SIZE;
1434 goto TAIL_RECURSE;
1435 }
1436
1437 md->match_function_type = MATCH_CBEGROUP;
1438 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1439 RRETURN(rrc);
1440 }
1441
1442 /* Condition false & no alternative; continue after the group. */
1443
1444 else
1445 {
1446 ecode += 1 + LINK_SIZE;
1447 }
1448 break;
1449
1450
1451 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1452 to close any currently open capturing brackets. */
1453
1454 case OP_CLOSE:
1455 number = GET2(ecode, 1);
1456 offset = number << 1;
1457
1458 #ifdef PCRE_DEBUG
1459 printf("end bracket %d at *ACCEPT", number);
1460 printf("\n");
1461 #endif
1462
1463 md->capture_last = number;
1464 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1465 {
1466 md->offset_vector[offset] =
1467 md->offset_vector[md->offset_end - number];
1468 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1469 if (offset_top <= offset) offset_top = offset + 2;
1470 }
1471 ecode += 1 + IMM2_SIZE;
1472 break;
1473
1474
1475 /* End of the pattern, either real or forced. */
1476
1477 case OP_END:
1478 case OP_ACCEPT:
1479 case OP_ASSERT_ACCEPT:
1480
1481 /* If we have matched an empty string, fail if not in an assertion and not
1482 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1483 is set and we have matched at the start of the subject. In both cases,
1484 backtracking will then try other alternatives, if any. */
1485
1486 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1487 md->recursive == NULL &&
1488 (md->notempty ||
1489 (md->notempty_atstart &&
1490 mstart == md->start_subject + md->start_offset)))
1491 RRETURN(MATCH_NOMATCH);
1492
1493 /* Otherwise, we have a match. */
1494
1495 md->end_match_ptr = eptr; /* Record where we ended */
1496 md->end_offset_top = offset_top; /* and how many extracts were taken */
1497 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1498
1499 /* For some reason, the macros don't work properly if an expression is
1500 given as the argument to RRETURN when the heap is in use. */
1501
1502 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1503 RRETURN(rrc);
1504
1505 /* Assertion brackets. Check the alternative branches in turn - the
1506 matching won't pass the KET for an assertion. If any one branch matches,
1507 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1508 start of each branch to move the current point backwards, so the code at
1509 this level is identical to the lookahead case. When the assertion is part
1510 of a condition, we want to return immediately afterwards. The caller of
1511 this incarnation of the match() function will have set MATCH_CONDASSERT in
1512 md->match_function_type, and one of these opcodes will be the first opcode
1513 that is processed. We use a local variable that is preserved over calls to
1514 match() to remember this case. */
1515
1516 case OP_ASSERT:
1517 case OP_ASSERTBACK:
1518 if (md->match_function_type == MATCH_CONDASSERT)
1519 {
1520 condassert = TRUE;
1521 md->match_function_type = 0;
1522 }
1523 else condassert = FALSE;
1524
1525 do
1526 {
1527 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1528 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1529 {
1530 mstart = md->start_match_ptr; /* In case \K reset it */
1531 break;
1532 }
1533
1534 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1535 as NOMATCH. */
1536
1537 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1538 ecode += GET(ecode, 1);
1539 }
1540 while (*ecode == OP_ALT);
1541
1542 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1543
1544 /* If checking an assertion for a condition, return MATCH_MATCH. */
1545
1546 if (condassert) RRETURN(MATCH_MATCH);
1547
1548 /* Continue from after the assertion, updating the offsets high water
1549 mark, since extracts may have been taken during the assertion. */
1550
1551 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1552 ecode += 1 + LINK_SIZE;
1553 offset_top = md->end_offset_top;
1554 continue;
1555
1556 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1557 PRUNE, or COMMIT means we must assume failure without checking subsequent
1558 branches. */
1559
1560 case OP_ASSERT_NOT:
1561 case OP_ASSERTBACK_NOT:
1562 if (md->match_function_type == MATCH_CONDASSERT)
1563 {
1564 condassert = TRUE;
1565 md->match_function_type = 0;
1566 }
1567 else condassert = FALSE;
1568
1569 do
1570 {
1571 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1572 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1573 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1574 {
1575 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1576 break;
1577 }
1578
1579 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1580 as NOMATCH. */
1581
1582 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1583 ecode += GET(ecode,1);
1584 }
1585 while (*ecode == OP_ALT);
1586
1587 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1588
1589 ecode += 1 + LINK_SIZE;
1590 continue;
1591
1592 /* Move the subject pointer back. This occurs only at the start of
1593 each branch of a lookbehind assertion. If we are too close to the start to
1594 move back, this match function fails. When working with UTF-8 we move
1595 back a number of characters, not bytes. */
1596
1597 case OP_REVERSE:
1598 #ifdef SUPPORT_UTF
1599 if (utf)
1600 {
1601 i = GET(ecode, 1);
1602 while (i-- > 0)
1603 {
1604 eptr--;
1605 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1606 BACKCHAR(eptr);
1607 }
1608 }
1609 else
1610 #endif
1611
1612 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1613
1614 {
1615 eptr -= GET(ecode, 1);
1616 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1617 }
1618
1619 /* Save the earliest consulted character, then skip to next op code */
1620
1621 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1622 ecode += 1 + LINK_SIZE;
1623 break;
1624
1625 /* The callout item calls an external function, if one is provided, passing
1626 details of the match so far. This is mainly for debugging, though the
1627 function is able to force a failure. */
1628
1629 case OP_CALLOUT:
1630 if (pcre_callout != NULL)
1631 {
1632 pcre_callout_block cb;
1633 cb.version = 2; /* Version 2 of the callout block */
1634 cb.callout_number = ecode[1];
1635 cb.offset_vector = md->offset_vector;
1636 cb.subject = (PCRE_SPTR)md->start_subject;
1637 cb.subject_length = (int)(md->end_subject - md->start_subject);
1638 cb.start_match = (int)(mstart - md->start_subject);
1639 cb.current_position = (int)(eptr - md->start_subject);
1640 cb.pattern_position = GET(ecode, 2);
1641 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1642 cb.capture_top = offset_top/2;
1643 cb.capture_last = md->capture_last;
1644 cb.callout_data = md->callout_data;
1645 cb.mark = md->nomatch_mark;
1646 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1647 if (rrc < 0) RRETURN(rrc);
1648 }
1649 ecode += 2 + 2*LINK_SIZE;
1650 break;
1651
1652 /* Recursion either matches the current regex, or some subexpression. The
1653 offset data is the offset to the starting bracket from the start of the
1654 whole pattern. (This is so that it works from duplicated subpatterns.)
1655
1656 The state of the capturing groups is preserved over recursion, and
1657 re-instated afterwards. We don't know how many are started and not yet
1658 finished (offset_top records the completed total) so we just have to save
1659 all the potential data. There may be up to 65535 such values, which is too
1660 large to put on the stack, but using malloc for small numbers seems
1661 expensive. As a compromise, the stack is used when there are no more than
1662 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1663
1664 There are also other values that have to be saved. We use a chained
1665 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1666 for the original version of this logic. It has, however, been hacked around
1667 a lot, so he is not to blame for the current way it works. */
1668
1669 case OP_RECURSE:
1670 {
1671 recursion_info *ri;
1672 int recno;
1673
1674 callpat = md->start_code + GET(ecode, 1);
1675 recno = (callpat == md->start_code)? 0 :
1676 GET2(callpat, 1 + LINK_SIZE);
1677
1678 /* Check for repeating a recursion without advancing the subject pointer.
1679 This should catch convoluted mutual recursions. (Some simple cases are
1680 caught at compile time.) */
1681
1682 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1683 if (recno == ri->group_num && eptr == ri->subject_position)
1684 RRETURN(PCRE_ERROR_RECURSELOOP);
1685
1686 /* Add to "recursing stack" */
1687
1688 new_recursive.group_num = recno;
1689 new_recursive.subject_position = eptr;
1690 new_recursive.prevrec = md->recursive;
1691 md->recursive = &new_recursive;
1692
1693 /* Where to continue from afterwards */
1694
1695 ecode += 1 + LINK_SIZE;
1696
1697 /* Now save the offset data */
1698
1699 new_recursive.saved_max = md->offset_end;
1700 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1701 new_recursive.offset_save = stacksave;
1702 else
1703 {
1704 new_recursive.offset_save =
1705 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1706 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1707 }
1708 memcpy(new_recursive.offset_save, md->offset_vector,
1709 new_recursive.saved_max * sizeof(int));
1710
1711 /* OK, now we can do the recursion. After processing each alternative,
1712 restore the offset data. If there were nested recursions, md->recursive
1713 might be changed, so reset it before looping. */
1714
1715 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1716 cbegroup = (*callpat >= OP_SBRA);
1717 do
1718 {
1719 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1720 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1721 md, eptrb, RM6);
1722 memcpy(md->offset_vector, new_recursive.offset_save,
1723 new_recursive.saved_max * sizeof(int));
1724 md->recursive = new_recursive.prevrec;
1725 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1726 {
1727 DPRINTF(("Recursion matched\n"));
1728 if (new_recursive.offset_save != stacksave)
1729 (pcre_free)(new_recursive.offset_save);
1730
1731 /* Set where we got to in the subject, and reset the start in case
1732 it was changed by \K. This *is* propagated back out of a recursion,
1733 for Perl compatibility. */
1734
1735 eptr = md->end_match_ptr;
1736 mstart = md->start_match_ptr;
1737 goto RECURSION_MATCHED; /* Exit loop; end processing */
1738 }
1739
1740 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1741 as NOMATCH. */
1742
1743 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1744 {
1745 DPRINTF(("Recursion gave error %d\n", rrc));
1746 if (new_recursive.offset_save != stacksave)
1747 (pcre_free)(new_recursive.offset_save);
1748 RRETURN(rrc);
1749 }
1750
1751 md->recursive = &new_recursive;
1752 callpat += GET(callpat, 1);
1753 }
1754 while (*callpat == OP_ALT);
1755
1756 DPRINTF(("Recursion didn't match\n"));
1757 md->recursive = new_recursive.prevrec;
1758 if (new_recursive.offset_save != stacksave)
1759 (pcre_free)(new_recursive.offset_save);
1760 RRETURN(MATCH_NOMATCH);
1761 }
1762
1763 RECURSION_MATCHED:
1764 break;
1765
1766 /* An alternation is the end of a branch; scan along to find the end of the
1767 bracketed group and go to there. */
1768
1769 case OP_ALT:
1770 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1771 break;
1772
1773 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1774 indicating that it may occur zero times. It may repeat infinitely, or not
1775 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1776 with fixed upper repeat limits are compiled as a number of copies, with the
1777 optional ones preceded by BRAZERO or BRAMINZERO. */
1778
1779 case OP_BRAZERO:
1780 next = ecode + 1;
1781 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1782 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1783 do next += GET(next, 1); while (*next == OP_ALT);
1784 ecode = next + 1 + LINK_SIZE;
1785 break;
1786
1787 case OP_BRAMINZERO:
1788 next = ecode + 1;
1789 do next += GET(next, 1); while (*next == OP_ALT);
1790 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1792 ecode++;
1793 break;
1794
1795 case OP_SKIPZERO:
1796 next = ecode+1;
1797 do next += GET(next,1); while (*next == OP_ALT);
1798 ecode = next + 1 + LINK_SIZE;
1799 break;
1800
1801 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1802 here; just jump to the group, with allow_zero set TRUE. */
1803
1804 case OP_BRAPOSZERO:
1805 op = *(++ecode);
1806 allow_zero = TRUE;
1807 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1808 goto POSSESSIVE_NON_CAPTURE;
1809
1810 /* End of a group, repeated or non-repeating. */
1811
1812 case OP_KET:
1813 case OP_KETRMIN:
1814 case OP_KETRMAX:
1815 case OP_KETRPOS:
1816 prev = ecode - GET(ecode, 1);
1817
1818 /* If this was a group that remembered the subject start, in order to break
1819 infinite repeats of empty string matches, retrieve the subject start from
1820 the chain. Otherwise, set it NULL. */
1821
1822 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1823 {
1824 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1825 eptrb = eptrb->epb_prev; /* Backup to previous group */
1826 }
1827 else saved_eptr = NULL;
1828
1829 /* If we are at the end of an assertion group or a non-capturing atomic
1830 group, stop matching and return MATCH_MATCH, but record the current high
1831 water mark for use by positive assertions. We also need to record the match
1832 start in case it was changed by \K. */
1833
1834 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1835 *prev == OP_ONCE_NC)
1836 {
1837 md->end_match_ptr = eptr; /* For ONCE_NC */
1838 md->end_offset_top = offset_top;
1839 md->start_match_ptr = mstart;
1840 RRETURN(MATCH_MATCH); /* Sets md->mark */
1841 }
1842
1843 /* For capturing groups we have to check the group number back at the start
1844 and if necessary complete handling an extraction by setting the offsets and
1845 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1846 into group 0, so it won't be picked up here. Instead, we catch it when the
1847 OP_END is reached. Other recursion is handled here. We just have to record
1848 the current subject position and start match pointer and give a MATCH
1849 return. */
1850
1851 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1852 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1853 {
1854 number = GET2(prev, 1+LINK_SIZE);
1855 offset = number << 1;
1856
1857 #ifdef PCRE_DEBUG
1858 printf("end bracket %d", number);
1859 printf("\n");
1860 #endif
1861
1862 /* Handle a recursively called group. */
1863
1864 if (md->recursive != NULL && md->recursive->group_num == number)
1865 {
1866 md->end_match_ptr = eptr;
1867 md->start_match_ptr = mstart;
1868 RRETURN(MATCH_MATCH);
1869 }
1870
1871 /* Deal with capturing */
1872
1873 md->capture_last = number;
1874 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1875 {
1876 /* If offset is greater than offset_top, it means that we are
1877 "skipping" a capturing group, and that group's offsets must be marked
1878 unset. In earlier versions of PCRE, all the offsets were unset at the
1879 start of matching, but this doesn't work because atomic groups and
1880 assertions can cause a value to be set that should later be unset.
1881 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1882 part of the atomic group, but this is not on the final matching path,
1883 so must be unset when 2 is set. (If there is no group 2, there is no
1884 problem, because offset_top will then be 2, indicating no capture.) */
1885
1886 if (offset > offset_top)
1887 {
1888 register int *iptr = md->offset_vector + offset_top;
1889 register int *iend = md->offset_vector + offset;
1890 while (iptr < iend) *iptr++ = -1;
1891 }
1892
1893 /* Now make the extraction */
1894
1895 md->offset_vector[offset] =
1896 md->offset_vector[md->offset_end - number];
1897 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1898 if (offset_top <= offset) offset_top = offset + 2;
1899 }
1900 }
1901
1902 /* For an ordinary non-repeating ket, just continue at this level. This
1903 also happens for a repeating ket if no characters were matched in the
1904 group. This is the forcible breaking of infinite loops as implemented in
1905 Perl 5.005. For a non-repeating atomic group that includes captures,
1906 establish a backup point by processing the rest of the pattern at a lower
1907 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1908 original OP_ONCE level, thereby bypassing intermediate backup points, but
1909 resetting any captures that happened along the way. */
1910
1911 if (*ecode == OP_KET || eptr == saved_eptr)
1912 {
1913 if (*prev == OP_ONCE)
1914 {
1915 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1917 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1918 RRETURN(MATCH_ONCE);
1919 }
1920 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1921 break;
1922 }
1923
1924 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1925 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1926 at a time from the outer level, thus saving stack. */
1927
1928 if (*ecode == OP_KETRPOS)
1929 {
1930 md->end_match_ptr = eptr;
1931 md->end_offset_top = offset_top;
1932 RRETURN(MATCH_KETRPOS);
1933 }
1934
1935 /* The normal repeating kets try the rest of the pattern or restart from
1936 the preceding bracket, in the appropriate order. In the second case, we can
1937 use tail recursion to avoid using another stack frame, unless we have an
1938 atomic group or an unlimited repeat of a group that can match an empty
1939 string. */
1940
1941 if (*ecode == OP_KETRMIN)
1942 {
1943 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1944 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1945 if (*prev == OP_ONCE)
1946 {
1947 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1949 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1950 RRETURN(MATCH_ONCE);
1951 }
1952 if (*prev >= OP_SBRA) /* Could match an empty string */
1953 {
1954 md->match_function_type = MATCH_CBEGROUP;
1955 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1956 RRETURN(rrc);
1957 }
1958 ecode = prev;
1959 goto TAIL_RECURSE;
1960 }
1961 else /* OP_KETRMAX */
1962 {
1963 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1964 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1965 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1967 if (*prev == OP_ONCE)
1968 {
1969 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971 md->once_target = prev;
1972 RRETURN(MATCH_ONCE);
1973 }
1974 ecode += 1 + LINK_SIZE;
1975 goto TAIL_RECURSE;
1976 }
1977 /* Control never gets here */
1978
1979 /* Not multiline mode: start of subject assertion, unless notbol. */
1980
1981 case OP_CIRC:
1982 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1983
1984 /* Start of subject assertion */
1985
1986 case OP_SOD:
1987 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1988 ecode++;
1989 break;
1990
1991 /* Multiline mode: start of subject unless notbol, or after any newline. */
1992
1993 case OP_CIRCM:
1994 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1995 if (eptr != md->start_subject &&
1996 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1997 RRETURN(MATCH_NOMATCH);
1998 ecode++;
1999 break;
2000
2001 /* Start of match assertion */
2002
2003 case OP_SOM:
2004 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2005 ecode++;
2006 break;
2007
2008 /* Reset the start of match point */
2009
2010 case OP_SET_SOM:
2011 mstart = eptr;
2012 ecode++;
2013 break;
2014
2015 /* Multiline mode: assert before any newline, or before end of subject
2016 unless noteol is set. */
2017
2018 case OP_DOLLM:
2019 if (eptr < md->end_subject)
2020 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2021 else
2022 {
2023 if (md->noteol) RRETURN(MATCH_NOMATCH);
2024 SCHECK_PARTIAL();
2025 }
2026 ecode++;
2027 break;
2028
2029 /* Not multiline mode: assert before a terminating newline or before end of
2030 subject unless noteol is set. */
2031
2032 case OP_DOLL:
2033 if (md->noteol) RRETURN(MATCH_NOMATCH);
2034 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2035
2036 /* ... else fall through for endonly */
2037
2038 /* End of subject assertion (\z) */
2039
2040 case OP_EOD:
2041 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2042 SCHECK_PARTIAL();
2043 ecode++;
2044 break;
2045
2046 /* End of subject or ending \n assertion (\Z) */
2047
2048 case OP_EODN:
2049 ASSERT_NL_OR_EOS:
2050 if (eptr < md->end_subject &&
2051 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2052 RRETURN(MATCH_NOMATCH);
2053
2054 /* Either at end of string or \n before end. */
2055
2056 SCHECK_PARTIAL();
2057 ecode++;
2058 break;
2059
2060 /* Word boundary assertions */
2061
2062 case OP_NOT_WORD_BOUNDARY:
2063 case OP_WORD_BOUNDARY:
2064 {
2065
2066 /* Find out if the previous and current characters are "word" characters.
2067 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2068 be "non-word" characters. Remember the earliest consulted character for
2069 partial matching. */
2070
2071 #ifdef SUPPORT_UTF
2072 if (utf)
2073 {
2074 /* Get status of previous character */
2075
2076 if (eptr == md->start_subject) prev_is_word = FALSE; else
2077 {
2078 PCRE_PUCHAR lastptr = eptr - 1;
2079 BACKCHAR(lastptr);
2080 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2081 GETCHAR(c, lastptr);
2082 #ifdef SUPPORT_UCP
2083 if (md->use_ucp)
2084 {
2085 if (c == '_') prev_is_word = TRUE; else
2086 {
2087 int cat = UCD_CATEGORY(c);
2088 prev_is_word = (cat == ucp_L || cat == ucp_N);
2089 }
2090 }
2091 else
2092 #endif
2093 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2094 }
2095
2096 /* Get status of next character */
2097
2098 if (eptr >= md->end_subject)
2099 {
2100 SCHECK_PARTIAL();
2101 cur_is_word = FALSE;
2102 }
2103 else
2104 {
2105 GETCHAR(c, eptr);
2106 #ifdef SUPPORT_UCP
2107 if (md->use_ucp)
2108 {
2109 if (c == '_') cur_is_word = TRUE; else
2110 {
2111 int cat = UCD_CATEGORY(c);
2112 cur_is_word = (cat == ucp_L || cat == ucp_N);
2113 }
2114 }
2115 else
2116 #endif
2117 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2118 }
2119 }
2120 else
2121 #endif
2122
2123 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2124 consistency with the behaviour of \w we do use it in this case. */
2125
2126 {
2127 /* Get status of previous character */
2128
2129 if (eptr == md->start_subject) prev_is_word = FALSE; else
2130 {
2131 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2132 #ifdef SUPPORT_UCP
2133 if (md->use_ucp)
2134 {
2135 c = eptr[-1];
2136 if (c == '_') prev_is_word = TRUE; else
2137 {
2138 int cat = UCD_CATEGORY(c);
2139 prev_is_word = (cat == ucp_L || cat == ucp_N);
2140 }
2141 }
2142 else
2143 #endif
2144 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2145 }
2146
2147 /* Get status of next character */
2148
2149 if (eptr >= md->end_subject)
2150 {
2151 SCHECK_PARTIAL();
2152 cur_is_word = FALSE;
2153 }
2154 else
2155 #ifdef SUPPORT_UCP
2156 if (md->use_ucp)
2157 {
2158 c = *eptr;
2159 if (c == '_') cur_is_word = TRUE; else
2160 {
2161 int cat = UCD_CATEGORY(c);
2162 cur_is_word = (cat == ucp_L || cat == ucp_N);
2163 }
2164 }
2165 else
2166 #endif
2167 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2168 }
2169
2170 /* Now see if the situation is what we want */
2171
2172 if ((*ecode++ == OP_WORD_BOUNDARY)?
2173 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2174 RRETURN(MATCH_NOMATCH);
2175 }
2176 break;
2177
2178 /* Match a single character type; inline for speed */
2179
2180 case OP_ANY:
2181 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2182 /* Fall through */
2183
2184 case OP_ALLANY:
2185 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2186 { /* not be updated before SCHECK_PARTIAL. */
2187 SCHECK_PARTIAL();
2188 RRETURN(MATCH_NOMATCH);
2189 }
2190 eptr++;
2191 #ifdef SUPPORT_UTF
2192 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2193 #endif
2194 ecode++;
2195 break;
2196
2197 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2198 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2199
2200 case OP_ANYBYTE:
2201 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2202 { /* not be updated before SCHECK_PARTIAL. */
2203 SCHECK_PARTIAL();
2204 RRETURN(MATCH_NOMATCH);
2205 }
2206 eptr++;
2207 ecode++;
2208 break;
2209
2210 case OP_NOT_DIGIT:
2211 if (eptr >= md->end_subject)
2212 {
2213 SCHECK_PARTIAL();
2214 RRETURN(MATCH_NOMATCH);
2215 }
2216 GETCHARINCTEST(c, eptr);
2217 if (
2218 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2219 c < 256 &&
2220 #endif
2221 (md->ctypes[c] & ctype_digit) != 0
2222 )
2223 RRETURN(MATCH_NOMATCH);
2224 ecode++;
2225 break;
2226
2227 case OP_DIGIT:
2228 if (eptr >= md->end_subject)
2229 {
2230 SCHECK_PARTIAL();
2231 RRETURN(MATCH_NOMATCH);
2232 }
2233 GETCHARINCTEST(c, eptr);
2234 if (
2235 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2236 c > 255 ||
2237 #endif
2238 (md->ctypes[c] & ctype_digit) == 0
2239 )
2240 RRETURN(MATCH_NOMATCH);
2241 ecode++;
2242 break;
2243
2244 case OP_NOT_WHITESPACE:
2245 if (eptr >= md->end_subject)
2246 {
2247 SCHECK_PARTIAL();
2248 RRETURN(MATCH_NOMATCH);
2249 }
2250 GETCHARINCTEST(c, eptr);
2251 if (
2252 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2253 c < 256 &&
2254 #endif
2255 (md->ctypes[c] & ctype_space) != 0
2256 )
2257 RRETURN(MATCH_NOMATCH);
2258 ecode++;
2259 break;
2260
2261 case OP_WHITESPACE:
2262 if (eptr >= md->end_subject)
2263 {
2264 SCHECK_PARTIAL();
2265 RRETURN(MATCH_NOMATCH);
2266 }
2267 GETCHARINCTEST(c, eptr);
2268 if (
2269 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2270 c > 255 ||
2271 #endif
2272 (md->ctypes[c] & ctype_space) == 0
2273 )
2274 RRETURN(MATCH_NOMATCH);
2275 ecode++;
2276 break;
2277
2278 case OP_NOT_WORDCHAR:
2279 if (eptr >= md->end_subject)
2280 {
2281 SCHECK_PARTIAL();
2282 RRETURN(MATCH_NOMATCH);
2283 }
2284 GETCHARINCTEST(c, eptr);
2285 if (
2286 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2287 c < 256 &&
2288 #endif
2289 (md->ctypes[c] & ctype_word) != 0
2290 )
2291 RRETURN(MATCH_NOMATCH);
2292 ecode++;
2293 break;
2294
2295 case OP_WORDCHAR:
2296 if (eptr >= md->end_subject)
2297 {
2298 SCHECK_PARTIAL();
2299 RRETURN(MATCH_NOMATCH);
2300 }
2301 GETCHARINCTEST(c, eptr);
2302 if (
2303 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2304 c > 255 ||
2305 #endif
2306 (md->ctypes[c] & ctype_word) == 0
2307 )
2308 RRETURN(MATCH_NOMATCH);
2309 ecode++;
2310 break;
2311
2312 case OP_ANYNL:
2313 if (eptr >= md->end_subject)
2314 {
2315 SCHECK_PARTIAL();
2316 RRETURN(MATCH_NOMATCH);
2317 }
2318 GETCHARINCTEST(c, eptr);
2319 switch(c)
2320 {
2321 default: RRETURN(MATCH_NOMATCH);
2322
2323 case 0x000d:
2324 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2325 break;
2326
2327 case 0x000a:
2328 break;
2329
2330 case 0x000b:
2331 case 0x000c:
2332 case 0x0085:
2333 case 0x2028:
2334 case 0x2029:
2335 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2336 break;
2337 }
2338 ecode++;
2339 break;
2340
2341 case OP_NOT_HSPACE:
2342 if (eptr >= md->end_subject)
2343 {
2344 SCHECK_PARTIAL();
2345 RRETURN(MATCH_NOMATCH);
2346 }
2347 GETCHARINCTEST(c, eptr);
2348 switch(c)
2349 {
2350 default: break;
2351 case 0x09: /* HT */
2352 case 0x20: /* SPACE */
2353 case 0xa0: /* NBSP */
2354 case 0x1680: /* OGHAM SPACE MARK */
2355 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2356 case 0x2000: /* EN QUAD */
2357 case 0x2001: /* EM QUAD */
2358 case 0x2002: /* EN SPACE */
2359 case 0x2003: /* EM SPACE */
2360 case 0x2004: /* THREE-PER-EM SPACE */
2361 case 0x2005: /* FOUR-PER-EM SPACE */
2362 case 0x2006: /* SIX-PER-EM SPACE */
2363 case 0x2007: /* FIGURE SPACE */
2364 case 0x2008: /* PUNCTUATION SPACE */
2365 case 0x2009: /* THIN SPACE */
2366 case 0x200A: /* HAIR SPACE */
2367 case 0x202f: /* NARROW NO-BREAK SPACE */
2368 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2369 case 0x3000: /* IDEOGRAPHIC SPACE */
2370 RRETURN(MATCH_NOMATCH);
2371 }
2372 ecode++;
2373 break;
2374
2375 case OP_HSPACE:
2376 if (eptr >= md->end_subject)
2377 {
2378 SCHECK_PARTIAL();
2379 RRETURN(MATCH_NOMATCH);
2380 }
2381 GETCHARINCTEST(c, eptr);
2382 switch(c)
2383 {
2384 default: RRETURN(MATCH_NOMATCH);
2385 case 0x09: /* HT */
2386 case 0x20: /* SPACE */
2387 case 0xa0: /* NBSP */
2388 case 0x1680: /* OGHAM SPACE MARK */
2389 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2390 case 0x2000: /* EN QUAD */
2391 case 0x2001: /* EM QUAD */
2392 case 0x2002: /* EN SPACE */
2393 case 0x2003: /* EM SPACE */
2394 case 0x2004: /* THREE-PER-EM SPACE */
2395 case 0x2005: /* FOUR-PER-EM SPACE */
2396 case 0x2006: /* SIX-PER-EM SPACE */
2397 case 0x2007: /* FIGURE SPACE */
2398 case 0x2008: /* PUNCTUATION SPACE */
2399 case 0x2009: /* THIN SPACE */
2400 case 0x200A: /* HAIR SPACE */
2401 case 0x202f: /* NARROW NO-BREAK SPACE */
2402 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2403 case 0x3000: /* IDEOGRAPHIC SPACE */
2404 break;
2405 }
2406 ecode++;
2407 break;
2408
2409 case OP_NOT_VSPACE:
2410 if (eptr >= md->end_subject)
2411 {
2412 SCHECK_PARTIAL();
2413 RRETURN(MATCH_NOMATCH);
2414 }
2415 GETCHARINCTEST(c, eptr);
2416 switch(c)
2417 {
2418 default: break;
2419 case 0x0a: /* LF */
2420 case 0x0b: /* VT */
2421 case 0x0c: /* FF */
2422 case 0x0d: /* CR */
2423 case 0x85: /* NEL */
2424 case 0x2028: /* LINE SEPARATOR */
2425 case 0x2029: /* PARAGRAPH SEPARATOR */
2426 RRETURN(MATCH_NOMATCH);
2427 }
2428 ecode++;
2429 break;
2430
2431 case OP_VSPACE:
2432 if (eptr >= md->end_subject)
2433 {
2434 SCHECK_PARTIAL();
2435 RRETURN(MATCH_NOMATCH);
2436 }
2437 GETCHARINCTEST(c, eptr);
2438 switch(c)
2439 {
2440 default: RRETURN(MATCH_NOMATCH);
2441 case 0x0a: /* LF */
2442 case 0x0b: /* VT */
2443 case 0x0c: /* FF */
2444 case 0x0d: /* CR */
2445 case 0x85: /* NEL */
2446 case 0x2028: /* LINE SEPARATOR */
2447 case 0x2029: /* PARAGRAPH SEPARATOR */
2448 break;
2449 }
2450 ecode++;
2451 break;
2452
2453 #ifdef SUPPORT_UCP
2454 /* Check the next character by Unicode property. We will get here only
2455 if the support is in the binary; otherwise a compile-time error occurs. */
2456
2457 case OP_PROP:
2458 case OP_NOTPROP:
2459 if (eptr >= md->end_subject)
2460 {
2461 SCHECK_PARTIAL();
2462 RRETURN(MATCH_NOMATCH);
2463 }
2464 GETCHARINCTEST(c, eptr);
2465 {
2466 const ucd_record *prop = GET_UCD(c);
2467
2468 switch(ecode[1])
2469 {
2470 case PT_ANY:
2471 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2472 break;
2473
2474 case PT_LAMP:
2475 if ((prop->chartype == ucp_Lu ||
2476 prop->chartype == ucp_Ll ||
2477 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2478 RRETURN(MATCH_NOMATCH);
2479 break;
2480
2481 case PT_GC:
2482 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2483 RRETURN(MATCH_NOMATCH);
2484 break;
2485
2486 case PT_PC:
2487 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2488 RRETURN(MATCH_NOMATCH);
2489 break;
2490
2491 case PT_SC:
2492 if ((ecode[2] != prop->script) == (op == OP_PROP))
2493 RRETURN(MATCH_NOMATCH);
2494 break;
2495
2496 /* These are specials */
2497
2498 case PT_ALNUM:
2499 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2500 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2501 RRETURN(MATCH_NOMATCH);
2502 break;
2503
2504 case PT_SPACE: /* Perl space */
2505 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2506 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2507 == (op == OP_NOTPROP))
2508 RRETURN(MATCH_NOMATCH);
2509 break;
2510
2511 case PT_PXSPACE: /* POSIX space */
2512 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2513 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2514 c == CHAR_FF || c == CHAR_CR)
2515 == (op == OP_NOTPROP))
2516 RRETURN(MATCH_NOMATCH);
2517 break;
2518
2519 case PT_WORD:
2520 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2521 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2522 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2523 RRETURN(MATCH_NOMATCH);
2524 break;
2525
2526 /* This should never occur */
2527
2528 default:
2529 RRETURN(PCRE_ERROR_INTERNAL);
2530 }
2531
2532 ecode += 3;
2533 }
2534 break;
2535
2536 /* Match an extended Unicode sequence. We will get here only if the support
2537 is in the binary; otherwise a compile-time error occurs. */
2538
2539 case OP_EXTUNI:
2540 if (eptr >= md->end_subject)
2541 {
2542 SCHECK_PARTIAL();
2543 RRETURN(MATCH_NOMATCH);
2544 }
2545 GETCHARINCTEST(c, eptr);
2546 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2547 while (eptr < md->end_subject)
2548 {
2549 int len = 1;
2550 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2551 if (UCD_CATEGORY(c) != ucp_M) break;
2552 eptr += len;
2553 }
2554 ecode++;
2555 break;
2556 #endif
2557
2558
2559 /* Match a back reference, possibly repeatedly. Look past the end of the
2560 item to see if there is repeat information following. The code is similar
2561 to that for character classes, but repeated for efficiency. Then obey
2562 similar code to character type repeats - written out again for speed.
2563 However, if the referenced string is the empty string, always treat
2564 it as matched, any number of times (otherwise there could be infinite
2565 loops). */
2566
2567 case OP_REF:
2568 case OP_REFI:
2569 caseless = op == OP_REFI;
2570 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2571 ecode += 1 + IMM2_SIZE;
2572
2573 /* If the reference is unset, there are two possibilities:
2574
2575 (a) In the default, Perl-compatible state, set the length negative;
2576 this ensures that every attempt at a match fails. We can't just fail
2577 here, because of the possibility of quantifiers with zero minima.
2578
2579 (b) If the JavaScript compatibility flag is set, set the length to zero
2580 so that the back reference matches an empty string.
2581
2582 Otherwise, set the length to the length of what was matched by the
2583 referenced subpattern. */
2584
2585 if (offset >= offset_top || md->offset_vector[offset] < 0)
2586 length = (md->jscript_compat)? 0 : -1;
2587 else
2588 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2589
2590 /* Set up for repetition, or handle the non-repeated case */
2591
2592 switch (*ecode)
2593 {
2594 case OP_CRSTAR:
2595 case OP_CRMINSTAR:
2596 case OP_CRPLUS:
2597 case OP_CRMINPLUS:
2598 case OP_CRQUERY:
2599 case OP_CRMINQUERY:
2600 c = *ecode++ - OP_CRSTAR;
2601 minimize = (c & 1) != 0;
2602 min = rep_min[c]; /* Pick up values from tables; */
2603 max = rep_max[c]; /* zero for max => infinity */
2604 if (max == 0) max = INT_MAX;
2605 break;
2606
2607 case OP_CRRANGE:
2608 case OP_CRMINRANGE:
2609 minimize = (*ecode == OP_CRMINRANGE);
2610 min = GET2(ecode, 1);
2611 max = GET2(ecode, 1 + IMM2_SIZE);
2612 if (max == 0) max = INT_MAX;
2613 ecode += 1 + 2 * IMM2_SIZE;
2614 break;
2615
2616 default: /* No repeat follows */
2617 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2618 {
2619 CHECK_PARTIAL();
2620 RRETURN(MATCH_NOMATCH);
2621 }
2622 eptr += length;
2623 continue; /* With the main loop */
2624 }
2625
2626 /* Handle repeated back references. If the length of the reference is
2627 zero, just continue with the main loop. */
2628
2629 if (length == 0) continue;
2630
2631 /* First, ensure the minimum number of matches are present. We get back
2632 the length of the reference string explicitly rather than passing the
2633 address of eptr, so that eptr can be a register variable. */
2634
2635 for (i = 1; i <= min; i++)
2636 {
2637 int slength;
2638 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2639 {
2640 CHECK_PARTIAL();
2641 RRETURN(MATCH_NOMATCH);
2642 }
2643 eptr += slength;
2644 }
2645
2646 /* If min = max, continue at the same level without recursion.
2647 They are not both allowed to be zero. */
2648
2649 if (min == max) continue;
2650
2651 /* If minimizing, keep trying and advancing the pointer */
2652
2653 if (minimize)
2654 {
2655 for (fi = min;; fi++)
2656 {
2657 int slength;
2658 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2660 if (fi >= max) RRETURN(MATCH_NOMATCH);
2661 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2662 {
2663 CHECK_PARTIAL();
2664 RRETURN(MATCH_NOMATCH);
2665 }
2666 eptr += slength;
2667 }
2668 /* Control never gets here */
2669 }
2670
2671 /* If maximizing, find the longest string and work backwards */
2672
2673 else
2674 {
2675 pp = eptr;
2676 for (i = min; i < max; i++)
2677 {
2678 int slength;
2679 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2680 {
2681 CHECK_PARTIAL();
2682 break;
2683 }
2684 eptr += slength;
2685 }
2686 while (eptr >= pp)
2687 {
2688 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2689 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2690 eptr -= length;
2691 }
2692 RRETURN(MATCH_NOMATCH);
2693 }
2694 /* Control never gets here */
2695
2696 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2697 used when all the characters in the class have values in the range 0-255,
2698 and either the matching is caseful, or the characters are in the range
2699 0-127 when UTF-8 processing is enabled. The only difference between
2700 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2701 encountered.
2702
2703 First, look past the end of the item to see if there is repeat information
2704 following. Then obey similar code to character type repeats - written out
2705 again for speed. */
2706
2707 case OP_NCLASS:
2708 case OP_CLASS:
2709 {
2710 /* The data variable is saved across frames, so the byte map needs to
2711 be stored there. */
2712 #define BYTE_MAP ((pcre_uint8 *)data)
2713 data = ecode + 1; /* Save for matching */
2714 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2715
2716 switch (*ecode)
2717 {
2718 case OP_CRSTAR:
2719 case OP_CRMINSTAR:
2720 case OP_CRPLUS:
2721 case OP_CRMINPLUS:
2722 case OP_CRQUERY:
2723 case OP_CRMINQUERY:
2724 c = *ecode++ - OP_CRSTAR;
2725 minimize = (c & 1) != 0;
2726 min = rep_min[c]; /* Pick up values from tables; */
2727 max = rep_max[c]; /* zero for max => infinity */
2728 if (max == 0) max = INT_MAX;
2729 break;
2730
2731 case OP_CRRANGE:
2732 case OP_CRMINRANGE:
2733 minimize = (*ecode == OP_CRMINRANGE);
2734 min = GET2(ecode, 1);
2735 max = GET2(ecode, 1 + IMM2_SIZE);
2736 if (max == 0) max = INT_MAX;
2737 ecode += 1 + 2 * IMM2_SIZE;
2738 break;
2739
2740 default: /* No repeat follows */
2741 min = max = 1;
2742 break;
2743 }
2744
2745 /* First, ensure the minimum number of matches are present. */
2746
2747 #ifdef SUPPORT_UTF
2748 if (utf)
2749 {
2750 for (i = 1; i <= min; i++)
2751 {
2752 if (eptr >= md->end_subject)
2753 {
2754 SCHECK_PARTIAL();
2755 RRETURN(MATCH_NOMATCH);
2756 }
2757 GETCHARINC(c, eptr);
2758 if (c > 255)
2759 {
2760 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2761 }
2762 else
2763 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2764 }
2765 }
2766 else
2767 #endif
2768 /* Not UTF mode */
2769 {
2770 for (i = 1; i <= min; i++)
2771 {
2772 if (eptr >= md->end_subject)
2773 {
2774 SCHECK_PARTIAL();
2775 RRETURN(MATCH_NOMATCH);
2776 }
2777 c = *eptr++;
2778 #ifndef COMPILE_PCRE8
2779 if (c > 255)
2780 {
2781 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2782 }
2783 else
2784 #endif
2785 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2786 }
2787 }
2788
2789 /* If max == min we can continue with the main loop without the
2790 need to recurse. */
2791
2792 if (min == max) continue;
2793
2794 /* If minimizing, keep testing the rest of the expression and advancing
2795 the pointer while it matches the class. */
2796
2797 if (minimize)
2798 {
2799 #ifdef SUPPORT_UTF
2800 if (utf)
2801 {
2802 for (fi = min;; fi++)
2803 {
2804 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2805 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2806 if (fi >= max) RRETURN(MATCH_NOMATCH);
2807 if (eptr >= md->end_subject)
2808 {
2809 SCHECK_PARTIAL();
2810 RRETURN(MATCH_NOMATCH);
2811 }
2812 GETCHARINC(c, eptr);
2813 if (c > 255)
2814 {
2815 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2816 }
2817 else
2818 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2819 }
2820 }
2821 else
2822 #endif
2823 /* Not UTF mode */
2824 {
2825 for (fi = min;; fi++)
2826 {
2827 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2829 if (fi >= max) RRETURN(MATCH_NOMATCH);
2830 if (eptr >= md->end_subject)
2831 {
2832 SCHECK_PARTIAL();
2833 RRETURN(MATCH_NOMATCH);
2834 }
2835 c = *eptr++;
2836 #ifndef COMPILE_PCRE8
2837 if (c > 255)
2838 {
2839 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2840 }
2841 else
2842 #endif
2843 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2844 }
2845 }
2846 /* Control never gets here */
2847 }
2848
2849 /* If maximizing, find the longest possible run, then work backwards. */
2850
2851 else
2852 {
2853 pp = eptr;
2854
2855 #ifdef SUPPORT_UTF
2856 if (utf)
2857 {
2858 for (i = min; i < max; i++)
2859 {
2860 int len = 1;
2861 if (eptr >= md->end_subject)
2862 {
2863 SCHECK_PARTIAL();
2864 break;
2865 }
2866 GETCHARLEN(c, eptr, len);
2867 if (c > 255)
2868 {
2869 if (op == OP_CLASS) break;
2870 }
2871 else
2872 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2873 eptr += len;
2874 }
2875 for (;;)
2876 {
2877 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2879 if (eptr-- == pp) break; /* Stop if tried at original pos */
2880 BACKCHAR(eptr);
2881 }
2882 }
2883 else
2884 #endif
2885 /* Not UTF mode */
2886 {
2887 for (i = min; i < max; i++)
2888 {
2889 if (eptr >= md->end_subject)
2890 {
2891 SCHECK_PARTIAL();
2892 break;
2893 }
2894 c = *eptr;
2895 #ifndef COMPILE_PCRE8
2896 if (c > 255)
2897 {
2898 if (op == OP_CLASS) break;
2899 }
2900 else
2901 #endif
2902 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2903 eptr++;
2904 }
2905 while (eptr >= pp)
2906 {
2907 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2908 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2909 eptr--;
2910 }
2911 }
2912
2913 RRETURN(MATCH_NOMATCH);
2914 }
2915 #undef BYTE_MAP
2916 }
2917 /* Control never gets here */
2918
2919
2920 /* Match an extended character class. This opcode is encountered only
2921 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2922 mode, because Unicode properties are supported in non-UTF-8 mode. */
2923
2924 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2925 case OP_XCLASS:
2926 {
2927 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2928 ecode += GET(ecode, 1); /* Advance past the item */
2929
2930 switch (*ecode)
2931 {
2932 case OP_CRSTAR:
2933 case OP_CRMINSTAR:
2934 case OP_CRPLUS:
2935 case OP_CRMINPLUS:
2936 case OP_CRQUERY:
2937 case OP_CRMINQUERY:
2938 c = *ecode++ - OP_CRSTAR;
2939 minimize = (c & 1) != 0;
2940 min = rep_min[c]; /* Pick up values from tables; */
2941 max = rep_max[c]; /* zero for max => infinity */
2942 if (max == 0) max = INT_MAX;
2943 break;
2944
2945 case OP_CRRANGE:
2946 case OP_CRMINRANGE:
2947 minimize = (*ecode == OP_CRMINRANGE);
2948 min = GET2(ecode, 1);
2949 max = GET2(ecode, 1 + IMM2_SIZE);
2950 if (max == 0) max = INT_MAX;
2951 ecode += 1 + 2 * IMM2_SIZE;
2952 break;
2953
2954 default: /* No repeat follows */
2955 min = max = 1;
2956 break;
2957 }
2958
2959 /* First, ensure the minimum number of matches are present. */
2960
2961 for (i = 1; i <= min; i++)
2962 {
2963 if (eptr >= md->end_subject)
2964 {
2965 SCHECK_PARTIAL();
2966 RRETURN(MATCH_NOMATCH);
2967 }
2968 GETCHARINCTEST(c, eptr);
2969 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2970 }
2971
2972 /* If max == min we can continue with the main loop without the
2973 need to recurse. */
2974
2975 if (min == max) continue;
2976
2977 /* If minimizing, keep testing the rest of the expression and advancing
2978 the pointer while it matches the class. */
2979
2980 if (minimize)
2981 {
2982 for (fi = min;; fi++)
2983 {
2984 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2985 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2986 if (fi >= max) RRETURN(MATCH_NOMATCH);
2987 if (eptr >= md->end_subject)
2988 {
2989 SCHECK_PARTIAL();
2990 RRETURN(MATCH_NOMATCH);
2991 }
2992 GETCHARINCTEST(c, eptr);
2993 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2994 }
2995 /* Control never gets here */
2996 }
2997
2998 /* If maximizing, find the longest possible run, then work backwards. */
2999
3000 else
3001 {
3002 pp = eptr;
3003 for (i = min; i < max; i++)
3004 {
3005 int len = 1;
3006 if (eptr >= md->end_subject)
3007 {
3008 SCHECK_PARTIAL();
3009 break;
3010 }
3011 #ifdef SUPPORT_UTF
3012 GETCHARLENTEST(c, eptr, len);
3013 #else
3014 c = *eptr;
3015 #endif
3016 if (!PRIV(xclass)(c, data, utf)) break;
3017 eptr += len;
3018 }
3019 for(;;)
3020 {
3021 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3022 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3023 if (eptr-- == pp) break; /* Stop if tried at original pos */
3024 #ifdef SUPPORT_UTF
3025 if (utf) BACKCHAR(eptr);
3026 #endif
3027 }
3028 RRETURN(MATCH_NOMATCH);
3029 }
3030
3031 /* Control never gets here */
3032 }
3033 #endif /* End of XCLASS */
3034
3035 /* Match a single character, casefully */
3036
3037 case OP_CHAR:
3038 #ifdef SUPPORT_UTF
3039 if (utf)
3040 {
3041 length = 1;
3042 ecode++;
3043 GETCHARLEN(fc, ecode, length);
3044 if (length > md->end_subject - eptr)
3045 {
3046 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3047 RRETURN(MATCH_NOMATCH);
3048 }
3049 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3050 }
3051 else
3052 #endif
3053 /* Not UTF mode */
3054 {
3055 if (md->end_subject - eptr < 1)
3056 {
3057 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3058 RRETURN(MATCH_NOMATCH);
3059 }
3060 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3061 ecode += 2;
3062 }
3063 break;
3064
3065 /* Match a single character, caselessly. If we are at the end of the
3066 subject, give up immediately. */
3067
3068 case OP_CHARI:
3069 if (eptr >= md->end_subject)
3070 {
3071 SCHECK_PARTIAL();
3072 RRETURN(MATCH_NOMATCH);
3073 }
3074
3075 #ifdef SUPPORT_UTF
3076 if (utf)
3077 {
3078 length = 1;
3079 ecode++;
3080 GETCHARLEN(fc, ecode, length);
3081
3082 /* If the pattern character's value is < 128, we have only one byte, and
3083 we know that its other case must also be one byte long, so we can use the
3084 fast lookup table. We know that there is at least one byte left in the
3085 subject. */
3086
3087 if (fc < 128)
3088 {
3089 if (md->lcc[fc]
3090 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3091 ecode++;
3092 eptr++;
3093 }
3094
3095 /* Otherwise we must pick up the subject character. Note that we cannot
3096 use the value of "length" to check for sufficient bytes left, because the
3097 other case of the character may have more or fewer bytes. */
3098
3099 else
3100 {
3101 unsigned int dc;
3102 GETCHARINC(dc, eptr);
3103 ecode += length;
3104
3105 /* If we have Unicode property support, we can use it to test the other
3106 case of the character, if there is one. */
3107
3108 if (fc != dc)
3109 {
3110 #ifdef SUPPORT_UCP
3111 if (dc != UCD_OTHERCASE(fc))
3112 #endif
3113 RRETURN(MATCH_NOMATCH);
3114 }
3115 }
3116 }
3117 else
3118 #endif /* SUPPORT_UTF */
3119
3120 /* Not UTF mode */
3121 {
3122 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3123 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3124 eptr++;
3125 ecode += 2;
3126 }
3127 break;
3128
3129 /* Match a single character repeatedly. */
3130
3131 case OP_EXACT:
3132 case OP_EXACTI:
3133 min = max = GET2(ecode, 1);
3134 ecode += 1 + IMM2_SIZE;
3135 goto REPEATCHAR;
3136
3137 case OP_POSUPTO:
3138 case OP_POSUPTOI:
3139 possessive = TRUE;
3140 /* Fall through */
3141
3142 case OP_UPTO:
3143 case OP_UPTOI:
3144 case OP_MINUPTO:
3145 case OP_MINUPTOI:
3146 min = 0;
3147 max = GET2(ecode, 1);
3148 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3149 ecode += 1 + IMM2_SIZE;
3150 goto REPEATCHAR;
3151
3152 case OP_POSSTAR:
3153 case OP_POSSTARI:
3154 possessive = TRUE;
3155 min = 0;
3156 max = INT_MAX;
3157 ecode++;
3158 goto REPEATCHAR;
3159
3160 case OP_POSPLUS:
3161 case OP_POSPLUSI:
3162 possessive = TRUE;
3163 min = 1;
3164 max = INT_MAX;
3165 ecode++;
3166 goto REPEATCHAR;
3167
3168 case OP_POSQUERY:
3169 case OP_POSQUERYI:
3170 possessive = TRUE;
3171 min = 0;
3172 max = 1;
3173 ecode++;
3174 goto REPEATCHAR;
3175
3176 case OP_STAR:
3177 case OP_STARI:
3178 case OP_MINSTAR:
3179 case OP_MINSTARI:
3180 case OP_PLUS:
3181 case OP_PLUSI:
3182 case OP_MINPLUS:
3183 case OP_MINPLUSI:
3184 case OP_QUERY:
3185 case OP_QUERYI:
3186 case OP_MINQUERY:
3187 case OP_MINQUERYI:
3188 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3189 minimize = (c & 1) != 0;
3190 min = rep_min[c]; /* Pick up values from tables; */
3191 max = rep_max[c]; /* zero for max => infinity */
3192 if (max == 0) max = INT_MAX;
3193
3194 /* Common code for all repeated single-character matches. */
3195
3196 REPEATCHAR:
3197 #ifdef SUPPORT_UTF
3198 if (utf)
3199 {
3200 length = 1;
3201 charptr = ecode;
3202 GETCHARLEN(fc, ecode, length);
3203 ecode += length;
3204
3205 /* Handle multibyte character matching specially here. There is
3206 support for caseless matching if UCP support is present. */
3207
3208 if (length > 1)
3209 {
3210 #ifdef SUPPORT_UCP
3211 unsigned int othercase;
3212 if (op >= OP_STARI && /* Caseless */
3213 (othercase = UCD_OTHERCASE(fc)) != fc)
3214 oclength = PRIV(ord2utf)(othercase, occhars);
3215 else oclength = 0;
3216 #endif /* SUPPORT_UCP */
3217
3218 for (i = 1; i <= min; i++)
3219 {
3220 if (eptr <= md->end_subject - length &&
3221 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3222 #ifdef SUPPORT_UCP
3223 else if (oclength > 0 &&
3224 eptr <= md->end_subject - oclength &&
3225 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3226 #endif /* SUPPORT_UCP */
3227 else
3228 {
3229 CHECK_PARTIAL();
3230 RRETURN(MATCH_NOMATCH);
3231 }
3232 }
3233
3234 if (min == max) continue;
3235
3236 if (minimize)
3237 {
3238 for (fi = min;; fi++)
3239 {
3240 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3241 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3242 if (fi >= max) RRETURN(MATCH_NOMATCH);
3243 if (eptr <= md->end_subject - length &&
3244 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3245 #ifdef SUPPORT_UCP
3246 else if (oclength > 0 &&
3247 eptr <= md->end_subject - oclength &&
3248 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3249 #endif /* SUPPORT_UCP */
3250 else
3251 {
3252 CHECK_PARTIAL();
3253 RRETURN(MATCH_NOMATCH);
3254 }
3255 }
3256 /* Control never gets here */
3257 }
3258
3259 else /* Maximize */
3260 {
3261 pp = eptr;
3262 for (i = min; i < max; i++)
3263 {
3264 if (eptr <= md->end_subject - length &&
3265 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3266 #ifdef SUPPORT_UCP
3267 else if (oclength > 0 &&
3268 eptr <= md->end_subject - oclength &&
3269 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3270 #endif /* SUPPORT_UCP */
3271 else
3272 {
3273 CHECK_PARTIAL();
3274 break;
3275 }
3276 }
3277
3278 if (possessive) continue;
3279
3280 for(;;)
3281 {
3282 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3283 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3284 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3285 #ifdef SUPPORT_UCP
3286 eptr--;
3287 BACKCHAR(eptr);
3288 #else /* without SUPPORT_UCP */
3289 eptr -= length;
3290 #endif /* SUPPORT_UCP */
3291 }
3292 }
3293 /* Control never gets here */
3294 }
3295
3296 /* If the length of a UTF character is 1 (a single code unit), we fall
3297 through here and obey the code as for non-UTF characters below. In the 8-bit
3298 library the value of fc is then always < 128; otherwise it may be larger. */
3299 }
3300 else
3301 #endif /* SUPPORT_UTF */
3302 /* When not in UTF-8 mode, load a single-byte character. */
3303 fc = *ecode++;
3304
3305 /* The value of fc at this point is always one character, though we may
3306 or may not be in UTF mode. The code is duplicated for the caseless and
3307 caseful cases, for speed, since matching characters is likely to be quite
3308 common. First, ensure the minimum number of matches are present. If min =
3309 max, continue at the same level without recursing. Otherwise, if
3310 minimizing, keep trying the rest of the expression and advancing one
3311 matching character if failing, up to the maximum. Alternatively, if
3312 maximizing, find the maximum number of characters and work backwards. */
3313
3314 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3315 max, eptr));
3316
3317 if (op >= OP_STARI) /* Caseless */
3318 {
3319 #ifdef COMPILE_PCRE8
3320 /* fc must be < 128 if UTF is enabled. */
3321 foc = md->fcc[fc];
3322 #else
3323 #ifdef SUPPORT_UTF
3324 #ifdef SUPPORT_UCP
3325 if (utf && fc > 127)
3326 foc = UCD_OTHERCASE(fc);
3327 #else
3328 if (utf && fc > 127)
3329 foc = fc;
3330 #endif /* SUPPORT_UCP */
3331 else
3332 #endif /* SUPPORT_UTF */
3333 foc = TABLE_GET(fc, md->fcc, fc);
3334 #endif /* COMPILE_PCRE8 */
3335
3336 for (i = 1; i <= min; i++)
3337 {
3338 if (eptr >= md->end_subject)
3339 {
3340 SCHECK_PARTIAL();
3341 RRETURN(MATCH_NOMATCH);
3342 }
3343 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3344 eptr++;
3345 }
3346 if (min == max) continue;
3347 if (minimize)
3348 {
3349 for (fi = min;; fi++)
3350 {
3351 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3352 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3353 if (fi >= max) RRETURN(MATCH_NOMATCH);
3354 if (eptr >= md->end_subject)
3355 {
3356 SCHECK_PARTIAL();
3357 RRETURN(MATCH_NOMATCH);
3358 }
3359 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3360 eptr++;
3361 }
3362 /* Control never gets here */
3363 }
3364 else /* Maximize */
3365 {
3366 pp = eptr;
3367 for (i = min; i < max; i++)
3368 {
3369 if (eptr >= md->end_subject)
3370 {
3371 SCHECK_PARTIAL();
3372 break;
3373 }
3374 if (fc != *eptr && foc != *eptr) break;
3375 eptr++;
3376 }
3377
3378 if (possessive) continue;
3379
3380 while (eptr >= pp)
3381 {
3382 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3383 eptr--;
3384 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3385 }
3386 RRETURN(MATCH_NOMATCH);
3387 }
3388 /* Control never gets here */
3389 }
3390
3391 /* Caseful comparisons (includes all multi-byte characters) */
3392
3393 else
3394 {
3395 for (i = 1; i <= min; i++)
3396 {
3397 if (eptr >= md->end_subject)
3398 {
3399 SCHECK_PARTIAL();
3400 RRETURN(MATCH_NOMATCH);
3401 }
3402 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3403 }
3404
3405 if (min == max) continue;
3406
3407 if (minimize)
3408 {
3409 for (fi = min;; fi++)
3410 {
3411 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3412 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3413 if (fi >= max) RRETURN(MATCH_NOMATCH);
3414 if (eptr >= md->end_subject)
3415 {
3416 SCHECK_PARTIAL();
3417 RRETURN(MATCH_NOMATCH);
3418 }
3419 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3420 }
3421 /* Control never gets here */
3422 }
3423 else /* Maximize */
3424 {
3425 pp = eptr;
3426 for (i = min; i < max; i++)
3427 {
3428 if (eptr >= md->end_subject)
3429 {
3430 SCHECK_PARTIAL();
3431 break;
3432 }
3433 if (fc != *eptr) break;
3434 eptr++;
3435 }
3436 if (possessive) continue;
3437
3438 while (eptr >= pp)
3439 {
3440 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3441 eptr--;
3442 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3443 }
3444 RRETURN(MATCH_NOMATCH);
3445 }
3446 }
3447 /* Control never gets here */
3448
3449 /* Match a negated single character. The pattern character is a single code
3450 unit; the subject character that is checked against it can be multibyte. */
3451
3452 case OP_NOT:
3453 case OP_NOTI:
3454 if (eptr >= md->end_subject)
3455 {
3456 SCHECK_PARTIAL();
3457 RRETURN(MATCH_NOMATCH);
3458 }
3459 ecode++;
3460 GETCHARINCTEST(c, eptr);
3461 if (op == OP_NOTI) /* The caseless case */
3462 {
3463 register int ch, och;
3464 ch = *ecode++;
3465 #ifdef COMPILE_PCRE8
3466 /* ch must be < 128 if UTF is enabled. */
3467 och = md->fcc[ch];
3468 #else
3469 #ifdef SUPPORT_UTF
3470 #ifdef SUPPORT_UCP
3471 if (utf && ch > 127)
3472 och = UCD_OTHERCASE(ch);
3473 #else
3474 if (utf && ch > 127)
3475 och = ch;
3476 #endif /* SUPPORT_UCP */
3477 else
3478 #endif /* SUPPORT_UTF */
3479 och = TABLE_GET(ch, md->fcc, ch);
3480 #endif /* COMPILE_PCRE8 */
3481 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3482 }
3483 else /* Caseful */
3484 {
3485 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3486 }
3487 break;
3488
3489 /* Match a negated single one-byte character repeatedly. This is almost a
3490 repeat of the code for a repeated single character, but I haven't found a
3491 nice way of commoning these up that doesn't require a test of the
3492 positive/negative option for each character match. Maybe that wouldn't add
3493 very much to the time taken, but character matching *is* what this is all
3494 about... */
3495
3496 case OP_NOTEXACT:
3497 case OP_NOTEXACTI:
3498 min = max = GET2(ecode, 1);
3499 ecode += 1 + IMM2_SIZE;
3500 goto REPEATNOTCHAR;
3501
3502 case OP_NOTUPTO:
3503 case OP_NOTUPTOI:
3504 case OP_NOTMINUPTO:
3505 case OP_NOTMINUPTOI:
3506 min = 0;
3507 max = GET2(ecode, 1);
3508 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3509 ecode += 1 + IMM2_SIZE;
3510 goto REPEATNOTCHAR;
3511
3512 case OP_NOTPOSSTAR:
3513 case OP_NOTPOSSTARI:
3514 possessive = TRUE;
3515 min = 0;
3516 max = INT_MAX;
3517 ecode++;
3518 goto REPEATNOTCHAR;
3519
3520 case OP_NOTPOSPLUS:
3521 case OP_NOTPOSPLUSI:
3522 possessive = TRUE;
3523 min = 1;
3524 max = INT_MAX;
3525 ecode++;
3526 goto REPEATNOTCHAR;
3527
3528 case OP_NOTPOSQUERY:
3529 case OP_NOTPOSQUERYI:
3530 possessive = TRUE;
3531 min = 0;
3532 max = 1;
3533 ecode++;
3534 goto REPEATNOTCHAR;
3535
3536 case OP_NOTPOSUPTO:
3537 case OP_NOTPOSUPTOI:
3538 possessive = TRUE;
3539 min = 0;
3540 max = GET2(ecode, 1);
3541 ecode += 1 + IMM2_SIZE;
3542 goto REPEATNOTCHAR;
3543
3544 case OP_NOTSTAR:
3545 case OP_NOTSTARI:
3546 case OP_NOTMINSTAR:
3547 case OP_NOTMINSTARI:
3548 case OP_NOTPLUS:
3549 case OP_NOTPLUSI:
3550 case OP_NOTMINPLUS:
3551 case OP_NOTMINPLUSI:
3552 case OP_NOTQUERY:
3553 case OP_NOTQUERYI:
3554 case OP_NOTMINQUERY:
3555 case OP_NOTMINQUERYI:
3556 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3557 minimize = (c & 1) != 0;
3558 min = rep_min[c]; /* Pick up values from tables; */
3559 max = rep_max[c]; /* zero for max => infinity */
3560 if (max == 0) max = INT_MAX;
3561
3562 /* Common code for all repeated single-byte matches. */
3563
3564 REPEATNOTCHAR:
3565 fc = *ecode++;
3566
3567 /* The code is duplicated for the caseless and caseful cases, for speed,
3568 since matching characters is likely to be quite common. First, ensure the
3569 minimum number of matches are present. If min = max, continue at the same
3570 level without recursing. Otherwise, if minimizing, keep trying the rest of
3571 the expression and advancing one matching character if failing, up to the
3572 maximum. Alternatively, if maximizing, find the maximum number of
3573 characters and work backwards. */
3574
3575 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3576 max, eptr));
3577
3578 if (op >= OP_NOTSTARI) /* Caseless */
3579 {
3580 #ifdef COMPILE_PCRE8
3581 /* fc must be < 128 if UTF is enabled. */
3582 foc = md->fcc[fc];
3583 #else
3584 #ifdef SUPPORT_UTF
3585 #ifdef SUPPORT_UCP
3586 if (utf && fc > 127)
3587 foc = UCD_OTHERCASE(fc);
3588 #else
3589 if (utf && fc > 127)
3590 foc = fc;
3591 #endif /* SUPPORT_UCP */
3592 else
3593 #endif /* SUPPORT_UTF */
3594 foc = TABLE_GET(fc, md->fcc, fc);
3595 #endif /* COMPILE_PCRE8 */
3596
3597 #ifdef SUPPORT_UTF
3598 if (utf)
3599 {
3600 register unsigned int d;
3601 for (i = 1; i <= min; i++)
3602 {
3603 if (eptr >= md->end_subject)
3604 {
3605 SCHECK_PARTIAL();
3606 RRETURN(MATCH_NOMATCH);
3607 }
3608 GETCHARINC(d, eptr);
3609 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3610 }
3611 }
3612 else
3613 #endif
3614 /* Not UTF mode */
3615 {
3616 for (i = 1; i <= min; i++)
3617 {
3618 if (eptr >= md->end_subject)
3619 {
3620 SCHECK_PARTIAL();
3621 RRETURN(MATCH_NOMATCH);
3622 }
3623 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3624 eptr++;
3625 }
3626 }
3627
3628 if (min == max) continue;
3629
3630 if (minimize)
3631 {
3632 #ifdef SUPPORT_UTF
3633 if (utf)
3634 {
3635 register unsigned int d;
3636 for (fi = min;; fi++)
3637 {
3638 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3640 if (fi >= max) RRETURN(MATCH_NOMATCH);
3641 if (eptr >= md->end_subject)
3642 {
3643 SCHECK_PARTIAL();
3644 RRETURN(MATCH_NOMATCH);
3645 }
3646 GETCHARINC(d, eptr);
3647 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3648 }
3649 }
3650 else
3651 #endif
3652 /* Not UTF mode */
3653 {
3654 for (fi = min;; fi++)
3655 {
3656 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3658 if (fi >= max) RRETURN(MATCH_NOMATCH);
3659 if (eptr >= md->end_subject)
3660 {
3661 SCHECK_PARTIAL();
3662 RRETURN(MATCH_NOMATCH);
3663 }
3664 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3665 eptr++;
3666 }
3667 }
3668 /* Control never gets here */
3669 }
3670
3671 /* Maximize case */
3672
3673 else
3674 {
3675 pp = eptr;
3676
3677 #ifdef SUPPORT_UTF
3678 if (utf)
3679 {
3680 register unsigned int d;
3681 for (i = min; i < max; i++)
3682 {
3683 int len = 1;
3684 if (eptr >= md->end_subject)
3685 {
3686 SCHECK_PARTIAL();
3687 break;
3688 }
3689 GETCHARLEN(d, eptr, len);
3690 if (fc == d || foc == d) break;
3691 eptr += len;
3692 }
3693 if (possessive) continue;
3694 for(;;)
3695 {
3696 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3697 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3698 if (eptr-- == pp) break; /* Stop if tried at original pos */
3699 BACKCHAR(eptr);
3700 }
3701 }
3702 else
3703 #endif
3704 /* Not UTF mode */
3705 {
3706 for (i = min; i < max; i++)
3707 {
3708 if (eptr >= md->end_subject)
3709 {
3710 SCHECK_PARTIAL();
3711 break;
3712 }
3713 if (fc == *eptr || foc == *eptr) break;
3714 eptr++;
3715 }
3716 if (possessive) continue;
3717 while (eptr >= pp)
3718 {
3719 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3720 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3721 eptr--;
3722 }
3723 }
3724
3725 RRETURN(MATCH_NOMATCH);
3726 }
3727 /* Control never gets here */
3728 }
3729
3730 /* Caseful comparisons */
3731
3732 else
3733 {
3734 #ifdef SUPPORT_UTF
3735 if (utf)
3736 {
3737 register unsigned int d;
3738 for (i = 1; i <= min; i++)
3739 {
3740 if (eptr >= md->end_subject)
3741 {
3742 SCHECK_PARTIAL();
3743 RRETURN(MATCH_NOMATCH);
3744 }
3745 GETCHARINC(d, eptr);
3746 if (fc == d) RRETURN(MATCH_NOMATCH);
3747 }
3748 }
3749 else
3750 #endif
3751 /* Not UTF mode */
3752 {
3753 for (i = 1; i <= min; i++)
3754 {
3755 if (eptr >= md->end_subject)
3756 {
3757 SCHECK_PARTIAL();
3758 RRETURN(MATCH_NOMATCH);
3759 }
3760 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3761 }
3762 }
3763
3764 if (min == max) continue;
3765
3766 if (minimize)
3767 {
3768 #ifdef SUPPORT_UTF
3769 if (utf)
3770 {
3771 register unsigned int d;
3772 for (fi = min;; fi++)
3773 {
3774 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3775 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3776 if (fi >= max) RRETURN(MATCH_NOMATCH);
3777 if (eptr >= md->end_subject)
3778 {
3779 SCHECK_PARTIAL();
3780 RRETURN(MATCH_NOMATCH);
3781 }
3782 GETCHARINC(d, eptr);
3783 if (fc == d) RRETURN(MATCH_NOMATCH);
3784 }
3785 }
3786 else
3787 #endif
3788 /* Not UTF mode */
3789 {
3790 for (fi = min;; fi++)
3791 {
3792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3794 if (fi >= max) RRETURN(MATCH_NOMATCH);
3795 if (eptr >= md->end_subject)
3796 {
3797 SCHECK_PARTIAL();
3798 RRETURN(MATCH_NOMATCH);
3799 }
3800 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3801 }
3802 }
3803 /* Control never gets here */
3804 }
3805
3806 /* Maximize case */
3807
3808 else
3809 {
3810 pp = eptr;
3811
3812 #ifdef SUPPORT_UTF
3813 if (utf)
3814 {
3815 register unsigned int d;
3816 for (i = min; i < max; i++)
3817 {
3818 int len = 1;
3819 if (eptr >= md->end_subject)
3820 {
3821 SCHECK_PARTIAL();
3822 break;
3823 }
3824 GETCHARLEN(d, eptr, len);
3825 if (fc == d) break;
3826 eptr += len;
3827 }
3828 if (possessive) continue;
3829 for(;;)
3830 {
3831 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3832 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3833 if (eptr-- == pp) break; /* Stop if tried at original pos */
3834 BACKCHAR(eptr);
3835 }
3836 }
3837 else
3838 #endif
3839 /* Not UTF mode */
3840 {
3841 for (i = min; i < max; i++)
3842 {
3843 if (eptr >= md->end_subject)
3844 {
3845 SCHECK_PARTIAL();
3846 break;
3847 }
3848 if (fc == *eptr) break;
3849 eptr++;
3850 }
3851 if (possessive) continue;
3852 while (eptr >= pp)
3853 {
3854 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3855 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3856 eptr--;
3857 }
3858 }
3859
3860 RRETURN(MATCH_NOMATCH);
3861 }
3862 }
3863 /* Control never gets here */
3864
3865 /* Match a single character type repeatedly; several different opcodes
3866 share code. This is very similar to the code for single characters, but we
3867 repeat it in the interests of efficiency. */
3868
3869 case OP_TYPEEXACT:
3870 min = max = GET2(ecode, 1);
3871 minimize = TRUE;
3872 ecode += 1 + IMM2_SIZE;
3873 goto REPEATTYPE;
3874
3875 case OP_TYPEUPTO:
3876 case OP_TYPEMINUPTO:
3877 min = 0;
3878 max = GET2(ecode, 1);
3879 minimize = *ecode == OP_TYPEMINUPTO;
3880 ecode += 1 + IMM2_SIZE;
3881 goto REPEATTYPE;
3882
3883 case OP_TYPEPOSSTAR:
3884 possessive = TRUE;
3885 min = 0;
3886 max = INT_MAX;
3887 ecode++;
3888 goto REPEATTYPE;
3889
3890 case OP_TYPEPOSPLUS:
3891 possessive = TRUE;
3892 min = 1;
3893 max = INT_MAX;
3894 ecode++;
3895 goto REPEATTYPE;
3896
3897 case OP_TYPEPOSQUERY:
3898 possessive = TRUE;
3899 min = 0;
3900 max = 1;
3901 ecode++;
3902 goto REPEATTYPE;
3903
3904 case OP_TYPEPOSUPTO:
3905 possessive = TRUE;
3906 min = 0;
3907 max = GET2(ecode, 1);
3908 ecode += 1 + IMM2_SIZE;
3909 goto REPEATTYPE;
3910
3911 case OP_TYPESTAR:
3912 case OP_TYPEMINSTAR:
3913 case OP_TYPEPLUS:
3914 case OP_TYPEMINPLUS:
3915 case OP_TYPEQUERY:
3916 case OP_TYPEMINQUERY:
3917 c = *ecode++ - OP_TYPESTAR;
3918 minimize = (c & 1) != 0;
3919 min = rep_min[c]; /* Pick up values from tables; */
3920 max = rep_max[c]; /* zero for max => infinity */
3921 if (max == 0) max = INT_MAX;
3922
3923 /* Common code for all repeated single character type matches. Note that
3924 in UTF-8 mode, '.' matches a character of any length, but for the other
3925 character types, the valid characters are all one-byte long. */
3926
3927 REPEATTYPE:
3928 ctype = *ecode++; /* Code for the character type */
3929
3930 #ifdef SUPPORT_UCP
3931 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3932 {
3933 prop_fail_result = ctype == OP_NOTPROP;
3934 prop_type = *ecode++;
3935 prop_value = *ecode++;
3936 }
3937 else prop_type = -1;
3938 #endif
3939
3940 /* First, ensure the minimum number of matches are present. Use inline
3941 code for maximizing the speed, and do the type test once at the start
3942 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3943 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3944 and single-bytes. */
3945
3946 if (min > 0)
3947 {
3948 #ifdef SUPPORT_UCP
3949 if (prop_type >= 0)
3950 {
3951 switch(prop_type)
3952 {
3953 case PT_ANY:
3954 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3955 for (i = 1; i <= min; i++)
3956 {
3957 if (eptr >= md->end_subject)
3958 {
3959 SCHECK_PARTIAL();
3960 RRETURN(MATCH_NOMATCH);
3961 }
3962 GETCHARINCTEST(c, eptr);
3963 }
3964 break;
3965
3966 case PT_LAMP:
3967 for (i = 1; i <= min; i++)
3968 {
3969 int chartype;
3970 if (eptr >= md->end_subject)
3971 {
3972 SCHECK_PARTIAL();
3973 RRETURN(MATCH_NOMATCH);
3974 }
3975 GETCHARINCTEST(c, eptr);
3976 chartype = UCD_CHARTYPE(c);
3977 if ((chartype == ucp_Lu ||
3978 chartype == ucp_Ll ||
3979 chartype == ucp_Lt) == prop_fail_result)
3980 RRETURN(MATCH_NOMATCH);
3981 }
3982 break;
3983
3984 case PT_GC:
3985 for (i = 1; i <= min; i++)
3986 {
3987 if (eptr >= md->end_subject)
3988 {
3989 SCHECK_PARTIAL();
3990 RRETURN(MATCH_NOMATCH);
3991 }
3992 GETCHARINCTEST(c, eptr);
3993 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
3994 RRETURN(MATCH_NOMATCH);
3995 }
3996 break;
3997
3998 case PT_PC:
3999 for (i = 1; i <= min; i++)
4000 {
4001 if (eptr >= md->end_subject)
4002 {
4003 SCHECK_PARTIAL();
4004 RRETURN(MATCH_NOMATCH);
4005 }
4006 GETCHARINCTEST(c, eptr);
4007 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4008 RRETURN(MATCH_NOMATCH);
4009 }
4010 break;
4011
4012 case PT_SC:
4013 for (i = 1; i <= min; i++)
4014 {
4015 if (eptr >= md->end_subject)
4016 {
4017 SCHECK_PARTIAL();
4018 RRETURN(MATCH_NOMATCH);
4019 }
4020 GETCHARINCTEST(c, eptr);
4021 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4022 RRETURN(MATCH_NOMATCH);
4023 }
4024 break;
4025
4026 case PT_ALNUM:
4027 for (i = 1; i <= min; i++)
4028 {
4029 int category;
4030 if (eptr >= md->end_subject)
4031 {
4032 SCHECK_PARTIAL();
4033 RRETURN(MATCH_NOMATCH);
4034 }
4035 GETCHARINCTEST(c, eptr);
4036 category = UCD_CATEGORY(c);
4037 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4038 RRETURN(MATCH_NOMATCH);
4039 }
4040 break;
4041
4042 case PT_SPACE: /* Perl space */
4043 for (i = 1; i <= min; i++)
4044 {
4045 if (eptr >= md->end_subject)
4046 {
4047 SCHECK_PARTIAL();
4048 RRETURN(MATCH_NOMATCH);
4049 }
4050 GETCHARINCTEST(c, eptr);
4051 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4052 c == CHAR_FF || c == CHAR_CR)
4053 == prop_fail_result)
4054 RRETURN(MATCH_NOMATCH);
4055 }
4056 break;
4057
4058 case PT_PXSPACE: /* POSIX space */
4059 for (i = 1; i <= min; i++)
4060 {
4061 if (eptr >= md->end_subject)
4062 {
4063 SCHECK_PARTIAL();
4064 RRETURN(MATCH_NOMATCH);
4065 }
4066 GETCHARINCTEST(c, eptr);
4067 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4068 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4069 == prop_fail_result)
4070 RRETURN(MATCH_NOMATCH);
4071 }
4072 break;
4073
4074 case PT_WORD:
4075 for (i = 1; i <= min; i++)
4076 {
4077 int category;
4078 if (eptr >= md->end_subject)
4079 {
4080 SCHECK_PARTIAL();
4081 RRETURN(MATCH_NOMATCH);
4082 }
4083 GETCHARINCTEST(c, eptr);
4084 category = UCD_CATEGORY(c);
4085 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4086 == prop_fail_result)
4087 RRETURN(MATCH_NOMATCH);
4088 }
4089 break;
4090
4091 /* This should not occur */
4092
4093 default:
4094 RRETURN(PCRE_ERROR_INTERNAL);
4095 }
4096 }
4097
4098 /* Match extended Unicode sequences. We will get here only if the
4099 support is in the binary; otherwise a compile-time error occurs. */
4100
4101 else if (ctype == OP_EXTUNI)
4102 {
4103 for (i = 1; i <= min; i++)
4104 {
4105 if (eptr >= md->end_subject)
4106 {
4107 SCHECK_PARTIAL();
4108 RRETURN(MATCH_NOMATCH);
4109 }
4110 GETCHARINCTEST(c, eptr);
4111 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4112 while (eptr < md->end_subject)
4113 {
4114 int len = 1;
4115 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4116 if (UCD_CATEGORY(c) != ucp_M) break;
4117 eptr += len;
4118 }
4119 }
4120 }
4121
4122 else
4123 #endif /* SUPPORT_UCP */
4124
4125 /* Handle all other cases when the coding is UTF-8 */
4126
4127 #ifdef SUPPORT_UTF
4128 if (utf) switch(ctype)
4129 {
4130 case OP_ANY:
4131 for (i = 1; i <= min; i++)
4132 {
4133 if (eptr >= md->end_subject)
4134 {
4135 SCHECK_PARTIAL();
4136 RRETURN(MATCH_NOMATCH);
4137 }
4138 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4139 eptr++;
4140 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4141 }
4142 break;
4143
4144 case OP_ALLANY:
4145 for (i = 1; i <= min; i++)
4146 {
4147 if (eptr >= md->end_subject)
4148 {
4149 SCHECK_PARTIAL();
4150 RRETURN(MATCH_NOMATCH);
4151 }
4152 eptr++;
4153 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4154 }
4155 break;
4156
4157 case OP_ANYBYTE:
4158 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4159 eptr += min;
4160 break;
4161
4162 case OP_ANYNL:
4163 for (i = 1; i <= min; i++)
4164 {
4165 if (eptr >= md->end_subject)
4166 {
4167 SCHECK_PARTIAL();
4168 RRETURN(MATCH_NOMATCH);
4169 }
4170 GETCHARINC(c, eptr);
4171 switch(c)
4172 {
4173 default: RRETURN(MATCH_NOMATCH);
4174
4175 case 0x000d:
4176 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4177 break;
4178
4179 case 0x000a:
4180 break;
4181
4182 case 0x000b:
4183 case 0x000c:
4184 case 0x0085:
4185 case 0x2028:
4186 case 0x2029:
4187 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4188 break;
4189 }
4190 }
4191 break;
4192
4193 case OP_NOT_HSPACE:
4194 for (i = 1; i <= min; i++)
4195 {
4196 if (eptr >= md->end_subject)
4197 {
4198 SCHECK_PARTIAL();
4199 RRETURN(MATCH_NOMATCH);
4200 }
4201 GETCHARINC(c, eptr);
4202 switch(c)
4203 {
4204 default: break;
4205 case 0x09: /* HT */
4206 case 0x20: /* SPACE */
4207 case 0xa0: /* NBSP */
4208 case 0x1680: /* OGHAM SPACE MARK */
4209 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4210 case 0x2000: /* EN QUAD */
4211 case 0x2001: /* EM QUAD */
4212 case 0x2002: /* EN SPACE */
4213 case 0x2003: /* EM SPACE */
4214 case 0x2004: /* THREE-PER-EM SPACE */
4215 case 0x2005: /* FOUR-PER-EM SPACE */
4216 case 0x2006: /* SIX-PER-EM SPACE */
4217 case 0x2007: /* FIGURE SPACE */
4218 case 0x2008: /* PUNCTUATION SPACE */
4219 case 0x2009: /* THIN SPACE */
4220 case 0x200A: /* HAIR SPACE */
4221 case 0x202f: /* NARROW NO-BREAK SPACE */
4222 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4223 case 0x3000: /* IDEOGRAPHIC SPACE */
4224 RRETURN(MATCH_NOMATCH);
4225 }
4226 }
4227 break;
4228
4229 case OP_HSPACE:
4230 for (i = 1; i <= min; i++)
4231 {
4232 if (eptr >= md->end_subject)
4233 {
4234 SCHECK_PARTIAL();
4235 RRETURN(MATCH_NOMATCH);
4236 }
4237 GETCHARINC(c, eptr);
4238 switch(c)
4239 {
4240 default: RRETURN(MATCH_NOMATCH);
4241 case 0x09: /* HT */
4242 case 0x20: /* SPACE */
4243 case 0xa0: /* NBSP */
4244 case 0x1680: /* OGHAM SPACE MARK */
4245 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4246 case 0x2000: /* EN QUAD */
4247 case 0x2001: /* EM QUAD */
4248 case 0x2002: /* EN SPACE */
4249 case 0x2003: /* EM SPACE */
4250 case 0x2004: /* THREE-PER-EM SPACE */
4251 case 0x2005: /* FOUR-PER-EM SPACE */
4252 case 0x2006: /* SIX-PER-EM SPACE */
4253 case 0x2007: /* FIGURE SPACE */
4254 case 0x2008: /* PUNCTUATION SPACE */
4255 case 0x2009: /* THIN SPACE */
4256 case 0x200A: /* HAIR SPACE */
4257 case 0x202f: /* NARROW NO-BREAK SPACE */
4258 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4259 case 0x3000: /* IDEOGRAPHIC SPACE */
4260 break;
4261 }
4262 }
4263 break;
4264
4265 case OP_NOT_VSPACE:
4266 for (i = 1; i <= min; i++)
4267 {
4268 if (eptr >= md->end_subject)
4269 {
4270 SCHECK_PARTIAL();
4271 RRETURN(MATCH_NOMATCH);
4272 }
4273 GETCHARINC(c, eptr);
4274 switch(c)
4275 {
4276 default: break;
4277 case 0x0a: /* LF */
4278 case 0x0b: /* VT */
4279 case 0x0c: /* FF */
4280 case 0x0d: /* CR */
4281 case 0x85: /* NEL */
4282 case 0x2028: /* LINE SEPARATOR */
4283 case 0x2029: /* PARAGRAPH SEPARATOR */
4284 RRETURN(MATCH_NOMATCH);
4285 }
4286 }
4287 break;
4288
4289 case OP_VSPACE:
4290 for (i = 1; i <= min; i++)
4291 {
4292 if (eptr >= md->end_subject)
4293 {
4294 SCHECK_PARTIAL();
4295 RRETURN(MATCH_NOMATCH);
4296 }
4297 GETCHARINC(c, eptr);
4298 switch(c)
4299 {
4300 default: RRETURN(MATCH_NOMATCH);
4301 case 0x0a: /* LF */
4302 case 0x0b: /* VT */
4303 case 0x0c: /* FF */
4304 case 0x0d: /* CR */
4305 case 0x85: /* NEL */
4306 case 0x2028: /* LINE SEPARATOR */
4307 case 0x2029: /* PARAGRAPH SEPARATOR */
4308 break;
4309 }
4310 }
4311 break;
4312
4313 case OP_NOT_DIGIT:
4314 for (i = 1; i <= min; i++)
4315 {
4316 if (eptr >= md->end_subject)
4317 {
4318 SCHECK_PARTIAL();
4319 RRETURN(MATCH_NOMATCH);
4320 }
4321 GETCHARINC(c, eptr);
4322 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4323 RRETURN(MATCH_NOMATCH);
4324 }
4325 break;
4326
4327 case OP_DIGIT:
4328 for (i = 1; i <= min; i++)
4329 {
4330 if (eptr >= md->end_subject)
4331 {
4332 SCHECK_PARTIAL();
4333 RRETURN(MATCH_NOMATCH);
4334 }
4335 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4336 RRETURN(MATCH_NOMATCH);
4337 /* No need to skip more bytes - we know it's a 1-byte character */
4338 }
4339 break;
4340
4341 case OP_NOT_WHITESPACE:
4342 for (i = 1; i <= min; i++)
4343 {
4344 if (eptr >= md->end_subject)
4345 {
4346 SCHECK_PARTIAL();
4347 RRETURN(MATCH_NOMATCH);
4348 }
4349 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4350 RRETURN(MATCH_NOMATCH);
4351 eptr++;
4352 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4353 }
4354 break;
4355
4356 case OP_WHITESPACE:
4357 for (i = 1; i <= min; i++)
4358 {
4359 if (eptr >= md->end_subject)
4360 {
4361 SCHECK_PARTIAL();
4362 RRETURN(MATCH_NOMATCH);
4363 }
4364 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4365 RRETURN(MATCH_NOMATCH);
4366 /* No need to skip more bytes - we know it's a 1-byte character */
4367 }
4368 break;
4369
4370 case OP_NOT_WORDCHAR:
4371 for (i = 1; i <= min; i++)
4372 {
4373 if (eptr >= md->end_subject)
4374 {
4375 SCHECK_PARTIAL();
4376 RRETURN(MATCH_NOMATCH);
4377 }
4378 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4379 RRETURN(MATCH_NOMATCH);
4380 eptr++;
4381 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4382 }
4383 break;
4384
4385 case OP_WORDCHAR:
4386 for (i = 1; i <= min; i++)
4387 {
4388 if (eptr >= md->end_subject)
4389 {
4390 SCHECK_PARTIAL();
4391 RRETURN(MATCH_NOMATCH);
4392 }
4393 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4394 RRETURN(MATCH_NOMATCH);
4395 /* No need to skip more bytes - we know it's a 1-byte character */
4396 }
4397 break;
4398
4399 default:
4400 RRETURN(PCRE_ERROR_INTERNAL);
4401 } /* End switch(ctype) */
4402
4403 else
4404 #endif /* SUPPORT_UTF */
4405
4406 /* Code for the non-UTF-8 case for minimum matching of operators other
4407 than OP_PROP and OP_NOTPROP. */
4408
4409 switch(ctype)
4410 {
4411 case OP_ANY:
4412 for (i = 1; i <= min; i++)
4413 {
4414 if (eptr >= md->end_subject)
4415 {
4416 SCHECK_PARTIAL();
4417 RRETURN(MATCH_NOMATCH);
4418 }
4419 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4420 eptr++;
4421 }
4422 break;
4423
4424 case OP_ALLANY:
4425 if (eptr > md->end_subject - min)
4426 {
4427 SCHECK_PARTIAL();
4428 RRETURN(MATCH_NOMATCH);
4429 }
4430 eptr += min;
4431 break;
4432
4433 case OP_ANYBYTE:
4434 if (eptr > md->end_subject - min)
4435 {
4436 SCHECK_PARTIAL();
4437 RRETURN(MATCH_NOMATCH);
4438 }
4439 eptr += min;
4440 break;
4441
4442 case OP_ANYNL:
4443 for (i = 1; i <= min; i++)
4444 {
4445 if (eptr >= md->end_subject)
4446 {
4447 SCHECK_PARTIAL();
4448 RRETURN(MATCH_NOMATCH);
4449 }
4450 switch(*eptr++)
4451 {
4452 default: RRETURN(MATCH_NOMATCH);
4453
4454 case 0x000d:
4455 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4456 break;
4457
4458 case 0x000a:
4459 break;
4460
4461 case 0x000b:
4462 case 0x000c:
4463 case 0x0085:
4464 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4465 break;
4466 }
4467 }
4468 break;
4469
4470 case OP_NOT_HSPACE:
4471 for (i = 1; i <= min; i++)
4472 {
4473 if (eptr >= md->end_subject)
4474 {
4475 SCHECK_PARTIAL();
4476 RRETURN(MATCH_NOMATCH);
4477 }
4478 switch(*eptr++)
4479 {
4480 default: break;
4481 case 0x09: /* HT */
4482 case 0x20: /* SPACE */
4483 case 0xa0: /* NBSP */
4484 RRETURN(MATCH_NOMATCH);
4485 }
4486 }
4487 break;
4488
4489 case OP_HSPACE:
4490 for (i = 1; i <= min; i++)
4491 {
4492 if (eptr >= md->end_subject)
4493 {
4494 SCHECK_PARTIAL();
4495 RRETURN(MATCH_NOMATCH);
4496 }
4497 switch(*eptr++)
4498 {
4499 default: RRETURN(MATCH_NOMATCH);
4500 case 0x09: /* HT */
4501 case 0x20: /* SPACE */
4502 case 0xa0: /* NBSP */
4503 break;
4504 }
4505 }
4506 break;
4507
4508 case OP_NOT_VSPACE:
4509 for (i = 1; i <= min; i++)
4510 {
4511 if (eptr >= md->end_subject)
4512 {
4513 SCHECK_PARTIAL();
4514 RRETURN(MATCH_NOMATCH);
4515 }
4516 switch(*eptr++)
4517 {
4518 default: break;
4519 case 0x0a: /* LF */
4520 case 0x0b: /* VT */
4521 case 0x0c: /* FF */
4522 case 0x0d: /* CR */
4523 case 0x85: /* NEL */
4524 RRETURN(MATCH_NOMATCH);
4525 }
4526 }
4527 break;
4528
4529 case OP_VSPACE:
4530 for (i = 1; i <= min; i++)
4531 {
4532 if (eptr >= md->end_subject)
4533 {
4534 SCHECK_PARTIAL();
4535 RRETURN(MATCH_NOMATCH);
4536 }
4537 switch(*eptr++)
4538 {
4539 default: RRETURN(MATCH_NOMATCH);
4540 case 0x0a: /* LF */
4541 case 0x0b: /* VT */
4542 case 0x0c: /* FF */
4543 case 0x0d: /* CR */
4544 case 0x85: /* NEL */
4545 break;
4546 }
4547 }
4548 break;
4549
4550 case OP_NOT_DIGIT:
4551 for (i = 1; i <= min; i++)
4552 {
4553 if (eptr >= md->end_subject)
4554 {
4555 SCHECK_PARTIAL();
4556 RRETURN(MATCH_NOMATCH);
4557 }
4558 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4559 }
4560 break;
4561
4562 case OP_DIGIT:
4563 for (i = 1; i <= min; i++)
4564 {
4565 if (eptr >= md->end_subject)
4566 {
4567 SCHECK_PARTIAL();
4568 RRETURN(MATCH_NOMATCH);
4569 }
4570 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4571 }
4572 break;
4573
4574 case OP_NOT_WHITESPACE:
4575 for (i = 1; i <= min; i++)
4576 {
4577 if (eptr >= md->end_subject)
4578 {
4579 SCHECK_PARTIAL();
4580 RRETURN(MATCH_NOMATCH);
4581 }
4582 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4583 }
4584 break;
4585
4586 case OP_WHITESPACE:
4587 for (i = 1; i <= min; i++)
4588 {
4589 if (eptr >= md->end_subject)
4590 {
4591 SCHECK_PARTIAL();
4592 RRETURN(MATCH_NOMATCH);
4593 }
4594 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4595 }
4596 break;
4597
4598 case OP_NOT_WORDCHAR:
4599 for (i = 1; i <= min; i++)
4600 {
4601 if (eptr >= md->end_subject)
4602 {
4603 SCHECK_PARTIAL();
4604 RRETURN(MATCH_NOMATCH);
4605 }
4606 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4607 RRETURN(MATCH_NOMATCH);
4608 }
4609 break;
4610
4611 case OP_WORDCHAR:
4612 for (i = 1; i <= min; i++)
4613 {
4614 if (eptr >= md->end_subject)
4615 {
4616 SCHECK_PARTIAL();
4617 RRETURN(MATCH_NOMATCH);
4618 }
4619 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4620 RRETURN(MATCH_NOMATCH);
4621 }
4622 break;
4623
4624 default:
4625 RRETURN(PCRE_ERROR_INTERNAL);
4626 }
4627 }
4628
4629 /* If min = max, continue at the same level without recursing */
4630
4631 if (min == max) continue;
4632
4633 /* If minimizing, we have to test the rest of the pattern before each
4634 subsequent match. Again, separate the UTF-8 case for speed, and also
4635 separate the UCP cases. */
4636
4637 if (minimize)
4638 {
4639 #ifdef SUPPORT_UCP
4640 if (prop_type >= 0)
4641 {
4642 switch(prop_type)
4643 {
4644 case PT_ANY:
4645 for (fi = min;; fi++)
4646 {
4647 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4649 if (fi >= max) RRETURN(MATCH_NOMATCH);
4650 if (eptr >= md->end_subject)
4651 {
4652 SCHECK_PARTIAL();
4653 RRETURN(MATCH_NOMATCH);
4654 }
4655 GETCHARINCTEST(c, eptr);
4656 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4657 }
4658 /* Control never gets here */
4659
4660 case PT_LAMP:
4661 for (fi = min;; fi++)
4662 {
4663 int chartype;
4664 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4665 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4666 if (fi >= max) RRETURN(MATCH_NOMATCH);
4667 if (eptr >= md->end_subject)
4668 {
4669 SCHECK_PARTIAL();
4670 RRETURN(MATCH_NOMATCH);
4671 }
4672 GETCHARINCTEST(c, eptr);
4673 chartype = UCD_CHARTYPE(c);
4674 if ((chartype == ucp_Lu ||
4675 chartype == ucp_Ll ||
4676 chartype == ucp_Lt) == prop_fail_result)
4677 RRETURN(MATCH_NOMATCH);
4678 }
4679 /* Control never gets here */
4680
4681 case PT_GC:
4682 for (fi = min;; fi++)
4683 {
4684 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4685 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4686 if (fi >= max) RRETURN(MATCH_NOMATCH);
4687 if (eptr >= md->end_subject)
4688 {
4689 SCHECK_PARTIAL();
4690 RRETURN(MATCH_NOMATCH);
4691 }
4692 GETCHARINCTEST(c, eptr);
4693 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4694 RRETURN(MATCH_NOMATCH);
4695 }
4696 /* Control never gets here */
4697
4698 case PT_PC:
4699 for (fi = min;; fi++)
4700 {
4701 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4702 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4703 if (fi >= max) RRETURN(MATCH_NOMATCH);
4704 if (eptr >= md->end_subject)
4705 {
4706 SCHECK_PARTIAL();
4707 RRETURN(MATCH_NOMATCH);
4708 }
4709 GETCHARINCTEST(c, eptr);
4710 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4711 RRETURN(MATCH_NOMATCH);
4712 }
4713 /* Control never gets here */
4714
4715 case PT_SC:
4716 for (fi = min;; fi++)
4717 {
4718 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4720 if (fi >= max) RRETURN(MATCH_NOMATCH);
4721 if (eptr >= md->end_subject)
4722 {
4723 SCHECK_PARTIAL();
4724 RRETURN(MATCH_NOMATCH);
4725 }
4726 GETCHARINCTEST(c, eptr);
4727 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4728 RRETURN(MATCH_NOMATCH);
4729 }
4730 /* Control never gets here */
4731
4732 case PT_ALNUM:
4733 for (fi = min;; fi++)
4734 {
4735 int category;
4736 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4737 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4738 if (fi >= max) RRETURN(MATCH_NOMATCH);
4739 if (eptr >= md->end_subject)
4740 {
4741 SCHECK_PARTIAL();
4742 RRETURN(MATCH_NOMATCH);
4743 }
4744 GETCHARINCTEST(c, eptr);
4745 category = UCD_CATEGORY(c);
4746 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4747 RRETURN(MATCH_NOMATCH);
4748 }
4749 /* Control never gets here */
4750
4751 case PT_SPACE: /* Perl space */
4752 for (fi = min;; fi++)
4753 {
4754 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4755 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4756 if (fi >= max) RRETURN(MATCH_NOMATCH);
4757 if (eptr >= md->end_subject)
4758 {
4759 SCHECK_PARTIAL();
4760 RRETURN(MATCH_NOMATCH);
4761 }
4762 GETCHARINCTEST(c, eptr);
4763 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4764 c == CHAR_FF || c == CHAR_CR)
4765 == prop_fail_result)
4766 RRETURN(MATCH_NOMATCH);
4767 }
4768 /* Control never gets here */
4769
4770 case PT_PXSPACE: /* POSIX space */
4771 for (fi = min;; fi++)
4772 {
4773 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4774 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4775 if (fi >= max) RRETURN(MATCH_NOMATCH);
4776 if (eptr >= md->end_subject)
4777 {
4778 SCHECK_PARTIAL();
4779 RRETURN(MATCH_NOMATCH);
4780 }
4781 GETCHARINCTEST(c, eptr);
4782 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4783 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4784 == prop_fail_result)
4785 RRETURN(MATCH_NOMATCH);
4786 }
4787 /* Control never gets here */
4788
4789 case PT_WORD:
4790 for (fi = min;; fi++)
4791 {
4792 int category;
4793 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4795 if (fi >= max) RRETURN(MATCH_NOMATCH);
4796 if (eptr >= md->end_subject)
4797 {
4798 SCHECK_PARTIAL();
4799 RRETURN(MATCH_NOMATCH);
4800 }
4801 GETCHARINCTEST(c, eptr);
4802 category = UCD_CATEGORY(c);
4803 if ((category == ucp_L ||
4804 category == ucp_N ||
4805 c == CHAR_UNDERSCORE)
4806 == prop_fail_result)
4807 RRETURN(MATCH_NOMATCH);
4808 }
4809 /* Control never gets here */
4810
4811 /* This should never occur */
4812
4813 default:
4814 RRETURN(PCRE_ERROR_INTERNAL);
4815 }
4816 }
4817
4818 /* Match extended Unicode sequences. We will get here only if the
4819 support is in the binary; otherwise a compile-time error occurs. */
4820
4821 else if (ctype == OP_EXTUNI)
4822 {
4823 for (fi = min;; fi++)
4824 {
4825 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4827 if (fi >= max) RRETURN(MATCH_NOMATCH);
4828 if (eptr >= md->end_subject)
4829 {
4830 SCHECK_PARTIAL();
4831 RRETURN(MATCH_NOMATCH);
4832 }
4833 GETCHARINCTEST(c, eptr);
4834 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4835 while (eptr < md->end_subject)
4836 {
4837 int len = 1;
4838 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4839 if (UCD_CATEGORY(c) != ucp_M) break;
4840 eptr += len;
4841 }
4842 }
4843 }
4844 else
4845 #endif /* SUPPORT_UCP */
4846
4847 #ifdef SUPPORT_UTF
4848 if (utf)
4849 {
4850 for (fi = min;; fi++)
4851 {
4852 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4853 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4854 if (fi >= max) RRETURN(MATCH_NOMATCH);
4855 if (eptr >= md->end_subject)
4856 {
4857 SCHECK_PARTIAL();
4858 RRETURN(MATCH_NOMATCH);
4859 }
4860 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4861 RRETURN(MATCH_NOMATCH);
4862 GETCHARINC(c, eptr);
4863 switch(ctype)
4864 {
4865 case OP_ANY: /* This is the non-NL case */
4866 case OP_ALLANY:
4867 case OP_ANYBYTE:
4868 break;
4869
4870 case OP_ANYNL:
4871 switch(c)
4872 {
4873 default: RRETURN(MATCH_NOMATCH);
4874 case 0x000d:
4875 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4876 break;
4877 case 0x000a:
4878 break;
4879
4880 case 0x000b:
4881 case 0x000c:
4882 case 0x0085:
4883 case 0x2028:
4884 case 0x2029:
4885 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4886 break;
4887 }
4888 break;
4889
4890 case OP_NOT_HSPACE:
4891 switch(c)
4892 {
4893 default: break;
4894 case 0x09: /* HT */
4895 case 0x20: /* SPACE */
4896 case 0xa0: /* NBSP */
4897 case 0x1680: /* OGHAM SPACE MARK */
4898 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4899 case 0x2000: /* EN QUAD */
4900 case 0x2001: /* EM QUAD */
4901 case 0x2002: /* EN SPACE */
4902 case 0x2003: /* EM SPACE */
4903 case 0x2004: /* THREE-PER-EM SPACE */
4904 case 0x2005: /* FOUR-PER-EM SPACE */
4905 case 0x2006: /* SIX-PER-EM SPACE */
4906 case 0x2007: /* FIGURE SPACE */
4907 case 0x2008: /* PUNCTUATION SPACE */
4908 case 0x2009: /* THIN SPACE */
4909 case 0x200A: /* HAIR SPACE */
4910 case 0x202f: /* NARROW NO-BREAK SPACE */
4911 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4912 case 0x3000: /* IDEOGRAPHIC SPACE */
4913 RRETURN(MATCH_NOMATCH);
4914 }
4915 break;
4916
4917 case OP_HSPACE:
4918 switch(c)
4919 {
4920 default: RRETURN(MATCH_NOMATCH);
4921 case 0x09: /* HT */
4922 case 0x20: /* SPACE */
4923 case 0xa0: /* NBSP */
4924 case 0x1680: /* OGHAM SPACE MARK */
4925 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4926 case 0x2000: /* EN QUAD */
4927 case 0x2001: /* EM QUAD */
4928 case 0x2002: /* EN SPACE */
4929 case 0x2003: /* EM SPACE */
4930 case 0x2004: /* THREE-PER-EM SPACE */
4931 case 0x2005: /* FOUR-PER-EM SPACE */
4932 case 0x2006: /* SIX-PER-EM SPACE */
4933 case 0x2007: /* FIGURE SPACE */
4934 case 0x2008: /* PUNCTUATION SPACE */
4935 case 0x2009: /* THIN SPACE */
4936 case 0x200A: /* HAIR SPACE */
4937 case 0x202f: /* NARROW NO-BREAK SPACE */
4938 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4939 case 0x3000: /* IDEOGRAPHIC SPACE */
4940 break;
4941 }
4942 break;
4943
4944 case OP_NOT_VSPACE:
4945 switch(c)
4946 {
4947 default: break;
4948 case 0x0a: /* LF */
4949 case 0x0b: /* VT */
4950 case 0x0c: /* FF */
4951 case 0x0d: /* CR */
4952 case 0x85: /* NEL */
4953 case 0x2028: /* LINE SEPARATOR */
4954 case 0x2029: /* PARAGRAPH SEPARATOR */
4955 RRETURN(MATCH_NOMATCH);
4956 }
4957 break;
4958
4959 case OP_VSPACE:
4960 switch(c)
4961 {
4962 default: RRETURN(MATCH_NOMATCH);
4963 case 0x0a: /* LF */
4964 case 0x0b: /* VT */
4965 case 0x0c: /* FF */
4966 case 0x0d: /* CR */
4967 case 0x85: /* NEL */
4968 case 0x2028: /* LINE SEPARATOR */
4969 case 0x2029: /* PARAGRAPH SEPARATOR */
4970 break;
4971 }
4972 break;
4973
4974 case OP_NOT_DIGIT:
4975 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4976 RRETURN(MATCH_NOMATCH);
4977 break;
4978
4979 case OP_DIGIT:
4980 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4981 RRETURN(MATCH_NOMATCH);
4982 break;
4983
4984 case OP_NOT_WHITESPACE:
4985 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4986 RRETURN(MATCH_NOMATCH);
4987 break;
4988
4989 case OP_WHITESPACE:
4990 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4991 RRETURN(MATCH_NOMATCH);
4992 break;
4993
4994 case OP_NOT_WORDCHAR:
4995 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4996 RRETURN(MATCH_NOMATCH);
4997 break;
4998
4999 case OP_WORDCHAR:
5000 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5001 RRETURN(MATCH_NOMATCH);
5002 break;
5003
5004 default:
5005 RRETURN(PCRE_ERROR_INTERNAL);
5006 }
5007 }
5008 }
5009 else
5010 #endif
5011 /* Not UTF mode */
5012 {
5013 for (fi = min;; fi++)
5014 {
5015 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5017 if (fi >= max) RRETURN(MATCH_NOMATCH);
5018 if (eptr >= md->end_subject)
5019 {
5020 SCHECK_PARTIAL();
5021 RRETURN(MATCH_NOMATCH);
5022 }
5023 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5024 RRETURN(MATCH_NOMATCH);
5025 c = *eptr++;
5026 switch(ctype)
5027 {
5028 case OP_ANY: /* This is the non-NL case */
5029 case OP_ALLANY:
5030 case OP_ANYBYTE:
5031 break;
5032
5033 case OP_ANYNL:
5034 switch(c)
5035 {
5036 default: RRETURN(MATCH_NOMATCH);
5037 case 0x000d:
5038 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5039 break;
5040
5041 case 0x000a:
5042 break;
5043
5044 case 0x000b:
5045 case 0x000c:
5046 case 0x0085:
5047 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5048 break;
5049 }
5050 break;
5051
5052 case OP_NOT_HSPACE:
5053 switch(c)
5054 {
5055 default: break;
5056 case 0x09: /* HT */
5057 case 0x20: /* SPACE */
5058 case 0xa0: /* NBSP */
5059 RRETURN(MATCH_NOMATCH);
5060 }
5061 break;
5062
5063 case OP_HSPACE:
5064 switch(c)
5065 {
5066 default: RRETURN(MATCH_NOMATCH);
5067 case 0x09: /* HT */
5068 case 0x20: /* SPACE */
5069 case 0xa0: /* NBSP */
5070 break;
5071 }
5072 break;
5073
5074 case OP_NOT_VSPACE:
5075 switch(c)
5076 {
5077 default: break;
5078 case 0x0a: /* LF */
5079 case 0x0b: /* VT */
5080 case 0x0c: /* FF */
5081 case 0x0d: /* CR */
5082 case 0x85: /* NEL */
5083 RRETURN(MATCH_NOMATCH);
5084 }
5085 break;
5086
5087 case OP_VSPACE:
5088 switch(c)
5089 {
5090 default: RRETURN(MATCH_NOMATCH);
5091 case 0x0a: /* LF */
5092 case 0x0b: /* VT */
5093 case 0x0c: /* FF */
5094 case 0x0d: /* CR */
5095 case 0x85: /* NEL */
5096 break;
5097 }
5098 break;
5099
5100 case OP_NOT_DIGIT:
5101 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5102 break;
5103
5104 case OP_DIGIT:
5105 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5106 break;
5107
5108 case OP_NOT_WHITESPACE:
5109 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5110 break;
5111
5112 case OP_WHITESPACE:
5113 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5114 break;
5115
5116 case OP_NOT_WORDCHAR:
5117 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5118 break;
5119
5120 case OP_WORDCHAR:
5121 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5122 break;
5123
5124 default:
5125 RRETURN(PCRE_ERROR_INTERNAL);
5126 }
5127 }
5128 }
5129 /* Control never gets here */
5130 }
5131
5132 /* If maximizing, it is worth using inline code for speed, doing the type
5133 test once at the start (i.e. keep it out of the loop). Again, keep the
5134 UTF-8 and UCP stuff separate. */
5135
5136 else
5137 {
5138 pp = eptr; /* Remember where we started */
5139
5140 #ifdef SUPPORT_UCP
5141 if (prop_type >= 0)
5142 {
5143 switch(prop_type)
5144 {
5145 case PT_ANY:
5146 for (i = min; i < max; i++)
5147 {
5148 int len = 1;
5149 if (eptr >= md->end_subject)
5150 {
5151 SCHECK_PARTIAL();
5152 break;
5153 }
5154 GETCHARLENTEST(c, eptr, len);
5155 if (prop_fail_result) break;
5156 eptr+= len;
5157 }
5158 break;
5159
5160 case PT_LAMP:
5161 for (i = min; i < max; i++)
5162 {
5163 int chartype;
5164 int len = 1;
5165 if (eptr >= md->end_subject)
5166 {
5167 SCHECK_PARTIAL();
5168 break;
5169 }
5170 GETCHARLENTEST(c, eptr, len);
5171 chartype = UCD_CHARTYPE(c);
5172 if ((chartype == ucp_Lu ||
5173 chartype == ucp_Ll ||
5174 chartype == ucp_Lt) == prop_fail_result)
5175 break;
5176 eptr+= len;
5177 }
5178 break;
5179
5180 case PT_GC:
5181 for (i = min; i < max; i++)
5182 {
5183 int len = 1;
5184 if (eptr >= md->end_subject)
5185 {
5186 SCHECK_PARTIAL();
5187 break;
5188 }
5189 GETCHARLENTEST(c, eptr, len);
5190 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5191 eptr+= len;
5192 }
5193 break;
5194
5195 case PT_PC:
5196 for (i = min; i < max; i++)
5197 {
5198 int len = 1;
5199 if (eptr >= md->end_subject)
5200 {
5201 SCHECK_PARTIAL();
5202 break;
5203 }
5204 GETCHARLENTEST(c, eptr, len);
5205 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5206 eptr+= len;
5207 }
5208 break;
5209
5210 case PT_SC:
5211 for (i = min; i < max; i++)
5212 {
5213 int len = 1;
5214 if (eptr >= md->end_subject)
5215 {
5216 SCHECK_PARTIAL();
5217 break;
5218 }
5219 GETCHARLENTEST(c, eptr, len);
5220 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5221 eptr+= len;
5222 }
5223 break;
5224
5225 case PT_ALNUM:
5226 for (i = min; i < max; i++)
5227 {
5228 int category;
5229 int len = 1;
5230 if (eptr >= md->end_subject)
5231 {
5232 SCHECK_PARTIAL();
5233 break;
5234 }
5235 GETCHARLENTEST(c, eptr, len);
5236 category = UCD_CATEGORY(c);
5237 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5238 break;
5239 eptr+= len;
5240 }
5241 break;
5242
5243 case PT_SPACE: /* Perl space */
5244 for (i = min; i < max; i++)
5245 {
5246 int len = 1;
5247 if (eptr >= md->end_subject)
5248 {
5249 SCHECK_PARTIAL();
5250 break;
5251 }
5252 GETCHARLENTEST(c, eptr, len);
5253 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5254 c == CHAR_FF || c == CHAR_CR)
5255 == prop_fail_result)
5256 break;
5257 eptr+= len;
5258 }
5259 break;
5260
5261 case PT_PXSPACE: /* POSIX space */
5262 for (i = min; i < max; i++)
5263 {
5264 int len = 1;
5265 if (eptr >= md->end_subject)
5266 {
5267 SCHECK_PARTIAL();
5268 break;
5269 }
5270 GETCHARLENTEST(c, eptr, len);
5271 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5272 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5273 == prop_fail_result)
5274 break;
5275 eptr+= len;
5276 }
5277 break;
5278
5279 case PT_WORD:
5280 for (i = min; i < max; i++)
5281 {
5282 int category;
5283 int len = 1;
5284 if (eptr >= md->end_subject)
5285 {
5286 SCHECK_PARTIAL();
5287 break;
5288 }
5289 GETCHARLENTEST(c, eptr, len);
5290 category = UCD_CATEGORY(c);
5291 if ((category == ucp_L || category == ucp_N ||
5292 c == CHAR_UNDERSCORE) == prop_fail_result)
5293 break;
5294 eptr+= len;
5295 }
5296 break;
5297
5298 default:
5299 RRETURN(PCRE_ERROR_INTERNAL);
5300 }
5301
5302 /* eptr is now past the end of the maximum run */
5303
5304 if (possessive) continue;
5305 for(;;)
5306 {
5307 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5308 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5309 if (eptr-- == pp) break; /* Stop if tried at original pos */
5310 if (utf) BACKCHAR(eptr);
5311 }
5312 }
5313
5314 /* Match extended Unicode sequences. We will get here only if the
5315 support is in the binary; otherwise a compile-time error occurs. */
5316
5317 else if (ctype == OP_EXTUNI)
5318 {
5319 for (i = min; i < max; i++)
5320 {
5321 int len = 1;
5322 if (eptr >= md->end_subject)
5323 {
5324 SCHECK_PARTIAL();
5325 break;
5326 }
5327 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5328 if (UCD_CATEGORY(c) == ucp_M) break;
5329 eptr += len;
5330 while (eptr < md->end_subject)
5331 {
5332 len = 1;
5333 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5334 if (UCD_CATEGORY(c) != ucp_M) break;
5335 eptr += len;
5336 }
5337 }
5338
5339 /* eptr is now past the end of the maximum run */
5340
5341 if (possessive) continue;
5342
5343 for(;;)
5344 {
5345 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5346 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5347 if (eptr-- == pp) break; /* Stop if tried at original pos */
5348 for (;;) /* Move back over one extended */
5349 {
5350 if (!utf) c = *eptr; else
5351 {
5352 BACKCHAR(eptr);
5353 GETCHAR(c, eptr);
5354 }
5355 if (UCD_CATEGORY(c) != ucp_M) break;
5356 eptr--;
5357 }
5358 }
5359 }
5360
5361 else
5362 #endif /* SUPPORT_UCP */
5363
5364 #ifdef SUPPORT_UTF
5365 if (utf)
5366 {
5367 switch(ctype)
5368 {
5369 case OP_ANY:
5370 if (max < INT_MAX)
5371 {
5372 for (i = min; i < max; i++)
5373 {
5374 if (eptr >= md->end_subject)
5375 {
5376 SCHECK_PARTIAL();
5377 break;
5378 }
5379 if (IS_NEWLINE(eptr)) break;
5380 eptr++;
5381 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5382 }
5383 }
5384
5385 /* Handle unlimited UTF-8 repeat */
5386
5387 else
5388 {
5389 for (i = min; i < max; i++)
5390 {
5391 if (eptr >= md->end_subject)
5392 {
5393 SCHECK_PARTIAL();
5394 break;
5395 }
5396 if (IS_NEWLINE(eptr)) break;
5397 eptr++;
5398 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5399 }
5400 }
5401 break;
5402
5403 case OP_ALLANY:
5404 if (max < INT_MAX)
5405 {
5406 for (i = min; i < max; i++)
5407 {
5408 if (eptr >= md->end_subject)
5409 {
5410 SCHECK_PARTIAL();
5411 break;
5412 }
5413 eptr++;
5414 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5415 }
5416 }
5417 else
5418 {
5419 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5420 SCHECK_PARTIAL();
5421 }
5422 break;
5423
5424 /* The byte case is the same as non-UTF8 */
5425
5426 case OP_ANYBYTE:
5427 c = max - min;
5428 if (c > (unsigned int)(md->end_subject - eptr))
5429 {
5430 eptr = md->end_subject;
5431 SCHECK_PARTIAL();
5432 }
5433 else eptr += c;
5434 break;
5435
5436 case OP_ANYNL:
5437 for (i = min; i < max; i++)
5438 {
5439 int len = 1;
5440 if (eptr >= md->end_subject)
5441 {
5442 SCHECK_PARTIAL();
5443 break;
5444 }
5445 GETCHARLEN(c, eptr, len);
5446 if (c == 0x000d)
5447 {
5448 if (++eptr >= md->end_subject) break;
5449 if (*eptr == 0x000a) eptr++;
5450 }
5451 else
5452 {
5453 if (c != 0x000a &&
5454 (md->bsr_anycrlf ||
5455 (c != 0x000b && c != 0x000c &&
5456 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5457 break;
5458 eptr += len;
5459 }
5460 }
5461 break;
5462
5463 case OP_NOT_HSPACE:
5464 case OP_HSPACE:
5465 for (i = min; i < max; i++)
5466 {
5467 BOOL gotspace;
5468 int len = 1;
5469 if (eptr >= md->end_subject)
5470 {
5471 SCHECK_PARTIAL();
5472 break;
5473 }
5474 GETCHARLEN(c, eptr, len);
5475 switch(c)
5476 {
5477 default: gotspace = FALSE; break;
5478 case 0x09: /* HT */
5479 case 0x20: /* SPACE */
5480 case 0xa0: /* NBSP */
5481 case 0x1680: /* OGHAM SPACE MARK */
5482 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5483 case 0x2000: /* EN QUAD */
5484 case 0x2001: /* EM QUAD */
5485 case 0x2002: /* EN SPACE */
5486 case 0x2003: /* EM SPACE */
5487 case 0x2004: /* THREE-PER-EM SPACE */
5488 case 0x2005: /* FOUR-PER-EM SPACE */
5489 case 0x2006: /* SIX-PER-EM SPACE */
5490 case 0x2007: /* FIGURE SPACE */
5491 case 0x2008: /* PUNCTUATION SPACE */
5492 case 0x2009: /* THIN SPACE */
5493 case 0x200A: /* HAIR SPACE */
5494 case 0x202f: /* NARROW NO-BREAK SPACE */
5495 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5496 case 0x3000: /* IDEOGRAPHIC SPACE */
5497 gotspace = TRUE;
5498 break;
5499 }
5500 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5501 eptr += len;
5502 }
5503 break;
5504
5505 case OP_NOT_VSPACE:
5506 case OP_VSPACE:
5507 for (i = min; i < max; i++)
5508 {
5509 BOOL gotspace;
5510 int len = 1;
5511 if (eptr >= md->end_subject)
5512 {
5513 SCHECK_PARTIAL();
5514 break;
5515 }
5516 GETCHARLEN(c, eptr, len);
5517 switch(c)
5518 {
5519 default: gotspace = FALSE; break;
5520 case 0x0a: /* LF */
5521 case 0x0b: /* VT */
5522 case 0x0c: /* FF */
5523 case 0x0d: /* CR */
5524 case 0x85: /* NEL */
5525 case 0x2028: /* LINE SEPARATOR */
5526 case 0x2029: /* PARAGRAPH SEPARATOR */
5527 gotspace = TRUE;
5528 break;
5529 }
5530 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5531 eptr += len;
5532 }
5533 break;
5534
5535 case OP_NOT_DIGIT:
5536 for (i = min; i < max; i++)
5537 {
5538 int len = 1;
5539 if (eptr >= md->end_subject)
5540 {
5541 SCHECK_PARTIAL();
5542 break;
5543 }
5544 GETCHARLEN(c, eptr, len);
5545 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5546 eptr+= len;
5547 }
5548 break;
5549
5550 case OP_DIGIT:
5551 for (i = min; i < max; i++)
5552 {
5553 int len = 1;
5554 if (eptr >= md->end_subject)
5555 {
5556 SCHECK_PARTIAL();
5557 break;
5558 }
5559 GETCHARLEN(c, eptr, len);
5560 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5561 eptr+= len;
5562 }
5563 break;
5564
5565 case OP_NOT_WHITESPACE:
5566 for (i = min; i < max; i++)
5567 {
5568 int len = 1;
5569 if (eptr >= md->end_subject)
5570 {
5571 SCHECK_PARTIAL();
5572 break;
5573 }
5574 GETCHARLEN(c, eptr, len);
5575 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5576 eptr+= len;
5577 }
5578 break;
5579
5580 case OP_WHITESPACE:
5581 for (i = min; i < max; i++)
5582 {
5583 int len = 1;
5584 if (eptr >= md->end_subject)
5585 {
5586 SCHECK_PARTIAL();
5587 break;
5588 }
5589 GETCHARLEN(c, eptr, len);
5590 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5591 eptr+= len;
5592 }
5593 break;
5594
5595 case OP_NOT_WORDCHAR:
5596 for (i = min; i < max; i++)
5597 {
5598 int len = 1;
5599 if (eptr >= md->end_subject)
5600 {
5601 SCHECK_PARTIAL();
5602 break;
5603 }
5604 GETCHARLEN(c, eptr, len);
5605 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5606 eptr+= len;
5607 }
5608 break;
5609
5610 case OP_WORDCHAR:
5611 for (i = min; i < max; i++)
5612 {
5613 int len = 1;
5614 if (eptr >= md->end_subject)
5615 {
5616 SCHECK_PARTIAL();
5617 break;
5618 }
5619 GETCHARLEN(c, eptr, len);
5620 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5621 eptr+= len;
5622 }
5623 break;
5624
5625 default:
5626 RRETURN(PCRE_ERROR_INTERNAL);
5627 }
5628
5629 /* eptr is now past the end of the maximum run. If possessive, we are
5630 done (no backing up). Otherwise, match at this position; anything other
5631 than no match is immediately returned. For nomatch, back up one
5632 character, unless we are matching \R and the last thing matched was
5633 \r\n, in which case, back up two bytes. */
5634
5635 if (possessive) continue;
5636 for(;;)
5637 {
5638 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5640 if (eptr-- == pp) break; /* Stop if tried at original pos */
5641 BACKCHAR(eptr);
5642 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5643 eptr[-1] == '\r') eptr--;
5644 }
5645 }
5646 else
5647 #endif /* SUPPORT_UTF */
5648 /* Not UTF mode */
5649 {
5650 switch(ctype)
5651 {
5652 case OP_ANY:
5653 for (i = min; i < max; i++)
5654 {
5655 if (eptr >= md->end_subject)
5656 {
5657 SCHECK_PARTIAL();
5658 break;
5659 }
5660 if (IS_NEWLINE(eptr)) break;
5661 eptr++;
5662 }
5663 break;
5664
5665 case OP_ALLANY:
5666 case OP_ANYBYTE:
5667 c = max - min;
5668 if (c > (unsigned int)(md->end_subject - eptr))
5669 {
5670 eptr = md->end_subject;
5671 SCHECK_PARTIAL();
5672 }
5673 else eptr += c;
5674 break;
5675
5676 case OP_ANYNL:
5677 for (i = min; i < max; i++)
5678 {
5679 if (eptr >= md->end_subject)
5680 {
5681 SCHECK_PARTIAL();
5682 break;
5683 }
5684 c = *eptr;
5685 if (c == 0x000d)
5686 {
5687 if (++eptr >= md->end_subject) break;
5688 if (*eptr == 0x000a) eptr++;
5689 }
5690 else
5691 {
5692 if (c != 0x000a &&
5693 (md->bsr_anycrlf ||
5694 (c != 0x000b && c != 0x000c && c != 0x0085)))
5695 break;
5696 eptr++;
5697 }
5698 }
5699 break;
5700
5701 case OP_NOT_HSPACE:
5702 for (i = min; i < max; i++)
5703 {
5704 if (eptr >= md->end_subject)
5705 {
5706 SCHECK_PARTIAL();
5707 break;
5708 }
5709 c = *eptr;
5710 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5711 eptr++;
5712 }
5713 break;
5714
5715 case OP_HSPACE:
5716 for (i = min; i < max; i++)
5717 {
5718 if (eptr >= md->end_subject)
5719 {
5720 SCHECK_PARTIAL();
5721 break;
5722 }
5723 c = *eptr;
5724 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5725 eptr++;
5726 }
5727 break;
5728
5729 case OP_NOT_VSPACE:
5730 for (i = min; i < max; i++)
5731 {
5732 if (eptr >= md->end_subject)
5733 {
5734 SCHECK_PARTIAL();
5735 break;
5736 }
5737 c = *eptr;
5738 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5739 break;
5740 eptr++;
5741 }
5742 break;
5743
5744 case OP_VSPACE:
5745 for (i = min; i < max; i++)
5746 {
5747 if (eptr >= md->end_subject)
5748 {
5749 SCHECK_PARTIAL();
5750 break;
5751 }
5752 c = *eptr;
5753 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5754 break;
5755 eptr++;
5756 }
5757 break;
5758
5759 case OP_NOT_DIGIT:
5760 for (i = min; i < max; i++)
5761 {
5762 if (eptr >= md->end_subject)
5763 {
5764 SCHECK_PARTIAL();
5765 break;
5766 }
5767 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5768 eptr++;
5769 }
5770 break;
5771
5772 case OP_DIGIT:
5773 for (i = min; i < max; i++)
5774 {
5775 if (eptr >= md->end_subject)
5776 {
5777 SCHECK_PARTIAL();
5778 break;
5779 }
5780 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5781 eptr++;
5782 }
5783 break;
5784
5785 case OP_NOT_WHITESPACE:
5786 for (i = min; i < max; i++)
5787 {
5788 if (eptr >= md->end_subject)
5789 {
5790 SCHECK_PARTIAL();
5791 break;
5792 }
5793 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5794 eptr++;
5795 }
5796 break;
5797
5798 case OP_WHITESPACE:
5799 for (i = min; i < max; i++)
5800 {
5801 if (eptr >= md->end_subject)
5802 {
5803 SCHECK_PARTIAL();
5804 break;
5805 }
5806 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5807 eptr++;
5808 }
5809 break;
5810
5811 case OP_NOT_WORDCHAR:
5812 for (i = min; i < max; i++)
5813 {
5814 if (eptr >= md->end_subject)
5815 {
5816 SCHECK_PARTIAL();
5817 break;
5818 }
5819 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5820 eptr++;
5821 }
5822 break;
5823
5824 case OP_WORDCHAR:
5825 for (i = min; i < max; i++)
5826 {
5827 if (eptr >= md->end_subject)
5828 {
5829 SCHECK_PARTIAL();
5830 break;
5831 }
5832 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5833 eptr++;
5834 }
5835 break;
5836
5837 default:
5838 RRETURN(PCRE_ERROR_INTERNAL);
5839 }
5840
5841 /* eptr is now past the end of the maximum run. If possessive, we are
5842 done (no backing up). Otherwise, match at this position; anything other
5843 than no match is immediately returned. For nomatch, back up one
5844 character (byte), unless we are matching \R and the last thing matched
5845 was \r\n, in which case, back up two bytes. */
5846
5847 if (possessive) continue;
5848 while (eptr >= pp)
5849 {
5850 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5851 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5852 eptr--;
5853 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5854 eptr[-1] == '\r') eptr--;
5855 }
5856 }
5857
5858 /* Get here if we can't make it match with any permitted repetitions */
5859
5860 RRETURN(MATCH_NOMATCH);
5861 }
5862 /* Control never gets here */
5863
5864 /* There's been some horrible disaster. Arrival here can only mean there is
5865 something seriously wrong in the code above or the OP_xxx definitions. */
5866
5867 default:
5868 DPRINTF(("Unknown opcode %d\n", *ecode));
5869 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5870 }
5871
5872 /* Do not stick any code in here without much thought; it is assumed
5873 that "continue" in the code above comes out to here to repeat the main
5874 loop. */
5875
5876 } /* End of main loop */
5877 /* Control never reaches here */
5878
5879
5880 /* When compiling to use the heap rather than the stack for recursive calls to
5881 match(), the RRETURN() macro jumps here. The number that is saved in
5882 frame->Xwhere indicates which label we actually want to return to. */
5883
5884 #ifdef NO_RECURSE
5885 #define LBL(val) case val: goto L_RM##val;
5886 HEAP_RETURN:
5887 switch (frame->Xwhere)
5888 {
5889 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5890 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5891 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5892 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5893 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
5894 LBL(65) LBL(66)
5895 #ifdef SUPPORT_UTF
5896 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5897 LBL(32) LBL(34) LBL(42) LBL(46)
5898 #ifdef SUPPORT_UCP
5899 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5900 LBL(59) LBL(60) LBL(61) LBL(62)
5901 #endif /* SUPPORT_UCP */
5902 #endif /* SUPPORT_UTF */
5903 default:
5904 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5905 return PCRE_ERROR_INTERNAL;
5906 }
5907 #undef LBL
5908 #endif /* NO_RECURSE */
5909 }
5910
5911
5912 /***************************************************************************
5913 ****************************************************************************
5914 RECURSION IN THE match() FUNCTION
5915
5916 Undefine all the macros that were defined above to handle this. */
5917
5918 #ifdef NO_RECURSE /* these macros must not leak: pcre_exec() below reuses length, flags and pp as ordinary names */
5919 #undef eptr
5920 #undef ecode
5921 #undef mstart
5922 #undef offset_top
5923 #undef eptrb
5924 #undef flags
5925
5926 #undef callpat
5927 #undef charptr
5928 #undef data
5929 #undef next
5930 #undef pp
5931 #undef prev
5932 #undef saved_eptr
5933
5934 #undef new_recursive
5935
5936 #undef cur_is_word
5937 #undef condition
5938 #undef prev_is_word
5939
5940 #undef ctype
5941 #undef length
5942 #undef max
5943 #undef min
5944 #undef number
5945 #undef offset
5946 #undef op
5947 #undef save_capture_last
5948 #undef save_offset1
5949 #undef save_offset2
5950 #undef save_offset3
5951 #undef stacksave
5952
5953 #undef newptrb
5954
5955 #endif /* NO_RECURSE */
5956
5957 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally */
5958
5959 #undef fc
5960 #undef fi
5961
5962 /***************************************************************************
5963 ***************************************************************************/
5964
5965
5966
5967 /*************************************************
5968 * Execute a Regular Expression *
5969 *************************************************/
5970
5971 /* This function applies a compiled re to a subject string and picks out
5972 portions of the string if it matches. Two elements in the vector are set for
5973 each substring: the offsets to the start and end of the substring.
5974
5975 Arguments:
5976 argument_re points to the compiled expression
5977 extra_data points to extra data or is NULL
5978 subject points to the subject string
5979 length length of subject string (may contain binary zeros)
5980 start_offset where to start in the subject string
5981 options option bits
5982 offsets points to a vector of ints to be filled in with offsets
5983 offsetcount the number of elements in the vector
5984
5985 Returns: > 0 => success; value is the number of offset pairs filled in
5986 = 0 => success, but offsets is not big enough
5987 -1 => failed to match
5988 < -1 => some kind of unexpected problem
5989 */
5990
5991 #ifdef COMPILE_PCRE8
5992 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5993 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5994 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5995 int offsetcount)
5996 #else
5997 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5998 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
5999 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6000 int offsetcount)
6001 #endif
6002 {
6003 int rc, ocount, arg_offset_max;
6004 int newline;
6005 BOOL using_temporary_offsets = FALSE;
6006 BOOL anchored;
6007 BOOL startline;
6008 BOOL firstline;
6009 BOOL utf;
6010 BOOL has_first_char = FALSE;
6011 BOOL has_req_char = FALSE;
6012 pcre_uchar first_char = 0;
6013 pcre_uchar first_char2 = 0;
6014 pcre_uchar req_char = 0;
6015 pcre_uchar req_char2 = 0;
6016 match_data match_block;
6017 match_data *md = &match_block;
6018 const pcre_uint8 *tables;
6019 const pcre_uint8 *start_bits = NULL;
6020 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6021 PCRE_PUCHAR end_subject;
6022 PCRE_PUCHAR start_partial = NULL;
6023 PCRE_PUCHAR req_char_ptr = start_match - 1;
6024
6025 pcre_study_data internal_study;
6026 const pcre_study_data *study;
6027
6028 real_pcre internal_re;
6029 const real_pcre *external_re = (const real_pcre *)argument_re;
6030 const real_pcre *re = external_re;
6031
6032 /* Plausibility checks */
6033
6034 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6035 if (re == NULL || subject == NULL ||
6036 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6037 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6038 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6039
6040 /* These two settings are used in the code for checking a UTF-8 string that
6041 follows immediately afterwards. Other values in the md block are used only
6042 during "normal" pcre_exec() processing, not when the JIT support is in use,
6043 so they are set up later. */
6044
6045 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6046 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6047 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6048 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6049
6050 /* Check a UTF-8 string if required. Pass back the character offset and error
6051 code for an invalid string if a results vector is available. */
6052
6053 #ifdef SUPPORT_UTF
6054 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6055 {
6056 int erroroffset;
6057 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6058 if (errorcode != 0)
6059 {
6060 if (offsetcount >= 2)
6061 {
6062 offsets[0] = erroroffset;
6063 offsets[1] = errorcode;
6064 }
6065 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6066 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6067 }
6068
6069 /* Check that a start_offset points to the start of a UTF character. */
6070 if (start_offset > 0 && start_offset < length &&
6071 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6072 return PCRE_ERROR_BADUTF8_OFFSET;
6073 }
6074 #endif
6075
6076 /* If the pattern was successfully studied with JIT support, run the JIT
6077 executable instead of the rest of this function. Most options must be set at
6078 compile time for the JIT code to be usable. Fall back to the normal code path if
6079 an unsupported flag is set. In particular, JIT does not support partial
6080 matching. */
6081
6082 #ifdef SUPPORT_JIT
6083 if (extra_data != NULL
6084 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6085 && extra_data->executable_jit != NULL
6086 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6087 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6088 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6089 return PRIV(jit_exec)(re, extra_data->executable_jit,
6090 (const pcre_uchar *)subject, length, start_offset, options,
6091 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6092 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6093 #endif
6094
6095 /* Carry on with non-JIT matching. This information is for finding all the
6096 numbers associated with a given name, for condition testing. */
6097
6098 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6099 md->name_count = re->name_count;
6100 md->name_entry_size = re->name_entry_size;
6101
6102 /* Fish out the optional data from the extra_data structure, first setting
6103 the default values. */
6104
6105 study = NULL;
6106 md->match_limit = MATCH_LIMIT;
6107 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6108 md->callout_data = NULL;
6109
6110 /* The table pointer is always in native byte order. */
6111
6112 tables = external_re->tables;
6113
6114 if (extra_data != NULL)
6115 {
6116 register unsigned int flags = extra_data->flags;
6117 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6118 study = (const pcre_study_data *)extra_data->study_data;
6119 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6120 md->match_limit = extra_data->match_limit;
6121 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6122 md->match_limit_recursion = extra_data->match_limit_recursion;
6123 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6124 md->callout_data = extra_data->callout_data;
6125 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6126 }
6127
6128 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6129 is a feature that makes it possible to save compiled regex and re-use them
6130 in other programs later. */
6131
6132 if (tables == NULL) tables = PRIV(default_tables);
6133
6134 /* Check that the first field in the block is the magic number. If it is not,
6135 test for a regex that was compiled on a host of opposite endianness. If this is
6136 the case, flipped values are put in internal_re and internal_study if there was
6137 study data too. */
6138
6139 if (re->magic_number != MAGIC_NUMBER)
6140 {
6141 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
6142 if (re == NULL) return PCRE_ERROR_BADMAGIC;
6143 if (study != NULL) study = &internal_study;
6144 }
6145 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6146
6147 /* Set up other data */
6148
6149 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6150 startline = (re->flags & PCRE_STARTLINE) != 0;
6151 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6152
6153 /* The code starts after the real_pcre block and the capture name table. */
6154
6155 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6156 re->name_count * re->name_entry_size;
6157
6158 md->start_subject = (PCRE_PUCHAR)subject;
6159 md->start_offset = start_offset;
6160 md->end_subject = md->start_subject + length;
6161 end_subject = md->end_subject;
6162
6163 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6164 md->use_ucp = (re->options & PCRE_UCP) != 0;
6165 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6166 md->ignore_skip_arg = FALSE;
6167
6168 /* Some options are unpacked into BOOL variables in the hope that testing
6169 them will be faster than individual option bits. */
6170
6171 md->notbol = (options & PCRE_NOTBOL) != 0;
6172 md->noteol = (options & PCRE_NOTEOL) != 0;
6173 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6174 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6175
6176 md->hitend = FALSE;
6177 md->mark = md->nomatch_mark = NULL; /* In case never set */
6178
6179 md->recursive = NULL; /* No recursion at top level */
6180 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6181
6182 md->lcc = tables + lcc_offset;
6183 md->fcc = tables + fcc_offset;
6184 md->ctypes = tables + ctypes_offset;
6185
6186 /* Handle different \R options. */
6187
6188 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6189 {
6190 case 0:
6191 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6192 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6193 else
6194 #ifdef BSR_ANYCRLF
6195 md->bsr_anycrlf = TRUE;
6196 #else
6197 md->bsr_anycrlf = FALSE;
6198 #endif
6199 break;
6200
6201 case PCRE_BSR_ANYCRLF:
6202 md->bsr_anycrlf = TRUE;
6203 break;
6204
6205 case PCRE_BSR_UNICODE:
6206 md->bsr_anycrlf = FALSE;
6207 break;
6208
6209 default: return PCRE_ERROR_BADNEWLINE;
6210 }
6211
6212 /* Handle different types of newline. The three bits give eight cases. If
6213 nothing is set at run time, whatever was used at compile time applies. */
6214
6215 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6216 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6217 {
6218 case 0: newline = NEWLINE; break; /* Compile-time default */
6219 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6220 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6221 case PCRE_NEWLINE_CR+
6222 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6223 case PCRE_NEWLINE_ANY: newline = -1; break;
6224 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6225 default: return PCRE_ERROR_BADNEWLINE;
6226 }
6227
6228 if (newline == -2)
6229 {
6230 md->nltype = NLTYPE_ANYCRLF;
6231 }
6232 else if (newline < 0)
6233 {
6234 md->nltype = NLTYPE_ANY;
6235 }
6236 else
6237 {
6238 md->nltype = NLTYPE_FIXED;
6239 if (newline > 255)
6240 {
6241 md->nllen = 2;
6242 md->nl[0] = (newline >> 8) & 255;
6243 md->nl[1] = newline & 255;
6244 }
6245 else
6246 {
6247 md->nllen = 1;
6248 md->nl[0] = newline;
6249 }
6250 }
6251
6252 /* Partial matching was originally supported only for a restricted set of
6253 regexes; from release 8.00 there are no restrictions, but the bits are still
6254 defined (though never set). So there's no harm in leaving this code. */
6255
6256 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6257 return PCRE_ERROR_BADPARTIAL;
6258
6259 /* If the expression has got more back references than the offsets supplied can
6260 hold, we get a temporary chunk of working store to use during the matching.
6261 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6262 of 3. */
6263
6264 ocount = offsetcount - (offsetcount % 3);
6265 arg_offset_max = (2*ocount)/3;
6266
6267 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6268 {
6269 ocount = re->top_backref * 3 + 3;
6270 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6271 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6272 using_temporary_offsets = TRUE;
6273 DPRINTF(("Got memory to hold back references\n"));
6274 }
6275 else md->offset_vector = offsets;
6276
6277 md->offset_end = ocount;
6278 md->offset_max = (2*ocount)/3;
6279 md->offset_overflow = FALSE;
6280 md->capture_last = -1;
6281
6282 /* Reset the working variable associated with each extraction. These should
6283 never be used unless previously set, but they get saved and restored, and so we
6284 initialize them to avoid reading uninitialized locations. Also, unset the
6285 offsets for the matched string. This is really just for tidiness with callouts,
6286 in case they inspect these fields. */
6287
6288 if (md->offset_vector != NULL)
6289 {
6290 register int *iptr = md->offset_vector + ocount;
6291 register int *iend = iptr - re->top_bracket;
6292 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6293 while (--iptr >= iend) *iptr = -1;
6294 md->offset_vector[0] = md->offset_vector[1] = -1;
6295 }
6296
6297 /* Set up the first character to match, if available. The first_char value is
6298 never set for an anchored regular expression, but the anchoring may be forced
6299 at run time, so we have to test for anchoring. The first char may be unset for
6300 an unanchored pattern, of course. If there's no first char and the pattern was
6301 studied, there may be a bitmap of possible first characters. */
6302
6303 if (!anchored)
6304 {
6305 if ((re->flags & PCRE_FIRSTSET) != 0)
6306 {
6307 has_first_char = TRUE;
6308 first_char = first_char2 = re->first_char;
6309 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6310 {
6311 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6312 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6313 if (utf && first_char > 127)
6314 first_char2 = UCD_OTHERCASE(first_char);
6315 #endif
6316 }
6317 }
6318 else
6319 if (!startline && study != NULL &&
6320 (study->flags & PCRE_STUDY_MAPPED) != 0)
6321 start_bits = study->start_bits;
6322 }
6323
6324 /* For anchored or unanchored matches, there may be a "last known required
6325 character" set. */
6326
6327 if ((re->flags & PCRE_REQCHSET) != 0)
6328 {
6329 has_req_char = TRUE;
6330 req_char = req_char2 = re->req_char;
6331 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6332 {
6333 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6334 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6335 if (utf && req_char > 127)
6336 req_char2 = UCD_OTHERCASE(req_char);
6337 #endif
6338 }
6339 }
6340
6341
6342 /* ==========================================================================*/
6343
6344 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6345 the loop runs just once. */
6346
6347 for(;;)
6348 {
6349 PCRE_PUCHAR save_end_subject = end_subject;
6350 PCRE_PUCHAR new_start_match;
6351
6352 /* If firstline is TRUE, the start of the match is constrained to the first
6353 line of a multiline string. That is, the match must be before or at the first
6354 newline. Implement this by temporarily adjusting end_subject so that we stop
6355 scanning at a newline. If the match fails at the newline, later code breaks
6356 this loop. */
6357
6358 if (firstline)
6359 {
6360 PCRE_PUCHAR t = start_match;
6361 #ifdef SUPPORT_UTF
6362 if (utf)
6363 {
6364 while (t < md->end_subject && !IS_NEWLINE(t))
6365 {
6366 t++;
6367 ACROSSCHAR(t < end_subject, *t, t++);
6368 }
6369 }
6370 else
6371 #endif
6372 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6373 end_subject = t;
6374 }
6375
6376 /* There are some optimizations that avoid running the match if a known
6377 starting point is not found, or if a known later character is not present.
6378 However, there is an option that disables these, for testing and for ensuring
6379 that all callouts do actually occur. The option can be set in the regex by
6380 (*NO_START_OPT) or passed in match-time options. */
6381
6382 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6383 {
6384 /* Advance to a unique first char if there is one. */
6385
6386 if (has_first_char)
6387 {
6388 if (first_char != first_char2)
6389 while (start_match < end_subject &&
6390 *start_match != first_char && *start_match != first_char2)
6391 start_match++;
6392 else
6393 while (start_match < end_subject && *start_match != first_char)
6394 start_match++;
6395 }
6396
6397 /* Or to just after a linebreak for a multiline match */
6398
6399 else if (startline)
6400 {
6401 if (start_match > md->start_subject + start_offset)
6402 {
6403 #ifdef SUPPORT_UTF
6404 if (utf)
6405 {
6406 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6407 {
6408 start_match++;
6409 ACROSSCHAR(start_match < end_subject, *start_match,
6410 start_match++);
6411 }
6412 }
6413 else
6414 #endif
6415 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6416 start_match++;
6417
6418 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6419 and we are now at a LF, advance the match position by one more character.
6420 */
6421
6422 if (start_match[-1] == CHAR_CR &&
6423 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6424 start_match < end_subject &&
6425 *start_match == CHAR_NL)
6426 start_match++;
6427 }
6428 }
6429
6430 /* Or to a non-unique first byte after study */
6431
6432 else if (start_bits != NULL)
6433 {
6434 while (start_match < end_subject)
6435 {
6436 register unsigned int c = *start_match;
6437 #ifndef COMPILE_PCRE8
6438 if (c > 255) c = 255;
6439 #endif
6440 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6441 {
6442 start_match++;
6443 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6444 /* In non 8-bit mode, the iteration will stop for
6445 characters > 255 at the beginning or not stop at all. */
6446 if (utf)
6447 ACROSSCHAR(start_match < end_subject, *start_match,
6448 start_match++);
6449 #endif
6450 }
6451 else break;
6452 }
6453 }
6454 } /* Starting optimizations */
6455
6456 /* Restore fudged end_subject */
6457
6458 end_subject = save_end_subject;
6459
6460 /* The following two optimizations are disabled for partial matching or if
6461 disabling is explicitly requested. */
6462
6463 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6464 {
6465 /* If the pattern was studied, a minimum subject length may be set. This is
6466 a lower bound; there may be no string of exactly that length that matches the
6467 pattern. Although the value is, strictly, in characters, we treat it as
6468 code units to avoid spending too much time in this optimization. */
6469
6470 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6471 (pcre_uint32)(end_subject - start_match) < study->minlength)
6472 {
6473 rc = MATCH_NOMATCH;
6474 break;
6475 }
6476
6477 /* If req_char is set, we know that that character must appear in the
6478 subject for the match to succeed. If the first character is set, req_char
6479 must be later in the subject; otherwise the test starts at the match point.
6480 This optimization can save a huge amount of backtracking in patterns with
6481 nested unlimited repeats that aren't going to match. Writing separate code
6482 for cased/caseless versions makes it go faster, as does using an
6483 autoincrement and backing off on a match.
6484
6485 HOWEVER: when the subject string is very, very long, searching to its end
6486 can take a long time, and give bad performance on quite ordinary patterns.
6487 This showed up when somebody was matching something like /^\d+C/ on a
6488 32-megabyte string... so we don't do this when the string is sufficiently
6489 long. */
6490
6491 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6492 {
6493 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6494
6495 /* We don't need to repeat the search if we haven't yet reached the
6496 place we found it at last time. */
6497
6498 if (p > req_char_ptr)
6499 {
6500 if (req_char != req_char2)
6501 {
6502 while (p < end_subject)
6503 {
6504 register int pp = *p++;
6505 if (pp == req_char || pp == req_char2) { p--; break; }
6506 }
6507 }
6508 else
6509 {
6510 while (p < end_subject)
6511 {
6512 if (*p++ == req_char) { p--; break; }
6513 }
6514 }
6515
6516 /* If we can't find the required character, break the matching loop,
6517 forcing a match failure. */
6518
6519 if (p >= end_subject)
6520 {
6521 rc = MATCH_NOMATCH;
6522 break;
6523 }
6524
6525 /* If we have found the required character, save the point where we
6526 found it, so that we don't search again next time round the loop if
6527 the start hasn't passed this character yet. */
6528
6529 req_char_ptr = p;
6530 }
6531 }
6532 }
6533
6534 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6535 printf(">>>> Match against: ");
6536 pchars(start_match, end_subject - start_match, TRUE, md);
6537 printf("\n");
6538 #endif
6539
6540 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6541 first starting point for which a partial match was found. */
6542
6543 md->start_match_ptr = start_match;
6544 md->start_used_ptr = start_match;
6545 md->match_call_count = 0;
6546 md->match_function_type = 0;
6547 md->end_offset_top = 0;
6548 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6549 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6550
6551 switch(rc)
6552 {
6553 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6554 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6555 entirely. The only way we can do that is to re-do the match at the same
6556 point, with a flag to force SKIP with an argument to be ignored. Just
6557 treating this case as NOMATCH does not work because it does not check other
6558 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6559
6560 case MATCH_SKIP_ARG:
6561 new_start_match = start_match;
6562 md->ignore_skip_arg = TRUE;
6563 break;
6564
6565 /* SKIP passes back the next starting point explicitly, but if it is the
6566 same as the match we have just done, treat it as NOMATCH. */
6567
6568 case MATCH_SKIP:
6569 if (md->start_match_ptr != start_match)
6570 {
6571 new_start_match = md->start_match_ptr;
6572 break;
6573 }
6574 /* Fall through */
6575
6576 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6577 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6578
6579 case MATCH_NOMATCH:
6580 case MATCH_PRUNE:
6581 case MATCH_THEN:
6582 md->ignore_skip_arg = FALSE;
6583 new_start_match = start_match + 1;
6584 #ifdef SUPPORT_UTF
6585 if (utf)
6586 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6587 new_start_match++);
6588 #endif
6589 break;
6590
6591 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6592
6593 case MATCH_COMMIT:
6594 rc = MATCH_NOMATCH;
6595 goto ENDLOOP;
6596
6597 /* Any other return is either a match, or some kind of error. */
6598
6599 default:
6600 goto ENDLOOP;
6601 }
6602
6603 /* Control reaches here for the various types of "no match at this point"
6604 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6605
6606 rc = MATCH_NOMATCH;
6607
6608 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6609 newline in the subject (though it may continue over the newline). Therefore,
6610 if we have just failed to match, starting at a newline, do not continue. */
6611
6612 if (firstline && IS_NEWLINE(start_match)) break;
6613
6614 /* Advance to new matching position */
6615
6616 start_match = new_start_match;
6617
6618 /* Break the loop if the pattern is anchored or if we have passed the end of
6619 the subject. */
6620
6621 if (anchored || start_match > end_subject) break;
6622
6623 /* If we have just passed a CR and we are now at a LF, and the pattern does
6624 not contain any explicit matches for \r or \n, and the newline option is CRLF
6625 or ANY or ANYCRLF, advance the match position by one more character. */
6626
6627 if (start_match[-1] == CHAR_CR &&
6628 start_match < end_subject &&
6629 *start_match == CHAR_NL &&
6630 (re->flags & PCRE_HASCRORLF) == 0 &&
6631 (md->nltype == NLTYPE_ANY ||
6632 md->nltype == NLTYPE_ANYCRLF ||
6633 md->nllen == 2))
6634 start_match++;
6635
6636 md->mark = NULL; /* Reset for start of next match attempt */
6637 } /* End of for(;;) "bumpalong" loop */
6638
6639 /* ==========================================================================*/
6640
6641 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6642 conditions is true:
6643
6644 (1) The pattern is anchored or the match was failed by (*COMMIT);
6645
6646 (2) We are past the end of the subject;
6647
6648 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6649 this option requests that a match occur at or before the first newline in
6650 the subject.
6651
6652 When we have a match and the offset vector is big enough to deal with any
6653 backreferences, captured substring offsets will already be set up. In the case
6654 where we had to get some local store to hold offsets for backreference
6655 processing, copy those that we can. In this case there need not be overflow if
6656 certain parts of the pattern were not used, even though there are more
6657 capturing parentheses than vector slots. */
6658
6659 ENDLOOP:
6660
6661 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6662 {
6663 if (using_temporary_offsets)
6664 {
6665 if (arg_offset_max >= 4)
6666 {
6667 memcpy(offsets + 2, md->offset_vector + 2,
6668 (arg_offset_max - 2) * sizeof(int));
6669 DPRINTF(("Copied offsets from temporary memory\n"));
6670 }
6671 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6672 DPRINTF(("Freeing temporary memory\n"));
6673 (pcre_free)(md->offset_vector);
6674 }
6675
6676 /* Set the return code to the number of captured strings, or 0 if there were
6677 too many to fit into the vector. */
6678
6679 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6680 0 : md->end_offset_top/2;
6681
6682 /* If there is space in the offset vector, set any unused pairs at the end of
6683 the pattern to -1 for backwards compatibility. It is documented that this
6684 happens. In earlier versions, the whole set of potential capturing offsets
6685 was set to -1 each time round the loop, but this is handled differently now.
6686 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6687 those at the end that need unsetting here. We can't just unset them all at
6688 the start of the whole thing because they may get set in one branch that is
6689 not the final matching branch. */
6690
6691 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6692 {
6693 register int *iptr, *iend;
6694 int resetcount = 2 + re->top_bracket * 2;
6695 if (resetcount > offsetcount) resetcount = ocount;
6696 iptr = offsets + md->end_offset_top;
6697 iend = offsets + resetcount;
6698 while (iptr < iend) *iptr++ = -1;
6699 }
6700
6701 /* If there is space, set up the whole thing as substring 0. The value of
6702 md->start_match_ptr might be modified if \K was encountered on the successful
6703 matching path. */
6704
6705 if (offsetcount < 2) rc = 0; else
6706 {
6707 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6708 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6709 }
6710
6711 /* Return MARK data if requested */
6712
6713 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6714 *(extra_data->mark) = (unsigned char *)(md->mark);
6715 DPRINTF((">>>> returning %d\n", rc));
6716 return rc;
6717 }
6718
6719 /* Control gets here if there has been an error, or if the overall match
6720 attempt has failed at all permitted starting positions. */
6721
6722 if (using_temporary_offsets)
6723 {
6724 DPRINTF(("Freeing temporary memory\n"));
6725 (pcre_free)(md->offset_vector);
6726 }
6727
6728 /* For anything other than nomatch or partial match, just return the code. */
6729
6730 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6731 {
6732 DPRINTF((">>>> error: returning %d\n", rc));
6733 return rc;
6734 }
6735
6736 /* Handle partial matches - disable any mark data */
6737
6738 if (start_partial != NULL)
6739 {
6740 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6741 md->mark = NULL;
6742 if (offsetcount > 1)
6743 {
6744 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
6745 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
6746 }
6747 rc = PCRE_ERROR_PARTIAL;
6748 }