/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 211 - (show annotations)
Thu Aug 9 09:52:43 2007 UTC (7 years, 11 months ago) by ph10
File MIME type: text/plain
File size: 148898 byte(s)
Error occurred while calculating annotation data.
Update UTF-8 validity check and documentation.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include <config.h>
47 #endif
48
/* NOTE(review): the next three names are defined ahead of the inclusion of
pcre_internal.h, so they are presumably consumed by macros in that header -
confirm there. */
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols. match() below uses min and
max as the names of its own variables (or, under NO_RECURSE, as macros that map
onto heapframe fields). */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1 /* Matched */
69 #define MATCH_NOMATCH 0 /* Failed to match */
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. Each is returned by the handler
for the corresponding opcode when the rest of the pattern fails to match. */
73
74 #define MATCH_COMMIT (-999) /* From OP_COMMIT */
75 #define MATCH_PRUNE (-998) /* From OP_PRUNE */
76 #define MATCH_SKIP (-997) /* From OP_SKIP; md->start_match_ptr is set first */
77 #define MATCH_THEN (-996) /* From OP_THEN */
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. This value is also
the dimension of the stacksave[] array in match(). */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
NOTE(review): the six entries presumably correspond to *, *?, +, +?, ?, ?? in
opcode order - confirm against the code that indexes these tables. */
86
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if the requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caselesss case for speed */
162
163 if ((ims & PCRE_CASELESS) != 0)
164 {
165 while (length-- > 0)
166 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167 }
168 else
169 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170
171 return TRUE;
172 }
173
174
175
176 /***************************************************************************
177 ****************************************************************************
178 RECURSION IN THE match() FUNCTION
179
180 The match() function is highly recursive, though not every recursive call
181 increases the recursive depth. Nevertheless, some regular expressions can cause
182 it to recurse to a great depth. I was writing for Unix, so I just let it call
183 itself recursively. This uses the stack for saving everything that has to be
184 saved for a recursive call. On Unix, the stack can be large, and this works
185 fine.
186
187 It turns out that on some non-Unix-like systems there are problems with
188 programs that use a lot of stack. (This despite the fact that every last chip
189 has oodles of memory these days, and techniques for extending the stack have
190 been known for decades.) So....
191
192 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193 calls by keeping local variables that need to be preserved in blocks of memory
194 obtained from malloc() instead of on the stack. Macros are used to
195 achieve this so that the actual code doesn't look very different to what it
196 always used to.
197
198 The original heap-recursive code used longjmp(). However, it seems that this
199 can be very slow on some operating systems. Following a suggestion from Stan
200 Switzer, the use of longjmp() has been abolished, at the cost of having to
201 provide a unique number for each call to RMATCH. There is no way of generating
202 a sequence of numbers at compile time in C. I have given them names, to make
203 them stand out more clearly.
204
205 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 tests. Furthermore, not using longjmp() means that local dynamic variables
208 don't have indeterminate values; this has meant that the frame size can be
209 reduced because the result can be "passed back" by straight setting of the
210 variable instead of being passed in the frame.
211 ****************************************************************************
212 ***************************************************************************/
213
214
215 /* Numbers for RMATCH calls. Each RMATCH call site passes one of these as
its final ("rw") argument. The NO_RECURSE version of RMATCH stores the value in
frame->Xwhere and pastes it into a label name (L_RMn) that follows the jump to
HEAP_RECURSE; the recursive version of RMATCH does not use it.

NOTE(review): inside match(), the RMATCH calls in the OP_SKIP and OP_THEN cases
both pass RM53. Under NO_RECURSE that would define the label L_RM53 twice in
the same function - confirm, and give OP_THEN a number of its own if so. */
216
217 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
218 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
219 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
220 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
221 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
222 RM51, RM52, RM53 };
223
224
225 /* These versions of the macros use the stack, as normal. There are debugging
226 versions and production versions. Note that the "rw" argument of RMATCH isn't
227 actually used in this definition.

RMATCH calls match() and leaves the result in rrc. Besides its own arguments
it passes on mstart and rdepth+1, so rrc, mstart and rdepth must all be in
scope wherever it is used. RRETURN is a plain return; the debugging version
mentions "ra" twice, so it is evaluated twice there. */
228
229 #ifndef NO_RECURSE
230 #define REGISTER register
231
232 #ifdef DEBUG
233 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
234 { \
235 printf("match() called in line %d\n", __LINE__); \
236 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
237 printf("to line %d\n", __LINE__); \
238 }
239 #define RRETURN(ra) \
240 { \
241 printf("match() returned %d from line %d ", ra, __LINE__); \
242 return ra; \
243 }
244 #else
245 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
246 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
247 #define RRETURN(ra) return ra
248 #endif
249
250 #else
251
252
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH simulates a recursive call to match(). It gets a new frame from
pcre_stack_malloc(), records in the current frame (Xwhere) which call site is
to be resumed, copies the "arguments" into the new frame, chains the new frame
to the current one, and jumps to HEAP_RECURSE. Execution resumes at the L_<rw>
label, with the result of the "call" in rrc.

If no memory can be obtained for the new frame, the current invocation gives up
with PCRE_ERROR_NOMEMORY by way of RRETURN. That releases the current frame and
passes the error to the calling frame (or returns it from match() when already
at the top level), so nothing is ever written through a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) { RRETURN(PCRE_ERROR_NOMEMORY); }\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). The current frame is released and
its predecessor becomes current. If there is a predecessor, the value is put in
rrc and control goes to HEAP_RETURN (defined later in match(); presumably it
dispatches on Xwhere to the right L_<rw> label). Otherwise this was the top
level frame and the value is returned to the real caller. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
291
292
293 /* Structure for remembering the local variables in a private frame. It is
used only when NO_RECURSE is defined: one frame exists for each simulated
invocation of match(), and inside match() the argument and local variable names
are macros that expand to the X-prefixed fields of the current frame. */
294
295 typedef struct heapframe {
296 struct heapframe *Xprevframe; /* Calling frame; NULL marks the top level */
297
298 /* Function arguments that may change. These mirror the parameters of
match(), apart from md, which is not stored in the frame.
NOTE(review): Xims is a signed long int, whereas the ims parameter of match()
and Xoriginal_ims below are unsigned long int - confirm that this is intended. */
299
300 const uschar *Xeptr;
301 const uschar *Xecode;
302 const uschar *Xmstart;
303 int Xoffset_top;
304 long int Xims;
305 eptrblock *Xeptrb;
306 int Xflags;
307 unsigned int Xrdepth; /* RMATCH sets this to the caller's Xrdepth + 1 */
308
309 /* Function local variables. These are the locals of match() that have to be
preserved across RMATCH() calls. */
310
311 const uschar *Xcallpat;
312 const uschar *Xcharptr; /* The charptr macro exists only with SUPPORT_UTF8 */
313 const uschar *Xdata;
314 const uschar *Xnext;
315 const uschar *Xpp;
316 const uschar *Xprev;
317 const uschar *Xsaved_eptr;
318
319 recursion_info Xnew_recursive;
320
321 BOOL Xcur_is_word;
322 BOOL Xcondition;
323 BOOL Xprev_is_word;
324
325 unsigned long int Xoriginal_ims; /* ims as it was on entry to match() */
326
327 #ifdef SUPPORT_UCP
328 int Xprop_type;
329 int Xprop_value;
330 int Xprop_fail_result;
331 int Xprop_category;
332 int Xprop_chartype;
333 int Xprop_script;
334 int Xoclength;
335 uschar Xocchars[8];
336 #endif
337
338 int Xctype;
339 unsigned int Xfc; /* fc and fi are separate fields here; in the */
340 int Xfi; /* recursive build they are aliases for c and i */
341 int Xlength;
342 int Xmax;
343 int Xmin;
344 int Xnumber;
345 int Xoffset;
346 int Xop;
347 int Xsave_capture_last;
348 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
349 int Xstacksave[REC_STACK_SAVE_MAX];
350
351 eptrblock Xnewptrb; /* Chained onto eptrb when match_cbegroup is set */
352
353 /* Where to jump back to: the RMn number of the RMATCH call site that is
waiting for a result, set by RMATCH just before it switches frames. */
354
355 int Xwhere;
356
357 } heapframe;
358
359 #endif
360
361
362 /***************************************************************************
363 ***************************************************************************/
364
365
366
367 /*************************************************
368 * Match from current position *
369 *************************************************/
370
371 /* This function is called recursively in many circumstances. Whenever it
372 returns a negative (error) response, the outer incarnation must also return the
373 same response.
374
375 Performance note: It might be tempting to extract commonly used fields from the
376 md structure (e.g. utf8, end_subject) into individual variables to improve
377 performance. Tests using gcc on a SPARC disproved this; in the first case, it
378 made performance worse.
379
380 Arguments:
381 eptr pointer to current character in subject
382 ecode pointer to current position in compiled code
383 mstart pointer to the current match start position (can be modified
384 by encountering \K)
385 offset_top current top pointer
386 md pointer to "static" info for the match
387 ims current /i, /m, and /s options
388 eptrb pointer to chain of blocks containing eptr at start of
389 brackets - for testing for empty matches
390 flags can contain
391 match_condassert - this is an assertion condition
392 match_cbegroup - this is the start of an unlimited repeat
393 group that can match an empty string
394 rdepth the recursion depth
395
396 Returns: MATCH_MATCH if matched ) these values are >= 0
397 MATCH_NOMATCH if failed to match )
398 a negative PCRE_ERROR_xxx value if aborted by an error condition
399 (e.g. stopped by repeated call or recursion limit)
400 */
401
402 static int
403 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
404 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
405 int flags, unsigned int rdepth)
406 {
407 /* These variables do not need to be preserved over recursion in this function,
408 so they can be ordinary variables in all cases. Mark some of them with
409 "register" because they are used a lot in loops. */
410
411 register int rrc; /* Returns from recursive calls */
412 register int i; /* Used for loops not involving calls to RMATCH() */
413 register unsigned int c; /* Character values not kept over RMATCH() calls */
414 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
415
416 BOOL minimize, possessive; /* Quantifier options */
417
418 /* When recursion is not being used, all "local" variables that have to be
419 preserved over calls to RMATCH() are part of a "frame" which is obtained from
420 heap storage. Set up the top-level frame here; others are obtained from the
421 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
422
423 #ifdef NO_RECURSE
424 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
425 frame->Xprevframe = NULL; /* Marks the top level */
426
427 /* Copy in the original argument variables */
428
429 frame->Xeptr = eptr;
430 frame->Xecode = ecode;
431 frame->Xmstart = mstart;
432 frame->Xoffset_top = offset_top;
433 frame->Xims = ims;
434 frame->Xeptrb = eptrb;
435 frame->Xflags = flags;
436 frame->Xrdepth = rdepth;
437
438 /* This is where control jumps back to to effect "recursion" */
439
440 HEAP_RECURSE:
441
442 /* Macros make the argument variables come from the current frame */
443
444 #define eptr frame->Xeptr
445 #define ecode frame->Xecode
446 #define mstart frame->Xmstart
447 #define offset_top frame->Xoffset_top
448 #define ims frame->Xims
449 #define eptrb frame->Xeptrb
450 #define flags frame->Xflags
451 #define rdepth frame->Xrdepth
452
453 /* Ditto for the local variables */
454
455 #ifdef SUPPORT_UTF8
456 #define charptr frame->Xcharptr
457 #endif
458 #define callpat frame->Xcallpat
459 #define data frame->Xdata
460 #define next frame->Xnext
461 #define pp frame->Xpp
462 #define prev frame->Xprev
463 #define saved_eptr frame->Xsaved_eptr
464
465 #define new_recursive frame->Xnew_recursive
466
467 #define cur_is_word frame->Xcur_is_word
468 #define condition frame->Xcondition
469 #define prev_is_word frame->Xprev_is_word
470
471 #define original_ims frame->Xoriginal_ims
472
473 #ifdef SUPPORT_UCP
474 #define prop_type frame->Xprop_type
475 #define prop_value frame->Xprop_value
476 #define prop_fail_result frame->Xprop_fail_result
477 #define prop_category frame->Xprop_category
478 #define prop_chartype frame->Xprop_chartype
479 #define prop_script frame->Xprop_script
480 #define oclength frame->Xoclength
481 #define occhars frame->Xocchars
482 #endif
483
484 #define ctype frame->Xctype
485 #define fc frame->Xfc
486 #define fi frame->Xfi
487 #define length frame->Xlength
488 #define max frame->Xmax
489 #define min frame->Xmin
490 #define number frame->Xnumber
491 #define offset frame->Xoffset
492 #define op frame->Xop
493 #define save_capture_last frame->Xsave_capture_last
494 #define save_offset1 frame->Xsave_offset1
495 #define save_offset2 frame->Xsave_offset2
496 #define save_offset3 frame->Xsave_offset3
497 #define stacksave frame->Xstacksave
498
499 #define newptrb frame->Xnewptrb
500
501 /* When recursion is being used, local variables are allocated on the stack and
502 get preserved during recursion in the normal way. In this environment, fi and
503 i, and fc and c, can be the same variables. */
504
505 #else /* NO_RECURSE not defined */
506 #define fi i
507 #define fc c
508
509
510 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
511 const uschar *charptr; /* in small blocks of the code. My normal */
512 #endif /* style of coding would have declared */
513 const uschar *callpat; /* them within each of those blocks. */
514 const uschar *data; /* However, in order to accommodate the */
515 const uschar *next; /* version of this code that uses an */
516 USPTR pp; /* external "stack" implemented on the */
517 const uschar *prev; /* heap, it is easier to declare them all */
518 USPTR saved_eptr; /* here, so the declarations can be cut */
519 /* out in a block. The only declarations */
520 recursion_info new_recursive; /* within blocks below are for variables */
521 /* that do not have to be preserved over */
522 BOOL cur_is_word; /* a recursive call to RMATCH(). */
523 BOOL condition;
524 BOOL prev_is_word;
525
526 unsigned long int original_ims;
527
528 #ifdef SUPPORT_UCP
529 int prop_type;
530 int prop_value;
531 int prop_fail_result;
532 int prop_category;
533 int prop_chartype;
534 int prop_script;
535 int oclength;
536 uschar occhars[8];
537 #endif
538
539 int ctype;
540 int length;
541 int max;
542 int min;
543 int number;
544 int offset;
545 int op;
546 int save_capture_last;
547 int save_offset1, save_offset2, save_offset3;
548 int stacksave[REC_STACK_SAVE_MAX];
549
550 eptrblock newptrb;
551 #endif /* NO_RECURSE */
552
553 /* These statements are here to stop the compiler complaining about uninitialized
554 variables. */
555
556 #ifdef SUPPORT_UCP
557 prop_value = 0;
558 prop_fail_result = 0;
559 #endif
560
561
562 /* This label is used for tail recursion, which is used in a few cases even
563 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
564 used. Thanks to Ian Taylor for noticing this possibility and sending the
565 original patch. */
566
567 TAIL_RECURSE:
568
569 /* OK, now we can get on with the real code of the function. Recursive calls
570 are specified by the macro RMATCH and RRETURN is used to return. When
571 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
572 and a "return", respectively (possibly with some debugging if DEBUG is
573 defined). However, RMATCH isn't like a function call because it's quite a
574 complicated macro. It has to be used in one particular way. This shouldn't,
575 however, impact performance when true recursion is being used. */
576
577 #ifdef SUPPORT_UTF8
578 utf8 = md->utf8; /* Local copy of the flag */
579 #else
580 utf8 = FALSE;
581 #endif
582
583 /* First check that we haven't called match() too many times, or that we
584 haven't exceeded the recursive call limit. */
585
586 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
587 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
588
589 original_ims = ims; /* Save for resetting on ')' */
590
591 /* At the start of a group with an unlimited repeat that may match an empty
592 string, the match_cbegroup flag is set. When this is the case, add the current
593 subject pointer to the chain of such remembered pointers, to be checked when we
594 hit the closing ket, in order to break infinite loops that match no characters.
595 When match() is called in other circumstances, don't add to the chain. The
596 match_cbegroup flag must NOT be used with tail recursion, because the memory
597 block that is used is on the stack, so a new one may be required for each
598 match(). */
599
600 if ((flags & match_cbegroup) != 0)
601 {
602 newptrb.epb_saved_eptr = eptr;
603 newptrb.epb_prev = eptrb;
604 eptrb = &newptrb;
605 }
606
607 /* Now start processing the opcodes. */
608
609 for (;;)
610 {
611 minimize = possessive = FALSE;
612 op = *ecode;
613
614 /* For partial matching, remember if we ever hit the end of the subject after
615 matching at least one subject character. */
616
617 if (md->partial &&
618 eptr >= md->end_subject &&
619 eptr > mstart)
620 md->hitend = TRUE;
621
622 switch(op)
623 {
624 case OP_FAIL:
625 return MATCH_NOMATCH;
626
627 case OP_PRUNE:
628 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
629 ims, eptrb, flags, RM51);
630 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
631 return MATCH_PRUNE;
632
633 case OP_COMMIT:
634 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
635 ims, eptrb, flags, RM52);
636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
637 return MATCH_COMMIT;
638
639 case OP_SKIP:
640 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
641 ims, eptrb, flags, RM53);
642 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
643 md->start_match_ptr = eptr; /* Pass back current position */
644 return MATCH_SKIP;
645
646 case OP_THEN:
647 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
648 ims, eptrb, flags, RM53);
649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
650 return MATCH_THEN;
651
652 /* Handle a capturing bracket. If there is space in the offset vector, save
653 the current subject position in the working slot at the top of the vector.
654 We mustn't change the current values of the data slot, because they may be
655 set from a previous iteration of this group, and be referred to by a
656 reference inside the group.
657
658 If the bracket fails to match, we need to restore this value and also the
659 values of the final offsets, in case they were set by a previous iteration
660 of the same bracket.
661
662 If there isn't enough space in the offset vector, treat this as if it were
663 a non-capturing bracket. Don't worry about setting the flag for the error
664 case here; that is handled in the code for KET. */
665
666 case OP_CBRA:
667 case OP_SCBRA:
668 number = GET2(ecode, 1+LINK_SIZE);
669 offset = number << 1;
670
671 #ifdef DEBUG
672 printf("start bracket %d\n", number);
673 printf("subject=");
674 pchars(eptr, 16, TRUE, md);
675 printf("\n");
676 #endif
677
678 if (offset < md->offset_max)
679 {
680 save_offset1 = md->offset_vector[offset];
681 save_offset2 = md->offset_vector[offset+1];
682 save_offset3 = md->offset_vector[md->offset_end - number];
683 save_capture_last = md->capture_last;
684
685 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
686 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
687
688 flags = (op == OP_SCBRA)? match_cbegroup : 0;
689 do
690 {
691 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
692 ims, eptrb, flags, RM1);
693 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
694 md->capture_last = save_capture_last;
695 ecode += GET(ecode, 1);
696 }
697 while (*ecode == OP_ALT);
698
699 DPRINTF(("bracket %d failed\n", number));
700
701 md->offset_vector[offset] = save_offset1;
702 md->offset_vector[offset+1] = save_offset2;
703 md->offset_vector[md->offset_end - number] = save_offset3;
704
705 RRETURN(MATCH_NOMATCH);
706 }
707
708 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
709 as a non-capturing bracket. */
710
711 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
713
714 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
715
716 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
718
719 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
720 final alternative within the brackets, we would return the result of a
721 recursive call to match() whatever happened. We can reduce stack usage by
722 turning this into a tail recursion, except in the case when match_cbegroup
723 is set.*/
724
725 case OP_BRA:
726 case OP_SBRA:
727 DPRINTF(("start non-capturing bracket\n"));
728 flags = (op >= OP_SBRA)? match_cbegroup : 0;
729 for (;;)
730 {
731 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
732 {
733 if (flags == 0) /* Not a possibly empty group */
734 {
735 ecode += _pcre_OP_lengths[*ecode];
736 DPRINTF(("bracket 0 tail recursion\n"));
737 goto TAIL_RECURSE;
738 }
739
740 /* Possibly empty group; can't use tail recursion. */
741
742 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
743 eptrb, flags, RM48);
744 RRETURN(rrc);
745 }
746
747 /* For non-final alternatives, continue the loop for a NOMATCH result;
748 otherwise return. */
749
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
751 eptrb, flags, RM2);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 ecode += GET(ecode, 1);
754 }
755 /* Control never reaches here. */
756
757 /* Conditional group: compilation checked that there are no more than
758 two branches. If the condition is false, skipping the first branch takes us
759 past the end if there is only one branch, but that's OK because that is
760 exactly what going to the ket would do. As there is only one branch to be
761 obeyed, we can use tail recursion to avoid using another stack frame. */
762
763 case OP_COND:
764 case OP_SCOND:
765 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
766 {
767 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
768 condition = md->recursive != NULL &&
769 (offset == RREF_ANY || offset == md->recursive->group_num);
770 ecode += condition? 3 : GET(ecode, 1);
771 }
772
773 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
774 {
775 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
776 condition = offset < offset_top && md->offset_vector[offset] >= 0;
777 ecode += condition? 3 : GET(ecode, 1);
778 }
779
780 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
781 {
782 condition = FALSE;
783 ecode += GET(ecode, 1);
784 }
785
786 /* The condition is an assertion. Call match() to evaluate it - setting
787 the final argument match_condassert causes it to stop at the end of an
788 assertion. */
789
790 else
791 {
792 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
793 match_condassert, RM3);
794 if (rrc == MATCH_MATCH)
795 {
796 condition = TRUE;
797 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
798 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
799 }
800 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
801 {
802 RRETURN(rrc); /* Need braces because of following else */
803 }
804 else
805 {
806 condition = FALSE;
807 ecode += GET(ecode, 1);
808 }
809 }
810
811 /* We are now at the branch that is to be obeyed. As there is only one,
812 we can use tail recursion to avoid using another stack frame, except when
813 match_cbegroup is required for an unlimited repeat of a possibly empty
814 group. If the second alternative doesn't exist, we can just plough on. */
815
816 if (condition || *ecode == OP_ALT)
817 {
818 ecode += 1 + LINK_SIZE;
819 if (op == OP_SCOND) /* Possibly empty group */
820 {
821 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
822 RRETURN(rrc);
823 }
824 else /* Group must match something */
825 {
826 flags = 0;
827 goto TAIL_RECURSE;
828 }
829 }
830 else /* Condition false & no 2nd alternative */
831 {
832 ecode += 1 + LINK_SIZE;
833 }
834 break;
835
836
837 /* End of the pattern, either real or forced. If we are in a top-level
838 recursion, we should restore the offsets appropriately and continue from
839 after the call. */
840
841 case OP_ACCEPT:
842 case OP_END:
843 if (md->recursive != NULL && md->recursive->group_num == 0)
844 {
845 recursion_info *rec = md->recursive;
846 DPRINTF(("End of pattern in a (?0) recursion\n"));
847 md->recursive = rec->prevrec;
848 memmove(md->offset_vector, rec->offset_save,
849 rec->saved_max * sizeof(int));
850 mstart = rec->save_start;
851 ims = original_ims;
852 ecode = rec->after_call;
853 break;
854 }
855
856 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
857 string - backtracking will then try other alternatives, if any. */
858
859 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
860 md->end_match_ptr = eptr; /* Record where we ended */
861 md->end_offset_top = offset_top; /* and how many extracts were taken */
862 md->start_match_ptr = mstart; /* and the start (\K can modify) */
863 RRETURN(MATCH_MATCH);
864
865 /* Change option settings */
866
867 case OP_OPT:
868 ims = ecode[1];
869 ecode += 2;
870 DPRINTF(("ims set to %02lx\n", ims));
871 break;
872
873 /* Assertion brackets. Check the alternative branches in turn - the
874 matching won't pass the KET for an assertion. If any one branch matches,
875 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
876 start of each branch to move the current point backwards, so the code at
877 this level is identical to the lookahead case. */
878
879 case OP_ASSERT:
880 case OP_ASSERTBACK:
881 do
882 {
883 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
884 RM4);
885 if (rrc == MATCH_MATCH) break;
886 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
887 ecode += GET(ecode, 1);
888 }
889 while (*ecode == OP_ALT);
890 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
891
892 /* If checking an assertion for a condition, return MATCH_MATCH. */
893
894 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
895
896 /* Continue from after the assertion, updating the offsets high water
897 mark, since extracts may have been taken during the assertion. */
898
899 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
900 ecode += 1 + LINK_SIZE;
901 offset_top = md->end_offset_top;
902 continue;
903
904 /* Negative assertion: all branches must fail to match */
905
906 case OP_ASSERT_NOT:
907 case OP_ASSERTBACK_NOT:
908 do
909 {
910 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
911 RM5);
912 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
913 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
914 ecode += GET(ecode,1);
915 }
916 while (*ecode == OP_ALT);
917
918 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
919
920 ecode += 1 + LINK_SIZE;
921 continue;
922
923 /* Move the subject pointer back. This occurs only at the start of
924 each branch of a lookbehind assertion. If we are too close to the start to
925 move back, this match function fails. When working with UTF-8 we move
926 back a number of characters, not bytes. */
927
928 case OP_REVERSE:
929 #ifdef SUPPORT_UTF8
930 if (utf8)
931 {
932 i = GET(ecode, 1);
933 while (i-- > 0)
934 {
935 eptr--;
936 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
937 BACKCHAR(eptr);
938 }
939 }
940 else
941 #endif
942
943 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
944
945 {
946 eptr -= GET(ecode, 1);
947 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
948 }
949
950 /* Skip to next op code */
951
952 ecode += 1 + LINK_SIZE;
953 break;
954
955 /* The callout item calls an external function, if one is provided, passing
956 details of the match so far. This is mainly for debugging, though the
957 function is able to force a failure. */
958
959 case OP_CALLOUT:
960 if (pcre_callout != NULL)
961 {
962 pcre_callout_block cb;
963 cb.version = 1; /* Version 1 of the callout block */
964 cb.callout_number = ecode[1];
965 cb.offset_vector = md->offset_vector;
966 cb.subject = (PCRE_SPTR)md->start_subject;
967 cb.subject_length = md->end_subject - md->start_subject;
968 cb.start_match = mstart - md->start_subject;
969 cb.current_position = eptr - md->start_subject;
970 cb.pattern_position = GET(ecode, 2);
971 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
972 cb.capture_top = offset_top/2;
973 cb.capture_last = md->capture_last;
974 cb.callout_data = md->callout_data;
975 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
976 if (rrc < 0) RRETURN(rrc);
977 }
978 ecode += 2 + 2*LINK_SIZE;
979 break;
980
981 /* Recursion either matches the current regex, or some subexpression. The
982 offset data is the offset to the starting bracket from the start of the
983 whole pattern. (This is so that it works from duplicated subpatterns.)
984
985 If there are any capturing brackets started but not finished, we have to
986 save their starting points and reinstate them after the recursion. However,
987 we don't know how many such there are (offset_top records the completed
988 total) so we just have to save all the potential data. There may be up to
989 65535 such values, which is too large to put on the stack, but using malloc
990 for small numbers seems expensive. As a compromise, the stack is used when
991 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
992 is used. A problem is what to do if the malloc fails ... there is no way of
993 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
994 values on the stack, and accept that the rest may be wrong.
995
996 There are also other values that have to be saved. We use a chained
997 sequence of blocks that actually live on the stack. Thanks to Robin Houston
998 for the original version of this logic. */
999
1000 case OP_RECURSE:
1001 {
1002 callpat = md->start_code + GET(ecode, 1);
1003 new_recursive.group_num = (callpat == md->start_code)? 0 :
1004 GET2(callpat, 1 + LINK_SIZE);
1005
1006 /* Add to "recursing stack" */
1007
1008 new_recursive.prevrec = md->recursive;
1009 md->recursive = &new_recursive;
1010
1011 /* Find where to continue from afterwards */
1012
1013 ecode += 1 + LINK_SIZE;
1014 new_recursive.after_call = ecode;
1015
1016 /* Now save the offset data. */
1017
1018 new_recursive.saved_max = md->offset_end;
1019 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1020 new_recursive.offset_save = stacksave;
1021 else
1022 {
1023 new_recursive.offset_save =
1024 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1025 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1026 }
1027
1028 memcpy(new_recursive.offset_save, md->offset_vector,
1029 new_recursive.saved_max * sizeof(int));
1030 new_recursive.save_start = mstart;
1031 mstart = eptr;
1032
1033 /* OK, now we can do the recursion. For each top-level alternative we
1034 restore the offset and recursion data. */
1035
1036 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1037 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1038 do
1039 {
1040 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1041 md, ims, eptrb, flags, RM6);
1042 if (rrc == MATCH_MATCH)
1043 {
1044 DPRINTF(("Recursion matched\n"));
1045 md->recursive = new_recursive.prevrec;
1046 if (new_recursive.offset_save != stacksave)
1047 (pcre_free)(new_recursive.offset_save);
1048 RRETURN(MATCH_MATCH);
1049 }
1050 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1051 {
1052 DPRINTF(("Recursion gave error %d\n", rrc));
1053 RRETURN(rrc);
1054 }
1055
1056 md->recursive = &new_recursive;
1057 memcpy(md->offset_vector, new_recursive.offset_save,
1058 new_recursive.saved_max * sizeof(int));
1059 callpat += GET(callpat, 1);
1060 }
1061 while (*callpat == OP_ALT);
1062
1063 DPRINTF(("Recursion didn't match\n"));
1064 md->recursive = new_recursive.prevrec;
1065 if (new_recursive.offset_save != stacksave)
1066 (pcre_free)(new_recursive.offset_save);
1067 RRETURN(MATCH_NOMATCH);
1068 }
1069 /* Control never reaches here */
1070
1071 /* "Once" brackets are like assertion brackets except that after a match,
1072 the point in the subject string is not moved back. Thus there can never be
1073 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1074 Check the alternative branches in turn - the matching won't pass the KET
1075 for this kind of subpattern. If any one branch matches, we carry on as at
1076 the end of a normal bracket, leaving the subject pointer. */
1077
1078 case OP_ONCE:
1079 prev = ecode;
1080 saved_eptr = eptr;
1081
1082 do
1083 {
1084 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1085 if (rrc == MATCH_MATCH) break;
1086 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1087 ecode += GET(ecode,1);
1088 }
1089 while (*ecode == OP_ALT);
1090
1091 /* If hit the end of the group (which could be repeated), fail */
1092
1093 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1094
1095 /* Continue as from after the assertion, updating the offsets high water
1096 mark, since extracts may have been taken. */
1097
1098 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1099
1100 offset_top = md->end_offset_top;
1101 eptr = md->end_match_ptr;
1102
1103 /* For a non-repeating ket, just continue at this level. This also
1104 happens for a repeating ket if no characters were matched in the group.
1105 This is the forcible breaking of infinite loops as implemented in Perl
1106 5.005. If there is an options reset, it will get obeyed in the normal
1107 course of events. */
1108
1109 if (*ecode == OP_KET || eptr == saved_eptr)
1110 {
1111 ecode += 1+LINK_SIZE;
1112 break;
1113 }
1114
1115 /* The repeating kets try the rest of the pattern or restart from the
1116 preceding bracket, in the appropriate order. The second "call" of match()
1117 uses tail recursion, to avoid using another stack frame. We need to reset
1118 any options that changed within the bracket before re-running it, so
1119 check the next opcode. */
1120
1121 if (ecode[1+LINK_SIZE] == OP_OPT)
1122 {
1123 ims = (ims & ~PCRE_IMS) | ecode[4];
1124 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1125 }
1126
1127 if (*ecode == OP_KETRMIN)
1128 {
1129 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1130 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1131 ecode = prev;
1132 flags = 0;
1133 goto TAIL_RECURSE;
1134 }
1135 else /* OP_KETRMAX */
1136 {
1137 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1138 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1139 ecode += 1 + LINK_SIZE;
1140 flags = 0;
1141 goto TAIL_RECURSE;
1142 }
1143 /* Control never gets here */
1144
1145 /* An alternation is the end of a branch; scan along to find the end of the
1146 bracketed group and go to there. */
1147
1148 case OP_ALT:
1149 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1150 break;
1151
1152 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1153 that it may occur zero times. It may repeat infinitely, or not at all -
1154 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1155 repeat limits are compiled as a number of copies, with the optional ones
1156 preceded by BRAZERO or BRAMINZERO. */
1157
1158 case OP_BRAZERO:
1159 {
1160 next = ecode+1;
1161 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1162 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1163 do next += GET(next,1); while (*next == OP_ALT);
1164 ecode = next + 1 + LINK_SIZE;
1165 }
1166 break;
1167
1168 case OP_BRAMINZERO:
1169 {
1170 next = ecode+1;
1171 do next += GET(next, 1); while (*next == OP_ALT);
1172 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1173 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1174 ecode++;
1175 }
1176 break;
1177
1178 /* End of a group, repeated or non-repeating. */
1179
1180 case OP_KET:
1181 case OP_KETRMIN:
1182 case OP_KETRMAX:
1183 prev = ecode - GET(ecode, 1);
1184
1185 /* If this was a group that remembered the subject start, in order to break
1186 infinite repeats of empty string matches, retrieve the subject start from
1187 the chain. Otherwise, set it NULL. */
1188
1189 if (*prev >= OP_SBRA)
1190 {
1191 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1192 eptrb = eptrb->epb_prev; /* Backup to previous group */
1193 }
1194 else saved_eptr = NULL;
1195
1196 /* If we are at the end of an assertion group, stop matching and return
1197 MATCH_MATCH, but record the current high water mark for use by positive
1198 assertions. Do this also for the "once" (atomic) groups. */
1199
1200 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1201 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1202 *prev == OP_ONCE)
1203 {
1204 md->end_match_ptr = eptr; /* For ONCE */
1205 md->end_offset_top = offset_top;
1206 RRETURN(MATCH_MATCH);
1207 }
1208
1209 /* For capturing groups we have to check the group number back at the start
1210 and if necessary complete handling an extraction by setting the offsets and
1211 bumping the high water mark. Note that whole-pattern recursion is coded as
1212 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1213 when the OP_END is reached. Other recursion is handled here. */
1214
1215 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1216 {
1217 number = GET2(prev, 1+LINK_SIZE);
1218 offset = number << 1;
1219
1220 #ifdef DEBUG
1221 printf("end bracket %d", number);
1222 printf("\n");
1223 #endif
1224
1225 md->capture_last = number;
1226 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1227 {
1228 md->offset_vector[offset] =
1229 md->offset_vector[md->offset_end - number];
1230 md->offset_vector[offset+1] = eptr - md->start_subject;
1231 if (offset_top <= offset) offset_top = offset + 2;
1232 }
1233
1234 /* Handle a recursively called group. Restore the offsets
1235 appropriately and continue from after the call. */
1236
1237 if (md->recursive != NULL && md->recursive->group_num == number)
1238 {
1239 recursion_info *rec = md->recursive;
1240 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1241 md->recursive = rec->prevrec;
1242 mstart = rec->save_start;
1243 memcpy(md->offset_vector, rec->offset_save,
1244 rec->saved_max * sizeof(int));
1245 ecode = rec->after_call;
1246 ims = original_ims;
1247 break;
1248 }
1249 }
1250
1251 /* For both capturing and non-capturing groups, reset the value of the ims
1252 flags, in case they got changed during the group. */
1253
1254 ims = original_ims;
1255 DPRINTF(("ims reset to %02lx\n", ims));
1256
1257 /* For a non-repeating ket, just continue at this level. This also
1258 happens for a repeating ket if no characters were matched in the group.
1259 This is the forcible breaking of infinite loops as implemented in Perl
1260 5.005. If there is an options reset, it will get obeyed in the normal
1261 course of events. */
1262
1263 if (*ecode == OP_KET || eptr == saved_eptr)
1264 {
1265 ecode += 1 + LINK_SIZE;
1266 break;
1267 }
1268
1269 /* The repeating kets try the rest of the pattern or restart from the
1270 preceding bracket, in the appropriate order. In the second case, we can use
1271 tail recursion to avoid using another stack frame, unless we have an
1272 unlimited repeat of a group that can match an empty string. */
1273
1274 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1275
1276 if (*ecode == OP_KETRMIN)
1277 {
1278 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1280 if (flags != 0) /* Could match an empty string */
1281 {
1282 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1283 RRETURN(rrc);
1284 }
1285 ecode = prev;
1286 goto TAIL_RECURSE;
1287 }
1288 else /* OP_KETRMAX */
1289 {
1290 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1291 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1292 ecode += 1 + LINK_SIZE;
1293 flags = 0;
1294 goto TAIL_RECURSE;
1295 }
1296 /* Control never gets here */
1297
1298 /* Start of subject unless notbol, or after internal newline if multiline */
1299
1300 case OP_CIRC:
1301 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1302 if ((ims & PCRE_MULTILINE) != 0)
1303 {
1304 if (eptr != md->start_subject &&
1305 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1306 RRETURN(MATCH_NOMATCH);
1307 ecode++;
1308 break;
1309 }
1310 /* ... else fall through */
1311
1312 /* Start of subject assertion */
1313
1314 case OP_SOD:
1315 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1316 ecode++;
1317 break;
1318
1319 /* Start of match assertion */
1320
1321 case OP_SOM:
1322 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1323 ecode++;
1324 break;
1325
1326 /* Reset the start of match point */
1327
1328 case OP_SET_SOM:
1329 mstart = eptr;
1330 ecode++;
1331 break;
1332
1333 /* Assert before internal newline if multiline, or before a terminating
1334 newline unless endonly is set, else end of subject unless noteol is set. */
1335
1336 case OP_DOLL:
1337 if ((ims & PCRE_MULTILINE) != 0)
1338 {
1339 if (eptr < md->end_subject)
1340 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1341 else
1342 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1343 ecode++;
1344 break;
1345 }
1346 else
1347 {
1348 if (md->noteol) RRETURN(MATCH_NOMATCH);
1349 if (!md->endonly)
1350 {
1351 if (eptr != md->end_subject &&
1352 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1353 RRETURN(MATCH_NOMATCH);
1354 ecode++;
1355 break;
1356 }
1357 }
1358 /* ... else fall through for endonly */
1359
1360 /* End of subject assertion (\z) */
1361
1362 case OP_EOD:
1363 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1364 ecode++;
1365 break;
1366
1367 /* End of subject or ending \n assertion (\Z) */
1368
1369 case OP_EODN:
1370 if (eptr != md->end_subject &&
1371 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1372 RRETURN(MATCH_NOMATCH);
1373 ecode++;
1374 break;
1375
1376 /* Word boundary assertions */
1377
1378 case OP_NOT_WORD_BOUNDARY:
1379 case OP_WORD_BOUNDARY:
1380 {
1381
1382 /* Find out if the previous and current characters are "word" characters.
1383 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1384 be "non-word" characters. */
1385
1386 #ifdef SUPPORT_UTF8
1387 if (utf8)
1388 {
1389 if (eptr == md->start_subject) prev_is_word = FALSE; else
1390 {
1391 const uschar *lastptr = eptr - 1;
1392 while((*lastptr & 0xc0) == 0x80) lastptr--;
1393 GETCHAR(c, lastptr);
1394 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1395 }
1396 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1397 {
1398 GETCHAR(c, eptr);
1399 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1400 }
1401 }
1402 else
1403 #endif
1404
1405 /* More streamlined when not in UTF-8 mode */
1406
1407 {
1408 prev_is_word = (eptr != md->start_subject) &&
1409 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1410 cur_is_word = (eptr < md->end_subject) &&
1411 ((md->ctypes[*eptr] & ctype_word) != 0);
1412 }
1413
1414 /* Now see if the situation is what we want */
1415
1416 if ((*ecode++ == OP_WORD_BOUNDARY)?
1417 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1418 RRETURN(MATCH_NOMATCH);
1419 }
1420 break;
1421
1422 /* Match a single character type; inline for speed */
1423
1424 case OP_ANY:
1425 if ((ims & PCRE_DOTALL) == 0)
1426 {
1427 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1428 }
1429 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1430 if (utf8)
1431 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1432 ecode++;
1433 break;
1434
1435 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1436 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1437
1438 case OP_ANYBYTE:
1439 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1440 ecode++;
1441 break;
1442
1443 case OP_NOT_DIGIT:
1444 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1445 GETCHARINCTEST(c, eptr);
1446 if (
1447 #ifdef SUPPORT_UTF8
1448 c < 256 &&
1449 #endif
1450 (md->ctypes[c] & ctype_digit) != 0
1451 )
1452 RRETURN(MATCH_NOMATCH);
1453 ecode++;
1454 break;
1455
1456 case OP_DIGIT:
1457 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1458 GETCHARINCTEST(c, eptr);
1459 if (
1460 #ifdef SUPPORT_UTF8
1461 c >= 256 ||
1462 #endif
1463 (md->ctypes[c] & ctype_digit) == 0
1464 )
1465 RRETURN(MATCH_NOMATCH);
1466 ecode++;
1467 break;
1468
1469 case OP_NOT_WHITESPACE:
1470 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1471 GETCHARINCTEST(c, eptr);
1472 if (
1473 #ifdef SUPPORT_UTF8
1474 c < 256 &&
1475 #endif
1476 (md->ctypes[c] & ctype_space) != 0
1477 )
1478 RRETURN(MATCH_NOMATCH);
1479 ecode++;
1480 break;
1481
1482 case OP_WHITESPACE:
1483 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1484 GETCHARINCTEST(c, eptr);
1485 if (
1486 #ifdef SUPPORT_UTF8
1487 c >= 256 ||
1488 #endif
1489 (md->ctypes[c] & ctype_space) == 0
1490 )
1491 RRETURN(MATCH_NOMATCH);
1492 ecode++;
1493 break;
1494
1495 case OP_NOT_WORDCHAR:
1496 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1497 GETCHARINCTEST(c, eptr);
1498 if (
1499 #ifdef SUPPORT_UTF8
1500 c < 256 &&
1501 #endif
1502 (md->ctypes[c] & ctype_word) != 0
1503 )
1504 RRETURN(MATCH_NOMATCH);
1505 ecode++;
1506 break;
1507
1508 case OP_WORDCHAR:
1509 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1510 GETCHARINCTEST(c, eptr);
1511 if (
1512 #ifdef SUPPORT_UTF8
1513 c >= 256 ||
1514 #endif
1515 (md->ctypes[c] & ctype_word) == 0
1516 )
1517 RRETURN(MATCH_NOMATCH);
1518 ecode++;
1519 break;
1520
1521 case OP_ANYNL:
1522 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1523 GETCHARINCTEST(c, eptr);
1524 switch(c)
1525 {
1526 default: RRETURN(MATCH_NOMATCH);
1527 case 0x000d:
1528 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1529 break;
1530 case 0x000a:
1531 case 0x000b:
1532 case 0x000c:
1533 case 0x0085:
1534 case 0x2028:
1535 case 0x2029:
1536 break;
1537 }
1538 ecode++;
1539 break;
1540
1541 case OP_NOT_HSPACE:
1542 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1543 GETCHARINCTEST(c, eptr);
1544 switch(c)
1545 {
1546 default: break;
1547 case 0x09: /* HT */
1548 case 0x20: /* SPACE */
1549 case 0xa0: /* NBSP */
1550 case 0x1680: /* OGHAM SPACE MARK */
1551 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1552 case 0x2000: /* EN QUAD */
1553 case 0x2001: /* EM QUAD */
1554 case 0x2002: /* EN SPACE */
1555 case 0x2003: /* EM SPACE */
1556 case 0x2004: /* THREE-PER-EM SPACE */
1557 case 0x2005: /* FOUR-PER-EM SPACE */
1558 case 0x2006: /* SIX-PER-EM SPACE */
1559 case 0x2007: /* FIGURE SPACE */
1560 case 0x2008: /* PUNCTUATION SPACE */
1561 case 0x2009: /* THIN SPACE */
1562 case 0x200A: /* HAIR SPACE */
1563 case 0x202f: /* NARROW NO-BREAK SPACE */
1564 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1565 case 0x3000: /* IDEOGRAPHIC SPACE */
1566 RRETURN(MATCH_NOMATCH);
1567 }
1568 ecode++;
1569 break;
1570
1571 case OP_HSPACE:
1572 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1573 GETCHARINCTEST(c, eptr);
1574 switch(c)
1575 {
1576 default: RRETURN(MATCH_NOMATCH);
1577 case 0x09: /* HT */
1578 case 0x20: /* SPACE */
1579 case 0xa0: /* NBSP */
1580 case 0x1680: /* OGHAM SPACE MARK */
1581 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1582 case 0x2000: /* EN QUAD */
1583 case 0x2001: /* EM QUAD */
1584 case 0x2002: /* EN SPACE */
1585 case 0x2003: /* EM SPACE */
1586 case 0x2004: /* THREE-PER-EM SPACE */
1587 case 0x2005: /* FOUR-PER-EM SPACE */
1588 case 0x2006: /* SIX-PER-EM SPACE */
1589 case 0x2007: /* FIGURE SPACE */
1590 case 0x2008: /* PUNCTUATION SPACE */
1591 case 0x2009: /* THIN SPACE */
1592 case 0x200A: /* HAIR SPACE */
1593 case 0x202f: /* NARROW NO-BREAK SPACE */
1594 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1595 case 0x3000: /* IDEOGRAPHIC SPACE */
1596 break;
1597 }
1598 ecode++;
1599 break;
1600
1601 case OP_NOT_VSPACE:
1602 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1603 GETCHARINCTEST(c, eptr);
1604 switch(c)
1605 {
1606 default: break;
1607 case 0x0a: /* LF */
1608 case 0x0b: /* VT */
1609 case 0x0c: /* FF */
1610 case 0x0d: /* CR */
1611 case 0x85: /* NEL */
1612 case 0x2028: /* LINE SEPARATOR */
1613 case 0x2029: /* PARAGRAPH SEPARATOR */
1614 RRETURN(MATCH_NOMATCH);
1615 }
1616 ecode++;
1617 break;
1618
1619 case OP_VSPACE:
1620 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1621 GETCHARINCTEST(c, eptr);
1622 switch(c)
1623 {
1624 default: RRETURN(MATCH_NOMATCH);
1625 case 0x0a: /* LF */
1626 case 0x0b: /* VT */
1627 case 0x0c: /* FF */
1628 case 0x0d: /* CR */
1629 case 0x85: /* NEL */
1630 case 0x2028: /* LINE SEPARATOR */
1631 case 0x2029: /* PARAGRAPH SEPARATOR */
1632 break;
1633 }
1634 ecode++;
1635 break;
1636
1637 #ifdef SUPPORT_UCP
1638 /* Check the next character by Unicode property. We will get here only
1639 if the support is in the binary; otherwise a compile-time error occurs. */
1640
1641 case OP_PROP:
1642 case OP_NOTPROP:
1643 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1644 GETCHARINCTEST(c, eptr);
1645 {
1646 int chartype, script;
1647 int category = _pcre_ucp_findprop(c, &chartype, &script);
1648
1649 switch(ecode[1])
1650 {
1651 case PT_ANY:
1652 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1653 break;
1654
1655 case PT_LAMP:
1656 if ((chartype == ucp_Lu ||
1657 chartype == ucp_Ll ||
1658 chartype == ucp_Lt) == (op == OP_NOTPROP))
1659 RRETURN(MATCH_NOMATCH);
1660 break;
1661
1662 case PT_GC:
1663 if ((ecode[2] != category) == (op == OP_PROP))
1664 RRETURN(MATCH_NOMATCH);
1665 break;
1666
1667 case PT_PC:
1668 if ((ecode[2] != chartype) == (op == OP_PROP))
1669 RRETURN(MATCH_NOMATCH);
1670 break;
1671
1672 case PT_SC:
1673 if ((ecode[2] != script) == (op == OP_PROP))
1674 RRETURN(MATCH_NOMATCH);
1675 break;
1676
1677 default:
1678 RRETURN(PCRE_ERROR_INTERNAL);
1679 }
1680
1681 ecode += 3;
1682 }
1683 break;
1684
1685 /* Match an extended Unicode sequence. We will get here only if the support
1686 is in the binary; otherwise a compile-time error occurs. */
1687
1688 case OP_EXTUNI:
1689 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1690 GETCHARINCTEST(c, eptr);
1691 {
1692 int chartype, script;
1693 int category = _pcre_ucp_findprop(c, &chartype, &script);
1694 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1695 while (eptr < md->end_subject)
1696 {
1697 int len = 1;
1698 if (!utf8) c = *eptr; else
1699 {
1700 GETCHARLEN(c, eptr, len);
1701 }
1702 category = _pcre_ucp_findprop(c, &chartype, &script);
1703 if (category != ucp_M) break;
1704 eptr += len;
1705 }
1706 }
1707 ecode++;
1708 break;
1709 #endif
1710
1711
1712 /* Match a back reference, possibly repeatedly. Look past the end of the
1713 item to see if there is repeat information following. The code is similar
1714 to that for character classes, but repeated for efficiency. Then obey
1715 similar code to character type repeats - written out again for speed.
1716 However, if the referenced string is the empty string, always treat
1717 it as matched, any number of times (otherwise there could be infinite
1718 loops). */
1719
1720 case OP_REF:
1721 {
1722 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1723 ecode += 3; /* Advance past item */
1724
1725 /* If the reference is unset, set the length to be longer than the amount
1726 of subject left; this ensures that every attempt at a match fails. We
1727 can't just fail here, because of the possibility of quantifiers with zero
1728 minima. */
1729
1730 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1731 md->end_subject - eptr + 1 :
1732 md->offset_vector[offset+1] - md->offset_vector[offset];
1733
1734 /* Set up for repetition, or handle the non-repeated case */
1735
1736 switch (*ecode)
1737 {
1738 case OP_CRSTAR:
1739 case OP_CRMINSTAR:
1740 case OP_CRPLUS:
1741 case OP_CRMINPLUS:
1742 case OP_CRQUERY:
1743 case OP_CRMINQUERY:
1744 c = *ecode++ - OP_CRSTAR;
1745 minimize = (c & 1) != 0;
1746 min = rep_min[c]; /* Pick up values from tables; */
1747 max = rep_max[c]; /* zero for max => infinity */
1748 if (max == 0) max = INT_MAX;
1749 break;
1750
1751 case OP_CRRANGE:
1752 case OP_CRMINRANGE:
1753 minimize = (*ecode == OP_CRMINRANGE);
1754 min = GET2(ecode, 1);
1755 max = GET2(ecode, 3);
1756 if (max == 0) max = INT_MAX;
1757 ecode += 5;
1758 break;
1759
1760 default: /* No repeat follows */
1761 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1762 eptr += length;
1763 continue; /* With the main loop */
1764 }
1765
1766 /* If the length of the reference is zero, just continue with the
1767 main loop. */
1768
1769 if (length == 0) continue;
1770
1771 /* First, ensure the minimum number of matches are present. We get back
1772 the length of the reference string explicitly rather than passing the
1773 address of eptr, so that eptr can be a register variable. */
1774
1775 for (i = 1; i <= min; i++)
1776 {
1777 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1778 eptr += length;
1779 }
1780
1781 /* If min = max, continue at the same level without recursion.
1782 They are not both allowed to be zero. */
1783
1784 if (min == max) continue;
1785
1786 /* If minimizing, keep trying and advancing the pointer */
1787
1788 if (minimize)
1789 {
1790 for (fi = min;; fi++)
1791 {
1792 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1794 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1795 RRETURN(MATCH_NOMATCH);
1796 eptr += length;
1797 }
1798 /* Control never gets here */
1799 }
1800
1801 /* If maximizing, find the longest string and work backwards */
1802
1803 else
1804 {
1805 pp = eptr;
1806 for (i = min; i < max; i++)
1807 {
1808 if (!match_ref(offset, eptr, length, md, ims)) break;
1809 eptr += length;
1810 }
1811 while (eptr >= pp)
1812 {
1813 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1814 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1815 eptr -= length;
1816 }
1817 RRETURN(MATCH_NOMATCH);
1818 }
1819 }
1820 /* Control never gets here */
1821
1822
1823
1824 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1825 used when all the characters in the class have values in the range 0-255,
1826 and either the matching is caseful, or the characters are in the range
1827 0-127 when UTF-8 processing is enabled. The only difference between
1828 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1829 encountered.
1830
1831 First, look past the end of the item to see if there is repeat information
1832 following. Then obey similar code to character type repeats - written out
1833 again for speed. */
1834
1835 case OP_NCLASS:
1836 case OP_CLASS:
1837 {
1838 data = ecode + 1; /* Save for matching */
1839 ecode += 33; /* Advance past the item */
1840
1841 switch (*ecode)
1842 {
1843 case OP_CRSTAR:
1844 case OP_CRMINSTAR:
1845 case OP_CRPLUS:
1846 case OP_CRMINPLUS:
1847 case OP_CRQUERY:
1848 case OP_CRMINQUERY:
1849 c = *ecode++ - OP_CRSTAR;
1850 minimize = (c & 1) != 0;
1851 min = rep_min[c]; /* Pick up values from tables; */
1852 max = rep_max[c]; /* zero for max => infinity */
1853 if (max == 0) max = INT_MAX;
1854 break;
1855
1856 case OP_CRRANGE:
1857 case OP_CRMINRANGE:
1858 minimize = (*ecode == OP_CRMINRANGE);
1859 min = GET2(ecode, 1);
1860 max = GET2(ecode, 3);
1861 if (max == 0) max = INT_MAX;
1862 ecode += 5;
1863 break;
1864
1865 default: /* No repeat follows */
1866 min = max = 1;
1867 break;
1868 }
1869
1870 /* First, ensure the minimum number of matches are present. */
1871
1872 #ifdef SUPPORT_UTF8
1873 /* UTF-8 mode */
1874 if (utf8)
1875 {
1876 for (i = 1; i <= min; i++)
1877 {
1878 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1879 GETCHARINC(c, eptr);
1880 if (c > 255)
1881 {
1882 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1883 }
1884 else
1885 {
1886 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1887 }
1888 }
1889 }
1890 else
1891 #endif
1892 /* Not UTF-8 mode */
1893 {
1894 for (i = 1; i <= min; i++)
1895 {
1896 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1897 c = *eptr++;
1898 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1899 }
1900 }
1901
1902 /* If max == min we can continue with the main loop without the
1903 need to recurse. */
1904
1905 if (min == max) continue;
1906
1907 /* If minimizing, keep testing the rest of the expression and advancing
1908 the pointer while it matches the class. */
1909
1910 if (minimize)
1911 {
1912 #ifdef SUPPORT_UTF8
1913 /* UTF-8 mode */
1914 if (utf8)
1915 {
1916 for (fi = min;; fi++)
1917 {
1918 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1919 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1920 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1921 GETCHARINC(c, eptr);
1922 if (c > 255)
1923 {
1924 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1925 }
1926 else
1927 {
1928 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1929 }
1930 }
1931 }
1932 else
1933 #endif
1934 /* Not UTF-8 mode */
1935 {
1936 for (fi = min;; fi++)
1937 {
1938 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1940 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1941 c = *eptr++;
1942 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1943 }
1944 }
1945 /* Control never gets here */
1946 }
1947
1948 /* If maximizing, find the longest possible run, then work backwards. */
1949
1950 else
1951 {
1952 pp = eptr;
1953
1954 #ifdef SUPPORT_UTF8
1955 /* UTF-8 mode */
1956 if (utf8)
1957 {
1958 for (i = min; i < max; i++)
1959 {
1960 int len = 1;
1961 if (eptr >= md->end_subject) break;
1962 GETCHARLEN(c, eptr, len);
1963 if (c > 255)
1964 {
1965 if (op == OP_CLASS) break;
1966 }
1967 else
1968 {
1969 if ((data[c/8] & (1 << (c&7))) == 0) break;
1970 }
1971 eptr += len;
1972 }
1973 for (;;)
1974 {
1975 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1976 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1977 if (eptr-- == pp) break; /* Stop if tried at original pos */
1978 BACKCHAR(eptr);
1979 }
1980 }
1981 else
1982 #endif
1983 /* Not UTF-8 mode */
1984 {
1985 for (i = min; i < max; i++)
1986 {
1987 if (eptr >= md->end_subject) break;
1988 c = *eptr;
1989 if ((data[c/8] & (1 << (c&7))) == 0) break;
1990 eptr++;
1991 }
1992 while (eptr >= pp)
1993 {
1994 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1996 eptr--;
1997 }
1998 }
1999
2000 RRETURN(MATCH_NOMATCH);
2001 }
2002 }
2003 /* Control never gets here */
2004
2005
/* Match an extended character class, optionally followed by a repeat. The
opcode is only ever compiled in UTF-8 mode, so the whole arm is conditional on
SUPPORT_UTF8. The arm works in three phases: decode the repeat limits, consume
the mandatory matches, then either try the rest of the pattern before each
further character (minimizing) or grab the longest run and back off one
character at a time (maximizing). */

#ifdef SUPPORT_UTF8
case OP_XCLASS:
  {
  data = ecode + 1 + LINK_SIZE;       /* Class data, kept for _pcre_xclass() */
  ecode += GET(ecode, 1);             /* Step over the complete class item */

  /* Work out the repeat limits from whatever follows the class. */

  switch (*ecode)
    {
    case OP_CRSTAR:
    case OP_CRMINSTAR:
    case OP_CRPLUS:
    case OP_CRMINPLUS:
    case OP_CRQUERY:
    case OP_CRMINQUERY:
    c = *ecode++ - OP_CRSTAR;
    minimize = (c & 1) != 0;          /* Odd offsets are the lazy variants */
    min = rep_min[c];
    max = rep_max[c];
    if (max == 0) max = INT_MAX;      /* A table value of zero means no limit */
    break;

    case OP_CRRANGE:
    case OP_CRMINRANGE:
    minimize = (*ecode == OP_CRMINRANGE);
    min = GET2(ecode, 1);
    max = GET2(ecode, 3);
    if (max == 0) max = INT_MAX;      /* Zero again stands for no upper limit */
    ecode += 5;
    break;

    default:                          /* The class is not repeated */
    min = max = 1;
    break;
    }

  /* Phase 1: the first "min" characters must all be in the class. */

  for (i = 1; i <= min; i++)
    {
    if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
    GETCHARINC(c, eptr);
    if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
    }

  /* A fixed count needs no recursion; carry on with the main loop. */

  if (min == max) continue;

  /* Phase 2a (minimizing): try the remainder of the pattern first, and only
  take another class character when that fails. */

  if (minimize)
    {
    for (fi = min;; fi++)
      {
      RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (fi >= max) RRETURN(MATCH_NOMATCH);
      if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
      GETCHARINC(c, eptr);
      if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
      }
    /* Control never gets here */
    }

  /* Phase 2b (maximizing): take as many class characters as allowed, then
  retry the remainder of the pattern at each shorter length. */

  else
    {
    pp = eptr;                        /* Position after the mandatory part */
    for (i = min; i < max && eptr < md->end_subject; i++)
      {
      int len = 1;
      GETCHARLEN(c, eptr, len);
      if (!_pcre_xclass(c, data)) break;
      eptr += len;
      }

    for (;;)
      {
      RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
      if (rrc != MATCH_NOMATCH) RRETURN(rrc);
      if (eptr == pp) RRETURN(MATCH_NOMATCH);  /* Already tried the shortest */
      eptr--;
      BACKCHAR(eptr);                 /* Move to the start of the character */
      }
    }

  /* Control never gets here */
  }
#endif    /* End of XCLASS */
2100
2101 /* Match a single character, casefully */
2102
2103 case OP_CHAR:
2104 #ifdef SUPPORT_UTF8
2105 if (utf8)
2106 {
2107 length = 1;
2108 ecode++;
2109 GETCHARLEN(fc, ecode, length);
2110 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2111 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2112 }
2113 else
2114 #endif
2115
2116 /* Non-UTF-8 mode */
2117 {
2118 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2119 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2120 ecode += 2;
2121 }
2122 break;
2123
2124 /* Match a single character, caselessly */
2125
2126 case OP_CHARNC:
2127 #ifdef SUPPORT_UTF8
2128 if (utf8)
2129 {
2130 length = 1;
2131 ecode++;
2132 GETCHARLEN(fc, ecode, length);
2133
2134 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2135
2136 /* If the pattern character's value is < 128, we have only one byte, and
2137 can use the fast lookup table. */
2138
2139 if (fc < 128)
2140 {
2141 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2142 }
2143
2144 /* Otherwise we must pick up the subject character */
2145
2146 else
2147 {
2148 unsigned int dc;
2149 GETCHARINC(dc, eptr);
2150 ecode += length;
2151
2152 /* If we have Unicode property support, we can use it to test the other
2153 case of the character, if there is one. */
2154
2155 if (fc != dc)
2156 {
2157 #ifdef SUPPORT_UCP
2158 if (dc != _pcre_ucp_othercase(fc))
2159 #endif
2160 RRETURN(MATCH_NOMATCH);
2161 }
2162 }
2163 }
2164 else
2165 #endif /* SUPPORT_UTF8 */
2166
2167 /* Non-UTF-8 mode */
2168 {
2169 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2170 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2171 ecode += 2;
2172 }
2173 break;
2174
2175 /* Match a single character repeatedly. */
2176
2177 case OP_EXACT:
2178 min = max = GET2(ecode, 1);
2179 ecode += 3;
2180 goto REPEATCHAR;
2181
2182 case OP_POSUPTO:
2183 possessive = TRUE;
2184 /* Fall through */
2185
2186 case OP_UPTO:
2187 case OP_MINUPTO:
2188 min = 0;
2189 max = GET2(ecode, 1);
2190 minimize = *ecode == OP_MINUPTO;
2191 ecode += 3;
2192 goto REPEATCHAR;
2193
2194 case OP_POSSTAR:
2195 possessive = TRUE;
2196 min = 0;
2197 max = INT_MAX;
2198 ecode++;
2199 goto REPEATCHAR;
2200
2201 case OP_POSPLUS:
2202 possessive = TRUE;
2203 min = 1;
2204 max = INT_MAX;
2205 ecode++;
2206 goto REPEATCHAR;
2207
2208 case OP_POSQUERY:
2209 possessive = TRUE;
2210 min = 0;
2211 max = 1;
2212 ecode++;
2213 goto REPEATCHAR;
2214
2215 case OP_STAR:
2216 case OP_MINSTAR:
2217 case OP_PLUS:
2218 case OP_MINPLUS:
2219 case OP_QUERY:
2220 case OP_MINQUERY:
2221 c = *ecode++ - OP_STAR;
2222 minimize = (c & 1) != 0;
2223 min = rep_min[c]; /* Pick up values from tables; */
2224 max = rep_max[c]; /* zero for max => infinity */
2225 if (max == 0) max = INT_MAX;
2226
2227 /* Common code for all repeated single-character matches. We can give
2228 up quickly if there are fewer than the minimum number of characters left in
2229 the subject. */
2230
2231 REPEATCHAR:
2232 #ifdef SUPPORT_UTF8
2233 if (utf8)
2234 {
2235 length = 1;
2236 charptr = ecode;
2237 GETCHARLEN(fc, ecode, length);
2238 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2239 ecode += length;
2240
2241 /* Handle multibyte character matching specially here. There is
2242 support for caseless matching if UCP support is present. */
2243
2244 if (length > 1)
2245 {
2246 #ifdef SUPPORT_UCP
2247 unsigned int othercase;
2248 if ((ims & PCRE_CASELESS) != 0 &&
2249 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2250 oclength = _pcre_ord2utf8(othercase, occhars);
2251 else oclength = 0;
2252 #endif /* SUPPORT_UCP */
2253
2254 for (i = 1; i <= min; i++)
2255 {
2256 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2257 #ifdef SUPPORT_UCP
2258 /* Need braces because of following else */
2259 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2260 else
2261 {
2262 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2263 eptr += oclength;
2264 }
2265 #else /* without SUPPORT_UCP */
2266 else { RRETURN(MATCH_NOMATCH); }
2267 #endif /* SUPPORT_UCP */
2268 }
2269
2270 if (min == max) continue;
2271
2272 if (minimize)
2273 {
2274 for (fi = min;; fi++)
2275 {
2276 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2278 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2279 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2280 #ifdef SUPPORT_UCP
2281 /* Need braces because of following else */
2282 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2283 else
2284 {
2285 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2286 eptr += oclength;
2287 }
2288 #else /* without SUPPORT_UCP */
2289 else { RRETURN (MATCH_NOMATCH); }
2290 #endif /* SUPPORT_UCP */
2291 }
2292 /* Control never gets here */
2293 }
2294
2295 else /* Maximize */
2296 {
2297 pp = eptr;
2298 for (i = min; i < max; i++)
2299 {
2300 if (eptr > md->end_subject - length) break;
2301 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2302 #ifdef SUPPORT_UCP
2303 else if (oclength == 0) break;
2304 else
2305 {
2306 if (memcmp(eptr, occhars, oclength) != 0) break;
2307 eptr += oclength;
2308 }
2309 #else /* without SUPPORT_UCP */
2310 else break;
2311 #endif /* SUPPORT_UCP */
2312 }
2313
2314 if (possessive) continue;
2315 for(;;)
2316 {
2317 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2318 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2319 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2320 #ifdef SUPPORT_UCP
2321 eptr--;
2322 BACKCHAR(eptr);
2323 #else /* without SUPPORT_UCP */
2324 eptr -= length;
2325 #endif /* SUPPORT_UCP */
2326 }
2327 }
2328 /* Control never gets here */
2329 }
2330
2331 /* If the length of a UTF-8 character is 1, we fall through here, and
2332 obey the code as for non-UTF-8 characters below, though in this case the
2333 value of fc will always be < 128. */
2334 }
2335 else
2336 #endif /* SUPPORT_UTF8 */
2337
2338 /* When not in UTF-8 mode, load a single-byte character. */
2339 {
2340 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2341 fc = *ecode++;
2342 }
2343
2344 /* The value of fc at this point is always less than 256, though we may or
2345 may not be in UTF-8 mode. The code is duplicated for the caseless and
2346 caseful cases, for speed, since matching characters is likely to be quite
2347 common. First, ensure the minimum number of matches are present. If min =
2348 max, continue at the same level without recursing. Otherwise, if
2349 minimizing, keep trying the rest of the expression and advancing one
2350 matching character if failing, up to the maximum. Alternatively, if
2351 maximizing, find the maximum number of characters and work backwards. */
2352
2353 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2354 max, eptr));
2355
2356 if ((ims & PCRE_CASELESS) != 0)
2357 {
2358 fc = md->lcc[fc];
2359 for (i = 1; i <= min; i++)
2360 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2361 if (min == max) continue;
2362 if (minimize)
2363 {
2364 for (fi = min;; fi++)
2365 {
2366 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2367 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2368 if (fi >= max || eptr >= md->end_subject ||
2369 fc != md->lcc[*eptr++])
2370 RRETURN(MATCH_NOMATCH);
2371 }
2372 /* Control never gets here */
2373 }
2374 else /* Maximize */
2375 {
2376 pp = eptr;
2377 for (i = min; i < max; i++)
2378 {
2379 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2380 eptr++;
2381 }
2382 if (possessive) continue;
2383 while (eptr >= pp)
2384 {
2385 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2386 eptr--;
2387 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2388 }
2389 RRETURN(MATCH_NOMATCH);
2390 }
2391 /* Control never gets here */
2392 }
2393
2394 /* Caseful comparisons (includes all multi-byte characters) */
2395
2396 else
2397 {
2398 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2399 if (min == max) continue;
2400 if (minimize)
2401 {
2402 for (fi = min;; fi++)
2403 {
2404 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2405 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2406 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2407 RRETURN(MATCH_NOMATCH);
2408 }
2409 /* Control never gets here */
2410 }
2411 else /* Maximize */
2412 {
2413 pp = eptr;
2414 for (i = min; i < max; i++)
2415 {
2416 if (eptr >= md->end_subject || fc != *eptr) break;
2417 eptr++;
2418 }
2419 if (possessive) continue;
2420 while (eptr >= pp)
2421 {
2422 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2423 eptr--;
2424 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2425 }
2426 RRETURN(MATCH_NOMATCH);
2427 }
2428 }
2429 /* Control never gets here */
2430
2431 /* Match a negated single one-byte character. The character we are
2432 checking can be multibyte. */
2433
2434 case OP_NOT:
2435 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2436 ecode++;
2437 GETCHARINCTEST(c, eptr);
2438 if ((ims & PCRE_CASELESS) != 0)
2439 {
2440 #ifdef SUPPORT_UTF8
2441 if (c < 256)
2442 #endif
2443 c = md->lcc[c];
2444 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2445 }
2446 else
2447 {
2448 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2449 }
2450 break;
2451
2452 /* Match a negated single one-byte character repeatedly. This is almost a
2453 repeat of the code for a repeated single character, but I haven't found a
2454 nice way of commoning these up that doesn't require a test of the
2455 positive/negative option for each character match. Maybe that wouldn't add
2456 very much to the time taken, but character matching *is* what this is all
2457 about... */
2458
2459 case OP_NOTEXACT:
2460 min = max = GET2(ecode, 1);
2461 ecode += 3;
2462 goto REPEATNOTCHAR;
2463
2464 case OP_NOTUPTO:
2465 case OP_NOTMINUPTO:
2466 min = 0;
2467 max = GET2(ecode, 1);
2468 minimize = *ecode == OP_NOTMINUPTO;
2469 ecode += 3;
2470 goto REPEATNOTCHAR;
2471
2472 case OP_NOTPOSSTAR:
2473 possessive = TRUE;
2474 min = 0;
2475 max = INT_MAX;
2476 ecode++;
2477 goto REPEATNOTCHAR;
2478
2479 case OP_NOTPOSPLUS:
2480 possessive = TRUE;
2481 min = 1;
2482 max = INT_MAX;
2483 ecode++;
2484 goto REPEATNOTCHAR;
2485
2486 case OP_NOTPOSQUERY:
2487 possessive = TRUE;
2488 min = 0;
2489 max = 1;
2490 ecode++;
2491 goto REPEATNOTCHAR;
2492
2493 case OP_NOTPOSUPTO:
2494 possessive = TRUE;
2495 min = 0;
2496 max = GET2(ecode, 1);
2497 ecode += 3;
2498 goto REPEATNOTCHAR;
2499
2500 case OP_NOTSTAR:
2501 case OP_NOTMINSTAR:
2502 case OP_NOTPLUS:
2503 case OP_NOTMINPLUS:
2504 case OP_NOTQUERY:
2505 case OP_NOTMINQUERY:
2506 c = *ecode++ - OP_NOTSTAR;
2507 minimize = (c & 1) != 0;
2508 min = rep_min[c]; /* Pick up values from tables; */
2509 max = rep_max[c]; /* zero for max => infinity */
2510 if (max == 0) max = INT_MAX;
2511
2512 /* Common code for all repeated single-byte matches. We can give up quickly
2513 if there are fewer than the minimum number of bytes left in the
2514 subject. */
2515
2516 REPEATNOTCHAR:
2517 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2518 fc = *ecode++;
2519
2520 /* The code is duplicated for the caseless and caseful cases, for speed,
2521 since matching characters is likely to be quite common. First, ensure the
2522 minimum number of matches are present. If min = max, continue at the same
2523 level without recursing. Otherwise, if minimizing, keep trying the rest of
2524 the expression and advancing one matching character if failing, up to the
2525 maximum. Alternatively, if maximizing, find the maximum number of
2526 characters and work backwards. */
2527
2528 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2529 max, eptr));
2530
2531 if ((ims & PCRE_CASELESS) != 0)
2532 {
2533 fc = md->lcc[fc];
2534
2535 #ifdef SUPPORT_UTF8
2536 /* UTF-8 mode */
2537 if (utf8)
2538 {
2539 register unsigned int d;
2540 for (i = 1; i <= min; i++)
2541 {
2542 GETCHARINC(d, eptr);
2543 if (d < 256) d = md->lcc[d];
2544 if (fc == d) RRETURN(MATCH_NOMATCH);
2545 }
2546 }
2547 else
2548 #endif
2549
2550 /* Not UTF-8 mode */
2551 {
2552 for (i = 1; i <= min; i++)
2553 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2554 }
2555
2556 if (min == max) continue;
2557
2558 if (minimize)
2559 {
2560 #ifdef SUPPORT_UTF8
2561 /* UTF-8 mode */
2562 if (utf8)
2563 {
2564 register unsigned int d;
2565 for (fi = min;; fi++)
2566 {
2567 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2568 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2569 GETCHARINC(d, eptr);
2570 if (d < 256) d = md->lcc[d];
2571 if (fi >= max || eptr >= md->end_subject || fc == d)
2572 RRETURN(MATCH_NOMATCH);
2573 }
2574 }
2575 else
2576 #endif
2577 /* Not UTF-8 mode */
2578 {
2579 for (fi = min;; fi++)
2580 {
2581 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2582 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2583 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2584 RRETURN(MATCH_NOMATCH);
2585 }
2586 }
2587 /* Control never gets here */
2588 }
2589
2590 /* Maximize case */
2591
2592 else
2593 {
2594 pp = eptr;
2595
2596 #ifdef SUPPORT_UTF8
2597 /* UTF-8 mode */
2598 if (utf8)
2599 {
2600 register unsigned int d;
2601 for (i = min; i < max; i++)
2602 {
2603 int len = 1;
2604 if (eptr >= md->end_subject) break;
2605 GETCHARLEN(d, eptr, len);
2606 if (d < 256) d = md->lcc[d];
2607 if (fc == d) break;
2608 eptr += len;
2609 }
2610 if (possessive) continue;
2611 for(;;)
2612 {
2613 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2615 if (eptr-- == pp) break; /* Stop if tried at original pos */
2616 BACKCHAR(eptr);
2617 }
2618 }
2619 else
2620 #endif
2621 /* Not UTF-8 mode */
2622 {
2623 for (i = min; i < max; i++)
2624 {
2625 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2626 eptr++;
2627 }
2628 if (possessive) continue;
2629 while (eptr >= pp)
2630 {
2631 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2633 eptr--;
2634 }
2635 }
2636
2637 RRETURN(MATCH_NOMATCH);
2638 }
2639 /* Control never gets here */
2640 }
2641
2642 /* Caseful comparisons */
2643
2644 else
2645 {
2646 #ifdef SUPPORT_UTF8
2647 /* UTF-8 mode */
2648 if (utf8)
2649 {
2650 register unsigned int d;
2651 for (i = 1; i <= min; i++)
2652 {
2653 GETCHARINC(d, eptr);
2654 if (fc == d) RRETURN(MATCH_NOMATCH);
2655 }
2656 }
2657 else
2658 #endif
2659 /* Not UTF-8 mode */
2660 {
2661 for (i = 1; i <= min; i++)
2662 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2663 }
2664
2665 if (min == max) continue;
2666
2667 if (minimize)
2668 {
2669 #ifdef SUPPORT_UTF8
2670 /* UTF-8 mode */
2671 if (utf8)
2672 {
2673 register unsigned int d;
2674 for (fi = min;; fi++)
2675 {
2676 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2678 GETCHARINC(d, eptr);
2679 if (fi >= max || eptr >= md->end_subject || fc == d)
2680 RRETURN(MATCH_NOMATCH);
2681 }
2682 }
2683 else
2684 #endif
2685 /* Not UTF-8 mode */
2686 {
2687 for (fi = min;; fi++)
2688 {
2689 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2690 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2691 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2692 RRETURN(MATCH_NOMATCH);
2693 }
2694 }
2695 /* Control never gets here */
2696 }
2697
2698 /* Maximize case */
2699
2700 else
2701 {
2702 pp = eptr;
2703
2704 #ifdef SUPPORT_UTF8
2705 /* UTF-8 mode */
2706 if (utf8)
2707 {
2708 register unsigned int d;
2709 for (i = min; i < max; i++)
2710 {
2711 int len = 1;
2712 if (eptr >= md->end_subject) break;
2713 GETCHARLEN(d, eptr, len);
2714 if (fc == d) break;
2715 eptr += len;
2716 }
2717 if (possessive) continue;
2718 for(;;)
2719 {
2720 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2721 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722 if (eptr-- == pp) break; /* Stop if tried at original pos */
2723 BACKCHAR(eptr);
2724 }
2725 }
2726 else
2727 #endif
2728 /* Not UTF-8 mode */
2729 {
2730 for (i = min; i < max; i++)
2731 {
2732 if (eptr >= md->end_subject || fc == *eptr) break;
2733 eptr++;
2734 }
2735 if (possessive) continue;
2736 while (eptr >= pp)
2737 {
2738 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2739 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2740 eptr--;
2741 }
2742 }
2743
2744 RRETURN(MATCH_NOMATCH);
2745 }
2746 }
2747 /* Control never gets here */
2748
2749 /* Match a single character type repeatedly; several different opcodes
2750 share code. This is very similar to the code for single characters, but we
2751 repeat it in the interests of efficiency. */
2752
2753 case OP_TYPEEXACT:
2754 min = max = GET2(ecode, 1);
2755 minimize = TRUE;
2756 ecode += 3;
2757 goto REPEATTYPE;
2758
2759 case OP_TYPEUPTO:
2760 case OP_TYPEMINUPTO:
2761 min = 0;
2762 max = GET2(ecode, 1);
2763 minimize = *ecode == OP_TYPEMINUPTO;
2764 ecode += 3;
2765 goto REPEATTYPE;
2766
2767 case OP_TYPEPOSSTAR:
2768 possessive = TRUE;
2769 min = 0;
2770 max = INT_MAX;
2771 ecode++;
2772 goto REPEATTYPE;
2773
2774 case OP_TYPEPOSPLUS:
2775 possessive = TRUE;
2776 min = 1;
2777 max = INT_MAX;
2778 ecode++;
2779 goto REPEATTYPE;
2780
2781 case OP_TYPEPOSQUERY:
2782 possessive = TRUE;
2783 min = 0;
2784 max = 1;
2785 ecode++;
2786 goto REPEATTYPE;
2787
2788 case OP_TYPEPOSUPTO:
2789 possessive = TRUE;
2790 min = 0;
2791 max = GET2(ecode, 1);
2792 ecode += 3;
2793 goto REPEATTYPE;
2794
2795 case OP_TYPESTAR:
2796 case OP_TYPEMINSTAR:
2797 case OP_TYPEPLUS:
2798 case OP_TYPEMINPLUS:
2799 case OP_TYPEQUERY:
2800 case OP_TYPEMINQUERY:
2801 c = *ecode++ - OP_TYPESTAR;
2802 minimize = (c & 1) != 0;
2803 min = rep_min[c]; /* Pick up values from tables; */
2804 max = rep_max[c]; /* zero for max => infinity */
2805 if (max == 0) max = INT_MAX;
2806
2807 /* Common code for all repeated single character type matches. Note that
2808 in UTF-8 mode, '.' matches a character of any length, but for the other
2809 character types, the valid characters are all one-byte long. */
2810
2811 REPEATTYPE:
2812 ctype = *ecode++; /* Code for the character type */
2813
2814 #ifdef SUPPORT_UCP
2815 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2816 {
2817 prop_fail_result = ctype == OP_NOTPROP;
2818 prop_type = *ecode++;
2819 prop_value = *ecode++;
2820 }
2821 else prop_type = -1;
2822 #endif
2823
2824 /* First, ensure the minimum number of matches are present. Use inline
2825 code for maximizing the speed, and do the type test once at the start
2826 (i.e. keep it out of the loop). Also we can test that there are at least
2827 the minimum number of bytes before we start. This isn't as effective in
2828 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2829 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2830 and single-bytes. */
2831
2832 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2833 if (min > 0)
2834 {
2835 #ifdef SUPPORT_UCP
2836 if (prop_type >= 0)
2837 {
2838 switch(prop_type)
2839 {
2840 case PT_ANY:
2841 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2842 for (i = 1; i <= min; i++)
2843 {
2844 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2845 GETCHARINCTEST(c, eptr);
2846 }
2847 break;
2848
2849 case PT_LAMP:
2850 for (i = 1; i <= min; i++)
2851 {
2852 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2853 GETCHARINCTEST(c, eptr);
2854 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2855 if ((prop_chartype == ucp_Lu ||
2856 prop_chartype == ucp_Ll ||
2857 prop_chartype == ucp_Lt) == prop_fail_result)
2858 RRETURN(MATCH_NOMATCH);
2859 }
2860 break;
2861
2862 case PT_GC:
2863 for (i = 1; i <= min; i++)
2864 {
2865 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2866 GETCHARINCTEST(c, eptr);
2867 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2868 if ((prop_category == prop_value) == prop_fail_result)
2869 RRETURN(MATCH_NOMATCH);
2870 }
2871 break;
2872
2873 case PT_PC:
2874 for (i = 1; i <= min; i++)
2875 {
2876 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2877 GETCHARINCTEST(c, eptr);
2878 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2879 if ((prop_chartype == prop_value) == prop_fail_result)
2880 RRETURN(MATCH_NOMATCH);
2881 }
2882 break;
2883
2884 case PT_SC:
2885 for (i = 1; i <= min; i++)
2886 {
2887 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2888 GETCHARINCTEST(c, eptr);
2889 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2890 if ((prop_script == prop_value) == prop_fail_result)
2891 RRETURN(MATCH_NOMATCH);
2892 }
2893 break;
2894
2895 default:
2896 RRETURN(PCRE_ERROR_INTERNAL);
2897 }
2898 }
2899
2900 /* Match extended Unicode sequences. We will get here only if the
2901 support is in the binary; otherwise a compile-time error occurs. */
2902
2903 else if (ctype == OP_EXTUNI)
2904 {
2905 for (i = 1; i <= min; i++)
2906 {
2907 GETCHARINCTEST(c, eptr);
2908 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2909 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2910 while (eptr < md->end_subject)
2911 {
2912 int len = 1;
2913 if (!utf8) c = *eptr; else
2914 {
2915 GETCHARLEN(c, eptr, len);
2916 }
2917 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2918 if (prop_category != ucp_M) break;
2919 eptr += len;
2920 }
2921 }
2922 }
2923
2924 else
2925 #endif /* SUPPORT_UCP */
2926
2927 /* Handle all other cases when the coding is UTF-8 */
2928
2929 #ifdef SUPPORT_UTF8
2930 if (utf8) switch(ctype)
2931 {
2932 case OP_ANY:
2933 for (i = 1; i <= min; i++)
2934 {
2935 if (eptr >= md->end_subject ||
2936 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2937 RRETURN(MATCH_NOMATCH);
2938 eptr++;
2939 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2940 }
2941 break;
2942
2943 case OP_ANYBYTE:
2944 eptr += min;
2945 break;
2946
2947 case OP_ANYNL:
2948 for (i = 1; i <= min; i++)
2949 {
2950 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951 GETCHARINC(c, eptr);
2952 switch(c)
2953 {
2954 default: RRETURN(MATCH_NOMATCH);
2955 case 0x000d:
2956 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2957 break;
2958 case 0x000a:
2959 case 0x000b:
2960 case 0x000c:
2961 case 0x0085:
2962 case 0x2028:
2963 case 0x2029:
2964 break;
2965 }
2966 }
2967 break;
2968
2969 case OP_NOT_HSPACE:
2970 for (i = 1; i <= min; i++)
2971 {
2972 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2973 GETCHARINC(c, eptr);
2974 switch(c)
2975 {
2976 default: break;
2977 case 0x09: /* HT */
2978 case 0x20: /* SPACE */
2979 case 0xa0: /* NBSP */
2980 case 0x1680: /* OGHAM SPACE MARK */
2981 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2982 case 0x2000: /* EN QUAD */
2983 case 0x2001: /* EM QUAD */
2984 case 0x2002: /* EN SPACE */
2985 case 0x2003: /* EM SPACE */
2986 case 0x2004: /* THREE-PER-EM SPACE */
2987 case 0x2005: /* FOUR-PER-EM SPACE */
2988 case 0x2006: /* SIX-PER-EM SPACE */
2989 case 0x2007: /* FIGURE SPACE */
2990 case 0x2008: /* PUNCTUATION SPACE */
2991 case 0x2009: /* THIN SPACE */
2992 case 0x200A: /* HAIR SPACE */
2993 case 0x202f: /* NARROW NO-BREAK SPACE */
2994 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2995 case 0x3000: /* IDEOGRAPHIC SPACE */
2996 RRETURN(MATCH_NOMATCH);
2997 }
2998 }
2999 break;
3000
3001 case OP_HSPACE:
3002 for (i = 1; i <= min; i++)
3003 {
3004 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3005 GETCHARINC(c, eptr);
3006 switch(c)
3007 {
3008 default: RRETURN(MATCH_NOMATCH);
3009 case 0x09: /* HT */
3010 case 0x20: /* SPACE */
3011 case 0xa0: /* NBSP */
3012 case 0x1680: /* OGHAM SPACE MARK */
3013 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3014 case 0x2000: /* EN QUAD */
3015 case 0x2001: /* EM QUAD */
3016 case 0x2002: /* EN SPACE */
3017 case 0x2003: /* EM SPACE */
3018 case 0x2004: /* THREE-PER-EM SPACE */
3019 case 0x2005: /* FOUR-PER-EM SPACE */
3020 case 0x2006: /* SIX-PER-EM SPACE */
3021 case 0x2007: /* FIGURE SPACE */
3022 case 0x2008: /* PUNCTUATION SPACE */
3023 case 0x2009: /* THIN SPACE */
3024 case 0x200A: /* HAIR SPACE */
3025 case 0x202f: /* NARROW NO-BREAK SPACE */
3026 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3027 case 0x3000: /* IDEOGRAPHIC SPACE */
3028 break;
3029 }
3030 }
3031 break;
3032
3033 case OP_NOT_VSPACE:
3034 for (i = 1; i <= min; i++)
3035 {
3036 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3037 GETCHARINC(c, eptr);
3038 switch(c)
3039 {
3040 default: break;
3041 case 0x0a: /* LF */
3042 case 0x0b: /* VT */
3043 case 0x0c: /* FF */
3044 case 0x0d: /* CR */
3045 case 0x85: /* NEL */
3046 case 0x2028: /* LINE SEPARATOR */
3047 case 0x2029: /* PARAGRAPH SEPARATOR */
3048 RRETURN(MATCH_NOMATCH);
3049 }
3050 }
3051 break;
3052
3053 case OP_VSPACE:
3054 for (i = 1; i <= min; i++)
3055 {
3056 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3057 GETCHARINC(c, eptr);
3058 switch(c)
3059 {
3060 default: RRETURN(MATCH_NOMATCH);
3061 case 0x0a: /* LF */
3062 case 0x0b: /* VT */
3063 case 0x0c: /* FF */
3064 case 0x0d: /* CR */
3065 case 0x85: /* NEL */
3066 case 0x2028: /* LINE SEPARATOR */
3067 case 0x2029: /* PARAGRAPH SEPARATOR */
3068 break;
3069 }
3070 }
3071 break;
3072
3073 case OP_NOT_DIGIT:
3074 for (i = 1; i <= min; i++)
3075 {
3076 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3077 GETCHARINC(c, eptr);
3078 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3079 RRETURN(MATCH_NOMATCH);
3080 }
3081 break;
3082
3083 case OP_DIGIT:
3084 for (i = 1; i <= min; i++)
3085 {
3086 if (eptr >= md->end_subject ||
3087 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3088 RRETURN(MATCH_NOMATCH);
3089 /* No need to skip more bytes - we know it's a 1-byte character */
3090 }
3091 break;
3092
3093 case OP_NOT_WHITESPACE:
3094 for (i = 1; i <= min; i++)
3095 {
3096 if (eptr >= md->end_subject ||
3097 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
3098 RRETURN(MATCH_NOMATCH);
3099 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3100 }
3101 break;
3102
3103 case OP_WHITESPACE:
3104 for (i = 1; i <= min; i++)
3105 {
3106 if (eptr >= md->end_subject ||
3107 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3108 RRETURN(MATCH_NOMATCH);
3109 /* No need to skip more bytes - we know it's a 1-byte character */
3110 }
3111 break;
3112
3113 case OP_NOT_WORDCHAR:
3114 for (i = 1; i <= min; i++)
3115 {
3116 if (eptr >= md->end_subject ||
3117 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
3118 RRETURN(MATCH_NOMATCH);
3119 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3120 }
3121 break;
3122
3123 case OP_WORDCHAR:
3124 for (i = 1; i <= min; i++)
3125 {
3126 if (eptr >= md->end_subject ||
3127 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3128 RRETURN(MATCH_NOMATCH);
3129 /* No need to skip more bytes - we know it's a 1-byte character */
3130 }
3131 break;
3132
3133 default:
3134 RRETURN(PCRE_ERROR_INTERNAL);
3135 } /* End switch(ctype) */
3136
3137 else
3138 #endif /* SUPPORT_UTF8 */
3139
3140 /* Code for the non-UTF-8 case for minimum matching of operators other
3141 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3142 number of bytes present, as this was tested above. */
3143
3144 switch(ctype)
3145 {
3146 case OP_ANY:
3147 if ((ims & PCRE_DOTALL) == 0)
3148 {
3149 for (i = 1; i <= min; i++)
3150 {
3151 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3152 eptr++;
3153 }
3154 }
3155 else eptr += min;
3156 break;
3157
3158 case OP_ANYBYTE:
3159 eptr += min;
3160 break;
3161
3162 /* Because of the CRLF case, we can't assume the minimum number of
3163 bytes are present in this case. */
3164
3165 case OP_ANYNL:
3166 for (i = 1; i <= min; i++)
3167 {
3168 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3169 switch(*eptr++)
3170 {
3171 default: RRETURN(MATCH_NOMATCH);
3172 case 0x000d:
3173 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3174 break;
3175 case 0x000a:
3176 case 0x000b:
3177 case 0x000c:
3178 case 0x0085:
3179 break;
3180 }
3181 }
3182 break;
3183
3184 case OP_NOT_HSPACE:
3185 for (i = 1; i <= min; i++)
3186 {
3187 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3188 switch(*eptr++)
3189 {
3190 default: break;
3191 case 0x09: /* HT */
3192 case 0x20: /* SPACE */
3193 case 0xa0: /* NBSP */
3194 RRETURN(MATCH_NOMATCH);
3195 }
3196 }
3197 break;
3198
3199 case OP_HSPACE:
3200 for (i = 1; i <= min; i++)
3201 {
3202 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3203 switch(*eptr++)
3204 {
3205 default: RRETURN(MATCH_NOMATCH);
3206 case 0x09: /* HT */
3207 case 0x20: /* SPACE */
3208 case 0xa0: /* NBSP */
3209 break;
3210 }
3211 }
3212 break;
3213
3214 case OP_NOT_VSPACE:
3215 for (i = 1; i <= min; i++)
3216 {
3217 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3218 switch(*eptr++)
3219 {
3220 default: break;
3221 case 0x0a: /* LF */
3222 case 0x0b: /* VT */
3223 case 0x0c: /* FF */
3224 case 0x0d: /* CR */
3225 case 0x85: /* NEL */
3226 RRETURN(MATCH_NOMATCH);
3227 }
3228 }
3229 break;
3230
3231 case OP_VSPACE:
3232 for (i = 1; i <= min; i++)
3233 {
3234 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3235 switch(*eptr++)
3236 {
3237 default: RRETURN(MATCH_NOMATCH);
3238 case 0x0a: /* LF */
3239 case 0x0b: /* VT */
3240 case 0x0c: /* FF */
3241 case 0x0d: /* CR */
3242 case 0x85: /* NEL */
3243 break;
3244 }
3245 }
3246 break;
3247
3248 case OP_NOT_DIGIT:
3249 for (i = 1; i <= min; i++)
3250 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3251 break;
3252
3253 case OP_DIGIT:
3254 for (i = 1; i <= min; i++)
3255 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3256 break;
3257
3258 case OP_NOT_WHITESPACE:
3259 for (i = 1; i <= min; i++)
3260 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3261 break;
3262
3263 case OP_WHITESPACE:
3264 for (i = 1; i <= min; i++)
3265 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3266 break;
3267
3268 case OP_NOT_WORDCHAR:
3269 for (i = 1; i <= min; i++)
3270 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3271 RRETURN(MATCH_NOMATCH);
3272 break;
3273
3274 case OP_WORDCHAR:
3275 for (i = 1; i <= min; i++)
3276 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3277 RRETURN(MATCH_NOMATCH);
3278 break;
3279
3280 default:
3281 RRETURN(PCRE_ERROR_INTERNAL);
3282 }
3283 }
3284
3285 /* If min = max, continue at the same level without recursing */
3286
3287 if (min == max) continue;
3288
3289 /* If minimizing, we have to test the rest of the pattern before each
3290 subsequent match. Again, separate the UTF-8 case for speed, and also
3291 separate the UCP cases. */
3292
3293 if (minimize)
3294 {
3295 #ifdef SUPPORT_UCP
3296 if (prop_type >= 0)
3297 {
3298 switch(prop_type)
3299 {
3300 case PT_ANY:
3301 for (fi = min;; fi++)
3302 {
3303 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3304 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3305 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3306 GETCHARINC(c, eptr);
3307 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3308 }
3309 /* Control never gets here */
3310
3311 case PT_LAMP:
3312 for (fi = min;; fi++)
3313 {
3314 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3315 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3316 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3317 GETCHARINC(c, eptr);
3318 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3319 if ((prop_chartype == ucp_Lu ||
3320 prop_chartype == ucp_Ll ||
3321 prop_chartype == ucp_Lt) == prop_fail_result)
3322 RRETURN(MATCH_NOMATCH);
3323 }
3324 /* Control never gets here */
3325
3326 case PT_GC:
3327 for (fi = min;; fi++)
3328 {
3329 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3330 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3331 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3332 GETCHARINC(c, eptr);
3333 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3334 if ((prop_category == prop_value) == prop_fail_result)
3335 RRETURN(MATCH_NOMATCH);
3336 }
3337 /* Control never gets here */
3338
3339 case PT_PC:
3340 for (fi = min;; fi++)
3341 {
3342 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3343 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3344 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3345 GETCHARINC(c, eptr);
3346 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3347 if ((prop_chartype == prop_value) == prop_fail_result)
3348 RRETURN(MATCH_NOMATCH);
3349 }
3350 /* Control never gets here */
3351
3352 case PT_SC:
3353 for (fi = min;; fi++)
3354 {
3355 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3356 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3357 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3358 GETCHARINC(c, eptr);
3359 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3360 if ((prop_script == prop_value) == prop_fail_result)
3361 RRETURN(MATCH_NOMATCH);
3362 }
3363 /* Control never gets here */
3364
3365 default:
3366 RRETURN(PCRE_ERROR_INTERNAL);
3367 }
3368 }
3369
3370 /* Match extended Unicode sequences. We will get here only if the
3371 support is in the binary; otherwise a compile-time error occurs. */
3372
3373 else if (ctype == OP_EXTUNI)
3374 {
3375 for (fi = min;; fi++)
3376 {
3377 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3378 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3379 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3380 GETCHARINCTEST(c, eptr);
3381 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3382 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3383 while (eptr < md->end_subject)
3384 {
3385 int len = 1;
3386 if (!utf8) c = *eptr; else
3387 {
3388 GETCHARLEN(c, eptr, len);
3389 }
3390 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3391 if (prop_category != ucp_M) break;
3392 eptr += len;
3393 }
3394 }
3395 }
3396
3397 else
3398 #endif /* SUPPORT_UCP */
3399
3400 #ifdef SUPPORT_UTF8
3401 /* UTF-8 mode */
3402 if (utf8)
3403 {
3404 for (fi = min;; fi++)
3405 {
3406 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3407 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3408 if (fi >= max || eptr >= md->end_subject ||
3409 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3410 IS_NEWLINE(eptr)))
3411 RRETURN(MATCH_NOMATCH);
3412
3413 GETCHARINC(c, eptr);
3414 switch(ctype)
3415 {
3416 case OP_ANY: /* This is the DOTALL case */
3417 break;
3418
3419 case OP_ANYBYTE:
3420 break;
3421
3422 case OP_ANYNL:
3423 switch(c)
3424 {
3425 default: RRETURN(MATCH_NOMATCH);
3426 case 0x000d:
3427 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3428 break;
3429 case 0x000a:
3430 case 0x000b:
3431 case 0x000c:
3432 case 0x0085:
3433 case 0x2028:
3434 case 0x2029:
3435 break;
3436 }
3437 break;
3438
3439 case OP_NOT_HSPACE:
3440 switch(c)
3441 {
3442 default: break;
3443 case 0x09: /* HT */
3444 case 0x20: /* SPACE */
3445 case 0xa0: /* NBSP */
3446 case 0x1680: /* OGHAM SPACE MARK */
3447 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3448 case 0x2000: /* EN QUAD */
3449 case 0x2001: /* EM QUAD */
3450 case 0x2002: /* EN SPACE */
3451 case 0x2003: /* EM SPACE */
3452 case 0x2004: /* THREE-PER-EM SPACE */
3453 case 0x2005: /* FOUR-PER-EM SPACE */
3454 case 0x2006: /* SIX-PER-EM SPACE */
3455 case 0x2007: /* FIGURE SPACE */
3456 case 0x2008: /* PUNCTUATION SPACE */
3457 case 0x2009: /* THIN SPACE */
3458 case 0x200A: /* HAIR SPACE */
3459 case 0x202f: /* NARROW NO-BREAK SPACE */
3460 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3461 case 0x3000: /* IDEOGRAPHIC SPACE */
3462 RRETURN(MATCH_NOMATCH);
3463 }
3464 break;
3465
3466 case OP_HSPACE:
3467 switch(c)
3468 {
3469 default: RRETURN(MATCH_NOMATCH);
3470 case 0x09: /* HT */
3471 case 0x20: /* SPACE */
3472 case 0xa0: /* NBSP */
3473 case 0x1680: /* OGHAM SPACE MARK */
3474 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3475 case 0x2000: /* EN QUAD */
3476 case 0x2001: /* EM QUAD */
3477 case 0x2002: /* EN SPACE */
3478 case 0x2003: /* EM SPACE */
3479 case 0x2004: /* THREE-PER-EM SPACE */
3480 case 0x2005: /* FOUR-PER-EM SPACE */
3481 case 0x2006: /* SIX-PER-EM SPACE */
3482 case 0x2007: /* FIGURE SPACE */
3483 case 0x2008: /* PUNCTUATION SPACE */
3484 case 0x2009: /* THIN SPACE */
3485 case 0x200A: /* HAIR SPACE */
3486 case 0x202f: /* NARROW NO-BREAK SPACE */
3487 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3488 case 0x3000: /* IDEOGRAPHIC SPACE */
3489 break;
3490 }
3491 break;
3492
3493 case OP_NOT_VSPACE:
3494 switch(c)
3495 {
3496 default: break;
3497 case 0x0a: /* LF */
3498 case 0x0b: /* VT */
3499 case 0x0c: /* FF */
3500 case 0x0d: /* CR */
3501 case 0x85: /* NEL */
3502 case 0x2028: /* LINE SEPARATOR */
3503 case 0x2029: /* PARAGRAPH SEPARATOR */
3504 RRETURN(MATCH_NOMATCH);
3505 }
3506 break;
3507
3508 case OP_VSPACE:
3509 switch(c)
3510 {
3511 default: RRETURN(MATCH_NOMATCH);
3512 case 0x0a: /* LF */
3513 case 0x0b: /* VT */
3514 case 0x0c: /* FF */
3515 case 0x0d: /* CR */
3516 case 0x85: /* NEL */
3517 case 0x2028: /* LINE SEPARATOR */
3518 case 0x2029: /* PARAGRAPH SEPARATOR */
3519 break;
3520 }
3521 break;
3522
3523 case OP_NOT_DIGIT:
3524 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3525 RRETURN(MATCH_NOMATCH);
3526 break;
3527
3528 case OP_DIGIT:
3529 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3530 RRETURN(MATCH_NOMATCH);
3531 break;
3532
3533 case OP_NOT_WHITESPACE:
3534 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3535 RRETURN(MATCH_NOMATCH);
3536 break;
3537
3538 case OP_WHITESPACE:
3539 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3540 RRETURN(MATCH_NOMATCH);
3541 break;
3542
3543 case OP_NOT_WORDCHAR:
3544 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3545 RRETURN(MATCH_NOMATCH);
3546 break;
3547
3548 case OP_WORDCHAR:
3549 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3550 RRETURN(MATCH_NOMATCH);
3551 break;
3552
3553 default:
3554 RRETURN(PCRE_ERROR_INTERNAL);
3555 }
3556 }
3557 }
3558 else
3559 #endif
3560 /* Not UTF-8 mode */
3561 {
3562 for (fi = min;; fi++)
3563 {
3564 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3565 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3566 if (fi >= max || eptr >= md->end_subject ||
3567 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3568 RRETURN(MATCH_NOMATCH);
3569
3570 c = *eptr++;
3571 switch(ctype)
3572 {
3573 case OP_ANY: /* This is the DOTALL case */
3574 break;
3575
3576 case OP_ANYBYTE:
3577 break;
3578
3579 case OP_ANYNL:
3580 switch(c)
3581 {
3582 default: RRETURN(MATCH_NOMATCH);
3583 case 0x000d:
3584 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3585 break;
3586 case 0x000a:
3587 case 0x000b:
3588 case 0x000c:
3589 case 0x0085:
3590 break;
3591 }
3592 break;
3593
3594 case OP_NOT_HSPACE:
3595 switch(c)
3596 {
3597 default: break;
3598 case 0x09: /* HT */
3599 case 0x20: /* SPACE */
3600 case 0xa0: /* NBSP */
3601 RRETURN(MATCH_NOMATCH);
3602 }
3603 break;
3604
3605 case OP_HSPACE:
3606 switch(c)
3607 {
3608 default: RRETURN(MATCH_NOMATCH);
3609 case 0x09: /* HT */
3610 case 0x20: /* SPACE */
3611 case 0xa0: /* NBSP */
3612 break;
3613 }
3614 break;
3615
3616 case OP_NOT_VSPACE:
3617 switch(c)
3618 {
3619 default: break;
3620 case 0x0a: /* LF */
3621 case 0x0b: /* VT */
3622 case 0x0c: /* FF */
3623 case 0x0d: /* CR */
3624 case 0x85: /* NEL */
3625 RRETURN(MATCH_NOMATCH);
3626 }
3627 break;
3628
3629 case OP_VSPACE:
3630 switch(c)
3631 {
3632 default: RRETURN(MATCH_NOMATCH);
3633 case 0x0a: /* LF */
3634 case 0x0b: /* VT */
3635 case 0x0c: /* FF */
3636 case 0x0d: /* CR */
3637 case 0x85: /* NEL */
3638 break;
3639 }
3640 break;
3641
3642 case OP_NOT_DIGIT:
3643 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3644 break;
3645
3646 case OP_DIGIT:
3647 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3648 break;
3649
3650 case OP_NOT_WHITESPACE:
3651 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3652 break;
3653
3654 case OP_WHITESPACE:
3655 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3656 break;
3657
3658 case OP_NOT_WORDCHAR:
3659 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3660 break;
3661
3662 case OP_WORDCHAR:
3663 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3664 break;
3665
3666 default:
3667 RRETURN(PCRE_ERROR_INTERNAL);
3668 }
3669 }
3670 }
3671 /* Control never gets here */
3672 }
3673
3674 /* If maximizing, it is worth using inline code for speed, doing the type
3675 test once at the start (i.e. keep it out of the loop). Again, keep the
3676 UTF-8 and UCP stuff separate. */
3677
3678 else
3679 {
3680 pp = eptr; /* Remember where we started */
3681
3682 #ifdef SUPPORT_UCP
3683 if (prop_type >= 0)
3684 {
3685 switch(prop_type)
3686 {
3687 case PT_ANY:
3688 for (i = min; i < max; i++)
3689 {
3690 int len = 1;
3691 if (eptr >= md->end_subject) break;
3692 GETCHARLEN(c, eptr, len);
3693 if (prop_fail_result) break;
3694 eptr+= len;
3695 }
3696 break;
3697
3698 case PT_LAMP:
3699 for (i = min; i < max; i++)
3700 {
3701 int len = 1;
3702 if (eptr >= md->end_subject) break;
3703 GETCHARLEN(c, eptr, len);
3704 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3705 if ((prop_chartype == ucp_Lu ||
3706 prop_chartype == ucp_Ll ||
3707 prop_chartype == ucp_Lt) == prop_fail_result)
3708 break;
3709 eptr+= len;
3710 }
3711 break;
3712
3713 case PT_GC:
3714 for (i = min; i < max; i++)
3715 {
3716 int len = 1;
3717 if (eptr >= md->end_subject) break;
3718 GETCHARLEN(c, eptr, len);
3719 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3720 if ((prop_category == prop_value) == prop_fail_result)
3721 break;
3722 eptr+= len;
3723 }
3724 break;
3725
3726 case PT_PC:
3727 for (i = min; i < max; i++)
3728 {
3729 int len = 1;
3730 if (eptr >= md->end_subject) break;
3731 GETCHARLEN(c, eptr, len);
3732 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3733 if ((prop_chartype == prop_value) == prop_fail_result)
3734 break;
3735 eptr+= len;
3736 }
3737 break;
3738
3739 case PT_SC:
3740 for (i = min; i < max; i++)
3741 {
3742 int len = 1;
3743 if (eptr >= md->end_subject) break;
3744 GETCHARLEN(c, eptr, len);
3745 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3746 if ((prop_script == prop_value) == prop_fail_result)
3747 break;
3748 eptr+= len;
3749 }
3750 break;
3751 }
3752
3753 /* eptr is now past the end of the maximum run */
3754
3755 if (possessive) continue;
3756 for(;;)
3757 {
3758 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3760 if (eptr-- == pp) break; /* Stop if tried at original pos */
3761 if (utf8) BACKCHAR(eptr);
3762 }
3763 }
3764
3765 /* Match extended Unicode sequences. We will get here only if the
3766 support is in the binary; otherwise a compile-time error occurs. */
3767
3768 else if (ctype == OP_EXTUNI)
3769 {
3770 for (i = min; i < max; i++)
3771 {
3772 if (eptr >= md->end_subject) break;
3773 GETCHARINCTEST(c, eptr);
3774 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3775 if (prop_category == ucp_M) break;
3776 while (eptr < md->end_subject)
3777 {
3778 int len = 1;
3779 if (!utf8) c = *eptr; else
3780 {
3781 GETCHARLEN(c, eptr, len);
3782 }
3783 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3784 if (prop_category != ucp_M) break;
3785 eptr += len;
3786 }
3787 }
3788
3789 /* eptr is now past the end of the maximum run */
3790
3791 if (possessive) continue;
3792 for(;;)
3793 {
3794 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796 if (eptr-- == pp) break; /* Stop if tried at original pos */
3797 for (;;) /* Move back over one extended */
3798 {
3799 int len = 1;
3800 if (!utf8) c = *eptr; else
3801 {
3802 BACKCHAR(eptr);
3803 GETCHARLEN(c, eptr, len);
3804 }
3805 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3806 if (prop_category != ucp_M) break;
3807 eptr--;
3808 }
3809 }
3810 }
3811
3812 else
3813 #endif /* SUPPORT_UCP */
3814
3815 #ifdef SUPPORT_UTF8
3816 /* UTF-8 mode */
3817
3818 if (utf8)
3819 {
3820 switch(ctype)
3821 {
3822 case OP_ANY:
3823 if (max < INT_MAX)
3824 {
3825 if ((ims & PCRE_DOTALL) == 0)
3826 {
3827 for (i = min; i < max; i++)
3828 {
3829 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3830 eptr++;
3831 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3832 }
3833 }
3834 else
3835 {
3836 for (i = min; i < max; i++)
3837 {
3838 if (eptr >= md->end_subject) break;
3839 eptr++;
3840 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3841 }
3842 }
3843 }
3844
3845 /* Handle unlimited UTF-8 repeat */
3846
3847 else
3848 {
3849 if ((ims & PCRE_DOTALL) == 0)
3850 {
3851 for (i = min; i < max; i++)
3852 {
3853 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3854 eptr++;
3855 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3856 }
3857 }
3858 else
3859 {
3860 eptr = md->end_subject;
3861 }
3862 }
3863 break;
3864
3865 /* The byte case is the same as non-UTF8 */
3866
3867 case OP_ANYBYTE:
3868 c = max - min;
3869 if (c > (unsigned int)(md->end_subject - eptr))
3870 c = md->end_subject - eptr;
3871 eptr += c;
3872 break;
3873
3874 case OP_ANYNL:
3875 for (i = min; i < max; i++)
3876 {
3877 int len = 1;
3878 if (eptr >= md->end_subject) break;
3879 GETCHARLEN(c, eptr, len);
3880 if (c == 0x000d)
3881 {
3882 if (++eptr >= md->end_subject) break;
3883 if (*eptr == 0x000a) eptr++;
3884 }
3885 else
3886 {
3887 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3888 c != 0x0085 && c != 0x2028 && c != 0x2029)
3889 break;
3890 eptr += len;
3891 }
3892 }
3893 break;
3894
3895 case OP_NOT_HSPACE:
3896 case OP_HSPACE:
3897 for (i = min; i < max; i++)
3898 {
3899 BOOL gotspace;
3900 int len = 1;
3901 if (eptr >= md->end_subject) break;
3902 GETCHARLEN(c, eptr, len);
3903 switch(c)
3904 {
3905 default: gotspace = FALSE; break;
3906 case 0x09: /* HT */
3907 case 0x20: /* SPACE */
3908 case 0xa0: /* NBSP */
3909 case 0x1680: /* OGHAM SPACE MARK */
3910 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3911 case 0x2000: /* EN QUAD */
3912 case 0x2001: /* EM QUAD */
3913 case 0x2002: /* EN SPACE */
3914 case 0x2003: /* EM SPACE */
3915 case 0x2004: /* THREE-PER-EM SPACE */
3916 case 0x2005: /* FOUR-PER-EM SPACE */
3917 case 0x2006: /* SIX-PER-EM SPACE */
3918 case 0x2007: /* FIGURE SPACE */
3919 case 0x2008: /* PUNCTUATION SPACE */
3920 case 0x2009: /* THIN SPACE */
3921 case 0x200A: /* HAIR SPACE */
3922 case 0x202f: /* NARROW NO-BREAK SPACE */
3923 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3924 case 0x3000: /* IDEOGRAPHIC SPACE */
3925 gotspace = TRUE;
3926 break;
3927 }
3928 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3929 eptr += len;
3930 }
3931 break;
3932
3933 case OP_NOT_VSPACE:
3934 case OP_VSPACE:
3935 for (i = min; i < max; i++)
3936 {
3937 BOOL gotspace;
3938 int len = 1;
3939 if (eptr >= md->end_subject) break;
3940 GETCHARLEN(c, eptr, len);
3941 switch(c)
3942 {
3943 default: gotspace = FALSE; break;
3944 case 0x0a: /* LF */
3945 case 0x0b: /* VT */
3946 case 0x0c: /* FF */
3947 case 0x0d: /* CR */
3948 case 0x85: /* NEL */
3949 case 0x2028: /* LINE SEPARATOR */
3950 case 0x2029: /* PARAGRAPH SEPARATOR */
3951 gotspace = TRUE;
3952 break;
3953 }
3954 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3955 eptr += len;
3956 }
3957 break;
3958
3959 case OP_NOT_DIGIT:
3960 for (i = min; i < max; i++)
3961 {
3962 int len = 1;
3963 if (eptr >= md->end_subject) break;
3964 GETCHARLEN(c, eptr, len);
3965 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3966 eptr+= len;
3967 }
3968 break;
3969
3970 case OP_DIGIT:
3971 for (i = min; i < max; i++)
3972 {
3973 int len = 1;
3974 if (eptr >= md->end_subject) break;
3975 GETCHARLEN(c, eptr, len);
3976 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3977 eptr+= len;
3978 }
3979 break;
3980
3981 case OP_NOT_WHITESPACE:
3982 for (i = min; i < max; i++)
3983 {
3984 int len = 1;
3985 if (eptr >= md->end_subject) break;
3986 GETCHARLEN(c, eptr, len);
3987 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3988 eptr+= len;
3989 }
3990 break;
3991
3992 case OP_WHITESPACE:
3993 for (i = min; i < max; i++)
3994 {
3995 int len = 1;
3996 if (eptr >= md->end_subject) break;
3997 GETCHARLEN(c, eptr, len);
3998 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3999 eptr+= len;
4000 }
4001 break;
4002
4003 case OP_NOT_WORDCHAR:
4004 for (i = min; i < max; i++)
4005 {
4006 int len = 1;
4007 if (eptr >= md->end_subject) break;
4008 GETCHARLEN(c, eptr, len);
4009 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4010 eptr+= len;
4011 }
4012 break;
4013
4014 case OP_WORDCHAR:
4015 for (i = min; i < max; i++)
4016 {
4017 int len = 1;
4018 if (eptr >= md->end_subject) break;
4019 GETCHARLEN(c, eptr, len);
4020 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4021 eptr+= len;
4022 }
4023 break;
4024
4025 default:
4026 RRETURN(PCRE_ERROR_INTERNAL);
4027 }
4028
4029 /* eptr is now past the end of the maximum run */
4030
4031 if (possessive) continue;
4032 for(;;)
4033 {
4034 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4036 if (eptr-- == pp) break; /* Stop if tried at original pos */
4037 BACKCHAR(eptr);
4038 }
4039 }
4040 else
4041 #endif /* SUPPORT_UTF8 */
4042
4043 /* Not UTF-8 mode */
4044 {
4045 switch(ctype)
4046 {
4047 case OP_ANY:
4048 if ((ims & PCRE_DOTALL) == 0)
4049 {
4050 for (i = min; i < max; i++)
4051 {
4052 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4053 eptr++;
4054 }
4055 break;
4056 }
4057 /* For DOTALL case, fall through and treat as \C */
4058
4059 case OP_ANYBYTE:
4060 c = max - min;
4061 if (c > (unsigned int)(md->end_subject - eptr))
4062 c = md->end_subject - eptr;
4063 eptr += c;
4064 break;
4065
4066 case OP_ANYNL:
4067 for (i = min; i < max; i++)
4068 {
4069 if (eptr >= md->end_subject) break;
4070 c = *eptr;
4071 if (c == 0x000d)
4072 {
4073 if (++eptr >= md->end_subject) break;
4074 if (*eptr == 0x000a) eptr++;
4075 }
4076 else
4077 {
4078 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
4079 break;
4080 eptr++;
4081 }
4082 }
4083 break;
4084
4085 case OP_NOT_HSPACE:
4086 for (i = min; i < max; i++)
4087 {
4088 if (eptr >= md->end_subject) break;
4089 c = *eptr;
4090 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4091 eptr++;
4092 }
4093 break;
4094
4095 case OP_HSPACE:
4096 for (i = min; i < max; i++)
4097 {
4098 if (eptr >= md->end_subject) break;
4099 c = *eptr;
4100 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4101 eptr++;
4102 }
4103 break;
4104
4105 case OP_NOT_VSPACE:
4106 for (i = min; i < max; i++)
4107 {
4108 if (eptr >= md->end_subject) break;
4109 c = *eptr;
4110 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4111 break;
4112 eptr++;
4113 }
4114 break;
4115
4116 case OP_VSPACE:
4117 for (i = min; i < max; i++)
4118 {
4119 if (eptr >= md->end_subject) break;
4120 c = *eptr;
4121 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4122 break;
4123 eptr++;
4124 }
4125 break;
4126
4127 case OP_NOT_DIGIT:
4128 for (i = min; i < max; i++)
4129 {
4130 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4131 break;
4132 eptr++;
4133 }
4134 break;
4135
4136 case OP_DIGIT:
4137 for (i = min; i < max; i++)
4138 {
4139 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4140 break;
4141 eptr++;
4142 }
4143 break;
4144
4145 case OP_NOT_WHITESPACE:
4146 for (i = min; i < max; i++)
4147 {
4148 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4149 break;
4150 eptr++;
4151 }
4152 break;
4153
4154 case OP_WHITESPACE:
4155 for (i = min; i < max; i++)
4156 {
4157 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4158 break;
4159 eptr++;
4160 }
4161 break;
4162
4163 case OP_NOT_WORDCHAR:
4164 for (i = min; i < max; i++)
4165 {
4166 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4167 break;
4168 eptr++;
4169 }
4170 break;
4171
4172 case OP_WORDCHAR:
4173 for (i = min; i < max; i++)
4174 {
4175 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4176 break;
4177 eptr++;
4178 }
4179 break;
4180
4181 default:
4182 RRETURN(PCRE_ERROR_INTERNAL);
4183 }
4184
4185 /* eptr is now past the end of the maximum run */
4186
4187 if (possessive) continue;
4188 while (eptr >= pp)
4189 {
4190 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4191 eptr--;
4192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4193 }
4194 }
4195
4196 /* Get here if we can't make it match with any permitted repetitions */
4197
4198 RRETURN(MATCH_NOMATCH);
4199 }
4200 /* Control never gets here */
4201
4202 /* There's been some horrible disaster. Arrival here can only mean there is
4203 something seriously wrong in the code above or the OP_xxx definitions. */
4204
4205 default:
4206 DPRINTF(("Unknown opcode %d\n", *ecode));
4207 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4208 }
4209
4210 /* Do not stick any code in here without much thought; it is assumed
4211 that "continue" in the code above comes out to here to repeat the main
4212 loop. */
4213
4214 } /* End of main loop */
4215 /* Control never reaches here */
4216
4217
4218 /* When compiling to use the heap rather than the stack for recursive calls to
4219 match(), the RRETURN() macro jumps here. The number that is saved in
4220 frame->Xwhere indicates which label we actually want to return to. */
4221
4222 #ifdef NO_RECURSE
4223 #define LBL(val) case val: goto L_RM##val;
4224 HEAP_RETURN:
4225 switch (frame->Xwhere)
4226 {
4227 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4228 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
4229 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
4230 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
4231 LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)
4232 LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47)
4233 default:
4234 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4235 return PCRE_ERROR_INTERNAL;
4236 }
4237 #undef LBL
4238 #endif /* NO_RECURSE */
4239 }
4240
4241
4242 /***************************************************************************
4243 ****************************************************************************
4244 RECURSION IN THE match() FUNCTION
4245
4246 Undefine all the macros that were defined above to handle this. */
4247
4248 #ifdef NO_RECURSE /* macros set up for the heap-based (non-recursive) match() */
4249 #undef eptr
4250 #undef ecode
4251 #undef mstart
4252 #undef offset_top
4253 #undef ims
4254 #undef eptrb
4255 #undef flags
4256
4257 #undef callpat
4258 #undef charptr
4259 #undef data
4260 #undef next
4261 #undef pp
4262 #undef prev
4263 #undef saved_eptr
4264
4265 #undef new_recursive
4266
4267 #undef cur_is_word
4268 #undef condition
4269 #undef prev_is_word
4270
4271 #undef original_ims
4272
4273 #undef ctype
4274 #undef length
4275 #undef max
4276 #undef min
4277 #undef number
4278 #undef offset
4279 #undef op
4280 #undef save_capture_last
4281 #undef save_offset1
4282 #undef save_offset2
4283 #undef save_offset3
4284 #undef stacksave
4285
4286 #undef newptrb
4287
4288 #endif /* NO_RECURSE */
4289
4290 /* fc and fi are macros with or without NO_RECURSE, so always undefine them */
4291
4292 #undef fc
4293 #undef fi
4294
4295 /***************************************************************************
4296 ***************************************************************************/
4297
4298
4299
4300 /*************************************************
4301 * Execute a Regular Expression *
4302 *************************************************/
4303
4304 /* This function applies a compiled re to a subject string and picks out
4305 portions of the string if it matches. Two elements in the vector are set for
4306 each substring: the offsets to the start and end of the substring.
4307
4308 Arguments:
4309 argument_re points to the compiled expression
4310 extra_data points to extra data or is NULL
4311 subject points to the subject string
4312 length length of subject string (may contain binary zeros)
4313 start_offset where to start in the subject string
4314 options option bits
4315 offsets points to a vector of ints to be filled in with offsets
4316 offsetcount the number of elements in the vector
4317
4318 Returns: > 0 => success; value is the number of elements filled in
4319 = 0 => success, but offsets is not big enough
4320 -1 => failed to match
4321 < -1 => some kind of unexpected problem
4322 */
4323
4324 PCRE_EXP_DEFN int
4325 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4326 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4327 int offsetcount)
4328 {
4329 int rc, resetcount, ocount;
4330 int first_byte = -1;
4331 int req_byte = -1;
4332 int req_byte2 = -1;
4333 int newline;
4334 unsigned long int ims;
4335 BOOL using_temporary_offsets = FALSE;
4336 BOOL anchored;
4337 BOOL startline;
4338 BOOL firstline;
4339 BOOL first_byte_caseless = FALSE;
4340 BOOL req_byte_caseless = FALSE;
4341 BOOL utf8;
4342 match_data match_block;
4343 match_data *md = &match_block;
4344 const uschar *tables;
4345 const uschar *start_bits = NULL;
4346 USPTR start_match = (USPTR)subject + start_offset;
4347 USPTR end_subject;
4348 USPTR req_byte_ptr = start_match - 1;
4349
4350 pcre_study_data internal_study;
4351 const pcre_study_data *study;
4352
4353 real_pcre internal_re;
4354 const real_pcre *external_re = (const real_pcre *)argument_re;
4355 const real_pcre *re = external_re;
4356
4357 /* Plausibility checks */
4358
4359 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4360 if (re == NULL || subject == NULL ||
4361 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4362 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4363
4364 /* Fish out the optional data from the extra_data structure, first setting
4365 the default values. */
4366
4367 study = NULL;
4368 md->match_limit = MATCH_LIMIT;
4369 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4370 md->callout_data = NULL;
4371
4372 /* The table pointer is always in native byte order. */
4373
4374 tables = external_re->tables;
4375
4376 if (extra_data != NULL)
4377 {
4378 register unsigned int flags = extra_data->flags;
4379 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4380 study = (const pcre_study_data *)extra_data->study_data;
4381 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4382 md->match_limit = extra_data->match_limit;
4383 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4384 md->match_limit_recursion = extra_data->match_limit_recursion;
4385 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4386 md->callout_data = extra_data->callout_data;
4387 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4388 }
4389
4390 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4391 is a feature that makes it possible to save compiled regex and re-use them
4392 in other programs later. */
4393
4394 if (tables == NULL) tables = _pcre_default_tables;
4395
4396 /* Check that the first field in the block is the magic number. If it is not,
4397 test for a regex that was compiled on a host of opposite endianness. If this is
4398 the case, flipped values are put in internal_re and internal_study if there was
4399 study data too. */
4400
4401 if (re->magic_number != MAGIC_NUMBER)
4402 {
4403 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4404 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4405 if (study != NULL) study = &internal_study;
4406 }
4407
4408 /* Set up other data */
4409
4410 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4411 startline = (re->options & PCRE_STARTLINE) != 0;
4412 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4413
4414 /* The code starts after the real_pcre block and the capture name table. */
4415
4416 md->start_code = (const uschar *)external_re + re->name_table_offset +
4417 re->name_count * re->name_entry_size;
4418
4419 md->start_subject = (USPTR)subject;
4420 md->start_offset = start_offset;
4421 md->end_subject = md->start_subject + length;
4422 end_subject = md->end_subject;
4423
4424 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4425 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4426
4427 md->notbol = (options & PCRE_NOTBOL) != 0;
4428 md->noteol = (options & PCRE_NOTEOL) != 0;
4429 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4430 md->partial = (options & PCRE_PARTIAL) != 0;
4431 md->hitend = FALSE;
4432
4433 md->recursive = NULL; /* No recursion at top level */
4434
4435 md->lcc = tables + lcc_offset;
4436 md->ctypes = tables + ctypes_offset;
4437
4438 /* Handle different types of newline. The three bits give eight cases. If
4439 nothing is set at run time, whatever was used at compile time applies. */
4440
4441 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
4442 PCRE_NEWLINE_BITS)
4443 {
4444 case 0: newline = NEWLINE; break; /* Compile-time default */
4445 case PCRE_NEWLINE_CR: newline = '\r'; break;
4446 case PCRE_NEWLINE_LF: newline = '\n'; break;
4447 case PCRE_NEWLINE_CR+
4448 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4449 case PCRE_NEWLINE_ANY: newline = -1; break;
4450 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4451 default: return PCRE_ERROR_BADNEWLINE;
4452 }
4453
4454 if (newline == -2)
4455 {
4456 md->nltype = NLTYPE_ANYCRLF;
4457 }
4458 else if (newline < 0)
4459 {
4460 md->nltype = NLTYPE_ANY;
4461 }
4462 else
4463 {
4464 md->nltype = NLTYPE_FIXED;
4465 if (newline > 255)
4466 {
4467 md->nllen = 2;
4468 md->nl[0] = (newline >> 8) & 255;
4469 md->nl[1] = newline & 255;
4470 }
4471 else
4472 {
4473 md->nllen = 1;
4474 md->nl[0] = newline;
4475 }
4476 }
4477
4478 /* Partial matching is supported only for a restricted set of regexes at the
4479 moment. */
4480
4481 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
4482 return PCRE_ERROR_BADPARTIAL;
4483
4484 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4485 back the character offset. */
4486
4487 #ifdef SUPPORT_UTF8
4488 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4489 {
4490 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4491 return PCRE_ERROR_BADUTF8;
4492 if (start_offset > 0 && start_offset < length)
4493 {
4494 int tb = ((uschar *)subject)[start_offset];
4495 if (tb > 127)
4496 {
4497 tb &= 0xc0;
4498 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4499 }
4500 }
4501 }
4502 #endif
4503
4504 /* The ims options can vary during the matching as a result of the presence
4505 of (?ims) items in the pattern. They are kept in a local variable so that
4506 restoring at the exit of a group is easy. */
4507
4508 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4509
4510 /* If the expression has got more back references than the offsets supplied can
4511 hold, we get a temporary chunk of working store to use during the matching.
4512 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4513 of 3. */
4514
4515 ocount = offsetcount - (offsetcount % 3);
4516
4517 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4518 {
4519 ocount = re->top_backref * 3 + 3;
4520 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4521 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4522 using_temporary_offsets = TRUE;
4523 DPRINTF(("Got memory to hold back references\n"));
4524 }
4525 else md->offset_vector = offsets;
4526
4527 md->offset_end = ocount;
4528 md->offset_max = (2*ocount)/3;
4529 md->offset_overflow = FALSE;
4530 md->capture_last = -1;
4531
4532 /* Compute the minimum number of offsets that we need to reset each time. Doing
4533 this makes a huge difference to execution time when there aren't many brackets
4534 in the pattern. */
4535
4536 resetcount = 2 + re->top_bracket * 2;
4537 if (resetcount > offsetcount) resetcount = ocount;
4538
4539 /* Reset the working variables associated with each extraction. These should
4540 never be used unless previously set, but they get saved and restored, and so we
4541 initialize them to avoid reading uninitialized locations. */
4542
4543 if (md->offset_vector != NULL)
4544 {
4545 register int *iptr = md->offset_vector + ocount;
4546 register int *iend = iptr - resetcount/2 + 1;
4547 while (--iptr >= iend) *iptr = -1;
4548 }
4549
4550 /* Set up the first character to match, if available. The first_byte value is
4551 never set for an anchored regular expression, but the anchoring may be forced
4552 at run time, so we have to test for anchoring. The first char may be unset for
4553 an unanchored pattern, of course. If there's no first char and the pattern was
4554 studied, there may be a bitmap of possible first characters. */
4555
4556 if (!anchored)
4557 {
4558 if ((re->options & PCRE_FIRSTSET) != 0)
4559 {
4560 first_byte = re->first_byte & 255;
4561 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4562 first_byte = md->lcc[first_byte];
4563 }
4564 else
4565 if (!startline && study != NULL &&
4566 (study->options & PCRE_STUDY_MAPPED) != 0)
4567 start_bits = study->start_bits;
4568 }
4569
4570 /* For anchored or unanchored matches, there may be a "last known required
4571 character" set. */
4572
4573 if ((re->options & PCRE_REQCHSET) != 0)
4574 {
4575 req_byte = re->req_byte & 255;
4576 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4577 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4578 }
4579
4580
4581 /* ==========================================================================*/
4582
4583 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4584 the loop runs just once. */
4585
4586 for(;;)
4587 {
4588 USPTR save_end_subject = end_subject;
4589 USPTR new_start_match;
4590
4591 /* Reset the maximum number of extractions we might see. */
4592
4593 if (md->offset_vector != NULL)
4594 {
4595 register int *iptr = md->offset_vector;
4596 register int *iend = iptr + resetcount;
4597 while (iptr < iend) *iptr++ = -1;
4598 }
4599
4600 /* Advance to a unique first char if possible. If firstline is TRUE, the
4601 start of the match is constrained to the first line of a multiline string.
4602 That is, the match must be before or at the first newline. Implement this by
4603 temporarily adjusting end_subject so that we stop scanning at a newline. If
4604 the match fails at the newline, later code breaks this loop. */
4605
4606 if (firstline)
4607 {
4608 USPTR t = start_match;
4609 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4610 end_subject = t;
4611 }
4612
4613 /* Now test for a unique first byte */
4614
4615 if (first_byte >= 0)
4616 {
4617 if (first_byte_caseless)
4618 while (start_match < end_subject &&
4619 md->lcc[*start_match] != first_byte)
4620 start_match++;
4621 else
4622 while (start_match < end_subject && *start_match != first_byte)
4623 start_match++;
4624 }
4625
4626 /* Or to just after a linebreak for a multiline match if possible */
4627
4628 else if (startline)
4629 {
4630 if (start_match > md->start_subject + start_offset)
4631 {
4632 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4633 start_match++;
4634
4635 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4636 and we are now at a LF, advance the match position by one more character.
4637 */
4638
4639 if (start_match[-1] == '\r' &&
4640 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4641 start_match < end_subject &&
4642 *start_match == '\n')
4643 start_match++;
4644 }
4645 }
4646
4647 /* Or to a non-unique first char after study */
4648
4649 else if (start_bits != NULL)
4650 {
4651 while (start_match < end_subject)
4652 {
4653 register unsigned int c = *start_match;
4654 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4655 }
4656 }
4657
4658 /* Restore fudged end_subject */
4659
4660 end_subject = save_end_subject;
4661
4662 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4663 printf(">>>> Match against: ");
4664 pchars(start_match, end_subject - start_match, TRUE, md);
4665 printf("\n");
4666 #endif
4667
4668 /* If req_byte is set, we know that that character must appear in the subject
4669 for the match to succeed. If the first character is set, req_byte must be
4670 later in the subject; otherwise the test starts at the match point. This
4671 optimization can save a huge amount of backtracking in patterns with nested
4672 unlimited repeats that aren't going to match. Writing separate code for
4673 cased/caseless versions makes it go faster, as does using an autoincrement
4674 and backing off on a match.
4675
4676 HOWEVER: when the subject string is very, very long, searching to its end can
4677 take a long time, and give bad performance on quite ordinary patterns. This
4678 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4679 string... so we don't do this when the string is sufficiently long.
4680
4681 ALSO: this processing is disabled when partial matching is requested.
4682 */
4683
4684 if (req_byte >= 0 &&
4685 end_subject - start_match < REQ_BYTE_MAX &&
4686 !md->partial)
4687 {
4688 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4689
4690 /* We don't need to repeat the search if we haven't yet reached the
4691 place we found it at last time. */
4692
4693 if (p > req_byte_ptr)
4694 {
4695 if (req_byte_caseless)
4696 {
4697 while (p < end_subject)
4698 {
4699 register int pp = *p++;
4700 if (pp == req_byte || pp == req_byte2) { p--; break; }
4701 }
4702 }
4703 else
4704 {
4705 while (p < end_subject)
4706 {
4707 if (*p++ == req_byte) { p--; break; }
4708 }
4709 }
4710
4711 /* If we can't find the required character, break the matching loop,
4712 forcing a match failure. */
4713
4714 if (p >= end_subject)
4715 {
4716 rc = MATCH_NOMATCH;
4717 break;
4718 }
4719
4720 /* If we have found the required character, save the point where we
4721 found it, so that we don't search again next time round the loop if
4722 the start hasn't passed this character yet. */
4723
4724 req_byte_ptr = p;
4725 }
4726 }
4727
4728 /* OK, we can now run the match. */
4729
4730 md->start_match_ptr = start_match;
4731 md->match_call_count = 0;
4732 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4733
4734 switch(rc)
4735 {
4736 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4737 exactly like PRUNE. */
4738
4739 case MATCH_NOMATCH:
4740 case MATCH_PRUNE:
4741 case MATCH_THEN:
4742 new_start_match = start_match + 1;
4743 #ifdef SUPPORT_UTF8
4744 if (utf8)
4745 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4746 new_start_match++;
4747 #endif
4748 break;
4749
4750 /* SKIP passes back the next starting point explicitly. */
4751
4752 case MATCH_SKIP:
4753 new_start_match = md->start_match_ptr;
4754 break;
4755
4756 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4757
4758 case MATCH_COMMIT:
4759 rc = MATCH_NOMATCH;
4760 goto ENDLOOP;
4761
4762 /* Any other return is some kind of error. */
4763
4764 default:
4765 goto ENDLOOP;
4766 }
4767
4768 /* Control reaches here for the various types of "no match at this point"
4769 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4770
4771 rc = MATCH_NOMATCH;
4772
4773 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4774 newline in the subject (though it may continue over the newline). Therefore,
4775 if we have just failed to match, starting at a newline, do not continue. */
4776
4777 if (firstline && IS_NEWLINE(start_match)) break;
4778
4779 /* Advance to new matching position */
4780
4781 start_match = new_start_match;
4782
4783 /* Break the loop if the pattern is anchored or if we have passed the end of
4784 the subject. */
4785
4786 if (anchored || start_match > end_subject) break;
4787
4788 /* If we have just passed a CR and the newline option is CRLF or ANY or
4789 ANYCRLF, and we are now at a LF, advance the match position by one more
4790 character. */
4791
4792 if (start_match[-1] == '\r' &&
4793 (md->nltype == NLTYPE_ANY ||
4794 md->nltype == NLTYPE_ANYCRLF ||
4795 md->nllen == 2) &&
4796 start_match < end_subject &&
4797 *start_match == '\n')
4798 start_match++;
4799
4800 } /* End of for(;;) "bumpalong" loop */
4801
4802 /* ==========================================================================*/
4803
4804 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4805 conditions is true:
4806
4807 (1) The pattern is anchored or the match was failed by (*COMMIT);
4808
4809 (2) We are past the end of the subject;
4810
4811 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4812 this option requests that a match occur at or before the first newline in
4813 the subject.
4814
4815 When we have a match and the offset vector is big enough to deal with any
4816 backreferences, captured substring offsets will already be set up. In the case
4817 where we had to get some local store to hold offsets for backreference
4818 processing, copy those that we can. In this case there need not be overflow if
4819 certain parts of the pattern were not used, even though there are more
4820 capturing parentheses than vector slots. */
4821
4822 ENDLOOP:
4823
4824 if (rc == MATCH_MATCH)
4825 {
4826 if (using_temporary_offsets)
4827 {
4828 if (offsetcount >= 4)
4829 {
4830 memcpy(offsets + 2, md->offset_vector + 2,
4831 (offsetcount - 2) * sizeof(int));
4832 DPRINTF(("Copied offsets from temporary memory\n"));
4833 }
4834 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4835 DPRINTF(("Freeing temporary memory\n"));
4836 (pcre_free)(md->offset_vector);
4837 }
4838
4839 /* Set the return code to the number of captured strings, or 0 if there are
4840 too many to fit into the vector. */
4841
4842 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4843
4844 /* If there is space, set up the whole thing as substring 0. The value of
4845 md->start_match_ptr might be modified if \K was encountered on the successful
4846 matching path. */
4847
4848 if (offsetcount < 2) rc = 0; else
4849 {
4850 offsets[0] = md->start_match_ptr - md->start_subject;
4851 offsets[1] = md->end_match_ptr - md->start_subject;
4852 }
4853
4854 DPRINTF((">>>> returning %d\n", rc));
4855 return rc;
4856 }
4857
4858 /* Control gets here if there has been an error, or if the overall match
4859 attempt has failed at all permitted starting positions. */
4860
4861 if (using_temporary_offsets)
4862 {
4863 DPRINTF(("Freeing temporary memory\n"));
4864 (pcre_free)(md->offset_vector);
4865 }
4866
4867 if (rc != MATCH_NOMATCH)
4868 {
4869 DPRINTF((">>>> error: returning %d\n", rc));
4870 return rc;
4871 }
4872 else if (md->partial && md->hitend)
4873 {
4874 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4875 return PCRE_ERROR_PARTIAL;
4876 }
4877 else
4878 {
4879 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4880 return PCRE_ERROR_NOMATCH;
4881 }
4882 }
4883
4884 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5