/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 428 - (show annotations)
Mon Aug 31 17:10:26 2009 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 99289 byte(s)
Error occurred while calculating annotation data.
Further partial match change: add PCRE_PARTIAL_HARD and make more intuitive.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2009 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 #ifdef HAVE_CONFIG_H
49 #include "config.h"
50 #endif
51
52 #define NLBLOCK md /* Block containing newline information */
53 #define PSSTART start_subject /* Field containing processed string start */
54 #define PSEND end_subject /* Field containing processed string end */
55
56 #include "pcre_internal.h"
57
58
59 /* For use to indent debugging output */
60
61 #define SP " "
62
63
64 /*************************************************
65 * Code parameters and static tables *
66 *************************************************/
67
/* Offsets added to OP_TYPESTAR and the other type-repeat opcodes to turn them
into "virtual" opcodes when the repeated item needs special handling (a Unicode
property, \X, \R, \H/\h or \V/\v). Each block of virtual opcodes starts 20
above the previous one, which should leave enough room for all the type-repeat
opcodes. The resulting values are never stored in compiled code, so they need
not fit in a byte, and they are placed well above the normal opcodes. */

#define OP_PROP_EXTRA    300
#define OP_EXTUNI_EXTRA  (OP_PROP_EXTRA + 20)
#define OP_ANYNL_EXTRA   (OP_EXTUNI_EXTRA + 20)
#define OP_HSPACE_EXTRA  (OP_ANYNL_EXTRA + 20)
#define OP_VSPACE_EXTRA  (OP_HSPACE_EXTRA + 20)
78
79
80 /* This table identifies those opcodes that are followed immediately by a
81 character that is to be tested in some way. This makes is possible to
82 centralize the loading of these characters. In the case of Type * etc, the
83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 small value. ***NOTE*** If the start of this table is modified, the two tables
85 that follow must also be modified. */
86
87 static const uschar coptable[] = {
88 0, /* End */
89 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 0, 0, 0, /* Any, AllAny, Anybyte */
92 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95 1, /* Char */
96 1, /* Charnc */
97 1, /* not */
98 /* Positive single-char repeats */
99 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* upto, minupto, exact */
101 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 /* Negative single-char repeats - only for chars < 256 */
103 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104 3, 3, 3, /* NOT upto, minupto, exact */
105 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 /* Positive type repeats */
107 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108 3, 3, 3, /* Type upto, minupto, exact */
109 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 /* Character class & ref repeats */
111 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112 0, 0, /* CRRANGE, CRMINRANGE */
113 0, /* CLASS */
114 0, /* NCLASS */
115 0, /* XCLASS - variable length */
116 0, /* REF */
117 0, /* RECURSE */
118 0, /* CALLOUT */
119 0, /* Alt */
120 0, /* Ket */
121 0, /* KetRmax */
122 0, /* KetRmin */
123 0, /* Assert */
124 0, /* Assert not */
125 0, /* Assert behind */
126 0, /* Assert behind not */
127 0, /* Reverse */
128 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129 0, 0, 0, /* SBRA, SCBRA, SCOND */
130 0, /* CREF */
131 0, /* RREF */
132 0, /* DEF */
133 0, 0, /* BRAZERO, BRAMINZERO */
134 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136 };
137
138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139 and \w */
140
141 static const uschar toptable1[] = {
142 0, 0, 0, 0, 0, 0,
143 ctype_digit, ctype_digit,
144 ctype_space, ctype_space,
145 ctype_word, ctype_word,
146 0, 0 /* OP_ANY, OP_ALLANY */
147 };
148
149 static const uschar toptable2[] = {
150 0, 0, 0, 0, 0, 0,
151 ctype_digit, 0,
152 ctype_space, 0,
153 ctype_word, 0,
154 1, 1 /* OP_ANY, OP_ALLANY */
155 };
156
157
/* A stateblock records one state, that is, the current data for one active
path through the match tree. The blocks live inside the working vector supplied
by the caller, which is a vector of ints, so every field must be an int. */

struct stateblock {
  int offset;     /* Offset to opcode; a negative value means "hold off going
                     to this (negated) state until the characters counted in
                     the data field have been skipped" */
  int count;      /* Count for repeats */
  int ims;        /* ims flag bits in force for this state */
  int data;       /* Extra data, used by some states only */
};

typedef struct stateblock stateblock;

/* Number of ints of workspace taken up by one stateblock */

#define INTS_PER_STATEBLOCK (sizeof(struct stateblock)/sizeof(int))
171
172
173 #ifdef DEBUG
174 /*************************************************
175 * Print character string *
176 *************************************************/
177
/* Character string printing function for debugging. Bytes that isprint()
accepts are written unchanged; every other byte is written as a backslash,
an 'x' and two hexadecimal digits.

Arguments:
  p           points to string; it is only read, hence the const qualifier,
                which lets callers pass const data without a cast
  length      number of bytes; nothing is written if this is not positive
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
int i;
for (i = 0; i < length; i++)
  {
  int c = p[i];
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", (unsigned int)c);   /* %x requires an unsigned int */
  }
}
200 #endif
201
202
203
204 /*************************************************
205 * Execute a Regular Expression - DFA engine *
206 *************************************************/
207
208 /* This internal function applies a compiled pattern to a subject string,
209 starting at a given point, using a DFA engine. This function is called from the
210 external one, possibly multiple times if the pattern is not anchored. The
211 function calls itself recursively for some kinds of subpattern.
212
213 Arguments:
214 md the match_data block with fixed information
215 this_start_code the opening bracket of this subexpression's code
216 current_subject where we currently are in the subject string
217 start_offset start offset in the subject string
218 offsets vector to contain the matching string offsets
219 offsetcount size of same
220 workspace vector of workspace
221 wscount size of same
222 ims the current ims flags
223 rlevel function call recursion level
224 recursing regex recursive call level
225
226 Returns: > 0 => number of match offset pairs placed in offsets
227 = 0 => offsets overflowed; longest matches are present
228 -1 => failed to match
229 < -1 => some kind of unexpected problem
230
231 The following macros are used for adding states to the two state vectors (one
232 for the current character, one for the following character). */
233
/* Append a state to the active list (the one for the current character). The
new state's offset and count are the two arguments and its ims field is copied
from the ims variable in scope where the macro is used; the data field is not
written. If the list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The arguments appear a second time in the
DPRINTF call, so they should have no side effects. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
244
/* Append a state to the active list (the one for the current character),
setting its data field as well. The new state's offset, count and data are the
three arguments and its ims field is copied from the ims variable in scope
where the macro is used. If the list already holds wscount states, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. The arguments appear a second
time in the DPRINTF call, so they should have no side effects. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
256
/* Append a state to the new list (the one for the following character). The
new state's offset and count are the two arguments and its ims field is copied
from the ims variable in scope where the macro is used; the data field is not
written. If the list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The arguments appear a second time in the
DPRINTF call, so they should have no side effects. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
267
/* Append a state to the new list (the one for the following character),
setting its data field as well. The new state's offset, count and data are the
three arguments and its ims field is copied from the ims variable in scope
where the macro is used. If the list already holds wscount states, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. The arguments appear a second
time in the DPRINTF call, so they should have no side effects. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
279
280 /* And now, here is the code */
281
282 static int
283 internal_dfa_exec(
284 dfa_match_data *md,
285 const uschar *this_start_code,
286 const uschar *current_subject,
287 int start_offset,
288 int *offsets,
289 int offsetcount,
290 int *workspace,
291 int wscount,
292 int ims,
293 int rlevel,
294 int recursing)
295 {
296 stateblock *active_states, *new_states, *temp_states;
297 stateblock *next_active_state, *next_new_state;
298
299 const uschar *ctypes, *lcc, *fcc;
300 const uschar *ptr;
301 const uschar *end_code, *first_op;
302
303 int active_count, new_count, match_count;
304
305 /* Some fields in the md block are frequently referenced, so we load them into
306 independent variables in the hope that this will perform better. */
307
308 const uschar *start_subject = md->start_subject;
309 const uschar *end_subject = md->end_subject;
310 const uschar *start_code = md->start_code;
311
312 #ifdef SUPPORT_UTF8
313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 #else
315 BOOL utf8 = FALSE;
316 #endif
317
318 rlevel++;
319 offsetcount &= (-2);
320
321 wscount -= 2;
322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323 (2 * INTS_PER_STATEBLOCK);
324
325 DPRINTF(("\n%.*s---------------------\n"
326 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328
329 ctypes = md->tables + ctypes_offset;
330 lcc = md->tables + lcc_offset;
331 fcc = md->tables + fcc_offset;
332
333 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334
335 active_states = (stateblock *)(workspace + 2);
336 next_new_state = new_states = active_states + wscount;
337 new_count = 0;
338
339 first_op = this_start_code + 1 + LINK_SIZE +
340 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341
342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343 the alternative states onto the list, and find out where the end is. This
344 makes is possible to use this function recursively, when we want to stop at a
345 matching internal ket rather than at the end.
346
347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348 a backward assertion. In that case, we have to find out the maximum amount to
349 move back, and set up each alternative appropriately. */
350
351 if (*first_op == OP_REVERSE)
352 {
353 int max_back = 0;
354 int gone_back;
355
356 end_code = this_start_code;
357 do
358 {
359 int back = GET(end_code, 2+LINK_SIZE);
360 if (back > max_back) max_back = back;
361 end_code += GET(end_code, 1);
362 }
363 while (*end_code == OP_ALT);
364
365 /* If we can't go back the amount required for the longest lookbehind
366 pattern, go back as far as we can; some alternatives may still be viable. */
367
368 #ifdef SUPPORT_UTF8
369 /* In character mode we have to step back character by character */
370
371 if (utf8)
372 {
373 for (gone_back = 0; gone_back < max_back; gone_back++)
374 {
375 if (current_subject <= start_subject) break;
376 current_subject--;
377 while (current_subject > start_subject &&
378 (*current_subject & 0xc0) == 0x80)
379 current_subject--;
380 }
381 }
382 else
383 #endif
384
385 /* In byte-mode we can do this quickly. */
386
387 {
388 gone_back = (current_subject - max_back < start_subject)?
389 current_subject - start_subject : max_back;
390 current_subject -= gone_back;
391 }
392
393 /* Now we can process the individual branches. */
394
395 end_code = this_start_code;
396 do
397 {
398 int back = GET(end_code, 2+LINK_SIZE);
399 if (back <= gone_back)
400 {
401 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402 ADD_NEW_DATA(-bstate, 0, gone_back - back);
403 }
404 end_code += GET(end_code, 1);
405 }
406 while (*end_code == OP_ALT);
407 }
408
409 /* This is the code for a "normal" subpattern (not a backward assertion). The
410 start of a whole pattern is always one of these. If we are at the top level,
411 we may be asked to restart matching from the same point that we reached for a
412 previous partial match. We still have to scan through the top-level branches to
413 find the end state. */
414
415 else
416 {
417 end_code = this_start_code;
418
419 /* Restarting */
420
421 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422 {
423 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424 new_count = workspace[1];
425 if (!workspace[0])
426 memcpy(new_states, active_states, new_count * sizeof(stateblock));
427 }
428
429 /* Not restarting */
430
431 else
432 {
433 int length = 1 + LINK_SIZE +
434 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 do
436 {
437 ADD_NEW(end_code - start_code + length, 0);
438 end_code += GET(end_code, 1);
439 length = 1 + LINK_SIZE;
440 }
441 while (*end_code == OP_ALT);
442 }
443 }
444
445 workspace[0] = 0; /* Bit indicating which vector is current */
446
447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448
449 /* Loop for scanning the subject */
450
451 ptr = current_subject;
452 for (;;)
453 {
454 int i, j;
455 int clen, dlen;
456 unsigned int c, d;
457 int forced_fail = 0;
458 int reached_end = 0;
459
460 /* Make the new state list into the active state list and empty the
461 new state list. */
462
463 temp_states = active_states;
464 active_states = new_states;
465 new_states = temp_states;
466 active_count = new_count;
467 new_count = 0;
468
469 workspace[0] ^= 1; /* Remember for the restarting feature */
470 workspace[1] = active_count;
471
472 #ifdef DEBUG
473 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
474 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
475 printf("\"\n");
476
477 printf("%.*sActive states: ", rlevel*2-2, SP);
478 for (i = 0; i < active_count; i++)
479 printf("%d/%d ", active_states[i].offset, active_states[i].count);
480 printf("\n");
481 #endif
482
483 /* Set the pointers for adding new states */
484
485 next_active_state = active_states + active_count;
486 next_new_state = new_states;
487
488 /* Load the current character from the subject outside the loop, as many
489 different states may want to look at it, and we assume that at least one
490 will. */
491
492 if (ptr < end_subject)
493 {
494 clen = 1; /* Number of bytes in the character */
495 #ifdef SUPPORT_UTF8
496 if (utf8) { GETCHARLEN(c, ptr, clen); } else
497 #endif /* SUPPORT_UTF8 */
498 c = *ptr;
499 }
500 else
501 {
502 clen = 0; /* This indicates the end of the subject */
503 c = NOTACHAR; /* This value should never actually be used */
504 }
505
506 /* Scan up the active states and act on each one. The result of an action
507 may be to add more states to the currently active list (e.g. on hitting a
508 parenthesis) or it may be to put states on the new list, for considering
509 when we move the character pointer on. */
510
511 for (i = 0; i < active_count; i++)
512 {
513 stateblock *current_state = active_states + i;
514 const uschar *code;
515 int state_offset = current_state->offset;
516 int count, codevalue, rrc;
517
518 #ifdef DEBUG
519 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
520 if (clen == 0) printf("EOL\n");
521 else if (c > 32 && c < 127) printf("'%c'\n", c);
522 else printf("0x%02x\n", c);
523 #endif
524
525 /* This variable is referred to implicity in the ADD_xxx macros. */
526
527 ims = current_state->ims;
528
529 /* A negative offset is a special case meaning "hold off going to this
530 (negated) state until the number of characters in the data field have
531 been skipped". */
532
533 if (state_offset < 0)
534 {
535 if (current_state->data > 0)
536 {
537 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
538 ADD_NEW_DATA(state_offset, current_state->count,
539 current_state->data - 1);
540 continue;
541 }
542 else
543 {
544 current_state->offset = state_offset = -state_offset;
545 }
546 }
547
548 /* Check for a duplicate state with the same count, and skip if found. */
549
550 for (j = 0; j < i; j++)
551 {
552 if (active_states[j].offset == state_offset &&
553 active_states[j].count == current_state->count)
554 {
555 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
556 goto NEXT_ACTIVE_STATE;
557 }
558 }
559
560 /* The state offset is the offset to the opcode */
561
562 code = start_code + state_offset;
563 codevalue = *code;
564
565 /* If this opcode is followed by an inline character, load it. It is
566 tempting to test for the presence of a subject character here, but that
567 is wrong, because sometimes zero repetitions of the subject are
568 permitted.
569
570 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
571 argument that is not a data character - but is always one byte long. We
572 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
573 this case. To keep the other cases fast, convert these ones to new opcodes.
574 */
575
576 if (coptable[codevalue] > 0)
577 {
578 dlen = 1;
579 #ifdef SUPPORT_UTF8
580 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
581 #endif /* SUPPORT_UTF8 */
582 d = code[coptable[codevalue]];
583 if (codevalue >= OP_TYPESTAR)
584 {
585 switch(d)
586 {
587 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
588 case OP_NOTPROP:
589 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
590 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
591 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
592 case OP_NOT_HSPACE:
593 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
594 case OP_NOT_VSPACE:
595 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
596 default: break;
597 }
598 }
599 }
600 else
601 {
602 dlen = 0; /* Not strictly necessary, but compilers moan */
603 d = NOTACHAR; /* if these variables are not set. */
604 }
605
606
607 /* Now process the individual opcodes */
608
609 switch (codevalue)
610 {
611
612 /* ========================================================================== */
613 /* Reached a closing bracket. If not at the end of the pattern, carry
614 on with the next opcode. Otherwise, unless we have an empty string and
615 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
616 matches so we always have the longest first. */
617
618 case OP_KET:
619 case OP_KETRMIN:
620 case OP_KETRMAX:
621 if (code != end_code)
622 {
623 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
624 if (codevalue != OP_KET)
625 {
626 ADD_ACTIVE(state_offset - GET(code, 1), 0);
627 }
628 }
629 else
630 {
631 reached_end++; /* Count branches that reach the end */
632 if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
633 {
634 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
635 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
636 match_count = 0;
637 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
638 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
639 if (offsetcount >= 2)
640 {
641 offsets[0] = current_subject - start_subject;
642 offsets[1] = ptr - start_subject;
643 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
644 offsets[1] - offsets[0], current_subject));
645 }
646 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
647 {
648 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
649 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
650 match_count, rlevel*2-2, SP));
651 return match_count;
652 }
653 }
654 }
655 break;
656
657 /* ========================================================================== */
658 /* These opcodes add to the current list of states without looking
659 at the current character. */
660
661 /*-----------------------------------------------------------------*/
662 case OP_ALT:
663 do { code += GET(code, 1); } while (*code == OP_ALT);
664 ADD_ACTIVE(code - start_code, 0);
665 break;
666
667 /*-----------------------------------------------------------------*/
668 case OP_BRA:
669 case OP_SBRA:
670 do
671 {
672 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
673 code += GET(code, 1);
674 }
675 while (*code == OP_ALT);
676 break;
677
678 /*-----------------------------------------------------------------*/
679 case OP_CBRA:
680 case OP_SCBRA:
681 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
682 code += GET(code, 1);
683 while (*code == OP_ALT)
684 {
685 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
686 code += GET(code, 1);
687 }
688 break;
689
690 /*-----------------------------------------------------------------*/
691 case OP_BRAZERO:
692 case OP_BRAMINZERO:
693 ADD_ACTIVE(state_offset + 1, 0);
694 code += 1 + GET(code, 2);
695 while (*code == OP_ALT) code += GET(code, 1);
696 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
697 break;
698
699 /*-----------------------------------------------------------------*/
700 case OP_SKIPZERO:
701 code += 1 + GET(code, 2);
702 while (*code == OP_ALT) code += GET(code, 1);
703 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
704 break;
705
706 /*-----------------------------------------------------------------*/
707 case OP_CIRC:
708 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
709 ((ims & PCRE_MULTILINE) != 0 &&
710 ptr != end_subject &&
711 WAS_NEWLINE(ptr)))
712 { ADD_ACTIVE(state_offset + 1, 0); }
713 break;
714
715 /*-----------------------------------------------------------------*/
716 case OP_EOD:
717 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
718 break;
719
720 /*-----------------------------------------------------------------*/
721 case OP_OPT:
722 ims = code[1];
723 ADD_ACTIVE(state_offset + 2, 0);
724 break;
725
726 /*-----------------------------------------------------------------*/
727 case OP_SOD:
728 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
729 break;
730
731 /*-----------------------------------------------------------------*/
732 case OP_SOM:
733 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
734 break;
735
736
737 /* ========================================================================== */
738 /* These opcodes inspect the next subject character, and sometimes
739 the previous one as well, but do not have an argument. The variable
740 clen contains the length of the current character and is zero if we are
741 at the end of the subject. */
742
743 /*-----------------------------------------------------------------*/
744 case OP_ANY:
745 if (clen > 0 && !IS_NEWLINE(ptr))
746 { ADD_NEW(state_offset + 1, 0); }
747 break;
748
749 /*-----------------------------------------------------------------*/
750 case OP_ALLANY:
751 if (clen > 0)
752 { ADD_NEW(state_offset + 1, 0); }
753 break;
754
755 /*-----------------------------------------------------------------*/
756 case OP_EODN:
757 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
758 { ADD_ACTIVE(state_offset + 1, 0); }
759 break;
760
761 /*-----------------------------------------------------------------*/
762 case OP_DOLL:
763 if ((md->moptions & PCRE_NOTEOL) == 0)
764 {
765 if (clen == 0 ||
766 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
767 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
768 ))
769 { ADD_ACTIVE(state_offset + 1, 0); }
770 }
771 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
772 { ADD_ACTIVE(state_offset + 1, 0); }
773 break;
774
775 /*-----------------------------------------------------------------*/
776
777 case OP_DIGIT:
778 case OP_WHITESPACE:
779 case OP_WORDCHAR:
780 if (clen > 0 && c < 256 &&
781 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
782 { ADD_NEW(state_offset + 1, 0); }
783 break;
784
785 /*-----------------------------------------------------------------*/
786 case OP_NOT_DIGIT:
787 case OP_NOT_WHITESPACE:
788 case OP_NOT_WORDCHAR:
789 if (clen > 0 && (c >= 256 ||
790 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
791 { ADD_NEW(state_offset + 1, 0); }
792 break;
793
794 /*-----------------------------------------------------------------*/
795 case OP_WORD_BOUNDARY:
796 case OP_NOT_WORD_BOUNDARY:
797 {
798 int left_word, right_word;
799
800 if (ptr > start_subject)
801 {
802 const uschar *temp = ptr - 1;
803 #ifdef SUPPORT_UTF8
804 if (utf8) BACKCHAR(temp);
805 #endif
806 GETCHARTEST(d, temp);
807 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
808 }
809 else left_word = 0;
810
811 if (clen > 0)
812 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
813 else /* This is a fudge to ensure that if this is the */
814 { /* last item in the pattern, we don't count it as */
815 reached_end--; /* reached, thus disabling a partial match. */
816 right_word = 0;
817 }
818
819 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
820 { ADD_ACTIVE(state_offset + 1, 0); }
821 }
822 break;
823
824
825 /*-----------------------------------------------------------------*/
826 /* Check the next character by Unicode property. We will get here only
827 if the support is in the binary; otherwise a compile-time error occurs.
828 */
829
830 #ifdef SUPPORT_UCP
831 case OP_PROP:
832 case OP_NOTPROP:
833 if (clen > 0)
834 {
835 BOOL OK;
836 const ucd_record * prop = GET_UCD(c);
837 switch(code[1])
838 {
839 case PT_ANY:
840 OK = TRUE;
841 break;
842
843 case PT_LAMP:
844 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
845 break;
846
847 case PT_GC:
848 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
849 break;
850
851 case PT_PC:
852 OK = prop->chartype == code[2];
853 break;
854
855 case PT_SC:
856 OK = prop->script == code[2];
857 break;
858
859 /* Should never occur, but keep compilers from grumbling. */
860
861 default:
862 OK = codevalue != OP_PROP;
863 break;
864 }
865
866 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
867 }
868 break;
869 #endif
870
871
872
873 /* ========================================================================== */
874 /* These opcodes likewise inspect the subject character, but have an
875 argument that is not a data character. It is one of these opcodes:
876 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
877 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
878
879 case OP_TYPEPLUS:
880 case OP_TYPEMINPLUS:
881 case OP_TYPEPOSPLUS:
882 count = current_state->count; /* Already matched */
883 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
884 if (clen > 0)
885 {
886 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
887 (c < 256 &&
888 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
889 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
890 {
891 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
892 {
893 active_count--; /* Remove non-match possibility */
894 next_active_state--;
895 }
896 count++;
897 ADD_NEW(state_offset, count);
898 }
899 }
900 break;
901
902 /*-----------------------------------------------------------------*/
903 case OP_TYPEQUERY:
904 case OP_TYPEMINQUERY:
905 case OP_TYPEPOSQUERY:
906 ADD_ACTIVE(state_offset + 2, 0);
907 if (clen > 0)
908 {
909 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
910 (c < 256 &&
911 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
912 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
913 {
914 if (codevalue == OP_TYPEPOSQUERY)
915 {
916 active_count--; /* Remove non-match possibility */
917 next_active_state--;
918 }
919 ADD_NEW(state_offset + 2, 0);
920 }
921 }
922 break;
923
924 /*-----------------------------------------------------------------*/
925 case OP_TYPESTAR:
926 case OP_TYPEMINSTAR:
927 case OP_TYPEPOSSTAR:
928 ADD_ACTIVE(state_offset + 2, 0);
929 if (clen > 0)
930 {
931 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
932 (c < 256 &&
933 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
934 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
935 {
936 if (codevalue == OP_TYPEPOSSTAR)
937 {
938 active_count--; /* Remove non-match possibility */
939 next_active_state--;
940 }
941 ADD_NEW(state_offset, 0);
942 }
943 }
944 break;
945
946 /*-----------------------------------------------------------------*/
947 case OP_TYPEEXACT:
948 count = current_state->count; /* Number already matched */
949 if (clen > 0)
950 {
951 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
952 (c < 256 &&
953 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
954 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
955 {
956 if (++count >= GET2(code, 1))
957 { ADD_NEW(state_offset + 4, 0); }
958 else
959 { ADD_NEW(state_offset, count); }
960 }
961 }
962 break;
963
964 /*-----------------------------------------------------------------*/
965 case OP_TYPEUPTO:
966 case OP_TYPEMINUPTO:
967 case OP_TYPEPOSUPTO:
968 ADD_ACTIVE(state_offset + 4, 0);
969 count = current_state->count; /* Number already matched */
970 if (clen > 0)
971 {
972 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
973 (c < 256 &&
974 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
975 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
976 {
977 if (codevalue == OP_TYPEPOSUPTO)
978 {
979 active_count--; /* Remove non-match possibility */
980 next_active_state--;
981 }
982 if (++count >= GET2(code, 1))
983 { ADD_NEW(state_offset + 4, 0); }
984 else
985 { ADD_NEW(state_offset, count); }
986 }
987 }
988 break;
989
990 /* ========================================================================== */
991 /* These are virtual opcodes that are used when something like
992 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
993 argument. It keeps the code above fast for the other cases. The argument
994 is in the d variable. */
995
996 #ifdef SUPPORT_UCP
997 case OP_PROP_EXTRA + OP_TYPEPLUS:
998 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
999 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1000 count = current_state->count; /* Already matched */
1001 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1002 if (clen > 0)
1003 {
1004 BOOL OK;
1005 const ucd_record * prop = GET_UCD(c);
1006 switch(code[2])
1007 {
1008 case PT_ANY:
1009 OK = TRUE;
1010 break;
1011
1012 case PT_LAMP:
1013 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1014 break;
1015
1016 case PT_GC:
1017 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1018 break;
1019
1020 case PT_PC:
1021 OK = prop->chartype == code[3];
1022 break;
1023
1024 case PT_SC:
1025 OK = prop->script == code[3];
1026 break;
1027
1028 /* Should never occur, but keep compilers from grumbling. */
1029
1030 default:
1031 OK = codevalue != OP_PROP;
1032 break;
1033 }
1034
1035 if (OK == (d == OP_PROP))
1036 {
1037 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1038 {
1039 active_count--; /* Remove non-match possibility */
1040 next_active_state--;
1041 }
1042 count++;
1043 ADD_NEW(state_offset, count);
1044 }
1045 }
1046 break;
1047
1048 /*-----------------------------------------------------------------*/
1049 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1050 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1051 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1052 count = current_state->count; /* Already matched */
1053 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1054 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1055 {
1056 const uschar *nptr = ptr + clen;
1057 int ncount = 0;
1058 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1059 {
1060 active_count--; /* Remove non-match possibility */
1061 next_active_state--;
1062 }
1063 while (nptr < end_subject)
1064 {
1065 int nd;
1066 int ndlen = 1;
1067 GETCHARLEN(nd, nptr, ndlen);
1068 if (UCD_CATEGORY(nd) != ucp_M) break;
1069 ncount++;
1070 nptr += ndlen;
1071 }
1072 count++;
1073 ADD_NEW_DATA(-state_offset, count, ncount);
1074 }
1075 break;
1076 #endif
1077
1078 /*-----------------------------------------------------------------*/
1079 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1080 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1081 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1082 count = current_state->count; /* Already matched */
1083 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1084 if (clen > 0)
1085 {
1086 int ncount = 0;
1087 switch (c)
1088 {
1089 case 0x000b:
1090 case 0x000c:
1091 case 0x0085:
1092 case 0x2028:
1093 case 0x2029:
1094 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1095 goto ANYNL01;
1096
1097 case 0x000d:
1098 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1099 /* Fall through */
1100
1101 ANYNL01:
1102 case 0x000a:
1103 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1104 {
1105 active_count--; /* Remove non-match possibility */
1106 next_active_state--;
1107 }
1108 count++;
1109 ADD_NEW_DATA(-state_offset, count, ncount);
1110 break;
1111
1112 default:
1113 break;
1114 }
1115 }
1116 break;
1117
1118 /*-----------------------------------------------------------------*/
1119 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1120 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1121 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1122 count = current_state->count; /* Already matched */
1123 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1124 if (clen > 0)
1125 {
1126 BOOL OK;
1127 switch (c)
1128 {
1129 case 0x000a:
1130 case 0x000b:
1131 case 0x000c:
1132 case 0x000d:
1133 case 0x0085:
1134 case 0x2028:
1135 case 0x2029:
1136 OK = TRUE;
1137 break;
1138
1139 default:
1140 OK = FALSE;
1141 break;
1142 }
1143
1144 if (OK == (d == OP_VSPACE))
1145 {
1146 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1147 {
1148 active_count--; /* Remove non-match possibility */
1149 next_active_state--;
1150 }
1151 count++;
1152 ADD_NEW_DATA(-state_offset, count, 0);
1153 }
1154 }
1155 break;
1156
1157 /*-----------------------------------------------------------------*/
1158 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1159 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1160 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1161 count = current_state->count; /* Already matched */
1162 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1163 if (clen > 0)
1164 {
1165 BOOL OK;
1166 switch (c)
1167 {
1168 case 0x09: /* HT */
1169 case 0x20: /* SPACE */
1170 case 0xa0: /* NBSP */
1171 case 0x1680: /* OGHAM SPACE MARK */
1172 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1173 case 0x2000: /* EN QUAD */
1174 case 0x2001: /* EM QUAD */
1175 case 0x2002: /* EN SPACE */
1176 case 0x2003: /* EM SPACE */
1177 case 0x2004: /* THREE-PER-EM SPACE */
1178 case 0x2005: /* FOUR-PER-EM SPACE */
1179 case 0x2006: /* SIX-PER-EM SPACE */
1180 case 0x2007: /* FIGURE SPACE */
1181 case 0x2008: /* PUNCTUATION SPACE */
1182 case 0x2009: /* THIN SPACE */
1183 case 0x200A: /* HAIR SPACE */
1184 case 0x202f: /* NARROW NO-BREAK SPACE */
1185 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1186 case 0x3000: /* IDEOGRAPHIC SPACE */
1187 OK = TRUE;
1188 break;
1189
1190 default:
1191 OK = FALSE;
1192 break;
1193 }
1194
1195 if (OK == (d == OP_HSPACE))
1196 {
1197 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1198 {
1199 active_count--; /* Remove non-match possibility */
1200 next_active_state--;
1201 }
1202 count++;
1203 ADD_NEW_DATA(-state_offset, count, 0);
1204 }
1205 }
1206 break;
1207
1208 /*-----------------------------------------------------------------*/
1209 #ifdef SUPPORT_UCP
1210 case OP_PROP_EXTRA + OP_TYPEQUERY:
1211 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1212 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1213 count = 4;
1214 goto QS1;
1215
1216 case OP_PROP_EXTRA + OP_TYPESTAR:
1217 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1218 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1219 count = 0;
1220
1221 QS1:
1222
1223 ADD_ACTIVE(state_offset + 4, 0);
1224 if (clen > 0)
1225 {
1226 BOOL OK;
1227 const ucd_record * prop = GET_UCD(c);
1228 switch(code[2])
1229 {
1230 case PT_ANY:
1231 OK = TRUE;
1232 break;
1233
1234 case PT_LAMP:
1235 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1236 break;
1237
1238 case PT_GC:
1239 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1240 break;
1241
1242 case PT_PC:
1243 OK = prop->chartype == code[3];
1244 break;
1245
1246 case PT_SC:
1247 OK = prop->script == code[3];
1248 break;
1249
1250 /* Should never occur, but keep compilers from grumbling. */
1251
1252 default:
1253 OK = codevalue != OP_PROP;
1254 break;
1255 }
1256
1257 if (OK == (d == OP_PROP))
1258 {
1259 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1260 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1261 {
1262 active_count--; /* Remove non-match possibility */
1263 next_active_state--;
1264 }
1265 ADD_NEW(state_offset + count, 0);
1266 }
1267 }
1268 break;
1269
1270 /*-----------------------------------------------------------------*/
1271 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1272 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1273 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1274 count = 2;
1275 goto QS2;
1276
1277 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1278 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1279 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1280 count = 0;
1281
1282 QS2:
1283
1284 ADD_ACTIVE(state_offset + 2, 0);
1285 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1286 {
1287 const uschar *nptr = ptr + clen;
1288 int ncount = 0;
1289 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1290 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1291 {
1292 active_count--; /* Remove non-match possibility */
1293 next_active_state--;
1294 }
1295 while (nptr < end_subject)
1296 {
1297 int nd;
1298 int ndlen = 1;
1299 GETCHARLEN(nd, nptr, ndlen);
1300 if (UCD_CATEGORY(nd) != ucp_M) break;
1301 ncount++;
1302 nptr += ndlen;
1303 }
1304 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1305 }
1306 break;
1307 #endif
1308
1309 /*-----------------------------------------------------------------*/
1310 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1311 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1312 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1313 count = 2;
1314 goto QS3;
1315
1316 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1317 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1318 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1319 count = 0;
1320
1321 QS3:
1322 ADD_ACTIVE(state_offset + 2, 0);
1323 if (clen > 0)
1324 {
1325 int ncount = 0;
1326 switch (c)
1327 {
1328 case 0x000b:
1329 case 0x000c:
1330 case 0x0085:
1331 case 0x2028:
1332 case 0x2029:
1333 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1334 goto ANYNL02;
1335
1336 case 0x000d:
1337 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1338 /* Fall through */
1339
1340 ANYNL02:
1341 case 0x000a:
1342 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1343 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1344 {
1345 active_count--; /* Remove non-match possibility */
1346 next_active_state--;
1347 }
1348 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1349 break;
1350
1351 default:
1352 break;
1353 }
1354 }
1355 break;
1356
1357 /*-----------------------------------------------------------------*/
1358 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1359 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1360 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1361 count = 2;
1362 goto QS4;
1363
1364 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1365 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1366 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1367 count = 0;
1368
1369 QS4:
1370 ADD_ACTIVE(state_offset + 2, 0);
1371 if (clen > 0)
1372 {
1373 BOOL OK;
1374 switch (c)
1375 {
1376 case 0x000a:
1377 case 0x000b:
1378 case 0x000c:
1379 case 0x000d:
1380 case 0x0085:
1381 case 0x2028:
1382 case 0x2029:
1383 OK = TRUE;
1384 break;
1385
1386 default:
1387 OK = FALSE;
1388 break;
1389 }
1390 if (OK == (d == OP_VSPACE))
1391 {
1392 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1393 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1394 {
1395 active_count--; /* Remove non-match possibility */
1396 next_active_state--;
1397 }
1398 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1399 }
1400 }
1401 break;
1402
1403 /*-----------------------------------------------------------------*/
1404 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1405 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1406 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1407 count = 2;
1408 goto QS5;
1409
1410 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1411 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1412 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1413 count = 0;
1414
1415 QS5:
1416 ADD_ACTIVE(state_offset + 2, 0);
1417 if (clen > 0)
1418 {
1419 BOOL OK;
1420 switch (c)
1421 {
1422 case 0x09: /* HT */
1423 case 0x20: /* SPACE */
1424 case 0xa0: /* NBSP */
1425 case 0x1680: /* OGHAM SPACE MARK */
1426 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1427 case 0x2000: /* EN QUAD */
1428 case 0x2001: /* EM QUAD */
1429 case 0x2002: /* EN SPACE */
1430 case 0x2003: /* EM SPACE */
1431 case 0x2004: /* THREE-PER-EM SPACE */
1432 case 0x2005: /* FOUR-PER-EM SPACE */
1433 case 0x2006: /* SIX-PER-EM SPACE */
1434 case 0x2007: /* FIGURE SPACE */
1435 case 0x2008: /* PUNCTUATION SPACE */
1436 case 0x2009: /* THIN SPACE */
1437 case 0x200A: /* HAIR SPACE */
1438 case 0x202f: /* NARROW NO-BREAK SPACE */
1439 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1440 case 0x3000: /* IDEOGRAPHIC SPACE */
1441 OK = TRUE;
1442 break;
1443
1444 default:
1445 OK = FALSE;
1446 break;
1447 }
1448
1449 if (OK == (d == OP_HSPACE))
1450 {
1451 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1452 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1453 {
1454 active_count--; /* Remove non-match possibility */
1455 next_active_state--;
1456 }
1457 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1458 }
1459 }
1460 break;
1461
1462 /*-----------------------------------------------------------------*/
1463 #ifdef SUPPORT_UCP
1464 case OP_PROP_EXTRA + OP_TYPEEXACT:
1465 case OP_PROP_EXTRA + OP_TYPEUPTO:
1466 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1467 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1468 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1469 { ADD_ACTIVE(state_offset + 6, 0); }
1470 count = current_state->count; /* Number already matched */
1471 if (clen > 0)
1472 {
1473 BOOL OK;
1474 const ucd_record * prop = GET_UCD(c);
1475 switch(code[4])
1476 {
1477 case PT_ANY:
1478 OK = TRUE;
1479 break;
1480
1481 case PT_LAMP:
1482 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1483 break;
1484
1485 case PT_GC:
1486 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1487 break;
1488
1489 case PT_PC:
1490 OK = prop->chartype == code[5];
1491 break;
1492
1493 case PT_SC:
1494 OK = prop->script == code[5];
1495 break;
1496
1497 /* Should never occur, but keep compilers from grumbling. */
1498
1499 default:
1500 OK = codevalue != OP_PROP;
1501 break;
1502 }
1503
1504 if (OK == (d == OP_PROP))
1505 {
1506 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1507 {
1508 active_count--; /* Remove non-match possibility */
1509 next_active_state--;
1510 }
1511 if (++count >= GET2(code, 1))
1512 { ADD_NEW(state_offset + 6, 0); }
1513 else
1514 { ADD_NEW(state_offset, count); }
1515 }
1516 }
1517 break;
1518
1519 /*-----------------------------------------------------------------*/
1520 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1521 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1522 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1523 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1524 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1525 { ADD_ACTIVE(state_offset + 4, 0); }
1526 count = current_state->count; /* Number already matched */
1527 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1528 {
1529 const uschar *nptr = ptr + clen;
1530 int ncount = 0;
1531 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1532 {
1533 active_count--; /* Remove non-match possibility */
1534 next_active_state--;
1535 }
1536 while (nptr < end_subject)
1537 {
1538 int nd;
1539 int ndlen = 1;
1540 GETCHARLEN(nd, nptr, ndlen);
1541 if (UCD_CATEGORY(nd) != ucp_M) break;
1542 ncount++;
1543 nptr += ndlen;
1544 }
1545 if (++count >= GET2(code, 1))
1546 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1547 else
1548 { ADD_NEW_DATA(-state_offset, count, ncount); }
1549 }
1550 break;
1551 #endif
1552
1553 /*-----------------------------------------------------------------*/
1554 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1555 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1556 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1557 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1558 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1559 { ADD_ACTIVE(state_offset + 4, 0); }
1560 count = current_state->count; /* Number already matched */
1561 if (clen > 0)
1562 {
1563 int ncount = 0;
1564 switch (c)
1565 {
1566 case 0x000b:
1567 case 0x000c:
1568 case 0x0085:
1569 case 0x2028:
1570 case 0x2029:
1571 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1572 goto ANYNL03;
1573
1574 case 0x000d:
1575 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1576 /* Fall through */
1577
1578 ANYNL03:
1579 case 0x000a:
1580 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1581 {
1582 active_count--; /* Remove non-match possibility */
1583 next_active_state--;
1584 }
1585 if (++count >= GET2(code, 1))
1586 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1587 else
1588 { ADD_NEW_DATA(-state_offset, count, ncount); }
1589 break;
1590
1591 default:
1592 break;
1593 }
1594 }
1595 break;
1596
1597 /*-----------------------------------------------------------------*/
1598 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1599 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1600 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1601 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1602 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1603 { ADD_ACTIVE(state_offset + 4, 0); }
1604 count = current_state->count; /* Number already matched */
1605 if (clen > 0)
1606 {
1607 BOOL OK;
1608 switch (c)
1609 {
1610 case 0x000a:
1611 case 0x000b:
1612 case 0x000c:
1613 case 0x000d:
1614 case 0x0085:
1615 case 0x2028:
1616 case 0x2029:
1617 OK = TRUE;
1618 break;
1619
1620 default:
1621 OK = FALSE;
1622 }
1623
1624 if (OK == (d == OP_VSPACE))
1625 {
1626 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1627 {
1628 active_count--; /* Remove non-match possibility */
1629 next_active_state--;
1630 }
1631 if (++count >= GET2(code, 1))
1632 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1633 else
1634 { ADD_NEW_DATA(-state_offset, count, 0); }
1635 }
1636 }
1637 break;
1638
1639 /*-----------------------------------------------------------------*/
1640 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1641 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1642 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1643 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1644 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1645 { ADD_ACTIVE(state_offset + 4, 0); }
1646 count = current_state->count; /* Number already matched */
1647 if (clen > 0)
1648 {
1649 BOOL OK;
1650 switch (c)
1651 {
1652 case 0x09: /* HT */
1653 case 0x20: /* SPACE */
1654 case 0xa0: /* NBSP */
1655 case 0x1680: /* OGHAM SPACE MARK */
1656 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1657 case 0x2000: /* EN QUAD */
1658 case 0x2001: /* EM QUAD */
1659 case 0x2002: /* EN SPACE */
1660 case 0x2003: /* EM SPACE */
1661 case 0x2004: /* THREE-PER-EM SPACE */
1662 case 0x2005: /* FOUR-PER-EM SPACE */
1663 case 0x2006: /* SIX-PER-EM SPACE */
1664 case 0x2007: /* FIGURE SPACE */
1665 case 0x2008: /* PUNCTUATION SPACE */
1666 case 0x2009: /* THIN SPACE */
1667 case 0x200A: /* HAIR SPACE */
1668 case 0x202f: /* NARROW NO-BREAK SPACE */
1669 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1670 case 0x3000: /* IDEOGRAPHIC SPACE */
1671 OK = TRUE;
1672 break;
1673
1674 default:
1675 OK = FALSE;
1676 break;
1677 }
1678
1679 if (OK == (d == OP_HSPACE))
1680 {
1681 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1682 {
1683 active_count--; /* Remove non-match possibility */
1684 next_active_state--;
1685 }
1686 if (++count >= GET2(code, 1))
1687 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1688 else
1689 { ADD_NEW_DATA(-state_offset, count, 0); }
1690 }
1691 }
1692 break;
1693
1694 /* ========================================================================== */
1695 /* These opcodes are followed by a character that is usually compared
1696 to the current subject character; it is loaded into d. We still get
1697 here even if there is no subject character, because in some cases zero
1698 repetitions are permitted. */
1699
1700 /*-----------------------------------------------------------------*/
1701 case OP_CHAR:
1702 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1703 break;
1704
1705 /*-----------------------------------------------------------------*/
1706 case OP_CHARNC:
1707 if (clen == 0) break;
1708
1709 #ifdef SUPPORT_UTF8
1710 if (utf8)
1711 {
1712 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1713 {
1714 unsigned int othercase;
1715 if (c < 128) othercase = fcc[c]; else
1716
1717 /* If we have Unicode property support, we can use it to test the
1718 other case of the character. */
1719
1720 #ifdef SUPPORT_UCP
1721 othercase = UCD_OTHERCASE(c);
1722 #else
1723 othercase = NOTACHAR;
1724 #endif
1725
1726 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1727 }
1728 }
1729 else
1730 #endif /* SUPPORT_UTF8 */
1731
1732 /* Non-UTF-8 mode */
1733 {
1734 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1735 }
1736 break;
1737
1738
1739 #ifdef SUPPORT_UCP
1740 /*-----------------------------------------------------------------*/
1741 /* This is a tricky one because it can match more than one character.
1742 Find out how many characters to skip, and then set up a negative state
1743 to wait for them to pass before continuing. */
1744
1745 case OP_EXTUNI:
1746 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1747 {
1748 const uschar *nptr = ptr + clen;
1749 int ncount = 0;
1750 while (nptr < end_subject)
1751 {
1752 int nclen = 1;
1753 GETCHARLEN(c, nptr, nclen);
1754 if (UCD_CATEGORY(c) != ucp_M) break;
1755 ncount++;
1756 nptr += nclen;
1757 }
1758 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1759 }
1760 break;
1761 #endif
1762
1763 /*-----------------------------------------------------------------*/
1764 /* This is tricky like EXTUNI because it too can match more than one
1765 character (when CR is followed by LF). In this case, set up a negative
1766 state to wait for one character to pass before continuing. */
1767
1768 case OP_ANYNL:
1769 if (clen > 0) switch(c)
1770 {
1771 case 0x000b:
1772 case 0x000c:
1773 case 0x0085:
1774 case 0x2028:
1775 case 0x2029:
1776 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1777
1778 case 0x000a:
1779 ADD_NEW(state_offset + 1, 0);
1780 break;
1781
1782 case 0x000d:
1783 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1784 {
1785 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1786 }
1787 else
1788 {
1789 ADD_NEW(state_offset + 1, 0);
1790 }
1791 break;
1792 }
1793 break;
1794
1795 /*-----------------------------------------------------------------*/
1796 case OP_NOT_VSPACE:
1797 if (clen > 0) switch(c)
1798 {
1799 case 0x000a:
1800 case 0x000b:
1801 case 0x000c:
1802 case 0x000d:
1803 case 0x0085:
1804 case 0x2028:
1805 case 0x2029:
1806 break;
1807
1808 default:
1809 ADD_NEW(state_offset + 1, 0);
1810 break;
1811 }
1812 break;
1813
1814 /*-----------------------------------------------------------------*/
1815 case OP_VSPACE:
1816 if (clen > 0) switch(c)
1817 {
1818 case 0x000a:
1819 case 0x000b:
1820 case 0x000c:
1821 case 0x000d:
1822 case 0x0085:
1823 case 0x2028:
1824 case 0x2029:
1825 ADD_NEW(state_offset + 1, 0);
1826 break;
1827
1828 default: break;
1829 }
1830 break;
1831
1832 /*-----------------------------------------------------------------*/
1833 case OP_NOT_HSPACE:
1834 if (clen > 0) switch(c)
1835 {
1836 case 0x09: /* HT */
1837 case 0x20: /* SPACE */
1838 case 0xa0: /* NBSP */
1839 case 0x1680: /* OGHAM SPACE MARK */
1840 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1841 case 0x2000: /* EN QUAD */
1842 case 0x2001: /* EM QUAD */
1843 case 0x2002: /* EN SPACE */
1844 case 0x2003: /* EM SPACE */
1845 case 0x2004: /* THREE-PER-EM SPACE */
1846 case 0x2005: /* FOUR-PER-EM SPACE */
1847 case 0x2006: /* SIX-PER-EM SPACE */
1848 case 0x2007: /* FIGURE SPACE */
1849 case 0x2008: /* PUNCTUATION SPACE */
1850 case 0x2009: /* THIN SPACE */
1851 case 0x200A: /* HAIR SPACE */
1852 case 0x202f: /* NARROW NO-BREAK SPACE */
1853 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1854 case 0x3000: /* IDEOGRAPHIC SPACE */
1855 break;
1856
1857 default:
1858 ADD_NEW(state_offset + 1, 0);
1859 break;
1860 }
1861 break;
1862
1863 /*-----------------------------------------------------------------*/
1864 case OP_HSPACE:
1865 if (clen > 0) switch(c)
1866 {
1867 case 0x09: /* HT */
1868 case 0x20: /* SPACE */
1869 case 0xa0: /* NBSP */
1870 case 0x1680: /* OGHAM SPACE MARK */
1871 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1872 case 0x2000: /* EN QUAD */
1873 case 0x2001: /* EM QUAD */
1874 case 0x2002: /* EN SPACE */
1875 case 0x2003: /* EM SPACE */
1876 case 0x2004: /* THREE-PER-EM SPACE */
1877 case 0x2005: /* FOUR-PER-EM SPACE */
1878 case 0x2006: /* SIX-PER-EM SPACE */
1879 case 0x2007: /* FIGURE SPACE */
1880 case 0x2008: /* PUNCTUATION SPACE */
1881 case 0x2009: /* THIN SPACE */
1882 case 0x200A: /* HAIR SPACE */
1883 case 0x202f: /* NARROW NO-BREAK SPACE */
1884 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1885 case 0x3000: /* IDEOGRAPHIC SPACE */
1886 ADD_NEW(state_offset + 1, 0);
1887 break;
1888 }
1889 break;
1890
1891 /*-----------------------------------------------------------------*/
1892 /* Match a negated single character. This is only used for one-byte
1893 characters, that is, we know that d < 256. The character we are
1894 checking (c) can be multibyte. */
1895
1896 case OP_NOT:
1897 if (clen > 0)
1898 {
1899 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1900 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1901 }
1902 break;
1903
1904 /*-----------------------------------------------------------------*/
1905 case OP_PLUS:
1906 case OP_MINPLUS:
1907 case OP_POSPLUS:
1908 case OP_NOTPLUS:
1909 case OP_NOTMINPLUS:
1910 case OP_NOTPOSPLUS:
1911 count = current_state->count; /* Already matched */
1912 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1913 if (clen > 0)
1914 {
1915 unsigned int otherd = NOTACHAR;
1916 if ((ims & PCRE_CASELESS) != 0)
1917 {
1918 #ifdef SUPPORT_UTF8
1919 if (utf8 && d >= 128)
1920 {
1921 #ifdef SUPPORT_UCP
1922 otherd = UCD_OTHERCASE(d);
1923 #endif /* SUPPORT_UCP */
1924 }
1925 else
1926 #endif /* SUPPORT_UTF8 */
1927 otherd = fcc[d];
1928 }
1929 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1930 {
1931 if (count > 0 &&
1932 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1933 {
1934 active_count--; /* Remove non-match possibility */
1935 next_active_state--;
1936 }
1937 count++;
1938 ADD_NEW(state_offset, count);
1939 }
1940 }
1941 break;
1942
1943 /*-----------------------------------------------------------------*/
1944 case OP_QUERY:
1945 case OP_MINQUERY:
1946 case OP_POSQUERY:
1947 case OP_NOTQUERY:
1948 case OP_NOTMINQUERY:
1949 case OP_NOTPOSQUERY:
1950 ADD_ACTIVE(state_offset + dlen + 1, 0);
1951 if (clen > 0)
1952 {
1953 unsigned int otherd = NOTACHAR;
1954 if ((ims & PCRE_CASELESS) != 0)
1955 {
1956 #ifdef SUPPORT_UTF8
1957 if (utf8 && d >= 128)
1958 {
1959 #ifdef SUPPORT_UCP
1960 otherd = UCD_OTHERCASE(d);
1961 #endif /* SUPPORT_UCP */
1962 }
1963 else
1964 #endif /* SUPPORT_UTF8 */
1965 otherd = fcc[d];
1966 }
1967 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1968 {
1969 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1970 {
1971 active_count--; /* Remove non-match possibility */
1972 next_active_state--;
1973 }
1974 ADD_NEW(state_offset + dlen + 1, 0);
1975 }
1976 }
1977 break;
1978
1979 /*-----------------------------------------------------------------*/
1980 case OP_STAR:
1981 case OP_MINSTAR:
1982 case OP_POSSTAR:
1983 case OP_NOTSTAR:
1984 case OP_NOTMINSTAR:
1985 case OP_NOTPOSSTAR:
1986 ADD_ACTIVE(state_offset + dlen + 1, 0);
1987 if (clen > 0)
1988 {
1989 unsigned int otherd = NOTACHAR;
1990 if ((ims & PCRE_CASELESS) != 0)
1991 {
1992 #ifdef SUPPORT_UTF8
1993 if (utf8 && d >= 128)
1994 {
1995 #ifdef SUPPORT_UCP
1996 otherd = UCD_OTHERCASE(d);
1997 #endif /* SUPPORT_UCP */
1998 }
1999 else
2000 #endif /* SUPPORT_UTF8 */
2001 otherd = fcc[d];
2002 }
2003 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2004 {
2005 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2006 {
2007 active_count--; /* Remove non-match possibility */
2008 next_active_state--;
2009 }
2010 ADD_NEW(state_offset, 0);
2011 }
2012 }
2013 break;
2014
2015 /*-----------------------------------------------------------------*/
2016 case OP_EXACT:
2017 case OP_NOTEXACT:
2018 count = current_state->count; /* Number already matched */
2019 if (clen > 0)
2020 {
2021 unsigned int otherd = NOTACHAR;
2022 if ((ims & PCRE_CASELESS) != 0)
2023 {
2024 #ifdef SUPPORT_UTF8
2025 if (utf8 && d >= 128)
2026 {
2027 #ifdef SUPPORT_UCP
2028 otherd = UCD_OTHERCASE(d);
2029 #endif /* SUPPORT_UCP */
2030 }
2031 else
2032 #endif /* SUPPORT_UTF8 */
2033 otherd = fcc[d];
2034 }
2035 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2036 {
2037 if (++count >= GET2(code, 1))
2038 { ADD_NEW(state_offset + dlen + 3, 0); }
2039 else
2040 { ADD_NEW(state_offset, count); }
2041 }
2042 }
2043 break;
2044
2045 /*-----------------------------------------------------------------*/
2046 case OP_UPTO:
2047 case OP_MINUPTO:
2048 case OP_POSUPTO:
2049 case OP_NOTUPTO:
2050 case OP_NOTMINUPTO:
2051 case OP_NOTPOSUPTO:
2052 ADD_ACTIVE(state_offset + dlen + 3, 0);
2053 count = current_state->count; /* Number already matched */
2054 if (clen > 0)
2055 {
2056 unsigned int otherd = NOTACHAR;
2057 if ((ims & PCRE_CASELESS) != 0)
2058 {
2059 #ifdef SUPPORT_UTF8
2060 if (utf8 && d >= 128)
2061 {
2062 #ifdef SUPPORT_UCP
2063 otherd = UCD_OTHERCASE(d);
2064 #endif /* SUPPORT_UCP */
2065 }
2066 else
2067 #endif /* SUPPORT_UTF8 */
2068 otherd = fcc[d];
2069 }
2070 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2071 {
2072 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2073 {
2074 active_count--; /* Remove non-match possibility */
2075 next_active_state--;
2076 }
2077 if (++count >= GET2(code, 1))
2078 { ADD_NEW(state_offset + dlen + 3, 0); }
2079 else
2080 { ADD_NEW(state_offset, count); }
2081 }
2082 }
2083 break;
2084
2085
2086 /* ========================================================================== */
2087 /* These are the class-handling opcodes */
2088
2089 case OP_CLASS:
2090 case OP_NCLASS:
2091 case OP_XCLASS:
2092 {
2093 BOOL isinclass = FALSE;
2094 int next_state_offset;
2095 const uschar *ecode;
2096
2097 /* For a simple class, there is always just a 32-byte table, and we
2098 can set isinclass from it. */
2099
2100 if (codevalue != OP_XCLASS)
2101 {
2102 ecode = code + 33;
2103 if (clen > 0)
2104 {
2105 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2106 ((code[1 + c/8] & (1 << (c&7))) != 0);
2107 }
2108 }
2109
2110 /* An extended class may have a table or a list of single characters,
2111 ranges, or both, and it may be positive or negative. There's a
2112 function that sorts all this out. */
2113
2114 else
2115 {
2116 ecode = code + GET(code, 1);
2117 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2118 }
2119
2120 /* At this point, isinclass is set for all kinds of class, and ecode
2121 points to the byte after the end of the class. If there is a
2122 quantifier, this is where it will be. */
2123
2124 next_state_offset = ecode - start_code;
2125
2126 switch (*ecode)
2127 {
2128 case OP_CRSTAR:
2129 case OP_CRMINSTAR:
2130 ADD_ACTIVE(next_state_offset + 1, 0);
2131 if (isinclass) { ADD_NEW(state_offset, 0); }
2132 break;
2133
2134 case OP_CRPLUS:
2135 case OP_CRMINPLUS:
2136 count = current_state->count; /* Already matched */
2137 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2138 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2139 break;
2140
2141 case OP_CRQUERY:
2142 case OP_CRMINQUERY:
2143 ADD_ACTIVE(next_state_offset + 1, 0);
2144 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2145 break;
2146
2147 case OP_CRRANGE:
2148 case OP_CRMINRANGE:
2149 count = current_state->count; /* Already matched */
2150 if (count >= GET2(ecode, 1))
2151 { ADD_ACTIVE(next_state_offset + 5, 0); }
2152 if (isinclass)
2153 {
2154 int max = GET2(ecode, 3);
2155 if (++count >= max && max != 0) /* Max 0 => no limit */
2156 { ADD_NEW(next_state_offset + 5, 0); }
2157 else
2158 { ADD_NEW(state_offset, count); }
2159 }
2160 break;
2161
2162 default:
2163 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2164 break;
2165 }
2166 }
2167 break;
2168
2169 /* ========================================================================== */
2170 /* These are the opcodes for fancy brackets of various kinds. We have
2171 to use recursion in order to handle them. The "always failing" assertion
2172 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2173 though the other "backtracking verbs" are not supported. */
2174
2175 case OP_FAIL:
2176 forced_fail++; /* Count FAILs for multiple states */
2177 break;
2178
2179 case OP_ASSERT:
2180 case OP_ASSERT_NOT:
2181 case OP_ASSERTBACK:
2182 case OP_ASSERTBACK_NOT:
2183 {
2184 int rc;
2185 int local_offsets[2];
2186 int local_workspace[1000];
2187 const uschar *endasscode = code + GET(code, 1);
2188
2189 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2190
2191 rc = internal_dfa_exec(
2192 md, /* static match data */
2193 code, /* this subexpression's code */
2194 ptr, /* where we currently are */
2195 ptr - start_subject, /* start offset */
2196 local_offsets, /* offset vector */
2197 sizeof(local_offsets)/sizeof(int), /* size of same */
2198 local_workspace, /* workspace vector */
2199 sizeof(local_workspace)/sizeof(int), /* size of same */
2200 ims, /* the current ims flags */
2201 rlevel, /* function recursion level */
2202 recursing); /* pass on regex recursion */
2203
2204 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2205 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2206 }
2207 break;
2208
2209 /*-----------------------------------------------------------------*/
2210 case OP_COND:
2211 case OP_SCOND:
2212 {
2213 int local_offsets[1000];
2214 int local_workspace[1000];
2215 int codelink = GET(code, 1);
2216 int condcode;
2217
2218 /* Because of the way auto-callout works during compile, a callout item
2219 is inserted between OP_COND and an assertion condition. This does not
2220 happen for the other conditions. */
2221
2222 if (code[LINK_SIZE+1] == OP_CALLOUT)
2223 {
2224 rrc = 0;
2225 if (pcre_callout != NULL)
2226 {
2227 pcre_callout_block cb;
2228 cb.version = 1; /* Version 1 of the callout block */
2229 cb.callout_number = code[LINK_SIZE+2];
2230 cb.offset_vector = offsets;
2231 cb.subject = (PCRE_SPTR)start_subject;
2232 cb.subject_length = end_subject - start_subject;
2233 cb.start_match = current_subject - start_subject;
2234 cb.current_position = ptr - start_subject;
2235 cb.pattern_position = GET(code, LINK_SIZE + 3);
2236 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2237 cb.capture_top = 1;
2238 cb.capture_last = -1;
2239 cb.callout_data = md->callout_data;
2240 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2241 }
2242 if (rrc > 0) break; /* Fail this thread */
2243 code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2244 }
2245
2246 condcode = code[LINK_SIZE+1];
2247
2248 /* Back reference conditions are not supported */
2249
2250 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2251
2252 /* The DEFINE condition is always false */
2253
2254 if (condcode == OP_DEF)
2255 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2256
2257 /* The only supported version of OP_RREF is for the value RREF_ANY,
2258 which means "test if in any recursion". We can't test for specifically
2259 recursed groups. */
2260
2261 else if (condcode == OP_RREF)
2262 {
2263 int value = GET2(code, LINK_SIZE+2);
2264 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2265 if (recursing > 0)
2266 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2267 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2268 }
2269
2270 /* Otherwise, the condition is an assertion */
2271
2272 else
2273 {
2274 int rc;
2275 const uschar *asscode = code + LINK_SIZE + 1;
2276 const uschar *endasscode = asscode + GET(asscode, 1);
2277
2278 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2279
2280 rc = internal_dfa_exec(
2281 md, /* fixed match data */
2282 asscode, /* this subexpression's code */
2283 ptr, /* where we currently are */
2284 ptr - start_subject, /* start offset */
2285 local_offsets, /* offset vector */
2286 sizeof(local_offsets)/sizeof(int), /* size of same */
2287 local_workspace, /* workspace vector */
2288 sizeof(local_workspace)/sizeof(int), /* size of same */
2289 ims, /* the current ims flags */
2290 rlevel, /* function recursion level */
2291 recursing); /* pass on regex recursion */
2292
2293 if ((rc >= 0) ==
2294 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2295 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2296 else
2297 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2298 }
2299 }
2300 break;
2301
2302 /*-----------------------------------------------------------------*/
2303 case OP_RECURSE:
2304 {
2305 int local_offsets[1000];
2306 int local_workspace[1000];
2307 int rc;
2308
2309 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2310 recursing + 1));
2311
2312 rc = internal_dfa_exec(
2313 md, /* fixed match data */
2314 start_code + GET(code, 1), /* this subexpression's code */
2315 ptr, /* where we currently are */
2316 ptr - start_subject, /* start offset */
2317 local_offsets, /* offset vector */
2318 sizeof(local_offsets)/sizeof(int), /* size of same */
2319 local_workspace, /* workspace vector */
2320 sizeof(local_workspace)/sizeof(int), /* size of same */
2321 ims, /* the current ims flags */
2322 rlevel, /* function recursion level */
2323 recursing + 1); /* regex recurse level */
2324
2325 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2326 recursing + 1, rc));
2327
2328 /* Ran out of internal offsets */
2329
2330 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2331
2332 /* For each successful matched substring, set up the next state with a
2333 count of characters to skip before trying it. Note that the count is in
2334 characters, not bytes. */
2335
2336 if (rc > 0)
2337 {
2338 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2339 {
2340 const uschar *p = start_subject + local_offsets[rc];
2341 const uschar *pp = start_subject + local_offsets[rc+1];
2342 int charcount = local_offsets[rc+1] - local_offsets[rc];
2343 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2344 if (charcount > 0)
2345 {
2346 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2347 }
2348 else
2349 {
2350 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2351 }
2352 }
2353 }
2354 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2355 }
2356 break;
2357
2358 /*-----------------------------------------------------------------*/
2359 case OP_ONCE:
2360 {
2361 int local_offsets[2];
2362 int local_workspace[1000];
2363
2364 int rc = internal_dfa_exec(
2365 md, /* fixed match data */
2366 code, /* this subexpression's code */
2367 ptr, /* where we currently are */
2368 ptr - start_subject, /* start offset */
2369 local_offsets, /* offset vector */
2370 sizeof(local_offsets)/sizeof(int), /* size of same */
2371 local_workspace, /* workspace vector */
2372 sizeof(local_workspace)/sizeof(int), /* size of same */
2373 ims, /* the current ims flags */
2374 rlevel, /* function recursion level */
2375 recursing); /* pass on regex recursion */
2376
2377 if (rc >= 0)
2378 {
2379 const uschar *end_subpattern = code;
2380 int charcount = local_offsets[1] - local_offsets[0];
2381 int next_state_offset, repeat_state_offset;
2382
2383 do { end_subpattern += GET(end_subpattern, 1); }
2384 while (*end_subpattern == OP_ALT);
2385 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2386
2387 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2388 arrange for the repeat state also to be added to the relevant list.
2389 Calculate the offset, or set -1 for no repeat. */
2390
2391 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2392 *end_subpattern == OP_KETRMIN)?
2393 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2394
2395 /* If we have matched an empty string, add the next state at the
2396 current character pointer. This is important so that the duplicate
2397 checking kicks in, which is what breaks infinite loops that match an
2398 empty string. */
2399
2400 if (charcount == 0)
2401 {
2402 ADD_ACTIVE(next_state_offset, 0);
2403 }
2404
2405 /* Optimization: if there are no more active states, and there
2406 are no new states yet set up, then skip over the subject string
2407 right here, to save looping. Otherwise, set up the new state to swing
2408 into action when the end of the substring is reached. */
2409
2410 else if (i + 1 >= active_count && new_count == 0)
2411 {
2412 ptr += charcount;
2413 clen = 0;
2414 ADD_NEW(next_state_offset, 0);
2415
2416 /* If we are adding a repeat state at the new character position,
2417 we must fudge things so that it is the only current state.
2418 Otherwise, it might be a duplicate of one we processed before, and
2419 that would cause it to be skipped. */
2420
2421 if (repeat_state_offset >= 0)
2422 {
2423 next_active_state = active_states;
2424 active_count = 0;
2425 i = -1;
2426 ADD_ACTIVE(repeat_state_offset, 0);
2427 }
2428 }
2429 else
2430 {
2431 const uschar *p = start_subject + local_offsets[0];
2432 const uschar *pp = start_subject + local_offsets[1];
2433 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2434 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2435 if (repeat_state_offset >= 0)
2436 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2437 }
2438
2439 }
2440 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2441 }
2442 break;
2443
2444
2445 /* ========================================================================== */
2446 /* Handle callouts */
2447
2448 case OP_CALLOUT:
2449 rrc = 0;
2450 if (pcre_callout != NULL)
2451 {
2452 pcre_callout_block cb;
2453 cb.version = 1; /* Version 1 of the callout block */
2454 cb.callout_number = code[1];
2455 cb.offset_vector = offsets;
2456 cb.subject = (PCRE_SPTR)start_subject;
2457 cb.subject_length = end_subject - start_subject;
2458 cb.start_match = current_subject - start_subject;
2459 cb.current_position = ptr - start_subject;
2460 cb.pattern_position = GET(code, 2);
2461 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2462 cb.capture_top = 1;
2463 cb.capture_last = -1;
2464 cb.callout_data = md->callout_data;
2465 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2466 }
2467 if (rrc == 0)
2468 { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2469 break;
2470
2471
2472 /* ========================================================================== */
2473 default: /* Unsupported opcode */
2474 return PCRE_ERROR_DFA_UITEM;
2475 }
2476
2477 NEXT_ACTIVE_STATE: continue;
2478
2479 } /* End of loop scanning active states */
2480
2481 /* We have finished the processing at the current subject character. If no
2482 new states have been set for the next character, we have found all the
2483 matches that we are going to find. If we are at the top level and partial
2484 matching has been requested, check for appropriate conditions. The "forced_
2485 fail" variable counts the number of (*F) encountered for the character. If it
2486 is equal to the original active_count (saved in workspace[1]) it means that
2487 (*F) was found on every active state. In this case we don't want to give a
2488 partial match. */
2489
2490 if (new_count <= 0)
2491 {
2492 if (rlevel == 1 && /* Top level, and */
2493 reached_end != workspace[1] && /* Not all reached end */
2494 forced_fail != workspace[1] && /* Not all forced fail & */
2495 ( /* either... */
2496 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2497 || /* or... */
2498 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2499 match_count < 0) /* no matches */
2500 ) && /* And... */
2501 ptr >= end_subject && /* Reached end of subject */
2502 ptr > current_subject) /* Matched non-empty string */
2503 {
2504 if (offsetcount >= 2)
2505 {
2506 offsets[0] = current_subject - start_subject;
2507 offsets[1] = end_subject - start_subject;
2508 }
2509 match_count = PCRE_ERROR_PARTIAL;
2510 }
2511
2512 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2513 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2514 rlevel*2-2, SP));
2515 break; /* In effect, "return", but see the comment below */
2516 }
2517
2518 /* One or more states are active for the next character. */
2519
2520 ptr += clen; /* Advance to next subject character */
2521 } /* Loop to move along the subject string */
2522
2523 /* Control gets here from "break" a few lines above. We do it this way because
2524 if we use "return" above, we have compiler trouble. Some compilers warn if
2525 there's nothing here because they think the function doesn't return a value. On
2526 the other hand, if we put a dummy statement here, some more clever compilers
2527 complain that it can't be reached. Sigh. */
2528
2529 return match_count;
2530 }
2531
2532
2533
2534
2535 /*************************************************
2536 * Execute a Regular Expression - DFA engine *
2537 *************************************************/
2538
2539 /* This external function applies a compiled re to a subject string using a DFA
2540 engine. This function calls the internal function multiple times if the pattern
2541 is not anchored.
2542
2543 Arguments:
2544 argument_re points to the compiled expression
2545 extra_data points to extra data or is NULL
2546 subject points to the subject string
2547 length length of subject string (may contain binary zeros)
2548 start_offset where to start in the subject string
2549 options option bits
2550 offsets vector of match offsets
2551 offsetcount size of same
2552 workspace workspace vector
2553 wscount size of same
2554
2555 Returns: > 0 => number of match offset pairs placed in offsets
2556 = 0 => offsets overflowed; longest matches are present
2557 -1 => failed to match
2558 < -1 => some kind of unexpected problem
2559 */
2560
2561 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2562 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2563 const char *subject, int length, int start_offset, int options, int *offsets,
2564 int offsetcount, int *workspace, int wscount)
2565 {
2566 real_pcre *re = (real_pcre *)argument_re;
2567 dfa_match_data match_block;
2568 dfa_match_data *md = &match_block;
2569 BOOL utf8, anchored, startline, firstline;
2570 const uschar *current_subject, *end_subject, *lcc;
2571
2572 pcre_study_data internal_study;
2573 const pcre_study_data *study = NULL;
2574 real_pcre internal_re;
2575
2576 const uschar *req_byte_ptr;
2577 const uschar *start_bits = NULL;
2578 BOOL first_byte_caseless = FALSE;
2579 BOOL req_byte_caseless = FALSE;
2580 int first_byte = -1;
2581 int req_byte = -1;
2582 int req_byte2 = -1;
2583 int newline;
2584
2585 /* Plausibility checks */
2586
2587 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2588 if (re == NULL || subject == NULL || workspace == NULL ||
2589 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2590 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2591 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2592
2593 /* We need to find the pointer to any study data before we test for byte
2594 flipping, so we scan the extra_data block first. This may set two fields in the
2595 match block, so we must initialize them beforehand. However, the other fields
2596 in the match block must not be set until after the byte flipping. */
2597
2598 md->tables = re->tables;
2599 md->callout_data = NULL;
2600
2601 if (extra_data != NULL)
2602 {
2603 unsigned int flags = extra_data->flags;
2604 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2605 study = (const pcre_study_data *)extra_data->study_data;
2606 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2607 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2608 return PCRE_ERROR_DFA_UMLIMIT;
2609 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2610 md->callout_data = extra_data->callout_data;
2611 if ((flags & PCRE_EXTRA_TABLES) != 0)
2612 md->tables = extra_data->tables;
2613 }
2614
2615 /* Check that the first field in the block is the magic number. If it is not,
2616 test for a regex that was compiled on a host of opposite endianness. If this is
2617 the case, flipped values are put in internal_re and internal_study if there was
2618 study data too. */
2619
2620 if (re->magic_number != MAGIC_NUMBER)
2621 {
2622 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2623 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2624 if (study != NULL) study = &internal_study;
2625 }
2626
2627 /* Set some local values */
2628
2629 current_subject = (const unsigned char *)subject + start_offset;
2630 end_subject = (const unsigned char *)subject + length;
2631 req_byte_ptr = current_subject - 1;
2632
2633 #ifdef SUPPORT_UTF8
2634 utf8 = (re->options & PCRE_UTF8) != 0;
2635 #else
2636 utf8 = FALSE;
2637 #endif
2638
2639 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2640 (re->options & PCRE_ANCHORED) != 0;
2641
2642 /* The remaining fixed data for passing around. */
2643
2644 md->start_code = (const uschar *)argument_re +
2645 re->name_table_offset + re->name_count * re->name_entry_size;
2646 md->start_subject = (const unsigned char *)subject;
2647 md->end_subject = end_subject;
2648 md->moptions = options;
2649 md->poptions = re->options;
2650
2651 /* If the BSR option is not set at match time, copy what was set
2652 at compile time. */
2653
2654 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2655 {
2656 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2657 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2658 #ifdef BSR_ANYCRLF
2659 else md->moptions |= PCRE_BSR_ANYCRLF;
2660 #endif
2661 }
2662
2663 /* Handle different types of newline. The three bits give eight cases. If
2664 nothing is set at run time, whatever was used at compile time applies. */
2665
2666 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2667 PCRE_NEWLINE_BITS)
2668 {
2669 case 0: newline = NEWLINE; break; /* Compile-time default */
2670 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2671 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2672 case PCRE_NEWLINE_CR+
2673 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2674 case PCRE_NEWLINE_ANY: newline = -1; break;
2675 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2676 default: return PCRE_ERROR_BADNEWLINE;
2677 }
2678
2679 if (newline == -2)
2680 {
2681 md->nltype = NLTYPE_ANYCRLF;
2682 }
2683 else if (newline < 0)
2684 {
2685 md->nltype = NLTYPE_ANY;
2686 }
2687 else
2688 {
2689 md->nltype = NLTYPE_FIXED;
2690 if (newline > 255)
2691 {
2692 md->nllen = 2;
2693 md->nl[0] = (newline >> 8) & 255;
2694 md->nl[1] = newline & 255;
2695 }
2696 else
2697 {
2698 md->nllen = 1;
2699 md->nl[0] = newline;
2700 }
2701 }
2702
2703 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2704 back the character offset. */
2705
2706 #ifdef SUPPORT_UTF8
2707 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2708 {
2709 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2710 return PCRE_ERROR_BADUTF8;
2711 if (start_offset > 0 && start_offset < length)
2712 {
2713 int tb = ((uschar *)subject)[start_offset];
2714 if (tb > 127)
2715 {
2716 tb &= 0xc0;
2717 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2718 }
2719 }
2720 }
2721 #endif
2722
2723 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2724 is a feature that makes it possible to save compiled regex and re-use them
2725 in other programs later. */
2726
2727 if (md->tables == NULL) md->tables = _pcre_default_tables;
2728
2729 /* The lower casing table and the "must be at the start of a line" flag are
2730 used in a loop when finding where to start. */
2731
2732 lcc = md->tables + lcc_offset;
2733 startline = (re->flags & PCRE_STARTLINE) != 0;
2734 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2735
2736 /* Set up the first character to match, if available. The first_byte value is
2737 never set for an anchored regular expression, but the anchoring may be forced
2738 at run time, so we have to test for anchoring. The first char may be unset for
2739 an unanchored pattern, of course. If there's no first char and the pattern was
2740 studied, there may be a bitmap of possible first characters. */
2741
2742 if (!anchored)
2743 {
2744 if ((re->flags & PCRE_FIRSTSET) != 0)
2745 {
2746 first_byte = re->first_byte & 255;
2747 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2748 first_byte = lcc[first_byte];
2749 }
2750 else
2751 {
2752 if (startline && study != NULL &&
2753 (study->options & PCRE_STUDY_MAPPED) != 0)
2754 start_bits = study->start_bits;
2755 }
2756 }
2757
2758 /* For anchored or unanchored matches, there may be a "last known required
2759 character" set. */
2760
2761 if ((re->flags & PCRE_REQCHSET) != 0)
2762 {
2763 req_byte = re->req_byte & 255;
2764 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2765 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2766 }
2767
2768 /* Call the main matching function, looping for a non-anchored regex after a
2769 failed match. If not restarting, perform certain optimizations at the start of
2770 a match. */
2771
2772 for (;;)
2773 {
2774 int rc;
2775
2776 if ((options & PCRE_DFA_RESTART) == 0)
2777 {
2778 const uschar *save_end_subject = end_subject;
2779
2780 /* If firstline is TRUE, the start of the match is constrained to the first
2781 line of a multiline string. Implement this by temporarily adjusting
2782 end_subject so that we stop scanning at a newline. If the match fails at
2783 the newline, later code breaks this loop. */
2784
2785 if (firstline)
2786 {
2787 USPTR t = current_subject;
2788 #ifdef SUPPORT_UTF8
2789 if (utf8)
2790 {
2791 while (t < md->end_subject && !IS_NEWLINE(t))
2792 {
2793 t++;
2794 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2795 }
2796 }
2797 else
2798 #endif
2799 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2800 end_subject = t;
2801 }
2802
2803 /* There are some optimizations that avoid running the match if a known
2804 starting point is not found, or if a known later character is not present.
2805 However, there is an option that disables these, for testing and for
2806 ensuring that all callouts do actually occur. */
2807
2808 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2809 {
2810
2811 /* Advance to a known first byte. */
2812
2813 if (first_byte >= 0)
2814 {
2815 if (first_byte_caseless)
2816 while (current_subject < end_subject &&
2817 lcc[*current_subject] != first_byte)
2818 current_subject++;
2819 else
2820 while (current_subject < end_subject &&
2821 *current_subject != first_byte)
2822 current_subject++;
2823 }
2824
2825 /* Or to just after a linebreak for a multiline match if possible */
2826
2827 else if (startline)
2828 {
2829 if (current_subject > md->start_subject + start_offset)
2830 {
2831 #ifdef SUPPORT_UTF8
2832 if (utf8)
2833 {
2834 while (current_subject < end_subject &&
2835 !WAS_NEWLINE(current_subject))
2836 {
2837 current_subject++;
2838 while(current_subject < end_subject &&
2839 (*current_subject & 0xc0) == 0x80)
2840 current_subject++;
2841 }
2842 }
2843 else
2844 #endif
2845 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2846 current_subject++;
2847
2848 /* If we have just passed a CR and the newline option is ANY or
2849 ANYCRLF, and we are now at a LF, advance the match position by one
2850 more character. */
2851
2852 if (current_subject[-1] == CHAR_CR &&
2853 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2854 current_subject < end_subject &&
2855 *current_subject == CHAR_NL)
2856 current_subject++;
2857 }
2858 }
2859
2860 /* Or to a non-unique first char after study */
2861
2862 else if (start_bits != NULL)
2863 {
2864 while (current_subject < end_subject)
2865 {
2866 register unsigned int c = *current_subject;
2867 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2868 else break;
2869 }
2870 }
2871 }
2872
2873 /* Restore fudged end_subject */
2874
2875 end_subject = save_end_subject;
2876 }
2877
2878 /* If req_byte is set, we know that that character must appear in the subject
2879 for the match to succeed. If the first character is set, req_byte must be
2880 later in the subject; otherwise the test starts at the match point. This
2881 optimization can save a huge amount of work in patterns with nested unlimited
2882 repeats that aren't going to match. Writing separate code for cased/caseless
2883 versions makes it go faster, as does using an autoincrement and backing off
2884 on a match.
2885
2886 HOWEVER: when the subject string is very, very long, searching to its end can
2887 take a long time, and give bad performance on quite ordinary patterns. This
2888 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2889 don't do this when the string is sufficiently long.
2890
2891 ALSO: this processing is disabled when partial matching is requested, and can
2892 also be explicitly deactivated. Furthermore, we have to disable when
2893 restarting after a partial match, because the required character may have
2894 already been matched. */
2895
2896 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2897 req_byte >= 0 &&
2898 end_subject - current_subject < REQ_BYTE_MAX &&
2899 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2900 {
2901 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2902
2903 /* We don't need to repeat the search if we haven't yet reached the
2904 place we found it at last time. */
2905
2906 if (p > req_byte_ptr)
2907 {
2908 if (req_byte_caseless)
2909 {
2910 while (p < end_subject)
2911 {
2912 register int pp = *p++;
2913 if (pp == req_byte || pp == req_byte2) { p--; break; }
2914 }
2915 }
2916 else
2917 {
2918 while (p < end_subject)
2919 {
2920 if (*p++ == req_byte) { p--; break; }
2921 }
2922 }
2923
2924 /* If we can't find the required character, break the matching loop,
2925 which will cause a return or PCRE_ERROR_NOMATCH. */
2926
2927 if (p >= end_subject) break;
2928
2929 /* If we have found the required character, save the point where we
2930 found it, so that we don't search again next time round the loop if
2931 the start hasn't passed this character yet. */
2932
2933 req_byte_ptr = p;
2934 }
2935 }
2936
2937 /* OK, now we can do the business */
2938
2939 rc = internal_dfa_exec(
2940 md, /* fixed match data */
2941 md->start_code, /* this subexpression's code */
2942 current_subject, /* where we currently are */
2943 start_offset, /* start offset in subject */
2944 offsets, /* offset vector */
2945 offsetcount, /* size of same */
2946 workspace, /* workspace vector */
2947 wscount, /* size of same */
2948 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2949 0, /* function recurse level */
2950 0); /* regex recurse level */
2951
2952 /* Anything other than "no match" means we are done, always; otherwise, carry
2953 on only if not anchored. */
2954
2955 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2956
2957 /* Advance to the next subject character unless we are at the end of a line
2958 and firstline is set. */
2959
2960 if (firstline && IS_NEWLINE(current_subject)) break;
2961 current_subject++;
2962 if (utf8)
2963 {
2964 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2965 current_subject++;
2966 }
2967 if (current_subject > end_subject) break;
2968
2969 /* If we have just passed a CR and we are now at a LF, and the pattern does
2970 not contain any explicit matches for \r or \n, and the newline option is CRLF
2971 or ANY or ANYCRLF, advance the match position by one more character. */
2972
2973 if (current_subject[-1] == CHAR_CR &&
2974 current_subject < end_subject &&
2975 *current_subject == CHAR_NL &&
2976 (re->flags & PCRE_HASCRORLF) == 0 &&
2977 (md->nltype == NLTYPE_ANY ||
2978 md->nltype == NLTYPE_ANYCRLF ||
2979 md->nllen == 2))
2980 current_subject++;
2981
2982 } /* "Bumpalong" loop */
2983
2984 return PCRE_ERROR_NOMATCH;
2985 }
2986
2987 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5