/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 335 - (show annotations)
Sat Apr 12 14:36:14 2008 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 95430 byte(s)
Error occurred while calculating annotation data.
Do not discard subpatterns with {0} quantifiers, as they may be called as 
subroutines.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 #ifdef HAVE_CONFIG_H
48 #include "config.h"
49 #endif
50
51 #define NLBLOCK md /* Block containing newline information */
52 #define PSSTART start_subject /* Field containing processed string start */
53 #define PSEND end_subject /* Field containing processed string end */
54
55 #include "pcre_internal.h"
56
57
58 /* For use to indent debugging output */
59
60 #define SP " "
61
62
63
64 /*************************************************
65 * Code parameters and static tables *
66 *************************************************/
67
/* Offsets added to OP_TYPESTAR and the other type-repeat opcodes to form
"virtual" opcodes when the repeated item needs special handling. Each block
starts 20 above the previous one, which should leave enough room. The results
are only ever used as switch values and are never stored, so they need not fit
in a byte; starting at 300 keeps them well clear of the real opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     (OP_PROP_EXTRA + 20)
#define OP_ANYNL_EXTRA      (OP_EXTUNI_EXTRA + 20)
#define OP_HSPACE_EXTRA     (OP_ANYNL_EXTRA + 20)
#define OP_VSPACE_EXTRA     (OP_HSPACE_EXTRA + 20)
78
79
80 /* This table identifies those opcodes that are followed immediately by a
81 character that is to be tested in some way. This makes is possible to
82 centralize the loading of these characters. In the case of Type * etc, the
83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 small value. ***NOTE*** If the start of this table is modified, the two tables
85 that follow must also be modified. */
86
87 static const uschar coptable[] = {
88 0, /* End */
89 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 0, 0, /* Any, Anybyte */
92 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95 1, /* Char */
96 1, /* Charnc */
97 1, /* not */
98 /* Positive single-char repeats */
99 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* upto, minupto, exact */
101 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 /* Negative single-char repeats - only for chars < 256 */
103 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104 3, 3, 3, /* NOT upto, minupto, exact */
105 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 /* Positive type repeats */
107 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108 3, 3, 3, /* Type upto, minupto, exact */
109 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 /* Character class & ref repeats */
111 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112 0, 0, /* CRRANGE, CRMINRANGE */
113 0, /* CLASS */
114 0, /* NCLASS */
115 0, /* XCLASS - variable length */
116 0, /* REF */
117 0, /* RECURSE */
118 0, /* CALLOUT */
119 0, /* Alt */
120 0, /* Ket */
121 0, /* KetRmax */
122 0, /* KetRmin */
123 0, /* Assert */
124 0, /* Assert not */
125 0, /* Assert behind */
126 0, /* Assert behind not */
127 0, /* Reverse */
128 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129 0, 0, 0, /* SBRA, SCBRA, SCOND */
130 0, /* CREF */
131 0, /* RREF */
132 0, /* DEF */
133 0, 0, /* BRAZERO, BRAMINZERO */
134 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 0, 0 /* FAIL, ACCEPT */
136 };
137
138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139 and \w */
140
141 static const uschar toptable1[] = {
142 0, 0, 0, 0, 0, 0,
143 ctype_digit, ctype_digit,
144 ctype_space, ctype_space,
145 ctype_word, ctype_word,
146 0 /* OP_ANY */
147 };
148
149 static const uschar toptable2[] = {
150 0, 0, 0, 0, 0, 0,
151 ctype_digit, 0,
152 ctype_space, 0,
153 ctype_word, 0,
154 1 /* OP_ANY */
155 };
156
157
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means
                                     "hold off going to this (negated) state"
                                     until data characters have been skipped */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data; for a negative
                                     offset, the number of characters still
                                     to be skipped */
} stateblock;

/* The number of ints that one stateblock occupies in the working vector. The
value is cast to int because it is combined with the signed workspace size
(wscount); left as a size_t it would force that arithmetic to be done unsigned,
turning a negative size into a very large positive one. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
171
172
173 #ifdef DEBUG
/*************************************************
*            Print character string              *
*************************************************/

/* Character string printing function for debugging. Each byte for which
isprint() is true is written unchanged; every other byte is written as a
backslash, the letter x, and at least two lower-case hexadecimal digits.

Arguments:
  p           points to string; the bytes are only read, never modified
  length      number of bytes; nothing is written if this is not positive
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;                 /* Always 0..UCHAR_MAX, so safe for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
200 #endif
201
202
203
204 /*************************************************
205 * Execute a Regular Expression - DFA engine *
206 *************************************************/
207
208 /* This internal function applies a compiled pattern to a subject string,
209 starting at a given point, using a DFA engine. This function is called from the
210 external one, possibly multiple times if the pattern is not anchored. The
211 function calls itself recursively for some kinds of subpattern.
212
213 Arguments:
214 md the match_data block with fixed information
215 this_start_code the opening bracket of this subexpression's code
216 current_subject where we currently are in the subject string
217 start_offset start offset in the subject string
218 offsets vector to contain the matching string offsets
219 offsetcount size of same
220 workspace vector of workspace
221 wscount size of same
222 ims the current ims flags
223 rlevel function call recursion level
224 recursing regex recursive call level
225
226 Returns: > 0 =>
227 = 0 =>
228 -1 => failed to match
229 < -1 => some kind of unexpected problem
230
231 The following macros are used for adding states to the two state vectors (one
232 for the current character, one for the following character). */
233
/* Append a state with opcode offset x and repeat count y, carrying the
current ims value, to the list of active states. If that list already holds
wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead.
The data field of the new state is not written. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
244
/* As ADD_ACTIVE, but also stores z in the data field of the new state. The
enclosing function returns PCRE_ERROR_DFA_WSSIZE if the active list already
holds wscount states. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
256
/* Append a state with opcode offset x and repeat count y, carrying the
current ims value, to the list of new states (those for the following
character). If that list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE instead. The data field of the new state is not
written. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
267
/* As ADD_NEW, but also stores z in the data field of the new state. The
enclosing function returns PCRE_ERROR_DFA_WSSIZE if the new-state list already
holds wscount states. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
279
280 /* And now, here is the code */
281
282 static int
283 internal_dfa_exec(
284 dfa_match_data *md,
285 const uschar *this_start_code,
286 const uschar *current_subject,
287 int start_offset,
288 int *offsets,
289 int offsetcount,
290 int *workspace,
291 int wscount,
292 int ims,
293 int rlevel,
294 int recursing)
295 {
296 stateblock *active_states, *new_states, *temp_states;
297 stateblock *next_active_state, *next_new_state;
298
299 const uschar *ctypes, *lcc, *fcc;
300 const uschar *ptr;
301 const uschar *end_code, *first_op;
302
303 int active_count, new_count, match_count;
304
305 /* Some fields in the md block are frequently referenced, so we load them into
306 independent variables in the hope that this will perform better. */
307
308 const uschar *start_subject = md->start_subject;
309 const uschar *end_subject = md->end_subject;
310 const uschar *start_code = md->start_code;
311
312 #ifdef SUPPORT_UTF8
313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 #else
315 BOOL utf8 = FALSE;
316 #endif
317
318 rlevel++;
319 offsetcount &= (-2);
320
321 wscount -= 2;
322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323 (2 * INTS_PER_STATEBLOCK);
324
325 DPRINTF(("\n%.*s---------------------\n"
326 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328
329 ctypes = md->tables + ctypes_offset;
330 lcc = md->tables + lcc_offset;
331 fcc = md->tables + fcc_offset;
332
333 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334
335 active_states = (stateblock *)(workspace + 2);
336 next_new_state = new_states = active_states + wscount;
337 new_count = 0;
338
339 first_op = this_start_code + 1 + LINK_SIZE +
340 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341
342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343 the alternative states onto the list, and find out where the end is. This
344 makes is possible to use this function recursively, when we want to stop at a
345 matching internal ket rather than at the end.
346
347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348 a backward assertion. In that case, we have to find out the maximum amount to
349 move back, and set up each alternative appropriately. */
350
351 if (*first_op == OP_REVERSE)
352 {
353 int max_back = 0;
354 int gone_back;
355
356 end_code = this_start_code;
357 do
358 {
359 int back = GET(end_code, 2+LINK_SIZE);
360 if (back > max_back) max_back = back;
361 end_code += GET(end_code, 1);
362 }
363 while (*end_code == OP_ALT);
364
365 /* If we can't go back the amount required for the longest lookbehind
366 pattern, go back as far as we can; some alternatives may still be viable. */
367
368 #ifdef SUPPORT_UTF8
369 /* In character mode we have to step back character by character */
370
371 if (utf8)
372 {
373 for (gone_back = 0; gone_back < max_back; gone_back++)
374 {
375 if (current_subject <= start_subject) break;
376 current_subject--;
377 while (current_subject > start_subject &&
378 (*current_subject & 0xc0) == 0x80)
379 current_subject--;
380 }
381 }
382 else
383 #endif
384
385 /* In byte-mode we can do this quickly. */
386
387 {
388 gone_back = (current_subject - max_back < start_subject)?
389 current_subject - start_subject : max_back;
390 current_subject -= gone_back;
391 }
392
393 /* Now we can process the individual branches. */
394
395 end_code = this_start_code;
396 do
397 {
398 int back = GET(end_code, 2+LINK_SIZE);
399 if (back <= gone_back)
400 {
401 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402 ADD_NEW_DATA(-bstate, 0, gone_back - back);
403 }
404 end_code += GET(end_code, 1);
405 }
406 while (*end_code == OP_ALT);
407 }
408
409 /* This is the code for a "normal" subpattern (not a backward assertion). The
410 start of a whole pattern is always one of these. If we are at the top level,
411 we may be asked to restart matching from the same point that we reached for a
412 previous partial match. We still have to scan through the top-level branches to
413 find the end state. */
414
415 else
416 {
417 end_code = this_start_code;
418
419 /* Restarting */
420
421 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422 {
423 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424 new_count = workspace[1];
425 if (!workspace[0])
426 memcpy(new_states, active_states, new_count * sizeof(stateblock));
427 }
428
429 /* Not restarting */
430
431 else
432 {
433 int length = 1 + LINK_SIZE +
434 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 do
436 {
437 ADD_NEW(end_code - start_code + length, 0);
438 end_code += GET(end_code, 1);
439 length = 1 + LINK_SIZE;
440 }
441 while (*end_code == OP_ALT);
442 }
443 }
444
445 workspace[0] = 0; /* Bit indicating which vector is current */
446
447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448
449 /* Loop for scanning the subject */
450
451 ptr = current_subject;
452 for (;;)
453 {
454 int i, j;
455 int clen, dlen;
456 unsigned int c, d;
457
458 /* Make the new state list into the active state list and empty the
459 new state list. */
460
461 temp_states = active_states;
462 active_states = new_states;
463 new_states = temp_states;
464 active_count = new_count;
465 new_count = 0;
466
467 workspace[0] ^= 1; /* Remember for the restarting feature */
468 workspace[1] = active_count;
469
470 #ifdef DEBUG
471 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473 printf("\"\n");
474
475 printf("%.*sActive states: ", rlevel*2-2, SP);
476 for (i = 0; i < active_count; i++)
477 printf("%d/%d ", active_states[i].offset, active_states[i].count);
478 printf("\n");
479 #endif
480
481 /* Set the pointers for adding new states */
482
483 next_active_state = active_states + active_count;
484 next_new_state = new_states;
485
486 /* Load the current character from the subject outside the loop, as many
487 different states may want to look at it, and we assume that at least one
488 will. */
489
490 if (ptr < end_subject)
491 {
492 clen = 1; /* Number of bytes in the character */
493 #ifdef SUPPORT_UTF8
494 if (utf8) { GETCHARLEN(c, ptr, clen); } else
495 #endif /* SUPPORT_UTF8 */
496 c = *ptr;
497 }
498 else
499 {
500 clen = 0; /* This indicates the end of the subject */
501 c = NOTACHAR; /* This value should never actually be used */
502 }
503
504 /* Scan up the active states and act on each one. The result of an action
505 may be to add more states to the currently active list (e.g. on hitting a
506 parenthesis) or it may be to put states on the new list, for considering
507 when we move the character pointer on. */
508
509 for (i = 0; i < active_count; i++)
510 {
511 stateblock *current_state = active_states + i;
512 const uschar *code;
513 int state_offset = current_state->offset;
514 int count, codevalue;
515 #ifdef SUPPORT_UCP
516 int chartype, script;
517 #endif
518
519 #ifdef DEBUG
520 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
521 if (clen == 0) printf("EOL\n");
522 else if (c > 32 && c < 127) printf("'%c'\n", c);
523 else printf("0x%02x\n", c);
524 #endif
525
526 /* This variable is referred to implicity in the ADD_xxx macros. */
527
528 ims = current_state->ims;
529
530 /* A negative offset is a special case meaning "hold off going to this
531 (negated) state until the number of characters in the data field have
532 been skipped". */
533
534 if (state_offset < 0)
535 {
536 if (current_state->data > 0)
537 {
538 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
539 ADD_NEW_DATA(state_offset, current_state->count,
540 current_state->data - 1);
541 continue;
542 }
543 else
544 {
545 current_state->offset = state_offset = -state_offset;
546 }
547 }
548
549 /* Check for a duplicate state with the same count, and skip if found. */
550
551 for (j = 0; j < i; j++)
552 {
553 if (active_states[j].offset == state_offset &&
554 active_states[j].count == current_state->count)
555 {
556 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
557 goto NEXT_ACTIVE_STATE;
558 }
559 }
560
561 /* The state offset is the offset to the opcode */
562
563 code = start_code + state_offset;
564 codevalue = *code;
565
566 /* If this opcode is followed by an inline character, load it. It is
567 tempting to test for the presence of a subject character here, but that
568 is wrong, because sometimes zero repetitions of the subject are
569 permitted.
570
571 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572 argument that is not a data character - but is always one byte long. We
573 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
574 this case. To keep the other cases fast, convert these ones to new opcodes.
575 */
576
577 if (coptable[codevalue] > 0)
578 {
579 dlen = 1;
580 #ifdef SUPPORT_UTF8
581 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
582 #endif /* SUPPORT_UTF8 */
583 d = code[coptable[codevalue]];
584 if (codevalue >= OP_TYPESTAR)
585 {
586 switch(d)
587 {
588 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
589 case OP_NOTPROP:
590 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
593 case OP_NOT_HSPACE:
594 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
595 case OP_NOT_VSPACE:
596 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597 default: break;
598 }
599 }
600 }
601 else
602 {
603 dlen = 0; /* Not strictly necessary, but compilers moan */
604 d = NOTACHAR; /* if these variables are not set. */
605 }
606
607
608 /* Now process the individual opcodes */
609
610 switch (codevalue)
611 {
612
613 /* ========================================================================== */
614 /* Reached a closing bracket. If not at the end of the pattern, carry
615 on with the next opcode. Otherwise, unless we have an empty string and
616 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
617 matches so we always have the longest first. */
618
619 case OP_KET:
620 case OP_KETRMIN:
621 case OP_KETRMAX:
622 if (code != end_code)
623 {
624 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
625 if (codevalue != OP_KET)
626 {
627 ADD_ACTIVE(state_offset - GET(code, 1), 0);
628 }
629 }
630 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
631 {
632 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
633 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
634 match_count = 0;
635 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
636 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
637 if (offsetcount >= 2)
638 {
639 offsets[0] = current_subject - start_subject;
640 offsets[1] = ptr - start_subject;
641 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
642 offsets[1] - offsets[0], current_subject));
643 }
644 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
645 {
646 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
647 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
648 match_count, rlevel*2-2, SP));
649 return match_count;
650 }
651 }
652 break;
653
654 /* ========================================================================== */
655 /* These opcodes add to the current list of states without looking
656 at the current character. */
657
658 /*-----------------------------------------------------------------*/
659 case OP_ALT:
660 do { code += GET(code, 1); } while (*code == OP_ALT);
661 ADD_ACTIVE(code - start_code, 0);
662 break;
663
664 /*-----------------------------------------------------------------*/
665 case OP_BRA:
666 case OP_SBRA:
667 do
668 {
669 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
670 code += GET(code, 1);
671 }
672 while (*code == OP_ALT);
673 break;
674
675 /*-----------------------------------------------------------------*/
676 case OP_CBRA:
677 case OP_SCBRA:
678 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
679 code += GET(code, 1);
680 while (*code == OP_ALT)
681 {
682 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
683 code += GET(code, 1);
684 }
685 break;
686
687 /*-----------------------------------------------------------------*/
688 case OP_BRAZERO:
689 case OP_BRAMINZERO:
690 ADD_ACTIVE(state_offset + 1, 0);
691 code += 1 + GET(code, 2);
692 while (*code == OP_ALT) code += GET(code, 1);
693 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
694 break;
695
696 /*-----------------------------------------------------------------*/
697 case OP_SKIPZERO:
698 code += 1 + GET(code, 2);
699 while (*code == OP_ALT) code += GET(code, 1);
700 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
701 break;
702
703 /*-----------------------------------------------------------------*/
704 case OP_CIRC:
705 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
706 ((ims & PCRE_MULTILINE) != 0 &&
707 ptr != end_subject &&
708 WAS_NEWLINE(ptr)))
709 { ADD_ACTIVE(state_offset + 1, 0); }
710 break;
711
712 /*-----------------------------------------------------------------*/
713 case OP_EOD:
714 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
715 break;
716
717 /*-----------------------------------------------------------------*/
718 case OP_OPT:
719 ims = code[1];
720 ADD_ACTIVE(state_offset + 2, 0);
721 break;
722
723 /*-----------------------------------------------------------------*/
724 case OP_SOD:
725 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
726 break;
727
728 /*-----------------------------------------------------------------*/
729 case OP_SOM:
730 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
731 break;
732
733
734 /* ========================================================================== */
735 /* These opcodes inspect the next subject character, and sometimes
736 the previous one as well, but do not have an argument. The variable
737 clen contains the length of the current character and is zero if we are
738 at the end of the subject. */
739
740 /*-----------------------------------------------------------------*/
741 case OP_ANY:
742 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
743 { ADD_NEW(state_offset + 1, 0); }
744 break;
745
746 /*-----------------------------------------------------------------*/
747 case OP_EODN:
748 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
749 { ADD_ACTIVE(state_offset + 1, 0); }
750 break;
751
752 /*-----------------------------------------------------------------*/
753 case OP_DOLL:
754 if ((md->moptions & PCRE_NOTEOL) == 0)
755 {
756 if (clen == 0 ||
757 (IS_NEWLINE(ptr) &&
758 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
759 ))
760 { ADD_ACTIVE(state_offset + 1, 0); }
761 }
762 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
763 { ADD_ACTIVE(state_offset + 1, 0); }
764 break;
765
766 /*-----------------------------------------------------------------*/
767
768 case OP_DIGIT:
769 case OP_WHITESPACE:
770 case OP_WORDCHAR:
771 if (clen > 0 && c < 256 &&
772 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
773 { ADD_NEW(state_offset + 1, 0); }
774 break;
775
776 /*-----------------------------------------------------------------*/
777 case OP_NOT_DIGIT:
778 case OP_NOT_WHITESPACE:
779 case OP_NOT_WORDCHAR:
780 if (clen > 0 && (c >= 256 ||
781 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
782 { ADD_NEW(state_offset + 1, 0); }
783 break;
784
785 /*-----------------------------------------------------------------*/
786 case OP_WORD_BOUNDARY:
787 case OP_NOT_WORD_BOUNDARY:
788 {
789 int left_word, right_word;
790
791 if (ptr > start_subject)
792 {
793 const uschar *temp = ptr - 1;
794 #ifdef SUPPORT_UTF8
795 if (utf8) BACKCHAR(temp);
796 #endif
797 GETCHARTEST(d, temp);
798 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
799 }
800 else left_word = 0;
801
802 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
803 else right_word = 0;
804
805 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
806 { ADD_ACTIVE(state_offset + 1, 0); }
807 }
808 break;
809
810
811 /*-----------------------------------------------------------------*/
812 /* Check the next character by Unicode property. We will get here only
813 if the support is in the binary; otherwise a compile-time error occurs.
814 */
815
816 #ifdef SUPPORT_UCP
817 case OP_PROP:
818 case OP_NOTPROP:
819 if (clen > 0)
820 {
821 BOOL OK;
822 int category = _pcre_ucp_findprop(c, &chartype, &script);
823 switch(code[1])
824 {
825 case PT_ANY:
826 OK = TRUE;
827 break;
828
829 case PT_LAMP:
830 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
831 break;
832
833 case PT_GC:
834 OK = category == code[2];
835 break;
836
837 case PT_PC:
838 OK = chartype == code[2];
839 break;
840
841 case PT_SC:
842 OK = script == code[2];
843 break;
844
845 /* Should never occur, but keep compilers from grumbling. */
846
847 default:
848 OK = codevalue != OP_PROP;
849 break;
850 }
851
852 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
853 }
854 break;
855 #endif
856
857
858
859 /* ========================================================================== */
860 /* These opcodes likewise inspect the subject character, but have an
861 argument that is not a data character. It is one of these opcodes:
862 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
863 OP_NOT_WORDCHAR. The value is loaded into d. */
864
865 case OP_TYPEPLUS:
866 case OP_TYPEMINPLUS:
867 case OP_TYPEPOSPLUS:
868 count = current_state->count; /* Already matched */
869 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
870 if (clen > 0)
871 {
872 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
873 (c < 256 &&
874 (d != OP_ANY ||
875 (ims & PCRE_DOTALL) != 0 ||
876 !IS_NEWLINE(ptr)
877 ) &&
878 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
879 {
880 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
881 {
882 active_count--; /* Remove non-match possibility */
883 next_active_state--;
884 }
885 count++;
886 ADD_NEW(state_offset, count);
887 }
888 }
889 break;
890
891 /*-----------------------------------------------------------------*/
892 case OP_TYPEQUERY:
893 case OP_TYPEMINQUERY:
894 case OP_TYPEPOSQUERY:
895 ADD_ACTIVE(state_offset + 2, 0);
896 if (clen > 0)
897 {
898 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
899 (c < 256 &&
900 (d != OP_ANY ||
901 (ims & PCRE_DOTALL) != 0 ||
902 !IS_NEWLINE(ptr)
903 ) &&
904 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
905 {
906 if (codevalue == OP_TYPEPOSQUERY)
907 {
908 active_count--; /* Remove non-match possibility */
909 next_active_state--;
910 }
911 ADD_NEW(state_offset + 2, 0);
912 }
913 }
914 break;
915
916 /*-----------------------------------------------------------------*/
917 case OP_TYPESTAR:
918 case OP_TYPEMINSTAR:
919 case OP_TYPEPOSSTAR:
920 ADD_ACTIVE(state_offset + 2, 0);
921 if (clen > 0)
922 {
923 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
924 (c < 256 &&
925 (d != OP_ANY ||
926 (ims & PCRE_DOTALL) != 0 ||
927 !IS_NEWLINE(ptr)
928 ) &&
929 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
930 {
931 if (codevalue == OP_TYPEPOSSTAR)
932 {
933 active_count--; /* Remove non-match possibility */
934 next_active_state--;
935 }
936 ADD_NEW(state_offset, 0);
937 }
938 }
939 break;
940
941 /*-----------------------------------------------------------------*/
942 case OP_TYPEEXACT:
943 count = current_state->count; /* Number already matched */
944 if (clen > 0)
945 {
946 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
947 (c < 256 &&
948 (d != OP_ANY ||
949 (ims & PCRE_DOTALL) != 0 ||
950 !IS_NEWLINE(ptr)
951 ) &&
952 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
953 {
954 if (++count >= GET2(code, 1))
955 { ADD_NEW(state_offset + 4, 0); }
956 else
957 { ADD_NEW(state_offset, count); }
958 }
959 }
960 break;
961
962 /*-----------------------------------------------------------------*/
963 case OP_TYPEUPTO:
964 case OP_TYPEMINUPTO:
965 case OP_TYPEPOSUPTO:
966 ADD_ACTIVE(state_offset + 4, 0);
967 count = current_state->count; /* Number already matched */
968 if (clen > 0)
969 {
970 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
971 (c < 256 &&
972 (d != OP_ANY ||
973 (ims & PCRE_DOTALL) != 0 ||
974 !IS_NEWLINE(ptr)
975 ) &&
976 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
977 {
978 if (codevalue == OP_TYPEPOSUPTO)
979 {
980 active_count--; /* Remove non-match possibility */
981 next_active_state--;
982 }
983 if (++count >= GET2(code, 1))
984 { ADD_NEW(state_offset + 4, 0); }
985 else
986 { ADD_NEW(state_offset, count); }
987 }
988 }
989 break;
990
991 /* ========================================================================== */
992 /* These are virtual opcodes that are used when something like
993 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
994 argument. It keeps the code above fast for the other cases. The argument
995 is in the d variable. */
996
997 #ifdef SUPPORT_UCP
998 case OP_PROP_EXTRA + OP_TYPEPLUS:
999 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1000 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1001 count = current_state->count; /* Already matched */
1002 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1003 if (clen > 0)
1004 {
1005 BOOL OK;
1006 int category = _pcre_ucp_findprop(c, &chartype, &script);
1007 switch(code[2])
1008 {
1009 case PT_ANY:
1010 OK = TRUE;
1011 break;
1012
1013 case PT_LAMP:
1014 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1015 break;
1016
1017 case PT_GC:
1018 OK = category == code[3];
1019 break;
1020
1021 case PT_PC:
1022 OK = chartype == code[3];
1023 break;
1024
1025 case PT_SC:
1026 OK = script == code[3];
1027 break;
1028
1029 /* Should never occur, but keep compilers from grumbling. */
1030
1031 default:
1032 OK = codevalue != OP_PROP;
1033 break;
1034 }
1035
1036 if (OK == (d == OP_PROP))
1037 {
1038 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1039 {
1040 active_count--; /* Remove non-match possibility */
1041 next_active_state--;
1042 }
1043 count++;
1044 ADD_NEW(state_offset, count);
1045 }
1046 }
1047 break;
1048
1049 /*-----------------------------------------------------------------*/
1050 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1051 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1052 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1053 count = current_state->count; /* Already matched */
1054 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1055 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1056 {
1057 const uschar *nptr = ptr + clen;
1058 int ncount = 0;
1059 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1060 {
1061 active_count--; /* Remove non-match possibility */
1062 next_active_state--;
1063 }
1064 while (nptr < end_subject)
1065 {
1066 int nd;
1067 int ndlen = 1;
1068 GETCHARLEN(nd, nptr, ndlen);
1069 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1070 ncount++;
1071 nptr += ndlen;
1072 }
1073 count++;
1074 ADD_NEW_DATA(-state_offset, count, ncount);
1075 }
1076 break;
1077 #endif
1078
1079 /*-----------------------------------------------------------------*/
1080 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1081 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1082 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1083 count = current_state->count; /* Already matched */
1084 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1085 if (clen > 0)
1086 {
1087 int ncount = 0;
1088 switch (c)
1089 {
1090 case 0x000b:
1091 case 0x000c:
1092 case 0x0085:
1093 case 0x2028:
1094 case 0x2029:
1095 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1096 goto ANYNL01;
1097
1098 case 0x000d:
1099 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1100 /* Fall through */
1101
1102 ANYNL01:
1103 case 0x000a:
1104 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1105 {
1106 active_count--; /* Remove non-match possibility */
1107 next_active_state--;
1108 }
1109 count++;
1110 ADD_NEW_DATA(-state_offset, count, ncount);
1111 break;
1112
1113 default:
1114 break;
1115 }
1116 }
1117 break;
1118
1119 /*-----------------------------------------------------------------*/
1120 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1121 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1122 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1123 count = current_state->count; /* Already matched */
1124 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1125 if (clen > 0)
1126 {
1127 BOOL OK;
1128 switch (c)
1129 {
1130 case 0x000a:
1131 case 0x000b:
1132 case 0x000c:
1133 case 0x000d:
1134 case 0x0085:
1135 case 0x2028:
1136 case 0x2029:
1137 OK = TRUE;
1138 break;
1139
1140 default:
1141 OK = FALSE;
1142 break;
1143 }
1144
1145 if (OK == (d == OP_VSPACE))
1146 {
1147 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1148 {
1149 active_count--; /* Remove non-match possibility */
1150 next_active_state--;
1151 }
1152 count++;
1153 ADD_NEW_DATA(-state_offset, count, 0);
1154 }
1155 }
1156 break;
1157
1158 /*-----------------------------------------------------------------*/
1159 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1160 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1161 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1162 count = current_state->count; /* Already matched */
1163 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1164 if (clen > 0)
1165 {
1166 BOOL OK;
1167 switch (c)
1168 {
1169 case 0x09: /* HT */
1170 case 0x20: /* SPACE */
1171 case 0xa0: /* NBSP */
1172 case 0x1680: /* OGHAM SPACE MARK */
1173 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1174 case 0x2000: /* EN QUAD */
1175 case 0x2001: /* EM QUAD */
1176 case 0x2002: /* EN SPACE */
1177 case 0x2003: /* EM SPACE */
1178 case 0x2004: /* THREE-PER-EM SPACE */
1179 case 0x2005: /* FOUR-PER-EM SPACE */
1180 case 0x2006: /* SIX-PER-EM SPACE */
1181 case 0x2007: /* FIGURE SPACE */
1182 case 0x2008: /* PUNCTUATION SPACE */
1183 case 0x2009: /* THIN SPACE */
1184 case 0x200A: /* HAIR SPACE */
1185 case 0x202f: /* NARROW NO-BREAK SPACE */
1186 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1187 case 0x3000: /* IDEOGRAPHIC SPACE */
1188 OK = TRUE;
1189 break;
1190
1191 default:
1192 OK = FALSE;
1193 break;
1194 }
1195
1196 if (OK == (d == OP_HSPACE))
1197 {
1198 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1199 {
1200 active_count--; /* Remove non-match possibility */
1201 next_active_state--;
1202 }
1203 count++;
1204 ADD_NEW_DATA(-state_offset, count, 0);
1205 }
1206 }
1207 break;
1208
1209 /*-----------------------------------------------------------------*/
1210 #ifdef SUPPORT_UCP
1211 case OP_PROP_EXTRA + OP_TYPEQUERY:
1212 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1213 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1214 count = 4;
1215 goto QS1;
1216
1217 case OP_PROP_EXTRA + OP_TYPESTAR:
1218 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1219 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1220 count = 0;
1221
1222 QS1:
1223
1224 ADD_ACTIVE(state_offset + 4, 0);
1225 if (clen > 0)
1226 {
1227 BOOL OK;
1228 int category = _pcre_ucp_findprop(c, &chartype, &script);
1229 switch(code[2])
1230 {
1231 case PT_ANY:
1232 OK = TRUE;
1233 break;
1234
1235 case PT_LAMP:
1236 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1237 break;
1238
1239 case PT_GC:
1240 OK = category == code[3];
1241 break;
1242
1243 case PT_PC:
1244 OK = chartype == code[3];
1245 break;
1246
1247 case PT_SC:
1248 OK = script == code[3];
1249 break;
1250
1251 /* Should never occur, but keep compilers from grumbling. */
1252
1253 default:
1254 OK = codevalue != OP_PROP;
1255 break;
1256 }
1257
1258 if (OK == (d == OP_PROP))
1259 {
1260 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1261 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1262 {
1263 active_count--; /* Remove non-match possibility */
1264 next_active_state--;
1265 }
1266 ADD_NEW(state_offset + count, 0);
1267 }
1268 }
1269 break;
1270
1271 /*-----------------------------------------------------------------*/
1272 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1273 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1274 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1275 count = 2;
1276 goto QS2;
1277
1278 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1279 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1280 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1281 count = 0;
1282
1283 QS2:
1284
1285 ADD_ACTIVE(state_offset + 2, 0);
1286 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1287 {
1288 const uschar *nptr = ptr + clen;
1289 int ncount = 0;
1290 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1291 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1292 {
1293 active_count--; /* Remove non-match possibility */
1294 next_active_state--;
1295 }
1296 while (nptr < end_subject)
1297 {
1298 int nd;
1299 int ndlen = 1;
1300 GETCHARLEN(nd, nptr, ndlen);
1301 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1302 ncount++;
1303 nptr += ndlen;
1304 }
1305 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1306 }
1307 break;
1308 #endif
1309
1310 /*-----------------------------------------------------------------*/
1311 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1312 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1313 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1314 count = 2;
1315 goto QS3;
1316
1317 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1318 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1319 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1320 count = 0;
1321
1322 QS3:
1323 ADD_ACTIVE(state_offset + 2, 0);
1324 if (clen > 0)
1325 {
1326 int ncount = 0;
1327 switch (c)
1328 {
1329 case 0x000b:
1330 case 0x000c:
1331 case 0x0085:
1332 case 0x2028:
1333 case 0x2029:
1334 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1335 goto ANYNL02;
1336
1337 case 0x000d:
1338 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1339 /* Fall through */
1340
1341 ANYNL02:
1342 case 0x000a:
1343 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1344 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1345 {
1346 active_count--; /* Remove non-match possibility */
1347 next_active_state--;
1348 }
1349 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1350 break;
1351
1352 default:
1353 break;
1354 }
1355 }
1356 break;
1357
1358 /*-----------------------------------------------------------------*/
1359 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1360 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1361 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1362 count = 2;
1363 goto QS4;
1364
1365 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1366 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1367 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1368 count = 0;
1369
1370 QS4:
1371 ADD_ACTIVE(state_offset + 2, 0);
1372 if (clen > 0)
1373 {
1374 BOOL OK;
1375 switch (c)
1376 {
1377 case 0x000a:
1378 case 0x000b:
1379 case 0x000c:
1380 case 0x000d:
1381 case 0x0085:
1382 case 0x2028:
1383 case 0x2029:
1384 OK = TRUE;
1385 break;
1386
1387 default:
1388 OK = FALSE;
1389 break;
1390 }
1391 if (OK == (d == OP_VSPACE))
1392 {
1393 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1394 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1395 {
1396 active_count--; /* Remove non-match possibility */
1397 next_active_state--;
1398 }
1399 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1400 }
1401 }
1402 break;
1403
1404 /*-----------------------------------------------------------------*/
1405 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1406 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1407 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1408 count = 2;
1409 goto QS5;
1410
1411 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1412 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1413 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1414 count = 0;
1415
1416 QS5:
1417 ADD_ACTIVE(state_offset + 2, 0);
1418 if (clen > 0)
1419 {
1420 BOOL OK;
1421 switch (c)
1422 {
1423 case 0x09: /* HT */
1424 case 0x20: /* SPACE */
1425 case 0xa0: /* NBSP */
1426 case 0x1680: /* OGHAM SPACE MARK */
1427 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1428 case 0x2000: /* EN QUAD */
1429 case 0x2001: /* EM QUAD */
1430 case 0x2002: /* EN SPACE */
1431 case 0x2003: /* EM SPACE */
1432 case 0x2004: /* THREE-PER-EM SPACE */
1433 case 0x2005: /* FOUR-PER-EM SPACE */
1434 case 0x2006: /* SIX-PER-EM SPACE */
1435 case 0x2007: /* FIGURE SPACE */
1436 case 0x2008: /* PUNCTUATION SPACE */
1437 case 0x2009: /* THIN SPACE */
1438 case 0x200A: /* HAIR SPACE */
1439 case 0x202f: /* NARROW NO-BREAK SPACE */
1440 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1441 case 0x3000: /* IDEOGRAPHIC SPACE */
1442 OK = TRUE;
1443 break;
1444
1445 default:
1446 OK = FALSE;
1447 break;
1448 }
1449
1450 if (OK == (d == OP_HSPACE))
1451 {
1452 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1453 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1454 {
1455 active_count--; /* Remove non-match possibility */
1456 next_active_state--;
1457 }
1458 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1459 }
1460 }
1461 break;
1462
1463 /*-----------------------------------------------------------------*/
1464 #ifdef SUPPORT_UCP
1465 case OP_PROP_EXTRA + OP_TYPEEXACT:
1466 case OP_PROP_EXTRA + OP_TYPEUPTO:
1467 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1468 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1469 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1470 { ADD_ACTIVE(state_offset + 6, 0); }
1471 count = current_state->count; /* Number already matched */
1472 if (clen > 0)
1473 {
1474 BOOL OK;
1475 int category = _pcre_ucp_findprop(c, &chartype, &script);
1476 switch(code[4])
1477 {
1478 case PT_ANY:
1479 OK = TRUE;
1480 break;
1481
1482 case PT_LAMP:
1483 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1484 break;
1485
1486 case PT_GC:
1487 OK = category == code[5];
1488 break;
1489
1490 case PT_PC:
1491 OK = chartype == code[5];
1492 break;
1493
1494 case PT_SC:
1495 OK = script == code[5];
1496 break;
1497
1498 /* Should never occur, but keep compilers from grumbling. */
1499
1500 default:
1501 OK = codevalue != OP_PROP;
1502 break;
1503 }
1504
1505 if (OK == (d == OP_PROP))
1506 {
1507 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1508 {
1509 active_count--; /* Remove non-match possibility */
1510 next_active_state--;
1511 }
1512 if (++count >= GET2(code, 1))
1513 { ADD_NEW(state_offset + 6, 0); }
1514 else
1515 { ADD_NEW(state_offset, count); }
1516 }
1517 }
1518 break;
1519
1520 /*-----------------------------------------------------------------*/
1521 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1522 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1523 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1524 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1525 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1526 { ADD_ACTIVE(state_offset + 4, 0); }
1527 count = current_state->count; /* Number already matched */
1528 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1529 {
1530 const uschar *nptr = ptr + clen;
1531 int ncount = 0;
1532 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1533 {
1534 active_count--; /* Remove non-match possibility */
1535 next_active_state--;
1536 }
1537 while (nptr < end_subject)
1538 {
1539 int nd;
1540 int ndlen = 1;
1541 GETCHARLEN(nd, nptr, ndlen);
1542 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1543 ncount++;
1544 nptr += ndlen;
1545 }
1546 if (++count >= GET2(code, 1))
1547 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1548 else
1549 { ADD_NEW_DATA(-state_offset, count, ncount); }
1550 }
1551 break;
1552 #endif
1553
1554 /*-----------------------------------------------------------------*/
1555 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1556 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1557 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1558 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1559 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1560 { ADD_ACTIVE(state_offset + 4, 0); }
1561 count = current_state->count; /* Number already matched */
1562 if (clen > 0)
1563 {
1564 int ncount = 0;
1565 switch (c)
1566 {
1567 case 0x000b:
1568 case 0x000c:
1569 case 0x0085:
1570 case 0x2028:
1571 case 0x2029:
1572 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1573 goto ANYNL03;
1574
1575 case 0x000d:
1576 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1577 /* Fall through */
1578
1579 ANYNL03:
1580 case 0x000a:
1581 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1582 {
1583 active_count--; /* Remove non-match possibility */
1584 next_active_state--;
1585 }
1586 if (++count >= GET2(code, 1))
1587 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1588 else
1589 { ADD_NEW_DATA(-state_offset, count, ncount); }
1590 break;
1591
1592 default:
1593 break;
1594 }
1595 }
1596 break;
1597
1598 /*-----------------------------------------------------------------*/
1599 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1600 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1601 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1602 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1603 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1604 { ADD_ACTIVE(state_offset + 4, 0); }
1605 count = current_state->count; /* Number already matched */
1606 if (clen > 0)
1607 {
1608 BOOL OK;
1609 switch (c)
1610 {
1611 case 0x000a:
1612 case 0x000b:
1613 case 0x000c:
1614 case 0x000d:
1615 case 0x0085:
1616 case 0x2028:
1617 case 0x2029:
1618 OK = TRUE;
1619 break;
1620
1621 default:
1622 OK = FALSE;
1623 }
1624
1625 if (OK == (d == OP_VSPACE))
1626 {
1627 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1628 {
1629 active_count--; /* Remove non-match possibility */
1630 next_active_state--;
1631 }
1632 if (++count >= GET2(code, 1))
1633 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1634 else
1635 { ADD_NEW_DATA(-state_offset, count, 0); }
1636 }
1637 }
1638 break;
1639
1640 /*-----------------------------------------------------------------*/
1641 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1642 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1643 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1644 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1645 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1646 { ADD_ACTIVE(state_offset + 4, 0); }
1647 count = current_state->count; /* Number already matched */
1648 if (clen > 0)
1649 {
1650 BOOL OK;
1651 switch (c)
1652 {
1653 case 0x09: /* HT */
1654 case 0x20: /* SPACE */
1655 case 0xa0: /* NBSP */
1656 case 0x1680: /* OGHAM SPACE MARK */
1657 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1658 case 0x2000: /* EN QUAD */
1659 case 0x2001: /* EM QUAD */
1660 case 0x2002: /* EN SPACE */
1661 case 0x2003: /* EM SPACE */
1662 case 0x2004: /* THREE-PER-EM SPACE */
1663 case 0x2005: /* FOUR-PER-EM SPACE */
1664 case 0x2006: /* SIX-PER-EM SPACE */
1665 case 0x2007: /* FIGURE SPACE */
1666 case 0x2008: /* PUNCTUATION SPACE */
1667 case 0x2009: /* THIN SPACE */
1668 case 0x200A: /* HAIR SPACE */
1669 case 0x202f: /* NARROW NO-BREAK SPACE */
1670 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1671 case 0x3000: /* IDEOGRAPHIC SPACE */
1672 OK = TRUE;
1673 break;
1674
1675 default:
1676 OK = FALSE;
1677 break;
1678 }
1679
1680 if (OK == (d == OP_HSPACE))
1681 {
1682 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1683 {
1684 active_count--; /* Remove non-match possibility */
1685 next_active_state--;
1686 }
1687 if (++count >= GET2(code, 1))
1688 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1689 else
1690 { ADD_NEW_DATA(-state_offset, count, 0); }
1691 }
1692 }
1693 break;
1694
1695 /* ========================================================================== */
1696 /* These opcodes are followed by a character that is usually compared
1697 to the current subject character; it is loaded into d. We still get
1698 here even if there is no subject character, because in some cases zero
1699 repetitions are permitted. */
1700
1701 /*-----------------------------------------------------------------*/
1702 case OP_CHAR:
1703 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1704 break;
1705
1706 /*-----------------------------------------------------------------*/
1707 case OP_CHARNC:
1708 if (clen == 0) break;
1709
1710 #ifdef SUPPORT_UTF8
1711 if (utf8)
1712 {
1713 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1714 {
1715 unsigned int othercase;
1716 if (c < 128) othercase = fcc[c]; else
1717
1718 /* If we have Unicode property support, we can use it to test the
1719 other case of the character. */
1720
1721 #ifdef SUPPORT_UCP
1722 othercase = _pcre_ucp_othercase(c);
1723 #else
1724 othercase = NOTACHAR;
1725 #endif
1726
1727 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1728 }
1729 }
1730 else
1731 #endif /* SUPPORT_UTF8 */
1732
1733 /* Non-UTF-8 mode */
1734 {
1735 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1736 }
1737 break;
1738
1739
1740 #ifdef SUPPORT_UCP
1741 /*-----------------------------------------------------------------*/
1742 /* This is a tricky one because it can match more than one character.
1743 Find out how many characters to skip, and then set up a negative state
1744 to wait for them to pass before continuing. */
1745
1746 case OP_EXTUNI:
1747 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1748 {
1749 const uschar *nptr = ptr + clen;
1750 int ncount = 0;
1751 while (nptr < end_subject)
1752 {
1753 int nclen = 1;
1754 GETCHARLEN(c, nptr, nclen);
1755 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1756 ncount++;
1757 nptr += nclen;
1758 }
1759 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1760 }
1761 break;
1762 #endif
1763
1764 /*-----------------------------------------------------------------*/
1765 /* This is tricky, like EXTUNI, because it too can match more than one
1766 character (when CR is followed by LF). In this case, set up a negative
1767 state to wait for one character to pass before continuing. */
1768
1769 case OP_ANYNL:
1770 if (clen > 0) switch(c)
1771 {
1772 case 0x000b:
1773 case 0x000c:
1774 case 0x0085:
1775 case 0x2028:
1776 case 0x2029:
1777 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1778 /* Fall through */
1779 case 0x000a:
1780 ADD_NEW(state_offset + 1, 0);
1781 break;
1782
1783 case 0x000d:
1784 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1785 {
1786 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1787 }
1788 else
1789 {
1790 ADD_NEW(state_offset + 1, 0);
1791 }
1792 break;
1793 }
1794 break;
1795
1796 /*-----------------------------------------------------------------*/
1797 case OP_NOT_VSPACE:
1798 if (clen > 0) switch(c)
1799 {
1800 case 0x000a:
1801 case 0x000b:
1802 case 0x000c:
1803 case 0x000d:
1804 case 0x0085:
1805 case 0x2028:
1806 case 0x2029:
1807 break;
1808
1809 default:
1810 ADD_NEW(state_offset + 1, 0);
1811 break;
1812 }
1813 break;
1814
1815 /*-----------------------------------------------------------------*/
1816 case OP_VSPACE:
1817 if (clen > 0) switch(c)
1818 {
1819 case 0x000a:
1820 case 0x000b:
1821 case 0x000c:
1822 case 0x000d:
1823 case 0x0085:
1824 case 0x2028:
1825 case 0x2029:
1826 ADD_NEW(state_offset + 1, 0);
1827 break;
1828
1829 default: break;
1830 }
1831 break;
1832
1833 /*-----------------------------------------------------------------*/
1834 case OP_NOT_HSPACE:
1835 if (clen > 0) switch(c)
1836 {
1837 case 0x09: /* HT */
1838 case 0x20: /* SPACE */
1839 case 0xa0: /* NBSP */
1840 case 0x1680: /* OGHAM SPACE MARK */
1841 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1842 case 0x2000: /* EN QUAD */
1843 case 0x2001: /* EM QUAD */
1844 case 0x2002: /* EN SPACE */
1845 case 0x2003: /* EM SPACE */
1846 case 0x2004: /* THREE-PER-EM SPACE */
1847 case 0x2005: /* FOUR-PER-EM SPACE */
1848 case 0x2006: /* SIX-PER-EM SPACE */
1849 case 0x2007: /* FIGURE SPACE */
1850 case 0x2008: /* PUNCTUATION SPACE */
1851 case 0x2009: /* THIN SPACE */
1852 case 0x200A: /* HAIR SPACE */
1853 case 0x202f: /* NARROW NO-BREAK SPACE */
1854 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1855 case 0x3000: /* IDEOGRAPHIC SPACE */
1856 break;
1857
1858 default:
1859 ADD_NEW(state_offset + 1, 0);
1860 break;
1861 }
1862 break;
1863
1864 /*-----------------------------------------------------------------*/
1865 case OP_HSPACE:
1866 if (clen > 0) switch(c)
1867 {
1868 case 0x09: /* HT */
1869 case 0x20: /* SPACE */
1870 case 0xa0: /* NBSP */
1871 case 0x1680: /* OGHAM SPACE MARK */
1872 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1873 case 0x2000: /* EN QUAD */
1874 case 0x2001: /* EM QUAD */
1875 case 0x2002: /* EN SPACE */
1876 case 0x2003: /* EM SPACE */
1877 case 0x2004: /* THREE-PER-EM SPACE */
1878 case 0x2005: /* FOUR-PER-EM SPACE */
1879 case 0x2006: /* SIX-PER-EM SPACE */
1880 case 0x2007: /* FIGURE SPACE */
1881 case 0x2008: /* PUNCTUATION SPACE */
1882 case 0x2009: /* THIN SPACE */
1883 case 0x200A: /* HAIR SPACE */
1884 case 0x202f: /* NARROW NO-BREAK SPACE */
1885 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1886 case 0x3000: /* IDEOGRAPHIC SPACE */
1887 ADD_NEW(state_offset + 1, 0);
1888 break;
1889 }
1890 break;
1891
1892 /*-----------------------------------------------------------------*/
1893 /* Match a negated single character. This is only used for one-byte
1894 characters, that is, we know that d < 256. The character we are
1895 checking (c) can be multibyte. */
1896
1897 case OP_NOT:
1898 if (clen > 0)
1899 {
1900 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1901 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1902 }
1903 break;
1904
1905 /*-----------------------------------------------------------------*/
1906 case OP_PLUS:
1907 case OP_MINPLUS:
1908 case OP_POSPLUS:
1909 case OP_NOTPLUS:
1910 case OP_NOTMINPLUS:
1911 case OP_NOTPOSPLUS:
1912 count = current_state->count; /* Already matched */
1913 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1914 if (clen > 0)
1915 {
1916 unsigned int otherd = NOTACHAR;
1917 if ((ims & PCRE_CASELESS) != 0)
1918 {
1919 #ifdef SUPPORT_UTF8
1920 if (utf8 && d >= 128)
1921 {
1922 #ifdef SUPPORT_UCP
1923 otherd = _pcre_ucp_othercase(d);
1924 #endif /* SUPPORT_UCP */
1925 }
1926 else
1927 #endif /* SUPPORT_UTF8 */
1928 otherd = fcc[d];
1929 }
1930 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1931 {
1932 if (count > 0 &&
1933 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1934 {
1935 active_count--; /* Remove non-match possibility */
1936 next_active_state--;
1937 }
1938 count++;
1939 ADD_NEW(state_offset, count);
1940 }
1941 }
1942 break;
1943
1944 /*-----------------------------------------------------------------*/
1945 case OP_QUERY:
1946 case OP_MINQUERY:
1947 case OP_POSQUERY:
1948 case OP_NOTQUERY:
1949 case OP_NOTMINQUERY:
1950 case OP_NOTPOSQUERY:
1951 ADD_ACTIVE(state_offset + dlen + 1, 0);
1952 if (clen > 0)
1953 {
1954 unsigned int otherd = NOTACHAR;
1955 if ((ims & PCRE_CASELESS) != 0)
1956 {
1957 #ifdef SUPPORT_UTF8
1958 if (utf8 && d >= 128)
1959 {
1960 #ifdef SUPPORT_UCP
1961 otherd = _pcre_ucp_othercase(d);
1962 #endif /* SUPPORT_UCP */
1963 }
1964 else
1965 #endif /* SUPPORT_UTF8 */
1966 otherd = fcc[d];
1967 }
1968 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1969 {
1970 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1971 {
1972 active_count--; /* Remove non-match possibility */
1973 next_active_state--;
1974 }
1975 ADD_NEW(state_offset + dlen + 1, 0);
1976 }
1977 }
1978 break;
1979
1980 /*-----------------------------------------------------------------*/
1981 case OP_STAR:
1982 case OP_MINSTAR:
1983 case OP_POSSTAR:
1984 case OP_NOTSTAR:
1985 case OP_NOTMINSTAR:
1986 case OP_NOTPOSSTAR:
1987 ADD_ACTIVE(state_offset + dlen + 1, 0);
1988 if (clen > 0)
1989 {
1990 unsigned int otherd = NOTACHAR;
1991 if ((ims & PCRE_CASELESS) != 0)
1992 {
1993 #ifdef SUPPORT_UTF8
1994 if (utf8 && d >= 128)
1995 {
1996 #ifdef SUPPORT_UCP
1997 otherd = _pcre_ucp_othercase(d);
1998 #endif /* SUPPORT_UCP */
1999 }
2000 else
2001 #endif /* SUPPORT_UTF8 */
2002 otherd = fcc[d];
2003 }
2004 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2005 {
2006 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2007 {
2008 active_count--; /* Remove non-match possibility */
2009 next_active_state--;
2010 }
2011 ADD_NEW(state_offset, 0);
2012 }
2013 }
2014 break;
2015
2016 /*-----------------------------------------------------------------*/
2017 case OP_EXACT:
2018 case OP_NOTEXACT:
2019 count = current_state->count; /* Number already matched */
2020 if (clen > 0)
2021 {
2022 unsigned int otherd = NOTACHAR;
2023 if ((ims & PCRE_CASELESS) != 0)
2024 {
2025 #ifdef SUPPORT_UTF8
2026 if (utf8 && d >= 128)
2027 {
2028 #ifdef SUPPORT_UCP
2029 otherd = _pcre_ucp_othercase(d);
2030 #endif /* SUPPORT_UCP */
2031 }
2032 else
2033 #endif /* SUPPORT_UTF8 */
2034 otherd = fcc[d];
2035 }
2036 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2037 {
2038 if (++count >= GET2(code, 1))
2039 { ADD_NEW(state_offset + dlen + 3, 0); }
2040 else
2041 { ADD_NEW(state_offset, count); }
2042 }
2043 }
2044 break;
2045
2046 /*-----------------------------------------------------------------*/
2047 case OP_UPTO:
2048 case OP_MINUPTO:
2049 case OP_POSUPTO:
2050 case OP_NOTUPTO:
2051 case OP_NOTMINUPTO:
2052 case OP_NOTPOSUPTO:
2053 ADD_ACTIVE(state_offset + dlen + 3, 0);
2054 count = current_state->count; /* Number already matched */
2055 if (clen > 0)
2056 {
2057 unsigned int otherd = NOTACHAR;
2058 if ((ims & PCRE_CASELESS) != 0)
2059 {
2060 #ifdef SUPPORT_UTF8
2061 if (utf8 && d >= 128)
2062 {
2063 #ifdef SUPPORT_UCP
2064 otherd = _pcre_ucp_othercase(d);
2065 #endif /* SUPPORT_UCP */
2066 }
2067 else
2068 #endif /* SUPPORT_UTF8 */
2069 otherd = fcc[d];
2070 }
2071 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2072 {
2073 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2074 {
2075 active_count--; /* Remove non-match possibility */
2076 next_active_state--;
2077 }
2078 if (++count >= GET2(code, 1))
2079 { ADD_NEW(state_offset + dlen + 3, 0); }
2080 else
2081 { ADD_NEW(state_offset, count); }
2082 }
2083 }
2084 break;
2085
2086
2087 /* ========================================================================== */
2088 /* These are the class-handling opcodes */
2089
2090 case OP_CLASS:
2091 case OP_NCLASS:
2092 case OP_XCLASS:
2093 {
2094 BOOL isinclass = FALSE;
2095 int next_state_offset;
2096 const uschar *ecode;
2097
2098 /* For a simple class, there is always just a 32-byte table, and we
2099 can set isinclass from it. */
2100
2101 if (codevalue != OP_XCLASS)
2102 {
2103 ecode = code + 33;
2104 if (clen > 0)
2105 {
2106 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2107 ((code[1 + c/8] & (1 << (c&7))) != 0);
2108 }
2109 }
2110
2111 /* An extended class may have a table or a list of single characters,
2112 ranges, or both, and it may be positive or negative. There's a
2113 function that sorts all this out. */
2114
2115 else
2116 {
2117 ecode = code + GET(code, 1);
2118 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2119 }
2120
2121 /* At this point, isinclass is set for all kinds of class, and ecode
2122 points to the byte after the end of the class. If there is a
2123 quantifier, this is where it will be. */
2124
2125 next_state_offset = ecode - start_code;
2126
2127 switch (*ecode)
2128 {
2129 case OP_CRSTAR:
2130 case OP_CRMINSTAR:
2131 ADD_ACTIVE(next_state_offset + 1, 0);
2132 if (isinclass) { ADD_NEW(state_offset, 0); }
2133 break;
2134
2135 case OP_CRPLUS:
2136 case OP_CRMINPLUS:
2137 count = current_state->count; /* Already matched */
2138 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2139 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2140 break;
2141
2142 case OP_CRQUERY:
2143 case OP_CRMINQUERY:
2144 ADD_ACTIVE(next_state_offset + 1, 0);
2145 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2146 break;
2147
2148 case OP_CRRANGE:
2149 case OP_CRMINRANGE:
2150 count = current_state->count; /* Already matched */
2151 if (count >= GET2(ecode, 1))
2152 { ADD_ACTIVE(next_state_offset + 5, 0); }
2153 if (isinclass)
2154 {
2155 int max = GET2(ecode, 3);
2156 if (++count >= max && max != 0) /* Max 0 => no limit */
2157 { ADD_NEW(next_state_offset + 5, 0); }
2158 else
2159 { ADD_NEW(state_offset, count); }
2160 }
2161 break;
2162
2163 default:
2164 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2165 break;
2166 }
2167 }
2168 break;
2169
2170 /* ========================================================================== */
2171 /* These are the opcodes for fancy brackets of various kinds. We have
2172 to use recursion in order to handle them. */
2173
2174 case OP_ASSERT:
2175 case OP_ASSERT_NOT:
2176 case OP_ASSERTBACK:
2177 case OP_ASSERTBACK_NOT:
2178 {
2179 int rc;
2180 int local_offsets[2];
2181 int local_workspace[1000];
2182 const uschar *endasscode = code + GET(code, 1);
2183
2184 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2185
2186 rc = internal_dfa_exec(
2187 md, /* static match data */
2188 code, /* this subexpression's code */
2189 ptr, /* where we currently are */
2190 ptr - start_subject, /* start offset */
2191 local_offsets, /* offset vector */
2192 sizeof(local_offsets)/sizeof(int), /* size of same */
2193 local_workspace, /* workspace vector */
2194 sizeof(local_workspace)/sizeof(int), /* size of same */
2195 ims, /* the current ims flags */
2196 rlevel, /* function recursion level */
2197 recursing); /* pass on regex recursion */
2198
2199 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2200 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2201 }
2202 break;
2203
2204 /*-----------------------------------------------------------------*/
2205 case OP_COND:
2206 case OP_SCOND:
2207 {
2208 int local_offsets[1000];
2209 int local_workspace[1000];
2210 int condcode = code[LINK_SIZE+1];
2211
2212 /* Back reference conditions are not supported */
2213
2214 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2215
2216 /* The DEFINE condition is always false */
2217
2218 if (condcode == OP_DEF)
2219 {
2220 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2221 }
2222
2223 /* The only supported version of OP_RREF is for the value RREF_ANY,
2224 which means "test if in any recursion". We can't test for specifically
2225 recursed groups. */
2226
2227 else if (condcode == OP_RREF)
2228 {
2229 int value = GET2(code, LINK_SIZE+2);
2230 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2231 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2232 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2233 }
2234
2235 /* Otherwise, the condition is an assertion */
2236
2237 else
2238 {
2239 int rc;
2240 const uschar *asscode = code + LINK_SIZE + 1;
2241 const uschar *endasscode = asscode + GET(asscode, 1);
2242
2243 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2244
2245 rc = internal_dfa_exec(
2246 md, /* fixed match data */
2247 asscode, /* this subexpression's code */
2248 ptr, /* where we currently are */
2249 ptr - start_subject, /* start offset */
2250 local_offsets, /* offset vector */
2251 sizeof(local_offsets)/sizeof(int), /* size of same */
2252 local_workspace, /* workspace vector */
2253 sizeof(local_workspace)/sizeof(int), /* size of same */
2254 ims, /* the current ims flags */
2255 rlevel, /* function recursion level */
2256 recursing); /* pass on regex recursion */
2257
2258 if ((rc >= 0) ==
2259 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2260 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2261 else
2262 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2263 }
2264 }
2265 break;
2266
2267 /*-----------------------------------------------------------------*/
2268 case OP_RECURSE:
2269 {
2270 int local_offsets[1000];
2271 int local_workspace[1000];
2272 int rc;
2273
2274 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2275 recursing + 1));
2276
2277 rc = internal_dfa_exec(
2278 md, /* fixed match data */
2279 start_code + GET(code, 1), /* this subexpression's code */
2280 ptr, /* where we currently are */
2281 ptr - start_subject, /* start offset */
2282 local_offsets, /* offset vector */
2283 sizeof(local_offsets)/sizeof(int), /* size of same */
2284 local_workspace, /* workspace vector */
2285 sizeof(local_workspace)/sizeof(int), /* size of same */
2286 ims, /* the current ims flags */
2287 rlevel, /* function recursion level */
2288 recursing + 1); /* regex recurse level */
2289
2290 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2291 recursing + 1, rc));
2292
2293 /* Ran out of internal offsets */
2294
2295 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2296
2297 /* For each successful matched substring, set up the next state with a
2298 count of characters to skip before trying it. Note that the count is in
2299 characters, not bytes. */
2300
2301 if (rc > 0)
2302 {
2303 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2304 {
2305 const uschar *p = start_subject + local_offsets[rc];
2306 const uschar *pp = start_subject + local_offsets[rc+1];
2307 int charcount = local_offsets[rc+1] - local_offsets[rc];
2308 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2309 if (charcount > 0)
2310 {
2311 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2312 }
2313 else
2314 {
2315 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2316 }
2317 }
2318 }
2319 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2320 }
2321 break;
2322
2323 /*-----------------------------------------------------------------*/
2324 case OP_ONCE:
2325 {
2326 int local_offsets[2];
2327 int local_workspace[1000];
2328
2329 int rc = internal_dfa_exec(
2330 md, /* fixed match data */
2331 code, /* this subexpression's code */
2332 ptr, /* where we currently are */
2333 ptr - start_subject, /* start offset */
2334 local_offsets, /* offset vector */
2335 sizeof(local_offsets)/sizeof(int), /* size of same */
2336 local_workspace, /* workspace vector */
2337 sizeof(local_workspace)/sizeof(int), /* size of same */
2338 ims, /* the current ims flags */
2339 rlevel, /* function recursion level */
2340 recursing); /* pass on regex recursion */
2341
2342 if (rc >= 0)
2343 {
2344 const uschar *end_subpattern = code;
2345 int charcount = local_offsets[1] - local_offsets[0];
2346 int next_state_offset, repeat_state_offset;
2347
2348 do { end_subpattern += GET(end_subpattern, 1); }
2349 while (*end_subpattern == OP_ALT);
2350 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2351
2352 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2353 arrange for the repeat state also to be added to the relevant list.
2354 Calculate the offset, or set -1 for no repeat. */
2355
2356 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2357 *end_subpattern == OP_KETRMIN)?
2358 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2359
2360 /* If we have matched an empty string, add the next state at the
2361 current character pointer. This is important so that the duplicate
2362 checking kicks in, which is what breaks infinite loops that match an
2363 empty string. */
2364
2365 if (charcount == 0)
2366 {
2367 ADD_ACTIVE(next_state_offset, 0);
2368 }
2369
2370 /* Optimization: if there are no more active states, and there
2371 are no new states yet set up, then skip over the subject string
2372 right here, to save looping. Otherwise, set up the new state to swing
2373 into action when the end of the substring is reached. */
2374
2375 else if (i + 1 >= active_count && new_count == 0)
2376 {
2377 ptr += charcount;
2378 clen = 0;
2379 ADD_NEW(next_state_offset, 0);
2380
2381 /* If we are adding a repeat state at the new character position,
2382 we must fudge things so that it is the only current state.
2383 Otherwise, it might be a duplicate of one we processed before, and
2384 that would cause it to be skipped. */
2385
2386 if (repeat_state_offset >= 0)
2387 {
2388 next_active_state = active_states;
2389 active_count = 0;
2390 i = -1;
2391 ADD_ACTIVE(repeat_state_offset, 0);
2392 }
2393 }
2394 else
2395 {
2396 const uschar *p = start_subject + local_offsets[0];
2397 const uschar *pp = start_subject + local_offsets[1];
2398 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2399 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2400 if (repeat_state_offset >= 0)
2401 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2402 }
2403
2404 }
2405 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2406 }
2407 break;
2408
2409
2410 /* ========================================================================== */
2411 /* Handle callouts */
2412
2413 case OP_CALLOUT:
2414 if (pcre_callout != NULL)
2415 {
2416 int rrc;
2417 pcre_callout_block cb;
2418 cb.version = 1; /* Version 1 of the callout block */
2419 cb.callout_number = code[1];
2420 cb.offset_vector = offsets;
2421 cb.subject = (PCRE_SPTR)start_subject;
2422 cb.subject_length = end_subject - start_subject;
2423 cb.start_match = current_subject - start_subject;
2424 cb.current_position = ptr - start_subject;
2425 cb.pattern_position = GET(code, 2);
2426 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2427 cb.capture_top = 1;
2428 cb.capture_last = -1;
2429 cb.callout_data = md->callout_data;
2430 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2431 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2432 }
2433 break;
2434
2435
2436 /* ========================================================================== */
2437 default: /* Unsupported opcode */
2438 return PCRE_ERROR_DFA_UITEM;
2439 }
2440
2441 NEXT_ACTIVE_STATE: continue;
2442
2443 } /* End of loop scanning active states */
2444
2445 /* We have finished the processing at the current subject character. If no
2446 new states have been set for the next character, we have found all the
2447 matches that we are going to find. If we are at the top level and partial
2448 matching has been requested, check for appropriate conditions. */
2449
2450 if (new_count <= 0)
2451 {
2452 if (match_count < 0 && /* No matches found */
2453 rlevel == 1 && /* Top level match function */
2454 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2455 ptr >= end_subject && /* Reached end of subject */
2456 ptr > current_subject) /* Matched non-empty string */
2457 {
2458 if (offsetcount >= 2)
2459 {
2460 offsets[0] = current_subject - start_subject;
2461 offsets[1] = end_subject - start_subject;
2462 }
2463 match_count = PCRE_ERROR_PARTIAL;
2464 }
2465
2466 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2467 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2468 rlevel*2-2, SP));
2469 break; /* In effect, "return", but see the comment below */
2470 }
2471
2472 /* One or more states are active for the next character. */
2473
2474 ptr += clen; /* Advance to next subject character */
2475 } /* Loop to move along the subject string */
2476
2477 /* Control gets here from "break" a few lines above. We do it this way because
2478 if we use "return" above, we have compiler trouble. Some compilers warn if
2479 there's nothing here because they think the function doesn't return a value. On
2480 the other hand, if we put a dummy statement here, some more clever compilers
2481 complain that it can't be reached. Sigh. */
2482
2483 return match_count;
2484 }
2485
2486
2487
2488
2489 /*************************************************
2490 * Execute a Regular Expression - DFA engine *
2491 *************************************************/
2492
2493 /* This external function applies a compiled re to a subject string using a DFA
2494 engine. This function calls the internal function multiple times if the pattern
2495 is not anchored.
2496
2497 Arguments:
2498 argument_re points to the compiled expression
2499 extra_data points to extra data or is NULL
2500 subject points to the subject string
2501 length length of subject string (may contain binary zeros)
2502 start_offset where to start in the subject string
2503 options option bits
2504 offsets vector of match offsets
2505 offsetcount size of same
2506 workspace workspace vector
2507 wscount size of same
2508
2509 Returns: > 0 => number of match offset pairs placed in offsets
2510 = 0 => offsets overflowed; longest matches are present
2511 -1 => failed to match
2512 < -1 => some kind of unexpected problem
2513 */
2514
2515 PCRE_EXP_DEFN int
2516 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2517 const char *subject, int length, int start_offset, int options, int *offsets,
2518 int offsetcount, int *workspace, int wscount)
2519 {
2520 real_pcre *re = (real_pcre *)argument_re;
2521 dfa_match_data match_block;
2522 dfa_match_data *md = &match_block;
2523 BOOL utf8, anchored, startline, firstline;
2524 const uschar *current_subject, *end_subject, *lcc;
2525
2526 pcre_study_data internal_study;
2527 const pcre_study_data *study = NULL;
2528 real_pcre internal_re;
2529
2530 const uschar *req_byte_ptr;
2531 const uschar *start_bits = NULL;
2532 BOOL first_byte_caseless = FALSE;
2533 BOOL req_byte_caseless = FALSE;
2534 int first_byte = -1;
2535 int req_byte = -1;
2536 int req_byte2 = -1;
2537 int newline;
2538
2539 /* Plausibility checks */
2540
2541 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2542 if (re == NULL || subject == NULL || workspace == NULL ||
2543 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2544 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2545 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2546
2547 /* We need to find the pointer to any study data before we test for byte
2548 flipping, so we scan the extra_data block first. This may set two fields in the
2549 match block, so we must initialize them beforehand. However, the other fields
2550 in the match block must not be set until after the byte flipping. */
2551
2552 md->tables = re->tables;
2553 md->callout_data = NULL;
2554
2555 if (extra_data != NULL)
2556 {
2557 unsigned int flags = extra_data->flags;
2558 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2559 study = (const pcre_study_data *)extra_data->study_data;
2560 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2561 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2562 return PCRE_ERROR_DFA_UMLIMIT;
2563 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2564 md->callout_data = extra_data->callout_data;
2565 if ((flags & PCRE_EXTRA_TABLES) != 0)
2566 md->tables = extra_data->tables;
2567 }
2568
2569 /* Check that the first field in the block is the magic number. If it is not,
2570 test for a regex that was compiled on a host of opposite endianness. If this is
2571 the case, flipped values are put in internal_re and internal_study if there was
2572 study data too. */
2573
2574 if (re->magic_number != MAGIC_NUMBER)
2575 {
2576 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2577 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2578 if (study != NULL) study = &internal_study;
2579 }
2580
2581 /* Set some local values */
2582
2583 current_subject = (const unsigned char *)subject + start_offset;
2584 end_subject = (const unsigned char *)subject + length;
2585 req_byte_ptr = current_subject - 1;
2586
2587 #ifdef SUPPORT_UTF8
2588 utf8 = (re->options & PCRE_UTF8) != 0;
2589 #else
2590 utf8 = FALSE;
2591 #endif
2592
2593 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2594 (re->options & PCRE_ANCHORED) != 0;
2595
2596 /* The remaining fixed data for passing around. */
2597
2598 md->start_code = (const uschar *)argument_re +
2599 re->name_table_offset + re->name_count * re->name_entry_size;
2600 md->start_subject = (const unsigned char *)subject;
2601 md->end_subject = end_subject;
2602 md->moptions = options;
2603 md->poptions = re->options;
2604
2605 /* If the BSR option is not set at match time, copy what was set
2606 at compile time. */
2607
2608 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2609 {
2610 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2611 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2612 #ifdef BSR_ANYCRLF
2613 else md->moptions |= PCRE_BSR_ANYCRLF;
2614 #endif
2615 }
2616
2617 /* Handle different types of newline. The three bits give eight cases. If
2618 nothing is set at run time, whatever was used at compile time applies. */
2619
2620 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2621 PCRE_NEWLINE_BITS)
2622 {
2623 case 0: newline = NEWLINE; break; /* Compile-time default */
2624 case PCRE_NEWLINE_CR: newline = '\r'; break;
2625 case PCRE_NEWLINE_LF: newline = '\n'; break;
2626 case PCRE_NEWLINE_CR+
2627 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2628 case PCRE_NEWLINE_ANY: newline = -1; break;
2629 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2630 default: return PCRE_ERROR_BADNEWLINE;
2631 }
2632
2633 if (newline == -2)
2634 {
2635 md->nltype = NLTYPE_ANYCRLF;
2636 }
2637 else if (newline < 0)
2638 {
2639 md->nltype = NLTYPE_ANY;
2640 }
2641 else
2642 {
2643 md->nltype = NLTYPE_FIXED;
2644 if (newline > 255)
2645 {
2646 md->nllen = 2;
2647 md->nl[0] = (newline >> 8) & 255;
2648 md->nl[1] = newline & 255;
2649 }
2650 else
2651 {
2652 md->nllen = 1;
2653 md->nl[0] = newline;
2654 }
2655 }
2656
2657 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2658 back the character offset. */
2659
2660 #ifdef SUPPORT_UTF8
2661 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2662 {
2663 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2664 return PCRE_ERROR_BADUTF8;
2665 if (start_offset > 0 && start_offset < length)
2666 {
2667 int tb = ((uschar *)subject)[start_offset];
2668 if (tb > 127)
2669 {
2670 tb &= 0xc0;
2671 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2672 }
2673 }
2674 }
2675 #endif
2676
2677 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2678 is a feature that makes it possible to save compiled regex and re-use them
2679 in other programs later. */
2680
2681 if (md->tables == NULL) md->tables = _pcre_default_tables;
2682
2683 /* The lower casing table and the "must be at the start of a line" flag are
2684 used in a loop when finding where to start. */
2685
2686 lcc = md->tables + lcc_offset;
2687 startline = (re->flags & PCRE_STARTLINE) != 0;
2688 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2689
2690 /* Set up the first character to match, if available. The first_byte value is
2691 never set for an anchored regular expression, but the anchoring may be forced
2692 at run time, so we have to test for anchoring. The first char may be unset for
2693 an unanchored pattern, of course. If there's no first char and the pattern was
2694 studied, there may be a bitmap of possible first characters. */
2695
2696 if (!anchored)
2697 {
2698 if ((re->flags & PCRE_FIRSTSET) != 0)
2699 {
2700 first_byte = re->first_byte & 255;
2701 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2702 first_byte = lcc[first_byte];
2703 }
2704 else
2705 {
2706 if (startline && study != NULL &&
2707 (study->options & PCRE_STUDY_MAPPED) != 0)
2708 start_bits = study->start_bits;
2709 }
2710 }
2711
2712 /* For anchored or unanchored matches, there may be a "last known required
2713 character" set. */
2714
2715 if ((re->flags & PCRE_REQCHSET) != 0)
2716 {
2717 req_byte = re->req_byte & 255;
2718 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2719 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2720 }
2721
2722 /* Call the main matching function, looping for a non-anchored regex after a
2723 failed match. Unless restarting, optimize by moving to the first match
2724 character if possible, when not anchored. Then unless wanting a partial match,
2725 check for a required later character. */
2726
2727 for (;;)
2728 {
2729 int rc;
2730
2731 if ((options & PCRE_DFA_RESTART) == 0)
2732 {
2733 const uschar *save_end_subject = end_subject;
2734
2735 /* Advance to a unique first char if possible. If firstline is TRUE, the
2736 start of the match is constrained to the first line of a multiline string.
2737 Implement this by temporarily adjusting end_subject so that we stop
2738 scanning at a newline. If the match fails at the newline, later code breaks
2739 this loop. */
2740
2741 if (firstline)
2742 {
2743 const uschar *t = current_subject;
2744 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2745 end_subject = t;
2746 }
2747
2748 if (first_byte >= 0)
2749 {
2750 if (first_byte_caseless)
2751 while (current_subject < end_subject &&
2752 lcc[*current_subject] != first_byte)
2753 current_subject++;
2754 else
2755 while (current_subject < end_subject && *current_subject != first_byte)
2756 current_subject++;
2757 }
2758
2759 /* Or to just after a linebreak for a multiline match if possible */
2760
2761 else if (startline)
2762 {
2763 if (current_subject > md->start_subject + start_offset)
2764 {
2765 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2766 current_subject++;
2767
2768 /* If we have just passed a CR and the newline option is ANY or
2769 ANYCRLF, and we are now at a LF, advance the match position by one more
2770 character. */
2771
2772 if (current_subject[-1] == '\r' &&
2773 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2774 current_subject < end_subject &&
2775 *current_subject == '\n')
2776 current_subject++;
2777 }
2778 }
2779
2780 /* Or to a non-unique first char after study */
2781
2782 else if (start_bits != NULL)
2783 {
2784 while (current_subject < end_subject)
2785 {
2786 register unsigned int c = *current_subject;
2787 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2788 else break;
2789 }
2790 }
2791
2792 /* Restore fudged end_subject */
2793
2794 end_subject = save_end_subject;
2795 }
2796
2797 /* If req_byte is set, we know that that character must appear in the subject
2798 for the match to succeed. If the first character is set, req_byte must be
2799 later in the subject; otherwise the test starts at the match point. This
2800 optimization can save a huge amount of work in patterns with nested unlimited
2801 repeats that aren't going to match. Writing separate code for cased/caseless
2802 versions makes it go faster, as does using an autoincrement and backing off
2803 on a match.
2804
2805 HOWEVER: when the subject string is very, very long, searching to its end can
2806 take a long time, and give bad performance on quite ordinary patterns. This
2807 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2808 don't do this when the string is sufficiently long.
2809
2810 ALSO: this processing is disabled when partial matching is requested.
2811 */
2812
2813 if (req_byte >= 0 &&
2814 end_subject - current_subject < REQ_BYTE_MAX &&
2815 (options & PCRE_PARTIAL) == 0)
2816 {
2817 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2818
2819 /* We don't need to repeat the search if we haven't yet reached the
2820 place we found it at last time. */
2821
2822 if (p > req_byte_ptr)
2823 {
2824 if (req_byte_caseless)
2825 {
2826 while (p < end_subject)
2827 {
2828 register int pp = *p++;
2829 if (pp == req_byte || pp == req_byte2) { p--; break; }
2830 }
2831 }
2832 else
2833 {
2834 while (p < end_subject)
2835 {
2836 if (*p++ == req_byte) { p--; break; }
2837 }
2838 }
2839
2840 /* If we can't find the required character, break the matching loop,
2841 which will cause a return or PCRE_ERROR_NOMATCH. */
2842
2843 if (p >= end_subject) break;
2844
2845 /* If we have found the required character, save the point where we
2846 found it, so that we don't search again next time round the loop if
2847 the start hasn't passed this character yet. */
2848
2849 req_byte_ptr = p;
2850 }
2851 }
2852
2853 /* OK, now we can do the business */
2854
2855 rc = internal_dfa_exec(
2856 md, /* fixed match data */
2857 md->start_code, /* this subexpression's code */
2858 current_subject, /* where we currently are */
2859 start_offset, /* start offset in subject */
2860 offsets, /* offset vector */
2861 offsetcount, /* size of same */
2862 workspace, /* workspace vector */
2863 wscount, /* size of same */
2864 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2865 0, /* function recurse level */
2866 0); /* regex recurse level */
2867
2868 /* Anything other than "no match" means we are done, always; otherwise, carry
2869 on only if not anchored. */
2870
2871 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2872
2873 /* Advance to the next subject character unless we are at the end of a line
2874 and firstline is set. */
2875
2876 if (firstline && IS_NEWLINE(current_subject)) break;
2877 current_subject++;
2878 if (utf8)
2879 {
2880 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2881 current_subject++;
2882 }
2883 if (current_subject > end_subject) break;
2884
2885 /* If we have just passed a CR and we are now at a LF, and the pattern does
2886 not contain any explicit matches for \r or \n, and the newline option is CRLF
2887 or ANY or ANYCRLF, advance the match position by one more character. */
2888
2889 if (current_subject[-1] == '\r' &&
2890 current_subject < end_subject &&
2891 *current_subject == '\n' &&
2892 (re->flags & PCRE_HASCRORLF) == 0 &&
2893 (md->nltype == NLTYPE_ANY ||
2894 md->nltype == NLTYPE_ANYCRLF ||
2895 md->nllen == 2))
2896 current_subject++;
2897
2898 } /* "Bumpalong" loop */
2899
2900 return PCRE_ERROR_NOMATCH;
2901 }
2902
2903 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5