/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 439 - (show annotations)
Tue Sep 8 17:27:24 2009 UTC (5 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 101264 byte(s)
Error occurred while calculating annotation data.
Added performance comment to pcre_exec.c.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2009 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75
76 #ifdef HAVE_CONFIG_H
77 #include "config.h"
78 #endif
79
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
83
84 #include "pcre_internal.h"
85
86
87 /* For use to indent debugging output */
88
89 #define SP " "
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100
101 #define OP_PROP_EXTRA 300 /* added when the argument is OP_PROP or OP_NOTPROP */
102 #define OP_EXTUNI_EXTRA 320 /* added when the argument is OP_EXTUNI */
103 #define OP_ANYNL_EXTRA 340 /* added when the argument is OP_ANYNL */
104 #define OP_HSPACE_EXTRA 360 /* added when the argument is OP_HSPACE or OP_NOT_HSPACE */
105 #define OP_VSPACE_EXTRA 380 /* added when the argument is OP_VSPACE or OP_NOT_VSPACE */
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. ***NOTE*** If the start of this table is modified, the two tables
113 that follow must also be modified. */
114
115 static const uschar coptable[] = { /* indexed by opcode; entry = offset from the opcode to its inline character, 0 = none */
116 0, /* End */
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
120 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
123 1, /* Char */
124 1, /* Charnc */
125 1, /* not */
126 /* Positive single-char repeats */
127 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
128 3, 3, 3, /* upto, minupto, exact (character follows a 2-byte count) */
129 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
130 /* Negative single-char repeats - only for chars < 256 */
131 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
132 3, 3, 3, /* NOT upto, minupto, exact */
133 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
134 /* Positive type repeats */
135 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
136 3, 3, 3, /* Type upto, minupto, exact */
137 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
138 /* Character class & ref repeats */
139 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
140 0, 0, /* CRRANGE, CRMINRANGE */
141 0, /* CLASS */
142 0, /* NCLASS */
143 0, /* XCLASS - variable length */
144 0, /* REF */
145 0, /* RECURSE */
146 0, /* CALLOUT */
147 0, /* Alt */
148 0, /* Ket */
149 0, /* KetRmax */
150 0, /* KetRmin */
151 0, /* Assert */
152 0, /* Assert not */
153 0, /* Assert behind */
154 0, /* Assert behind not */
155 0, /* Reverse */
156 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
157 0, 0, 0, /* SBRA, SCBRA, SCOND */
158 0, /* CREF */
159 0, /* RREF */
160 0, /* DEF */
161 0, 0, /* BRAZERO, BRAMINZERO */
162 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
163 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
164 };
165
166 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
167 and \w */
168
169 static const uschar toptable1[] = { /* indexed by opcode; mask that is ANDed with ctypes[c] */
170 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
171 ctype_digit, ctype_digit, /* \D, \d */
172 ctype_space, ctype_space, /* \S, \s */
173 ctype_word, ctype_word, /* \W, \w */
174 0, 0 /* OP_ANY, OP_ALLANY */
175 };
176
177 static const uschar toptable2[] = { /* indexed by opcode; XORed with the masked ctype bits, non-zero result = match */
178 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
179 ctype_digit, 0, /* \D (the XOR inverts the test), \d */
180 ctype_space, 0, /* \S (inverted), \s */
181 ctype_word, 0, /* \W (inverted), \w */
182 1, 1 /* OP_ANY, OP_ALLANY - mask is 0, so the result is always non-zero */
183 };
184
185
186 /* Structure for holding data about a particular state, which is in effect the
187 current data for an active path through the match tree. It must consist
188 entirely of ints because the working vector we are passed, and which we put
189 these structures in, is a vector of ints. */
190
191 typedef struct stateblock {
192 int offset; /* Offset to opcode; a negative value marks a held-off state */
193 int count; /* Count for repeats */
194 int ims; /* ims flag bits */
195 int data; /* Extra data used by some states, e.g. characters still to skip */
196 } stateblock;
197
198 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) /* number of ints occupied by one stateblock */
199
200
201 #ifdef DEBUG
202 /*************************************************
203 * Print character string *
204 *************************************************/
205
206 /* Character string printing function for debugging.
207
208 Arguments:
209 p points to string
210 length number of bytes
211 f where to print
212
213 Returns: nothing
214 */
215
216 static void
217 pchars(unsigned char *p, int length, FILE *f)
218 {
219 int c;
220 while (length-- > 0) /* one iteration per byte of the string */
221 {
222 if (isprint(c = *(p++))) /* printable bytes are written unchanged */
223 fprintf(f, "%c", c);
224 else /* anything else is written as a \xhh escape */
225 fprintf(f, "\\x%02x", c);
226 }
227 }
228 #endif
229
230
231
232 /*************************************************
233 * Execute a Regular Expression - DFA engine *
234 *************************************************/
235
236 /* This internal function applies a compiled pattern to a subject string,
237 starting at a given point, using a DFA engine. This function is called from the
238 external one, possibly multiple times if the pattern is not anchored. The
239 function calls itself recursively for some kinds of subpattern.
240
241 Arguments:
242 md the match_data block with fixed information
243 this_start_code the opening bracket of this subexpression's code
244 current_subject where we currently are in the subject string
245 start_offset start offset in the subject string
246 offsets vector to contain the matching string offsets
247 offsetcount size of same
248 workspace vector of workspace
249 wscount size of same
250 ims the current ims flags
251 rlevel function call recursion level
252 recursing regex recursive call level
253
254 Returns: > 0 => number of match offset pairs placed in offsets
255 = 0 => offsets overflowed; longest matches are present
256 -1 => failed to match
257 < -1 => some kind of unexpected problem
258
259 The following macros are used for adding states to the two state vectors (one
260 for the current character, one for the following character). */
261
262 #define ADD_ACTIVE(x,y) /* append a state with offset x and count y to the active list */ \
263 if (active_count++ < wscount) /* is there still room in the vector? */ \
264 { \
265 next_active_state->offset = (x); \
266 next_active_state->count = (y); \
267 next_active_state->ims = ims; /* ims comes from the expansion site; data is not set */ \
268 next_active_state++; \
269 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
270 } \
271 else return PCRE_ERROR_DFA_WSSIZE /* no room: the enclosing function returns this error */
272
273 #define ADD_ACTIVE_DATA(x,y,z) /* as ADD_ACTIVE, but also stores z in the data field */ \
274 if (active_count++ < wscount) /* is there still room in the vector? */ \
275 { \
276 next_active_state->offset = (x); \
277 next_active_state->count = (y); \
278 next_active_state->ims = ims; /* ims comes from the expansion site */ \
279 next_active_state->data = (z); \
280 next_active_state++; \
281 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
282 } \
283 else return PCRE_ERROR_DFA_WSSIZE /* no room: the enclosing function returns this error */
284
285 #define ADD_NEW(x,y) /* append a state with offset x and count y to the new (next character) list */ \
286 if (new_count++ < wscount) /* is there still room in the vector? */ \
287 { \
288 next_new_state->offset = (x); \
289 next_new_state->count = (y); \
290 next_new_state->ims = ims; /* ims comes from the expansion site; data is not set */ \
291 next_new_state++; \
292 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
293 } \
294 else return PCRE_ERROR_DFA_WSSIZE /* no room: the enclosing function returns this error */
295
296 #define ADD_NEW_DATA(x,y,z) /* as ADD_NEW, but also stores z in the data field */ \
297 if (new_count++ < wscount) /* is there still room in the vector? */ \
298 { \
299 next_new_state->offset = (x); \
300 next_new_state->count = (y); \
301 next_new_state->ims = ims; /* ims comes from the expansion site */ \
302 next_new_state->data = (z); \
303 next_new_state++; \
304 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
305 } \
306 else return PCRE_ERROR_DFA_WSSIZE /* no room: the enclosing function returns this error */
307
308 /* And now, here is the code */
309
310 static int
311 internal_dfa_exec(
312 dfa_match_data *md,
313 const uschar *this_start_code,
314 const uschar *current_subject,
315 int start_offset,
316 int *offsets,
317 int offsetcount,
318 int *workspace,
319 int wscount,
320 int ims,
321 int rlevel,
322 int recursing)
323 {
324 stateblock *active_states, *new_states, *temp_states;
325 stateblock *next_active_state, *next_new_state;
326
327 const uschar *ctypes, *lcc, *fcc;
328 const uschar *ptr;
329 const uschar *end_code, *first_op;
330
331 int active_count, new_count, match_count;
332
333 /* Some fields in the md block are frequently referenced, so we load them into
334 independent variables in the hope that this will perform better. */
335
336 const uschar *start_subject = md->start_subject;
337 const uschar *end_subject = md->end_subject;
338 const uschar *start_code = md->start_code;
339
340 #ifdef SUPPORT_UTF8
341 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
342 #else
343 BOOL utf8 = FALSE;
344 #endif
345
346 rlevel++;
347 offsetcount &= (-2);
348
349 wscount -= 2;
350 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
351 (2 * INTS_PER_STATEBLOCK);
352
353 DPRINTF(("\n%.*s---------------------\n"
354 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
355 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
356
357 ctypes = md->tables + ctypes_offset;
358 lcc = md->tables + lcc_offset;
359 fcc = md->tables + fcc_offset;
360
361 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
362
363 active_states = (stateblock *)(workspace + 2);
364 next_new_state = new_states = active_states + wscount;
365 new_count = 0;
366
367 first_op = this_start_code + 1 + LINK_SIZE +
368 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
369
370 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
371 the alternative states onto the list, and find out where the end is. This
372 makes it possible to use this function recursively, when we want to stop at a
373 matching internal ket rather than at the end.
374
375 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
376 a backward assertion. In that case, we have to find out the maximum amount to
377 move back, and set up each alternative appropriately. */
378
379 if (*first_op == OP_REVERSE)
380 {
381 int max_back = 0;
382 int gone_back;
383
384 end_code = this_start_code;
385 do
386 {
387 int back = GET(end_code, 2+LINK_SIZE);
388 if (back > max_back) max_back = back;
389 end_code += GET(end_code, 1);
390 }
391 while (*end_code == OP_ALT);
392
393 /* If we can't go back the amount required for the longest lookbehind
394 pattern, go back as far as we can; some alternatives may still be viable. */
395
396 #ifdef SUPPORT_UTF8
397 /* In character mode we have to step back character by character */
398
399 if (utf8)
400 {
401 for (gone_back = 0; gone_back < max_back; gone_back++)
402 {
403 if (current_subject <= start_subject) break;
404 current_subject--;
405 while (current_subject > start_subject &&
406 (*current_subject & 0xc0) == 0x80)
407 current_subject--;
408 }
409 }
410 else
411 #endif
412
413 /* In byte-mode we can do this quickly. */
414
415 {
416 gone_back = (current_subject - max_back < start_subject)?
417 current_subject - start_subject : max_back;
418 current_subject -= gone_back;
419 }
420
421 /* Save the earliest consulted character */
422
423 if (current_subject < md->start_used_ptr)
424 md->start_used_ptr = current_subject;
425
426 /* Now we can process the individual branches. */
427
428 end_code = this_start_code;
429 do
430 {
431 int back = GET(end_code, 2+LINK_SIZE);
432 if (back <= gone_back)
433 {
434 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
435 ADD_NEW_DATA(-bstate, 0, gone_back - back);
436 }
437 end_code += GET(end_code, 1);
438 }
439 while (*end_code == OP_ALT);
440 }
441
442 /* This is the code for a "normal" subpattern (not a backward assertion). The
443 start of a whole pattern is always one of these. If we are at the top level,
444 we may be asked to restart matching from the same point that we reached for a
445 previous partial match. We still have to scan through the top-level branches to
446 find the end state. */
447
448 else
449 {
450 end_code = this_start_code;
451
452 /* Restarting */
453
454 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
455 {
456 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
457 new_count = workspace[1];
458 if (!workspace[0])
459 memcpy(new_states, active_states, new_count * sizeof(stateblock));
460 }
461
462 /* Not restarting */
463
464 else
465 {
466 int length = 1 + LINK_SIZE +
467 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
468 do
469 {
470 ADD_NEW(end_code - start_code + length, 0);
471 end_code += GET(end_code, 1);
472 length = 1 + LINK_SIZE;
473 }
474 while (*end_code == OP_ALT);
475 }
476 }
477
478 workspace[0] = 0; /* Bit indicating which vector is current */
479
480 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
481
482 /* Loop for scanning the subject */
483
484 ptr = current_subject;
485 for (;;)
486 {
487 int i, j;
488 int clen, dlen;
489 unsigned int c, d;
490 int forced_fail = 0;
491 int reached_end = 0;
492
493 /* Make the new state list into the active state list and empty the
494 new state list. */
495
496 temp_states = active_states;
497 active_states = new_states;
498 new_states = temp_states;
499 active_count = new_count;
500 new_count = 0;
501
502 workspace[0] ^= 1; /* Remember for the restarting feature */
503 workspace[1] = active_count;
504
505 #ifdef DEBUG
506 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
507 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
508 printf("\"\n");
509
510 printf("%.*sActive states: ", rlevel*2-2, SP);
511 for (i = 0; i < active_count; i++)
512 printf("%d/%d ", active_states[i].offset, active_states[i].count);
513 printf("\n");
514 #endif
515
516 /* Set the pointers for adding new states */
517
518 next_active_state = active_states + active_count;
519 next_new_state = new_states;
520
521 /* Load the current character from the subject outside the loop, as many
522 different states may want to look at it, and we assume that at least one
523 will. */
524
525 if (ptr < end_subject)
526 {
527 clen = 1; /* Number of bytes in the character */
528 #ifdef SUPPORT_UTF8
529 if (utf8) { GETCHARLEN(c, ptr, clen); } else
530 #endif /* SUPPORT_UTF8 */
531 c = *ptr;
532 }
533 else
534 {
535 clen = 0; /* This indicates the end of the subject */
536 c = NOTACHAR; /* This value should never actually be used */
537 }
538
539 /* Scan up the active states and act on each one. The result of an action
540 may be to add more states to the currently active list (e.g. on hitting a
541 parenthesis) or it may be to put states on the new list, for considering
542 when we move the character pointer on. */
543
544 for (i = 0; i < active_count; i++)
545 {
546 stateblock *current_state = active_states + i;
547 const uschar *code;
548 int state_offset = current_state->offset;
549 int count, codevalue, rrc;
550
551 #ifdef DEBUG
552 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
553 if (clen == 0) printf("EOL\n");
554 else if (c > 32 && c < 127) printf("'%c'\n", c);
555 else printf("0x%02x\n", c);
556 #endif
557
558 /* This variable is referred to implicitly in the ADD_xxx macros. */
559
560 ims = current_state->ims;
561
562 /* A negative offset is a special case meaning "hold off going to this
563 (negated) state until the number of characters in the data field have
564 been skipped". */
565
566 if (state_offset < 0)
567 {
568 if (current_state->data > 0)
569 {
570 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
571 ADD_NEW_DATA(state_offset, current_state->count,
572 current_state->data - 1);
573 continue;
574 }
575 else
576 {
577 current_state->offset = state_offset = -state_offset;
578 }
579 }
580
581 /* Check for a duplicate state with the same count, and skip if found.
582 See the note at the head of this module about the possibility of improving
583 performance here. */
584
585 for (j = 0; j < i; j++)
586 {
587 if (active_states[j].offset == state_offset &&
588 active_states[j].count == current_state->count)
589 {
590 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
591 goto NEXT_ACTIVE_STATE;
592 }
593 }
594
595 /* The state offset is the offset to the opcode */
596
597 code = start_code + state_offset;
598 codevalue = *code;
599
600 /* If this opcode is followed by an inline character, load it. It is
601 tempting to test for the presence of a subject character here, but that
602 is wrong, because sometimes zero repetitions of the subject are
603 permitted.
604
605 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
606 argument that is not a data character - but is always one byte long. We
607 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
608 this case. To keep the other cases fast, convert these ones to new opcodes.
609 */
610
611 if (coptable[codevalue] > 0)
612 {
613 dlen = 1;
614 #ifdef SUPPORT_UTF8
615 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
616 #endif /* SUPPORT_UTF8 */
617 d = code[coptable[codevalue]];
618 if (codevalue >= OP_TYPESTAR)
619 {
620 switch(d)
621 {
622 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
623 case OP_NOTPROP:
624 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
625 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
626 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
627 case OP_NOT_HSPACE:
628 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
629 case OP_NOT_VSPACE:
630 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
631 default: break;
632 }
633 }
634 }
635 else
636 {
637 dlen = 0; /* Not strictly necessary, but compilers moan */
638 d = NOTACHAR; /* if these variables are not set. */
639 }
640
641
642 /* Now process the individual opcodes */
643
644 switch (codevalue)
645 {
646
647 /* ========================================================================== */
648 /* Reached a closing bracket. If not at the end of the pattern, carry
649 on with the next opcode. Otherwise, unless we have an empty string and
650 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
651 matches so we always have the longest first. */
652
653 case OP_KET:
654 case OP_KETRMIN:
655 case OP_KETRMAX:
656 if (code != end_code)
657 {
658 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
659 if (codevalue != OP_KET)
660 {
661 ADD_ACTIVE(state_offset - GET(code, 1), 0);
662 }
663 }
664 else
665 {
666 reached_end++; /* Count branches that reach the end */
667 if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
668 {
669 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
670 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
671 match_count = 0;
672 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
673 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
674 if (offsetcount >= 2)
675 {
676 offsets[0] = current_subject - start_subject;
677 offsets[1] = ptr - start_subject;
678 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
679 offsets[1] - offsets[0], current_subject));
680 }
681 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
682 {
683 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
684 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
685 match_count, rlevel*2-2, SP));
686 return match_count;
687 }
688 }
689 }
690 break;
691
692 /* ========================================================================== */
693 /* These opcodes add to the current list of states without looking
694 at the current character. */
695
696 /*-----------------------------------------------------------------*/
697 case OP_ALT:
698 do { code += GET(code, 1); } while (*code == OP_ALT);
699 ADD_ACTIVE(code - start_code, 0);
700 break;
701
702 /*-----------------------------------------------------------------*/
703 case OP_BRA:
704 case OP_SBRA:
705 do
706 {
707 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
708 code += GET(code, 1);
709 }
710 while (*code == OP_ALT);
711 break;
712
713 /*-----------------------------------------------------------------*/
714 case OP_CBRA:
715 case OP_SCBRA:
716 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
717 code += GET(code, 1);
718 while (*code == OP_ALT)
719 {
720 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
721 code += GET(code, 1);
722 }
723 break;
724
725 /*-----------------------------------------------------------------*/
726 case OP_BRAZERO:
727 case OP_BRAMINZERO:
728 ADD_ACTIVE(state_offset + 1, 0);
729 code += 1 + GET(code, 2);
730 while (*code == OP_ALT) code += GET(code, 1);
731 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
732 break;
733
734 /*-----------------------------------------------------------------*/
735 case OP_SKIPZERO:
736 code += 1 + GET(code, 2);
737 while (*code == OP_ALT) code += GET(code, 1);
738 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
739 break;
740
741 /*-----------------------------------------------------------------*/
742 case OP_CIRC:
743 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
744 ((ims & PCRE_MULTILINE) != 0 &&
745 ptr != end_subject &&
746 WAS_NEWLINE(ptr)))
747 { ADD_ACTIVE(state_offset + 1, 0); }
748 break;
749
750 /*-----------------------------------------------------------------*/
751 case OP_EOD:
752 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
753 break;
754
755 /*-----------------------------------------------------------------*/
756 case OP_OPT:
757 ims = code[1];
758 ADD_ACTIVE(state_offset + 2, 0);
759 break;
760
761 /*-----------------------------------------------------------------*/
762 case OP_SOD:
763 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
764 break;
765
766 /*-----------------------------------------------------------------*/
767 case OP_SOM:
768 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
769 break;
770
771
772 /* ========================================================================== */
773 /* These opcodes inspect the next subject character, and sometimes
774 the previous one as well, but do not have an argument. The variable
775 clen contains the length of the current character and is zero if we are
776 at the end of the subject. */
777
778 /*-----------------------------------------------------------------*/
779 case OP_ANY:
780 if (clen > 0 && !IS_NEWLINE(ptr))
781 { ADD_NEW(state_offset + 1, 0); }
782 break;
783
784 /*-----------------------------------------------------------------*/
785 case OP_ALLANY:
786 if (clen > 0)
787 { ADD_NEW(state_offset + 1, 0); }
788 break;
789
790 /*-----------------------------------------------------------------*/
791 case OP_EODN:
792 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
793 { ADD_ACTIVE(state_offset + 1, 0); }
794 break;
795
796 /*-----------------------------------------------------------------*/
797 case OP_DOLL:
798 if ((md->moptions & PCRE_NOTEOL) == 0)
799 {
800 if (clen == 0 ||
801 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
802 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
803 ))
804 { ADD_ACTIVE(state_offset + 1, 0); }
805 }
806 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
807 { ADD_ACTIVE(state_offset + 1, 0); }
808 break;
809
810 /*-----------------------------------------------------------------*/
811
812 case OP_DIGIT:
813 case OP_WHITESPACE:
814 case OP_WORDCHAR:
815 if (clen > 0 && c < 256 &&
816 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
817 { ADD_NEW(state_offset + 1, 0); }
818 break;
819
820 /*-----------------------------------------------------------------*/
821 case OP_NOT_DIGIT:
822 case OP_NOT_WHITESPACE:
823 case OP_NOT_WORDCHAR:
824 if (clen > 0 && (c >= 256 ||
825 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
826 { ADD_NEW(state_offset + 1, 0); }
827 break;
828
829 /*-----------------------------------------------------------------*/
830 case OP_WORD_BOUNDARY:
831 case OP_NOT_WORD_BOUNDARY:
832 {
833 int left_word, right_word;
834
835 if (ptr > start_subject)
836 {
837 const uschar *temp = ptr - 1;
838 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
839 #ifdef SUPPORT_UTF8
840 if (utf8) BACKCHAR(temp);
841 #endif
842 GETCHARTEST(d, temp);
843 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
844 }
845 else left_word = 0;
846
847 if (clen > 0)
848 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
849 else /* This is a fudge to ensure that if this is the */
850 { /* last item in the pattern, we don't count it as */
851 reached_end--; /* reached, thus disabling a partial match. */
852 right_word = 0;
853 }
854
855 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
856 { ADD_ACTIVE(state_offset + 1, 0); }
857 }
858 break;
859
860
861 /*-----------------------------------------------------------------*/
862 /* Check the next character by Unicode property. We will get here only
863 if the support is in the binary; otherwise a compile-time error occurs.
864 */
865
866 #ifdef SUPPORT_UCP
867 case OP_PROP:
868 case OP_NOTPROP:
869 if (clen > 0)
870 {
871 BOOL OK;
872 const ucd_record * prop = GET_UCD(c);
873 switch(code[1])
874 {
875 case PT_ANY:
876 OK = TRUE;
877 break;
878
879 case PT_LAMP:
880 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
881 break;
882
883 case PT_GC:
884 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
885 break;
886
887 case PT_PC:
888 OK = prop->chartype == code[2];
889 break;
890
891 case PT_SC:
892 OK = prop->script == code[2];
893 break;
894
895 /* Should never occur, but keep compilers from grumbling. */
896
897 default:
898 OK = codevalue != OP_PROP;
899 break;
900 }
901
902 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
903 }
904 break;
905 #endif
906
907
908
909 /* ========================================================================== */
910 /* These opcodes likewise inspect the subject character, but have an
911 argument that is not a data character. It is one of these opcodes:
912 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
913 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
914
915 case OP_TYPEPLUS:
916 case OP_TYPEMINPLUS:
917 case OP_TYPEPOSPLUS:
918 count = current_state->count; /* Already matched */
919 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
920 if (clen > 0)
921 {
922 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
923 (c < 256 &&
924 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
925 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
926 {
927 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
928 {
929 active_count--; /* Remove non-match possibility */
930 next_active_state--;
931 }
932 count++;
933 ADD_NEW(state_offset, count);
934 }
935 }
936 break;
937
938 /*-----------------------------------------------------------------*/
939 case OP_TYPEQUERY:
940 case OP_TYPEMINQUERY:
941 case OP_TYPEPOSQUERY:
942 ADD_ACTIVE(state_offset + 2, 0);
943 if (clen > 0)
944 {
945 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
946 (c < 256 &&
947 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
948 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
949 {
950 if (codevalue == OP_TYPEPOSQUERY)
951 {
952 active_count--; /* Remove non-match possibility */
953 next_active_state--;
954 }
955 ADD_NEW(state_offset + 2, 0);
956 }
957 }
958 break;
959
960 /*-----------------------------------------------------------------*/
961 case OP_TYPESTAR:
962 case OP_TYPEMINSTAR:
963 case OP_TYPEPOSSTAR:
964 ADD_ACTIVE(state_offset + 2, 0);
965 if (clen > 0)
966 {
967 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
968 (c < 256 &&
969 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
970 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
971 {
972 if (codevalue == OP_TYPEPOSSTAR)
973 {
974 active_count--; /* Remove non-match possibility */
975 next_active_state--;
976 }
977 ADD_NEW(state_offset, 0);
978 }
979 }
980 break;
981
982 /*-----------------------------------------------------------------*/
983 case OP_TYPEEXACT:
984 count = current_state->count; /* Number already matched */
985 if (clen > 0)
986 {
987 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
988 (c < 256 &&
989 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
990 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
991 {
992 if (++count >= GET2(code, 1))
993 { ADD_NEW(state_offset + 4, 0); }
994 else
995 { ADD_NEW(state_offset, count); }
996 }
997 }
998 break;
999
1000 /*-----------------------------------------------------------------*/
1001 case OP_TYPEUPTO:
1002 case OP_TYPEMINUPTO:
1003 case OP_TYPEPOSUPTO:
1004 ADD_ACTIVE(state_offset + 4, 0);
1005 count = current_state->count; /* Number already matched */
1006 if (clen > 0)
1007 {
1008 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1009 (c < 256 &&
1010 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1011 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1012 {
1013 if (codevalue == OP_TYPEPOSUPTO)
1014 {
1015 active_count--; /* Remove non-match possibility */
1016 next_active_state--;
1017 }
1018 if (++count >= GET2(code, 1))
1019 { ADD_NEW(state_offset + 4, 0); }
1020 else
1021 { ADD_NEW(state_offset, count); }
1022 }
1023 }
1024 break;
1025
1026 /* ========================================================================== */
1027 /* These are virtual opcodes that are used when something like
1028 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_EXTUNI, OP_ANYNL, or one of the
1029 OP_VSPACE/OP_HSPACE types as its argument. Handling them here keeps the
1030 code above fast for the other cases. The argument is in the d variable. */
1031
1032 #ifdef SUPPORT_UCP
1033 case OP_PROP_EXTRA + OP_TYPEPLUS:
1034 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1035 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1036 count = current_state->count; /* Already matched */
1037 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1038 if (clen > 0)
1039 {
1040 BOOL OK;
1041 const ucd_record * prop = GET_UCD(c);
1042 switch(code[2])
1043 {
1044 case PT_ANY:
1045 OK = TRUE;
1046 break;
1047
1048 case PT_LAMP:
1049 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1050 break;
1051
1052 case PT_GC:
1053 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1054 break;
1055
1056 case PT_PC:
1057 OK = prop->chartype == code[3];
1058 break;
1059
1060 case PT_SC:
1061 OK = prop->script == code[3];
1062 break;
1063
1064 /* Should never occur, but keep compilers from grumbling. */
1065
1066 default:
1067 OK = codevalue != OP_PROP;
1068 break;
1069 }
1070
1071 if (OK == (d == OP_PROP))
1072 {
1073 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1074 {
1075 active_count--; /* Remove non-match possibility */
1076 next_active_state--;
1077 }
1078 count++;
1079 ADD_NEW(state_offset, count);
1080 }
1081 }
1082 break;
1083
1084 /*-----------------------------------------------------------------*/
1085 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1086 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1087 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1088 count = current_state->count; /* Already matched */
1089 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1090 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1091 {
1092 const uschar *nptr = ptr + clen;
1093 int ncount = 0;
1094 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1095 {
1096 active_count--; /* Remove non-match possibility */
1097 next_active_state--;
1098 }
1099 while (nptr < end_subject)
1100 {
1101 int nd;
1102 int ndlen = 1;
1103 GETCHARLEN(nd, nptr, ndlen);
1104 if (UCD_CATEGORY(nd) != ucp_M) break;
1105 ncount++;
1106 nptr += ndlen;
1107 }
1108 count++;
1109 ADD_NEW_DATA(-state_offset, count, ncount);
1110 }
1111 break;
1112 #endif
1113
1114 /*-----------------------------------------------------------------*/
1115 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1116 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1117 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1118 count = current_state->count; /* Already matched */
1119 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1120 if (clen > 0)
1121 {
1122 int ncount = 0;
1123 switch (c)
1124 {
1125 case 0x000b:
1126 case 0x000c:
1127 case 0x0085:
1128 case 0x2028:
1129 case 0x2029:
1130 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1131 goto ANYNL01;
1132
1133 case 0x000d:
1134 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1135 /* Fall through */
1136
1137 ANYNL01:
1138 case 0x000a:
1139 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1140 {
1141 active_count--; /* Remove non-match possibility */
1142 next_active_state--;
1143 }
1144 count++;
1145 ADD_NEW_DATA(-state_offset, count, ncount);
1146 break;
1147
1148 default:
1149 break;
1150 }
1151 }
1152 break;
1153
1154 /*-----------------------------------------------------------------*/
1155 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1156 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1157 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1158 count = current_state->count; /* Already matched */
1159 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1160 if (clen > 0)
1161 {
1162 BOOL OK;
1163 switch (c)
1164 {
1165 case 0x000a:
1166 case 0x000b:
1167 case 0x000c:
1168 case 0x000d:
1169 case 0x0085:
1170 case 0x2028:
1171 case 0x2029:
1172 OK = TRUE;
1173 break;
1174
1175 default:
1176 OK = FALSE;
1177 break;
1178 }
1179
1180 if (OK == (d == OP_VSPACE))
1181 {
1182 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1183 {
1184 active_count--; /* Remove non-match possibility */
1185 next_active_state--;
1186 }
1187 count++;
1188 ADD_NEW_DATA(-state_offset, count, 0);
1189 }
1190 }
1191 break;
1192
1193 /*-----------------------------------------------------------------*/
1194 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1195 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1196 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1197 count = current_state->count; /* Already matched */
1198 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1199 if (clen > 0)
1200 {
1201 BOOL OK;
1202 switch (c)
1203 {
1204 case 0x09: /* HT */
1205 case 0x20: /* SPACE */
1206 case 0xa0: /* NBSP */
1207 case 0x1680: /* OGHAM SPACE MARK */
1208 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1209 case 0x2000: /* EN QUAD */
1210 case 0x2001: /* EM QUAD */
1211 case 0x2002: /* EN SPACE */
1212 case 0x2003: /* EM SPACE */
1213 case 0x2004: /* THREE-PER-EM SPACE */
1214 case 0x2005: /* FOUR-PER-EM SPACE */
1215 case 0x2006: /* SIX-PER-EM SPACE */
1216 case 0x2007: /* FIGURE SPACE */
1217 case 0x2008: /* PUNCTUATION SPACE */
1218 case 0x2009: /* THIN SPACE */
1219 case 0x200A: /* HAIR SPACE */
1220 case 0x202f: /* NARROW NO-BREAK SPACE */
1221 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1222 case 0x3000: /* IDEOGRAPHIC SPACE */
1223 OK = TRUE;
1224 break;
1225
1226 default:
1227 OK = FALSE;
1228 break;
1229 }
1230
1231 if (OK == (d == OP_HSPACE))
1232 {
1233 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1234 {
1235 active_count--; /* Remove non-match possibility */
1236 next_active_state--;
1237 }
1238 count++;
1239 ADD_NEW_DATA(-state_offset, count, 0);
1240 }
1241 }
1242 break;
1243
1244 /*-----------------------------------------------------------------*/
1245 #ifdef SUPPORT_UCP
1246 case OP_PROP_EXTRA + OP_TYPEQUERY:
1247 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1248 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1249 count = 4;
1250 goto QS1;
1251
1252 case OP_PROP_EXTRA + OP_TYPESTAR:
1253 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1254 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1255 count = 0;
1256
1257 QS1:
1258
1259 ADD_ACTIVE(state_offset + 4, 0);
1260 if (clen > 0)
1261 {
1262 BOOL OK;
1263 const ucd_record * prop = GET_UCD(c);
1264 switch(code[2])
1265 {
1266 case PT_ANY:
1267 OK = TRUE;
1268 break;
1269
1270 case PT_LAMP:
1271 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1272 break;
1273
1274 case PT_GC:
1275 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1276 break;
1277
1278 case PT_PC:
1279 OK = prop->chartype == code[3];
1280 break;
1281
1282 case PT_SC:
1283 OK = prop->script == code[3];
1284 break;
1285
1286 /* Should never occur, but keep compilers from grumbling. */
1287
1288 default:
1289 OK = codevalue != OP_PROP;
1290 break;
1291 }
1292
1293 if (OK == (d == OP_PROP))
1294 {
1295 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1296 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1297 {
1298 active_count--; /* Remove non-match possibility */
1299 next_active_state--;
1300 }
1301 ADD_NEW(state_offset + count, 0);
1302 }
1303 }
1304 break;
1305
1306 /*-----------------------------------------------------------------*/
1307 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1308 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1309 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1310 count = 2;
1311 goto QS2;
1312
1313 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1314 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1315 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1316 count = 0;
1317
1318 QS2:
1319
1320 ADD_ACTIVE(state_offset + 2, 0);
1321 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1322 {
1323 const uschar *nptr = ptr + clen;
1324 int ncount = 0;
1325 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1326 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1327 {
1328 active_count--; /* Remove non-match possibility */
1329 next_active_state--;
1330 }
1331 while (nptr < end_subject)
1332 {
1333 int nd;
1334 int ndlen = 1;
1335 GETCHARLEN(nd, nptr, ndlen);
1336 if (UCD_CATEGORY(nd) != ucp_M) break;
1337 ncount++;
1338 nptr += ndlen;
1339 }
1340 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1341 }
1342 break;
1343 #endif
1344
1345 /*-----------------------------------------------------------------*/
1346 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1347 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1348 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1349 count = 2;
1350 goto QS3;
1351
1352 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1353 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1354 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1355 count = 0;
1356
1357 QS3:
1358 ADD_ACTIVE(state_offset + 2, 0);
1359 if (clen > 0)
1360 {
1361 int ncount = 0;
1362 switch (c)
1363 {
1364 case 0x000b:
1365 case 0x000c:
1366 case 0x0085:
1367 case 0x2028:
1368 case 0x2029:
1369 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1370 goto ANYNL02;
1371
1372 case 0x000d:
1373 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1374 /* Fall through */
1375
1376 ANYNL02:
1377 case 0x000a:
1378 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1379 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1380 {
1381 active_count--; /* Remove non-match possibility */
1382 next_active_state--;
1383 }
1384 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1385 break;
1386
1387 default:
1388 break;
1389 }
1390 }
1391 break;
1392
1393 /*-----------------------------------------------------------------*/
1394 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1395 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1396 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1397 count = 2;
1398 goto QS4;
1399
1400 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1401 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1402 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1403 count = 0;
1404
1405 QS4:
1406 ADD_ACTIVE(state_offset + 2, 0);
1407 if (clen > 0)
1408 {
1409 BOOL OK;
1410 switch (c)
1411 {
1412 case 0x000a:
1413 case 0x000b:
1414 case 0x000c:
1415 case 0x000d:
1416 case 0x0085:
1417 case 0x2028:
1418 case 0x2029:
1419 OK = TRUE;
1420 break;
1421
1422 default:
1423 OK = FALSE;
1424 break;
1425 }
1426 if (OK == (d == OP_VSPACE))
1427 {
1428 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1429 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1430 {
1431 active_count--; /* Remove non-match possibility */
1432 next_active_state--;
1433 }
1434 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1435 }
1436 }
1437 break;
1438
1439 /*-----------------------------------------------------------------*/
1440 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1441 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1442 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1443 count = 2;
1444 goto QS5;
1445
1446 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1447 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1448 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1449 count = 0;
1450
1451 QS5:
1452 ADD_ACTIVE(state_offset + 2, 0);
1453 if (clen > 0)
1454 {
1455 BOOL OK;
1456 switch (c)
1457 {
1458 case 0x09: /* HT */
1459 case 0x20: /* SPACE */
1460 case 0xa0: /* NBSP */
1461 case 0x1680: /* OGHAM SPACE MARK */
1462 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1463 case 0x2000: /* EN QUAD */
1464 case 0x2001: /* EM QUAD */
1465 case 0x2002: /* EN SPACE */
1466 case 0x2003: /* EM SPACE */
1467 case 0x2004: /* THREE-PER-EM SPACE */
1468 case 0x2005: /* FOUR-PER-EM SPACE */
1469 case 0x2006: /* SIX-PER-EM SPACE */
1470 case 0x2007: /* FIGURE SPACE */
1471 case 0x2008: /* PUNCTUATION SPACE */
1472 case 0x2009: /* THIN SPACE */
1473 case 0x200A: /* HAIR SPACE */
1474 case 0x202f: /* NARROW NO-BREAK SPACE */
1475 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1476 case 0x3000: /* IDEOGRAPHIC SPACE */
1477 OK = TRUE;
1478 break;
1479
1480 default:
1481 OK = FALSE;
1482 break;
1483 }
1484
1485 if (OK == (d == OP_HSPACE))
1486 {
1487 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1488 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1489 {
1490 active_count--; /* Remove non-match possibility */
1491 next_active_state--;
1492 }
1493 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1494 }
1495 }
1496 break;
1497
1498 /*-----------------------------------------------------------------*/
1499 #ifdef SUPPORT_UCP
1500 case OP_PROP_EXTRA + OP_TYPEEXACT:
1501 case OP_PROP_EXTRA + OP_TYPEUPTO:
1502 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1503 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1504 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1505 { ADD_ACTIVE(state_offset + 6, 0); }
1506 count = current_state->count; /* Number already matched */
1507 if (clen > 0)
1508 {
1509 BOOL OK;
1510 const ucd_record * prop = GET_UCD(c);
1511 switch(code[4])
1512 {
1513 case PT_ANY:
1514 OK = TRUE;
1515 break;
1516
1517 case PT_LAMP:
1518 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1519 break;
1520
1521 case PT_GC:
1522 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1523 break;
1524
1525 case PT_PC:
1526 OK = prop->chartype == code[5];
1527 break;
1528
1529 case PT_SC:
1530 OK = prop->script == code[5];
1531 break;
1532
1533 /* Should never occur, but keep compilers from grumbling. */
1534
1535 default:
1536 OK = codevalue != OP_PROP;
1537 break;
1538 }
1539
1540 if (OK == (d == OP_PROP))
1541 {
1542 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1543 {
1544 active_count--; /* Remove non-match possibility */
1545 next_active_state--;
1546 }
1547 if (++count >= GET2(code, 1))
1548 { ADD_NEW(state_offset + 6, 0); }
1549 else
1550 { ADD_NEW(state_offset, count); }
1551 }
1552 }
1553 break;
1554
1555 /*-----------------------------------------------------------------*/
1556 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1557 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1558 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1559 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1560 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1561 { ADD_ACTIVE(state_offset + 4, 0); }
1562 count = current_state->count; /* Number already matched */
1563 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1564 {
1565 const uschar *nptr = ptr + clen;
1566 int ncount = 0;
1567 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1568 {
1569 active_count--; /* Remove non-match possibility */
1570 next_active_state--;
1571 }
1572 while (nptr < end_subject)
1573 {
1574 int nd;
1575 int ndlen = 1;
1576 GETCHARLEN(nd, nptr, ndlen);
1577 if (UCD_CATEGORY(nd) != ucp_M) break;
1578 ncount++;
1579 nptr += ndlen;
1580 }
1581 if (++count >= GET2(code, 1))
1582 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1583 else
1584 { ADD_NEW_DATA(-state_offset, count, ncount); }
1585 }
1586 break;
1587 #endif
1588
1589 /*-----------------------------------------------------------------*/
1590 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1591 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1592 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1593 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1594 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1595 { ADD_ACTIVE(state_offset + 4, 0); }
1596 count = current_state->count; /* Number already matched */
1597 if (clen > 0)
1598 {
1599 int ncount = 0;
1600 switch (c)
1601 {
1602 case 0x000b:
1603 case 0x000c:
1604 case 0x0085:
1605 case 0x2028:
1606 case 0x2029:
1607 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1608 goto ANYNL03;
1609
1610 case 0x000d:
1611 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1612 /* Fall through */
1613
1614 ANYNL03:
1615 case 0x000a:
1616 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1617 {
1618 active_count--; /* Remove non-match possibility */
1619 next_active_state--;
1620 }
1621 if (++count >= GET2(code, 1))
1622 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1623 else
1624 { ADD_NEW_DATA(-state_offset, count, ncount); }
1625 break;
1626
1627 default:
1628 break;
1629 }
1630 }
1631 break;
1632
1633 /*-----------------------------------------------------------------*/
1634 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1635 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1636 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1637 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1638 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1639 { ADD_ACTIVE(state_offset + 4, 0); }
1640 count = current_state->count; /* Number already matched */
1641 if (clen > 0)
1642 {
1643 BOOL OK;
1644 switch (c)
1645 {
1646 case 0x000a:
1647 case 0x000b:
1648 case 0x000c:
1649 case 0x000d:
1650 case 0x0085:
1651 case 0x2028:
1652 case 0x2029:
1653 OK = TRUE;
1654 break;
1655
1656 default:
1657 OK = FALSE;
1658 }
1659
1660 if (OK == (d == OP_VSPACE))
1661 {
1662 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1663 {
1664 active_count--; /* Remove non-match possibility */
1665 next_active_state--;
1666 }
1667 if (++count >= GET2(code, 1))
1668 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1669 else
1670 { ADD_NEW_DATA(-state_offset, count, 0); }
1671 }
1672 }
1673 break;
1674
1675 /*-----------------------------------------------------------------*/
1676 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1677 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1678 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1679 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1680 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1681 { ADD_ACTIVE(state_offset + 4, 0); }
1682 count = current_state->count; /* Number already matched */
1683 if (clen > 0)
1684 {
1685 BOOL OK;
1686 switch (c)
1687 {
1688 case 0x09: /* HT */
1689 case 0x20: /* SPACE */
1690 case 0xa0: /* NBSP */
1691 case 0x1680: /* OGHAM SPACE MARK */
1692 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1693 case 0x2000: /* EN QUAD */
1694 case 0x2001: /* EM QUAD */
1695 case 0x2002: /* EN SPACE */
1696 case 0x2003: /* EM SPACE */
1697 case 0x2004: /* THREE-PER-EM SPACE */
1698 case 0x2005: /* FOUR-PER-EM SPACE */
1699 case 0x2006: /* SIX-PER-EM SPACE */
1700 case 0x2007: /* FIGURE SPACE */
1701 case 0x2008: /* PUNCTUATION SPACE */
1702 case 0x2009: /* THIN SPACE */
1703 case 0x200A: /* HAIR SPACE */
1704 case 0x202f: /* NARROW NO-BREAK SPACE */
1705 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1706 case 0x3000: /* IDEOGRAPHIC SPACE */
1707 OK = TRUE;
1708 break;
1709
1710 default:
1711 OK = FALSE;
1712 break;
1713 }
1714
1715 if (OK == (d == OP_HSPACE))
1716 {
1717 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1718 {
1719 active_count--; /* Remove non-match possibility */
1720 next_active_state--;
1721 }
1722 if (++count >= GET2(code, 1))
1723 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1724 else
1725 { ADD_NEW_DATA(-state_offset, count, 0); }
1726 }
1727 }
1728 break;
1729
1730 /* ========================================================================== */
1731 /* These opcodes are followed by a character that is usually compared
1732 to the current subject character; it is loaded into d. We still get
1733 here even if there is no subject character, because in some cases zero
1734 repetitions are permitted. */
1735
1736 /*-----------------------------------------------------------------*/
1737 case OP_CHAR:
1738 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1739 break;
1740
1741 /*-----------------------------------------------------------------*/
1742 case OP_CHARNC:
1743 if (clen == 0) break;
1744
1745 #ifdef SUPPORT_UTF8
1746 if (utf8)
1747 {
1748 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1749 {
1750 unsigned int othercase;
1751 if (c < 128) othercase = fcc[c]; else
1752
1753 /* If we have Unicode property support, we can use it to test the
1754 other case of the character. */
1755
1756 #ifdef SUPPORT_UCP
1757 othercase = UCD_OTHERCASE(c);
1758 #else
1759 othercase = NOTACHAR;
1760 #endif
1761
1762 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1763 }
1764 }
1765 else
1766 #endif /* SUPPORT_UTF8 */
1767
1768 /* Non-UTF-8 mode */
1769 {
1770 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1771 }
1772 break;
1773
1774
1775 #ifdef SUPPORT_UCP
1776 /*-----------------------------------------------------------------*/
1777 /* This is a tricky one because it can match more than one character.
1778 Find out how many characters to skip, and then set up a negative state
1779 to wait for them to pass before continuing. */
1780
1781 case OP_EXTUNI:
1782 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1783 {
1784 const uschar *nptr = ptr + clen;
1785 int ncount = 0;
1786 while (nptr < end_subject)
1787 {
1788 int nclen = 1;
1789 GETCHARLEN(c, nptr, nclen);
1790 if (UCD_CATEGORY(c) != ucp_M) break;
1791 ncount++;
1792 nptr += nclen;
1793 }
1794 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1795 }
1796 break;
1797 #endif
1798
1799 /*-----------------------------------------------------------------*/
1800 /* This is tricky, like EXTUNI, because it too can match more than one
1801 character (when CR is followed by LF). In that case, set up a negative
1802 state to wait for one character to pass before continuing. */
1803
1804 case OP_ANYNL:
1805 if (clen > 0) switch(c)
1806 {
1807 case 0x000b:
1808 case 0x000c:
1809 case 0x0085:
1810 case 0x2028:
1811 case 0x2029:
1812 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1813
1814 case 0x000a:
1815 ADD_NEW(state_offset + 1, 0);
1816 break;
1817
1818 case 0x000d:
1819 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1820 {
1821 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1822 }
1823 else
1824 {
1825 ADD_NEW(state_offset + 1, 0);
1826 }
1827 break;
1828 }
1829 break;
1830
1831 /*-----------------------------------------------------------------*/
1832 case OP_NOT_VSPACE:
1833 if (clen > 0) switch(c)
1834 {
1835 case 0x000a:
1836 case 0x000b:
1837 case 0x000c:
1838 case 0x000d:
1839 case 0x0085:
1840 case 0x2028:
1841 case 0x2029:
1842 break;
1843
1844 default:
1845 ADD_NEW(state_offset + 1, 0);
1846 break;
1847 }
1848 break;
1849
1850 /*-----------------------------------------------------------------*/
1851 case OP_VSPACE:
1852 if (clen > 0) switch(c)
1853 {
1854 case 0x000a:
1855 case 0x000b:
1856 case 0x000c:
1857 case 0x000d:
1858 case 0x0085:
1859 case 0x2028:
1860 case 0x2029:
1861 ADD_NEW(state_offset + 1, 0);
1862 break;
1863
1864 default: break;
1865 }
1866 break;
1867
1868 /*-----------------------------------------------------------------*/
1869 case OP_NOT_HSPACE:
1870 if (clen > 0) switch(c)
1871 {
1872 case 0x09: /* HT */
1873 case 0x20: /* SPACE */
1874 case 0xa0: /* NBSP */
1875 case 0x1680: /* OGHAM SPACE MARK */
1876 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1877 case 0x2000: /* EN QUAD */
1878 case 0x2001: /* EM QUAD */
1879 case 0x2002: /* EN SPACE */
1880 case 0x2003: /* EM SPACE */
1881 case 0x2004: /* THREE-PER-EM SPACE */
1882 case 0x2005: /* FOUR-PER-EM SPACE */
1883 case 0x2006: /* SIX-PER-EM SPACE */
1884 case 0x2007: /* FIGURE SPACE */
1885 case 0x2008: /* PUNCTUATION SPACE */
1886 case 0x2009: /* THIN SPACE */
1887 case 0x200A: /* HAIR SPACE */
1888 case 0x202f: /* NARROW NO-BREAK SPACE */
1889 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1890 case 0x3000: /* IDEOGRAPHIC SPACE */
1891 break;
1892
1893 default:
1894 ADD_NEW(state_offset + 1, 0);
1895 break;
1896 }
1897 break;
1898
1899 /*-----------------------------------------------------------------*/
1900 case OP_HSPACE:
1901 if (clen > 0) switch(c)
1902 {
1903 case 0x09: /* HT */
1904 case 0x20: /* SPACE */
1905 case 0xa0: /* NBSP */
1906 case 0x1680: /* OGHAM SPACE MARK */
1907 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1908 case 0x2000: /* EN QUAD */
1909 case 0x2001: /* EM QUAD */
1910 case 0x2002: /* EN SPACE */
1911 case 0x2003: /* EM SPACE */
1912 case 0x2004: /* THREE-PER-EM SPACE */
1913 case 0x2005: /* FOUR-PER-EM SPACE */
1914 case 0x2006: /* SIX-PER-EM SPACE */
1915 case 0x2007: /* FIGURE SPACE */
1916 case 0x2008: /* PUNCTUATION SPACE */
1917 case 0x2009: /* THIN SPACE */
1918 case 0x200A: /* HAIR SPACE */
1919 case 0x202f: /* NARROW NO-BREAK SPACE */
1920 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1921 case 0x3000: /* IDEOGRAPHIC SPACE */
1922 ADD_NEW(state_offset + 1, 0);
1923 break;
1924 }
1925 break;
1926
1927 /*-----------------------------------------------------------------*/
1928 /* Match a negated single character. This is only used for one-byte
1929 characters, that is, we know that d < 256. The character we are
1930 checking (c) can be multibyte. */
1931
1932 case OP_NOT:
1933 if (clen > 0)
1934 {
1935 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1936 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1937 }
1938 break;
1939
1940 /*-----------------------------------------------------------------*/
1941 case OP_PLUS:
1942 case OP_MINPLUS:
1943 case OP_POSPLUS:
1944 case OP_NOTPLUS:
1945 case OP_NOTMINPLUS:
1946 case OP_NOTPOSPLUS:
1947 count = current_state->count; /* Already matched */
1948 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1949 if (clen > 0)
1950 {
1951 unsigned int otherd = NOTACHAR;
1952 if ((ims & PCRE_CASELESS) != 0)
1953 {
1954 #ifdef SUPPORT_UTF8
1955 if (utf8 && d >= 128)
1956 {
1957 #ifdef SUPPORT_UCP
1958 otherd = UCD_OTHERCASE(d);
1959 #endif /* SUPPORT_UCP */
1960 }
1961 else
1962 #endif /* SUPPORT_UTF8 */
1963 otherd = fcc[d];
1964 }
1965 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1966 {
1967 if (count > 0 &&
1968 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1969 {
1970 active_count--; /* Remove non-match possibility */
1971 next_active_state--;
1972 }
1973 count++;
1974 ADD_NEW(state_offset, count);
1975 }
1976 }
1977 break;
1978
1979 /*-----------------------------------------------------------------*/
1980 case OP_QUERY:
1981 case OP_MINQUERY:
1982 case OP_POSQUERY:
1983 case OP_NOTQUERY:
1984 case OP_NOTMINQUERY:
1985 case OP_NOTPOSQUERY:
1986 ADD_ACTIVE(state_offset + dlen + 1, 0);
1987 if (clen > 0)
1988 {
1989 unsigned int otherd = NOTACHAR;
1990 if ((ims & PCRE_CASELESS) != 0)
1991 {
1992 #ifdef SUPPORT_UTF8
1993 if (utf8 && d >= 128)
1994 {
1995 #ifdef SUPPORT_UCP
1996 otherd = UCD_OTHERCASE(d);
1997 #endif /* SUPPORT_UCP */
1998 }
1999 else
2000 #endif /* SUPPORT_UTF8 */
2001 otherd = fcc[d];
2002 }
2003 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2004 {
2005 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2006 {
2007 active_count--; /* Remove non-match possibility */
2008 next_active_state--;
2009 }
2010 ADD_NEW(state_offset + dlen + 1, 0);
2011 }
2012 }
2013 break;
2014
2015 /*-----------------------------------------------------------------*/
2016 case OP_STAR:
2017 case OP_MINSTAR:
2018 case OP_POSSTAR:
2019 case OP_NOTSTAR:
2020 case OP_NOTMINSTAR:
2021 case OP_NOTPOSSTAR:
2022 ADD_ACTIVE(state_offset + dlen + 1, 0);
2023 if (clen > 0)
2024 {
2025 unsigned int otherd = NOTACHAR;
2026 if ((ims & PCRE_CASELESS) != 0)
2027 {
2028 #ifdef SUPPORT_UTF8
2029 if (utf8 && d >= 128)
2030 {
2031 #ifdef SUPPORT_UCP
2032 otherd = UCD_OTHERCASE(d);
2033 #endif /* SUPPORT_UCP */
2034 }
2035 else
2036 #endif /* SUPPORT_UTF8 */
2037 otherd = fcc[d];
2038 }
2039 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2040 {
2041 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2042 {
2043 active_count--; /* Remove non-match possibility */
2044 next_active_state--;
2045 }
2046 ADD_NEW(state_offset, 0);
2047 }
2048 }
2049 break;
2050
2051 /*-----------------------------------------------------------------*/
2052 case OP_EXACT:
2053 case OP_NOTEXACT:
2054 count = current_state->count; /* Number already matched */
2055 if (clen > 0)
2056 {
2057 unsigned int otherd = NOTACHAR;
2058 if ((ims & PCRE_CASELESS) != 0)
2059 {
2060 #ifdef SUPPORT_UTF8
2061 if (utf8 && d >= 128)
2062 {
2063 #ifdef SUPPORT_UCP
2064 otherd = UCD_OTHERCASE(d);
2065 #endif /* SUPPORT_UCP */
2066 }
2067 else
2068 #endif /* SUPPORT_UTF8 */
2069 otherd = fcc[d];
2070 }
2071 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2072 {
2073 if (++count >= GET2(code, 1))
2074 { ADD_NEW(state_offset + dlen + 3, 0); }
2075 else
2076 { ADD_NEW(state_offset, count); }
2077 }
2078 }
2079 break;
2080
2081 /*-----------------------------------------------------------------*/
2082 case OP_UPTO:
2083 case OP_MINUPTO:
2084 case OP_POSUPTO:
2085 case OP_NOTUPTO:
2086 case OP_NOTMINUPTO:
2087 case OP_NOTPOSUPTO:
2088 ADD_ACTIVE(state_offset + dlen + 3, 0);
2089 count = current_state->count; /* Number already matched */
2090 if (clen > 0)
2091 {
2092 unsigned int otherd = NOTACHAR;
2093 if ((ims & PCRE_CASELESS) != 0)
2094 {
2095 #ifdef SUPPORT_UTF8
2096 if (utf8 && d >= 128)
2097 {
2098 #ifdef SUPPORT_UCP
2099 otherd = UCD_OTHERCASE(d);
2100 #endif /* SUPPORT_UCP */
2101 }
2102 else
2103 #endif /* SUPPORT_UTF8 */
2104 otherd = fcc[d];
2105 }
2106 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2107 {
2108 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2109 {
2110 active_count--; /* Remove non-match possibility */
2111 next_active_state--;
2112 }
2113 if (++count >= GET2(code, 1))
2114 { ADD_NEW(state_offset + dlen + 3, 0); }
2115 else
2116 { ADD_NEW(state_offset, count); }
2117 }
2118 }
2119 break;
2120
2121
2122 /* ========================================================================== */
2123 /* These are the class-handling opcodes */
2124
2125 case OP_CLASS:
2126 case OP_NCLASS:
2127 case OP_XCLASS:
2128 {
2129 BOOL isinclass = FALSE;
2130 int next_state_offset;
2131 const uschar *ecode;
2132
2133 /* For a simple class, there is always just a 32-byte table, and we
2134 can set isinclass from it. */
2135
2136 if (codevalue != OP_XCLASS)
2137 {
2138 ecode = code + 33;
2139 if (clen > 0)
2140 {
2141 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2142 ((code[1 + c/8] & (1 << (c&7))) != 0);
2143 }
2144 }
2145
2146 /* An extended class may have a table or a list of single characters,
2147 ranges, or both, and it may be positive or negative. There's a
2148 function that sorts all this out. */
2149
2150 else
2151 {
2152 ecode = code + GET(code, 1);
2153 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2154 }
2155
2156 /* At this point, isinclass is set for all kinds of class, and ecode
2157 points to the byte after the end of the class. If there is a
2158 quantifier, this is where it will be. */
2159
2160 next_state_offset = ecode - start_code;
2161
2162 switch (*ecode)
2163 {
2164 case OP_CRSTAR:
2165 case OP_CRMINSTAR:
2166 ADD_ACTIVE(next_state_offset + 1, 0);
2167 if (isinclass) { ADD_NEW(state_offset, 0); }
2168 break;
2169
2170 case OP_CRPLUS:
2171 case OP_CRMINPLUS:
2172 count = current_state->count; /* Already matched */
2173 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2174 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2175 break;
2176
2177 case OP_CRQUERY:
2178 case OP_CRMINQUERY:
2179 ADD_ACTIVE(next_state_offset + 1, 0);
2180 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2181 break;
2182
2183 case OP_CRRANGE:
2184 case OP_CRMINRANGE:
2185 count = current_state->count; /* Already matched */
2186 if (count >= GET2(ecode, 1))
2187 { ADD_ACTIVE(next_state_offset + 5, 0); }
2188 if (isinclass)
2189 {
2190 int max = GET2(ecode, 3);
2191 if (++count >= max && max != 0) /* Max 0 => no limit */
2192 { ADD_NEW(next_state_offset + 5, 0); }
2193 else
2194 { ADD_NEW(state_offset, count); }
2195 }
2196 break;
2197
2198 default:
2199 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2200 break;
2201 }
2202 }
2203 break;
2204
2205 /* ========================================================================== */
2206 /* These are the opcodes for fancy brackets of various kinds. We have
2207 to use recursion in order to handle them. The "always failing" assertion
2208 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2209 though the other "backtracking verbs" are not supported. */
2210
2211 case OP_FAIL:
2212 forced_fail++; /* Count FAILs for multiple states */
2213 break;
2214
2215 case OP_ASSERT:
2216 case OP_ASSERT_NOT:
2217 case OP_ASSERTBACK:
2218 case OP_ASSERTBACK_NOT:
2219 {
2220 int rc;
2221 int local_offsets[2];
2222 int local_workspace[1000];
2223 const uschar *endasscode = code + GET(code, 1);
2224
2225 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2226
2227 rc = internal_dfa_exec(
2228 md, /* static match data */
2229 code, /* this subexpression's code */
2230 ptr, /* where we currently are */
2231 ptr - start_subject, /* start offset */
2232 local_offsets, /* offset vector */
2233 sizeof(local_offsets)/sizeof(int), /* size of same */
2234 local_workspace, /* workspace vector */
2235 sizeof(local_workspace)/sizeof(int), /* size of same */
2236 ims, /* the current ims flags */
2237 rlevel, /* function recursion level */
2238 recursing); /* pass on regex recursion */
2239
2240 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2241 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2242 }
2243 break;
2244
2245 /*-----------------------------------------------------------------*/
2246 case OP_COND:
2247 case OP_SCOND:
2248 {
2249 int local_offsets[1000];
2250 int local_workspace[1000];
2251 int codelink = GET(code, 1);
2252 int condcode;
2253
2254 /* Because of the way auto-callout works during compile, a callout item
2255 is inserted between OP_COND and an assertion condition. This does not
2256 happen for the other conditions. */
2257
2258 if (code[LINK_SIZE+1] == OP_CALLOUT)
2259 {
2260 rrc = 0;
2261 if (pcre_callout != NULL)
2262 {
2263 pcre_callout_block cb;
2264 cb.version = 1; /* Version 1 of the callout block */
2265 cb.callout_number = code[LINK_SIZE+2];
2266 cb.offset_vector = offsets;
2267 cb.subject = (PCRE_SPTR)start_subject;
2268 cb.subject_length = end_subject - start_subject;
2269 cb.start_match = current_subject - start_subject;
2270 cb.current_position = ptr - start_subject;
2271 cb.pattern_position = GET(code, LINK_SIZE + 3);
2272 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2273 cb.capture_top = 1;
2274 cb.capture_last = -1;
2275 cb.callout_data = md->callout_data;
2276 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2277 }
2278 if (rrc > 0) break; /* Fail this thread */
2279 code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2280 }
2281
2282 condcode = code[LINK_SIZE+1];
2283
2284 /* Back reference conditions are not supported */
2285
2286 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2287
2288 /* The DEFINE condition is always false */
2289
2290 if (condcode == OP_DEF)
2291 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2292
2293 /* The only supported version of OP_RREF is for the value RREF_ANY,
2294 which means "test if in any recursion". We can't test for specifically
2295 recursed groups. */
2296
2297 else if (condcode == OP_RREF)
2298 {
2299 int value = GET2(code, LINK_SIZE+2);
2300 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2301 if (recursing > 0)
2302 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2303 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2304 }
2305
2306 /* Otherwise, the condition is an assertion */
2307
2308 else
2309 {
2310 int rc;
2311 const uschar *asscode = code + LINK_SIZE + 1;
2312 const uschar *endasscode = asscode + GET(asscode, 1);
2313
2314 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2315
2316 rc = internal_dfa_exec(
2317 md, /* fixed match data */
2318 asscode, /* this subexpression's code */
2319 ptr, /* where we currently are */
2320 ptr - start_subject, /* start offset */
2321 local_offsets, /* offset vector */
2322 sizeof(local_offsets)/sizeof(int), /* size of same */
2323 local_workspace, /* workspace vector */
2324 sizeof(local_workspace)/sizeof(int), /* size of same */
2325 ims, /* the current ims flags */
2326 rlevel, /* function recursion level */
2327 recursing); /* pass on regex recursion */
2328
2329 if ((rc >= 0) ==
2330 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2331 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2332 else
2333 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2334 }
2335 }
2336 break;
2337
2338 /*-----------------------------------------------------------------*/
2339 case OP_RECURSE:
2340 {
2341 int local_offsets[1000];
2342 int local_workspace[1000];
2343 int rc;
2344
2345 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2346 recursing + 1));
2347
2348 rc = internal_dfa_exec(
2349 md, /* fixed match data */
2350 start_code + GET(code, 1), /* this subexpression's code */
2351 ptr, /* where we currently are */
2352 ptr - start_subject, /* start offset */
2353 local_offsets, /* offset vector */
2354 sizeof(local_offsets)/sizeof(int), /* size of same */
2355 local_workspace, /* workspace vector */
2356 sizeof(local_workspace)/sizeof(int), /* size of same */
2357 ims, /* the current ims flags */
2358 rlevel, /* function recursion level */
2359 recursing + 1); /* regex recurse level */
2360
2361 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2362 recursing + 1, rc));
2363
2364 /* Ran out of internal offsets */
2365
2366 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2367
2368 /* For each successful matched substring, set up the next state with a
2369 count of characters to skip before trying it. Note that the count is in
2370 characters, not bytes. */
2371
2372 if (rc > 0)
2373 {
2374 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2375 {
2376 const uschar *p = start_subject + local_offsets[rc];
2377 const uschar *pp = start_subject + local_offsets[rc+1];
2378 int charcount = local_offsets[rc+1] - local_offsets[rc];
2379 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2380 if (charcount > 0)
2381 {
2382 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2383 }
2384 else
2385 {
2386 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2387 }
2388 }
2389 }
2390 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2391 }
2392 break;
2393
2394 /*-----------------------------------------------------------------*/
2395 case OP_ONCE:
2396 {
2397 int local_offsets[2];
2398 int local_workspace[1000];
2399
2400 int rc = internal_dfa_exec(
2401 md, /* fixed match data */
2402 code, /* this subexpression's code */
2403 ptr, /* where we currently are */
2404 ptr - start_subject, /* start offset */
2405 local_offsets, /* offset vector */
2406 sizeof(local_offsets)/sizeof(int), /* size of same */
2407 local_workspace, /* workspace vector */
2408 sizeof(local_workspace)/sizeof(int), /* size of same */
2409 ims, /* the current ims flags */
2410 rlevel, /* function recursion level */
2411 recursing); /* pass on regex recursion */
2412
2413 if (rc >= 0)
2414 {
2415 const uschar *end_subpattern = code;
2416 int charcount = local_offsets[1] - local_offsets[0];
2417 int next_state_offset, repeat_state_offset;
2418
2419 do { end_subpattern += GET(end_subpattern, 1); }
2420 while (*end_subpattern == OP_ALT);
2421 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2422
2423 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2424 arrange for the repeat state also to be added to the relevant list.
2425 Calculate the offset, or set -1 for no repeat. */
2426
2427 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2428 *end_subpattern == OP_KETRMIN)?
2429 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2430
2431 /* If we have matched an empty string, add the next state at the
2432 current character pointer. This is important so that the duplicate
2433 checking kicks in, which is what breaks infinite loops that match an
2434 empty string. */
2435
2436 if (charcount == 0)
2437 {
2438 ADD_ACTIVE(next_state_offset, 0);
2439 }
2440
2441 /* Optimization: if there are no more active states, and there
2442 are no new states yet set up, then skip over the subject string
2443 right here, to save looping. Otherwise, set up the new state to swing
2444 into action when the end of the substring is reached. */
2445
2446 else if (i + 1 >= active_count && new_count == 0)
2447 {
2448 ptr += charcount;
2449 clen = 0;
2450 ADD_NEW(next_state_offset, 0);
2451
2452 /* If we are adding a repeat state at the new character position,
2453 we must fudge things so that it is the only current state.
2454 Otherwise, it might be a duplicate of one we processed before, and
2455 that would cause it to be skipped. */
2456
2457 if (repeat_state_offset >= 0)
2458 {
2459 next_active_state = active_states;
2460 active_count = 0;
2461 i = -1;
2462 ADD_ACTIVE(repeat_state_offset, 0);
2463 }
2464 }
2465 else
2466 {
2467 const uschar *p = start_subject + local_offsets[0];
2468 const uschar *pp = start_subject + local_offsets[1];
2469 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2470 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2471 if (repeat_state_offset >= 0)
2472 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2473 }
2474
2475 }
2476 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2477 }
2478 break;
2479
2480
2481 /* ========================================================================== */
2482 /* Handle callouts */
2483
2484 case OP_CALLOUT:
2485 rrc = 0;
2486 if (pcre_callout != NULL)
2487 {
2488 pcre_callout_block cb;
2489 cb.version = 1; /* Version 1 of the callout block */
2490 cb.callout_number = code[1];
2491 cb.offset_vector = offsets;
2492 cb.subject = (PCRE_SPTR)start_subject;
2493 cb.subject_length = end_subject - start_subject;
2494 cb.start_match = current_subject - start_subject;
2495 cb.current_position = ptr - start_subject;
2496 cb.pattern_position = GET(code, 2);
2497 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2498 cb.capture_top = 1;
2499 cb.capture_last = -1;
2500 cb.callout_data = md->callout_data;
2501 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2502 }
2503 if (rrc == 0)
2504 { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2505 break;
2506
2507
2508 /* ========================================================================== */
2509 default: /* Unsupported opcode */
2510 return PCRE_ERROR_DFA_UITEM;
2511 }
2512
2513 NEXT_ACTIVE_STATE: continue;
2514
2515 } /* End of loop scanning active states */
2516
2517 /* We have finished the processing at the current subject character. If no
2518 new states have been set for the next character, we have found all the
2519 matches that we are going to find. If we are at the top level and partial
2520 matching has been requested, check for appropriate conditions. The "forced_
2521 fail" variable counts the number of (*F) encountered for the character. If it
2522 is equal to the original active_count (saved in workspace[1]) it means that
2523 (*F) was found on every active state. In this case we don't want to give a
2524 partial match. */
2525
2526 if (new_count <= 0)
2527 {
2528 if (rlevel == 1 && /* Top level, and */
2529 reached_end != workspace[1] && /* Not all reached end */
2530 forced_fail != workspace[1] && /* Not all forced fail & */
2531 ( /* either... */
2532 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2533 || /* or... */
2534 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2535 match_count < 0) /* no matches */
2536 ) && /* And... */
2537 ptr >= end_subject && /* Reached end of subject */
2538 ptr > current_subject) /* Matched non-empty string */
2539 {
2540 if (offsetcount >= 2)
2541 {
2542 offsets[0] = md->start_used_ptr - start_subject;
2543 offsets[1] = end_subject - start_subject;
2544 }
2545 match_count = PCRE_ERROR_PARTIAL;
2546 }
2547
2548 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2549 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2550 rlevel*2-2, SP));
2551 break; /* In effect, "return", but see the comment below */
2552 }
2553
2554 /* One or more states are active for the next character. */
2555
2556 ptr += clen; /* Advance to next subject character */
2557 } /* Loop to move along the subject string */
2558
2559 /* Control gets here from "break" a few lines above. We do it this way because
2560 if we use "return" above, we have compiler trouble. Some compilers warn if
2561 there's nothing here because they think the function doesn't return a value. On
2562 the other hand, if we put a dummy statement here, some more clever compilers
2563 complain that it can't be reached. Sigh. */
2564
2565 return match_count;
2566 }
2567
2568
2569
2570
2571 /*************************************************
2572 * Execute a Regular Expression - DFA engine *
2573 *************************************************/
2574
2575 /* This external function applies a compiled re to a subject string using a DFA
2576 engine. This function calls the internal function multiple times if the pattern
2577 is not anchored.
2578
2579 Arguments:
2580 argument_re points to the compiled expression
2581 extra_data points to extra data or is NULL
2582 subject points to the subject string
2583 length length of subject string (may contain binary zeros)
2584 start_offset where to start in the subject string
2585 options option bits
2586 offsets vector of match offsets
2587 offsetcount size of same
2588 workspace workspace vector
2589 wscount size of same
2590
2591 Returns: > 0 => number of match offset pairs placed in offsets
2592 = 0 => offsets overflowed; longest matches are present
2593 -1 => failed to match
2594 < -1 => some kind of unexpected problem
2595 */
2596
2597 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2598 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2599 const char *subject, int length, int start_offset, int options, int *offsets,
2600 int offsetcount, int *workspace, int wscount)
2601 {
2602 real_pcre *re = (real_pcre *)argument_re;
2603 dfa_match_data match_block;
2604 dfa_match_data *md = &match_block;
2605 BOOL utf8, anchored, startline, firstline;
2606 const uschar *current_subject, *end_subject, *lcc;
2607
2608 pcre_study_data internal_study;
2609 const pcre_study_data *study = NULL;
2610 real_pcre internal_re;
2611
2612 const uschar *req_byte_ptr;
2613 const uschar *start_bits = NULL;
2614 BOOL first_byte_caseless = FALSE;
2615 BOOL req_byte_caseless = FALSE;
2616 int first_byte = -1;
2617 int req_byte = -1;
2618 int req_byte2 = -1;
2619 int newline;
2620
2621 /* Plausibility checks */
2622
2623 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2624 if (re == NULL || subject == NULL || workspace == NULL ||
2625 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2626 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2627 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2628
2629 /* We need to find the pointer to any study data before we test for byte
2630 flipping, so we scan the extra_data block first. This may set two fields in the
2631 match block, so we must initialize them beforehand. However, the other fields
2632 in the match block must not be set until after the byte flipping. */
2633
2634 md->tables = re->tables;
2635 md->callout_data = NULL;
2636
2637 if (extra_data != NULL)
2638 {
2639 unsigned int flags = extra_data->flags;
2640 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2641 study = (const pcre_study_data *)extra_data->study_data;
2642 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2643 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2644 return PCRE_ERROR_DFA_UMLIMIT;
2645 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2646 md->callout_data = extra_data->callout_data;
2647 if ((flags & PCRE_EXTRA_TABLES) != 0)
2648 md->tables = extra_data->tables;
2649 }
2650
2651 /* Check that the first field in the block is the magic number. If it is not,
2652 test for a regex that was compiled on a host of opposite endianness. If this is
2653 the case, flipped values are put in internal_re and internal_study if there was
2654 study data too. */
2655
2656 if (re->magic_number != MAGIC_NUMBER)
2657 {
2658 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2659 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2660 if (study != NULL) study = &internal_study;
2661 }
2662
2663 /* Set some local values */
2664
2665 current_subject = (const unsigned char *)subject + start_offset;
2666 end_subject = (const unsigned char *)subject + length;
2667 req_byte_ptr = current_subject - 1;
2668
2669 #ifdef SUPPORT_UTF8
2670 utf8 = (re->options & PCRE_UTF8) != 0;
2671 #else
2672 utf8 = FALSE;
2673 #endif
2674
2675 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2676 (re->options & PCRE_ANCHORED) != 0;
2677
2678 /* The remaining fixed data for passing around. */
2679
2680 md->start_code = (const uschar *)argument_re +
2681 re->name_table_offset + re->name_count * re->name_entry_size;
2682 md->start_subject = (const unsigned char *)subject;
2683 md->end_subject = end_subject;
2684 md->moptions = options;
2685 md->poptions = re->options;
2686
2687 /* If the BSR option is not set at match time, copy what was set
2688 at compile time. */
2689
2690 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2691 {
2692 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2693 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2694 #ifdef BSR_ANYCRLF
2695 else md->moptions |= PCRE_BSR_ANYCRLF;
2696 #endif
2697 }
2698
2699 /* Handle different types of newline. The three bits give eight cases. If
2700 nothing is set at run time, whatever was used at compile time applies. */
2701
2702 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2703 PCRE_NEWLINE_BITS)
2704 {
2705 case 0: newline = NEWLINE; break; /* Compile-time default */
2706 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2707 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2708 case PCRE_NEWLINE_CR+
2709 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2710 case PCRE_NEWLINE_ANY: newline = -1; break;
2711 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2712 default: return PCRE_ERROR_BADNEWLINE;
2713 }
2714
2715 if (newline == -2)
2716 {
2717 md->nltype = NLTYPE_ANYCRLF;
2718 }
2719 else if (newline < 0)
2720 {
2721 md->nltype = NLTYPE_ANY;
2722 }
2723 else
2724 {
2725 md->nltype = NLTYPE_FIXED;
2726 if (newline > 255)
2727 {
2728 md->nllen = 2;
2729 md->nl[0] = (newline >> 8) & 255;
2730 md->nl[1] = newline & 255;
2731 }
2732 else
2733 {
2734 md->nllen = 1;
2735 md->nl[0] = newline;
2736 }
2737 }
2738
2739 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2740 back the character offset. */
2741
2742 #ifdef SUPPORT_UTF8
2743 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2744 {
2745 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2746 return PCRE_ERROR_BADUTF8;
2747 if (start_offset > 0 && start_offset < length)
2748 {
2749 int tb = ((uschar *)subject)[start_offset];
2750 if (tb > 127)
2751 {
2752 tb &= 0xc0;
2753 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2754 }
2755 }
2756 }
2757 #endif
2758
2759 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2760 is a feature that makes it possible to save compiled regex and re-use them
2761 in other programs later. */
2762
2763 if (md->tables == NULL) md->tables = _pcre_default_tables;
2764
2765 /* The lower casing table and the "must be at the start of a line" flag are
2766 used in a loop when finding where to start. */
2767
2768 lcc = md->tables + lcc_offset;
2769 startline = (re->flags & PCRE_STARTLINE) != 0;
2770 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2771
2772 /* Set up the first character to match, if available. The first_byte value is
2773 never set for an anchored regular expression, but the anchoring may be forced
2774 at run time, so we have to test for anchoring. The first char may be unset for
2775 an unanchored pattern, of course. If there's no first char and the pattern was
2776 studied, there may be a bitmap of possible first characters. */
2777
2778 if (!anchored)
2779 {
2780 if ((re->flags & PCRE_FIRSTSET) != 0)
2781 {
2782 first_byte = re->first_byte & 255;
2783 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2784 first_byte = lcc[first_byte];
2785 }
2786 else
2787 {
2788 if (startline && study != NULL &&
2789 (study->options & PCRE_STUDY_MAPPED) != 0)
2790 start_bits = study->start_bits;
2791 }
2792 }
2793
2794 /* For anchored or unanchored matches, there may be a "last known required
2795 character" set. */
2796
2797 if ((re->flags & PCRE_REQCHSET) != 0)
2798 {
2799 req_byte = re->req_byte & 255;
2800 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2801 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2802 }
2803
2804 /* Call the main matching function, looping for a non-anchored regex after a
2805 failed match. If not restarting, perform certain optimizations at the start of
2806 a match. */
2807
2808 for (;;)
2809 {
2810 int rc;
2811
2812 if ((options & PCRE_DFA_RESTART) == 0)
2813 {
2814 const uschar *save_end_subject = end_subject;
2815
2816 /* If firstline is TRUE, the start of the match is constrained to the first
2817 line of a multiline string. Implement this by temporarily adjusting
2818 end_subject so that we stop scanning at a newline. If the match fails at
2819 the newline, later code breaks this loop. */
2820
2821 if (firstline)
2822 {
2823 USPTR t = current_subject;
2824 #ifdef SUPPORT_UTF8
2825 if (utf8)
2826 {
2827 while (t < md->end_subject && !IS_NEWLINE(t))
2828 {
2829 t++;
2830 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2831 }
2832 }
2833 else
2834 #endif
2835 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2836 end_subject = t;
2837 }
2838
2839 /* There are some optimizations that avoid running the match if a known
2840 starting point is not found, or if a known later character is not present.
2841 However, there is an option that disables these, for testing and for
2842 ensuring that all callouts do actually occur. */
2843
2844 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2845 {
2846
2847 /* Advance to a known first byte. */
2848
2849 if (first_byte >= 0)
2850 {
2851 if (first_byte_caseless)
2852 while (current_subject < end_subject &&
2853 lcc[*current_subject] != first_byte)
2854 current_subject++;
2855 else
2856 while (current_subject < end_subject &&
2857 *current_subject != first_byte)
2858 current_subject++;
2859 }
2860
2861 /* Or to just after a linebreak for a multiline match if possible */
2862
2863 else if (startline)
2864 {
2865 if (current_subject > md->start_subject + start_offset)
2866 {
2867 #ifdef SUPPORT_UTF8
2868 if (utf8)
2869 {
2870 while (current_subject < end_subject &&
2871 !WAS_NEWLINE(current_subject))
2872 {
2873 current_subject++;
2874 while(current_subject < end_subject &&
2875 (*current_subject & 0xc0) == 0x80)
2876 current_subject++;
2877 }
2878 }
2879 else
2880 #endif
2881 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2882 current_subject++;
2883
2884 /* If we have just passed a CR and the newline option is ANY or
2885 ANYCRLF, and we are now at a LF, advance the match position by one
2886 more character. */
2887
2888 if (current_subject[-1] == CHAR_CR &&
2889 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2890 current_subject < end_subject &&
2891 *current_subject == CHAR_NL)
2892 current_subject++;
2893 }
2894 }
2895
2896 /* Or to a non-unique first char after study */
2897
2898 else if (start_bits != NULL)
2899 {
2900 while (current_subject < end_subject)
2901 {
2902 register unsigned int c = *current_subject;
2903 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2904 else break;
2905 }
2906 }
2907 }
2908
2909 /* Restore fudged end_subject */
2910
2911 end_subject = save_end_subject;
2912 }
2913
2914 /* If req_byte is set, we know that that character must appear in the subject
2915 for the match to succeed. If the first character is set, req_byte must be
2916 later in the subject; otherwise the test starts at the match point. This
2917 optimization can save a huge amount of work in patterns with nested unlimited
2918 repeats that aren't going to match. Writing separate code for cased/caseless
2919 versions makes it go faster, as does using an autoincrement and backing off
2920 on a match.
2921
2922 HOWEVER: when the subject string is very, very long, searching to its end can
2923 take a long time, and give bad performance on quite ordinary patterns. This
2924 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2925 don't do this when the string is sufficiently long.
2926
2927 ALSO: this processing is disabled when partial matching is requested, and can
2928 also be explicitly deactivated. Furthermore, we have to disable when
2929 restarting after a partial match, because the required character may have
2930 already been matched. */
2931
2932 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2933 req_byte >= 0 &&
2934 end_subject - current_subject < REQ_BYTE_MAX &&
2935 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2936 {
2937 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2938
2939 /* We don't need to repeat the search if we haven't yet reached the
2940 place we found it at last time. */
2941
2942 if (p > req_byte_ptr)
2943 {
2944 if (req_byte_caseless)
2945 {
2946 while (p < end_subject)
2947 {
2948 register int pp = *p++;
2949 if (pp == req_byte || pp == req_byte2) { p--; break; }
2950 }
2951 }
2952 else
2953 {
2954 while (p < end_subject)
2955 {
2956 if (*p++ == req_byte) { p--; break; }
2957 }
2958 }
2959
2960 /* If we can't find the required character, break the matching loop,
2961 which will cause a return or PCRE_ERROR_NOMATCH. */
2962
2963 if (p >= end_subject) break;
2964
2965 /* If we have found the required character, save the point where we
2966 found it, so that we don't search again next time round the loop if
2967 the start hasn't passed this character yet. */
2968
2969 req_byte_ptr = p;
2970 }
2971 }
2972
2973 /* OK, now we can do the business */
2974
2975 md->start_used_ptr = current_subject;
2976
2977 rc = internal_dfa_exec(
2978 md, /* fixed match data */
2979 md->start_code, /* this subexpression's code */
2980 current_subject, /* where we currently are */
2981 start_offset, /* start offset in subject */
2982 offsets, /* offset vector */
2983 offsetcount, /* size of same */
2984 workspace, /* workspace vector */
2985 wscount, /* size of same */
2986 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2987 0, /* function recurse level */
2988 0); /* regex recurse level */
2989
2990 /* Anything other than "no match" means we are done, always; otherwise, carry
2991 on only if not anchored. */
2992
2993 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2994
2995 /* Advance to the next subject character unless we are at the end of a line
2996 and firstline is set. */
2997
2998 if (firstline && IS_NEWLINE(current_subject)) break;
2999 current_subject++;
3000 if (utf8)
3001 {
3002 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3003 current_subject++;
3004 }
3005 if (current_subject > end_subject) break;
3006
3007 /* If we have just passed a CR and we are now at a LF, and the pattern does
3008 not contain any explicit matches for \r or \n, and the newline option is CRLF
3009 or ANY or ANYCRLF, advance the match position by one more character. */
3010
3011 if (current_subject[-1] == CHAR_CR &&
3012 current_subject < end_subject &&
3013 *current_subject == CHAR_NL &&
3014 (re->flags & PCRE_HASCRORLF) == 0 &&
3015 (md->nltype == NLTYPE_ANY ||
3016 md->nltype == NLTYPE_ANYCRLF ||
3017 md->nllen == 2))
3018 current_subject++;
3019
3020 } /* "Bumpalong" loop */
3021
3022 return PCRE_ERROR_NOMATCH;
3023 }
3024
3025 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5