/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1144 - (show annotations)
Fri Oct 19 16:19:55 2012 UTC (7 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 123971 byte(s)
Clean up compiler warnings in pcre_dfa_exec.c in all modes.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2012 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
51
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
60
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
69
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
72
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK md /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre_internal.h"
84
85
86 /* For use to indent debugging output */
87
88 #define SP " "
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
99
100 #define OP_PROP_EXTRA 300 /* Added to a type-repeat opcode whose argument is \P or \p */
101 #define OP_EXTUNI_EXTRA 320 /* Added to a type-repeat opcode whose argument is \X */
102 #define OP_ANYNL_EXTRA 340 /* Added to a type-repeat opcode whose argument is \R */
103 #define OP_HSPACE_EXTRA 360 /* Added to a type-repeat opcode whose argument is \H or \h */
104 #define OP_VSPACE_EXTRA 380 /* Added to a type-repeat opcode whose argument is \V or \v */
105
106
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114
115 static const pcre_uint8 coptable[] = { /* Indexed by opcode: offset from the opcode to its inline character, or 0 if it has none */
116 0, /* End */
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
120 0, 0, /* \P, \p */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 0, /* \X */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
124 1, /* Char - the character immediately follows the opcode */
125 1, /* Chari */
126 1, /* not */
127 1, /* noti */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto - character follows an IMM2_SIZE field */
131 1+IMM2_SIZE, /* exact */
132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135 1+IMM2_SIZE, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140 1+IMM2_SIZE, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144 1+IMM2_SIZE, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats - the "character" loaded is the type opcode (\d, \w, etc) */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149 1+IMM2_SIZE, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats - no inline character is loaded for these */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
154 0, /* CLASS */
155 0, /* NCLASS */
156 0, /* XCLASS - variable length */
157 0, /* REF */
158 0, /* REFI */
159 0, /* RECURSE */
160 0, /* CALLOUT */
161 0, /* Alt */
162 0, /* Ket */
163 0, /* KetRmax */
164 0, /* KetRmin */
165 0, /* KetRpos */
166 0, /* Reverse */
167 0, /* Assert */
168 0, /* Assert not */
169 0, /* Assert behind */
170 0, /* Assert behind not */
171 0, 0, /* ONCE, ONCE_NC */
172 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
173 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
174 0, 0, /* CREF, NCREF */
175 0, 0, /* RREF, NRREF */
176 0, /* DEF */
177 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
178 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
179 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
180 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
181 0, 0 /* CLOSE, SKIPZERO */
182 };
183
184 /* This table identifies those opcodes that inspect a character. It is used to
185 remember the fact that a character could have been inspected when the end of
186 the subject is reached. ***NOTE*** If the start of this table is modified, the
187 two tables that follow must also be modified. */
188
189 static const pcre_uint8 poptable[] = { /* Indexed by opcode: non-zero if the opcode inspects a subject character */
190 0, /* End */
191 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
192 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
193 1, 1, 1, /* Any, AllAny, Anybyte */
194 1, 1, /* \P, \p */
195 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
196 1, /* \X */
197 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
198 1, /* Char */
199 1, /* Chari */
200 1, /* not */
201 1, /* noti */
202 /* Positive single-char repeats */
203 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
204 1, 1, 1, /* upto, minupto, exact */
205 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
206 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
207 1, 1, 1, /* upto I, minupto I, exact I */
208 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
209 /* Negative single-char repeats - only for chars < 256 */
210 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
211 1, 1, 1, /* NOT upto, minupto, exact */
212 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
213 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
214 1, 1, 1, /* NOT upto I, minupto I, exact I */
215 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
216 /* Positive type repeats */
217 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
218 1, 1, 1, /* Type upto, minupto, exact */
219 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
220 /* Character class & ref repeats */
221 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
222 1, 1, /* CRRANGE, CRMINRANGE */
223 1, /* CLASS */
224 1, /* NCLASS */
225 1, /* XCLASS - variable length */
226 0, /* REF */
227 0, /* REFI */
228 0, /* RECURSE */
229 0, /* CALLOUT */
230 0, /* Alt */
231 0, /* Ket */
232 0, /* KetRmax */
233 0, /* KetRmin */
234 0, /* KetRpos */
235 0, /* Reverse */
236 0, /* Assert */
237 0, /* Assert not */
238 0, /* Assert behind */
239 0, /* Assert behind not */
240 0, 0, /* ONCE, ONCE_NC */
241 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
242 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
243 0, 0, /* CREF, NCREF */
244 0, 0, /* RREF, NRREF */
245 0, /* DEF */
246 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
247 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
248 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
249 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
250 0, 0 /* CLOSE, SKIPZERO */
251 };
252
253 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254 and \w */
255
256 static const pcre_uint8 toptable1[] = { /* ctype bit associated with each type opcode; entries are in opcode order */
257 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b - no ctype bit */
258 ctype_digit, ctype_digit, /* \D, \d */
259 ctype_space, ctype_space, /* \S, \s */
260 ctype_word, ctype_word, /* \W, \w */
261 0, 0 /* OP_ANY, OP_ALLANY */
262 };
263
264 static const pcre_uint8 toptable2[] = { /* Companion to toptable1, same opcode order. NOTE(review): presumably combined with the toptable1 result so that the negated types invert the test - confirm at the use site */
265 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
266 ctype_digit, 0, /* \D, \d - only the negated form has the bit set */
267 ctype_space, 0, /* \S, \s */
268 ctype_word, 0, /* \W, \w */
269 1, 1 /* OP_ANY, OP_ALLANY */
270 };
271
272
273 /* Structure for holding data about a particular state, which is in effect the
274 current data for an active path through the match tree. It must consist
275 entirely of ints because the working vector we are passed, and which we put
276 these structures in, is a vector of ints. */
277
278 typedef struct stateblock {
279 int offset; /* Offset of the opcode from start_code; held negated while the state is waiting for characters to be skipped */
280 int count; /* Count for repeats; compared together with offset when checking for duplicate states */
281 int data; /* Extra data used by some states, e.g. the number of characters still to skip when offset is negative */
282 } stateblock;
283
284 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) /* Number of ints one stateblock occupies in the int workspace vector */
285
286
287 #ifdef PCRE_DEBUG
288 /*************************************************
289 * Print character string *
290 *************************************************/
291
292 /* Character string printing function for debugging.
293
294 Arguments:
295 p points to string
296 length number of bytes
297 f where to print
298
299 Returns: nothing
300 */
301
302 static void
303 pchars(const pcre_uchar *p, int length, FILE *f)
304 {
305 pcre_uint32 c;
306 while (length-- > 0)
307 {
308 if (isprint(c = *(p++)))
309 fprintf(f, "%c", c);
310 else
311 fprintf(f, "\\x{%02x}", c);
312 }
313 }
314 #endif
315
316
317
318 /*************************************************
319 * Execute a Regular Expression - DFA engine *
320 *************************************************/
321
322 /* This internal function applies a compiled pattern to a subject string,
323 starting at a given point, using a DFA engine. This function is called from the
324 external one, possibly multiple times if the pattern is not anchored. The
325 function calls itself recursively for some kinds of subpattern.
326
327 Arguments:
328 md the match_data block with fixed information
329 this_start_code the opening bracket of this subexpression's code
330 current_subject where we currently are in the subject string
331 start_offset start offset in the subject string
332 offsets vector to contain the matching string offsets
333 offsetcount size of same
334 workspace vector of workspace
335 wscount size of same
336 rlevel function call recursion level
337
338 Returns: > 0 => number of match offset pairs placed in offsets
339 = 0 => offsets overflowed; longest matches are present
340 -1 => failed to match
341 < -1 => some kind of unexpected problem
342
343 The following macros are used for adding states to the two state vectors (one
344 for the current character, one for the following character). */
345
/* Append a state with opcode offset x and repeat count y to the end of the
active list. If the list is already full, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The do/while(0) wrapper makes the expansion a single
statement; a terminating semicolon is still required at the point of use. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
355
/* As ADD_ACTIVE, but also stores z in the data field of the new state: append
(offset x, count y, data z) to the active list, or make the enclosing function
return PCRE_ERROR_DFA_WSSIZE if the list is full. The do/while(0) wrapper makes
the expansion a single statement. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->data = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
366
/* Append a state with opcode offset x and repeat count y to the new-state
list (the one used for the following character). If that list is already full,
the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The do/while(0) wrapper
makes the expansion a single statement. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
376
/* As ADD_NEW, but also stores z in the data field of the new state: append
(offset x, count y, data z) to the new-state list, or make the enclosing
function return PCRE_ERROR_DFA_WSSIZE if the list is full. The debug trace
includes the source line of the call. The do/while(0) wrapper makes the
expansion a single statement. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->data = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
        (x), (y), (z), __LINE__)); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
388
389 /* And now, here is the code */
390
391 static int
392 internal_dfa_exec(
393 dfa_match_data *md,
394 const pcre_uchar *this_start_code,
395 const pcre_uchar *current_subject,
396 int start_offset,
397 int *offsets,
398 int offsetcount,
399 int *workspace,
400 int wscount,
401 int rlevel)
402 {
403 stateblock *active_states, *new_states, *temp_states;
404 stateblock *next_active_state, *next_new_state;
405
406 const pcre_uint8 *ctypes, *lcc, *fcc;
407 const pcre_uchar *ptr;
408 const pcre_uchar *end_code, *first_op;
409
410 dfa_recursion_info new_recursive;
411
412 int active_count, new_count, match_count;
413
414 /* Some fields in the md block are frequently referenced, so we load them into
415 independent variables in the hope that this will perform better. */
416
417 const pcre_uchar *start_subject = md->start_subject;
418 const pcre_uchar *end_subject = md->end_subject;
419 const pcre_uchar *start_code = md->start_code;
420
421 #ifdef SUPPORT_UTF
422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423 #else
424 BOOL utf = FALSE;
425 #endif
426
427 BOOL reset_could_continue = FALSE;
428
429 rlevel++;
430 offsetcount &= (-2);
431
432 wscount -= 2;
433 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
434 (2 * INTS_PER_STATEBLOCK);
435
436 DPRINTF(("\n%.*s---------------------\n"
437 "%.*sCall to internal_dfa_exec f=%d\n",
438 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439
440 ctypes = md->tables + ctypes_offset;
441 lcc = md->tables + lcc_offset;
442 fcc = md->tables + fcc_offset;
443
444 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
445
446 active_states = (stateblock *)(workspace + 2);
447 next_new_state = new_states = active_states + wscount;
448 new_count = 0;
449
450 first_op = this_start_code + 1 + LINK_SIZE +
451 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453 ? IMM2_SIZE:0);
454
455 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456 the alternative states onto the list, and find out where the end is. This
457 makes it possible to use this function recursively, when we want to stop at a
458 matching internal ket rather than at the end.
459
460 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
461 a backward assertion. In that case, we have to find out the maximum amount to
462 move back, and set up each alternative appropriately. */
463
464 if (*first_op == OP_REVERSE)
465 {
466 int max_back = 0;
467 int gone_back;
468
469 end_code = this_start_code;
470 do
471 {
472 int back = GET(end_code, 2+LINK_SIZE);
473 if (back > max_back) max_back = back;
474 end_code += GET(end_code, 1);
475 }
476 while (*end_code == OP_ALT);
477
478 /* If we can't go back the amount required for the longest lookbehind
479 pattern, go back as far as we can; some alternatives may still be viable. */
480
481 #ifdef SUPPORT_UTF
482 /* In character mode we have to step back character by character */
483
484 if (utf)
485 {
486 for (gone_back = 0; gone_back < max_back; gone_back++)
487 {
488 if (current_subject <= start_subject) break;
489 current_subject--;
490 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
491 }
492 }
493 else
494 #endif
495
496 /* In byte-mode we can do this quickly. */
497
498 {
499 gone_back = (current_subject - max_back < start_subject)?
500 (int)(current_subject - start_subject) : max_back;
501 current_subject -= gone_back;
502 }
503
504 /* Save the earliest consulted character */
505
506 if (current_subject < md->start_used_ptr)
507 md->start_used_ptr = current_subject;
508
509 /* Now we can process the individual branches. */
510
511 end_code = this_start_code;
512 do
513 {
514 int back = GET(end_code, 2+LINK_SIZE);
515 if (back <= gone_back)
516 {
517 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518 ADD_NEW_DATA(-bstate, 0, gone_back - back);
519 }
520 end_code += GET(end_code, 1);
521 }
522 while (*end_code == OP_ALT);
523 }
524
525 /* This is the code for a "normal" subpattern (not a backward assertion). The
526 start of a whole pattern is always one of these. If we are at the top level,
527 we may be asked to restart matching from the same point that we reached for a
528 previous partial match. We still have to scan through the top-level branches to
529 find the end state. */
530
531 else
532 {
533 end_code = this_start_code;
534
535 /* Restarting */
536
537 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
538 {
539 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
540 new_count = workspace[1];
541 if (!workspace[0])
542 memcpy(new_states, active_states, new_count * sizeof(stateblock));
543 }
544
545 /* Not restarting */
546
547 else
548 {
549 int length = 1 + LINK_SIZE +
550 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552 ? IMM2_SIZE:0);
553 do
554 {
555 ADD_NEW((int)(end_code - start_code + length), 0);
556 end_code += GET(end_code, 1);
557 length = 1 + LINK_SIZE;
558 }
559 while (*end_code == OP_ALT);
560 }
561 }
562
563 workspace[0] = 0; /* Bit indicating which vector is current */
564
565 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566
567 /* Loop for scanning the subject */
568
569 ptr = current_subject;
570 for (;;)
571 {
572 int i, j;
573 int clen, dlen;
574 pcre_uint32 c, d;
575 int forced_fail = 0;
576 BOOL partial_newline = FALSE;
577 BOOL could_continue = reset_could_continue;
578 reset_could_continue = FALSE;
579
580 /* Make the new state list into the active state list and empty the
581 new state list. */
582
583 temp_states = active_states;
584 active_states = new_states;
585 new_states = temp_states;
586 active_count = new_count;
587 new_count = 0;
588
589 workspace[0] ^= 1; /* Remember for the restarting feature */
590 workspace[1] = active_count;
591
592 #ifdef PCRE_DEBUG
593 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594 pchars(ptr, STRLEN_UC(ptr), stdout);
595 printf("\"\n");
596
597 printf("%.*sActive states: ", rlevel*2-2, SP);
598 for (i = 0; i < active_count; i++)
599 printf("%d/%d ", active_states[i].offset, active_states[i].count);
600 printf("\n");
601 #endif
602
603 /* Set the pointers for adding new states */
604
605 next_active_state = active_states + active_count;
606 next_new_state = new_states;
607
608 /* Load the current character from the subject outside the loop, as many
609 different states may want to look at it, and we assume that at least one
610 will. */
611
612 if (ptr < end_subject)
613 {
614 clen = 1; /* Number of data items in the character */
615 #ifdef SUPPORT_UTF
616 GETCHARLENTEST(c, ptr, clen);
617 #else
618 c = *ptr;
619 #endif /* SUPPORT_UTF */
620 }
621 else
622 {
623 clen = 0; /* This indicates the end of the subject */
624 c = NOTACHAR; /* This value should never actually be used */
625 }
626
627 /* Scan up the active states and act on each one. The result of an action
628 may be to add more states to the currently active list (e.g. on hitting a
629 parenthesis) or it may be to put states on the new list, for considering
630 when we move the character pointer on. */
631
632 for (i = 0; i < active_count; i++)
633 {
634 stateblock *current_state = active_states + i;
635 BOOL caseless = FALSE;
636 const pcre_uchar *code;
637 int state_offset = current_state->offset;
638 int codevalue, rrc;
639 unsigned int count;
640
641 #ifdef PCRE_DEBUG
642 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
643 if (clen == 0) printf("EOL\n");
644 else if (c > 32 && c < 127) printf("'%c'\n", c);
645 else printf("0x%02x\n", c);
646 #endif
647
648 /* A negative offset is a special case meaning "hold off going to this
649 (negated) state until the number of characters in the data field have
650 been skipped". If the could_continue flag was passed over from a previous
651 state, arrange for it to be passed on. */
652
653 if (state_offset < 0)
654 {
655 if (current_state->data > 0)
656 {
657 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
658 ADD_NEW_DATA(state_offset, current_state->count,
659 current_state->data - 1);
660 if (could_continue) reset_could_continue = TRUE;
661 continue;
662 }
663 else
664 {
665 current_state->offset = state_offset = -state_offset;
666 }
667 }
668
669 /* Check for a duplicate state with the same count, and skip if found.
670 See the note at the head of this module about the possibility of improving
671 performance here. */
672
673 for (j = 0; j < i; j++)
674 {
675 if (active_states[j].offset == state_offset &&
676 active_states[j].count == current_state->count)
677 {
678 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
679 goto NEXT_ACTIVE_STATE;
680 }
681 }
682
683 /* The state offset is the offset to the opcode */
684
685 code = start_code + state_offset;
686 codevalue = *code;
687
688 /* If this opcode inspects a character, but we are at the end of the
689 subject, remember the fact for use when testing for a partial match. */
690
691 if (clen == 0 && poptable[codevalue] != 0)
692 could_continue = TRUE;
693
694 /* If this opcode is followed by an inline character, load it. It is
695 tempting to test for the presence of a subject character here, but that
696 is wrong, because sometimes zero repetitions of the subject are
697 permitted.
698
699 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
700 argument that is not a data character - but is always one byte long because
701 the values are small. We have to take special action to deal with \P, \p,
702 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
703 these ones to new opcodes. */
704
705 if (coptable[codevalue] > 0)
706 {
707 dlen = 1;
708 #ifdef SUPPORT_UTF
709 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
710 #endif /* SUPPORT_UTF */
711 d = code[coptable[codevalue]];
712 if (codevalue >= OP_TYPESTAR)
713 {
714 switch(d)
715 {
716 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
717 case OP_NOTPROP:
718 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
719 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
720 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
721 case OP_NOT_HSPACE:
722 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
723 case OP_NOT_VSPACE:
724 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
725 default: break;
726 }
727 }
728 }
729 else
730 {
731 dlen = 0; /* Not strictly necessary, but compilers moan */
732 d = NOTACHAR; /* if these variables are not set. */
733 }
734
735
736 /* Now process the individual opcodes */
737
738 switch (codevalue)
739 {
740 /* ========================================================================== */
741 /* These cases are never obeyed. This is a fudge that causes a compile-
742 time error if the vectors coptable or poptable, which are indexed by
743 opcode, are not the correct length. It seems to be the only way to do
744 such a check at compile time, as the sizeof() operator does not work
745 in the C preprocessor. */
746
747 case OP_TABLE_LENGTH:
748 case OP_TABLE_LENGTH +
749 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
750 (sizeof(poptable) == OP_TABLE_LENGTH)):
751 break;
752
753 /* ========================================================================== */
754 /* Reached a closing bracket. If not at the end of the pattern, carry
755 on with the next opcode. For repeating opcodes, also add the repeat
756 state. Note that KETRPOS will always be encountered at the end of the
757 subpattern, because the possessive subpattern repeats are always handled
758 using recursive calls. Thus, it never adds any new states.
759
760 At the end of the (sub)pattern, unless we have an empty string and
761 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
762 start of the subject, save the match data, shifting up all previous
763 matches so we always have the longest first. */
764
765 case OP_KET:
766 case OP_KETRMIN:
767 case OP_KETRMAX:
768 case OP_KETRPOS:
769 if (code != end_code)
770 {
771 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
772 if (codevalue != OP_KET)
773 {
774 ADD_ACTIVE(state_offset - GET(code, 1), 0);
775 }
776 }
777 else
778 {
779 if (ptr > current_subject ||
780 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
781 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
782 current_subject > start_subject + md->start_offset)))
783 {
784 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
785 else if (match_count > 0 && ++match_count * 2 > offsetcount)
786 match_count = 0;
787 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
788 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
789 if (offsetcount >= 2)
790 {
791 offsets[0] = (int)(current_subject - start_subject);
792 offsets[1] = (int)(ptr - start_subject);
793 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
794 offsets[1] - offsets[0], (char *)current_subject));
795 }
796 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
797 {
798 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
799 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
800 match_count, rlevel*2-2, SP));
801 return match_count;
802 }
803 }
804 }
805 break;
806
807 /* ========================================================================== */
808 /* These opcodes add to the current list of states without looking
809 at the current character. */
810
811 /*-----------------------------------------------------------------*/
812 case OP_ALT:
813 do { code += GET(code, 1); } while (*code == OP_ALT);
814 ADD_ACTIVE((int)(code - start_code), 0);
815 break;
816
817 /*-----------------------------------------------------------------*/
818 case OP_BRA:
819 case OP_SBRA:
820 do
821 {
822 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
823 code += GET(code, 1);
824 }
825 while (*code == OP_ALT);
826 break;
827
828 /*-----------------------------------------------------------------*/
829 case OP_CBRA:
830 case OP_SCBRA:
831 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
832 code += GET(code, 1);
833 while (*code == OP_ALT)
834 {
835 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
836 code += GET(code, 1);
837 }
838 break;
839
840 /*-----------------------------------------------------------------*/
841 case OP_BRAZERO:
842 case OP_BRAMINZERO:
843 ADD_ACTIVE(state_offset + 1, 0);
844 code += 1 + GET(code, 2);
845 while (*code == OP_ALT) code += GET(code, 1);
846 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
847 break;
848
849 /*-----------------------------------------------------------------*/
850 case OP_SKIPZERO:
851 code += 1 + GET(code, 2);
852 while (*code == OP_ALT) code += GET(code, 1);
853 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
854 break;
855
856 /*-----------------------------------------------------------------*/
857 case OP_CIRC:
858 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
859 { ADD_ACTIVE(state_offset + 1, 0); }
860 break;
861
862 /*-----------------------------------------------------------------*/
863 case OP_CIRCM:
864 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
865 (ptr != end_subject && WAS_NEWLINE(ptr)))
866 { ADD_ACTIVE(state_offset + 1, 0); }
867 break;
868
869 /*-----------------------------------------------------------------*/
870 case OP_EOD:
871 if (ptr >= end_subject)
872 {
873 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
874 could_continue = TRUE;
875 else { ADD_ACTIVE(state_offset + 1, 0); }
876 }
877 break;
878
879 /*-----------------------------------------------------------------*/
880 case OP_SOD:
881 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
882 break;
883
884 /*-----------------------------------------------------------------*/
885 case OP_SOM:
886 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
887 break;
888
889
890 /* ========================================================================== */
891 /* These opcodes inspect the next subject character, and sometimes
892 the previous one as well, but do not have an argument. The variable
893 clen contains the length of the current character and is zero if we are
894 at the end of the subject. */
895
896 /*-----------------------------------------------------------------*/
897 case OP_ANY:
898 if (clen > 0 && !IS_NEWLINE(ptr))
899 {
900 if (ptr + 1 >= md->end_subject &&
901 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
902 NLBLOCK->nltype == NLTYPE_FIXED &&
903 NLBLOCK->nllen == 2 &&
904 c == NLBLOCK->nl[0])
905 {
906 could_continue = partial_newline = TRUE;
907 }
908 else
909 {
910 ADD_NEW(state_offset + 1, 0);
911 }
912 }
913 break;
914
915 /*-----------------------------------------------------------------*/
916 case OP_ALLANY:
917 if (clen > 0)
918 { ADD_NEW(state_offset + 1, 0); }
919 break;
920
921 /*-----------------------------------------------------------------*/
922 case OP_EODN:
923 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
924 could_continue = TRUE;
925 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
926 { ADD_ACTIVE(state_offset + 1, 0); }
927 break;
928
929 /*-----------------------------------------------------------------*/
930 case OP_DOLL:
931 if ((md->moptions & PCRE_NOTEOL) == 0)
932 {
933 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
934 could_continue = TRUE;
935 else if (clen == 0 ||
936 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
937 (ptr == end_subject - md->nllen)
938 ))
939 { ADD_ACTIVE(state_offset + 1, 0); }
940 else if (ptr + 1 >= md->end_subject &&
941 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
942 NLBLOCK->nltype == NLTYPE_FIXED &&
943 NLBLOCK->nllen == 2 &&
944 c == NLBLOCK->nl[0])
945 {
946 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
947 {
948 reset_could_continue = TRUE;
949 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
950 }
951 else could_continue = partial_newline = TRUE;
952 }
953 }
954 break;
955
956 /*-----------------------------------------------------------------*/
957 case OP_DOLLM:
958 if ((md->moptions & PCRE_NOTEOL) == 0)
959 {
960 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
961 could_continue = TRUE;
962 else if (clen == 0 ||
963 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
964 { ADD_ACTIVE(state_offset + 1, 0); }
965 else if (ptr + 1 >= md->end_subject &&
966 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
967 NLBLOCK->nltype == NLTYPE_FIXED &&
968 NLBLOCK->nllen == 2 &&
969 c == NLBLOCK->nl[0])
970 {
971 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
972 {
973 reset_could_continue = TRUE;
974 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
975 }
976 else could_continue = partial_newline = TRUE;
977 }
978 }
979 else if (IS_NEWLINE(ptr))
980 { ADD_ACTIVE(state_offset + 1, 0); }
981 break;
982
983 /*-----------------------------------------------------------------*/
984
985 case OP_DIGIT:
986 case OP_WHITESPACE:
987 case OP_WORDCHAR:
988 if (clen > 0 && c < 256 &&
989 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
990 { ADD_NEW(state_offset + 1, 0); }
991 break;
992
993 /*-----------------------------------------------------------------*/
994 case OP_NOT_DIGIT:
995 case OP_NOT_WHITESPACE:
996 case OP_NOT_WORDCHAR:
997 if (clen > 0 && (c >= 256 ||
998 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
999 { ADD_NEW(state_offset + 1, 0); }
1000 break;
1001
1002 /*-----------------------------------------------------------------*/
1003 case OP_WORD_BOUNDARY:
1004 case OP_NOT_WORD_BOUNDARY:
1005 {
1006 int left_word, right_word;
1007
1008 if (ptr > start_subject)
1009 {
1010 const pcre_uchar *temp = ptr - 1;
1011 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1012 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1013 if (utf) { BACKCHAR(temp); }
1014 #endif
1015 GETCHARTEST(d, temp);
1016 #ifdef SUPPORT_UCP
1017 if ((md->poptions & PCRE_UCP) != 0)
1018 {
1019 if (d == '_') left_word = TRUE; else
1020 {
1021 int cat = UCD_CATEGORY(d);
1022 left_word = (cat == ucp_L || cat == ucp_N);
1023 }
1024 }
1025 else
1026 #endif
1027 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1028 }
1029 else left_word = FALSE;
1030
1031 if (clen > 0)
1032 {
1033 #ifdef SUPPORT_UCP
1034 if ((md->poptions & PCRE_UCP) != 0)
1035 {
1036 if (c == '_') right_word = TRUE; else
1037 {
1038 int cat = UCD_CATEGORY(c);
1039 right_word = (cat == ucp_L || cat == ucp_N);
1040 }
1041 }
1042 else
1043 #endif
1044 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1045 }
1046 else right_word = FALSE;
1047
1048 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1049 { ADD_ACTIVE(state_offset + 1, 0); }
1050 }
1051 break;
1052
1053
1054 /*-----------------------------------------------------------------*/
1055 /* Check the next character by Unicode property. We will get here only
1056 if the support is in the binary; otherwise a compile-time error occurs.
1057 */
1058
1059 #ifdef SUPPORT_UCP
1060 case OP_PROP:
1061 case OP_NOTPROP:
1062 if (clen > 0)
1063 {
1064 BOOL OK;
1065 const pcre_uint32 *cp;
1066 const ucd_record * prop = GET_UCD(c);
1067 switch(code[1])
1068 {
1069 case PT_ANY:
1070 OK = TRUE;
1071 break;
1072
1073 case PT_LAMP:
1074 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1075 prop->chartype == ucp_Lt;
1076 break;
1077
1078 case PT_GC:
1079 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1080 break;
1081
1082 case PT_PC:
1083 OK = prop->chartype == code[2];
1084 break;
1085
1086 case PT_SC:
1087 OK = prop->script == code[2];
1088 break;
1089
1090 /* These are specials for combination cases. */
1091
1092 case PT_ALNUM:
1093 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1094 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1095 break;
1096
1097 case PT_SPACE: /* Perl space */
1098 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1099 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1100 break;
1101
1102 case PT_PXSPACE: /* POSIX space */
1103 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1104 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1105 c == CHAR_FF || c == CHAR_CR;
1106 break;
1107
1108 case PT_WORD:
1109 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1110 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1111 c == CHAR_UNDERSCORE;
1112 break;
1113
1114 case PT_CLIST:
1115 cp = PRIV(ucd_caseless_sets) + prop->caseset;
1116 for (;;)
1117 {
1118 if (c < *cp) { OK = FALSE; break; }
1119 if (c == *cp++) { OK = TRUE; break; }
1120 }
1121 break;
1122
1123 /* Should never occur, but keep compilers from grumbling. */
1124
1125 default:
1126 OK = codevalue != OP_PROP;
1127 break;
1128 }
1129
1130 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1131 }
1132 break;
1133 #endif
1134
1135
1136
1137 /* ========================================================================== */
1138 /* These opcodes likewise inspect the subject character, but have an
1139 argument that is not a data character. It is one of these opcodes:
1140 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1141 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1142
1143 case OP_TYPEPLUS:
1144 case OP_TYPEMINPLUS:
1145 case OP_TYPEPOSPLUS:
1146 count = current_state->count; /* Already matched */
1147 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1148 if (clen > 0)
1149 {
1150 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1151 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1152 NLBLOCK->nltype == NLTYPE_FIXED &&
1153 NLBLOCK->nllen == 2 &&
1154 c == NLBLOCK->nl[0])
1155 {
1156 could_continue = partial_newline = TRUE;
1157 }
1158 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1159 (c < 256 &&
1160 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1161 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1162 {
1163 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1164 {
1165 active_count--; /* Remove non-match possibility */
1166 next_active_state--;
1167 }
1168 count++;
1169 ADD_NEW(state_offset, count);
1170 }
1171 }
1172 break;
1173
1174 /*-----------------------------------------------------------------*/
1175 case OP_TYPEQUERY:
1176 case OP_TYPEMINQUERY:
1177 case OP_TYPEPOSQUERY:
1178 ADD_ACTIVE(state_offset + 2, 0);
1179 if (clen > 0)
1180 {
1181 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1182 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1183 NLBLOCK->nltype == NLTYPE_FIXED &&
1184 NLBLOCK->nllen == 2 &&
1185 c == NLBLOCK->nl[0])
1186 {
1187 could_continue = partial_newline = TRUE;
1188 }
1189 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1190 (c < 256 &&
1191 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1192 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1193 {
1194 if (codevalue == OP_TYPEPOSQUERY)
1195 {
1196 active_count--; /* Remove non-match possibility */
1197 next_active_state--;
1198 }
1199 ADD_NEW(state_offset + 2, 0);
1200 }
1201 }
1202 break;
1203
1204 /*-----------------------------------------------------------------*/
1205 case OP_TYPESTAR:
1206 case OP_TYPEMINSTAR:
1207 case OP_TYPEPOSSTAR:
1208 ADD_ACTIVE(state_offset + 2, 0);
1209 if (clen > 0)
1210 {
1211 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1212 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1213 NLBLOCK->nltype == NLTYPE_FIXED &&
1214 NLBLOCK->nllen == 2 &&
1215 c == NLBLOCK->nl[0])
1216 {
1217 could_continue = partial_newline = TRUE;
1218 }
1219 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1220 (c < 256 &&
1221 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1222 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1223 {
1224 if (codevalue == OP_TYPEPOSSTAR)
1225 {
1226 active_count--; /* Remove non-match possibility */
1227 next_active_state--;
1228 }
1229 ADD_NEW(state_offset, 0);
1230 }
1231 }
1232 break;
1233
1234 /*-----------------------------------------------------------------*/
1235 case OP_TYPEEXACT:
1236 count = current_state->count; /* Number already matched */
1237 if (clen > 0)
1238 {
1239 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1240 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1241 NLBLOCK->nltype == NLTYPE_FIXED &&
1242 NLBLOCK->nllen == 2 &&
1243 c == NLBLOCK->nl[0])
1244 {
1245 could_continue = partial_newline = TRUE;
1246 }
1247 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1248 (c < 256 &&
1249 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1250 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1251 {
1252 if (++count >= GET2(code, 1))
1253 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1254 else
1255 { ADD_NEW(state_offset, count); }
1256 }
1257 }
1258 break;
1259
1260 /*-----------------------------------------------------------------*/
1261 case OP_TYPEUPTO:
1262 case OP_TYPEMINUPTO:
1263 case OP_TYPEPOSUPTO:
1264 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1265 count = current_state->count; /* Number already matched */
1266 if (clen > 0)
1267 {
1268 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1269 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1270 NLBLOCK->nltype == NLTYPE_FIXED &&
1271 NLBLOCK->nllen == 2 &&
1272 c == NLBLOCK->nl[0])
1273 {
1274 could_continue = partial_newline = TRUE;
1275 }
1276 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1277 (c < 256 &&
1278 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1279 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1280 {
1281 if (codevalue == OP_TYPEPOSUPTO)
1282 {
1283 active_count--; /* Remove non-match possibility */
1284 next_active_state--;
1285 }
1286 if (++count >= GET2(code, 1))
1287 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1288 else
1289 { ADD_NEW(state_offset, count); }
1290 }
1291 }
1292 break;
1293
1294 /* ========================================================================== */
1295 /* These are virtual opcodes that are used when something like OP_TYPEPLUS
1296 has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a horizontal/vertical space
1297 type (compared against OP_HSPACE / OP_VSPACE below) as its argument. This keeps
1298 the code above fast for the other cases. The argument is in the d variable. */
1299
1300 #ifdef SUPPORT_UCP
1301 case OP_PROP_EXTRA + OP_TYPEPLUS:
1302 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1303 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1304 count = current_state->count; /* Already matched */
1305 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1306 if (clen > 0)
1307 {
1308 BOOL OK;
1309 const pcre_uint32 *cp;
1310 const ucd_record * prop = GET_UCD(c);
1311 switch(code[2])
1312 {
1313 case PT_ANY:
1314 OK = TRUE;
1315 break;
1316
1317 case PT_LAMP:
1318 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1319 prop->chartype == ucp_Lt;
1320 break;
1321
1322 case PT_GC:
1323 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1324 break;
1325
1326 case PT_PC:
1327 OK = prop->chartype == code[3];
1328 break;
1329
1330 case PT_SC:
1331 OK = prop->script == code[3];
1332 break;
1333
1334 /* These are specials for combination cases. */
1335
1336 case PT_ALNUM:
1337 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1338 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1339 break;
1340
1341 case PT_SPACE: /* Perl space */
1342 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1343 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1344 break;
1345
1346 case PT_PXSPACE: /* POSIX space */
1347 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1348 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1349 c == CHAR_FF || c == CHAR_CR;
1350 break;
1351
1352 case PT_WORD:
1353 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1354 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1355 c == CHAR_UNDERSCORE;
1356 break;
1357
1358 case PT_CLIST:
1359 cp = PRIV(ucd_caseless_sets) + prop->caseset;
1360 for (;;)
1361 {
1362 if (c < *cp) { OK = FALSE; break; }
1363 if (c == *cp++) { OK = TRUE; break; }
1364 }
1365 break;
1366
1367 /* Should never occur, but keep compilers from grumbling. */
1368
1369 default:
1370 OK = codevalue != OP_PROP;
1371 break;
1372 }
1373
1374 if (OK == (d == OP_PROP))
1375 {
1376 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1377 {
1378 active_count--; /* Remove non-match possibility */
1379 next_active_state--;
1380 }
1381 count++;
1382 ADD_NEW(state_offset, count);
1383 }
1384 }
1385 break;
1386
1387 /*-----------------------------------------------------------------*/
1388 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1389 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1390 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1391 count = current_state->count; /* Already matched */
1392 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1393 if (clen > 0)
1394 {
1395 int lgb, rgb;
1396 const pcre_uchar *nptr = ptr + clen;
1397 int ncount = 0;
1398 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1399 {
1400 active_count--; /* Remove non-match possibility */
1401 next_active_state--;
1402 }
1403 lgb = UCD_GRAPHBREAK(c);
1404 while (nptr < end_subject)
1405 {
1406 dlen = 1;
1407 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1408 rgb = UCD_GRAPHBREAK(d);
1409 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1410 ncount++;
1411 lgb = rgb;
1412 nptr += dlen;
1413 }
1414 count++;
1415 ADD_NEW_DATA(-state_offset, count, ncount);
1416 }
1417 break;
1418 #endif
1419
1420 /*-----------------------------------------------------------------*/
1421 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1422 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1423 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1424 count = current_state->count; /* Already matched */
1425 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1426 if (clen > 0)
1427 {
1428 int ncount = 0;
1429 switch (c)
1430 {
1431 case CHAR_VT:
1432 case CHAR_FF:
1433 case CHAR_NEL:
1434 #ifndef EBCDIC
1435 case 0x2028:
1436 case 0x2029:
1437 #endif /* Not EBCDIC */
1438 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1439 goto ANYNL01;
1440
1441 case CHAR_CR:
1442 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1443 /* Fall through */
1444
1445 ANYNL01:
1446 case CHAR_LF:
1447 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1448 {
1449 active_count--; /* Remove non-match possibility */
1450 next_active_state--;
1451 }
1452 count++;
1453 ADD_NEW_DATA(-state_offset, count, ncount);
1454 break;
1455
1456 default:
1457 break;
1458 }
1459 }
1460 break;
1461
1462 /*-----------------------------------------------------------------*/
1463 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1464 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1465 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1466 count = current_state->count; /* Already matched */
1467 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1468 if (clen > 0)
1469 {
1470 BOOL OK;
1471 switch (c)
1472 {
1473 VSPACE_CASES:
1474 OK = TRUE;
1475 break;
1476
1477 default:
1478 OK = FALSE;
1479 break;
1480 }
1481
1482 if (OK == (d == OP_VSPACE))
1483 {
1484 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1485 {
1486 active_count--; /* Remove non-match possibility */
1487 next_active_state--;
1488 }
1489 count++;
1490 ADD_NEW_DATA(-state_offset, count, 0);
1491 }
1492 }
1493 break;
1494
1495 /*-----------------------------------------------------------------*/
1496 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1497 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1498 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1499 count = current_state->count; /* Already matched */
1500 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1501 if (clen > 0)
1502 {
1503 BOOL OK;
1504 switch (c)
1505 {
1506 HSPACE_CASES:
1507 OK = TRUE;
1508 break;
1509
1510 default:
1511 OK = FALSE;
1512 break;
1513 }
1514
1515 if (OK == (d == OP_HSPACE))
1516 {
1517 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1518 {
1519 active_count--; /* Remove non-match possibility */
1520 next_active_state--;
1521 }
1522 count++;
1523 ADD_NEW_DATA(-state_offset, count, 0);
1524 }
1525 }
1526 break;
1527
1528 /*-----------------------------------------------------------------*/
1529 #ifdef SUPPORT_UCP
1530 case OP_PROP_EXTRA + OP_TYPEQUERY:
1531 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1532 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1533 count = 4;
1534 goto QS1;
1535
1536 case OP_PROP_EXTRA + OP_TYPESTAR:
1537 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1538 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1539 count = 0;
1540
1541 QS1:
1542
1543 ADD_ACTIVE(state_offset + 4, 0);
1544 if (clen > 0)
1545 {
1546 BOOL OK;
1547 const pcre_uint32 *cp;
1548 const ucd_record * prop = GET_UCD(c);
1549 switch(code[2])
1550 {
1551 case PT_ANY:
1552 OK = TRUE;
1553 break;
1554
1555 case PT_LAMP:
1556 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1557 prop->chartype == ucp_Lt;
1558 break;
1559
1560 case PT_GC:
1561 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1562 break;
1563
1564 case PT_PC:
1565 OK = prop->chartype == code[3];
1566 break;
1567
1568 case PT_SC:
1569 OK = prop->script == code[3];
1570 break;
1571
1572 /* These are specials for combination cases. */
1573
1574 case PT_ALNUM:
1575 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1576 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1577 break;
1578
1579 case PT_SPACE: /* Perl space */
1580 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1581 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1582 break;
1583
1584 case PT_PXSPACE: /* POSIX space */
1585 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1586 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1587 c == CHAR_FF || c == CHAR_CR;
1588 break;
1589
1590 case PT_WORD:
1591 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1593 c == CHAR_UNDERSCORE;
1594 break;
1595
1596 case PT_CLIST:
1597 cp = PRIV(ucd_caseless_sets) + prop->caseset;
1598 for (;;)
1599 {
1600 if (c < *cp) { OK = FALSE; break; }
1601 if (c == *cp++) { OK = TRUE; break; }
1602 }
1603 break;
1604
1605 /* Should never occur, but keep compilers from grumbling. */
1606
1607 default:
1608 OK = codevalue != OP_PROP;
1609 break;
1610 }
1611
1612 if (OK == (d == OP_PROP))
1613 {
1614 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1615 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1616 {
1617 active_count--; /* Remove non-match possibility */
1618 next_active_state--;
1619 }
1620 ADD_NEW(state_offset + count, 0);
1621 }
1622 }
1623 break;
1624
1625 /*-----------------------------------------------------------------*/
1626 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1627 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1628 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1629 count = 2;
1630 goto QS2;
1631
1632 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1633 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1634 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1635 count = 0;
1636
1637 QS2:
1638
1639 ADD_ACTIVE(state_offset + 2, 0);
1640 if (clen > 0)
1641 {
1642 int lgb, rgb;
1643 const pcre_uchar *nptr = ptr + clen;
1644 int ncount = 0;
1645 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1646 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1647 {
1648 active_count--; /* Remove non-match possibility */
1649 next_active_state--;
1650 }
1651 lgb = UCD_GRAPHBREAK(c);
1652 while (nptr < end_subject)
1653 {
1654 dlen = 1;
1655 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1656 rgb = UCD_GRAPHBREAK(d);
1657 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1658 ncount++;
1659 lgb = rgb;
1660 nptr += dlen;
1661 }
1662 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1663 }
1664 break;
1665 #endif
1666
1667 /*-----------------------------------------------------------------*/
1668 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1669 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1670 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1671 count = 2;
1672 goto QS3;
1673
1674 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1675 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1676 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1677 count = 0;
1678
1679 QS3:
1680 ADD_ACTIVE(state_offset + 2, 0);
1681 if (clen > 0)
1682 {
1683 int ncount = 0;
1684 switch (c)
1685 {
1686 case CHAR_VT:
1687 case CHAR_FF:
1688 case CHAR_NEL:
1689 #ifndef EBCDIC
1690 case 0x2028:
1691 case 0x2029:
1692 #endif /* Not EBCDIC */
1693 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1694 goto ANYNL02;
1695
1696 case CHAR_CR:
1697 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1698 /* Fall through */
1699
1700 ANYNL02:
1701 case CHAR_LF:
1702 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1703 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1704 {
1705 active_count--; /* Remove non-match possibility */
1706 next_active_state--;
1707 }
1708 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1709 break;
1710
1711 default:
1712 break;
1713 }
1714 }
1715 break;
1716
1717 /*-----------------------------------------------------------------*/
1718 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1719 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1720 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1721 count = 2;
1722 goto QS4;
1723
1724 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1725 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1726 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1727 count = 0;
1728
1729 QS4:
1730 ADD_ACTIVE(state_offset + 2, 0);
1731 if (clen > 0)
1732 {
1733 BOOL OK;
1734 switch (c)
1735 {
1736 VSPACE_CASES:
1737 OK = TRUE;
1738 break;
1739
1740 default:
1741 OK = FALSE;
1742 break;
1743 }
1744 if (OK == (d == OP_VSPACE))
1745 {
1746 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1747 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1748 {
1749 active_count--; /* Remove non-match possibility */
1750 next_active_state--;
1751 }
1752 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1753 }
1754 }
1755 break;
1756
1757 /*-----------------------------------------------------------------*/
1758 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1759 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1760 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1761 count = 2;
1762 goto QS5;
1763
1764 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1765 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1766 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1767 count = 0;
1768
1769 QS5:
1770 ADD_ACTIVE(state_offset + 2, 0);
1771 if (clen > 0)
1772 {
1773 BOOL OK;
1774 switch (c)
1775 {
1776 HSPACE_CASES:
1777 OK = TRUE;
1778 break;
1779
1780 default:
1781 OK = FALSE;
1782 break;
1783 }
1784
1785 if (OK == (d == OP_HSPACE))
1786 {
1787 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1788 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1789 {
1790 active_count--; /* Remove non-match possibility */
1791 next_active_state--;
1792 }
1793 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1794 }
1795 }
1796 break;
1797
1798 /*-----------------------------------------------------------------*/
1799 #ifdef SUPPORT_UCP
1800 case OP_PROP_EXTRA + OP_TYPEEXACT:
1801 case OP_PROP_EXTRA + OP_TYPEUPTO:
1802 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1803 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1804 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1805 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1806 count = current_state->count; /* Number already matched */
1807 if (clen > 0)
1808 {
1809 BOOL OK;
1810 const pcre_uint32 *cp;
1811 const ucd_record * prop = GET_UCD(c);
1812 switch(code[1 + IMM2_SIZE + 1])
1813 {
1814 case PT_ANY:
1815 OK = TRUE;
1816 break;
1817
1818 case PT_LAMP:
1819 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1820 prop->chartype == ucp_Lt;
1821 break;
1822
1823 case PT_GC:
1824 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1825 break;
1826
1827 case PT_PC:
1828 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1829 break;
1830
1831 case PT_SC:
1832 OK = prop->script == code[1 + IMM2_SIZE + 2];
1833 break;
1834
1835 /* These are specials for combination cases. */
1836
1837 case PT_ALNUM:
1838 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1839 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1840 break;
1841
1842 case PT_SPACE: /* Perl space */
1843 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1844 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1845 break;
1846
1847 case PT_PXSPACE: /* POSIX space */
1848 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1849 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1850 c == CHAR_FF || c == CHAR_CR;
1851 break;
1852
1853 case PT_WORD:
1854 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1855 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1856 c == CHAR_UNDERSCORE;
1857 break;
1858
1859 case PT_CLIST:
1860 cp = PRIV(ucd_caseless_sets) + prop->caseset;
1861 for (;;)
1862 {
1863 if (c < *cp) { OK = FALSE; break; }
1864 if (c == *cp++) { OK = TRUE; break; }
1865 }
1866 break;
1867
1868 /* Should never occur, but keep compilers from grumbling. */
1869
1870 default:
1871 OK = codevalue != OP_PROP;
1872 break;
1873 }
1874
1875 if (OK == (d == OP_PROP))
1876 {
1877 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1878 {
1879 active_count--; /* Remove non-match possibility */
1880 next_active_state--;
1881 }
1882 if (++count >= GET2(code, 1))
1883 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1884 else
1885 { ADD_NEW(state_offset, count); }
1886 }
1887 }
1888 break;
1889
1890 /*-----------------------------------------------------------------*/
1891 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1892 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1893 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1894 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1895 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1896 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1897 count = current_state->count; /* Number already matched */
1898 if (clen > 0)
1899 {
1900 int lgb, rgb;
1901 const pcre_uchar *nptr = ptr + clen;
1902 int ncount = 0;
1903 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1904 {
1905 active_count--; /* Remove non-match possibility */
1906 next_active_state--;
1907 }
1908 lgb = UCD_GRAPHBREAK(c);
1909 while (nptr < end_subject)
1910 {
1911 dlen = 1;
1912 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1913 rgb = UCD_GRAPHBREAK(d);
1914 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1915 ncount++;
1916 lgb = rgb;
1917 nptr += dlen;
1918 }
1919 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1920 reset_could_continue = TRUE;
1921 if (++count >= GET2(code, 1))
1922 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1923 else
1924 { ADD_NEW_DATA(-state_offset, count, ncount); }
1925 }
1926 break;
1927 #endif
1928
1929 /*-----------------------------------------------------------------*/
1930 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1931 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1932 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1933 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1934 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1935 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1936 count = current_state->count; /* Number already matched */
1937 if (clen > 0)
1938 {
1939 int ncount = 0;
1940 switch (c)
1941 {
1942 case CHAR_VT:
1943 case CHAR_FF:
1944 case CHAR_NEL:
1945 #ifndef EBCDIC
1946 case 0x2028:
1947 case 0x2029:
1948 #endif /* Not EBCDIC */
1949 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1950 goto ANYNL03;
1951
1952 case CHAR_CR:
1953 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1954 /* Fall through */
1955
1956 ANYNL03:
1957 case CHAR_LF:
1958 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1959 {
1960 active_count--; /* Remove non-match possibility */
1961 next_active_state--;
1962 }
1963 if (++count >= GET2(code, 1))
1964 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1965 else
1966 { ADD_NEW_DATA(-state_offset, count, ncount); }
1967 break;
1968
1969 default:
1970 break;
1971 }
1972 }
1973 break;
1974
1975 /*-----------------------------------------------------------------*/
1976 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1977 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1978 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1979 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1980 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1981 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1982 count = current_state->count; /* Number already matched */
1983 if (clen > 0)
1984 {
1985 BOOL OK;
1986 switch (c)
1987 {
1988 VSPACE_CASES:
1989 OK = TRUE;
1990 break;
1991
1992 default:
1993 OK = FALSE;
1994 }
1995
1996 if (OK == (d == OP_VSPACE))
1997 {
1998 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1999 {
2000 active_count--; /* Remove non-match possibility */
2001 next_active_state--;
2002 }
2003 if (++count >= GET2(code, 1))
2004 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2005 else
2006 { ADD_NEW_DATA(-state_offset, count, 0); }
2007 }
2008 }
2009 break;
2010
2011 /*-----------------------------------------------------------------*/
2012 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2013 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2014 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2015 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2016 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2017 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2018 count = current_state->count; /* Number already matched */
2019 if (clen > 0)
2020 {
2021 BOOL OK;
2022 switch (c)
2023 {
2024 HSPACE_CASES:
2025 OK = TRUE;
2026 break;
2027
2028 default:
2029 OK = FALSE;
2030 break;
2031 }
2032
2033 if (OK == (d == OP_HSPACE))
2034 {
2035 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2036 {
2037 active_count--; /* Remove non-match possibility */
2038 next_active_state--;
2039 }
2040 if (++count >= GET2(code, 1))
2041 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2042 else
2043 { ADD_NEW_DATA(-state_offset, count, 0); }
2044 }
2045 }
2046 break;
2047
2048 /* ========================================================================== */
2049 /* These opcodes are followed by a character that is usually compared
2050 to the current subject character; it is loaded into d. We still get
2051 here even if there is no subject character, because in some cases zero
2052 repetitions are permitted. */
2053
2054 /*-----------------------------------------------------------------*/
2055 case OP_CHAR:
2056 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2057 break;
2058
2059 /*-----------------------------------------------------------------*/
2060 case OP_CHARI:
2061 if (clen == 0) break;
2062
2063 #ifdef SUPPORT_UTF
2064 if (utf)
2065 {
2066 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2067 {
2068 unsigned int othercase;
2069 if (c < 128)
2070 othercase = fcc[c];
2071 else
2072 /* If we have Unicode property support, we can use it to test the
2073 other case of the character. */
2074 #ifdef SUPPORT_UCP
2075 othercase = UCD_OTHERCASE(c);
2076 #else
2077 othercase = NOTACHAR;
2078 #endif
2079
2080 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2081 }
2082 }
2083 else
2084 #endif /* SUPPORT_UTF */
2085 /* Not UTF mode */
2086 {
2087 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2088 { ADD_NEW(state_offset + 2, 0); }
2089 }
2090 break;
2091
2092
2093 #ifdef SUPPORT_UCP
2094 /*-----------------------------------------------------------------*/
2095 /* This is a tricky one because it can match more than one character.
2096 Find out how many characters to skip, and then set up a negative state
2097 to wait for them to pass before continuing. */
2098
2099 case OP_EXTUNI:
2100 if (clen > 0)
2101 {
2102 int lgb, rgb;
2103 const pcre_uchar *nptr = ptr + clen;
2104 int ncount = 0;
2105 lgb = UCD_GRAPHBREAK(c);
2106 while (nptr < end_subject)
2107 {
2108 dlen = 1;
2109 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2110 rgb = UCD_GRAPHBREAK(d);
2111 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2112 ncount++;
2113 lgb = rgb;
2114 nptr += dlen;
2115 }
2116 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2117 reset_could_continue = TRUE;
2118 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2119 }
2120 break;
2121 #endif
2122
2123 /*-----------------------------------------------------------------*/
2124 /* This is tricky, like EXTUNI, because it too can match more than one
2125 character (when CR is followed by LF). In this case, set up a negative
2126 state to wait for one character to pass before continuing. */
2127
2128 case OP_ANYNL:
2129 if (clen > 0) switch(c)
2130 {
2131 case CHAR_VT:
2132 case CHAR_FF:
2133 case CHAR_NEL:
2134 #ifndef EBCDIC
2135 case 0x2028:
2136 case 0x2029:
2137 #endif /* Not EBCDIC */
2138 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2139
2140 case CHAR_LF:
2141 ADD_NEW(state_offset + 1, 0);
2142 break;
2143
2144 case CHAR_CR:
2145 if (ptr + 1 >= end_subject)
2146 {
2147 ADD_NEW(state_offset + 1, 0);
2148 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2149 reset_could_continue = TRUE;
2150 }
2151 else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2152 {
2153 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2154 }
2155 else
2156 {
2157 ADD_NEW(state_offset + 1, 0);
2158 }
2159 break;
2160 }
2161 break;
2162
2163 /*-----------------------------------------------------------------*/
2164 case OP_NOT_VSPACE:
2165 if (clen > 0) switch(c)
2166 {
2167 VSPACE_CASES:
2168 break;
2169
2170 default:
2171 ADD_NEW(state_offset + 1, 0);
2172 break;
2173 }
2174 break;
2175
2176 /*-----------------------------------------------------------------*/
2177 case OP_VSPACE:
2178 if (clen > 0) switch(c)
2179 {
2180 VSPACE_CASES:
2181 ADD_NEW(state_offset + 1, 0);
2182 break;
2183
2184 default:
2185 break;
2186 }
2187 break;
2188
2189 /*-----------------------------------------------------------------*/
2190 case OP_NOT_HSPACE:
2191 if (clen > 0) switch(c)
2192 {
2193 HSPACE_CASES:
2194 break;
2195
2196 default:
2197 ADD_NEW(state_offset + 1, 0);
2198 break;
2199 }
2200 break;
2201
2202 /*-----------------------------------------------------------------*/
2203 case OP_HSPACE:
2204 if (clen > 0) switch(c)
2205 {
2206 HSPACE_CASES:
2207 ADD_NEW(state_offset + 1, 0);
2208 break;
2209
2210 default:
2211 break;
2212 }
2213 break;
2214
2215 /*-----------------------------------------------------------------*/
2216 /* Match a negated single character casefully. */
2217
2218 case OP_NOT:
2219 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2220 break;
2221
2222 /*-----------------------------------------------------------------*/
2223 /* Match a negated single character caselessly. */
2224
2225 case OP_NOTI:
2226 if (clen > 0)
2227 {
2228 unsigned int otherd;
2229 #ifdef SUPPORT_UTF
2230 if (utf && d >= 128)
2231 {
2232 #ifdef SUPPORT_UCP
2233 otherd = UCD_OTHERCASE(d);
2234 #endif /* SUPPORT_UCP */
2235 }
2236 else
2237 #endif /* SUPPORT_UTF */
2238 otherd = TABLE_GET(d, fcc, d);
2239 if (c != d && c != otherd)
2240 { ADD_NEW(state_offset + dlen + 1, 0); }
2241 }
2242 break;
2243
2244 /*-----------------------------------------------------------------*/
2245 case OP_PLUSI:
2246 case OP_MINPLUSI:
2247 case OP_POSPLUSI:
2248 case OP_NOTPLUSI:
2249 case OP_NOTMINPLUSI:
2250 case OP_NOTPOSPLUSI:
2251 caseless = TRUE;
2252 codevalue -= OP_STARI - OP_STAR;
2253
2254 /* Fall through */
2255 case OP_PLUS:
2256 case OP_MINPLUS:
2257 case OP_POSPLUS:
2258 case OP_NOTPLUS:
2259 case OP_NOTMINPLUS:
2260 case OP_NOTPOSPLUS:
2261 count = current_state->count; /* Already matched */
2262 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2263 if (clen > 0)
2264 {
2265 pcre_uint32 otherd = NOTACHAR;
2266 if (caseless)
2267 {
2268 #ifdef SUPPORT_UTF
2269 if (utf && d >= 128)
2270 {
2271 #ifdef SUPPORT_UCP
2272 otherd = UCD_OTHERCASE(d);
2273 #endif /* SUPPORT_UCP */
2274 }
2275 else
2276 #endif /* SUPPORT_UTF */
2277 otherd = TABLE_GET(d, fcc, d);
2278 }
2279 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2280 {
2281 if (count > 0 &&
2282 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2283 {
2284 active_count--; /* Remove non-match possibility */
2285 next_active_state--;
2286 }
2287 count++;
2288 ADD_NEW(state_offset, count);
2289 }
2290 }
2291 break;
2292
2293 /*-----------------------------------------------------------------*/
2294 case OP_QUERYI:
2295 case OP_MINQUERYI:
2296 case OP_POSQUERYI:
2297 case OP_NOTQUERYI:
2298 case OP_NOTMINQUERYI:
2299 case OP_NOTPOSQUERYI:
2300 caseless = TRUE;
2301 codevalue -= OP_STARI - OP_STAR;
2302 /* Fall through */
2303 case OP_QUERY:
2304 case OP_MINQUERY:
2305 case OP_POSQUERY:
2306 case OP_NOTQUERY:
2307 case OP_NOTMINQUERY:
2308 case OP_NOTPOSQUERY:
2309 ADD_ACTIVE(state_offset + dlen + 1, 0);
2310 if (clen > 0)
2311 {
2312 pcre_uint32 otherd = NOTACHAR;
2313 if (caseless)
2314 {
2315 #ifdef SUPPORT_UTF
2316 if (utf && d >= 128)
2317 {
2318 #ifdef SUPPORT_UCP
2319 otherd = UCD_OTHERCASE(d);
2320 #endif /* SUPPORT_UCP */
2321 }
2322 else
2323 #endif /* SUPPORT_UTF */
2324 otherd = TABLE_GET(d, fcc, d);
2325 }
2326 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2327 {
2328 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2329 {
2330 active_count--; /* Remove non-match possibility */
2331 next_active_state--;
2332 }
2333 ADD_NEW(state_offset + dlen + 1, 0);
2334 }
2335 }
2336 break;
2337
2338 /*-----------------------------------------------------------------*/
2339 case OP_STARI:
2340 case OP_MINSTARI:
2341 case OP_POSSTARI:
2342 case OP_NOTSTARI:
2343 case OP_NOTMINSTARI:
2344 case OP_NOTPOSSTARI:
2345 caseless = TRUE;
2346 codevalue -= OP_STARI - OP_STAR;
2347 /* Fall through */
2348 case OP_STAR:
2349 case OP_MINSTAR:
2350 case OP_POSSTAR:
2351 case OP_NOTSTAR:
2352 case OP_NOTMINSTAR:
2353 case OP_NOTPOSSTAR:
2354 ADD_ACTIVE(state_offset + dlen + 1, 0);
2355 if (clen > 0)
2356 {
2357 pcre_uint32 otherd = NOTACHAR;
2358 if (caseless)
2359 {
2360 #ifdef SUPPORT_UTF
2361 if (utf && d >= 128)
2362 {
2363 #ifdef SUPPORT_UCP
2364 otherd = UCD_OTHERCASE(d);
2365 #endif /* SUPPORT_UCP */
2366 }
2367 else
2368 #endif /* SUPPORT_UTF */
2369 otherd = TABLE_GET(d, fcc, d);
2370 }
2371 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2372 {
2373 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2374 {
2375 active_count--; /* Remove non-match possibility */
2376 next_active_state--;
2377 }
2378 ADD_NEW(state_offset, 0);
2379 }
2380 }
2381 break;
2382
2383 /*-----------------------------------------------------------------*/
2384 case OP_EXACTI:
2385 case OP_NOTEXACTI:
2386 caseless = TRUE;
2387 codevalue -= OP_STARI - OP_STAR;
2388 /* Fall through */
2389 case OP_EXACT:
2390 case OP_NOTEXACT:
2391 count = current_state->count; /* Number already matched */
2392 if (clen > 0)
2393 {
2394 pcre_uint32 otherd = NOTACHAR;
2395 if (caseless)
2396 {
2397 #ifdef SUPPORT_UTF
2398 if (utf && d >= 128)
2399 {
2400 #ifdef SUPPORT_UCP
2401 otherd = UCD_OTHERCASE(d);
2402 #endif /* SUPPORT_UCP */
2403 }
2404 else
2405 #endif /* SUPPORT_UTF */
2406 otherd = TABLE_GET(d, fcc, d);
2407 }
2408 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2409 {
2410 if (++count >= GET2(code, 1))
2411 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2412 else
2413 { ADD_NEW(state_offset, count); }
2414 }
2415 }
2416 break;
2417
2418 /*-----------------------------------------------------------------*/
2419 case OP_UPTOI:
2420 case OP_MINUPTOI:
2421 case OP_POSUPTOI:
2422 case OP_NOTUPTOI:
2423 case OP_NOTMINUPTOI:
2424 case OP_NOTPOSUPTOI:
2425 caseless = TRUE;
2426 codevalue -= OP_STARI - OP_STAR;
2427 /* Fall through */
2428 case OP_UPTO:
2429 case OP_MINUPTO:
2430 case OP_POSUPTO:
2431 case OP_NOTUPTO:
2432 case OP_NOTMINUPTO:
2433 case OP_NOTPOSUPTO:
2434 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2435 count = current_state->count; /* Number already matched */
2436 if (clen > 0)
2437 {
2438 pcre_uint32 otherd = NOTACHAR;
2439 if (caseless)
2440 {
2441 #ifdef SUPPORT_UTF
2442 if (utf && d >= 128)
2443 {
2444 #ifdef SUPPORT_UCP
2445 otherd = UCD_OTHERCASE(d);
2446 #endif /* SUPPORT_UCP */
2447 }
2448 else
2449 #endif /* SUPPORT_UTF */
2450 otherd = TABLE_GET(d, fcc, d);
2451 }
2452 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2453 {
2454 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2455 {
2456 active_count--; /* Remove non-match possibility */
2457 next_active_state--;
2458 }
2459 if (++count >= GET2(code, 1))
2460 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2461 else
2462 { ADD_NEW(state_offset, count); }
2463 }
2464 }
2465 break;
2466
2467
2468 /* ========================================================================== */
2469 /* These are the class-handling opcodes */
2470
2471 case OP_CLASS:
2472 case OP_NCLASS:
2473 case OP_XCLASS:
2474 {
2475 BOOL isinclass = FALSE;
2476 int next_state_offset;
2477 const pcre_uchar *ecode;
2478
2479 /* For a simple class, there is always just a 32-byte table, and we
2480 can set isinclass from it. */
2481
2482 if (codevalue != OP_XCLASS)
2483 {
2484 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2485 if (clen > 0)
2486 {
2487 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2488 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2489 }
2490 }
2491
2492 /* An extended class may have a table or a list of single characters,
2493 ranges, or both, and it may be positive or negative. There's a
2494 function that sorts all this out. */
2495
2496 else
2497 {
2498 ecode = code + GET(code, 1);
2499 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2500 }
2501
2502 /* At this point, isinclass is set for all kinds of class, and ecode
2503 points to the byte after the end of the class. If there is a
2504 quantifier, this is where it will be. */
2505
2506 next_state_offset = (int)(ecode - start_code);
2507
2508 switch (*ecode)
2509 {
2510 case OP_CRSTAR:
2511 case OP_CRMINSTAR:
2512 ADD_ACTIVE(next_state_offset + 1, 0);
2513 if (isinclass) { ADD_NEW(state_offset, 0); }
2514 break;
2515
2516 case OP_CRPLUS:
2517 case OP_CRMINPLUS:
2518 count = current_state->count; /* Already matched */
2519 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2520 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2521 break;
2522
2523 case OP_CRQUERY:
2524 case OP_CRMINQUERY:
2525 ADD_ACTIVE(next_state_offset + 1, 0);
2526 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2527 break;
2528
2529 case OP_CRRANGE:
2530 case OP_CRMINRANGE:
2531 count = current_state->count; /* Already matched */
2532 if (count >= GET2(ecode, 1))
2533 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2534 if (isinclass)
2535 {
2536 unsigned int max = GET2(ecode, 1 + IMM2_SIZE);
2537 if (++count >= max && max != 0) /* Max 0 => no limit */
2538 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2539 else
2540 { ADD_NEW(state_offset, count); }
2541 }
2542 break;
2543
2544 default:
2545 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2546 break;
2547 }
2548 }
2549 break;
2550
2551 /* ========================================================================== */
2552 /* These are the opcodes for fancy brackets of various kinds. We have
2553 to use recursion in order to handle them. The "always failing" assertion
2554 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2555 though the other "backtracking verbs" are not supported. */
2556
2557 case OP_FAIL:
2558 forced_fail++; /* Count FAILs for multiple states */
2559 break;
2560
2561 case OP_ASSERT:
2562 case OP_ASSERT_NOT:
2563 case OP_ASSERTBACK:
2564 case OP_ASSERTBACK_NOT:
2565 {
2566 int rc;
2567 int local_offsets[2];
2568 int local_workspace[1000];
2569 const pcre_uchar *endasscode = code + GET(code, 1);
2570
2571 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2572
2573 rc = internal_dfa_exec(
2574 md, /* static match data */
2575 code, /* this subexpression's code */
2576 ptr, /* where we currently are */
2577 (int)(ptr - start_subject), /* start offset */
2578 local_offsets, /* offset vector */
2579 sizeof(local_offsets)/sizeof(int), /* size of same */
2580 local_workspace, /* workspace vector */
2581 sizeof(local_workspace)/sizeof(int), /* size of same */
2582 rlevel); /* function recursion level */
2583
2584 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2585 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2586 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2587 }
2588 break;
2589
2590 /*-----------------------------------------------------------------*/
2591 case OP_COND:
2592 case OP_SCOND:
2593 {
2594 int local_offsets[1000];
2595 int local_workspace[1000];
2596 int codelink = GET(code, 1);
2597 int condcode;
2598
2599 /* Because of the way auto-callout works during compile, a callout item
2600 is inserted between OP_COND and an assertion condition. This does not
2601 happen for the other conditions. */
2602
2603 if (code[LINK_SIZE+1] == OP_CALLOUT)
2604 {
2605 rrc = 0;
2606 if (PUBL(callout) != NULL)
2607 {
2608 PUBL(callout_block) cb;
2609 cb.version = 1; /* Version 1 of the callout block */
2610 cb.callout_number = code[LINK_SIZE+2];
2611 cb.offset_vector = offsets;
2612 #if defined COMPILE_PCRE8
2613 cb.subject = (PCRE_SPTR)start_subject;
2614 #elif defined COMPILE_PCRE16
2615 cb.subject = (PCRE_SPTR16)start_subject;
2616 #elif defined COMPILE_PCRE32
2617 cb.subject = (PCRE_SPTR32)start_subject;
2618 #endif
2619 cb.subject_length = (int)(end_subject - start_subject);
2620 cb.start_match = (int)(current_subject - start_subject);
2621 cb.current_position = (int)(ptr - start_subject);
2622 cb.pattern_position = GET(code, LINK_SIZE + 3);
2623 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2624 cb.capture_top = 1;
2625 cb.capture_last = -1;
2626 cb.callout_data = md->callout_data;
2627 cb.mark = NULL; /* No (*MARK) support */
2628 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2629 }
2630 if (rrc > 0) break; /* Fail this thread */
2631 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2632 }
2633
2634 condcode = code[LINK_SIZE+1];
2635
2636 /* Back reference conditions are not supported */
2637
2638 if (condcode == OP_CREF || condcode == OP_NCREF)
2639 return PCRE_ERROR_DFA_UCOND;
2640
2641 /* The DEFINE condition is always false */
2642
2643 if (condcode == OP_DEF)
2644 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2645
2646 /* The only supported version of OP_RREF is for the value RREF_ANY,
2647 which means "test if in any recursion". We can't test for specifically
2648 recursed groups. */
2649
2650 else if (condcode == OP_RREF || condcode == OP_NRREF)
2651 {
2652 int value = GET2(code, LINK_SIZE + 2);
2653 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2654 if (md->recursive != NULL)
2655 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2656 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2657 }
2658
2659 /* Otherwise, the condition is an assertion */
2660
2661 else
2662 {
2663 int rc;
2664 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2665 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2666
2667 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2668
2669 rc = internal_dfa_exec(
2670 md, /* fixed match data */
2671 asscode, /* this subexpression's code */
2672 ptr, /* where we currently are */
2673 (int)(ptr - start_subject), /* start offset */
2674 local_offsets, /* offset vector */
2675 sizeof(local_offsets)/sizeof(int), /* size of same */
2676 local_workspace, /* workspace vector */
2677 sizeof(local_workspace)/sizeof(int), /* size of same */
2678 rlevel); /* function recursion level */
2679
2680 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2681 if ((rc >= 0) ==
2682 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2683 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2684 else
2685 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2686 }
2687 }
2688 break;
2689
2690 /*-----------------------------------------------------------------*/
2691 case OP_RECURSE:
2692 {
2693 dfa_recursion_info *ri;
2694 int local_offsets[1000];
2695 int local_workspace[1000];
2696 const pcre_uchar *callpat = start_code + GET(code, 1);
2697 int recno = (callpat == md->start_code)? 0 :
2698 GET2(callpat, 1 + LINK_SIZE);
2699 int rc;
2700
2701 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2702
2703 /* Check for repeating a recursion without advancing the subject
2704 pointer. This should catch convoluted mutual recursions. (Some simple
2705 cases are caught at compile time.) */
2706
2707 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2708 if (recno == ri->group_num && ptr == ri->subject_position)
2709 return PCRE_ERROR_RECURSELOOP;
2710
2711 /* Remember this recursion and where we started it so as to
2712 catch infinite loops. */
2713
2714 new_recursive.group_num = recno;
2715 new_recursive.subject_position = ptr;
2716 new_recursive.prevrec = md->recursive;
2717 md->recursive = &new_recursive;
2718
2719 rc = internal_dfa_exec(
2720 md, /* fixed match data */
2721 callpat, /* this subexpression's code */
2722 ptr, /* where we currently are */
2723 (int)(ptr - start_subject), /* start offset */
2724 local_offsets, /* offset vector */
2725 sizeof(local_offsets)/sizeof(int), /* size of same */
2726 local_workspace, /* workspace vector */
2727 sizeof(local_workspace)/sizeof(int), /* size of same */
2728 rlevel); /* function recursion level */
2729
2730 md->recursive = new_recursive.prevrec; /* Done this recursion */
2731
2732 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2733 rc));
2734
2735 /* Ran out of internal offsets */
2736
2737 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2738
2739 /* For each successful matched substring, set up the next state with a
2740 count of characters to skip before trying it. Note that the count is in
2741 characters, not bytes. */
2742
2743 if (rc > 0)
2744 {
2745 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2746 {
2747 int charcount = local_offsets[rc+1] - local_offsets[rc];
2748 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2749 if (utf)
2750 {
2751 const pcre_uchar *p = start_subject + local_offsets[rc];
2752 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2753 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2754 }
2755 #endif
2756 if (charcount > 0)
2757 {
2758 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2759 }
2760 else
2761 {
2762 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2763 }
2764 }
2765 }
2766 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2767 }
2768 break;
2769
2770 /*-----------------------------------------------------------------*/
2771 case OP_BRAPOS:
2772 case OP_SBRAPOS:
2773 case OP_CBRAPOS:
2774 case OP_SCBRAPOS:
2775 case OP_BRAPOSZERO:
2776 {
2777 int charcount, matched_count;
2778 const pcre_uchar *local_ptr = ptr;
2779 BOOL allow_zero;
2780
2781 if (codevalue == OP_BRAPOSZERO)
2782 {
2783 allow_zero = TRUE;
2784 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2785 }
2786 else allow_zero = FALSE;
2787
2788 /* Loop to match the subpattern as many times as possible as if it were
2789 a complete pattern. */
2790
2791 for (matched_count = 0;; matched_count++)
2792 {
2793 int local_offsets[2];
2794 int local_workspace[1000];
2795
2796 int rc = internal_dfa_exec(
2797 md, /* fixed match data */
2798 code, /* this subexpression's code */
2799 local_ptr, /* where we currently are */
2800 (int)(ptr - start_subject), /* start offset */
2801 local_offsets, /* offset vector */
2802 sizeof(local_offsets)/sizeof(int), /* size of same */
2803 local_workspace, /* workspace vector */
2804 sizeof(local_workspace)/sizeof(int), /* size of same */
2805 rlevel); /* function recursion level */
2806
2807 /* Failed to match */
2808
2809 if (rc < 0)
2810 {
2811 if (rc != PCRE_ERROR_NOMATCH) return rc;
2812 break;
2813 }
2814
2815 /* Matched: break the loop if zero characters matched. */
2816
2817 charcount = local_offsets[1] - local_offsets[0];
2818 if (charcount == 0) break;
2819 local_ptr += charcount; /* Advance temporary position ptr */
2820 }
2821
2822 /* At this point we have matched the subpattern matched_count
2823 times, and local_ptr is pointing to the character after the end of the
2824 last match. */
2825
2826 if (matched_count > 0 || allow_zero)
2827 {
2828 const pcre_uchar *end_subpattern = code;
2829 int next_state_offset;
2830
2831 do { end_subpattern += GET(end_subpattern, 1); }
2832 while (*end_subpattern == OP_ALT);
2833 next_state_offset =
2834 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2835
2836 /* Optimization: if there are no more active states, and there
2837 are no new states yet set up, then skip over the subject string
2838 right here, to save looping. Otherwise, set up the new state to swing
2839 into action when the end of the matched substring is reached. */
2840
2841 if (i + 1 >= active_count && new_count == 0)
2842 {
2843 ptr = local_ptr;
2844 clen = 0;
2845 ADD_NEW(next_state_offset, 0);
2846 }
2847 else
2848 {
2849 const pcre_uchar *p = ptr;
2850 const pcre_uchar *pp = local_ptr;
2851 charcount = (int)(pp - p);
2852 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2853 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2854 #endif
2855 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2856 }
2857 }
2858 }
2859 break;
2860
2861 /*-----------------------------------------------------------------*/
2862 case OP_ONCE:
2863 case OP_ONCE_NC:
2864 {
2865 int local_offsets[2];
2866 int local_workspace[1000];
2867
2868 int rc = internal_dfa_exec(
2869 md, /* fixed match data */
2870 code, /* this subexpression's code */
2871 ptr, /* where we currently are */
2872 (int)(ptr - start_subject), /* start offset */
2873 local_offsets, /* offset vector */
2874 sizeof(local_offsets)/sizeof(int), /* size of same */
2875 local_workspace, /* workspace vector */
2876 sizeof(local_workspace)/sizeof(int), /* size of same */
2877 rlevel); /* function recursion level */
2878
2879 if (rc >= 0)
2880 {
2881 const pcre_uchar *end_subpattern = code;
2882 int charcount = local_offsets[1] - local_offsets[0];
2883 int next_state_offset, repeat_state_offset;
2884
2885 do { end_subpattern += GET(end_subpattern, 1); }
2886 while (*end_subpattern == OP_ALT);
2887 next_state_offset =
2888 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2889
2890 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2891 arrange for the repeat state also to be added to the relevant list.
2892 Calculate the offset, or set -1 for no repeat. */
2893
2894 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2895 *end_subpattern == OP_KETRMIN)?
2896 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2897
2898 /* If we have matched an empty string, add the next state at the
2899 current character pointer. This is important so that the duplicate
2900 checking kicks in, which is what breaks infinite loops that match an
2901 empty string. */
2902
2903 if (charcount == 0)
2904 {
2905 ADD_ACTIVE(next_state_offset, 0);
2906 }
2907
2908 /* Optimization: if there are no more active states, and there
2909 are no new states yet set up, then skip over the subject string
2910 right here, to save looping. Otherwise, set up the new state to swing
2911 into action when the end of the matched substring is reached. */
2912
2913 else if (i + 1 >= active_count && new_count == 0)
2914 {
2915 ptr += charcount;
2916 clen = 0;
2917 ADD_NEW(next_state_offset, 0);
2918
2919 /* If we are adding a repeat state at the new character position,
2920 we must fudge things so that it is the only current state.
2921 Otherwise, it might be a duplicate of one we processed before, and
2922 that would cause it to be skipped. */
2923
2924 if (repeat_state_offset >= 0)
2925 {
2926 next_active_state = active_states;
2927 active_count = 0;
2928 i = -1;
2929 ADD_ACTIVE(repeat_state_offset, 0);
2930 }
2931 }
2932 else
2933 {
2934 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2935 if (utf)
2936 {
2937 const pcre_uchar *p = start_subject + local_offsets[0];
2938 const pcre_uchar *pp = start_subject + local_offsets[1];
2939 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2940 }
2941 #endif
2942 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2943 if (repeat_state_offset >= 0)
2944 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2945 }
2946 }
2947 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2948 }
2949 break;
2950
2951
2952 /* ========================================================================== */
2953 /* Handle callouts */
2954
2955 case OP_CALLOUT:
2956 rrc = 0;
2957 if (PUBL(callout) != NULL)
2958 {
2959 PUBL(callout_block) cb;
2960 cb.version = 1; /* Version 1 of the callout block */
2961 cb.callout_number = code[1];
2962 cb.offset_vector = offsets;
2963 #if defined COMPILE_PCRE8
2964 cb.subject = (PCRE_SPTR)start_subject;
2965 #elif defined COMPILE_PCRE16
2966 cb.subject = (PCRE_SPTR16)start_subject;
2967 #elif defined COMPILE_PCRE32
2968 cb.subject = (PCRE_SPTR32)start_subject;
2969 #endif
2970 cb.subject_length = (int)(end_subject - start_subject);
2971 cb.start_match = (int)(current_subject - start_subject);
2972 cb.current_position = (int)(ptr - start_subject);
2973 cb.pattern_position = GET(code, 2);
2974 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2975 cb.capture_top = 1;
2976 cb.capture_last = -1;
2977 cb.callout_data = md->callout_data;
2978 cb.mark = NULL; /* No (*MARK) support */
2979 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2980 }
2981 if (rrc == 0)
2982 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2983 break;
2984
2985
2986 /* ========================================================================== */
2987 default: /* Unsupported opcode */
2988 return PCRE_ERROR_DFA_UITEM;
2989 }
2990
2991 NEXT_ACTIVE_STATE: continue;
2992
2993 } /* End of loop scanning active states */
2994
2995 /* We have finished the processing at the current subject character. If no
2996 new states have been set for the next character, we have found all the
2997 matches that we are going to find. If we are at the top level and partial
2998 matching has been requested, check for appropriate conditions.
2999
3000 The "forced_fail" variable counts the number of (*F) encountered for the
3001 character. If it is equal to the original active_count (saved in
3002 workspace[1]) it means that (*F) was found on every active state. In this
3003 case we don't want to give a partial match.
3004
3005 The "could_continue" variable is true if a state could have continued but
3006 for the fact that the end of the subject was reached. */
3007
3008 if (new_count <= 0)
3009 {
3010 if (rlevel == 1 && /* Top level, and */
3011 could_continue && /* Some could go on, and */
3012 forced_fail != workspace[1] && /* Not all forced fail & */
3013 ( /* either... */
3014 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3015 || /* or... */
3016 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3017 match_count < 0) /* no matches */
3018 ) && /* And... */
3019 (
3020 partial_newline || /* Either partial NL */
3021 ( /* or ... */
3022 ptr >= end_subject && /* End of subject and */
3023 ptr > md->start_used_ptr) /* Inspected non-empty string */
3024 )
3025 )
3026 {
3027 if (offsetcount >= 2)
3028 {
3029 offsets[0] = (int)(md->start_used_ptr - start_subject);
3030 offsets[1] = (int)(end_subject - start_subject);
3031 }
3032 match_count = PCRE_ERROR_PARTIAL;
3033 }
3034
3035 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3036 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3037 rlevel*2-2, SP));
3038 break; /* In effect, "return", but see the comment below */
3039 }
3040
3041 /* One or more states are active for the next character. */
3042
3043 ptr += clen; /* Advance to next subject character */
3044 } /* Loop to move along the subject string */
3045
3046 /* Control gets here from "break" a few lines above. We do it this way because
3047 if we use "return" above, we have compiler trouble. Some compilers warn if
3048 there's nothing here because they think the function doesn't return a value. On
3049 the other hand, if we put a dummy statement here, some more clever compilers
3050 complain that it can't be reached. Sigh. */
3051
3052 return match_count;
3053 }
3054
3055
3056
3057
3058 /*************************************************
3059 * Execute a Regular Expression - DFA engine *
3060 *************************************************/
3061
3062 /* This external function applies a compiled re to a subject string using a DFA
3063 engine. This function calls the internal function multiple times if the pattern
3064 is not anchored.
3065
3066 Arguments:
3067 argument_re points to the compiled expression
3068 extra_data points to extra data or is NULL
3069 subject points to the subject string
3070 length length of subject string (may contain binary zeros)
3071 start_offset where to start in the subject string
3072 options option bits
3073 offsets vector of match offsets
3074 offsetcount size of same
3075 workspace workspace vector
3076 wscount size of same
3077
3078 Returns: > 0 => number of match offset pairs placed in offsets
3079 = 0 => offsets overflowed; longest matches are present
3080 -1 => failed to match
3081 < -1 => some kind of unexpected problem
3082 */
3083
/* Public entry point for DFA matching; the same body is compiled under one of
three names and signatures depending on the character width selected. It
validates the arguments and the compiled pattern, fills in the dfa_match_data
block, and then runs a "bumpalong" loop that calls internal_dfa_exec() at
successive start positions. The loop ends when internal_dfa_exec() returns
anything other than PCRE_ERROR_NOMATCH, when the match is anchored, or when one
of the break conditions inside the loop fires (in which case the result is
PCRE_ERROR_NOMATCH). */
3084 #if defined COMPILE_PCRE8
3085 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3086 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3087 const char *subject, int length, int start_offset, int options, int *offsets,
3088 int offsetcount, int *workspace, int wscount)
3089 #elif defined COMPILE_PCRE16
3090 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3091 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3092 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3093 int offsetcount, int *workspace, int wscount)
3094 #elif defined COMPILE_PCRE32
3095 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3096 pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3097 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3098 int offsetcount, int *workspace, int wscount)
3099 #endif
3100 {
3101 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3102 dfa_match_data match_block;
3103 dfa_match_data *md = &match_block;
3104 BOOL utf, anchored, startline, firstline;
3105 const pcre_uchar *current_subject, *end_subject;
3106 const pcre_study_data *study = NULL;
3107
/* State for the start-of-match optimizations. The "2" variants hold the
other-case form of the character when caseless matching applies; otherwise
they equal the primary value, which is how the scanning loops below decide
between the one-character and two-character comparisons. */
3108 const pcre_uchar *req_char_ptr;
3109 const pcre_uint8 *start_bits = NULL;
3110 BOOL has_first_char = FALSE;
3111 BOOL has_req_char = FALSE;
3112 pcre_uchar first_char = 0;
3113 pcre_uchar first_char2 = 0;
3114 pcre_uchar req_char = 0;
3115 pcre_uchar req_char2 = 0;
3116 int newline;
3117
3118 /* Plausibility checks */
3119
/* offsets may be NULL only when offsetcount is zero. A negative length is not
tested separately: it always fails the start_offset range test below and so
yields PCRE_ERROR_BADOFFSET. */
3120 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3121 if (re == NULL || subject == NULL || workspace == NULL ||
3122 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3123 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3124 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3125 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3126
3127 /* Check that the first field in the block is the magic number. If it is not,
3128 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3129 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3130 means that the pattern is likely compiled with different endianness. */
3131
3132 if (re->magic_number != MAGIC_NUMBER)
3133 return re->magic_number == REVERSED_MAGIC_NUMBER?
3134 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3135 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3136
3137 /* If restarting after a partial match, do some sanity checks on the contents
3138 of the workspace. */
3139
3140 if ((options & PCRE_DFA_RESTART) != 0)
3141 {
/* Masking with -2 clears bit 0, so workspace[0] is accepted only if it is 0 or
1. workspace[1] must be at least 1 and no greater than the number of
INTS_PER_STATEBLOCK-sized blocks that fit after the first two ints.
NOTE(review): presumably workspace[1] is the saved count of active states
written by internal_dfa_exec() - confirm there. */
3142 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3143 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3144 return PCRE_ERROR_DFA_BADRESTART;
3145 }
3146
3147 /* Set up study, callout, and table data */
3148
3149 md->tables = re->tables;
3150 md->callout_data = NULL;
3151
3152 if (extra_data != NULL)
3153 {
3154 unsigned int flags = extra_data->flags;
3155 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3156 study = (const pcre_study_data *)extra_data->study_data;
/* Neither kind of match limit is accepted by this function: asking for one is
reported as an error rather than being silently ignored. */
3157 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3158 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3159 return PCRE_ERROR_DFA_UMLIMIT;
3160 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3161 md->callout_data = extra_data->callout_data;
/* Tables supplied in the extra block override those stored in the pattern. */
3162 if ((flags & PCRE_EXTRA_TABLES) != 0)
3163 md->tables = extra_data->tables;
3164 }
3165
3166 /* Set some local values */
3167
3168 current_subject = (const pcre_uchar *)subject + start_offset;
3169 end_subject = (const pcre_uchar *)subject + length;
/* Initialised to one before the first start position so that the first
"p > req_char_ptr" test in the required-character optimization succeeds and a
search is made. In this function the pointer is only compared, never
dereferenced. NOTE(review): when start_offset is 0 this is a pointer to before
the start of the subject, which C does not formally permit to be formed. */
3170 req_char_ptr = current_subject - 1;
3171
3172 #ifdef SUPPORT_UTF
3173 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3174 utf = (re->options & PCRE_UTF8) != 0;
3175 #else
3176 utf = FALSE;
3177 #endif
3178
/* The match is anchored if requested at match time or compile time; a restart
after a partial match (PCRE_DFA_RESTART) is also treated as anchored. */
3179 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3180 (re->options & PCRE_ANCHORED) != 0;
3181
3182 /* The remaining fixed data for passing around. */
3183
/* start_code is computed as the address just past the name table: the table
begins name_table_offset units into the block and occupies name_count entries
of name_entry_size units each (the arithmetic is done in pcre_uchar units). */
3184 md->start_code = (const pcre_uchar *)argument_re +
3185 re->name_table_offset + re->name_count * re->name_entry_size;
3186 md->start_subject = (const pcre_uchar *)subject;
3187 md->end_subject = end_subject;
3188 md->start_offset = start_offset;
3189 md->moptions = options;
3190 md->poptions = re->options;
3191
3192 /* If the BSR option is not set at match time, copy what was set
3193 at compile time. */
3194
3195 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3196 {
3197 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3198 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
/* With no BSR setting at either time, a build that defines BSR_ANYCRLF
defaults to PCRE_BSR_ANYCRLF; otherwise no BSR bit is set. */
3199 #ifdef BSR_ANYCRLF
3200 else md->moptions |= PCRE_BSR_ANYCRLF;
3201 #endif
3202 }
3203
3204 /* Handle different types of newline. The three bits give eight cases. If
3205 nothing is set at run time, whatever was used at compile time applies. */
3206
/* After the switch, newline holds one of: a single character value; two
characters packed as (first << 8) | second for CRLF; -1 for "any newline"; or
-2 for "any of CR, LF, CRLF". Unrecognised bit combinations are rejected. */
3207 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3208 PCRE_NEWLINE_BITS)
3209 {
3210 case 0: newline = NEWLINE; break; /* Compile-time default */
3211 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3212 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3213 case PCRE_NEWLINE_CR+
3214 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3215 case PCRE_NEWLINE_ANY: newline = -1; break;
3216 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3217 default: return PCRE_ERROR_BADNEWLINE;
3218 }
3219
/* Translate the encoded value into the nltype/nllen/nl[] fields of the match
block. nllen and nl[] are set only for the fixed-newline case. */
3220 if (newline == -2)
3221 {
3222 md->nltype = NLTYPE_ANYCRLF;
3223 }
3224 else if (newline < 0)
3225 {
3226 md->nltype = NLTYPE_ANY;
3227 }
3228 else
3229 {
3230 md->nltype = NLTYPE_FIXED;
3231 if (newline > 255)
3232 {
3233 md->nllen = 2;
3234 md->nl[0] = (newline >> 8) & 255;
3235 md->nl[1] = newline & 255;
3236 }
3237 else
3238 {
3239 md->nllen = 1;
3240 md->nl[0] = newline;
3241 }
3242 }
3243
3244 /* Check a UTF string if required. On failure, if offsetcount >= 2, the error
3245 offset and reason code are returned in offsets[0] and offsets[1]. */
3246
3247 #ifdef SUPPORT_UTF
3248 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3249 {
3250 int erroroffset;
3251 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3252 if (errorcode != 0)
3253 {
3254 if (offsetcount >= 2)
3255 {
3256 offsets[0] = erroroffset;
3257 offsets[1] = errorcode;
3258 }
/* In the 8-bit and 16-bit libraries, the low-numbered error codes (up to
PCRE_UTF8_ERR5 / PCRE_UTF16_ERR1) are reported as "short" rather than "bad"
when hard partial matching is requested. The 32-bit library has no "short"
variant. */
3259 #if defined COMPILE_PCRE8
3260 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3261 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3262 #elif defined COMPILE_PCRE16
3263 return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3264 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3265 #elif defined COMPILE_PCRE32
3266 return PCRE_ERROR_BADUTF32;
3267 #endif
3268 }
/* The string is valid; now reject a start offset that lands in the middle of a
multi-unit character. NOTE(review): the 8-bit constant name is also used for
the 16-bit build; presumably PCRE_ERROR_BADUTF16_OFFSET has the same value -
confirm in pcre.h. */
3269 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3270 if (start_offset > 0 && start_offset < length &&
3271 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3272 return PCRE_ERROR_BADUTF8_OFFSET;
3273 #endif
3274 }
3275 #endif
3276
3277 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3278 is a feature that makes it possible to save compiled regex and re-use them
3279 in other programs later. */
3280
3281 if (md->tables == NULL) md->tables = PRIV(default_tables);
3282
3283 /* The "must be at the start of a line" flags are used in a loop when finding
3284 where to start. */
3285
3286 startline = (re->flags & PCRE_STARTLINE) != 0;
3287 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3288
3289 /* Set up the first character to match, if available. The first_char value is
3290 never set for an anchored regular expression, but the anchoring may be forced
3291 at run time, so we have to test for anchoring. The first char may be unset for
3292 an unanchored pattern, of course. If there's no first char and the pattern was
3293 studied, there may be a bitmap of possible first characters. */
3294
3295 if (!anchored)
3296 {
3297 if ((re->flags & PCRE_FIRSTSET) != 0)
3298 {
3299 has_first_char = TRUE;
3300 first_char = first_char2 = (pcre_uchar)(re->first_char);
3301 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3302 {
/* Other case from the fcc table first; in the wider libraries with UCP support
a UTF character above 127 takes its other case from the Unicode data. */
3303 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3304 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3305 if (utf && first_char > 127)
3306 first_char2 = UCD_OTHERCASE(first_char);
3307 #endif
3308 }
3309 }
3310 else
3311 {
/* The study bitmap is used only when the pattern is not required to start at
the beginning of a line. */
3312 if (!startline && study != NULL &&
3313 (study->flags & PCRE_STUDY_MAPPED) != 0)
3314 start_bits = study->start_bits;
3315 }
3316 }
3317
3318 /* For anchored or unanchored matches, there may be a "last known required
3319 character" set. */
3320
3321 if ((re->flags & PCRE_REQCHSET) != 0)
3322 {
3323 has_req_char = TRUE;
3324 req_char = req_char2 = (pcre_uchar)(re->req_char);
3325 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3326 {
3327 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3328 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3329 if (utf && req_char > 127)
3330 req_char2 = UCD_OTHERCASE(req_char);
3331 #endif
3332 }
3333 }
3334
3335 /* Call the main matching function, looping for a non-anchored regex after a
3336 failed match. If not restarting, perform certain optimizations at the start of
3337 a match. */
3338
3339 for (;;)
3340 {
3341 int rc;
3342
3343 if ((options & PCRE_DFA_RESTART) == 0)
3344 {
3345 const pcre_uchar *save_end_subject = end_subject;
3346
3347 /* If firstline is TRUE, the start of the match is constrained to the first
3348 line of a multiline string. Implement this by temporarily adjusting
3349 end_subject so that we stop scanning at a newline. If the match fails at
3350 the newline, later code breaks this loop. */
3351
3352 if (firstline)
3353 {
3354 PCRE_PUCHAR t = current_subject;
3355 #ifdef SUPPORT_UTF
3356 if (utf)
3357 {
3358 while (t < md->end_subject && !IS_NEWLINE(t))
3359 {
3360 t++;
3361 ACROSSCHAR(t < end_subject, *t, t++);
3362 }
3363 }
3364 else
3365 #endif
3366 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3367 end_subject = t;
3368 }
3369
3370 /* There are some optimizations that avoid running the match if a known
3371 starting point is not found. However, there is an option that disables
3372 these, for testing and for ensuring that all callouts do actually occur.
3373 The option can be set in the regex by (*NO_START_OPT) or passed in
3374 match-time options. */
3375
3376 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3377 {
3378 /* Advance to a known first char. */
3379
3380 if (has_first_char)
3381 {
/* Differing values mean a caseless first character: test both forms. */
3382 if (first_char != first_char2)
3383 {
3384 pcre_uchar csc;
3385 while (current_subject < end_subject &&
3386 (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3387 current_subject++;
3388 }
3389 else
3390 while (current_subject < end_subject &&
3391 RAWUCHARTEST(current_subject) != first_char)
3392 current_subject++;
3393 }
3394
3395 /* Or to just after a linebreak for a multiline match if possible */
3396
3397 else if (startline)
3398 {
/* No skipping is done at the original start offset itself; only later start
positions are advanced to just after a newline. This also guarantees that
current_subject - 1, read below, lies within the subject. */
3399 if (current_subject > md->start_subject + start_offset)
3400 {
3401 #ifdef SUPPORT_UTF
3402 if (utf)
3403 {
3404 while (current_subject < end_subject &&
3405 !WAS_NEWLINE(current_subject))
3406 {
3407 current_subject++;
3408 ACROSSCHAR(current_subject < end_subject, *current_subject,
3409 current_subject++);
3410 }
3411 }
3412 else
3413 #endif
3414 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3415 current_subject++;
3416
3417 /* If we have just passed a CR and the newline option is ANY or
3418 ANYCRLF, and we are now at a LF, advance the match position by one
3419 more character. */
3420
3421 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3422 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3423 current_subject < end_subject &&
3424 RAWUCHARTEST(current_subject) == CHAR_NL)
3425 current_subject++;
3426 }
3427 }
3428
3429 /* Or to a non-unique first char after study */
3430
3431 else if (start_bits != NULL)
3432 {
3433 while (current_subject < end_subject)
3434 {
/* The bitmap is indexed by byte c/8 and bit c&7. In the wider libraries any
value above 255 is looked up as 255. */
3435 register pcre_uint32 c = RAWUCHARTEST(current_subject);
3436 #ifndef COMPILE_PCRE8
3437 if (c > 255) c = 255;
3438 #endif
3439 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3440 {
3441 current_subject++;
3442 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3443 /* In non 8-bit mode, the iteration will stop for
3444 characters > 255 at the beginning or not stop at all. */
3445 if (utf)
3446 ACROSSCHAR(current_subject < end_subject, *current_subject,
3447 current_subject++);
3448 #endif
3449 }
3450 else break;
3451 }
3452 }
3453 }
3454
3455 /* Restore fudged end_subject */
3456
3457 end_subject = save_end_subject;
3458
3459 /* The following two optimizations are disabled for partial matching or if
3460 disabling is explicitly requested (and of course, by the test above, this
3461 code is not obeyed when restarting after a partial match). */
3462
3463 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3464 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3465 {
3466 /* If the pattern was studied, a minimum subject length may be set. This
3467 is a lower bound; no actual string of that length may actually match the
3468 pattern. Although the value is, strictly, in characters, we treat it as
3469 bytes to avoid spending too much time in this optimization. */
3470
/* This returns directly rather than breaking: the remaining subject only gets
shorter on later iterations, so no later start position can succeed either. */
3471 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3472 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3473 return PCRE_ERROR_NOMATCH;
3474
3475 /* If req_char is set, we know that that character must appear in the
3476 subject for the match to succeed. If the first character is set, req_char
3477 must be later in the subject; otherwise the test starts at the match
3478 point. This optimization can save a huge amount of work in patterns with
3479 nested unlimited repeats that aren't going to match. Writing separate
3480 code for cased/caseless versions makes it go faster, as does using an
3481 autoincrement and backing off on a match.
3482
3483 HOWEVER: when the subject string is very, very long, searching to its end
3484 can take a long time, and give bad performance on quite ordinary
3485 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3486 string... so we don't do this when the string is sufficiently long. */
3487
3488 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3489 {
3490 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3491
3492 /* We don't need to repeat the search if we haven't yet reached the
3493 place we found it at last time. */
3494
3495 if (p > req_char_ptr)
3496 {
3497 if (req_char != req_char2)
3498 {
3499 while (p < end_subject)
3500 {
3501 register pcre_uint32 pp = RAWUCHARINCTEST(p);
3502 if (pp == req_char || pp == req_char2) { p--; break; }
3503 }
3504 }
3505 else
3506 {
3507 while (p < end_subject)
3508 {
3509 if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3510 }
3511 }
3512
3513 /* If we can't find the required character, break the matching loop,
3514 which will cause a return of PCRE_ERROR_NOMATCH. */
3515
3516 if (p >= end_subject) break;
3517
3518 /* If we have found the required character, save the point where we
3519 found it, so that we don't search again next time round the loop if
3520 the start hasn't passed this character yet. */
3521
3522 req_char_ptr = p;
3523 }
3524 }
3525 }
3526 } /* End of optimizations that are done when not restarting */
3527
3528 /* OK, now we can do the business */
3529
3530 md->start_used_ptr = current_subject;
3531 md->recursive = NULL;
3532
3533 rc = internal_dfa_exec(
3534 md, /* fixed match data */
3535 md->start_code, /* this subexpression's code */
3536 current_subject, /* where we currently are */
3537 start_offset, /* start offset in subject */
3538 offsets, /* offset vector */
3539 offsetcount, /* size of same */
3540 workspace, /* workspace vector */
3541 wscount, /* size of same */
3542 0); /* function recurse level */
3543
3544 /* Anything other than "no match" means we are done, always; otherwise, carry
3545 on only if not anchored. */
3546
3547 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3548
3549 /* Advance to the next subject character unless we are at the end of a line
3550 and firstline is set. */
3551
3552 if (firstline && IS_NEWLINE(current_subject)) break;
3553 current_subject++;
3554 #ifdef SUPPORT_UTF
3555 if (utf)
3556 {
3557 ACROSSCHAR(current_subject < end_subject, *current_subject,
3558 current_subject++);
3559 }
3560 #endif
/* The test is ">" rather than ">=", so one further attempt is made with the
start position exactly at the end of the subject. */
3561 if (current_subject > end_subject) break;
3562
3563 /* If we have just passed a CR and we are now at a LF, and the pattern does
3564 not contain any explicit matches for \r or \n, and the newline option is CRLF
3565 or ANY or ANYCRLF, advance the match position by one more character. */
3566
3567 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3568 current_subject < end_subject &&
3569 RAWUCHARTEST(current_subject) == CHAR_NL &&
3570 (re->flags & PCRE_HASCRORLF) == 0 &&
3571 (md->nltype == NLTYPE_ANY ||
3572 md->nltype == NLTYPE_ANYCRLF ||
3573 md->nllen == 2))
3574 current_subject++;
3575
3576 } /* "Bumpalong" loop */
3577
/* Reached only by breaking out of the bumpalong loop. */
3578 return PCRE_ERROR_NOMATCH;
3579 }
3580
3581 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5