/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1320 - (show annotations)
Wed May 1 16:39:35 2013 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 124922 byte(s)
Error occurred while calculating annotation data.
Source tidies (trails spaces, html updates) for 8.33-RC1.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2013 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
51
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
60
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
69
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
72
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK md /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre_internal.h"
84
85
86 /* For use to indent debugging output */
87
88 #define SP " "
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
/* Offsets that are added to the OP_TYPESTAR-family opcodes when the repeated
item needs special handling, giving each such combination its own case value.
The blocks of synthetic opcodes are spaced 20 apart, which should be enough
room for each block. The resulting values are never stored, so they do not
have to fit below 256; the first block starts at 300 to keep all of them well
clear of the normal opcodes. */

#define OP_PROP_EXTRA    300
#define OP_EXTUNI_EXTRA  (OP_PROP_EXTRA   + 20)
#define OP_ANYNL_EXTRA   (OP_EXTUNI_EXTRA + 20)
#define OP_HSPACE_EXTRA  (OP_ANYNL_EXTRA  + 20)
#define OP_VSPACE_EXTRA  (OP_HSPACE_EXTRA + 20)
105
106
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114
115 static const pcre_uint8 coptable[] = {
116 0, /* End */
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
120 0, 0, /* \P, \p */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 0, /* \X */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
124 1, /* Char */
125 1, /* Chari */
126 1, /* not */
127 1, /* noti */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131 1+IMM2_SIZE, /* exact */
132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135 1+IMM2_SIZE, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140 1+IMM2_SIZE, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144 1+IMM2_SIZE, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149 1+IMM2_SIZE, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
154 0, /* CLASS */
155 0, /* NCLASS */
156 0, /* XCLASS - variable length */
157 0, /* REF */
158 0, /* REFI */
159 0, /* RECURSE */
160 0, /* CALLOUT */
161 0, /* Alt */
162 0, /* Ket */
163 0, /* KetRmax */
164 0, /* KetRmin */
165 0, /* KetRpos */
166 0, /* Reverse */
167 0, /* Assert */
168 0, /* Assert not */
169 0, /* Assert behind */
170 0, /* Assert behind not */
171 0, 0, /* ONCE, ONCE_NC */
172 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
173 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
174 0, 0, /* CREF, NCREF */
175 0, 0, /* RREF, NRREF */
176 0, /* DEF */
177 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
178 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
179 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
180 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
181 0, 0 /* CLOSE, SKIPZERO */
182 };
183
184 /* This table identifies those opcodes that inspect a character. It is used to
185 remember the fact that a character could have been inspected when the end of
186 the subject is reached. ***NOTE*** If the start of this table is modified, the
187 two tables that follow must also be modified. */
188
189 static const pcre_uint8 poptable[] = {
190 0, /* End */
191 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
192 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
193 1, 1, 1, /* Any, AllAny, Anybyte */
194 1, 1, /* \P, \p */
195 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
196 1, /* \X */
197 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
198 1, /* Char */
199 1, /* Chari */
200 1, /* not */
201 1, /* noti */
202 /* Positive single-char repeats */
203 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
204 1, 1, 1, /* upto, minupto, exact */
205 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
206 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
207 1, 1, 1, /* upto I, minupto I, exact I */
208 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
209 /* Negative single-char repeats - only for chars < 256 */
210 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
211 1, 1, 1, /* NOT upto, minupto, exact */
212 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
213 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
214 1, 1, 1, /* NOT upto I, minupto I, exact I */
215 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
216 /* Positive type repeats */
217 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
218 1, 1, 1, /* Type upto, minupto, exact */
219 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
220 /* Character class & ref repeats */
221 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
222 1, 1, /* CRRANGE, CRMINRANGE */
223 1, /* CLASS */
224 1, /* NCLASS */
225 1, /* XCLASS - variable length */
226 0, /* REF */
227 0, /* REFI */
228 0, /* RECURSE */
229 0, /* CALLOUT */
230 0, /* Alt */
231 0, /* Ket */
232 0, /* KetRmax */
233 0, /* KetRmin */
234 0, /* KetRpos */
235 0, /* Reverse */
236 0, /* Assert */
237 0, /* Assert not */
238 0, /* Assert behind */
239 0, /* Assert behind not */
240 0, 0, /* ONCE, ONCE_NC */
241 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
242 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
243 0, 0, /* CREF, NCREF */
244 0, 0, /* RREF, NRREF */
245 0, /* DEF */
246 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
247 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
248 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
249 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
250 0, 0 /* CLOSE, SKIPZERO */
251 };
252
253 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254 and \w */
255
256 static const pcre_uint8 toptable1[] = {
257 0, 0, 0, 0, 0, 0,
258 ctype_digit, ctype_digit,
259 ctype_space, ctype_space,
260 ctype_word, ctype_word,
261 0, 0 /* OP_ANY, OP_ALLANY */
262 };
263
264 static const pcre_uint8 toptable2[] = {
265 0, 0, 0, 0, 0, 0,
266 ctype_digit, 0,
267 ctype_space, 0,
268 ctype_word, 0,
269 1, 1 /* OP_ANY, OP_ALLANY */
270 };
271
272
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value marks a
                                     state that is being held off (see the
                                     state loop in internal_dfa_exec) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* The number of ints that one stateblock occupies in the workspace vector.
The whole expansion is parenthesized so that the macro behaves as a single
operand wherever it is used (for example as the operand of sizeof); a bare
cast expression does not. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
285
286
287 #ifdef PCRE_DEBUG
288 /*************************************************
289 * Print character string *
290 *************************************************/
291
292 /* Character string printing function for debugging.
293
294 Arguments:
295 p points to string
296 length number of bytes
297 f where to print
298
299 Returns: nothing
300 */
301
302 static void
303 pchars(const pcre_uchar *p, int length, FILE *f)
304 {
305 pcre_uint32 c;
306 while (length-- > 0)
307 {
308 if (isprint(c = *(p++)))
309 fprintf(f, "%c", c);
310 else
311 fprintf(f, "\\x{%02x}", c);
312 }
313 }
314 #endif
315
316
317
318 /*************************************************
319 * Execute a Regular Expression - DFA engine *
320 *************************************************/
321
322 /* This internal function applies a compiled pattern to a subject string,
323 starting at a given point, using a DFA engine. This function is called from the
324 external one, possibly multiple times if the pattern is not anchored. The
325 function calls itself recursively for some kinds of subpattern.
326
327 Arguments:
328 md the match_data block with fixed information
329 this_start_code the opening bracket of this subexpression's code
330 current_subject where we currently are in the subject string
331 start_offset start offset in the subject string
332 offsets vector to contain the matching string offsets
333 offsetcount size of same
334 workspace vector of workspace
335 wscount size of same
336 rlevel function call recursion level
337
338 Returns: > 0 => number of match offset pairs placed in offsets
339 = 0 => offsets overflowed; longest matches are present
340 -1 => failed to match
341 < -1 => some kind of unexpected problem
342
343 The following macros are used for adding states to the two state vectors (one
344 for the current character, one for the following character). */
345
/* Append a state with opcode offset (x) and repeat count (y) to the active
list; the data field is left untouched. If active_count has already reached
wscount, the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead (the
counter is still incremented, as the test and the increment are one step). */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
355
/* Append a state with opcode offset (x), repeat count (y) and extra data (z)
to the active list. If active_count has already reached wscount, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE instead. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
366
/* Append a state with opcode offset (x) and repeat count (y) to the new list
(the states for the following character); the data field is left untouched.
If new_count has already reached wscount, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
376
/* Append a state with opcode offset (x), repeat count (y) and extra data (z)
to the new list (the states for the following character). If new_count has
already reached wscount, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead. The debugging output includes the source line
of the invocation. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  while (0)
388
389 /* And now, here is the code */
390
391 static int
392 internal_dfa_exec(
393 dfa_match_data *md,
394 const pcre_uchar *this_start_code,
395 const pcre_uchar *current_subject,
396 int start_offset,
397 int *offsets,
398 int offsetcount,
399 int *workspace,
400 int wscount,
401 int rlevel)
402 {
403 stateblock *active_states, *new_states, *temp_states;
404 stateblock *next_active_state, *next_new_state;
405
406 const pcre_uint8 *ctypes, *lcc, *fcc;
407 const pcre_uchar *ptr;
408 const pcre_uchar *end_code, *first_op;
409
410 dfa_recursion_info new_recursive;
411
412 int active_count, new_count, match_count;
413
414 /* Some fields in the md block are frequently referenced, so we load them into
415 independent variables in the hope that this will perform better. */
416
417 const pcre_uchar *start_subject = md->start_subject;
418 const pcre_uchar *end_subject = md->end_subject;
419 const pcre_uchar *start_code = md->start_code;
420
421 #ifdef SUPPORT_UTF
422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423 #else
424 BOOL utf = FALSE;
425 #endif
426
427 BOOL reset_could_continue = FALSE;
428
429 rlevel++;
430 offsetcount &= (-2);
431
432 wscount -= 2;
433 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
434 (2 * INTS_PER_STATEBLOCK);
435
436 DPRINTF(("\n%.*s---------------------\n"
437 "%.*sCall to internal_dfa_exec f=%d\n",
438 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439
440 ctypes = md->tables + ctypes_offset;
441 lcc = md->tables + lcc_offset;
442 fcc = md->tables + fcc_offset;
443
444 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
445
446 active_states = (stateblock *)(workspace + 2);
447 next_new_state = new_states = active_states + wscount;
448 new_count = 0;
449
450 first_op = this_start_code + 1 + LINK_SIZE +
451 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453 ? IMM2_SIZE:0);
454
455 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456 the alternative states onto the list, and find out where the end is. This
457 makes it possible to use this function recursively, when we want to stop at a
458 matching internal ket rather than at the end.
459
460 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
461 a backward assertion. In that case, we have to find out the maximum amount to
462 move back, and set up each alternative appropriately. */
463
464 if (*first_op == OP_REVERSE)
465 {
466 int max_back = 0;
467 int gone_back;
468
469 end_code = this_start_code;
470 do
471 {
472 int back = GET(end_code, 2+LINK_SIZE);
473 if (back > max_back) max_back = back;
474 end_code += GET(end_code, 1);
475 }
476 while (*end_code == OP_ALT);
477
478 /* If we can't go back the amount required for the longest lookbehind
479 pattern, go back as far as we can; some alternatives may still be viable. */
480
481 #ifdef SUPPORT_UTF
482 /* In character mode we have to step back character by character */
483
484 if (utf)
485 {
486 for (gone_back = 0; gone_back < max_back; gone_back++)
487 {
488 if (current_subject <= start_subject) break;
489 current_subject--;
490 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
491 }
492 }
493 else
494 #endif
495
496 /* In byte-mode we can do this quickly. */
497
498 {
499 gone_back = (current_subject - max_back < start_subject)?
500 (int)(current_subject - start_subject) : max_back;
501 current_subject -= gone_back;
502 }
503
504 /* Save the earliest consulted character */
505
506 if (current_subject < md->start_used_ptr)
507 md->start_used_ptr = current_subject;
508
509 /* Now we can process the individual branches. */
510
511 end_code = this_start_code;
512 do
513 {
514 int back = GET(end_code, 2+LINK_SIZE);
515 if (back <= gone_back)
516 {
517 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518 ADD_NEW_DATA(-bstate, 0, gone_back - back);
519 }
520 end_code += GET(end_code, 1);
521 }
522 while (*end_code == OP_ALT);
523 }
524
525 /* This is the code for a "normal" subpattern (not a backward assertion). The
526 start of a whole pattern is always one of these. If we are at the top level,
527 we may be asked to restart matching from the same point that we reached for a
528 previous partial match. We still have to scan through the top-level branches to
529 find the end state. */
530
531 else
532 {
533 end_code = this_start_code;
534
535 /* Restarting */
536
537 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
538 {
539 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
540 new_count = workspace[1];
541 if (!workspace[0])
542 memcpy(new_states, active_states, new_count * sizeof(stateblock));
543 }
544
545 /* Not restarting */
546
547 else
548 {
549 int length = 1 + LINK_SIZE +
550 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552 ? IMM2_SIZE:0);
553 do
554 {
555 ADD_NEW((int)(end_code - start_code + length), 0);
556 end_code += GET(end_code, 1);
557 length = 1 + LINK_SIZE;
558 }
559 while (*end_code == OP_ALT);
560 }
561 }
562
563 workspace[0] = 0; /* Bit indicating which vector is current */
564
565 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566
567 /* Loop for scanning the subject */
568
569 ptr = current_subject;
570 for (;;)
571 {
572 int i, j;
573 int clen, dlen;
574 pcre_uint32 c, d;
575 int forced_fail = 0;
576 BOOL partial_newline = FALSE;
577 BOOL could_continue = reset_could_continue;
578 reset_could_continue = FALSE;
579
580 /* Make the new state list into the active state list and empty the
581 new state list. */
582
583 temp_states = active_states;
584 active_states = new_states;
585 new_states = temp_states;
586 active_count = new_count;
587 new_count = 0;
588
589 workspace[0] ^= 1; /* Remember for the restarting feature */
590 workspace[1] = active_count;
591
592 #ifdef PCRE_DEBUG
593 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594 pchars(ptr, STRLEN_UC(ptr), stdout);
595 printf("\"\n");
596
597 printf("%.*sActive states: ", rlevel*2-2, SP);
598 for (i = 0; i < active_count; i++)
599 printf("%d/%d ", active_states[i].offset, active_states[i].count);
600 printf("\n");
601 #endif
602
603 /* Set the pointers for adding new states */
604
605 next_active_state = active_states + active_count;
606 next_new_state = new_states;
607
608 /* Load the current character from the subject outside the loop, as many
609 different states may want to look at it, and we assume that at least one
610 will. */
611
612 if (ptr < end_subject)
613 {
614 clen = 1; /* Number of data items in the character */
615 #ifdef SUPPORT_UTF
616 GETCHARLENTEST(c, ptr, clen);
617 #else
618 c = *ptr;
619 #endif /* SUPPORT_UTF */
620 }
621 else
622 {
623 clen = 0; /* This indicates the end of the subject */
624 c = NOTACHAR; /* This value should never actually be used */
625 }
626
627 /* Scan up the active states and act on each one. The result of an action
628 may be to add more states to the currently active list (e.g. on hitting a
629 parenthesis) or it may be to put states on the new list, for considering
630 when we move the character pointer on. */
631
632 for (i = 0; i < active_count; i++)
633 {
634 stateblock *current_state = active_states + i;
635 BOOL caseless = FALSE;
636 const pcre_uchar *code;
637 int state_offset = current_state->offset;
638 int codevalue, rrc;
639 unsigned int count;
640
641 #ifdef PCRE_DEBUG
642 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
643 if (clen == 0) printf("EOL\n");
644 else if (c > 32 && c < 127) printf("'%c'\n", c);
645 else printf("0x%02x\n", c);
646 #endif
647
648 /* A negative offset is a special case meaning "hold off going to this
649 (negated) state until the number of characters in the data field have
650 been skipped". If the could_continue flag was passed over from a previous
651 state, arrange for it to be passed on. */
652
653 if (state_offset < 0)
654 {
655 if (current_state->data > 0)
656 {
657 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
658 ADD_NEW_DATA(state_offset, current_state->count,
659 current_state->data - 1);
660 if (could_continue) reset_could_continue = TRUE;
661 continue;
662 }
663 else
664 {
665 current_state->offset = state_offset = -state_offset;
666 }
667 }
668
669 /* Check for a duplicate state with the same count, and skip if found.
670 See the note at the head of this module about the possibility of improving
671 performance here. */
672
673 for (j = 0; j < i; j++)
674 {
675 if (active_states[j].offset == state_offset &&
676 active_states[j].count == current_state->count)
677 {
678 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
679 goto NEXT_ACTIVE_STATE;
680 }
681 }
682
683 /* The state offset is the offset to the opcode */
684
685 code = start_code + state_offset;
686 codevalue = *code;
687
688 /* If this opcode inspects a character, but we are at the end of the
689 subject, remember the fact for use when testing for a partial match. */
690
691 if (clen == 0 && poptable[codevalue] != 0)
692 could_continue = TRUE;
693
694 /* If this opcode is followed by an inline character, load it. It is
695 tempting to test for the presence of a subject character here, but that
696 is wrong, because sometimes zero repetitions of the subject are
697 permitted.
698
699 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
700 argument that is not a data character - but is always one byte long because
701 the values are small. We have to take special action to deal with \P, \p,
702 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
703 these ones to new opcodes. */
704
705 if (coptable[codevalue] > 0)
706 {
707 dlen = 1;
708 #ifdef SUPPORT_UTF
709 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
710 #endif /* SUPPORT_UTF */
711 d = code[coptable[codevalue]];
712 if (codevalue >= OP_TYPESTAR)
713 {
714 switch(d)
715 {
716 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
717 case OP_NOTPROP:
718 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
719 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
720 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
721 case OP_NOT_HSPACE:
722 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
723 case OP_NOT_VSPACE:
724 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
725 default: break;
726 }
727 }
728 }
729 else
730 {
731 dlen = 0; /* Not strictly necessary, but compilers moan */
732 d = NOTACHAR; /* if these variables are not set. */
733 }
734
735
736 /* Now process the individual opcodes */
737
738 switch (codevalue)
739 {
740 /* ========================================================================== */
741 /* These cases are never obeyed. This is a fudge that causes a compile-
742 time error if the vectors coptable or poptable, which are indexed by
743 opcode, are not the correct length. It seems to be the only way to do
744 such a check at compile time, as the sizeof() operator does not work
745 in the C preprocessor. */
746
747 case OP_TABLE_LENGTH:
748 case OP_TABLE_LENGTH +
749 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
750 (sizeof(poptable) == OP_TABLE_LENGTH)):
751 break;
752
753 /* ========================================================================== */
754 /* Reached a closing bracket. If not at the end of the pattern, carry
755 on with the next opcode. For repeating opcodes, also add the repeat
756 state. Note that KETRPOS will always be encountered at the end of the
757 subpattern, because the possessive subpattern repeats are always handled
758 using recursive calls. Thus, it never adds any new states.
759
760 At the end of the (sub)pattern, unless we have an empty string and
761 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
762 start of the subject, save the match data, shifting up all previous
763 matches so we always have the longest first. */
764
765 case OP_KET:
766 case OP_KETRMIN:
767 case OP_KETRMAX:
768 case OP_KETRPOS:
769 if (code != end_code)
770 {
771 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
772 if (codevalue != OP_KET)
773 {
774 ADD_ACTIVE(state_offset - GET(code, 1), 0);
775 }
776 }
777 else
778 {
779 if (ptr > current_subject ||
780 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
781 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
782 current_subject > start_subject + md->start_offset)))
783 {
784 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
785 else if (match_count > 0 && ++match_count * 2 > offsetcount)
786 match_count = 0;
787 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
788 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
789 if (offsetcount >= 2)
790 {
791 offsets[0] = (int)(current_subject - start_subject);
792 offsets[1] = (int)(ptr - start_subject);
793 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
794 offsets[1] - offsets[0], (char *)current_subject));
795 }
796 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
797 {
798 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
799 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
800 match_count, rlevel*2-2, SP));
801 return match_count;
802 }
803 }
804 }
805 break;
806
807 /* ========================================================================== */
808 /* These opcodes add to the current list of states without looking
809 at the current character. */
810
811 /*-----------------------------------------------------------------*/
812 case OP_ALT:
813 do { code += GET(code, 1); } while (*code == OP_ALT);
814 ADD_ACTIVE((int)(code - start_code), 0);
815 break;
816
817 /*-----------------------------------------------------------------*/
818 case OP_BRA:
819 case OP_SBRA:
820 do
821 {
822 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
823 code += GET(code, 1);
824 }
825 while (*code == OP_ALT);
826 break;
827
828 /*-----------------------------------------------------------------*/
829 case OP_CBRA:
830 case OP_SCBRA:
831 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
832 code += GET(code, 1);
833 while (*code == OP_ALT)
834 {
835 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
836 code += GET(code, 1);
837 }
838 break;
839
840 /*-----------------------------------------------------------------*/
841 case OP_BRAZERO:
842 case OP_BRAMINZERO:
843 ADD_ACTIVE(state_offset + 1, 0);
844 code += 1 + GET(code, 2);
845 while (*code == OP_ALT) code += GET(code, 1);
846 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
847 break;
848
849 /*-----------------------------------------------------------------*/
850 case OP_SKIPZERO:
851 code += 1 + GET(code, 2);
852 while (*code == OP_ALT) code += GET(code, 1);
853 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
854 break;
855
856 /*-----------------------------------------------------------------*/
857 case OP_CIRC:
858 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
859 { ADD_ACTIVE(state_offset + 1, 0); }
860 break;
861
862 /*-----------------------------------------------------------------*/
863 case OP_CIRCM:
864 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
865 (ptr != end_subject && WAS_NEWLINE(ptr)))
866 { ADD_ACTIVE(state_offset + 1, 0); }
867 break;
868
869 /*-----------------------------------------------------------------*/
870 case OP_EOD:
871 if (ptr >= end_subject)
872 {
873 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
874 could_continue = TRUE;
875 else { ADD_ACTIVE(state_offset + 1, 0); }
876 }
877 break;
878
879 /*-----------------------------------------------------------------*/
880 case OP_SOD:
881 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
882 break;
883
884 /*-----------------------------------------------------------------*/
885 case OP_SOM:
886 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
887 break;
888
889
890 /* ========================================================================== */
891 /* These opcodes inspect the next subject character, and sometimes
892 the previous one as well, but do not have an argument. The variable
893 clen contains the length of the current character and is zero if we are
894 at the end of the subject. */
895
896 /*-----------------------------------------------------------------*/
897 case OP_ANY:
898 if (clen > 0 && !IS_NEWLINE(ptr))
899 {
900 if (ptr + 1 >= md->end_subject &&
901 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
902 NLBLOCK->nltype == NLTYPE_FIXED &&
903 NLBLOCK->nllen == 2 &&
904 c == NLBLOCK->nl[0])
905 {
906 could_continue = partial_newline = TRUE;
907 }
908 else
909 {
910 ADD_NEW(state_offset + 1, 0);
911 }
912 }
913 break;
914
915 /*-----------------------------------------------------------------*/
916 case OP_ALLANY:
917 if (clen > 0)
918 { ADD_NEW(state_offset + 1, 0); }
919 break;
920
921 /*-----------------------------------------------------------------*/
922 case OP_EODN:
923 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
924 could_continue = TRUE;
925 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
926 { ADD_ACTIVE(state_offset + 1, 0); }
927 break;
928
929 /*-----------------------------------------------------------------*/
930 case OP_DOLL:
931 if ((md->moptions & PCRE_NOTEOL) == 0)
932 {
933 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
934 could_continue = TRUE;
935 else if (clen == 0 ||
936 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
937 (ptr == end_subject - md->nllen)
938 ))
939 { ADD_ACTIVE(state_offset + 1, 0); }
940 else if (ptr + 1 >= md->end_subject &&
941 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
942 NLBLOCK->nltype == NLTYPE_FIXED &&
943 NLBLOCK->nllen == 2 &&
944 c == NLBLOCK->nl[0])
945 {
946 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
947 {
948 reset_could_continue = TRUE;
949 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
950 }
951 else could_continue = partial_newline = TRUE;
952 }
953 }
954 break;
955
956 /*-----------------------------------------------------------------*/
957 case OP_DOLLM:
958 if ((md->moptions & PCRE_NOTEOL) == 0)
959 {
960 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
961 could_continue = TRUE;
962 else if (clen == 0 ||
963 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
964 { ADD_ACTIVE(state_offset + 1, 0); }
965 else if (ptr + 1 >= md->end_subject &&
966 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
967 NLBLOCK->nltype == NLTYPE_FIXED &&
968 NLBLOCK->nllen == 2 &&
969 c == NLBLOCK->nl[0])
970 {
971 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
972 {
973 reset_could_continue = TRUE;
974 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
975 }
976 else could_continue = partial_newline = TRUE;
977 }
978 }
979 else if (IS_NEWLINE(ptr))
980 { ADD_ACTIVE(state_offset + 1, 0); }
981 break;
982
983 /*-----------------------------------------------------------------*/
984
985 case OP_DIGIT:
986 case OP_WHITESPACE:
987 case OP_WORDCHAR:
988 if (clen > 0 && c < 256 &&
989 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
990 { ADD_NEW(state_offset + 1, 0); }
991 break;
992
993 /*-----------------------------------------------------------------*/
994 case OP_NOT_DIGIT:
995 case OP_NOT_WHITESPACE:
996 case OP_NOT_WORDCHAR:
997 if (clen > 0 && (c >= 256 ||
998 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
999 { ADD_NEW(state_offset + 1, 0); }
1000 break;
1001
1002 /*-----------------------------------------------------------------*/
1003 case OP_WORD_BOUNDARY:
1004 case OP_NOT_WORD_BOUNDARY:
1005 {
1006 int left_word, right_word;
1007
1008 if (ptr > start_subject)
1009 {
1010 const pcre_uchar *temp = ptr - 1;
1011 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1012 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1013 if (utf) { BACKCHAR(temp); }
1014 #endif
1015 GETCHARTEST(d, temp);
1016 #ifdef SUPPORT_UCP
1017 if ((md->poptions & PCRE_UCP) != 0)
1018 {
1019 if (d == '_') left_word = TRUE; else
1020 {
1021 int cat = UCD_CATEGORY(d);
1022 left_word = (cat == ucp_L || cat == ucp_N);
1023 }
1024 }
1025 else
1026 #endif
1027 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1028 }
1029 else left_word = FALSE;
1030
1031 if (clen > 0)
1032 {
1033 #ifdef SUPPORT_UCP
1034 if ((md->poptions & PCRE_UCP) != 0)
1035 {
1036 if (c == '_') right_word = TRUE; else
1037 {
1038 int cat = UCD_CATEGORY(c);
1039 right_word = (cat == ucp_L || cat == ucp_N);
1040 }
1041 }
1042 else
1043 #endif
1044 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1045 }
1046 else right_word = FALSE;
1047
1048 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1049 { ADD_ACTIVE(state_offset + 1, 0); }
1050 }
1051 break;
1052
1053
1054 /*-----------------------------------------------------------------*/
1055 /* Check the next character by Unicode property. We will get here only
1056 if the support is in the binary; otherwise a compile-time error occurs.
1057 */
1058
1059 #ifdef SUPPORT_UCP
1060 case OP_PROP:
1061 case OP_NOTPROP:
1062 if (clen > 0)
1063 {
1064 BOOL OK;
1065 const pcre_uint32 *cp;
1066 const ucd_record * prop = GET_UCD(c);
1067 switch(code[1])
1068 {
1069 case PT_ANY:
1070 OK = TRUE;
1071 break;
1072
1073 case PT_LAMP:
1074 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1075 prop->chartype == ucp_Lt;
1076 break;
1077
1078 case PT_GC:
1079 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1080 break;
1081
1082 case PT_PC:
1083 OK = prop->chartype == code[2];
1084 break;
1085
1086 case PT_SC:
1087 OK = prop->script == code[2];
1088 break;
1089
1090 /* These are specials for combination cases. */
1091
1092 case PT_ALNUM:
1093 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1094 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1095 break;
1096
1097 case PT_SPACE: /* Perl space */
1098 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1099 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1100 break;
1101
1102 case PT_PXSPACE: /* POSIX space */
1103 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1104 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1105 c == CHAR_FF || c == CHAR_CR;
1106 break;
1107
1108 case PT_WORD:
1109 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1110 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1111 c == CHAR_UNDERSCORE;
1112 break;
1113
1114 case PT_CLIST:
1115 cp = PRIV(ucd_caseless_sets) + code[2];
1116 for (;;)
1117 {
1118 if (c < *cp) { OK = FALSE; break; }
1119 if (c == *cp++) { OK = TRUE; break; }
1120 }
1121 break;
1122
1123 case PT_UCNC:
1124 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1125 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1126 c >= 0xe000;
1127 break;
1128
1129 /* Should never occur, but keep compilers from grumbling. */
1130
1131 default:
1132 OK = codevalue != OP_PROP;
1133 break;
1134 }
1135
1136 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1137 }
1138 break;
1139 #endif
1140
1141
1142
1143 /* ========================================================================== */
1144 /* These opcodes likewise inspect the subject character, but have an
1145 argument that is not a data character. It is one of these opcodes:
1146 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
1147 OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1148
1149 case OP_TYPEPLUS:
1150 case OP_TYPEMINPLUS:
1151 case OP_TYPEPOSPLUS:
1152 count = current_state->count; /* Already matched */
1153 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1154 if (clen > 0)
1155 {
1156 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1157 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1158 NLBLOCK->nltype == NLTYPE_FIXED &&
1159 NLBLOCK->nllen == 2 &&
1160 c == NLBLOCK->nl[0])
1161 {
1162 could_continue = partial_newline = TRUE;
1163 }
1164 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1165 (c < 256 &&
1166 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1167 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1168 {
1169 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1170 {
1171 active_count--; /* Remove non-match possibility */
1172 next_active_state--;
1173 }
1174 count++;
1175 ADD_NEW(state_offset, count);
1176 }
1177 }
1178 break;
1179
1180 /*-----------------------------------------------------------------*/
1181 case OP_TYPEQUERY:
1182 case OP_TYPEMINQUERY:
1183 case OP_TYPEPOSQUERY:
1184 ADD_ACTIVE(state_offset + 2, 0);
1185 if (clen > 0)
1186 {
1187 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1188 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1189 NLBLOCK->nltype == NLTYPE_FIXED &&
1190 NLBLOCK->nllen == 2 &&
1191 c == NLBLOCK->nl[0])
1192 {
1193 could_continue = partial_newline = TRUE;
1194 }
1195 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1196 (c < 256 &&
1197 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1198 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1199 {
1200 if (codevalue == OP_TYPEPOSQUERY)
1201 {
1202 active_count--; /* Remove non-match possibility */
1203 next_active_state--;
1204 }
1205 ADD_NEW(state_offset + 2, 0);
1206 }
1207 }
1208 break;
1209
1210 /*-----------------------------------------------------------------*/
1211 case OP_TYPESTAR:
1212 case OP_TYPEMINSTAR:
1213 case OP_TYPEPOSSTAR:
1214 ADD_ACTIVE(state_offset + 2, 0);
1215 if (clen > 0)
1216 {
1217 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1218 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1219 NLBLOCK->nltype == NLTYPE_FIXED &&
1220 NLBLOCK->nllen == 2 &&
1221 c == NLBLOCK->nl[0])
1222 {
1223 could_continue = partial_newline = TRUE;
1224 }
1225 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1226 (c < 256 &&
1227 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1228 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1229 {
1230 if (codevalue == OP_TYPEPOSSTAR)
1231 {
1232 active_count--; /* Remove non-match possibility */
1233 next_active_state--;
1234 }
1235 ADD_NEW(state_offset, 0);
1236 }
1237 }
1238 break;
1239
1240 /*-----------------------------------------------------------------*/
1241 case OP_TYPEEXACT:
1242 count = current_state->count; /* Number already matched */
1243 if (clen > 0)
1244 {
1245 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1246 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1247 NLBLOCK->nltype == NLTYPE_FIXED &&
1248 NLBLOCK->nllen == 2 &&
1249 c == NLBLOCK->nl[0])
1250 {
1251 could_continue = partial_newline = TRUE;
1252 }
1253 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1254 (c < 256 &&
1255 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1256 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1257 {
1258 if (++count >= GET2(code, 1))
1259 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1260 else
1261 { ADD_NEW(state_offset, count); }
1262 }
1263 }
1264 break;
1265
1266 /*-----------------------------------------------------------------*/
1267 case OP_TYPEUPTO:
1268 case OP_TYPEMINUPTO:
1269 case OP_TYPEPOSUPTO:
1270 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1271 count = current_state->count; /* Number already matched */
1272 if (clen > 0)
1273 {
1274 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1275 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1276 NLBLOCK->nltype == NLTYPE_FIXED &&
1277 NLBLOCK->nllen == 2 &&
1278 c == NLBLOCK->nl[0])
1279 {
1280 could_continue = partial_newline = TRUE;
1281 }
1282 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1283 (c < 256 &&
1284 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1285 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1286 {
1287 if (codevalue == OP_TYPEPOSUPTO)
1288 {
1289 active_count--; /* Remove non-match possibility */
1290 next_active_state--;
1291 }
1292 if (++count >= GET2(code, 1))
1293 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1294 else
1295 { ADD_NEW(state_offset, count); }
1296 }
1297 }
1298 break;
1299
1300 /* ========================================================================== */
1301 /* These are virtual opcodes that are used when something like
1302 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a horizontal
1303 or vertical space opcode (e.g. OP_HSPACE, OP_VSPACE) as its argument. It
1304 keeps the code above fast for the other cases. The argument is in the d variable. */
1305
1306 #ifdef SUPPORT_UCP
1307 case OP_PROP_EXTRA + OP_TYPEPLUS:
1308 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1309 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1310 count = current_state->count; /* Already matched */
1311 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1312 if (clen > 0)
1313 {
1314 BOOL OK;
1315 const pcre_uint32 *cp;
1316 const ucd_record * prop = GET_UCD(c);
1317 switch(code[2])
1318 {
1319 case PT_ANY:
1320 OK = TRUE;
1321 break;
1322
1323 case PT_LAMP:
1324 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1325 prop->chartype == ucp_Lt;
1326 break;
1327
1328 case PT_GC:
1329 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1330 break;
1331
1332 case PT_PC:
1333 OK = prop->chartype == code[3];
1334 break;
1335
1336 case PT_SC:
1337 OK = prop->script == code[3];
1338 break;
1339
1340 /* These are specials for combination cases. */
1341
1342 case PT_ALNUM:
1343 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1344 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1345 break;
1346
1347 case PT_SPACE: /* Perl space */
1348 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1349 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1350 break;
1351
1352 case PT_PXSPACE: /* POSIX space */
1353 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1354 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1355 c == CHAR_FF || c == CHAR_CR;
1356 break;
1357
1358 case PT_WORD:
1359 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1360 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1361 c == CHAR_UNDERSCORE;
1362 break;
1363
1364 case PT_CLIST:
1365 cp = PRIV(ucd_caseless_sets) + code[3];
1366 for (;;)
1367 {
1368 if (c < *cp) { OK = FALSE; break; }
1369 if (c == *cp++) { OK = TRUE; break; }
1370 }
1371 break;
1372
1373 case PT_UCNC:
1374 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1375 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1376 c >= 0xe000;
1377 break;
1378
1379 /* Should never occur, but keep compilers from grumbling. */
1380
1381 default:
1382 OK = codevalue != OP_PROP;
1383 break;
1384 }
1385
1386 if (OK == (d == OP_PROP))
1387 {
1388 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1389 {
1390 active_count--; /* Remove non-match possibility */
1391 next_active_state--;
1392 }
1393 count++;
1394 ADD_NEW(state_offset, count);
1395 }
1396 }
1397 break;
1398
1399 /*-----------------------------------------------------------------*/
1400 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1401 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1402 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1403 count = current_state->count; /* Already matched */
1404 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1405 if (clen > 0)
1406 {
1407 int lgb, rgb;
1408 const pcre_uchar *nptr = ptr + clen;
1409 int ncount = 0;
1410 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1411 {
1412 active_count--; /* Remove non-match possibility */
1413 next_active_state--;
1414 }
1415 lgb = UCD_GRAPHBREAK(c);
1416 while (nptr < end_subject)
1417 {
1418 dlen = 1;
1419 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1420 rgb = UCD_GRAPHBREAK(d);
1421 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1422 ncount++;
1423 lgb = rgb;
1424 nptr += dlen;
1425 }
1426 count++;
1427 ADD_NEW_DATA(-state_offset, count, ncount);
1428 }
1429 break;
1430 #endif
1431
1432 /*-----------------------------------------------------------------*/
1433 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1434 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1435 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1436 count = current_state->count; /* Already matched */
1437 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1438 if (clen > 0)
1439 {
1440 int ncount = 0;
1441 switch (c)
1442 {
1443 case CHAR_VT:
1444 case CHAR_FF:
1445 case CHAR_NEL:
1446 #ifndef EBCDIC
1447 case 0x2028:
1448 case 0x2029:
1449 #endif /* Not EBCDIC */
1450 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1451 goto ANYNL01;
1452
1453 case CHAR_CR:
1454 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1455 /* Fall through */
1456
1457 ANYNL01:
1458 case CHAR_LF:
1459 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1460 {
1461 active_count--; /* Remove non-match possibility */
1462 next_active_state--;
1463 }
1464 count++;
1465 ADD_NEW_DATA(-state_offset, count, ncount);
1466 break;
1467
1468 default:
1469 break;
1470 }
1471 }
1472 break;
1473
1474 /*-----------------------------------------------------------------*/
1475 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1476 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1477 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1478 count = current_state->count; /* Already matched */
1479 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480 if (clen > 0)
1481 {
1482 BOOL OK;
1483 switch (c)
1484 {
1485 VSPACE_CASES:
1486 OK = TRUE;
1487 break;
1488
1489 default:
1490 OK = FALSE;
1491 break;
1492 }
1493
1494 if (OK == (d == OP_VSPACE))
1495 {
1496 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1497 {
1498 active_count--; /* Remove non-match possibility */
1499 next_active_state--;
1500 }
1501 count++;
1502 ADD_NEW_DATA(-state_offset, count, 0);
1503 }
1504 }
1505 break;
1506
1507 /*-----------------------------------------------------------------*/
1508 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1509 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1510 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1511 count = current_state->count; /* Already matched */
1512 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1513 if (clen > 0)
1514 {
1515 BOOL OK;
1516 switch (c)
1517 {
1518 HSPACE_CASES:
1519 OK = TRUE;
1520 break;
1521
1522 default:
1523 OK = FALSE;
1524 break;
1525 }
1526
1527 if (OK == (d == OP_HSPACE))
1528 {
1529 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1530 {
1531 active_count--; /* Remove non-match possibility */
1532 next_active_state--;
1533 }
1534 count++;
1535 ADD_NEW_DATA(-state_offset, count, 0);
1536 }
1537 }
1538 break;
1539
1540 /*-----------------------------------------------------------------*/
1541 #ifdef SUPPORT_UCP
1542 case OP_PROP_EXTRA + OP_TYPEQUERY:
1543 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1544 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1545 count = 4;
1546 goto QS1;
1547
1548 case OP_PROP_EXTRA + OP_TYPESTAR:
1549 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1550 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1551 count = 0;
1552
1553 QS1:
1554
1555 ADD_ACTIVE(state_offset + 4, 0);
1556 if (clen > 0)
1557 {
1558 BOOL OK;
1559 const pcre_uint32 *cp;
1560 const ucd_record * prop = GET_UCD(c);
1561 switch(code[2])
1562 {
1563 case PT_ANY:
1564 OK = TRUE;
1565 break;
1566
1567 case PT_LAMP:
1568 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1569 prop->chartype == ucp_Lt;
1570 break;
1571
1572 case PT_GC:
1573 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1574 break;
1575
1576 case PT_PC:
1577 OK = prop->chartype == code[3];
1578 break;
1579
1580 case PT_SC:
1581 OK = prop->script == code[3];
1582 break;
1583
1584 /* These are specials for combination cases. */
1585
1586 case PT_ALNUM:
1587 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1588 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1589 break;
1590
1591 case PT_SPACE: /* Perl space */
1592 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1593 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1594 break;
1595
1596 case PT_PXSPACE: /* POSIX space */
1597 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1598 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1599 c == CHAR_FF || c == CHAR_CR;
1600 break;
1601
1602 case PT_WORD:
1603 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1604 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1605 c == CHAR_UNDERSCORE;
1606 break;
1607
1608 case PT_CLIST:
1609 cp = PRIV(ucd_caseless_sets) + code[3];
1610 for (;;)
1611 {
1612 if (c < *cp) { OK = FALSE; break; }
1613 if (c == *cp++) { OK = TRUE; break; }
1614 }
1615 break;
1616
1617 case PT_UCNC:
1618 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1619 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1620 c >= 0xe000;
1621 break;
1622
1623 /* Should never occur, but keep compilers from grumbling. */
1624
1625 default:
1626 OK = codevalue != OP_PROP;
1627 break;
1628 }
1629
1630 if (OK == (d == OP_PROP))
1631 {
1632 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1633 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1634 {
1635 active_count--; /* Remove non-match possibility */
1636 next_active_state--;
1637 }
1638 ADD_NEW(state_offset + count, 0);
1639 }
1640 }
1641 break;
1642
1643 /*-----------------------------------------------------------------*/
1644 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1645 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1646 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1647 count = 2;
1648 goto QS2;
1649
1650 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1651 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1652 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1653 count = 0;
1654
1655 QS2:
1656
1657 ADD_ACTIVE(state_offset + 2, 0);
1658 if (clen > 0)
1659 {
1660 int lgb, rgb;
1661 const pcre_uchar *nptr = ptr + clen;
1662 int ncount = 0;
1663 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1664 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1665 {
1666 active_count--; /* Remove non-match possibility */
1667 next_active_state--;
1668 }
1669 lgb = UCD_GRAPHBREAK(c);
1670 while (nptr < end_subject)
1671 {
1672 dlen = 1;
1673 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1674 rgb = UCD_GRAPHBREAK(d);
1675 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1676 ncount++;
1677 lgb = rgb;
1678 nptr += dlen;
1679 }
1680 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1681 }
1682 break;
1683 #endif
1684
1685 /*-----------------------------------------------------------------*/
1686 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1687 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1688 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1689 count = 2;
1690 goto QS3;
1691
1692 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1693 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1694 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1695 count = 0;
1696
1697 QS3:
1698 ADD_ACTIVE(state_offset + 2, 0);
1699 if (clen > 0)
1700 {
1701 int ncount = 0;
1702 switch (c)
1703 {
1704 case CHAR_VT:
1705 case CHAR_FF:
1706 case CHAR_NEL:
1707 #ifndef EBCDIC
1708 case 0x2028:
1709 case 0x2029:
1710 #endif /* Not EBCDIC */
1711 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1712 goto ANYNL02;
1713
1714 case CHAR_CR:
1715 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1716 /* Fall through */
1717
1718 ANYNL02:
1719 case CHAR_LF:
1720 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1721 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1722 {
1723 active_count--; /* Remove non-match possibility */
1724 next_active_state--;
1725 }
1726 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1727 break;
1728
1729 default:
1730 break;
1731 }
1732 }
1733 break;
1734
1735 /*-----------------------------------------------------------------*/
1736 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1737 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1738 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1739 count = 2;
1740 goto QS4;
1741
1742 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1743 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1744 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1745 count = 0;
1746
1747 QS4:
1748 ADD_ACTIVE(state_offset + 2, 0);
1749 if (clen > 0)
1750 {
1751 BOOL OK;
1752 switch (c)
1753 {
1754 VSPACE_CASES:
1755 OK = TRUE;
1756 break;
1757
1758 default:
1759 OK = FALSE;
1760 break;
1761 }
1762 if (OK == (d == OP_VSPACE))
1763 {
1764 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1765 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1766 {
1767 active_count--; /* Remove non-match possibility */
1768 next_active_state--;
1769 }
1770 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1771 }
1772 }
1773 break;
1774
1775 /*-----------------------------------------------------------------*/
1776 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1777 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1778 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1779 count = 2;
1780 goto QS5;
1781
1782 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1783 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1784 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1785 count = 0;
1786
1787 QS5:
1788 ADD_ACTIVE(state_offset + 2, 0);
1789 if (clen > 0)
1790 {
1791 BOOL OK;
1792 switch (c)
1793 {
1794 HSPACE_CASES:
1795 OK = TRUE;
1796 break;
1797
1798 default:
1799 OK = FALSE;
1800 break;
1801 }
1802
1803 if (OK == (d == OP_HSPACE))
1804 {
1805 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1806 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1807 {
1808 active_count--; /* Remove non-match possibility */
1809 next_active_state--;
1810 }
1811 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1812 }
1813 }
1814 break;
1815
1816 /*-----------------------------------------------------------------*/
1817 #ifdef SUPPORT_UCP
1818 case OP_PROP_EXTRA + OP_TYPEEXACT:
1819 case OP_PROP_EXTRA + OP_TYPEUPTO:
1820 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1821 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1822 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1823 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1824 count = current_state->count; /* Number already matched */
1825 if (clen > 0)
1826 {
1827 BOOL OK;
1828 const pcre_uint32 *cp;
1829 const ucd_record * prop = GET_UCD(c);
1830 switch(code[1 + IMM2_SIZE + 1])
1831 {
1832 case PT_ANY:
1833 OK = TRUE;
1834 break;
1835
1836 case PT_LAMP:
1837 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1838 prop->chartype == ucp_Lt;
1839 break;
1840
1841 case PT_GC:
1842 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1843 break;
1844
1845 case PT_PC:
1846 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1847 break;
1848
1849 case PT_SC:
1850 OK = prop->script == code[1 + IMM2_SIZE + 2];
1851 break;
1852
1853 /* These are specials for combination cases. */
1854
1855 case PT_ALNUM:
1856 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1857 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1858 break;
1859
1860 case PT_SPACE: /* Perl space */
1861 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1862 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1863 break;
1864
1865 case PT_PXSPACE: /* POSIX space */
1866 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1867 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1868 c == CHAR_FF || c == CHAR_CR;
1869 break;
1870
1871 case PT_WORD:
1872 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1873 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1874 c == CHAR_UNDERSCORE;
1875 break;
1876
1877 case PT_CLIST:
1878 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1879 for (;;)
1880 {
1881 if (c < *cp) { OK = FALSE; break; }
1882 if (c == *cp++) { OK = TRUE; break; }
1883 }
1884 break;
1885
1886 case PT_UCNC:
1887 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1888 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1889 c >= 0xe000;
1890 break;
1891
1892 /* Should never occur, but keep compilers from grumbling. */
1893
1894 default:
1895 OK = codevalue != OP_PROP;
1896 break;
1897 }
1898
1899 if (OK == (d == OP_PROP))
1900 {
1901 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1902 {
1903 active_count--; /* Remove non-match possibility */
1904 next_active_state--;
1905 }
1906 if (++count >= GET2(code, 1))
1907 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1908 else
1909 { ADD_NEW(state_offset, count); }
1910 }
1911 }
1912 break;
1913
1914 /*-----------------------------------------------------------------*/
1915 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1916 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1917 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1918 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1919 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1920 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1921 count = current_state->count; /* Number already matched */
1922 if (clen > 0)
1923 {
1924 int lgb, rgb;
1925 const pcre_uchar *nptr = ptr + clen;
1926 int ncount = 0;
1927 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1928 {
1929 active_count--; /* Remove non-match possibility */
1930 next_active_state--;
1931 }
1932 lgb = UCD_GRAPHBREAK(c);
1933 while (nptr < end_subject)
1934 {
1935 dlen = 1;
1936 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1937 rgb = UCD_GRAPHBREAK(d);
1938 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1939 ncount++;
1940 lgb = rgb;
1941 nptr += dlen;
1942 }
1943 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1944 reset_could_continue = TRUE;
1945 if (++count >= GET2(code, 1))
1946 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1947 else
1948 { ADD_NEW_DATA(-state_offset, count, ncount); }
1949 }
1950 break;
1951 #endif
1952
1953 /*-----------------------------------------------------------------*/
1954 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1955 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1956 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1957 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1958 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1959 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1960 count = current_state->count; /* Number already matched */
1961 if (clen > 0)
1962 {
1963 int ncount = 0;
1964 switch (c)
1965 {
1966 case CHAR_VT:
1967 case CHAR_FF:
1968 case CHAR_NEL:
1969 #ifndef EBCDIC
1970 case 0x2028:
1971 case 0x2029:
1972 #endif /* Not EBCDIC */
1973 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1974 goto ANYNL03;
1975
1976 case CHAR_CR:
1977 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1978 /* Fall through */
1979
1980 ANYNL03:
1981 case CHAR_LF:
1982 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1983 {
1984 active_count--; /* Remove non-match possibility */
1985 next_active_state--;
1986 }
1987 if (++count >= GET2(code, 1))
1988 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1989 else
1990 { ADD_NEW_DATA(-state_offset, count, ncount); }
1991 break;
1992
1993 default:
1994 break;
1995 }
1996 }
1997 break;
1998
1999 /*-----------------------------------------------------------------*/
2000 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2001 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2002 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2003 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2004 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2005 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2006 count = current_state->count; /* Number already matched */
2007 if (clen > 0)
2008 {
2009 BOOL OK;
2010 switch (c)
2011 {
2012 VSPACE_CASES:
2013 OK = TRUE;
2014 break;
2015
2016 default:
2017 OK = FALSE;
2018 }
2019
2020 if (OK == (d == OP_VSPACE))
2021 {
2022 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2023 {
2024 active_count--; /* Remove non-match possibility */
2025 next_active_state--;
2026 }
2027 if (++count >= GET2(code, 1))
2028 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2029 else
2030 { ADD_NEW_DATA(-state_offset, count, 0); }
2031 }
2032 }
2033 break;
2034
2035 /*-----------------------------------------------------------------*/
2036 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2037 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2038 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2039 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2040 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2041 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2042 count = current_state->count; /* Number already matched */
2043 if (clen > 0)
2044 {
2045 BOOL OK;
2046 switch (c)
2047 {
2048 HSPACE_CASES:
2049 OK = TRUE;
2050 break;
2051
2052 default:
2053 OK = FALSE;
2054 break;
2055 }
2056
2057 if (OK == (d == OP_HSPACE))
2058 {
2059 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2060 {
2061 active_count--; /* Remove non-match possibility */
2062 next_active_state--;
2063 }
2064 if (++count >= GET2(code, 1))
2065 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2066 else
2067 { ADD_NEW_DATA(-state_offset, count, 0); }
2068 }
2069 }
2070 break;
2071
2072 /* ========================================================================== */
2073 /* These opcodes are followed by a character that is usually compared
2074 to the current subject character; it is loaded into d. We still get
2075 here even if there is no subject character, because in some cases zero
2076 repetitions are permitted. */
2077
2078 /*-----------------------------------------------------------------*/
2079 case OP_CHAR:
2080 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2081 break;
2082
2083 /*-----------------------------------------------------------------*/
2084 case OP_CHARI:
2085 if (clen == 0) break;
2086
2087 #ifdef SUPPORT_UTF
2088 if (utf)
2089 {
2090 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2091 {
2092 unsigned int othercase;
2093 if (c < 128)
2094 othercase = fcc[c];
2095 else
2096 /* If we have Unicode property support, we can use it to test the
2097 other case of the character. */
2098 #ifdef SUPPORT_UCP
2099 othercase = UCD_OTHERCASE(c);
2100 #else
2101 othercase = NOTACHAR;
2102 #endif
2103
2104 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2105 }
2106 }
2107 else
2108 #endif /* SUPPORT_UTF */
2109 /* Not UTF mode */
2110 {
2111 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2112 { ADD_NEW(state_offset + 2, 0); }
2113 }
2114 break;
2115
2116
2117 #ifdef SUPPORT_UCP
2118 /*-----------------------------------------------------------------*/
2119 /* This is a tricky one because it can match more than one character.
2120 Find out how many characters to skip, and then set up a negative state
2121 to wait for them to pass before continuing. */
2122
2123 case OP_EXTUNI:
2124 if (clen > 0)
2125 {
2126 int lgb, rgb;
2127 const pcre_uchar *nptr = ptr + clen;
2128 int ncount = 0;
2129 lgb = UCD_GRAPHBREAK(c);
2130 while (nptr < end_subject)
2131 {
2132 dlen = 1;
2133 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2134 rgb = UCD_GRAPHBREAK(d);
2135 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2136 ncount++;
2137 lgb = rgb;
2138 nptr += dlen;
2139 }
2140 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2141 reset_could_continue = TRUE;
2142 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2143 }
2144 break;
2145 #endif
2146
2147 /*-----------------------------------------------------------------*/
2148 /* This is a tricky one like EXTUNI because it too can match more than one
2149 character (when CR is followed by LF). In this case, set up a negative
2150 state to wait for one character to pass before continuing. */
2151
2152 case OP_ANYNL:
2153 if (clen > 0) switch(c)
2154 {
2155 case CHAR_VT:
2156 case CHAR_FF:
2157 case CHAR_NEL:
2158 #ifndef EBCDIC
2159 case 0x2028:
2160 case 0x2029:
2161 #endif /* Not EBCDIC */
2162 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2163
2164 case CHAR_LF:
2165 ADD_NEW(state_offset + 1, 0);
2166 break;
2167
2168 case CHAR_CR:
2169 if (ptr + 1 >= end_subject)
2170 {
2171 ADD_NEW(state_offset + 1, 0);
2172 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2173 reset_could_continue = TRUE;
2174 }
2175 else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2176 {
2177 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2178 }
2179 else
2180 {
2181 ADD_NEW(state_offset + 1, 0);
2182 }
2183 break;
2184 }
2185 break;
2186
2187 /*-----------------------------------------------------------------*/
2188 case OP_NOT_VSPACE:
2189 if (clen > 0) switch(c)
2190 {
2191 VSPACE_CASES:
2192 break;
2193
2194 default:
2195 ADD_NEW(state_offset + 1, 0);
2196 break;
2197 }
2198 break;
2199
2200 /*-----------------------------------------------------------------*/
2201 case OP_VSPACE:
2202 if (clen > 0) switch(c)
2203 {
2204 VSPACE_CASES:
2205 ADD_NEW(state_offset + 1, 0);
2206 break;
2207
2208 default:
2209 break;
2210 }
2211 break;
2212
2213 /*-----------------------------------------------------------------*/
2214 case OP_NOT_HSPACE:
2215 if (clen > 0) switch(c)
2216 {
2217 HSPACE_CASES:
2218 break;
2219
2220 default:
2221 ADD_NEW(state_offset + 1, 0);
2222 break;
2223 }
2224 break;
2225
2226 /*-----------------------------------------------------------------*/
2227 case OP_HSPACE:
2228 if (clen > 0) switch(c)
2229 {
2230 HSPACE_CASES:
2231 ADD_NEW(state_offset + 1, 0);
2232 break;
2233
2234 default:
2235 break;
2236 }
2237 break;
2238
2239 /*-----------------------------------------------------------------*/
2240 /* Match a negated single character casefully. */
2241
2242 case OP_NOT:
2243 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2244 break;
2245
2246 /*-----------------------------------------------------------------*/
2247 /* Match a negated single character caselessly. */
2248
2249 case OP_NOTI:
2250 if (clen > 0)
2251 {
2252 unsigned int otherd;
2253 #ifdef SUPPORT_UTF
2254 if (utf && d >= 128)
2255 {
2256 #ifdef SUPPORT_UCP
2257 otherd = UCD_OTHERCASE(d);
2258 #endif /* SUPPORT_UCP */
2259 }
2260 else
2261 #endif /* SUPPORT_UTF */
2262 otherd = TABLE_GET(d, fcc, d);
2263 if (c != d && c != otherd)
2264 { ADD_NEW(state_offset + dlen + 1, 0); }
2265 }
2266 break;
2267
2268 /*-----------------------------------------------------------------*/
2269 case OP_PLUSI:
2270 case OP_MINPLUSI:
2271 case OP_POSPLUSI:
2272 case OP_NOTPLUSI:
2273 case OP_NOTMINPLUSI:
2274 case OP_NOTPOSPLUSI:
2275 caseless = TRUE;
2276 codevalue -= OP_STARI - OP_STAR;
2277
2278 /* Fall through */
2279 case OP_PLUS:
2280 case OP_MINPLUS:
2281 case OP_POSPLUS:
2282 case OP_NOTPLUS:
2283 case OP_NOTMINPLUS:
2284 case OP_NOTPOSPLUS:
2285 count = current_state->count; /* Already matched */
2286 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2287 if (clen > 0)
2288 {
2289 pcre_uint32 otherd = NOTACHAR;
2290 if (caseless)
2291 {
2292 #ifdef SUPPORT_UTF
2293 if (utf && d >= 128)
2294 {
2295 #ifdef SUPPORT_UCP
2296 otherd = UCD_OTHERCASE(d);
2297 #endif /* SUPPORT_UCP */
2298 }
2299 else
2300 #endif /* SUPPORT_UTF */
2301 otherd = TABLE_GET(d, fcc, d);
2302 }
2303 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2304 {
2305 if (count > 0 &&
2306 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2307 {
2308 active_count--; /* Remove non-match possibility */
2309 next_active_state--;
2310 }
2311 count++;
2312 ADD_NEW(state_offset, count);
2313 }
2314 }
2315 break;
2316
2317 /*-----------------------------------------------------------------*/
2318 case OP_QUERYI:
2319 case OP_MINQUERYI:
2320 case OP_POSQUERYI:
2321 case OP_NOTQUERYI:
2322 case OP_NOTMINQUERYI:
2323 case OP_NOTPOSQUERYI:
2324 caseless = TRUE;
2325 codevalue -= OP_STARI - OP_STAR;
2326 /* Fall through */
2327 case OP_QUERY:
2328 case OP_MINQUERY:
2329 case OP_POSQUERY:
2330 case OP_NOTQUERY:
2331 case OP_NOTMINQUERY:
2332 case OP_NOTPOSQUERY:
2333 ADD_ACTIVE(state_offset + dlen + 1, 0);
2334 if (clen > 0)
2335 {
2336 pcre_uint32 otherd = NOTACHAR;
2337 if (caseless)
2338 {
2339 #ifdef SUPPORT_UTF
2340 if (utf && d >= 128)
2341 {
2342 #ifdef SUPPORT_UCP
2343 otherd = UCD_OTHERCASE(d);
2344 #endif /* SUPPORT_UCP */
2345 }
2346 else
2347 #endif /* SUPPORT_UTF */
2348 otherd = TABLE_GET(d, fcc, d);
2349 }
2350 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2351 {
2352 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2353 {
2354 active_count--; /* Remove non-match possibility */
2355 next_active_state--;
2356 }
2357 ADD_NEW(state_offset + dlen + 1, 0);
2358 }
2359 }
2360 break;
2361
2362 /*-----------------------------------------------------------------*/
2363 case OP_STARI:
2364 case OP_MINSTARI:
2365 case OP_POSSTARI:
2366 case OP_NOTSTARI:
2367 case OP_NOTMINSTARI:
2368 case OP_NOTPOSSTARI:
2369 caseless = TRUE;
2370 codevalue -= OP_STARI - OP_STAR;
2371 /* Fall through */
2372 case OP_STAR:
2373 case OP_MINSTAR:
2374 case OP_POSSTAR:
2375 case OP_NOTSTAR:
2376 case OP_NOTMINSTAR:
2377 case OP_NOTPOSSTAR:
2378 ADD_ACTIVE(state_offset + dlen + 1, 0);
2379 if (clen > 0)
2380 {
2381 pcre_uint32 otherd = NOTACHAR;
2382 if (caseless)
2383 {
2384 #ifdef SUPPORT_UTF
2385 if (utf && d >= 128)
2386 {
2387 #ifdef SUPPORT_UCP
2388 otherd = UCD_OTHERCASE(d);
2389 #endif /* SUPPORT_UCP */
2390 }
2391 else
2392 #endif /* SUPPORT_UTF */
2393 otherd = TABLE_GET(d, fcc, d);
2394 }
2395 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2396 {
2397 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2398 {
2399 active_count--; /* Remove non-match possibility */
2400 next_active_state--;
2401 }
2402 ADD_NEW(state_offset, 0);
2403 }
2404 }
2405 break;
2406
2407 /*-----------------------------------------------------------------*/
2408 case OP_EXACTI:
2409 case OP_NOTEXACTI:
2410 caseless = TRUE;
2411 codevalue -= OP_STARI - OP_STAR;
2412 /* Fall through */
2413 case OP_EXACT:
2414 case OP_NOTEXACT:
2415 count = current_state->count; /* Number already matched */
2416 if (clen > 0)
2417 {
2418 pcre_uint32 otherd = NOTACHAR;
2419 if (caseless)
2420 {
2421 #ifdef SUPPORT_UTF
2422 if (utf && d >= 128)
2423 {
2424 #ifdef SUPPORT_UCP
2425 otherd = UCD_OTHERCASE(d);
2426 #endif /* SUPPORT_UCP */
2427 }
2428 else
2429 #endif /* SUPPORT_UTF */
2430 otherd = TABLE_GET(d, fcc, d);
2431 }
2432 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2433 {
2434 if (++count >= GET2(code, 1))
2435 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2436 else
2437 { ADD_NEW(state_offset, count); }
2438 }
2439 }
2440 break;
2441
2442 /*-----------------------------------------------------------------*/
2443 case OP_UPTOI:
2444 case OP_MINUPTOI:
2445 case OP_POSUPTOI:
2446 case OP_NOTUPTOI:
2447 case OP_NOTMINUPTOI:
2448 case OP_NOTPOSUPTOI:
2449 caseless = TRUE;
2450 codevalue -= OP_STARI - OP_STAR;
2451 /* Fall through */
2452 case OP_UPTO:
2453 case OP_MINUPTO:
2454 case OP_POSUPTO:
2455 case OP_NOTUPTO:
2456 case OP_NOTMINUPTO:
2457 case OP_NOTPOSUPTO:
2458 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2459 count = current_state->count; /* Number already matched */
2460 if (clen > 0)
2461 {
2462 pcre_uint32 otherd = NOTACHAR;
2463 if (caseless)
2464 {
2465 #ifdef SUPPORT_UTF
2466 if (utf && d >= 128)
2467 {
2468 #ifdef SUPPORT_UCP
2469 otherd = UCD_OTHERCASE(d);
2470 #endif /* SUPPORT_UCP */
2471 }
2472 else
2473 #endif /* SUPPORT_UTF */
2474 otherd = TABLE_GET(d, fcc, d);
2475 }
2476 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2477 {
2478 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2479 {
2480 active_count--; /* Remove non-match possibility */
2481 next_active_state--;
2482 }
2483 if (++count >= GET2(code, 1))
2484 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2485 else
2486 { ADD_NEW(state_offset, count); }
2487 }
2488 }
2489 break;
2490
2491
2492 /* ========================================================================== */
2493 /* These are the class-handling opcodes */
2494
2495 case OP_CLASS:
2496 case OP_NCLASS:
2497 case OP_XCLASS:
2498 {
2499 BOOL isinclass = FALSE;
2500 int next_state_offset;
2501 const pcre_uchar *ecode;
2502
2503 /* For a simple class, there is always just a 32-byte table, and we
2504 can set isinclass from it. */
2505
2506 if (codevalue != OP_XCLASS)
2507 {
2508 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2509 if (clen > 0)
2510 {
2511 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2512 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2513 }
2514 }
2515
2516 /* An extended class may have a table or a list of single characters,
2517 ranges, or both, and it may be positive or negative. There's a
2518 function that sorts all this out. */
2519
2520 else
2521 {
2522 ecode = code + GET(code, 1);
2523 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2524 }
2525
2526 /* At this point, isinclass is set for all kinds of class, and ecode
2527 points to the byte after the end of the class. If there is a
2528 quantifier, this is where it will be. */
2529
2530 next_state_offset = (int)(ecode - start_code);
2531
2532 switch (*ecode)
2533 {
2534 case OP_CRSTAR:
2535 case OP_CRMINSTAR:
2536 ADD_ACTIVE(next_state_offset + 1, 0);
2537 if (isinclass) { ADD_NEW(state_offset, 0); }
2538 break;
2539
2540 case OP_CRPLUS:
2541 case OP_CRMINPLUS:
2542 count = current_state->count; /* Already matched */
2543 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2544 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2545 break;
2546
2547 case OP_CRQUERY:
2548 case OP_CRMINQUERY:
2549 ADD_ACTIVE(next_state_offset + 1, 0);
2550 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2551 break;
2552
2553 case OP_CRRANGE:
2554 case OP_CRMINRANGE:
2555 count = current_state->count; /* Already matched */
2556 if (count >= GET2(ecode, 1))
2557 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2558 if (isinclass)
2559 {
2560 unsigned int max = GET2(ecode, 1 + IMM2_SIZE);
2561 if (++count >= max && max != 0) /* Max 0 => no limit */
2562 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2563 else
2564 { ADD_NEW(state_offset, count); }
2565 }
2566 break;
2567
2568 default:
2569 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2570 break;
2571 }
2572 }
2573 break;
2574
2575 /* ========================================================================== */
2576 /* These are the opcodes for fancy brackets of various kinds. We have
2577 to use recursion in order to handle them. The "always failing" assertion
2578 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2579 though the other "backtracking verbs" are not supported. */
2580
2581 case OP_FAIL:
2582 forced_fail++; /* Count FAILs for multiple states */
2583 break;
2584
2585 case OP_ASSERT:
2586 case OP_ASSERT_NOT:
2587 case OP_ASSERTBACK:
2588 case OP_ASSERTBACK_NOT:
2589 {
2590 int rc;
2591 int local_offsets[2];
2592 int local_workspace[1000];
2593 const pcre_uchar *endasscode = code + GET(code, 1);
2594
2595 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2596
2597 rc = internal_dfa_exec(
2598 md, /* static match data */
2599 code, /* this subexpression's code */
2600 ptr, /* where we currently are */
2601 (int)(ptr - start_subject), /* start offset */
2602 local_offsets, /* offset vector */
2603 sizeof(local_offsets)/sizeof(int), /* size of same */
2604 local_workspace, /* workspace vector */
2605 sizeof(local_workspace)/sizeof(int), /* size of same */
2606 rlevel); /* function recursion level */
2607
2608 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2609 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2610 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2611 }
2612 break;
2613
2614 /*-----------------------------------------------------------------*/
2615 case OP_COND:
2616 case OP_SCOND:
2617 {
2618 int local_offsets[1000];
2619 int local_workspace[1000];
2620 int codelink = GET(code, 1);
2621 int condcode;
2622
2623 /* Because of the way auto-callout works during compile, a callout item
2624 is inserted between OP_COND and an assertion condition. This does not
2625 happen for the other conditions. */
2626
2627 if (code[LINK_SIZE+1] == OP_CALLOUT)
2628 {
2629 rrc = 0;
2630 if (PUBL(callout) != NULL)
2631 {
2632 PUBL(callout_block) cb;
2633 cb.version = 1; /* Version 1 of the callout block */
2634 cb.callout_number = code[LINK_SIZE+2];
2635 cb.offset_vector = offsets;
2636 #if defined COMPILE_PCRE8
2637 cb.subject = (PCRE_SPTR)start_subject;
2638 #elif defined COMPILE_PCRE16
2639 cb.subject = (PCRE_SPTR16)start_subject;
2640 #elif defined COMPILE_PCRE32
2641 cb.subject = (PCRE_SPTR32)start_subject;
2642 #endif
2643 cb.subject_length = (int)(end_subject - start_subject);
2644 cb.start_match = (int)(current_subject - start_subject);
2645 cb.current_position = (int)(ptr - start_subject);
2646 cb.pattern_position = GET(code, LINK_SIZE + 3);
2647 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2648 cb.capture_top = 1;
2649 cb.capture_last = -1;
2650 cb.callout_data = md->callout_data;
2651 cb.mark = NULL; /* No (*MARK) support */
2652 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2653 }
2654 if (rrc > 0) break; /* Fail this thread */
2655 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2656 }
2657
2658 condcode = code[LINK_SIZE+1];
2659
2660 /* Back reference conditions are not supported */
2661
2662 if (condcode == OP_CREF || condcode == OP_NCREF)
2663 return PCRE_ERROR_DFA_UCOND;
2664
2665 /* The DEFINE condition is always false */
2666
2667 if (condcode == OP_DEF)
2668 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2669
2670 /* The only supported version of OP_RREF is for the value RREF_ANY,
2671 which means "test if in any recursion". We can't test for specifically
2672 recursed groups. */
2673
2674 else if (condcode == OP_RREF || condcode == OP_NRREF)
2675 {
2676 int value = GET2(code, LINK_SIZE + 2);
2677 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2678 if (md->recursive != NULL)
2679 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2680 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2681 }
2682
2683 /* Otherwise, the condition is an assertion */
2684
2685 else
2686 {
2687 int rc;
2688 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2689 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2690
2691 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2692
2693 rc = internal_dfa_exec(
2694 md, /* fixed match data */
2695 asscode, /* this subexpression's code */
2696 ptr, /* where we currently are */
2697 (int)(ptr - start_subject), /* start offset */
2698 local_offsets, /* offset vector */
2699 sizeof(local_offsets)/sizeof(int), /* size of same */
2700 local_workspace, /* workspace vector */
2701 sizeof(local_workspace)/sizeof(int), /* size of same */
2702 rlevel); /* function recursion level */
2703
2704 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2705 if ((rc >= 0) ==
2706 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2707 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2708 else
2709 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2710 }
2711 }
2712 break;
2713
2714 /*-----------------------------------------------------------------*/
2715 case OP_RECURSE:
2716 {
2717 dfa_recursion_info *ri;
2718 int local_offsets[1000];
2719 int local_workspace[1000];
2720 const pcre_uchar *callpat = start_code + GET(code, 1);
2721 int recno = (callpat == md->start_code)? 0 :
2722 GET2(callpat, 1 + LINK_SIZE);
2723 int rc;
2724
2725 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2726
2727 /* Check for repeating a recursion without advancing the subject
2728 pointer. This should catch convoluted mutual recursions. (Some simple
2729 cases are caught at compile time.) */
2730
2731 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2732 if (recno == ri->group_num && ptr == ri->subject_position)
2733 return PCRE_ERROR_RECURSELOOP;
2734
2735 /* Remember this recursion and where we started it so as to
2736 catch infinite loops. */
2737
2738 new_recursive.group_num = recno;
2739 new_recursive.subject_position = ptr;
2740 new_recursive.prevrec = md->recursive;
2741 md->recursive = &new_recursive;
2742
2743 rc = internal_dfa_exec(
2744 md, /* fixed match data */
2745 callpat, /* this subexpression's code */
2746 ptr, /* where we currently are */
2747 (int)(ptr - start_subject), /* start offset */
2748 local_offsets, /* offset vector */
2749 sizeof(local_offsets)/sizeof(int), /* size of same */
2750 local_workspace, /* workspace vector */
2751 sizeof(local_workspace)/sizeof(int), /* size of same */
2752 rlevel); /* function recursion level */
2753
2754 md->recursive = new_recursive.prevrec; /* Done this recursion */
2755
2756 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2757 rc));
2758
2759 /* Ran out of internal offsets */
2760
2761 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2762
2763 /* For each successful matched substring, set up the next state with a
2764 count of characters to skip before trying it. Note that the count is in
2765 characters, not bytes. */
2766
2767 if (rc > 0)
2768 {
2769 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2770 {
2771 int charcount = local_offsets[rc+1] - local_offsets[rc];
2772 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2773 if (utf)
2774 {
2775 const pcre_uchar *p = start_subject + local_offsets[rc];
2776 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2777 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2778 }
2779 #endif
2780 if (charcount > 0)
2781 {
2782 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2783 }
2784 else
2785 {
2786 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2787 }
2788 }
2789 }
2790 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2791 }
2792 break;
2793
2794 /*-----------------------------------------------------------------*/
2795 case OP_BRAPOS:
2796 case OP_SBRAPOS:
2797 case OP_CBRAPOS:
2798 case OP_SCBRAPOS:
2799 case OP_BRAPOSZERO:
2800 {
2801 int charcount, matched_count;
2802 const pcre_uchar *local_ptr = ptr;
2803 BOOL allow_zero;
2804
2805 if (codevalue == OP_BRAPOSZERO)
2806 {
2807 allow_zero = TRUE;
2808 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2809 }
2810 else allow_zero = FALSE;
2811
2812 /* Loop to match the subpattern as many times as possible as if it were
2813 a complete pattern. */
2814
2815 for (matched_count = 0;; matched_count++)
2816 {
2817 int local_offsets[2];
2818 int local_workspace[1000];
2819
2820 int rc = internal_dfa_exec(
2821 md, /* fixed match data */
2822 code, /* this subexpression's code */
2823 local_ptr, /* where we currently are */
2824 (int)(ptr - start_subject), /* start offset */
2825 local_offsets, /* offset vector */
2826 sizeof(local_offsets)/sizeof(int), /* size of same */
2827 local_workspace, /* workspace vector */
2828 sizeof(local_workspace)/sizeof(int), /* size of same */
2829 rlevel); /* function recursion level */
2830
2831 /* Failed to match */
2832
2833 if (rc < 0)
2834 {
2835 if (rc != PCRE_ERROR_NOMATCH) return rc;
2836 break;
2837 }
2838
2839 /* Matched: break the loop if zero characters matched. */
2840
2841 charcount = local_offsets[1] - local_offsets[0];
2842 if (charcount == 0) break;
2843 local_ptr += charcount; /* Advance temporary position ptr */
2844 }
2845
2846 /* At this point we have matched the subpattern matched_count
2847 times, and local_ptr is pointing to the character after the end of the
2848 last match. */
2849
2850 if (matched_count > 0 || allow_zero)
2851 {
2852 const pcre_uchar *end_subpattern = code;
2853 int next_state_offset;
2854
2855 do { end_subpattern += GET(end_subpattern, 1); }
2856 while (*end_subpattern == OP_ALT);
2857 next_state_offset =
2858 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2859
2860 /* Optimization: if there are no more active states, and there
2861 are no new states yet set up, then skip over the subject string
2862 right here, to save looping. Otherwise, set up the new state to swing
2863 into action when the end of the matched substring is reached. */
2864
2865 if (i + 1 >= active_count && new_count == 0)
2866 {
2867 ptr = local_ptr;
2868 clen = 0;
2869 ADD_NEW(next_state_offset, 0);
2870 }
2871 else
2872 {
2873 const pcre_uchar *p = ptr;
2874 const pcre_uchar *pp = local_ptr;
2875 charcount = (int)(pp - p);
2876 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2877 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2878 #endif
2879 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2880 }
2881 }
2882 }
2883 break;
2884
2885 /*-----------------------------------------------------------------*/
2886 case OP_ONCE:
2887 case OP_ONCE_NC:
2888 {
2889 int local_offsets[2];
2890 int local_workspace[1000];
2891
2892 int rc = internal_dfa_exec(
2893 md, /* fixed match data */
2894 code, /* this subexpression's code */
2895 ptr, /* where we currently are */
2896 (int)(ptr - start_subject), /* start offset */
2897 local_offsets, /* offset vector */
2898 sizeof(local_offsets)/sizeof(int), /* size of same */
2899 local_workspace, /* workspace vector */
2900 sizeof(local_workspace)/sizeof(int), /* size of same */
2901 rlevel); /* function recursion level */
2902
2903 if (rc >= 0)
2904 {
2905 const pcre_uchar *end_subpattern = code;
2906 int charcount = local_offsets[1] - local_offsets[0];
2907 int next_state_offset, repeat_state_offset;
2908
2909 do { end_subpattern += GET(end_subpattern, 1); }
2910 while (*end_subpattern == OP_ALT);
2911 next_state_offset =
2912 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2913
2914 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2915 arrange for the repeat state also to be added to the relevant list.
2916 Calculate the offset, or set -1 for no repeat. */
2917
2918 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2919 *end_subpattern == OP_KETRMIN)?
2920 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2921
2922 /* If we have matched an empty string, add the next state at the
2923 current character pointer. This is important so that the duplicate
2924 checking kicks in, which is what breaks infinite loops that match an
2925 empty string. */
2926
2927 if (charcount == 0)
2928 {
2929 ADD_ACTIVE(next_state_offset, 0);
2930 }
2931
2932 /* Optimization: if there are no more active states, and there
2933 are no new states yet set up, then skip over the subject string
2934 right here, to save looping. Otherwise, set up the new state to swing
2935 into action when the end of the matched substring is reached. */
2936
2937 else if (i + 1 >= active_count && new_count == 0)
2938 {
2939 ptr += charcount;
2940 clen = 0;
2941 ADD_NEW(next_state_offset, 0);
2942
2943 /* If we are adding a repeat state at the new character position,
2944 we must fudge things so that it is the only current state.
2945 Otherwise, it might be a duplicate of one we processed before, and
2946 that would cause it to be skipped. */
2947
2948 if (repeat_state_offset >= 0)
2949 {
2950 next_active_state = active_states;
2951 active_count = 0;
2952 i = -1;
2953 ADD_ACTIVE(repeat_state_offset, 0);
2954 }
2955 }
2956 else
2957 {
2958 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2959 if (utf)
2960 {
2961 const pcre_uchar *p = start_subject + local_offsets[0];
2962 const pcre_uchar *pp = start_subject + local_offsets[1];
2963 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2964 }
2965 #endif
2966 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2967 if (repeat_state_offset >= 0)
2968 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2969 }
2970 }
2971 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2972 }
2973 break;
2974
2975
2976 /* ========================================================================== */
2977 /* Handle callouts */
2978
2979 case OP_CALLOUT:
2980 rrc = 0;
2981 if (PUBL(callout) != NULL)
2982 {
2983 PUBL(callout_block) cb;
2984 cb.version = 1; /* Version 1 of the callout block */
2985 cb.callout_number = code[1];
2986 cb.offset_vector = offsets;
2987 #if defined COMPILE_PCRE8
2988 cb.subject = (PCRE_SPTR)start_subject;
2989 #elif defined COMPILE_PCRE16
2990 cb.subject = (PCRE_SPTR16)start_subject;
2991 #elif defined COMPILE_PCRE32
2992 cb.subject = (PCRE_SPTR32)start_subject;
2993 #endif
2994 cb.subject_length = (int)(end_subject - start_subject);
2995 cb.start_match = (int)(current_subject - start_subject);
2996 cb.current_position = (int)(ptr - start_subject);
2997 cb.pattern_position = GET(code, 2);
2998 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2999 cb.capture_top = 1;
3000 cb.capture_last = -1;
3001 cb.callout_data = md->callout_data;
3002 cb.mark = NULL; /* No (*MARK) support */
3003 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3004 }
3005 if (rrc == 0)
3006 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3007 break;
3008
3009
3010 /* ========================================================================== */
3011 default: /* Unsupported opcode */
3012 return PCRE_ERROR_DFA_UITEM;
3013 }
3014
3015 NEXT_ACTIVE_STATE: continue;
3016
3017 } /* End of loop scanning active states */
3018
3019 /* We have finished the processing at the current subject character. If no
3020 new states have been set for the next character, we have found all the
3021 matches that we are going to find. If we are at the top level and partial
3022 matching has been requested, check for appropriate conditions.
3023
3024 The "forced_fail" variable counts the number of (*F) encountered for the
3025 character. If it is equal to the original active_count (saved in
3026 workspace[1]) it means that (*F) was found on every active state. In this
3027 case we don't want to give a partial match.
3028
3029 The "could_continue" variable is true if a state could have continued but
3030 for the fact that the end of the subject was reached. */
3031
3032 if (new_count <= 0)
3033 {
3034 if (rlevel == 1 && /* Top level, and */
3035 could_continue && /* Some could go on, and */
3036 forced_fail != workspace[1] && /* Not all forced fail & */
3037 ( /* either... */
3038 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3039 || /* or... */
3040 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3041 match_count < 0) /* no matches */
3042 ) && /* And... */
3043 (
3044 partial_newline || /* Either partial NL */
3045 ( /* or ... */
3046 ptr >= end_subject && /* End of subject and */
3047 ptr > md->start_used_ptr) /* Inspected non-empty string */
3048 )
3049 )
3050 match_count = PCRE_ERROR_PARTIAL;
3051 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3052 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3053 rlevel*2-2, SP));
3054 break; /* In effect, "return", but see the comment below */
3055 }
3056
3057 /* One or more states are active for the next character. */
3058
3059 ptr += clen; /* Advance to next subject character */
3060 } /* Loop to move along the subject string */
3061
3062 /* Control gets here from "break" a few lines above. We do it this way because
3063 if we use "return" above, we have compiler trouble. Some compilers warn if
3064 there's nothing here because they think the function doesn't return a value. On
3065 the other hand, if we put a dummy statement here, some more clever compilers
3066 complain that it can't be reached. Sigh. */
3067
3068 return match_count;
3069 }
3070
3071
3072
3073
3074 /*************************************************
3075 * Execute a Regular Expression - DFA engine *
3076 *************************************************/
3077
3078 /* This external function applies a compiled re to a subject string using a DFA
3079 engine. This function calls the internal function multiple times if the pattern
3080 is not anchored, moving the starting point along the subject after each failure.
3081
3082 Arguments:
3083 argument_re points to the compiled expression (must not be NULL)
3084 extra_data points to extra data or is NULL; match limit fields are rejected
3085 subject points to the subject string (must not be NULL)
3086 length length of subject string (may contain binary zeros); must be >= 0
3087 start_offset where to start in the subject string; 0 <= start_offset <= length
3088 options option bits (only those in PUBLIC_DFA_EXEC_OPTIONS are accepted)
3089 offsets vector of match offsets (may be NULL only if offsetcount is 0)
3090 offsetcount size of same (must be >= 0)
3091 workspace workspace vector (must not be NULL)
3092 wscount size of same (must be at least 20)
3093
3094 Returns: > 0 => number of match offset pairs placed in offsets
3095 = 0 => offsets overflowed; longest matches are present
3096 -1 => failed to match
3097 < -1 => some kind of unexpected problem
3098 For PCRE_ERROR_PARTIAL and invalid UTF, offsets is also filled if offsetcount >= 2. */
3099
3100 #if defined COMPILE_PCRE8
3101 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3102 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3103 const char *subject, int length, int start_offset, int options, int *offsets,
3104 int offsetcount, int *workspace, int wscount)
3105 #elif defined COMPILE_PCRE16
3106 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3107 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3108 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3109 int offsetcount, int *workspace, int wscount)
3110 #elif defined COMPILE_PCRE32
3111 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3112 pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3113 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3114 int offsetcount, int *workspace, int wscount)
3115 #endif
3116 {
3117 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3118 dfa_match_data match_block;
3119 dfa_match_data *md = &match_block;
3120 BOOL utf, anchored, startline, firstline;
3121 const pcre_uchar *current_subject, *end_subject;
3122 const pcre_study_data *study = NULL;
3123 /* The following variables hold data for the start-of-match optimizations. */
3124 const pcre_uchar *req_char_ptr;
3125 const pcre_uint8 *start_bits = NULL;
3126 BOOL has_first_char = FALSE;
3127 BOOL has_req_char = FALSE;
3128 pcre_uchar first_char = 0;
3129 pcre_uchar first_char2 = 0;
3130 pcre_uchar req_char = 0;
3131 pcre_uchar req_char2 = 0;
3132 int newline;
3133
3134 /* Plausibility checks */
3135
3136 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3137 if (re == NULL || subject == NULL || workspace == NULL ||
3138 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3139 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3140 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3141 if (length < 0) return PCRE_ERROR_BADLENGTH;
3142 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3143
3144 /* Check that the first field in the block is the magic number. If it is not,
3145 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3146 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3147 means that the pattern is likely compiled with different endianness. */
3148
3149 if (re->magic_number != MAGIC_NUMBER)
3150 return re->magic_number == REVERSED_MAGIC_NUMBER?
3151 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3152 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3153
3154 /* If restarting after a partial match, do some sanity checks on the contents
3155 of the workspace: workspace[0] must be 0 or 1, and workspace[1] must lie between
3156 1 and (wscount - 2)/INTS_PER_STATEBLOCK inclusive. */
3157 if ((options & PCRE_DFA_RESTART) != 0)
3158 {
3159 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3160 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3161 return PCRE_ERROR_DFA_BADRESTART;
3162 }
3163
3164 /* Set up study, callout, and table data */
3165
3166 md->tables = re->tables;
3167 md->callout_data = NULL;
3168
3169 if (extra_data != NULL)
3170 {
3171 unsigned int flags = extra_data->flags;
3172 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3173 study = (const pcre_study_data *)extra_data->study_data;
3174 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3175 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3176 return PCRE_ERROR_DFA_UMLIMIT;
3177 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3178 md->callout_data = extra_data->callout_data;
3179 if ((flags & PCRE_EXTRA_TABLES) != 0)
3180 md->tables = extra_data->tables;
3181 }
3182
3183 /* Set some local values. req_char_ptr starts just before the starting point so
3184 that the first search for a required character is always carried out. */
3185 current_subject = (const pcre_uchar *)subject + start_offset;
3186 end_subject = (const pcre_uchar *)subject + length;
3187 req_char_ptr = current_subject - 1;
3188
3189 #ifdef SUPPORT_UTF
3190 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3191 utf = (re->options & PCRE_UTF8) != 0;
3192 #else
3193 utf = FALSE;
3194 #endif
3195 /* Restarting forces anchoring, as does PCRE_ANCHORED at compile or run time. */
3196 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3197 (re->options & PCRE_ANCHORED) != 0;
3198
3199 /* The remaining fixed data for passing around. The code to be run is located
3200 immediately after the name table entries. */
3201 md->start_code = (const pcre_uchar *)argument_re +
3202 re->name_table_offset + re->name_count * re->name_entry_size;
3203 md->start_subject = (const pcre_uchar *)subject;
3204 md->end_subject = end_subject;
3205 md->start_offset = start_offset;
3206 md->moptions = options;
3207 md->poptions = re->options;
3208
3209 /* If the BSR option is not set at match time, copy what was set
3210 at compile time. */
3211
3212 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3213 {
3214 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3215 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3216 #ifdef BSR_ANYCRLF
3217 else md->moptions |= PCRE_BSR_ANYCRLF;
3218 #endif
3219 }
3220
3221 /* Handle different types of newline. The three bits give eight cases. If
3222 nothing is set at run time, whatever was used at compile time applies. */
3223
3224 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3225 PCRE_NEWLINE_BITS)
3226 {
3227 case 0: newline = NEWLINE; break; /* Compile-time default */
3228 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3229 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3230 case PCRE_NEWLINE_CR+
3231 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3232 case PCRE_NEWLINE_ANY: newline = -1; break;
3233 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3234 default: return PCRE_ERROR_BADNEWLINE;
3235 }
3236 /* Negative values select ANYCRLF or ANY; others are fixed 1- or 2-character newlines */
3237 if (newline == -2)
3238 {
3239 md->nltype = NLTYPE_ANYCRLF;
3240 }
3241 else if (newline < 0)
3242 {
3243 md->nltype = NLTYPE_ANY;
3244 }
3245 else
3246 {
3247 md->nltype = NLTYPE_FIXED;
3248 if (newline > 255)
3249 {
3250 md->nllen = 2;
3251 md->nl[0] = (newline >> 8) & 255;
3252 md->nl[1] = newline & 255;
3253 }
3254 else
3255 {
3256 md->nllen = 1;
3257 md->nl[0] = newline;
3258 }
3259 }
3260
3261 /* Check a UTF string if required. On failure, the erroroffset and errorcode
3262 values from PRIV(valid_utf) are passed back in offsets[0] and offsets[1] when
3263 offsetcount >= 2. For 8/16-bit, start_offset is rejected if NOT_FIRSTCHAR there. */
3264 #ifdef SUPPORT_UTF
3265 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3266 {
3267 int erroroffset;
3268 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3269 if (errorcode != 0)
3270 {
3271 if (offsetcount >= 2)
3272 {
3273 offsets[0] = erroroffset;
3274 offsets[1] = errorcode;
3275 }
3276 #if defined COMPILE_PCRE8
3277 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3278 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3279 #elif defined COMPILE_PCRE16
3280 return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3281 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3282 #elif defined COMPILE_PCRE32
3283 return PCRE_ERROR_BADUTF32;
3284 #endif
3285 }
3286 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3287 if (start_offset > 0 && start_offset < length &&
3288 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3289 return PCRE_ERROR_BADUTF8_OFFSET;
3290 #endif
3291 }
3292 #endif
3293
3294 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3295 is a feature that makes it possible to save compiled regex and re-use them
3296 in other programs later. */
3297
3298 if (md->tables == NULL) md->tables = PRIV(default_tables);
3299
3300 /* The "must be at the start of a line" flags are used in a loop when finding
3301 where to start. */
3302
3303 startline = (re->flags & PCRE_STARTLINE) != 0;
3304 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3305
3306 /* Set up the first character to match, if available. The first_char value is
3307 never set for an anchored regular expression, but the anchoring may be forced
3308 at run time, so we have to test for anchoring. The first char may be unset for
3309 an unanchored pattern, of course. If there's no first char and the pattern was
3310 studied, there may be a bitmap of possible first characters. For a caseless first
3311 char, first_char2 comes from the fcc table or UCD_OTHERCASE (the other case). */
3312 if (!anchored)
3313 {
3314 if ((re->flags & PCRE_FIRSTSET) != 0)
3315 {
3316 has_first_char = TRUE;
3317 first_char = first_char2 = (pcre_uchar)(re->first_char);
3318 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3319 {
3320 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3321 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3322 if (utf && first_char > 127)
3323 first_char2 = UCD_OTHERCASE(first_char);
3324 #endif
3325 }
3326 }
3327 else
3328 {
3329 if (!startline && study != NULL &&
3330 (study->flags & PCRE_STUDY_MAPPED) != 0)
3331 start_bits = study->start_bits;
3332 }
3333 }
3334
3335 /* For anchored or unanchored matches, there may be a "last known required
3336 character" set. As above, req_char2 differs from req_char only when the
3337 required character is caseless. */
3338 if ((re->flags & PCRE_REQCHSET) != 0)
3339 {
3340 has_req_char = TRUE;
3341 req_char = req_char2 = (pcre_uchar)(re->req_char);
3342 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3343 {
3344 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3345 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3346 if (utf && req_char > 127)
3347 req_char2 = UCD_OTHERCASE(req_char);
3348 #endif
3349 }
3350 }
3351
3352 /* Call the main matching function, looping for a non-anchored regex after a
3353 failed match. If not restarting, perform certain optimizations at the start of
3354 a match. */
3355
3356 for (;;)
3357 {
3358 int rc;
3359
3360 if ((options & PCRE_DFA_RESTART) == 0)
3361 {
3362 const pcre_uchar *save_end_subject = end_subject;
3363
3364 /* If firstline is TRUE, the start of the match is constrained to the first
3365 line of a multiline string. Implement this by temporarily adjusting
3366 end_subject so that we stop scanning at a newline. If the match fails at
3367 the newline, later code breaks this loop. */
3368
3369 if (firstline)
3370 {
3371 PCRE_PUCHAR t = current_subject;
3372 #ifdef SUPPORT_UTF
3373 if (utf)
3374 {
3375 while (t < md->end_subject && !IS_NEWLINE(t))
3376 {
3377 t++;
3378 ACROSSCHAR(t < end_subject, *t, t++);
3379 }
3380 }
3381 else
3382 #endif
3383 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3384 end_subject = t;
3385 }
3386
3387 /* There are some optimizations that avoid running the match if a known
3388 starting point is not found. However, there is an option that disables
3389 these, for testing and for ensuring that all callouts do actually occur.
3390 The option can be set in the regex by (*NO_START_OPT) or passed in
3391 match-time options. */
3392
3393 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3394 {
3395 /* Advance to a known first char. */
3396
3397 if (has_first_char)
3398 {
3399 if (first_char != first_char2)
3400 {
3401 pcre_uchar csc;
3402 while (current_subject < end_subject &&
3403 (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3404 current_subject++;
3405 }
3406 else
3407 while (current_subject < end_subject &&
3408 RAWUCHARTEST(current_subject) != first_char)
3409 current_subject++;
3410 }
3411
3412 /* Or to just after a linebreak for a multiline match if possible */
3413
3414 else if (startline)
3415 {
3416 if (current_subject > md->start_subject + start_offset)
3417 {
3418 #ifdef SUPPORT_UTF
3419 if (utf)
3420 {
3421 while (current_subject < end_subject &&
3422 !WAS_NEWLINE(current_subject))
3423 {
3424 current_subject++;
3425 ACROSSCHAR(current_subject < end_subject, *current_subject,
3426 current_subject++);
3427 }
3428 }
3429 else
3430 #endif
3431 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3432 current_subject++;
3433
3434 /* If we have just passed a CR and the newline option is ANY or
3435 ANYCRLF, and we are now at a LF, advance the match position by one
3436 more character. */
3437
3438 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3439 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3440 current_subject < end_subject &&
3441 RAWUCHARTEST(current_subject) == CHAR_NL)
3442 current_subject++;
3443 }
3444 }
3445
3446 /* Or to a non-unique first char after study */
3447
3448 else if (start_bits != NULL)
3449 {
3450 while (current_subject < end_subject)
3451 {
3452 register pcre_uint32 c = RAWUCHARTEST(current_subject);
3453 #ifndef COMPILE_PCRE8
3454 if (c > 255) c = 255;
3455 #endif
3456 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3457 {
3458 current_subject++;
3459 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3460 /* In non 8-bit mode, the iteration will stop for
3461 characters > 255 at the beginning or not stop at all. */
3462 if (utf)
3463 ACROSSCHAR(current_subject < end_subject, *current_subject,
3464 current_subject++);
3465 #endif
3466 }
3467 else break;
3468 }
3469 }
3470 }
3471
3472 /* Restore fudged end_subject */
3473
3474 end_subject = save_end_subject;
3475
3476 /* The following two optimizations are disabled for partial matching or if
3477 disabling is explicitly requested (and of course, by the test above, this
3478 code is not obeyed when restarting after a partial match). */
3479
3480 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3481 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3482 {
3483 /* If the pattern was studied, a minimum subject length may be set. This
3484 is a lower bound; no actual string of that length may actually match the
3485 pattern. Although the value is, strictly, in characters, we treat it as
3486 bytes to avoid spending too much time in this optimization. */
3487
3488 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3489 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3490 return PCRE_ERROR_NOMATCH;
3491
3492 /* If req_char is set, we know that that character must appear in the
3493 subject for the match to succeed. If the first character is set, req_char
3494 must be later in the subject; otherwise the test starts at the match
3495 point. This optimization can save a huge amount of work in patterns with
3496 nested unlimited repeats that aren't going to match. Writing separate
3497 code for cased/caseless versions makes it go faster, as does using an
3498 autoincrement and backing off on a match.
3499
3500 HOWEVER: when the subject string is very, very long, searching to its end
3501 can take a long time, and give bad performance on quite ordinary
3502 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3503 string... so we don't do this when the string is sufficiently long. */
3504
3505 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3506 {
3507 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3508
3509 /* We don't need to repeat the search if we haven't yet reached the
3510 place we found it at last time. */
3511
3512 if (p > req_char_ptr)
3513 {
3514 if (req_char != req_char2)
3515 {
3516 while (p < end_subject)
3517 {
3518 register pcre_uint32 pp = RAWUCHARINCTEST(p);
3519 if (pp == req_char || pp == req_char2) { p--; break; }
3520 }
3521 }
3522 else
3523 {
3524 while (p < end_subject)
3525 {
3526 if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3527 }
3528 }
3529
3530 /* If we can't find the required character, break the matching loop,
3531 which will cause a return of PCRE_ERROR_NOMATCH. */
3532
3533 if (p >= end_subject) break;
3534
3535 /* If we have found the required character, save the point where we
3536 found it, so that we don't search again next time round the loop if
3537 the start hasn't passed this character yet. */
3538
3539 req_char_ptr = p;
3540 }
3541 }
3542 }
3543 } /* End of optimizations that are done when not restarting */
3544
3545 /* OK, now we can do the business */
3546
3547 md->start_used_ptr = current_subject;
3548 md->recursive = NULL;
3549
3550 rc = internal_dfa_exec(
3551 md, /* fixed match data */
3552 md->start_code, /* this subexpression's code */
3553 current_subject, /* where we currently are */
3554 start_offset, /* start offset in subject */
3555 offsets, /* offset vector */
3556 offsetcount, /* size of same */
3557 workspace, /* workspace vector */
3558 wscount, /* size of same */
3559 0); /* function recurse level */
3560
3561 /* Anything other than "no match" means we are done, always; otherwise, carry
3562 on only if not anchored. For a partial match with room in offsets, pass back the
3563 offsets of md->start_used_ptr, the end of the subject, and this attempt's start. */
3564 if (rc != PCRE_ERROR_NOMATCH || anchored)
3565 {
3566 if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3567 {
3568 offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3569 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3570 if (offsetcount > 2)
3571 offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3572 }
3573 return rc;
3574 }
3575
3576 /* Advance to the next subject character unless we are at the end of a line
3577 and firstline is set. */
3578
3579 if (firstline && IS_NEWLINE(current_subject)) break;
3580 current_subject++;
3581 #ifdef SUPPORT_UTF
3582 if (utf)
3583 {
3584 ACROSSCHAR(current_subject < end_subject, *current_subject,
3585 current_subject++);
3586 }
3587 #endif
3588 if (current_subject > end_subject) break;
3589
3590 /* If we have just passed a CR and we are now at a LF, and the pattern does
3591 not contain any explicit matches for \r or \n, and the newline option is CRLF
3592 or ANY or ANYCRLF, advance the match position by one more character. */
3593
3594 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3595 current_subject < end_subject &&
3596 RAWUCHARTEST(current_subject) == CHAR_NL &&
3597 (re->flags & PCRE_HASCRORLF) == 0 &&
3598 (md->nltype == NLTYPE_ANY ||
3599 md->nltype == NLTYPE_ANYCRLF ||
3600 md->nllen == 2))
3601 current_subject++;
3602
3603 } /* "Bumpalong" loop */
3604
3605 return PCRE_ERROR_NOMATCH;
3606 }
3607
3608 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5