/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1364 - (show annotations)
Sat Oct 5 15:45:11 2013 UTC (6 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 125463 byte(s)
Add VT to the set of characters recognized as white space.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2013 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
51
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
60
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
69
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
72
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK md /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre_internal.h"
84
85
/* For use to indent debugging output. It is printed with "%.*s" and a
precision of rlevel*2-2, so it has to contain at least that many spaces: a
shorter string silently caps the indentation at its own length. */

#define SP "                    "
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
95 /* These are offsets that are added to the OP_TYPESTAR and friends opcodes to
turn them into others, under special conditions: when the type argument that
follows such an opcode is one of the items named against each offset below,
the matching code adds that offset to its working copy of the opcode before
dispatching on it. A gap of 20 between the blocks should be enough. The
resulting opcodes don't have to be less than 256 because they are never
stored, so we push them well clear of the normal opcodes. */
99
100 #define OP_PROP_EXTRA 300 /* Argument is OP_PROP or OP_NOTPROP */
101 #define OP_EXTUNI_EXTRA 320 /* Argument is OP_EXTUNI */
102 #define OP_ANYNL_EXTRA 340 /* Argument is OP_ANYNL */
103 #define OP_HSPACE_EXTRA 360 /* Argument is OP_HSPACE or OP_NOT_HSPACE */
104 #define OP_VSPACE_EXTRA 380 /* Argument is OP_VSPACE or OP_NOT_VSPACE */
105
106
107 /* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. The table is indexed by opcode. A
zero entry means that there is no such character. A non-zero entry is the
offset from the opcode where the character is to be found: 1 for most opcodes,
and 1+IMM2_SIZE for the upto, minupto, and exact forms (presumably because a
repeat count of IMM2_SIZE units comes first - confirm against the compiler).
In the case of Type * etc, the "character" is the opcode for \D, \d, \S, \s,
\W, or \w, which will always be a small value.

The table must contain exactly OP_TABLE_LENGTH entries: a pair of dummy case
labels in the opcode switch of internal_dfa_exec() provokes a compile-time
error if it does not.

***NOTE*** If the start of this table is modified, the three tables that
follow must also be modified. */
114
115 static const pcre_uint8 coptable[] = {
116 0, /* End */
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
120 0, 0, /* \P, \p */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 0, /* \X */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
124 1, /* Char */
125 1, /* Chari */
126 1, /* not */
127 1, /* noti */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131 1+IMM2_SIZE, /* exact */
132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135 1+IMM2_SIZE, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140 1+IMM2_SIZE, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144 1+IMM2_SIZE, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149 1+IMM2_SIZE, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
154 0, /* CLASS */
155 0, /* NCLASS */
156 0, /* XCLASS - variable length */
157 0, /* REF */
158 0, /* REFI */
159 0, /* DNREF */
160 0, /* DNREFI */
161 0, /* RECURSE */
162 0, /* CALLOUT */
163 0, /* Alt */
164 0, /* Ket */
165 0, /* KetRmax */
166 0, /* KetRmin */
167 0, /* KetRpos */
168 0, /* Reverse */
169 0, /* Assert */
170 0, /* Assert not */
171 0, /* Assert behind */
172 0, /* Assert behind not */
173 0, 0, /* ONCE, ONCE_NC */
174 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
175 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
176 0, 0, /* CREF, NCREF */
177 0, 0, /* RREF, NRREF */
178 0, /* DEF */
179 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
180 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
181 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
182 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
183 0, 0 /* CLOSE, SKIPZERO */
184 };
185
186 /* This table identifies those opcodes that inspect a character. It is
indexed by opcode, with a non-zero entry for each opcode that does so. It is
used to remember the fact that a character could have been inspected when the
end of the subject is reached: when there is no current character and the
entry for the opcode being processed is non-zero, internal_dfa_exec() sets its
could_continue flag, which is used when testing for a partial match.

Like coptable, this table must contain exactly OP_TABLE_LENGTH entries; the
same dummy case labels in the opcode switch check both tables at compile time.

***NOTE*** If the start of this table is modified, the two tables that follow
must also be modified. */
190
191 static const pcre_uint8 poptable[] = {
192 0, /* End */
193 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
194 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
195 1, 1, 1, /* Any, AllAny, Anybyte */
196 1, 1, /* \P, \p */
197 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
198 1, /* \X */
199 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
200 1, /* Char */
201 1, /* Chari */
202 1, /* not */
203 1, /* noti */
204 /* Positive single-char repeats */
205 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
206 1, 1, 1, /* upto, minupto, exact */
207 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
208 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
209 1, 1, 1, /* upto I, minupto I, exact I */
210 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
211 /* Negative single-char repeats - only for chars < 256 */
212 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
213 1, 1, 1, /* NOT upto, minupto, exact */
214 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
215 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
216 1, 1, 1, /* NOT upto I, minupto I, exact I */
217 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
218 /* Positive type repeats */
219 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
220 1, 1, 1, /* Type upto, minupto, exact */
221 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
222 /* Character class & ref repeats */
223 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
224 1, 1, /* CRRANGE, CRMINRANGE */
225 1, /* CLASS */
226 1, /* NCLASS */
227 1, /* XCLASS - variable length */
228 0, /* REF */
229 0, /* REFI */
230 0, /* DNREF */
231 0, /* DNREFI */
232 0, /* RECURSE */
233 0, /* CALLOUT */
234 0, /* Alt */
235 0, /* Ket */
236 0, /* KetRmax */
237 0, /* KetRmin */
238 0, /* KetRpos */
239 0, /* Reverse */
240 0, /* Assert */
241 0, /* Assert not */
242 0, /* Assert behind */
243 0, /* Assert behind not */
244 0, 0, /* ONCE, ONCE_NC */
245 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
246 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
247 0, 0, /* CREF, NCREF */
248 0, 0, /* RREF, NRREF */
249 0, /* DEF */
250 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
251 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
252 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
253 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
254 0, 0 /* CLOSE, SKIPZERO */
255 };
256
257 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w. Each has one entry per opcode, in the same order as the start of
coptable, up to and including OP_ALLANY; the six leading zeros are
placeholders for the opcodes that come before \D. toptable1 holds the
ctype_xxx bit that is relevant to each of the six character type opcodes, and
zero for OP_ANY and OP_ALLANY.
NOTE(review): the code that consults these tables is not in this part of the
file; presumably toptable1 is used as a mask on a character's ctypes entry -
confirm against the users. */
259
260 static const pcre_uint8 toptable1[] = {
261 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b (placeholders) */
262 ctype_digit, ctype_digit, /* \D, \d */
263 ctype_space, ctype_space, /* \S, \s */
264 ctype_word, ctype_word, /* \W, \w */
265 0, 0 /* OP_ANY, OP_ALLANY */
266 };
267
/* Companion to toptable1, with one entry per opcode in the same order. For
each pair of character type opcodes the negated one (\D, \S, \W) has the
ctype_xxx bit set and the positive one (\d, \s, \w) has zero; OP_ANY and
OP_ALLANY have 1.
NOTE(review): the code that consults this table is not in this part of the
file; presumably the value is combined with the toptable1-masked ctypes entry
so that a non-zero result means "no match" - confirm against the users. */
268 static const pcre_uint8 toptable2[] = {
269 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b (placeholders) */
270 ctype_digit, 0, /* \D, \d */
271 ctype_space, 0, /* \S, \s */
272 ctype_word, 0, /* \W, \w */
273 1, 1 /* OP_ANY, OP_ALLANY */
274 };
275
276
277 /* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints: internal_dfa_exec() casts a pointer
into that vector to a stateblock pointer and uses it as two arrays of
states. */
281
282 typedef struct stateblock {
283 int offset; /* Offset to opcode; a negative value means "hold off going to the negated offset until the characters counted in data have been skipped" */
284 int count; /* Count for repeats */
285 int data; /* Some use extra data, e.g. the number of characters still to be skipped when offset is negative */
286 } stateblock;
287
/* The number of ints that one stateblock occupies. It is used to convert the
size of the workspace, which is given in ints, into a number of states. */
288 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))
289
290
291 #ifdef PCRE_DEBUG
292 /*************************************************
293 * Print character string *
294 *************************************************/
295
296 /* Character string printing function for debugging.
297
298 Arguments:
299 p points to string
300 length number of bytes
301 f where to print
302
303 Returns: nothing
304 */
305
306 static void
307 pchars(const pcre_uchar *p, int length, FILE *f)
308 {
309 pcre_uint32 c;
310 while (length-- > 0)
311 {
312 if (isprint(c = *(p++)))
313 fprintf(f, "%c", c);
314 else
315 fprintf(f, "\\x{%02x}", c);
316 }
317 }
318 #endif
319
320
321
322 /*************************************************
323 * Execute a Regular Expression - DFA engine *
324 *************************************************/
325
326 /* This internal function applies a compiled pattern to a subject string,
327 starting at a given point, using a DFA engine. This function is called from the
328 external one, possibly multiple times if the pattern is not anchored. The
329 function calls itself recursively for some kinds of subpattern.
330
331 Arguments:
332 md the match_data block with fixed information
333 this_start_code the opening bracket of this subexpression's code
334 current_subject where we currently are in the subject string
335 start_offset start offset in the subject string
336 offsets vector to contain the matching string offsets
337 offsetcount size of same
338 workspace vector of workspace
339 wscount size of same
340 rlevel function call recursion level
341
342 Returns: > 0 => number of match offset pairs placed in offsets
343 = 0 => offsets overflowed; longest matches are present
344 -1 => failed to match
345 < -1 => some kind of unexpected problem
346
347 The following macros are used for adding states to the two state vectors (one
348 for the current character, one for the following character). */
349
/* Append a state with opcode offset x and repeat count y to the list of
active states. If the list is already full, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The data field of the new state is not set. The
expansion is a single do-while statement, so the macro followed by a semicolon
can be used anywhere a statement can, including as the unbraced body of an if
that has an else. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
359
/* Append a state with opcode offset x, repeat count y, and extra data z to
the list of active states. If the list is already full, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The expansion is a single do-while statement,
so the macro followed by a semicolon can be used anywhere a statement can,
including as the unbraced body of an if that has an else. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
370
/* Append a state with opcode offset x and repeat count y to the list of new
states, which is the list for the following character. If the list is already
full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The data field of
the new state is not set. The expansion is a single do-while statement, so the
macro followed by a semicolon can be used anywhere a statement can, including
as the unbraced body of an if that has an else. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
380
/* Append a state with opcode offset x, repeat count y, and extra data z to
the list of new states, which is the list for the following character. If the
list is already full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
The debugging output includes the source line of the call. The expansion is a
single do-while statement, so the macro followed by a semicolon can be used
anywhere a statement can, including as the unbraced body of an if that has an
else. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  while (0)
392
393 /* And now, here is the code */
394
395 static int
396 internal_dfa_exec(
397 dfa_match_data *md,
398 const pcre_uchar *this_start_code,
399 const pcre_uchar *current_subject,
400 int start_offset,
401 int *offsets,
402 int offsetcount,
403 int *workspace,
404 int wscount,
405 int rlevel)
406 {
407 stateblock *active_states, *new_states, *temp_states;
408 stateblock *next_active_state, *next_new_state;
409
410 const pcre_uint8 *ctypes, *lcc, *fcc;
411 const pcre_uchar *ptr;
412 const pcre_uchar *end_code, *first_op;
413
414 dfa_recursion_info new_recursive;
415
416 int active_count, new_count, match_count;
417
418 /* Some fields in the md block are frequently referenced, so we load them into
419 independent variables in the hope that this will perform better. */
420
421 const pcre_uchar *start_subject = md->start_subject;
422 const pcre_uchar *end_subject = md->end_subject;
423 const pcre_uchar *start_code = md->start_code;
424
425 #ifdef SUPPORT_UTF
426 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
427 #else
428 BOOL utf = FALSE;
429 #endif
430
431 BOOL reset_could_continue = FALSE;
432
433 rlevel++;
434 offsetcount &= (-2);
435
436 wscount -= 2;
437 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
438 (2 * INTS_PER_STATEBLOCK);
439
440 DPRINTF(("\n%.*s---------------------\n"
441 "%.*sCall to internal_dfa_exec f=%d\n",
442 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
443
444 ctypes = md->tables + ctypes_offset;
445 lcc = md->tables + lcc_offset;
446 fcc = md->tables + fcc_offset;
447
448 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
449
450 active_states = (stateblock *)(workspace + 2);
451 next_new_state = new_states = active_states + wscount;
452 new_count = 0;
453
454 first_op = this_start_code + 1 + LINK_SIZE +
455 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
456 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
457 ? IMM2_SIZE:0);
458
459 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
460 the alternative states onto the list, and find out where the end is. This
461 makes it possible to use this function recursively, when we want to stop at a
462 matching internal ket rather than at the end.
463
464 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
465 a backward assertion. In that case, we have to find out the maximum amount to
466 move back, and set up each alternative appropriately. */
467
468 if (*first_op == OP_REVERSE)
469 {
470 int max_back = 0;
471 int gone_back;
472
473 end_code = this_start_code;
474 do
475 {
476 int back = GET(end_code, 2+LINK_SIZE);
477 if (back > max_back) max_back = back;
478 end_code += GET(end_code, 1);
479 }
480 while (*end_code == OP_ALT);
481
482 /* If we can't go back the amount required for the longest lookbehind
483 pattern, go back as far as we can; some alternatives may still be viable. */
484
485 #ifdef SUPPORT_UTF
486 /* In character mode we have to step back character by character */
487
488 if (utf)
489 {
490 for (gone_back = 0; gone_back < max_back; gone_back++)
491 {
492 if (current_subject <= start_subject) break;
493 current_subject--;
494 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
495 }
496 }
497 else
498 #endif
499
500 /* In byte-mode we can do this quickly. */
501
502 {
503 gone_back = (current_subject - max_back < start_subject)?
504 (int)(current_subject - start_subject) : max_back;
505 current_subject -= gone_back;
506 }
507
508 /* Save the earliest consulted character */
509
510 if (current_subject < md->start_used_ptr)
511 md->start_used_ptr = current_subject;
512
513 /* Now we can process the individual branches. */
514
515 end_code = this_start_code;
516 do
517 {
518 int back = GET(end_code, 2+LINK_SIZE);
519 if (back <= gone_back)
520 {
521 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
522 ADD_NEW_DATA(-bstate, 0, gone_back - back);
523 }
524 end_code += GET(end_code, 1);
525 }
526 while (*end_code == OP_ALT);
527 }
528
529 /* This is the code for a "normal" subpattern (not a backward assertion). The
530 start of a whole pattern is always one of these. If we are at the top level,
531 we may be asked to restart matching from the same point that we reached for a
532 previous partial match. We still have to scan through the top-level branches to
533 find the end state. */
534
535 else
536 {
537 end_code = this_start_code;
538
539 /* Restarting */
540
541 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
542 {
543 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
544 new_count = workspace[1];
545 if (!workspace[0])
546 memcpy(new_states, active_states, new_count * sizeof(stateblock));
547 }
548
549 /* Not restarting */
550
551 else
552 {
553 int length = 1 + LINK_SIZE +
554 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
555 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
556 ? IMM2_SIZE:0);
557 do
558 {
559 ADD_NEW((int)(end_code - start_code + length), 0);
560 end_code += GET(end_code, 1);
561 length = 1 + LINK_SIZE;
562 }
563 while (*end_code == OP_ALT);
564 }
565 }
566
567 workspace[0] = 0; /* Bit indicating which vector is current */
568
569 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
570
571 /* Loop for scanning the subject */
572
573 ptr = current_subject;
574 for (;;)
575 {
576 int i, j;
577 int clen, dlen;
578 pcre_uint32 c, d;
579 int forced_fail = 0;
580 BOOL partial_newline = FALSE;
581 BOOL could_continue = reset_could_continue;
582 reset_could_continue = FALSE;
583
584 /* Make the new state list into the active state list and empty the
585 new state list. */
586
587 temp_states = active_states;
588 active_states = new_states;
589 new_states = temp_states;
590 active_count = new_count;
591 new_count = 0;
592
593 workspace[0] ^= 1; /* Remember for the restarting feature */
594 workspace[1] = active_count;
595
596 #ifdef PCRE_DEBUG
597 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
598 pchars(ptr, STRLEN_UC(ptr), stdout);
599 printf("\"\n");
600
601 printf("%.*sActive states: ", rlevel*2-2, SP);
602 for (i = 0; i < active_count; i++)
603 printf("%d/%d ", active_states[i].offset, active_states[i].count);
604 printf("\n");
605 #endif
606
607 /* Set the pointers for adding new states */
608
609 next_active_state = active_states + active_count;
610 next_new_state = new_states;
611
612 /* Load the current character from the subject outside the loop, as many
613 different states may want to look at it, and we assume that at least one
614 will. */
615
616 if (ptr < end_subject)
617 {
618 clen = 1; /* Number of data items in the character */
619 #ifdef SUPPORT_UTF
620 GETCHARLENTEST(c, ptr, clen);
621 #else
622 c = *ptr;
623 #endif /* SUPPORT_UTF */
624 }
625 else
626 {
627 clen = 0; /* This indicates the end of the subject */
628 c = NOTACHAR; /* This value should never actually be used */
629 }
630
631 /* Scan up the active states and act on each one. The result of an action
632 may be to add more states to the currently active list (e.g. on hitting a
633 parenthesis) or it may be to put states on the new list, for considering
634 when we move the character pointer on. */
635
636 for (i = 0; i < active_count; i++)
637 {
638 stateblock *current_state = active_states + i;
639 BOOL caseless = FALSE;
640 const pcre_uchar *code;
641 int state_offset = current_state->offset;
642 int codevalue, rrc;
643 int count;
644
645 #ifdef PCRE_DEBUG
646 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
647 if (clen == 0) printf("EOL\n");
648 else if (c > 32 && c < 127) printf("'%c'\n", c);
649 else printf("0x%02x\n", c);
650 #endif
651
652 /* A negative offset is a special case meaning "hold off going to this
653 (negated) state until the number of characters in the data field have
654 been skipped". If the could_continue flag was passed over from a previous
655 state, arrange for it to be passed on. */
656
657 if (state_offset < 0)
658 {
659 if (current_state->data > 0)
660 {
661 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
662 ADD_NEW_DATA(state_offset, current_state->count,
663 current_state->data - 1);
664 if (could_continue) reset_could_continue = TRUE;
665 continue;
666 }
667 else
668 {
669 current_state->offset = state_offset = -state_offset;
670 }
671 }
672
673 /* Check for a duplicate state with the same count, and skip if found.
674 See the note at the head of this module about the possibility of improving
675 performance here. */
676
677 for (j = 0; j < i; j++)
678 {
679 if (active_states[j].offset == state_offset &&
680 active_states[j].count == current_state->count)
681 {
682 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
683 goto NEXT_ACTIVE_STATE;
684 }
685 }
686
687 /* The state offset is the offset to the opcode */
688
689 code = start_code + state_offset;
690 codevalue = *code;
691
692 /* If this opcode inspects a character, but we are at the end of the
693 subject, remember the fact for use when testing for a partial match. */
694
695 if (clen == 0 && poptable[codevalue] != 0)
696 could_continue = TRUE;
697
698 /* If this opcode is followed by an inline character, load it. It is
699 tempting to test for the presence of a subject character here, but that
700 is wrong, because sometimes zero repetitions of the subject are
701 permitted.
702
703 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
704 argument that is not a data character - but is always one byte long because
705 the values are small. We have to take special action to deal with \P, \p,
706 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
707 these ones to new opcodes. */
708
709 if (coptable[codevalue] > 0)
710 {
711 dlen = 1;
712 #ifdef SUPPORT_UTF
713 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
714 #endif /* SUPPORT_UTF */
715 d = code[coptable[codevalue]];
716 if (codevalue >= OP_TYPESTAR)
717 {
718 switch(d)
719 {
720 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
721 case OP_NOTPROP:
722 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
723 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
724 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
725 case OP_NOT_HSPACE:
726 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
727 case OP_NOT_VSPACE:
728 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
729 default: break;
730 }
731 }
732 }
733 else
734 {
735 dlen = 0; /* Not strictly necessary, but compilers moan */
736 d = NOTACHAR; /* if these variables are not set. */
737 }
738
739
740 /* Now process the individual opcodes */
741
742 switch (codevalue)
743 {
744 /* ========================================================================== */
745 /* These cases are never obeyed. This is a fudge that causes a compile-
746 time error if the vectors coptable or poptable, which are indexed by
747 opcode, are not the correct length. It seems to be the only way to do
748 such a check at compile time, as the sizeof() operator does not work
749 in the C preprocessor. */
750
751 case OP_TABLE_LENGTH:
752 case OP_TABLE_LENGTH +
753 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
754 (sizeof(poptable) == OP_TABLE_LENGTH)):
755 break;
756
757 /* ========================================================================== */
758 /* Reached a closing bracket. If not at the end of the pattern, carry
759 on with the next opcode. For repeating opcodes, also add the repeat
760 state. Note that KETRPOS will always be encountered at the end of the
761 subpattern, because the possessive subpattern repeats are always handled
762 using recursive calls. Thus, it never adds any new states.
763
764 At the end of the (sub)pattern, unless we have an empty string and
765 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
766 start of the subject, save the match data, shifting up all previous
767 matches so we always have the longest first. */
768
769 case OP_KET:
770 case OP_KETRMIN:
771 case OP_KETRMAX:
772 case OP_KETRPOS:
773 if (code != end_code)
774 {
775 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
776 if (codevalue != OP_KET)
777 {
778 ADD_ACTIVE(state_offset - GET(code, 1), 0);
779 }
780 }
781 else
782 {
783 if (ptr > current_subject ||
784 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
785 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
786 current_subject > start_subject + md->start_offset)))
787 {
788 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
789 else if (match_count > 0 && ++match_count * 2 > offsetcount)
790 match_count = 0;
791 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
792 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
793 if (offsetcount >= 2)
794 {
795 offsets[0] = (int)(current_subject - start_subject);
796 offsets[1] = (int)(ptr - start_subject);
797 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
798 offsets[1] - offsets[0], (char *)current_subject));
799 }
800 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
801 {
802 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
803 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
804 match_count, rlevel*2-2, SP));
805 return match_count;
806 }
807 }
808 }
809 break;
810
811 /* ========================================================================== */
812 /* These opcodes add to the current list of states without looking
813 at the current character. */
814
815 /*-----------------------------------------------------------------*/
816 case OP_ALT:
817 do { code += GET(code, 1); } while (*code == OP_ALT);
818 ADD_ACTIVE((int)(code - start_code), 0);
819 break;
820
821 /*-----------------------------------------------------------------*/
822 case OP_BRA:
823 case OP_SBRA:
824 do
825 {
826 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
827 code += GET(code, 1);
828 }
829 while (*code == OP_ALT);
830 break;
831
832 /*-----------------------------------------------------------------*/
833 case OP_CBRA:
834 case OP_SCBRA:
835 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
836 code += GET(code, 1);
837 while (*code == OP_ALT)
838 {
839 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
840 code += GET(code, 1);
841 }
842 break;
843
844 /*-----------------------------------------------------------------*/
845 case OP_BRAZERO:
846 case OP_BRAMINZERO:
847 ADD_ACTIVE(state_offset + 1, 0);
848 code += 1 + GET(code, 2);
849 while (*code == OP_ALT) code += GET(code, 1);
850 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
851 break;
852
853 /*-----------------------------------------------------------------*/
854 case OP_SKIPZERO:
855 code += 1 + GET(code, 2);
856 while (*code == OP_ALT) code += GET(code, 1);
857 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
858 break;
859
860 /*-----------------------------------------------------------------*/
861 case OP_CIRC:
862 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
863 { ADD_ACTIVE(state_offset + 1, 0); }
864 break;
865
866 /*-----------------------------------------------------------------*/
867 case OP_CIRCM:
868 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
869 (ptr != end_subject && WAS_NEWLINE(ptr)))
870 { ADD_ACTIVE(state_offset + 1, 0); }
871 break;
872
873 /*-----------------------------------------------------------------*/
874 case OP_EOD:
875 if (ptr >= end_subject)
876 {
877 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
878 could_continue = TRUE;
879 else { ADD_ACTIVE(state_offset + 1, 0); }
880 }
881 break;
882
883 /*-----------------------------------------------------------------*/
884 case OP_SOD:
885 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
886 break;
887
888 /*-----------------------------------------------------------------*/
889 case OP_SOM:
890 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
891 break;
892
893
894 /* ========================================================================== */
895 /* These opcodes inspect the next subject character, and sometimes
896 the previous one as well, but do not have an argument. The variable
897 clen contains the length of the current character and is zero if we are
898 at the end of the subject. */
899
900 /*-----------------------------------------------------------------*/
901 case OP_ANY:
902 if (clen > 0 && !IS_NEWLINE(ptr))
903 {
904 if (ptr + 1 >= md->end_subject &&
905 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
906 NLBLOCK->nltype == NLTYPE_FIXED &&
907 NLBLOCK->nllen == 2 &&
908 c == NLBLOCK->nl[0])
909 {
910 could_continue = partial_newline = TRUE;
911 }
912 else
913 {
914 ADD_NEW(state_offset + 1, 0);
915 }
916 }
917 break;
918
919 /*-----------------------------------------------------------------*/
920 case OP_ALLANY:
921 if (clen > 0)
922 { ADD_NEW(state_offset + 1, 0); }
923 break;
924
925 /*-----------------------------------------------------------------*/
926 case OP_EODN:
927 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
928 could_continue = TRUE;
929 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
930 { ADD_ACTIVE(state_offset + 1, 0); }
931 break;
932
933 /*-----------------------------------------------------------------*/
934 case OP_DOLL:
935 if ((md->moptions & PCRE_NOTEOL) == 0)
936 {
937 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
938 could_continue = TRUE;
939 else if (clen == 0 ||
940 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
941 (ptr == end_subject - md->nllen)
942 ))
943 { ADD_ACTIVE(state_offset + 1, 0); }
944 else if (ptr + 1 >= md->end_subject &&
945 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
946 NLBLOCK->nltype == NLTYPE_FIXED &&
947 NLBLOCK->nllen == 2 &&
948 c == NLBLOCK->nl[0])
949 {
950 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
951 {
952 reset_could_continue = TRUE;
953 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
954 }
955 else could_continue = partial_newline = TRUE;
956 }
957 }
958 break;
959
960 /*-----------------------------------------------------------------*/
961 case OP_DOLLM:
962 if ((md->moptions & PCRE_NOTEOL) == 0)
963 {
964 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
965 could_continue = TRUE;
966 else if (clen == 0 ||
967 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
968 { ADD_ACTIVE(state_offset + 1, 0); }
969 else if (ptr + 1 >= md->end_subject &&
970 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
971 NLBLOCK->nltype == NLTYPE_FIXED &&
972 NLBLOCK->nllen == 2 &&
973 c == NLBLOCK->nl[0])
974 {
975 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
976 {
977 reset_could_continue = TRUE;
978 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
979 }
980 else could_continue = partial_newline = TRUE;
981 }
982 }
983 else if (IS_NEWLINE(ptr))
984 { ADD_ACTIVE(state_offset + 1, 0); }
985 break;
986
987 /*-----------------------------------------------------------------*/
988
989 case OP_DIGIT:
990 case OP_WHITESPACE:
991 case OP_WORDCHAR:
992 if (clen > 0 && c < 256 &&
993 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
994 { ADD_NEW(state_offset + 1, 0); }
995 break;
996
997 /*-----------------------------------------------------------------*/
998 case OP_NOT_DIGIT:
999 case OP_NOT_WHITESPACE:
1000 case OP_NOT_WORDCHAR:
1001 if (clen > 0 && (c >= 256 ||
1002 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1003 { ADD_NEW(state_offset + 1, 0); }
1004 break;
1005
1006 /*-----------------------------------------------------------------*/
1007 case OP_WORD_BOUNDARY:
1008 case OP_NOT_WORD_BOUNDARY:
1009 {
1010 int left_word, right_word;
1011
1012 if (ptr > start_subject)
1013 {
1014 const pcre_uchar *temp = ptr - 1;
1015 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1016 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1017 if (utf) { BACKCHAR(temp); }
1018 #endif
1019 GETCHARTEST(d, temp);
1020 #ifdef SUPPORT_UCP
1021 if ((md->poptions & PCRE_UCP) != 0)
1022 {
1023 if (d == '_') left_word = TRUE; else
1024 {
1025 int cat = UCD_CATEGORY(d);
1026 left_word = (cat == ucp_L || cat == ucp_N);
1027 }
1028 }
1029 else
1030 #endif
1031 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1032 }
1033 else left_word = FALSE;
1034
1035 if (clen > 0)
1036 {
1037 #ifdef SUPPORT_UCP
1038 if ((md->poptions & PCRE_UCP) != 0)
1039 {
1040 if (c == '_') right_word = TRUE; else
1041 {
1042 int cat = UCD_CATEGORY(c);
1043 right_word = (cat == ucp_L || cat == ucp_N);
1044 }
1045 }
1046 else
1047 #endif
1048 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1049 }
1050 else right_word = FALSE;
1051
1052 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1053 { ADD_ACTIVE(state_offset + 1, 0); }
1054 }
1055 break;
1056
1057
1058 /*-----------------------------------------------------------------*/
1059 /* Check the next character by Unicode property. We will get here only
1060 if the support is in the binary; otherwise a compile-time error occurs.
1061 */
1062
1063 #ifdef SUPPORT_UCP
1064 case OP_PROP:
1065 case OP_NOTPROP:
1066 if (clen > 0)
1067 {
1068 BOOL OK;
1069 const pcre_uint32 *cp;
1070 const ucd_record * prop = GET_UCD(c);
1071 switch(code[1])
1072 {
1073 case PT_ANY:
1074 OK = TRUE;
1075 break;
1076
1077 case PT_LAMP:
1078 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1079 prop->chartype == ucp_Lt;
1080 break;
1081
1082 case PT_GC:
1083 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1084 break;
1085
1086 case PT_PC:
1087 OK = prop->chartype == code[2];
1088 break;
1089
1090 case PT_SC:
1091 OK = prop->script == code[2];
1092 break;
1093
1094 /* These are specials for combination cases. */
1095
1096 case PT_ALNUM:
1097 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1098 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1099 break;
1100
1101 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1102 which means that Perl space and POSIX space are now identical. PCRE
1103 was changed at release 8.34. */
1104
1105 case PT_SPACE: /* Perl space */
1106 case PT_PXSPACE: /* POSIX space */
1107 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1108 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1109 c == CHAR_FF || c == CHAR_CR;
1110 break;
1111
1112 case PT_WORD:
1113 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1114 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1115 c == CHAR_UNDERSCORE;
1116 break;
1117
1118 case PT_CLIST:
1119 cp = PRIV(ucd_caseless_sets) + code[2];
1120 for (;;)
1121 {
1122 if (c < *cp) { OK = FALSE; break; }
1123 if (c == *cp++) { OK = TRUE; break; }
1124 }
1125 break;
1126
1127 case PT_UCNC:
1128 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1129 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1130 c >= 0xe000;
1131 break;
1132
1133 /* Should never occur, but keep compilers from grumbling. */
1134
1135 default:
1136 OK = codevalue != OP_PROP;
1137 break;
1138 }
1139
1140 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1141 }
1142 break;
1143 #endif
1144
1145
1146
1147 /* ========================================================================== */
1148 /* These opcodes likewise inspect the subject character, but have an
1149 argument that is not a data character. It is one of these opcodes:
1150 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1151 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1152
1153 case OP_TYPEPLUS:
1154 case OP_TYPEMINPLUS:
1155 case OP_TYPEPOSPLUS:
1156 count = current_state->count; /* Already matched */
1157 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1158 if (clen > 0)
1159 {
1160 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1161 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1162 NLBLOCK->nltype == NLTYPE_FIXED &&
1163 NLBLOCK->nllen == 2 &&
1164 c == NLBLOCK->nl[0])
1165 {
1166 could_continue = partial_newline = TRUE;
1167 }
1168 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1169 (c < 256 &&
1170 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1171 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1172 {
1173 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1174 {
1175 active_count--; /* Remove non-match possibility */
1176 next_active_state--;
1177 }
1178 count++;
1179 ADD_NEW(state_offset, count);
1180 }
1181 }
1182 break;
1183
1184 /*-----------------------------------------------------------------*/
1185 case OP_TYPEQUERY:
1186 case OP_TYPEMINQUERY:
1187 case OP_TYPEPOSQUERY:
1188 ADD_ACTIVE(state_offset + 2, 0);
1189 if (clen > 0)
1190 {
1191 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1192 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1193 NLBLOCK->nltype == NLTYPE_FIXED &&
1194 NLBLOCK->nllen == 2 &&
1195 c == NLBLOCK->nl[0])
1196 {
1197 could_continue = partial_newline = TRUE;
1198 }
1199 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1200 (c < 256 &&
1201 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1202 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1203 {
1204 if (codevalue == OP_TYPEPOSQUERY)
1205 {
1206 active_count--; /* Remove non-match possibility */
1207 next_active_state--;
1208 }
1209 ADD_NEW(state_offset + 2, 0);
1210 }
1211 }
1212 break;
1213
1214 /*-----------------------------------------------------------------*/
1215 case OP_TYPESTAR:
1216 case OP_TYPEMINSTAR:
1217 case OP_TYPEPOSSTAR:
1218 ADD_ACTIVE(state_offset + 2, 0);
1219 if (clen > 0)
1220 {
1221 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1222 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1223 NLBLOCK->nltype == NLTYPE_FIXED &&
1224 NLBLOCK->nllen == 2 &&
1225 c == NLBLOCK->nl[0])
1226 {
1227 could_continue = partial_newline = TRUE;
1228 }
1229 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1230 (c < 256 &&
1231 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1232 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1233 {
1234 if (codevalue == OP_TYPEPOSSTAR)
1235 {
1236 active_count--; /* Remove non-match possibility */
1237 next_active_state--;
1238 }
1239 ADD_NEW(state_offset, 0);
1240 }
1241 }
1242 break;
1243
1244 /*-----------------------------------------------------------------*/
1245 case OP_TYPEEXACT:
1246 count = current_state->count; /* Number already matched */
1247 if (clen > 0)
1248 {
1249 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1250 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1251 NLBLOCK->nltype == NLTYPE_FIXED &&
1252 NLBLOCK->nllen == 2 &&
1253 c == NLBLOCK->nl[0])
1254 {
1255 could_continue = partial_newline = TRUE;
1256 }
1257 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1258 (c < 256 &&
1259 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1260 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1261 {
1262 if (++count >= (int)GET2(code, 1))
1263 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1264 else
1265 { ADD_NEW(state_offset, count); }
1266 }
1267 }
1268 break;
1269
1270 /*-----------------------------------------------------------------*/
1271 case OP_TYPEUPTO:
1272 case OP_TYPEMINUPTO:
1273 case OP_TYPEPOSUPTO:
1274 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1275 count = current_state->count; /* Number already matched */
1276 if (clen > 0)
1277 {
1278 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1279 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1280 NLBLOCK->nltype == NLTYPE_FIXED &&
1281 NLBLOCK->nllen == 2 &&
1282 c == NLBLOCK->nl[0])
1283 {
1284 could_continue = partial_newline = TRUE;
1285 }
1286 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1287 (c < 256 &&
1288 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1289 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1290 {
1291 if (codevalue == OP_TYPEPOSUPTO)
1292 {
1293 active_count--; /* Remove non-match possibility */
1294 next_active_state--;
1295 }
1296 if (++count >= (int)GET2(code, 1))
1297 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1298 else
1299 { ADD_NEW(state_offset, count); }
1300 }
1301 }
1302 break;
1303
1304 /* ========================================================================== */
1305 /* These are virtual opcodes that are used when something like
1306 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1307 OP_HSPACE/OP_VSPACE types as its argument. It keeps the code above fast
1308 for the other cases. The argument is in the d variable. */
1309
1310 #ifdef SUPPORT_UCP
1311 case OP_PROP_EXTRA + OP_TYPEPLUS:
1312 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1313 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1314 count = current_state->count; /* Already matched */
1315 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1316 if (clen > 0)
1317 {
1318 BOOL OK;
1319 const pcre_uint32 *cp;
1320 const ucd_record * prop = GET_UCD(c);
1321 switch(code[2])
1322 {
1323 case PT_ANY:
1324 OK = TRUE;
1325 break;
1326
1327 case PT_LAMP:
1328 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1329 prop->chartype == ucp_Lt;
1330 break;
1331
1332 case PT_GC:
1333 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1334 break;
1335
1336 case PT_PC:
1337 OK = prop->chartype == code[3];
1338 break;
1339
1340 case PT_SC:
1341 OK = prop->script == code[3];
1342 break;
1343
1344 /* These are specials for combination cases. */
1345
1346 case PT_ALNUM:
1347 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1348 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1349 break;
1350
1351 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1352 which means that Perl space and POSIX space are now identical. PCRE
1353 was changed at release 8.34. */
1354
1355 case PT_SPACE: /* Perl space */
1356 case PT_PXSPACE: /* POSIX space */
1357 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1358 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1359 c == CHAR_FF || c == CHAR_CR;
1360 break;
1361
1362 case PT_WORD:
1363 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1364 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1365 c == CHAR_UNDERSCORE;
1366 break;
1367
1368 case PT_CLIST:
1369 cp = PRIV(ucd_caseless_sets) + code[3];
1370 for (;;)
1371 {
1372 if (c < *cp) { OK = FALSE; break; }
1373 if (c == *cp++) { OK = TRUE; break; }
1374 }
1375 break;
1376
1377 case PT_UCNC:
1378 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1379 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1380 c >= 0xe000;
1381 break;
1382
1383 /* Should never occur, but keep compilers from grumbling. */
1384
1385 default:
1386 OK = codevalue != OP_PROP;
1387 break;
1388 }
1389
1390 if (OK == (d == OP_PROP))
1391 {
1392 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1393 {
1394 active_count--; /* Remove non-match possibility */
1395 next_active_state--;
1396 }
1397 count++;
1398 ADD_NEW(state_offset, count);
1399 }
1400 }
1401 break;
1402
1403 /*-----------------------------------------------------------------*/
1404 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1405 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1406 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1407 count = current_state->count; /* Already matched */
1408 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1409 if (clen > 0)
1410 {
1411 int lgb, rgb;
1412 const pcre_uchar *nptr = ptr + clen;
1413 int ncount = 0;
1414 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1415 {
1416 active_count--; /* Remove non-match possibility */
1417 next_active_state--;
1418 }
1419 lgb = UCD_GRAPHBREAK(c);
1420 while (nptr < end_subject)
1421 {
1422 dlen = 1;
1423 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1424 rgb = UCD_GRAPHBREAK(d);
1425 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1426 ncount++;
1427 lgb = rgb;
1428 nptr += dlen;
1429 }
1430 count++;
1431 ADD_NEW_DATA(-state_offset, count, ncount);
1432 }
1433 break;
1434 #endif
1435
1436 /*-----------------------------------------------------------------*/
1437 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1438 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1439 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1440 count = current_state->count; /* Already matched */
1441 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1442 if (clen > 0)
1443 {
1444 int ncount = 0;
1445 switch (c)
1446 {
1447 case CHAR_VT:
1448 case CHAR_FF:
1449 case CHAR_NEL:
1450 #ifndef EBCDIC
1451 case 0x2028:
1452 case 0x2029:
1453 #endif /* Not EBCDIC */
1454 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1455 goto ANYNL01;
1456
1457 case CHAR_CR:
1458 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1459 /* Fall through */
1460
1461 ANYNL01:
1462 case CHAR_LF:
1463 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1464 {
1465 active_count--; /* Remove non-match possibility */
1466 next_active_state--;
1467 }
1468 count++;
1469 ADD_NEW_DATA(-state_offset, count, ncount);
1470 break;
1471
1472 default:
1473 break;
1474 }
1475 }
1476 break;
1477
1478 /*-----------------------------------------------------------------*/
1479 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1480 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1481 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1482 count = current_state->count; /* Already matched */
1483 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1484 if (clen > 0)
1485 {
1486 BOOL OK;
1487 switch (c)
1488 {
1489 VSPACE_CASES:
1490 OK = TRUE;
1491 break;
1492
1493 default:
1494 OK = FALSE;
1495 break;
1496 }
1497
1498 if (OK == (d == OP_VSPACE))
1499 {
1500 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1501 {
1502 active_count--; /* Remove non-match possibility */
1503 next_active_state--;
1504 }
1505 count++;
1506 ADD_NEW_DATA(-state_offset, count, 0);
1507 }
1508 }
1509 break;
1510
1511 /*-----------------------------------------------------------------*/
1512 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1513 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1514 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1515 count = current_state->count; /* Already matched */
1516 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1517 if (clen > 0)
1518 {
1519 BOOL OK;
1520 switch (c)
1521 {
1522 HSPACE_CASES:
1523 OK = TRUE;
1524 break;
1525
1526 default:
1527 OK = FALSE;
1528 break;
1529 }
1530
1531 if (OK == (d == OP_HSPACE))
1532 {
1533 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1534 {
1535 active_count--; /* Remove non-match possibility */
1536 next_active_state--;
1537 }
1538 count++;
1539 ADD_NEW_DATA(-state_offset, count, 0);
1540 }
1541 }
1542 break;
1543
1544 /*-----------------------------------------------------------------*/
1545 #ifdef SUPPORT_UCP
1546 case OP_PROP_EXTRA + OP_TYPEQUERY:
1547 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1548 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1549 count = 4;
1550 goto QS1;
1551
1552 case OP_PROP_EXTRA + OP_TYPESTAR:
1553 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1554 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1555 count = 0;
1556
1557 QS1:
1558
1559 ADD_ACTIVE(state_offset + 4, 0);
1560 if (clen > 0)
1561 {
1562 BOOL OK;
1563 const pcre_uint32 *cp;
1564 const ucd_record * prop = GET_UCD(c);
1565 switch(code[2])
1566 {
1567 case PT_ANY:
1568 OK = TRUE;
1569 break;
1570
1571 case PT_LAMP:
1572 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1573 prop->chartype == ucp_Lt;
1574 break;
1575
1576 case PT_GC:
1577 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1578 break;
1579
1580 case PT_PC:
1581 OK = prop->chartype == code[3];
1582 break;
1583
1584 case PT_SC:
1585 OK = prop->script == code[3];
1586 break;
1587
1588 /* These are specials for combination cases. */
1589
1590 case PT_ALNUM:
1591 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1593 break;
1594
1595 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1596 which means that Perl space and POSIX space are now identical. PCRE
1597 was changed at release 8.34. */
1598
1599 case PT_SPACE: /* Perl space */
1600 case PT_PXSPACE: /* POSIX space */
1601 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1602 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1603 c == CHAR_FF || c == CHAR_CR;
1604 break;
1605
1606 case PT_WORD:
1607 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1608 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1609 c == CHAR_UNDERSCORE;
1610 break;
1611
1612 case PT_CLIST:
1613 cp = PRIV(ucd_caseless_sets) + code[3];
1614 for (;;)
1615 {
1616 if (c < *cp) { OK = FALSE; break; }
1617 if (c == *cp++) { OK = TRUE; break; }
1618 }
1619 break;
1620
1621 case PT_UCNC:
1622 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1623 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1624 c >= 0xe000;
1625 break;
1626
1627 /* Should never occur, but keep compilers from grumbling. */
1628
1629 default:
1630 OK = codevalue != OP_PROP;
1631 break;
1632 }
1633
1634 if (OK == (d == OP_PROP))
1635 {
1636 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1637 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1638 {
1639 active_count--; /* Remove non-match possibility */
1640 next_active_state--;
1641 }
1642 ADD_NEW(state_offset + count, 0);
1643 }
1644 }
1645 break;
1646
1647 /*-----------------------------------------------------------------*/
1648 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1649 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1650 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1651 count = 2;
1652 goto QS2;
1653
1654 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1655 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1656 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1657 count = 0;
1658
1659 QS2:
1660
1661 ADD_ACTIVE(state_offset + 2, 0);
1662 if (clen > 0)
1663 {
1664 int lgb, rgb;
1665 const pcre_uchar *nptr = ptr + clen;
1666 int ncount = 0;
1667 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1668 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1669 {
1670 active_count--; /* Remove non-match possibility */
1671 next_active_state--;
1672 }
1673 lgb = UCD_GRAPHBREAK(c);
1674 while (nptr < end_subject)
1675 {
1676 dlen = 1;
1677 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1678 rgb = UCD_GRAPHBREAK(d);
1679 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1680 ncount++;
1681 lgb = rgb;
1682 nptr += dlen;
1683 }
1684 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1685 }
1686 break;
1687 #endif
1688
1689 /*-----------------------------------------------------------------*/
1690 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1691 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1692 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1693 count = 2;
1694 goto QS3;
1695
1696 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1697 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1698 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1699 count = 0;
1700
1701 QS3:
1702 ADD_ACTIVE(state_offset + 2, 0);
1703 if (clen > 0)
1704 {
1705 int ncount = 0;
1706 switch (c)
1707 {
1708 case CHAR_VT:
1709 case CHAR_FF:
1710 case CHAR_NEL:
1711 #ifndef EBCDIC
1712 case 0x2028:
1713 case 0x2029:
1714 #endif /* Not EBCDIC */
1715 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1716 goto ANYNL02;
1717
1718 case CHAR_CR:
1719 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1720 /* Fall through */
1721
1722 ANYNL02:
1723 case CHAR_LF:
1724 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1725 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1726 {
1727 active_count--; /* Remove non-match possibility */
1728 next_active_state--;
1729 }
1730 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1731 break;
1732
1733 default:
1734 break;
1735 }
1736 }
1737 break;
1738
1739 /*-----------------------------------------------------------------*/
1740 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1741 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1742 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1743 count = 2;
1744 goto QS4;
1745
1746 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1747 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1748 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1749 count = 0;
1750
1751 QS4:
1752 ADD_ACTIVE(state_offset + 2, 0);
1753 if (clen > 0)
1754 {
1755 BOOL OK;
1756 switch (c)
1757 {
1758 VSPACE_CASES:
1759 OK = TRUE;
1760 break;
1761
1762 default:
1763 OK = FALSE;
1764 break;
1765 }
1766 if (OK == (d == OP_VSPACE))
1767 {
1768 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1769 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1770 {
1771 active_count--; /* Remove non-match possibility */
1772 next_active_state--;
1773 }
1774 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1775 }
1776 }
1777 break;
1778
1779 /*-----------------------------------------------------------------*/
1780 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1781 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1782 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1783 count = 2;
1784 goto QS5;
1785
1786 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1787 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1788 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1789 count = 0;
1790
1791 QS5:
1792 ADD_ACTIVE(state_offset + 2, 0);
1793 if (clen > 0)
1794 {
1795 BOOL OK;
1796 switch (c)
1797 {
1798 HSPACE_CASES:
1799 OK = TRUE;
1800 break;
1801
1802 default:
1803 OK = FALSE;
1804 break;
1805 }
1806
1807 if (OK == (d == OP_HSPACE))
1808 {
1809 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1810 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1811 {
1812 active_count--; /* Remove non-match possibility */
1813 next_active_state--;
1814 }
1815 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1816 }
1817 }
1818 break;
1819
1820 /*-----------------------------------------------------------------*/
1821 #ifdef SUPPORT_UCP
1822 case OP_PROP_EXTRA + OP_TYPEEXACT:
1823 case OP_PROP_EXTRA + OP_TYPEUPTO:
1824 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1825 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1826 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1827 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1828 count = current_state->count; /* Number already matched */
1829 if (clen > 0)
1830 {
1831 BOOL OK;
1832 const pcre_uint32 *cp;
1833 const ucd_record * prop = GET_UCD(c);
1834 switch(code[1 + IMM2_SIZE + 1])
1835 {
1836 case PT_ANY:
1837 OK = TRUE;
1838 break;
1839
1840 case PT_LAMP:
1841 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1842 prop->chartype == ucp_Lt;
1843 break;
1844
1845 case PT_GC:
1846 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1847 break;
1848
1849 case PT_PC:
1850 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1851 break;
1852
1853 case PT_SC:
1854 OK = prop->script == code[1 + IMM2_SIZE + 2];
1855 break;
1856
1857 /* These are specials for combination cases. */
1858
1859 case PT_ALNUM:
1860 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1861 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1862 break;
1863
1864 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1865 which means that Perl space and POSIX space are now identical. PCRE
1866 was changed at release 8.34. */
1867
1868 case PT_SPACE: /* Perl space */
1869 case PT_PXSPACE: /* POSIX space */
1870 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1871 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1872 c == CHAR_FF || c == CHAR_CR;
1873 break;
1874
1875 case PT_WORD:
1876 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1877 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1878 c == CHAR_UNDERSCORE;
1879 break;
1880
1881 case PT_CLIST:
1882 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1883 for (;;)
1884 {
1885 if (c < *cp) { OK = FALSE; break; }
1886 if (c == *cp++) { OK = TRUE; break; }
1887 }
1888 break;
1889
1890 case PT_UCNC:
1891 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1892 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1893 c >= 0xe000;
1894 break;
1895
1896 /* Should never occur, but keep compilers from grumbling. */
1897
1898 default:
1899 OK = codevalue != OP_PROP;
1900 break;
1901 }
1902
1903 if (OK == (d == OP_PROP))
1904 {
1905 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1906 {
1907 active_count--; /* Remove non-match possibility */
1908 next_active_state--;
1909 }
1910 if (++count >= (int)GET2(code, 1))
1911 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1912 else
1913 { ADD_NEW(state_offset, count); }
1914 }
1915 }
1916 break;
1917
1918 /*-----------------------------------------------------------------*/
1919 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1920 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1921 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1922 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1923 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1924 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925 count = current_state->count; /* Number already matched */
1926 if (clen > 0)
1927 {
1928 int lgb, rgb;
1929 const pcre_uchar *nptr = ptr + clen;
1930 int ncount = 0;
1931 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1932 {
1933 active_count--; /* Remove non-match possibility */
1934 next_active_state--;
1935 }
1936 lgb = UCD_GRAPHBREAK(c);
1937 while (nptr < end_subject)
1938 {
1939 dlen = 1;
1940 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1941 rgb = UCD_GRAPHBREAK(d);
1942 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1943 ncount++;
1944 lgb = rgb;
1945 nptr += dlen;
1946 }
1947 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1948 reset_could_continue = TRUE;
1949 if (++count >= (int)GET2(code, 1))
1950 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1951 else
1952 { ADD_NEW_DATA(-state_offset, count, ncount); }
1953 }
1954 break;
1955 #endif
1956
1957 /*-----------------------------------------------------------------*/
1958 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1959 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1960 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1961 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1962 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1963 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1964 count = current_state->count; /* Number already matched */
1965 if (clen > 0)
1966 {
1967 int ncount = 0;
1968 switch (c)
1969 {
1970 case CHAR_VT:
1971 case CHAR_FF:
1972 case CHAR_NEL:
1973 #ifndef EBCDIC
1974 case 0x2028:
1975 case 0x2029:
1976 #endif /* Not EBCDIC */
1977 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1978 goto ANYNL03;
1979
1980 case CHAR_CR:
1981 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1982 /* Fall through */
1983
1984 ANYNL03:
1985 case CHAR_LF:
1986 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1987 {
1988 active_count--; /* Remove non-match possibility */
1989 next_active_state--;
1990 }
1991 if (++count >= (int)GET2(code, 1))
1992 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1993 else
1994 { ADD_NEW_DATA(-state_offset, count, ncount); }
1995 break;
1996
1997 default:
1998 break;
1999 }
2000 }
2001 break;
2002
2003 /*-----------------------------------------------------------------*/
2004 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2005 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2006 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2007 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2008 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2009 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2010 count = current_state->count; /* Number already matched */
2011 if (clen > 0)
2012 {
2013 BOOL OK;
2014 switch (c)
2015 {
2016 VSPACE_CASES:
2017 OK = TRUE;
2018 break;
2019
2020 default:
2021 OK = FALSE;
2022 }
2023
2024 if (OK == (d == OP_VSPACE))
2025 {
2026 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2027 {
2028 active_count--; /* Remove non-match possibility */
2029 next_active_state--;
2030 }
2031 if (++count >= (int)GET2(code, 1))
2032 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2033 else
2034 { ADD_NEW_DATA(-state_offset, count, 0); }
2035 }
2036 }
2037 break;
2038
2039 /*-----------------------------------------------------------------*/
2040 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2041 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2042 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2043 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2044 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2045 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2046 count = current_state->count; /* Number already matched */
2047 if (clen > 0)
2048 {
2049 BOOL OK;
2050 switch (c)
2051 {
2052 HSPACE_CASES:
2053 OK = TRUE;
2054 break;
2055
2056 default:
2057 OK = FALSE;
2058 break;
2059 }
2060
2061 if (OK == (d == OP_HSPACE))
2062 {
2063 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2064 {
2065 active_count--; /* Remove non-match possibility */
2066 next_active_state--;
2067 }
2068 if (++count >= (int)GET2(code, 1))
2069 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2070 else
2071 { ADD_NEW_DATA(-state_offset, count, 0); }
2072 }
2073 }
2074 break;
2075
2076 /* ========================================================================== */
2077 /* These opcodes are followed by a character that is usually compared
2078 to the current subject character; it is loaded into d. We still get
2079 here even if there is no subject character, because in some cases zero
2080 repetitions are permitted. */
2081
2082 /*-----------------------------------------------------------------*/
2083 case OP_CHAR:
2084 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2085 break;
2086
2087 /*-----------------------------------------------------------------*/
2088 case OP_CHARI:
2089 if (clen == 0) break;
2090
2091 #ifdef SUPPORT_UTF
2092 if (utf)
2093 {
2094 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2095 {
2096 unsigned int othercase;
2097 if (c < 128)
2098 othercase = fcc[c];
2099 else
2100 /* If we have Unicode property support, we can use it to test the
2101 other case of the character. */
2102 #ifdef SUPPORT_UCP
2103 othercase = UCD_OTHERCASE(c);
2104 #else
2105 othercase = NOTACHAR;
2106 #endif
2107
2108 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2109 }
2110 }
2111 else
2112 #endif /* SUPPORT_UTF */
2113 /* Not UTF mode */
2114 {
2115 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2116 { ADD_NEW(state_offset + 2, 0); }
2117 }
2118 break;
2119
2120
2121 #ifdef SUPPORT_UCP
2122 /*-----------------------------------------------------------------*/
2123 /* This is a tricky one because it can match more than one character.
2124 Find out how many characters to skip, and then set up a negative state
2125 to wait for them to pass before continuing. */
2126
2127 case OP_EXTUNI:
2128 if (clen > 0)
2129 {
2130 int lgb, rgb;
2131 const pcre_uchar *nptr = ptr + clen;
2132 int ncount = 0;
2133 lgb = UCD_GRAPHBREAK(c);
2134 while (nptr < end_subject)
2135 {
2136 dlen = 1;
2137 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2138 rgb = UCD_GRAPHBREAK(d);
2139 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2140 ncount++;
2141 lgb = rgb;
2142 nptr += dlen;
2143 }
2144 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2145 reset_could_continue = TRUE;
2146 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2147 }
2148 break;
2149 #endif
2150
2151 /*-----------------------------------------------------------------*/
2152 /* This is tricky, like EXTUNI, because it too can match more than one
2153 character (when CR is followed by LF). In this case, set up a negative
2154 state to wait for one character to pass before continuing. */
2155
2156 case OP_ANYNL:
2157 if (clen > 0) switch(c)
2158 {
2159 case CHAR_VT:
2160 case CHAR_FF:
2161 case CHAR_NEL:
2162 #ifndef EBCDIC
2163 case 0x2028:
2164 case 0x2029:
2165 #endif /* Not EBCDIC */
2166 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2167
2168 case CHAR_LF:
2169 ADD_NEW(state_offset + 1, 0);
2170 break;
2171
2172 case CHAR_CR:
2173 if (ptr + 1 >= end_subject)
2174 {
2175 ADD_NEW(state_offset + 1, 0);
2176 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2177 reset_could_continue = TRUE;
2178 }
2179 else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2180 {
2181 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2182 }
2183 else
2184 {
2185 ADD_NEW(state_offset + 1, 0);
2186 }
2187 break;
2188 }
2189 break;
2190
2191 /*-----------------------------------------------------------------*/
2192 case OP_NOT_VSPACE:
2193 if (clen > 0) switch(c)
2194 {
2195 VSPACE_CASES:
2196 break;
2197
2198 default:
2199 ADD_NEW(state_offset + 1, 0);
2200 break;
2201 }
2202 break;
2203
2204 /*-----------------------------------------------------------------*/
2205 case OP_VSPACE:
2206 if (clen > 0) switch(c)
2207 {
2208 VSPACE_CASES:
2209 ADD_NEW(state_offset + 1, 0);
2210 break;
2211
2212 default:
2213 break;
2214 }
2215 break;
2216
2217 /*-----------------------------------------------------------------*/
2218 case OP_NOT_HSPACE:
2219 if (clen > 0) switch(c)
2220 {
2221 HSPACE_CASES:
2222 break;
2223
2224 default:
2225 ADD_NEW(state_offset + 1, 0);
2226 break;
2227 }
2228 break;
2229
2230 /*-----------------------------------------------------------------*/
2231 case OP_HSPACE:
2232 if (clen > 0) switch(c)
2233 {
2234 HSPACE_CASES:
2235 ADD_NEW(state_offset + 1, 0);
2236 break;
2237
2238 default:
2239 break;
2240 }
2241 break;
2242
2243 /*-----------------------------------------------------------------*/
2244 /* Match a negated single character casefully. */
2245
2246 case OP_NOT:
2247 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2248 break;
2249
2250 /*-----------------------------------------------------------------*/
2251 /* Match a negated single character caselessly. */
2252
2253 case OP_NOTI:
2254 if (clen > 0)
2255 {
2256 unsigned int otherd;
2257 #ifdef SUPPORT_UTF
2258 if (utf && d >= 128)
2259 {
2260 #ifdef SUPPORT_UCP
2261 otherd = UCD_OTHERCASE(d);
2262 #endif /* SUPPORT_UCP */
2263 }
2264 else
2265 #endif /* SUPPORT_UTF */
2266 otherd = TABLE_GET(d, fcc, d);
2267 if (c != d && c != otherd)
2268 { ADD_NEW(state_offset + dlen + 1, 0); }
2269 }
2270 break;
2271
2272 /*-----------------------------------------------------------------*/
2273 case OP_PLUSI:
2274 case OP_MINPLUSI:
2275 case OP_POSPLUSI:
2276 case OP_NOTPLUSI:
2277 case OP_NOTMINPLUSI:
2278 case OP_NOTPOSPLUSI:
2279 caseless = TRUE;
2280 codevalue -= OP_STARI - OP_STAR;
2281
2282 /* Fall through */
2283 case OP_PLUS:
2284 case OP_MINPLUS:
2285 case OP_POSPLUS:
2286 case OP_NOTPLUS:
2287 case OP_NOTMINPLUS:
2288 case OP_NOTPOSPLUS:
2289 count = current_state->count; /* Already matched */
2290 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2291 if (clen > 0)
2292 {
2293 pcre_uint32 otherd = NOTACHAR;
2294 if (caseless)
2295 {
2296 #ifdef SUPPORT_UTF
2297 if (utf && d >= 128)
2298 {
2299 #ifdef SUPPORT_UCP
2300 otherd = UCD_OTHERCASE(d);
2301 #endif /* SUPPORT_UCP */
2302 }
2303 else
2304 #endif /* SUPPORT_UTF */
2305 otherd = TABLE_GET(d, fcc, d);
2306 }
2307 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2308 {
2309 if (count > 0 &&
2310 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2311 {
2312 active_count--; /* Remove non-match possibility */
2313 next_active_state--;
2314 }
2315 count++;
2316 ADD_NEW(state_offset, count);
2317 }
2318 }
2319 break;
2320
2321 /*-----------------------------------------------------------------*/
2322 case OP_QUERYI:
2323 case OP_MINQUERYI:
2324 case OP_POSQUERYI:
2325 case OP_NOTQUERYI:
2326 case OP_NOTMINQUERYI:
2327 case OP_NOTPOSQUERYI:
2328 caseless = TRUE;
2329 codevalue -= OP_STARI - OP_STAR;
2330 /* Fall through */
2331 case OP_QUERY:
2332 case OP_MINQUERY:
2333 case OP_POSQUERY:
2334 case OP_NOTQUERY:
2335 case OP_NOTMINQUERY:
2336 case OP_NOTPOSQUERY:
2337 ADD_ACTIVE(state_offset + dlen + 1, 0);
2338 if (clen > 0)
2339 {
2340 pcre_uint32 otherd = NOTACHAR;
2341 if (caseless)
2342 {
2343 #ifdef SUPPORT_UTF
2344 if (utf && d >= 128)
2345 {
2346 #ifdef SUPPORT_UCP
2347 otherd = UCD_OTHERCASE(d);
2348 #endif /* SUPPORT_UCP */
2349 }
2350 else
2351 #endif /* SUPPORT_UTF */
2352 otherd = TABLE_GET(d, fcc, d);
2353 }
2354 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2355 {
2356 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2357 {
2358 active_count--; /* Remove non-match possibility */
2359 next_active_state--;
2360 }
2361 ADD_NEW(state_offset + dlen + 1, 0);
2362 }
2363 }
2364 break;
2365
2366 /*-----------------------------------------------------------------*/
2367 case OP_STARI:
2368 case OP_MINSTARI:
2369 case OP_POSSTARI:
2370 case OP_NOTSTARI:
2371 case OP_NOTMINSTARI:
2372 case OP_NOTPOSSTARI:
2373 caseless = TRUE;
2374 codevalue -= OP_STARI - OP_STAR;
2375 /* Fall through */
2376 case OP_STAR:
2377 case OP_MINSTAR:
2378 case OP_POSSTAR:
2379 case OP_NOTSTAR:
2380 case OP_NOTMINSTAR:
2381 case OP_NOTPOSSTAR:
2382 ADD_ACTIVE(state_offset + dlen + 1, 0);
2383 if (clen > 0)
2384 {
2385 pcre_uint32 otherd = NOTACHAR;
2386 if (caseless)
2387 {
2388 #ifdef SUPPORT_UTF
2389 if (utf && d >= 128)
2390 {
2391 #ifdef SUPPORT_UCP
2392 otherd = UCD_OTHERCASE(d);
2393 #endif /* SUPPORT_UCP */
2394 }
2395 else
2396 #endif /* SUPPORT_UTF */
2397 otherd = TABLE_GET(d, fcc, d);
2398 }
2399 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2400 {
2401 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2402 {
2403 active_count--; /* Remove non-match possibility */
2404 next_active_state--;
2405 }
2406 ADD_NEW(state_offset, 0);
2407 }
2408 }
2409 break;
2410
2411 /*-----------------------------------------------------------------*/
2412 case OP_EXACTI:
2413 case OP_NOTEXACTI:
2414 caseless = TRUE;
2415 codevalue -= OP_STARI - OP_STAR;
2416 /* Fall through */
2417 case OP_EXACT:
2418 case OP_NOTEXACT:
2419 count = current_state->count; /* Number already matched */
2420 if (clen > 0)
2421 {
2422 pcre_uint32 otherd = NOTACHAR;
2423 if (caseless)
2424 {
2425 #ifdef SUPPORT_UTF
2426 if (utf && d >= 128)
2427 {
2428 #ifdef SUPPORT_UCP
2429 otherd = UCD_OTHERCASE(d);
2430 #endif /* SUPPORT_UCP */
2431 }
2432 else
2433 #endif /* SUPPORT_UTF */
2434 otherd = TABLE_GET(d, fcc, d);
2435 }
2436 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2437 {
2438 if (++count >= (int)GET2(code, 1))
2439 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2440 else
2441 { ADD_NEW(state_offset, count); }
2442 }
2443 }
2444 break;
2445
2446 /*-----------------------------------------------------------------*/
2447 case OP_UPTOI:
2448 case OP_MINUPTOI:
2449 case OP_POSUPTOI:
2450 case OP_NOTUPTOI:
2451 case OP_NOTMINUPTOI:
2452 case OP_NOTPOSUPTOI:
2453 caseless = TRUE;
2454 codevalue -= OP_STARI - OP_STAR;
2455 /* Fall through */
2456 case OP_UPTO:
2457 case OP_MINUPTO:
2458 case OP_POSUPTO:
2459 case OP_NOTUPTO:
2460 case OP_NOTMINUPTO:
2461 case OP_NOTPOSUPTO:
2462 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2463 count = current_state->count; /* Number already matched */
2464 if (clen > 0)
2465 {
2466 pcre_uint32 otherd = NOTACHAR;
2467 if (caseless)
2468 {
2469 #ifdef SUPPORT_UTF
2470 if (utf && d >= 128)
2471 {
2472 #ifdef SUPPORT_UCP
2473 otherd = UCD_OTHERCASE(d);
2474 #endif /* SUPPORT_UCP */
2475 }
2476 else
2477 #endif /* SUPPORT_UTF */
2478 otherd = TABLE_GET(d, fcc, d);
2479 }
2480 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2481 {
2482 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2483 {
2484 active_count--; /* Remove non-match possibility */
2485 next_active_state--;
2486 }
2487 if (++count >= (int)GET2(code, 1))
2488 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2489 else
2490 { ADD_NEW(state_offset, count); }
2491 }
2492 }
2493 break;
2494
2495
2496 /* ========================================================================== */
2497 /* These are the class-handling opcodes */
2498
2499 case OP_CLASS:
2500 case OP_NCLASS:
2501 case OP_XCLASS:
2502 {
2503 BOOL isinclass = FALSE;
2504 int next_state_offset;
2505 const pcre_uchar *ecode;
2506
2507 /* For a simple class, there is always just a 32-byte table, and we
2508 can set isinclass from it. */
2509
2510 if (codevalue != OP_XCLASS)
2511 {
2512 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2513 if (clen > 0)
2514 {
2515 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2516 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2517 }
2518 }
2519
2520 /* An extended class may have a table or a list of single characters,
2521 ranges, or both, and it may be positive or negative. There's a
2522 function that sorts all this out. */
2523
2524 else
2525 {
2526 ecode = code + GET(code, 1);
2527 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2528 }
2529
2530 /* At this point, isinclass is set for all kinds of class, and ecode
2531 points to the byte after the end of the class. If there is a
2532 quantifier, this is where it will be. */
2533
2534 next_state_offset = (int)(ecode - start_code);
2535
2536 switch (*ecode)
2537 {
2538 case OP_CRSTAR:
2539 case OP_CRMINSTAR:
2540 ADD_ACTIVE(next_state_offset + 1, 0);
2541 if (isinclass) { ADD_NEW(state_offset, 0); }
2542 break;
2543
2544 case OP_CRPLUS:
2545 case OP_CRMINPLUS:
2546 count = current_state->count; /* Already matched */
2547 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2548 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2549 break;
2550
2551 case OP_CRQUERY:
2552 case OP_CRMINQUERY:
2553 ADD_ACTIVE(next_state_offset + 1, 0);
2554 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2555 break;
2556
2557 case OP_CRRANGE:
2558 case OP_CRMINRANGE:
2559 count = current_state->count; /* Already matched */
2560 if (count >= (int)GET2(ecode, 1))
2561 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2562 if (isinclass)
2563 {
2564 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2565 if (++count >= max && max != 0) /* Max 0 => no limit */
2566 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2567 else
2568 { ADD_NEW(state_offset, count); }
2569 }
2570 break;
2571
2572 default:
2573 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2574 break;
2575 }
2576 }
2577 break;
2578
2579 /* ========================================================================== */
2580 /* These are the opcodes for fancy brackets of various kinds. We have
2581 to use recursion in order to handle them. The "always failing" assertion
2582 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2583 though the other "backtracking verbs" are not supported. */
2584
2585 case OP_FAIL:
2586 forced_fail++; /* Count FAILs for multiple states */
2587 break;
2588
2589 case OP_ASSERT:
2590 case OP_ASSERT_NOT:
2591 case OP_ASSERTBACK:
2592 case OP_ASSERTBACK_NOT:
2593 {
2594 int rc;
2595 int local_offsets[2];
2596 int local_workspace[1000];
2597 const pcre_uchar *endasscode = code + GET(code, 1);
2598
2599 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2600
2601 rc = internal_dfa_exec(
2602 md, /* static match data */
2603 code, /* this subexpression's code */
2604 ptr, /* where we currently are */
2605 (int)(ptr - start_subject), /* start offset */
2606 local_offsets, /* offset vector */
2607 sizeof(local_offsets)/sizeof(int), /* size of same */
2608 local_workspace, /* workspace vector */
2609 sizeof(local_workspace)/sizeof(int), /* size of same */
2610 rlevel); /* function recursion level */
2611
2612 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2613 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2614 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2615 }
2616 break;
2617
2618 /*-----------------------------------------------------------------*/
2619 case OP_COND:
2620 case OP_SCOND:
2621 {
2622 int local_offsets[1000];
2623 int local_workspace[1000];
2624 int codelink = GET(code, 1);
2625 int condcode;
2626
2627 /* Because of the way auto-callout works during compile, a callout item
2628 is inserted between OP_COND and an assertion condition. This does not
2629 happen for the other conditions. */
2630
2631 if (code[LINK_SIZE+1] == OP_CALLOUT)
2632 {
2633 rrc = 0;
2634 if (PUBL(callout) != NULL)
2635 {
2636 PUBL(callout_block) cb;
2637 cb.version = 1; /* Version 1 of the callout block */
2638 cb.callout_number = code[LINK_SIZE+2];
2639 cb.offset_vector = offsets;
2640 #if defined COMPILE_PCRE8
2641 cb.subject = (PCRE_SPTR)start_subject;
2642 #elif defined COMPILE_PCRE16
2643 cb.subject = (PCRE_SPTR16)start_subject;
2644 #elif defined COMPILE_PCRE32
2645 cb.subject = (PCRE_SPTR32)start_subject;
2646 #endif
2647 cb.subject_length = (int)(end_subject - start_subject);
2648 cb.start_match = (int)(current_subject - start_subject);
2649 cb.current_position = (int)(ptr - start_subject);
2650 cb.pattern_position = GET(code, LINK_SIZE + 3);
2651 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2652 cb.capture_top = 1;
2653 cb.capture_last = -1;
2654 cb.callout_data = md->callout_data;
2655 cb.mark = NULL; /* No (*MARK) support */
2656 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2657 }
2658 if (rrc > 0) break; /* Fail this thread */
2659 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2660 }
2661
2662 condcode = code[LINK_SIZE+1];
2663
2664 /* Back reference conditions are not supported */
2665
2666 if (condcode == OP_CREF || condcode == OP_NCREF)
2667 return PCRE_ERROR_DFA_UCOND;
2668
2669 /* The DEFINE condition is always false */
2670
2671 if (condcode == OP_DEF)
2672 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2673
2674 /* The only supported version of OP_RREF is for the value RREF_ANY,
2675 which means "test if in any recursion". We can't test for specifically
2676 recursed groups. */
2677
2678 else if (condcode == OP_RREF || condcode == OP_NRREF)
2679 {
2680 int value = GET2(code, LINK_SIZE + 2);
2681 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2682 if (md->recursive != NULL)
2683 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2684 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2685 }
2686
2687 /* Otherwise, the condition is an assertion */
2688
2689 else
2690 {
2691 int rc;
2692 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2693 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2694
2695 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2696
2697 rc = internal_dfa_exec(
2698 md, /* fixed match data */
2699 asscode, /* this subexpression's code */
2700 ptr, /* where we currently are */
2701 (int)(ptr - start_subject), /* start offset */
2702 local_offsets, /* offset vector */
2703 sizeof(local_offsets)/sizeof(int), /* size of same */
2704 local_workspace, /* workspace vector */
2705 sizeof(local_workspace)/sizeof(int), /* size of same */
2706 rlevel); /* function recursion level */
2707
2708 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2709 if ((rc >= 0) ==
2710 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2711 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2712 else
2713 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2714 }
2715 }
2716 break;
2717
2718 /*-----------------------------------------------------------------*/
2719 case OP_RECURSE:
2720 {
2721 dfa_recursion_info *ri;
2722 int local_offsets[1000];
2723 int local_workspace[1000];
2724 const pcre_uchar *callpat = start_code + GET(code, 1);
2725 int recno = (callpat == md->start_code)? 0 :
2726 GET2(callpat, 1 + LINK_SIZE);
2727 int rc;
2728
2729 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2730
2731 /* Check for repeating a recursion without advancing the subject
2732 pointer. This should catch convoluted mutual recursions. (Some simple
2733 cases are caught at compile time.) */
2734
2735 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2736 if (recno == ri->group_num && ptr == ri->subject_position)
2737 return PCRE_ERROR_RECURSELOOP;
2738
2739 /* Remember this recursion and where we started it so as to
2740 catch infinite loops. */
2741
2742 new_recursive.group_num = recno;
2743 new_recursive.subject_position = ptr;
2744 new_recursive.prevrec = md->recursive;
2745 md->recursive = &new_recursive;
2746
2747 rc = internal_dfa_exec(
2748 md, /* fixed match data */
2749 callpat, /* this subexpression's code */
2750 ptr, /* where we currently are */
2751 (int)(ptr - start_subject), /* start offset */
2752 local_offsets, /* offset vector */
2753 sizeof(local_offsets)/sizeof(int), /* size of same */
2754 local_workspace, /* workspace vector */
2755 sizeof(local_workspace)/sizeof(int), /* size of same */
2756 rlevel); /* function recursion level */
2757
2758 md->recursive = new_recursive.prevrec; /* Done this recursion */
2759
2760 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2761 rc));
2762
2763 /* Ran out of internal offsets */
2764
2765 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2766
2767 /* For each successful matched substring, set up the next state with a
2768 count of characters to skip before trying it. Note that the count is in
2769 characters, not bytes. */
2770
2771 if (rc > 0)
2772 {
2773 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2774 {
2775 int charcount = local_offsets[rc+1] - local_offsets[rc];
2776 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2777 if (utf)
2778 {
2779 const pcre_uchar *p = start_subject + local_offsets[rc];
2780 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2781 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2782 }
2783 #endif
2784 if (charcount > 0)
2785 {
2786 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2787 }
2788 else
2789 {
2790 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2791 }
2792 }
2793 }
2794 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2795 }
2796 break;
2797
2798 /*-----------------------------------------------------------------*/
2799 case OP_BRAPOS:
2800 case OP_SBRAPOS:
2801 case OP_CBRAPOS:
2802 case OP_SCBRAPOS:
2803 case OP_BRAPOSZERO:
2804 {
2805 int charcount, matched_count;
2806 const pcre_uchar *local_ptr = ptr;
2807 BOOL allow_zero;
2808
2809 if (codevalue == OP_BRAPOSZERO)
2810 {
2811 allow_zero = TRUE;
2812 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2813 }
2814 else allow_zero = FALSE;
2815
2816 /* Loop to match the subpattern as many times as possible as if it were
2817 a complete pattern. */
2818
2819 for (matched_count = 0;; matched_count++)
2820 {
2821 int local_offsets[2];
2822 int local_workspace[1000];
2823
2824 int rc = internal_dfa_exec(
2825 md, /* fixed match data */
2826 code, /* this subexpression's code */
2827 local_ptr, /* where we currently are */
2828 (int)(ptr - start_subject), /* start offset */
2829 local_offsets, /* offset vector */
2830 sizeof(local_offsets)/sizeof(int), /* size of same */
2831 local_workspace, /* workspace vector */
2832 sizeof(local_workspace)/sizeof(int), /* size of same */
2833 rlevel); /* function recursion level */
2834
2835 /* Failed to match */
2836
2837 if (rc < 0)
2838 {
2839 if (rc != PCRE_ERROR_NOMATCH) return rc;
2840 break;
2841 }
2842
2843 /* Matched: break the loop if zero characters matched. */
2844
2845 charcount = local_offsets[1] - local_offsets[0];
2846 if (charcount == 0) break;
2847 local_ptr += charcount; /* Advance temporary position ptr */
2848 }
2849
2850 /* At this point we have matched the subpattern matched_count
2851 times, and local_ptr is pointing to the character after the end of the
2852 last match. */
2853
2854 if (matched_count > 0 || allow_zero)
2855 {
2856 const pcre_uchar *end_subpattern = code;
2857 int next_state_offset;
2858
2859 do { end_subpattern += GET(end_subpattern, 1); }
2860 while (*end_subpattern == OP_ALT);
2861 next_state_offset =
2862 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2863
2864 /* Optimization: if there are no more active states, and there
2865 are no new states yet set up, then skip over the subject string
2866 right here, to save looping. Otherwise, set up the new state to swing
2867 into action when the end of the matched substring is reached. */
2868
2869 if (i + 1 >= active_count && new_count == 0)
2870 {
2871 ptr = local_ptr;
2872 clen = 0;
2873 ADD_NEW(next_state_offset, 0);
2874 }
2875 else
2876 {
2877 const pcre_uchar *p = ptr;
2878 const pcre_uchar *pp = local_ptr;
2879 charcount = (int)(pp - p);
2880 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2881 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2882 #endif
2883 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2884 }
2885 }
2886 }
2887 break;
2888
2889 /*-----------------------------------------------------------------*/
2890 case OP_ONCE:
2891 case OP_ONCE_NC:
2892 {
2893 int local_offsets[2];
2894 int local_workspace[1000];
2895
2896 int rc = internal_dfa_exec(
2897 md, /* fixed match data */
2898 code, /* this subexpression's code */
2899 ptr, /* where we currently are */
2900 (int)(ptr - start_subject), /* start offset */
2901 local_offsets, /* offset vector */
2902 sizeof(local_offsets)/sizeof(int), /* size of same */
2903 local_workspace, /* workspace vector */
2904 sizeof(local_workspace)/sizeof(int), /* size of same */
2905 rlevel); /* function recursion level */
2906
2907 if (rc >= 0)
2908 {
2909 const pcre_uchar *end_subpattern = code;
2910 int charcount = local_offsets[1] - local_offsets[0];
2911 int next_state_offset, repeat_state_offset;
2912
2913 do { end_subpattern += GET(end_subpattern, 1); }
2914 while (*end_subpattern == OP_ALT);
2915 next_state_offset =
2916 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2917
2918 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2919 arrange for the repeat state also to be added to the relevant list.
2920 Calculate the offset, or set -1 for no repeat. */
2921
2922 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2923 *end_subpattern == OP_KETRMIN)?
2924 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2925
2926 /* If we have matched an empty string, add the next state at the
2927 current character pointer. This is important so that the duplicate
2928 checking kicks in, which is what breaks infinite loops that match an
2929 empty string. */
2930
2931 if (charcount == 0)
2932 {
2933 ADD_ACTIVE(next_state_offset, 0);
2934 }
2935
2936 /* Optimization: if there are no more active states, and there
2937 are no new states yet set up, then skip over the subject string
2938 right here, to save looping. Otherwise, set up the new state to swing
2939 into action when the end of the matched substring is reached. */
2940
2941 else if (i + 1 >= active_count && new_count == 0)
2942 {
2943 ptr += charcount;
2944 clen = 0;
2945 ADD_NEW(next_state_offset, 0);
2946
2947 /* If we are adding a repeat state at the new character position,
2948 we must fudge things so that it is the only current state.
2949 Otherwise, it might be a duplicate of one we processed before, and
2950 that would cause it to be skipped. */
2951
2952 if (repeat_state_offset >= 0)
2953 {
2954 next_active_state = active_states;
2955 active_count = 0;
2956 i = -1;
2957 ADD_ACTIVE(repeat_state_offset, 0);
2958 }
2959 }
2960 else
2961 {
2962 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2963 if (utf)
2964 {
2965 const pcre_uchar *p = start_subject + local_offsets[0];
2966 const pcre_uchar *pp = start_subject + local_offsets[1];
2967 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2968 }
2969 #endif
2970 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2971 if (repeat_state_offset >= 0)
2972 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2973 }
2974 }
2975 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2976 }
2977 break;
2978
2979
2980 /* ========================================================================== */
2981 /* Handle callouts */
2982
2983 case OP_CALLOUT:
2984 rrc = 0;
2985 if (PUBL(callout) != NULL)
2986 {
2987 PUBL(callout_block) cb;
2988 cb.version = 1; /* Version 1 of the callout block */
2989 cb.callout_number = code[1];
2990 cb.offset_vector = offsets;
2991 #if defined COMPILE_PCRE8
2992 cb.subject = (PCRE_SPTR)start_subject;
2993 #elif defined COMPILE_PCRE16
2994 cb.subject = (PCRE_SPTR16)start_subject;
2995 #elif defined COMPILE_PCRE32
2996 cb.subject = (PCRE_SPTR32)start_subject;
2997 #endif
2998 cb.subject_length = (int)(end_subject - start_subject);
2999 cb.start_match = (int)(current_subject - start_subject);
3000 cb.current_position = (int)(ptr - start_subject);
3001 cb.pattern_position = GET(code, 2);
3002 cb.next_item_length = GET(code, 2 + LINK_SIZE);
3003 cb.capture_top = 1;
3004 cb.capture_last = -1;
3005 cb.callout_data = md->callout_data;
3006 cb.mark = NULL; /* No (*MARK) support */
3007 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3008 }
3009 if (rrc == 0)
3010 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3011 break;
3012
3013
3014 /* ========================================================================== */
3015 default: /* Unsupported opcode */
3016 return PCRE_ERROR_DFA_UITEM;
3017 }
3018
3019 NEXT_ACTIVE_STATE: continue;
3020
3021 } /* End of loop scanning active states */
3022
3023 /* We have finished the processing at the current subject character. If no
3024 new states have been set for the next character, we have found all the
3025 matches that we are going to find. If we are at the top level and partial
3026 matching has been requested, check for appropriate conditions.
3027
3028 The "forced_fail" variable counts the number of (*F) encountered for the
3029 character. If it is equal to the original active_count (saved in
3030 workspace[1]) it means that (*F) was found on every active state. In this
3031 case we don't want to give a partial match.
3032
3033 The "could_continue" variable is true if a state could have continued but
3034 for the fact that the end of the subject was reached. */
3035
3036 if (new_count <= 0)
3037 {
3038 if (rlevel == 1 && /* Top level, and */
3039 could_continue && /* Some could go on, and */
3040 forced_fail != workspace[1] && /* Not all forced fail & */
3041 ( /* either... */
3042 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3043 || /* or... */
3044 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3045 match_count < 0) /* no matches */
3046 ) && /* And... */
3047 (
3048 partial_newline || /* Either partial NL */
3049 ( /* or ... */
3050 ptr >= end_subject && /* End of subject and */
3051 ptr > md->start_used_ptr) /* Inspected non-empty string */
3052 )
3053 )
3054 match_count = PCRE_ERROR_PARTIAL;
3055 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3056 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3057 rlevel*2-2, SP));
3058 break; /* In effect, "return", but see the comment below */
3059 }
3060
3061 /* One or more states are active for the next character. */
3062
3063 ptr += clen; /* Advance to next subject character */
3064 } /* Loop to move along the subject string */
3065
3066 /* Control gets here from "break" a few lines above. We do it this way because
3067 if we use "return" above, we have compiler trouble. Some compilers warn if
3068 there's nothing here because they think the function doesn't return a value. On
3069 the other hand, if we put a dummy statement here, some more clever compilers
3070 complain that it can't be reached. Sigh. */
3071
3072 return match_count;
3073 }
3074
3075
3076
3077
3078 /*************************************************
3079 * Execute a Regular Expression - DFA engine *
3080 *************************************************/
3081
3082 /* This external function applies a compiled regular expression to a subject
3083 string using a DFA engine. It calls the internal function internal_dfa_exec()
3084 once per candidate start position, repeating only if the pattern is not anchored.
3085
3086 Arguments:
3087 argument_re points to the compiled expression (must not be NULL)
3088 extra_data points to extra data or is NULL
3089 subject points to the subject string (must not be NULL)
3090 length length of subject string (may contain binary zeros); must be >= 0
3091 start_offset where to start in the subject string; must be in 0..length
3092 options option bits; only bits in PUBLIC_DFA_EXEC_OPTIONS are accepted
3093 offsets vector of match offsets (may be NULL only if offsetcount is 0)
3094 offsetcount size of same; must be >= 0
3095 workspace workspace vector (must not be NULL)
3096 wscount size of same; must be at least 20
3097
3098 Returns: > 0 => number of match offset pairs placed in offsets
3099 = 0 => offsets overflowed; longest matches are present
3100 -1 => failed to match
3101 < -1 => some kind of unexpected problem
3102 */
3103
3104 #if defined COMPILE_PCRE8
3105 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3106 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3107 const char *subject, int length, int start_offset, int options, int *offsets,
3108 int offsetcount, int *workspace, int wscount)
3109 #elif defined COMPILE_PCRE16
3110 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3111 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3112 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3113 int offsetcount, int *workspace, int wscount)
3114 #elif defined COMPILE_PCRE32
3115 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3116 pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3117 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3118 int offsetcount, int *workspace, int wscount)
3119 #endif
3120 {
3121 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3122 dfa_match_data match_block;
3123 dfa_match_data *md = &match_block; /* fixed data handed to internal_dfa_exec() */
3124 BOOL utf, anchored, startline, firstline;
3125 const pcre_uchar *current_subject, *end_subject;
3126 const pcre_study_data *study = NULL;
3127
3128 const pcre_uchar *req_char_ptr; /* where the required char was last found */
3129 const pcre_uint8 *start_bits = NULL; /* studied bitmap of possible first chars */
3130 BOOL has_first_char = FALSE;
3131 BOOL has_req_char = FALSE;
3132 pcre_uchar first_char = 0;
3133 pcre_uchar first_char2 = 0; /* alternative to first_char when caseless */
3134 pcre_uchar req_char = 0;
3135 pcre_uchar req_char2 = 0; /* alternative to req_char when caseless */
3136 int newline; /* -1 = any, -2 = anycrlf, > 255 = two-character sequence */
3137
3138 /* Plausibility checks: bad option bits, NULL pointers, and out-of-range sizes or offsets each give their own error code. */
3139
3140 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3141 if (re == NULL || subject == NULL || workspace == NULL ||
3142 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3143 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3144 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3145 if (length < 0) return PCRE_ERROR_BADLENGTH;
3146 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3147
3148 /* Check that the first field in the block is the magic number. If it is not,
3149 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3150 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3151 means that the pattern is likely compiled with different endianness. The
3152 PCRE_MODE flag must also be set in the pattern, or PCRE_ERROR_BADMODE is given. */
3153 if (re->magic_number != MAGIC_NUMBER)
3154 return re->magic_number == REVERSED_MAGIC_NUMBER?
3155 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3156 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3157
3158 /* If restarting after a partial match, do some sanity checks on the contents
3159 of the workspace: workspace[0] must be 0 or 1, and workspace[1] must be in the range 1 to (wscount - 2)/INTS_PER_STATEBLOCK. */
3160
3161 if ((options & PCRE_DFA_RESTART) != 0)
3162 {
3163 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3164 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3165 return PCRE_ERROR_DFA_BADRESTART;
3166 }
3167
3168 /* Set up study, callout, and table data. Values in extra_data override the defaults only when the corresponding flag bit is set. */
3169
3170 md->tables = re->tables;
3171 md->callout_data = NULL;
3172
3173 if (extra_data != NULL)
3174 {
3175 unsigned int flags = extra_data->flags;
3176 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3177 study = (const pcre_study_data *)extra_data->study_data;
3178 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; /* match limits are rejected here */
3179 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3180 return PCRE_ERROR_DFA_UMLIMIT;
3181 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3182 md->callout_data = extra_data->callout_data;
3183 if ((flags & PCRE_EXTRA_TABLES) != 0)
3184 md->tables = extra_data->tables;
3185 }
3186
3187 /* Set some local values */
3188
3189 current_subject = (const pcre_uchar *)subject + start_offset;
3190 end_subject = (const pcre_uchar *)subject + length;
3191 req_char_ptr = current_subject - 1; /* before the start, so the first required-char search is always done */
3192
3193 #ifdef SUPPORT_UTF
3194 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3195 utf = (re->options & PCRE_UTF8) != 0;
3196 #else
3197 utf = FALSE;
3198 #endif
3199
3200 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3201 (re->options & PCRE_ANCHORED) != 0; /* a restart is treated as anchored */
3202
3203 /* The remaining fixed data for passing around. */
3204
3205 md->start_code = (const pcre_uchar *)argument_re +
3206 re->name_table_offset + re->name_count * re->name_entry_size; /* first code unit past the name table */
3207 md->start_subject = (const pcre_uchar *)subject;
3208 md->end_subject = end_subject;
3209 md->start_offset = start_offset;
3210 md->moptions = options;
3211 md->poptions = re->options;
3212
3213 /* If the BSR option is not set at match time, copy what was set at compile time. If it
3214 was not set then either, PCRE_BSR_ANYCRLF is used when the build defines BSR_ANYCRLF. */
3215
3216 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3217 {
3218 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3219 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3220 #ifdef BSR_ANYCRLF
3221 else md->moptions |= PCRE_BSR_ANYCRLF;
3222 #endif
3223 }
3224
3225 /* Handle different types of newline. The three bits give eight cases. If
3226 nothing is set at run time, whatever was used at compile time applies. */
3227
3228 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3229 PCRE_NEWLINE_BITS)
3230 {
3231 case 0: newline = NEWLINE; break; /* Compile-time default */
3232 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3233 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3234 case PCRE_NEWLINE_CR+
3235 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; /* two chars packed into one int */
3236 case PCRE_NEWLINE_ANY: newline = -1; break;
3237 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3238 default: return PCRE_ERROR_BADNEWLINE;
3239 }
3240
3241 if (newline == -2)
3242 {
3243 md->nltype = NLTYPE_ANYCRLF;
3244 }
3245 else if (newline < 0)
3246 {
3247 md->nltype = NLTYPE_ANY;
3248 }
3249 else
3250 {
3251 md->nltype = NLTYPE_FIXED;
3252 if (newline > 255) /* a packed two-character newline: unpack high part first */
3253 {
3254 md->nllen = 2;
3255 md->nl[0] = (newline >> 8) & 255;
3256 md->nl[1] = newline & 255;
3257 }
3258 else
3259 {
3260 md->nllen = 1;
3261 md->nl[0] = newline;
3262 }
3263 }
3264
3265 /* Check a UTF string if required. If the check fails and offsetcount >= 2, the error
3266 offset is passed back in offsets[0] and the detailed error code in offsets[1]. */
3267
3268 #ifdef SUPPORT_UTF
3269 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3270 {
3271 int erroroffset;
3272 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3273 if (errorcode != 0)
3274 {
3275 if (offsetcount >= 2)
3276 {
3277 offsets[0] = erroroffset;
3278 offsets[1] = errorcode;
3279 }
3280 #if defined COMPILE_PCRE8
3281 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3282 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3283 #elif defined COMPILE_PCRE16
3284 return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3285 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3286 #elif defined COMPILE_PCRE32
3287 return PCRE_ERROR_BADUTF32;
3288 #endif
3289 }
3290 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3291 if (start_offset > 0 && start_offset < length &&
3292 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3293 return PCRE_ERROR_BADUTF8_OFFSET; /* presumably: start_offset points into the middle of a character */
3294 #endif
3295 }
3296 #endif
3297
3298 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3299 is a feature that makes it possible to save compiled regexes and re-use them
3300 in other programs later. */
3301
3302 if (md->tables == NULL) md->tables = PRIV(default_tables);
3303
3304 /* The "must be at the start of a line" flags are used in a loop when finding
3305 where to start. */
3306
3307 startline = (re->flags & PCRE_STARTLINE) != 0;
3308 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3309
3310 /* Set up the first character to match, if available. The first_char value is
3311 never set for an anchored regular expression, but the anchoring may be forced
3312 at run time, so we have to test for anchoring. The first char may be unset for
3313 an unanchored pattern, of course. If there's no first char and the pattern was
3314 studied, there may be a bitmap of possible first characters. */
3315
3316 if (!anchored)
3317 {
3318 if ((re->flags & PCRE_FIRSTSET) != 0)
3319 {
3320 has_first_char = TRUE;
3321 first_char = first_char2 = (pcre_uchar)(re->first_char);
3322 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3323 {
3324 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3325 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3326 if (utf && first_char > 127)
3327 first_char2 = UCD_OTHERCASE(first_char);
3328 #endif
3329 }
3330 }
3331 else
3332 {
3333 if (!startline && study != NULL &&
3334 (study->flags & PCRE_STUDY_MAPPED) != 0)
3335 start_bits = study->start_bits;
3336 }
3337 }
3338
3339 /* For anchored or unanchored matches, there may be a "last known required
3340 character" set. */
3341
3342 if ((re->flags & PCRE_REQCHSET) != 0)
3343 {
3344 has_req_char = TRUE;
3345 req_char = req_char2 = (pcre_uchar)(re->req_char);
3346 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3347 {
3348 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3349 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3350 if (utf && req_char > 127)
3351 req_char2 = UCD_OTHERCASE(req_char);
3352 #endif
3353 }
3354 }
3355
3356 /* Call the main matching function, looping for a non-anchored regex after a
3357 failed match. If not restarting, perform certain optimizations at the start of
3358 a match. */
3359
3360 for (;;)
3361 {
3362 int rc;
3363
3364 if ((options & PCRE_DFA_RESTART) == 0)
3365 {
3366 const pcre_uchar *save_end_subject = end_subject;
3367
3368 /* If firstline is TRUE, the start of the match is constrained to the first
3369 line of a multiline string. Implement this by temporarily adjusting
3370 end_subject so that we stop scanning at a newline. If the match fails at
3371 the newline, later code breaks this loop. */
3372
3373 if (firstline)
3374 {
3375 PCRE_PUCHAR t = current_subject;
3376 #ifdef SUPPORT_UTF
3377 if (utf)
3378 {
3379 while (t < md->end_subject && !IS_NEWLINE(t))
3380 {
3381 t++;
3382 ACROSSCHAR(t < end_subject, *t, t++);
3383 }
3384 }
3385 else
3386 #endif
3387 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3388 end_subject = t;
3389 }
3390
3391 /* There are some optimizations that avoid running the match if a known
3392 starting point is not found. However, there is an option that disables
3393 these, for testing and for ensuring that all callouts do actually occur.
3394 The option can be set in the regex by (*NO_START_OPT) or passed in
3395 match-time options. */
3396
3397 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3398 {
3399 /* Advance to a known first char (either case, if the two differ). */
3400
3401 if (has_first_char)
3402 {
3403 if (first_char != first_char2)
3404 {
3405 pcre_uchar csc;
3406 while (current_subject < end_subject &&
3407 (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3408 current_subject++;
3409 }
3410 else
3411 while (current_subject < end_subject &&
3412 RAWUCHARTEST(current_subject) != first_char)
3413 current_subject++;
3414 }
3415
3416 /* Or to just after a linebreak for a multiline match if possible */
3417
3418 else if (startline)
3419 {
3420 if (current_subject > md->start_subject + start_offset) /* not for the attempt at the original start point */
3421 {
3422 #ifdef SUPPORT_UTF
3423 if (utf)
3424 {
3425 while (current_subject < end_subject &&
3426 !WAS_NEWLINE(current_subject))
3427 {
3428 current_subject++;
3429 ACROSSCHAR(current_subject < end_subject, *current_subject,
3430 current_subject++);
3431 }
3432 }
3433 else
3434 #endif
3435 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3436 current_subject++;
3437
3438 /* If we have just passed a CR and the newline option is ANY or
3439 ANYCRLF, and we are now at a LF, advance the match position by one
3440 more character. */
3441
3442 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3443 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3444 current_subject < end_subject &&
3445 RAWUCHARTEST(current_subject) == CHAR_NL)
3446 current_subject++;
3447 }
3448 }
3449
3450 /* Or to a non-unique first char after study */
3451
3452 else if (start_bits != NULL)
3453 {
3454 while (current_subject < end_subject)
3455 {
3456 register pcre_uint32 c = RAWUCHARTEST(current_subject);
3457 #ifndef COMPILE_PCRE8
3458 if (c > 255) c = 255; /* clamp so that the bitmap index below stays within 0..255 */
3459 #endif
3460 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3461 {
3462 current_subject++;
3463 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3464 /* In non 8-bit mode, the iteration will stop for
3465 characters > 255 at the beginning or not stop at all. */
3466 if (utf)
3467 ACROSSCHAR(current_subject < end_subject, *current_subject,
3468 current_subject++);
3469 #endif
3470 }
3471 else break;
3472 }
3473 }
3474 }
3475
3476 /* Restore fudged end_subject */
3477
3478 end_subject = save_end_subject;
3479
3480 /* The following two optimizations are disabled for partial matching or if
3481 disabling is explicitly requested (and of course, by the test above, this
3482 code is not obeyed when restarting after a partial match). */
3483
3484 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3485 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3486 {
3487 /* If the pattern was studied, a minimum subject length may be set. This
3488 is a lower bound; no actual string of that length may actually match the
3489 pattern. Although the value is, strictly, in characters, we treat it as
3490 bytes to avoid spending too much time in this optimization. */
3491
3492 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3493 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3494 return PCRE_ERROR_NOMATCH;
3495
3496 /* If req_char is set, we know that that character must appear in the
3497 subject for the match to succeed. If the first character is set, req_char
3498 must be later in the subject; otherwise the test starts at the match
3499 point. This optimization can save a huge amount of work in patterns with
3500 nested unlimited repeats that aren't going to match. Writing separate
3501 code for cased/caseless versions makes it go faster, as does using an
3502 autoincrement and backing off on a match.
3503
3504 HOWEVER: when the subject string is very, very long, searching to its end
3505 can take a long time, and give bad performance on quite ordinary
3506 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3507 string... so we don't do this when the string is sufficiently long. */
3508
3509 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3510 {
3511 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3512
3513 /* We don't need to repeat the search if we haven't yet reached the
3514 place we found it at last time. */
3515
3516 if (p > req_char_ptr)
3517 {
3518 if (req_char != req_char2)
3519 {
3520 while (p < end_subject)
3521 {
3522 register pcre_uint32 pp = RAWUCHARINCTEST(p);
3523 if (pp == req_char || pp == req_char2) { p--; break; } /* back off to the matching character */
3524 }
3525 }
3526 else
3527 {
3528 while (p < end_subject)
3529 {
3530 if (RAWUCHARINCTEST(p) == req_char) { p--; break; } /* back off to the matching character */
3531 }
3532 }
3533
3534 /* If we can't find the required character, break the matching loop,
3535 which will cause a return of PCRE_ERROR_NOMATCH. */
3536
3537 if (p >= end_subject) break;
3538
3539 /* If we have found the required character, save the point where we
3540 found it, so that we don't search again next time round the loop if
3541 the start hasn't passed this character yet. */
3542
3543 req_char_ptr = p;
3544 }
3545 }
3546 }
3547 } /* End of optimizations that are done when not restarting */
3548
3549 /* OK, now we can do the business: run the internal matcher starting at current_subject */
3550
3551 md->start_used_ptr = current_subject;
3552 md->recursive = NULL;
3553
3554 rc = internal_dfa_exec(
3555 md, /* fixed match data */
3556 md->start_code, /* this subexpression's code */
3557 current_subject, /* where we currently are */
3558 start_offset, /* start offset in subject */
3559 offsets, /* offset vector */
3560 offsetcount, /* size of same */
3561 workspace, /* workspace vector */
3562 wscount, /* size of same */
3563 0); /* function recurse level */
3564
3565 /* Anything other than "no match" means we are done, always; otherwise, carry
3566 on only if not anchored. */
3567
3568 if (rc != PCRE_ERROR_NOMATCH || anchored)
3569 {
3570 if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2) /* pass back partial-match offsets if there is room */
3571 {
3572 offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject); /* offset of md->start_used_ptr */
3573 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject); /* end of the subject */
3574 if (offsetcount > 2)
3575 offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject); /* where this match attempt started */
3576 }
3577 return rc;
3578 }
3579
3580 /* Advance to the next subject character unless we are at the end of a line
3581 and firstline is set. */
3582
3583 if (firstline && IS_NEWLINE(current_subject)) break;
3584 current_subject++;
3585 #ifdef SUPPORT_UTF
3586 if (utf)
3587 {
3588 ACROSSCHAR(current_subject < end_subject, *current_subject,
3589 current_subject++);
3590 }
3591 #endif
3592 if (current_subject > end_subject) break; /* moved past the end of the subject: give up */
3593
3594 /* If we have just passed a CR and we are now at a LF, and the pattern does
3595 not contain any explicit matches for \r or \n, and the newline option is CRLF
3596 or ANY or ANYCRLF, advance the match position by one more character. */
3597
3598 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3599 current_subject < end_subject &&
3600 RAWUCHARTEST(current_subject) == CHAR_NL &&
3601 (re->flags & PCRE_HASCRORLF) == 0 &&
3602 (md->nltype == NLTYPE_ANY ||
3603 md->nltype == NLTYPE_ANYCRLF ||
3604 md->nllen == 2))
3605 current_subject++;
3606
3607 } /* "Bumpalong" loop */
3608
3609 return PCRE_ERROR_NOMATCH; /* reached only by breaking out of the bumpalong loop */
3610 }
3611
3612 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5