/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1365 - (show annotations)
Sun Oct 6 18:33:56 2013 UTC (6 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 125527 byte(s)
Refactor named group handling for conditional tests.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2013 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
51
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
60
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
69
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
72
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK md /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre_internal.h"
84
85
86 /* For use to indent debugging output */
87
88 #define SP " "
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
99
100 #define OP_PROP_EXTRA 300 /* added to codevalue when the type operand is OP_NOTPROP or OP_PROP */
101 #define OP_EXTUNI_EXTRA 320 /* added to codevalue when the type operand is OP_EXTUNI */
102 #define OP_ANYNL_EXTRA 340 /* added to codevalue when the type operand is OP_ANYNL */
103 #define OP_HSPACE_EXTRA 360 /* added to codevalue when the type operand is OP_NOT_HSPACE or OP_HSPACE */
104 #define OP_VSPACE_EXTRA 380 /* added to codevalue when the type operand is OP_NOT_VSPACE or OP_VSPACE */
105
106
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114
115 static const pcre_uint8 coptable[] = { /* indexed by opcode; 0 means there is no inline character to load */
116 0, /* End */
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
120 0, 0, /* \P, \p */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 0, /* \X */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
124 1, /* Char - the character is found at opcode + 1 */
125 1, /* Chari */
126 1, /* not */
127 1, /* noti */
128 /* Positive single-char repeats. An entry of 1+IMM2_SIZE means an operand of IMM2_SIZE units sits between the opcode and the character (presumably the repeat count - confirm). */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131 1+IMM2_SIZE, /* exact */
132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135 1+IMM2_SIZE, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140 1+IMM2_SIZE, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144 1+IMM2_SIZE, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats - the "character" loaded here is the type opcode (\D, \d, \S, \s, \W, \w, ...) that follows */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149 1+IMM2_SIZE, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats - from here to the end of the table no opcode has an inline character */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
154 0, /* CLASS */
155 0, /* NCLASS */
156 0, /* XCLASS - variable length */
157 0, /* REF */
158 0, /* REFI */
159 0, /* DNREF */
160 0, /* DNREFI */
161 0, /* RECURSE */
162 0, /* CALLOUT */
163 0, /* Alt */
164 0, /* Ket */
165 0, /* KetRmax */
166 0, /* KetRmin */
167 0, /* KetRpos */
168 0, /* Reverse */
169 0, /* Assert */
170 0, /* Assert not */
171 0, /* Assert behind */
172 0, /* Assert behind not */
173 0, 0, /* ONCE, ONCE_NC */
174 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
175 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
176 0, 0, /* CREF, DNCREF */
177 0, 0, /* RREF, DNRREF */
178 0, /* DEF */
179 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
180 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
181 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
182 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
183 0, 0 /* CLOSE, SKIPZERO */
184 };
185
186 /* This table identifies those opcodes that inspect a character. It is used to
187 remember the fact that a character could have been inspected when the end of
188 the subject is reached. ***NOTE*** If the start of this table is modified, the
189 two tables that follow must also be modified. */
190
191 static const pcre_uint8 poptable[] = { /* indexed by opcode; non-zero means the opcode inspects a subject character */
192 0, /* End */
193 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b - only \B and \b are flagged */
194 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
195 1, 1, 1, /* Any, AllAny, Anybyte */
196 1, 1, /* \P, \p */
197 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
198 1, /* \X */
199 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M - not flagged */
200 1, /* Char */
201 1, /* Chari */
202 1, /* not */
203 1, /* noti */
204 /* Positive single-char repeats - all flagged */
205 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
206 1, 1, 1, /* upto, minupto, exact */
207 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
208 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
209 1, 1, 1, /* upto I, minupto I, exact I */
210 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
211 /* Negative single-char repeats - only for chars < 256 */
212 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
213 1, 1, 1, /* NOT upto, minupto, exact */
214 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
215 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
216 1, 1, 1, /* NOT upto I, minupto I, exact I */
217 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
218 /* Positive type repeats */
219 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
220 1, 1, 1, /* Type upto, minupto, exact */
221 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
222 /* Character class & ref repeats */
223 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
224 1, 1, /* CRRANGE, CRMINRANGE */
225 1, /* CLASS */
226 1, /* NCLASS */
227 1, /* XCLASS - variable length */
228 0, /* REF - this entry and every one after it is unflagged */
229 0, /* REFI */
230 0, /* DNREF */
231 0, /* DNREFI */
232 0, /* RECURSE */
233 0, /* CALLOUT */
234 0, /* Alt */
235 0, /* Ket */
236 0, /* KetRmax */
237 0, /* KetRmin */
238 0, /* KetRpos */
239 0, /* Reverse */
240 0, /* Assert */
241 0, /* Assert not */
242 0, /* Assert behind */
243 0, /* Assert behind not */
244 0, 0, /* ONCE, ONCE_NC */
245 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
246 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
247 0, 0, /* CREF, DNCREF */
248 0, 0, /* RREF, DNRREF */
249 0, /* DEF */
250 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
251 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
252 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
253 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
254 0, 0 /* CLOSE, SKIPZERO */
255 };
256
257 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
258 and \w */
259
260 static const pcre_uint8 toptable1[] = {
261 0, 0, 0, 0, 0, 0,
262 ctype_digit, ctype_digit,
263 ctype_space, ctype_space,
264 ctype_word, ctype_word,
265 0, 0 /* OP_ANY, OP_ALLANY */
266 };
267
268 static const pcre_uint8 toptable2[] = {
269 0, 0, 0, 0, 0, 0,
270 ctype_digit, 0,
271 ctype_space, 0,
272 ctype_word, 0,
273 1, 1 /* OP_ANY, OP_ALLANY */
274 };
275
276
277 /* Structure for holding data about a particular state, which is in effect the
278 current data for an active path through the match tree. It must consist
279 entirely of ints because the working vector we are passed, and which we put
280 these structures in, is a vector of ints. */
281
typedef struct stateblock {
  int offset;   /* Offset to opcode; a negative value means "hold off going to
                   the (negated) state until data characters are skipped" */
  int count;    /* Count for repeats */
  int data;     /* Some use extra data (e.g. the number of characters still
                   to be skipped when offset is negative) */
} stateblock;

/* Number of ints occupied by one stateblock. The whole replacement list is
parenthesized so that the macro behaves as a single primary expression in any
context (for example as the operand of sizeof or next to a postfix operator). */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
289
290
291 #ifdef PCRE_DEBUG
292 /*************************************************
293 * Print character string *
294 *************************************************/
295
296 /* Character string printing function for debugging.
297
298 Arguments:
299 p points to string
300 length number of bytes
301 f where to print
302
303 Returns: nothing
304 */
305
306 static void
307 pchars(const pcre_uchar *p, int length, FILE *f)
308 {
309 pcre_uint32 c;
310 while (length-- > 0)
311 {
312 if (isprint(c = *(p++)))
313 fprintf(f, "%c", c);
314 else
315 fprintf(f, "\\x{%02x}", c);
316 }
317 }
318 #endif
319
320
321
322 /*************************************************
323 * Execute a Regular Expression - DFA engine *
324 *************************************************/
325
326 /* This internal function applies a compiled pattern to a subject string,
327 starting at a given point, using a DFA engine. This function is called from the
328 external one, possibly multiple times if the pattern is not anchored. The
329 function calls itself recursively for some kinds of subpattern.
330
331 Arguments:
332 md the match_data block with fixed information
333 this_start_code the opening bracket of this subexpression's code
334 current_subject where we currently are in the subject string
335 start_offset start offset in the subject string
336 offsets vector to contain the matching string offsets
337 offsetcount size of same
338 workspace vector of workspace
339 wscount size of same
340 rlevel function call recursion level
341
342 Returns: > 0 => number of match offset pairs placed in offsets
343 = 0 => offsets overflowed; longest matches are present
344 -1 => failed to match
345 < -1 => some kind of unexpected problem
346
347 The following macros are used for adding states to the two state vectors (one
348 for the current character, one for the following character). */
349
/* Append a state to the active list: x is stored as the opcode offset and y as
the repeat count; the data field is left untouched. If the list already holds
wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Must be
used as a statement, followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
359
/* Append a state to the active list, also setting its data field: x is the
opcode offset, y the repeat count and z the extra data. If the list already
holds wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
Must be used as a statement, followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
370
/* Append a state to the new list (the one used for the following character):
x is stored as the opcode offset and y as the repeat count; the data field is
left untouched. If the list already holds wscount entries, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. Must be used as a statement, followed
by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
380
/* Append a state to the new list (the one used for the following character),
also setting its data field: x is the opcode offset, y the repeat count and z
the extra data. If the list already holds wscount entries, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. Must be used as a statement, followed
by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  while (0)
392
393 /* And now, here is the code */
394
395 static int
396 internal_dfa_exec(
397 dfa_match_data *md,
398 const pcre_uchar *this_start_code,
399 const pcre_uchar *current_subject,
400 int start_offset,
401 int *offsets,
402 int offsetcount,
403 int *workspace,
404 int wscount,
405 int rlevel)
406 {
407 stateblock *active_states, *new_states, *temp_states;
408 stateblock *next_active_state, *next_new_state;
409
410 const pcre_uint8 *ctypes, *lcc, *fcc;
411 const pcre_uchar *ptr;
412 const pcre_uchar *end_code, *first_op;
413
414 dfa_recursion_info new_recursive;
415
416 int active_count, new_count, match_count;
417
418 /* Some fields in the md block are frequently referenced, so we load them into
419 independent variables in the hope that this will perform better. */
420
421 const pcre_uchar *start_subject = md->start_subject;
422 const pcre_uchar *end_subject = md->end_subject;
423 const pcre_uchar *start_code = md->start_code;
424
425 #ifdef SUPPORT_UTF
426 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
427 #else
428 BOOL utf = FALSE;
429 #endif
430
431 BOOL reset_could_continue = FALSE;
432
433 rlevel++;
434 offsetcount &= (-2);
435
436 wscount -= 2;
437 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
438 (2 * INTS_PER_STATEBLOCK);
439
440 DPRINTF(("\n%.*s---------------------\n"
441 "%.*sCall to internal_dfa_exec f=%d\n",
442 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
443
444 ctypes = md->tables + ctypes_offset;
445 lcc = md->tables + lcc_offset;
446 fcc = md->tables + fcc_offset;
447
448 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
449
450 active_states = (stateblock *)(workspace + 2);
451 next_new_state = new_states = active_states + wscount;
452 new_count = 0;
453
454 first_op = this_start_code + 1 + LINK_SIZE +
455 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
456 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
457 ? IMM2_SIZE:0);
458
459 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
460 the alternative states onto the list, and find out where the end is. This
461 makes it possible to use this function recursively, when we want to stop at a
462 matching internal ket rather than at the end.
463
464 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
465 a backward assertion. In that case, we have to find out the maximum amount to
466 move back, and set up each alternative appropriately. */
467
468 if (*first_op == OP_REVERSE)
469 {
470 int max_back = 0;
471 int gone_back;
472
473 end_code = this_start_code;
474 do
475 {
476 int back = GET(end_code, 2+LINK_SIZE);
477 if (back > max_back) max_back = back;
478 end_code += GET(end_code, 1);
479 }
480 while (*end_code == OP_ALT);
481
482 /* If we can't go back the amount required for the longest lookbehind
483 pattern, go back as far as we can; some alternatives may still be viable. */
484
485 #ifdef SUPPORT_UTF
486 /* In character mode we have to step back character by character */
487
488 if (utf)
489 {
490 for (gone_back = 0; gone_back < max_back; gone_back++)
491 {
492 if (current_subject <= start_subject) break;
493 current_subject--;
494 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
495 }
496 }
497 else
498 #endif
499
500 /* In byte-mode we can do this quickly. */
501
502 {
503 gone_back = (current_subject - max_back < start_subject)?
504 (int)(current_subject - start_subject) : max_back;
505 current_subject -= gone_back;
506 }
507
508 /* Save the earliest consulted character */
509
510 if (current_subject < md->start_used_ptr)
511 md->start_used_ptr = current_subject;
512
513 /* Now we can process the individual branches. */
514
515 end_code = this_start_code;
516 do
517 {
518 int back = GET(end_code, 2+LINK_SIZE);
519 if (back <= gone_back)
520 {
521 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
522 ADD_NEW_DATA(-bstate, 0, gone_back - back);
523 }
524 end_code += GET(end_code, 1);
525 }
526 while (*end_code == OP_ALT);
527 }
528
529 /* This is the code for a "normal" subpattern (not a backward assertion). The
530 start of a whole pattern is always one of these. If we are at the top level,
531 we may be asked to restart matching from the same point that we reached for a
532 previous partial match. We still have to scan through the top-level branches to
533 find the end state. */
534
535 else
536 {
537 end_code = this_start_code;
538
539 /* Restarting */
540
541 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
542 {
543 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
544 new_count = workspace[1];
545 if (!workspace[0])
546 memcpy(new_states, active_states, new_count * sizeof(stateblock));
547 }
548
549 /* Not restarting */
550
551 else
552 {
553 int length = 1 + LINK_SIZE +
554 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
555 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
556 ? IMM2_SIZE:0);
557 do
558 {
559 ADD_NEW((int)(end_code - start_code + length), 0);
560 end_code += GET(end_code, 1);
561 length = 1 + LINK_SIZE;
562 }
563 while (*end_code == OP_ALT);
564 }
565 }
566
567 workspace[0] = 0; /* Bit indicating which vector is current */
568
569 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
570
571 /* Loop for scanning the subject */
572
573 ptr = current_subject;
574 for (;;)
575 {
576 int i, j;
577 int clen, dlen;
578 pcre_uint32 c, d;
579 int forced_fail = 0;
580 BOOL partial_newline = FALSE;
581 BOOL could_continue = reset_could_continue;
582 reset_could_continue = FALSE;
583
584 /* Make the new state list into the active state list and empty the
585 new state list. */
586
587 temp_states = active_states;
588 active_states = new_states;
589 new_states = temp_states;
590 active_count = new_count;
591 new_count = 0;
592
593 workspace[0] ^= 1; /* Remember for the restarting feature */
594 workspace[1] = active_count;
595
596 #ifdef PCRE_DEBUG
597 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
598 pchars(ptr, STRLEN_UC(ptr), stdout);
599 printf("\"\n");
600
601 printf("%.*sActive states: ", rlevel*2-2, SP);
602 for (i = 0; i < active_count; i++)
603 printf("%d/%d ", active_states[i].offset, active_states[i].count);
604 printf("\n");
605 #endif
606
607 /* Set the pointers for adding new states */
608
609 next_active_state = active_states + active_count;
610 next_new_state = new_states;
611
612 /* Load the current character from the subject outside the loop, as many
613 different states may want to look at it, and we assume that at least one
614 will. */
615
616 if (ptr < end_subject)
617 {
618 clen = 1; /* Number of data items in the character */
619 #ifdef SUPPORT_UTF
620 GETCHARLENTEST(c, ptr, clen);
621 #else
622 c = *ptr;
623 #endif /* SUPPORT_UTF */
624 }
625 else
626 {
627 clen = 0; /* This indicates the end of the subject */
628 c = NOTACHAR; /* This value should never actually be used */
629 }
630
631 /* Scan up the active states and act on each one. The result of an action
632 may be to add more states to the currently active list (e.g. on hitting a
633 parenthesis) or it may be to put states on the new list, for considering
634 when we move the character pointer on. */
635
636 for (i = 0; i < active_count; i++)
637 {
638 stateblock *current_state = active_states + i;
639 BOOL caseless = FALSE;
640 const pcre_uchar *code;
641 int state_offset = current_state->offset;
642 int codevalue, rrc;
643 int count;
644
645 #ifdef PCRE_DEBUG
646 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
647 if (clen == 0) printf("EOL\n");
648 else if (c > 32 && c < 127) printf("'%c'\n", c);
649 else printf("0x%02x\n", c);
650 #endif
651
652 /* A negative offset is a special case meaning "hold off going to this
653 (negated) state until the number of characters in the data field have
654 been skipped". If the could_continue flag was passed over from a previous
655 state, arrange for it to be passed on. */
656
657 if (state_offset < 0)
658 {
659 if (current_state->data > 0)
660 {
661 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
662 ADD_NEW_DATA(state_offset, current_state->count,
663 current_state->data - 1);
664 if (could_continue) reset_could_continue = TRUE;
665 continue;
666 }
667 else
668 {
669 current_state->offset = state_offset = -state_offset;
670 }
671 }
672
673 /* Check for a duplicate state with the same count, and skip if found.
674 See the note at the head of this module about the possibility of improving
675 performance here. */
676
677 for (j = 0; j < i; j++)
678 {
679 if (active_states[j].offset == state_offset &&
680 active_states[j].count == current_state->count)
681 {
682 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
683 goto NEXT_ACTIVE_STATE;
684 }
685 }
686
687 /* The state offset is the offset to the opcode */
688
689 code = start_code + state_offset;
690 codevalue = *code;
691
692 /* If this opcode inspects a character, but we are at the end of the
693 subject, remember the fact for use when testing for a partial match. */
694
695 if (clen == 0 && poptable[codevalue] != 0)
696 could_continue = TRUE;
697
698 /* If this opcode is followed by an inline character, load it. It is
699 tempting to test for the presence of a subject character here, but that
700 is wrong, because sometimes zero repetitions of the subject are
701 permitted.
702
703 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
704 argument that is not a data character - but is always one byte long because
705 the values are small. We have to take special action to deal with \P, \p,
706 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
707 these ones to new opcodes. */
708
709 if (coptable[codevalue] > 0)
710 {
711 dlen = 1;
712 #ifdef SUPPORT_UTF
713 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
714 #endif /* SUPPORT_UTF */
715 d = code[coptable[codevalue]];
716 if (codevalue >= OP_TYPESTAR)
717 {
718 switch(d)
719 {
720 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
721 case OP_NOTPROP:
722 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
723 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
724 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
725 case OP_NOT_HSPACE:
726 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
727 case OP_NOT_VSPACE:
728 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
729 default: break;
730 }
731 }
732 }
733 else
734 {
735 dlen = 0; /* Not strictly necessary, but compilers moan */
736 d = NOTACHAR; /* if these variables are not set. */
737 }
738
739
740 /* Now process the individual opcodes */
741
742 switch (codevalue)
743 {
744 /* ========================================================================== */
745 /* These cases are never obeyed. This is a fudge that causes a compile-
746 time error if the vectors coptable or poptable, which are indexed by
747 opcode, are not the correct length. It seems to be the only way to do
748 such a check at compile time, as the sizeof() operator does not work
749 in the C preprocessor. */
750
751 case OP_TABLE_LENGTH:
752 case OP_TABLE_LENGTH +
753 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
754 (sizeof(poptable) == OP_TABLE_LENGTH)):
755 break;
756
757 /* ========================================================================== */
758 /* Reached a closing bracket. If not at the end of the pattern, carry
759 on with the next opcode. For repeating opcodes, also add the repeat
760 state. Note that KETRPOS will always be encountered at the end of the
761 subpattern, because the possessive subpattern repeats are always handled
762 using recursive calls. Thus, it never adds any new states.
763
764 At the end of the (sub)pattern, unless we have an empty string and
765 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
766 start of the subject, save the match data, shifting up all previous
767 matches so we always have the longest first. */
768
769 case OP_KET:
770 case OP_KETRMIN:
771 case OP_KETRMAX:
772 case OP_KETRPOS:
773 if (code != end_code)
774 {
775 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
776 if (codevalue != OP_KET)
777 {
778 ADD_ACTIVE(state_offset - GET(code, 1), 0);
779 }
780 }
781 else
782 {
783 if (ptr > current_subject ||
784 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
785 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
786 current_subject > start_subject + md->start_offset)))
787 {
788 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
789 else if (match_count > 0 && ++match_count * 2 > offsetcount)
790 match_count = 0;
791 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
792 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
793 if (offsetcount >= 2)
794 {
795 offsets[0] = (int)(current_subject - start_subject);
796 offsets[1] = (int)(ptr - start_subject);
797 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
798 offsets[1] - offsets[0], (char *)current_subject));
799 }
800 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
801 {
802 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
803 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
804 match_count, rlevel*2-2, SP));
805 return match_count;
806 }
807 }
808 }
809 break;
810
811 /* ========================================================================== */
812 /* These opcodes add to the current list of states without looking
813 at the current character. */
814
815 /*-----------------------------------------------------------------*/
816 case OP_ALT:
817 do { code += GET(code, 1); } while (*code == OP_ALT);
818 ADD_ACTIVE((int)(code - start_code), 0);
819 break;
820
821 /*-----------------------------------------------------------------*/
822 case OP_BRA:
823 case OP_SBRA:
824 do
825 {
826 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
827 code += GET(code, 1);
828 }
829 while (*code == OP_ALT);
830 break;
831
832 /*-----------------------------------------------------------------*/
833 case OP_CBRA:
834 case OP_SCBRA:
835 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
836 code += GET(code, 1);
837 while (*code == OP_ALT)
838 {
839 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
840 code += GET(code, 1);
841 }
842 break;
843
844 /*-----------------------------------------------------------------*/
845 case OP_BRAZERO:
846 case OP_BRAMINZERO:
847 ADD_ACTIVE(state_offset + 1, 0);
848 code += 1 + GET(code, 2);
849 while (*code == OP_ALT) code += GET(code, 1);
850 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
851 break;
852
853 /*-----------------------------------------------------------------*/
854 case OP_SKIPZERO:
855 code += 1 + GET(code, 2);
856 while (*code == OP_ALT) code += GET(code, 1);
857 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
858 break;
859
860 /*-----------------------------------------------------------------*/
861 case OP_CIRC:
862 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
863 { ADD_ACTIVE(state_offset + 1, 0); }
864 break;
865
866 /*-----------------------------------------------------------------*/
867 case OP_CIRCM:
868 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
869 (ptr != end_subject && WAS_NEWLINE(ptr)))
870 { ADD_ACTIVE(state_offset + 1, 0); }
871 break;
872
873 /*-----------------------------------------------------------------*/
874 case OP_EOD:
875 if (ptr >= end_subject)
876 {
877 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
878 could_continue = TRUE;
879 else { ADD_ACTIVE(state_offset + 1, 0); }
880 }
881 break;
882
883 /*-----------------------------------------------------------------*/
884 case OP_SOD:
885 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
886 break;
887
888 /*-----------------------------------------------------------------*/
889 case OP_SOM:
890 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
891 break;
892
893
894 /* ========================================================================== */
895 /* These opcodes inspect the next subject character, and sometimes
896 the previous one as well, but do not have an argument. The variable
897 clen contains the length of the current character and is zero if we are
898 at the end of the subject. */
899
900 /*-----------------------------------------------------------------*/
901 case OP_ANY:
902 if (clen > 0 && !IS_NEWLINE(ptr))
903 {
904 if (ptr + 1 >= md->end_subject &&
905 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
906 NLBLOCK->nltype == NLTYPE_FIXED &&
907 NLBLOCK->nllen == 2 &&
908 c == NLBLOCK->nl[0])
909 {
910 could_continue = partial_newline = TRUE;
911 }
912 else
913 {
914 ADD_NEW(state_offset + 1, 0);
915 }
916 }
917 break;
918
919 /*-----------------------------------------------------------------*/
920 case OP_ALLANY:
921 if (clen > 0)
922 { ADD_NEW(state_offset + 1, 0); }
923 break;
924
925 /*-----------------------------------------------------------------*/
926 case OP_EODN:
927 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
928 could_continue = TRUE;
929 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
930 { ADD_ACTIVE(state_offset + 1, 0); }
931 break;
932
933 /*-----------------------------------------------------------------*/
934 case OP_DOLL:
935 if ((md->moptions & PCRE_NOTEOL) == 0)
936 {
937 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
938 could_continue = TRUE;
939 else if (clen == 0 ||
940 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
941 (ptr == end_subject - md->nllen)
942 ))
943 { ADD_ACTIVE(state_offset + 1, 0); }
944 else if (ptr + 1 >= md->end_subject &&
945 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
946 NLBLOCK->nltype == NLTYPE_FIXED &&
947 NLBLOCK->nllen == 2 &&
948 c == NLBLOCK->nl[0])
949 {
950 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
951 {
952 reset_could_continue = TRUE;
953 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
954 }
955 else could_continue = partial_newline = TRUE;
956 }
957 }
958 break;
959
960 /*-----------------------------------------------------------------*/
961 case OP_DOLLM:
962 if ((md->moptions & PCRE_NOTEOL) == 0)
963 {
964 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
965 could_continue = TRUE;
966 else if (clen == 0 ||
967 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
968 { ADD_ACTIVE(state_offset + 1, 0); }
969 else if (ptr + 1 >= md->end_subject &&
970 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
971 NLBLOCK->nltype == NLTYPE_FIXED &&
972 NLBLOCK->nllen == 2 &&
973 c == NLBLOCK->nl[0])
974 {
975 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
976 {
977 reset_could_continue = TRUE;
978 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
979 }
980 else could_continue = partial_newline = TRUE;
981 }
982 }
983 else if (IS_NEWLINE(ptr))
984 { ADD_ACTIVE(state_offset + 1, 0); }
985 break;
986
987 /*-----------------------------------------------------------------*/
988
989 case OP_DIGIT:
990 case OP_WHITESPACE:
991 case OP_WORDCHAR:
992 if (clen > 0 && c < 256 &&
993 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
994 { ADD_NEW(state_offset + 1, 0); }
995 break;
996
997 /*-----------------------------------------------------------------*/
998 case OP_NOT_DIGIT:
999 case OP_NOT_WHITESPACE:
1000 case OP_NOT_WORDCHAR:
1001 if (clen > 0 && (c >= 256 ||
1002 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1003 { ADD_NEW(state_offset + 1, 0); }
1004 break;
1005
1006 /*-----------------------------------------------------------------*/
1007 case OP_WORD_BOUNDARY:
1008 case OP_NOT_WORD_BOUNDARY:
1009 {
1010 int left_word, right_word;
1011
1012 if (ptr > start_subject)
1013 {
1014 const pcre_uchar *temp = ptr - 1;
1015 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1016 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1017 if (utf) { BACKCHAR(temp); }
1018 #endif
1019 GETCHARTEST(d, temp);
1020 #ifdef SUPPORT_UCP
1021 if ((md->poptions & PCRE_UCP) != 0)
1022 {
1023 if (d == '_') left_word = TRUE; else
1024 {
1025 int cat = UCD_CATEGORY(d);
1026 left_word = (cat == ucp_L || cat == ucp_N);
1027 }
1028 }
1029 else
1030 #endif
1031 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1032 }
1033 else left_word = FALSE;
1034
1035 if (clen > 0)
1036 {
1037 #ifdef SUPPORT_UCP
1038 if ((md->poptions & PCRE_UCP) != 0)
1039 {
1040 if (c == '_') right_word = TRUE; else
1041 {
1042 int cat = UCD_CATEGORY(c);
1043 right_word = (cat == ucp_L || cat == ucp_N);
1044 }
1045 }
1046 else
1047 #endif
1048 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1049 }
1050 else right_word = FALSE;
1051
1052 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1053 { ADD_ACTIVE(state_offset + 1, 0); }
1054 }
1055 break;
1056
1057
1058 /*-----------------------------------------------------------------*/
1059 /* Check the next character by Unicode property. We will get here only
1060 if the support is in the binary; otherwise a compile-time error occurs.
1061 */
1062
1063 #ifdef SUPPORT_UCP
1064 case OP_PROP:
1065 case OP_NOTPROP:
1066 if (clen > 0)
1067 {
1068 BOOL OK;
1069 const pcre_uint32 *cp;
1070 const ucd_record * prop = GET_UCD(c);
1071 switch(code[1])
1072 {
1073 case PT_ANY:
1074 OK = TRUE;
1075 break;
1076
1077 case PT_LAMP:
1078 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1079 prop->chartype == ucp_Lt;
1080 break;
1081
1082 case PT_GC:
1083 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1084 break;
1085
1086 case PT_PC:
1087 OK = prop->chartype == code[2];
1088 break;
1089
1090 case PT_SC:
1091 OK = prop->script == code[2];
1092 break;
1093
1094 /* These are specials for combination cases. */
1095
1096 case PT_ALNUM:
1097 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1098 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1099 break;
1100
1101 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1102 which means that Perl space and POSIX space are now identical. PCRE
1103 was changed at release 8.34. */
1104
1105 case PT_SPACE: /* Perl space */
1106 case PT_PXSPACE: /* POSIX space */
1107 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1108 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1109 c == CHAR_FF || c == CHAR_CR;
1110 break;
1111
1112 case PT_WORD:
1113 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1114 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1115 c == CHAR_UNDERSCORE;
1116 break;
1117
1118 case PT_CLIST:
1119 cp = PRIV(ucd_caseless_sets) + code[2];
1120 for (;;)
1121 {
1122 if (c < *cp) { OK = FALSE; break; }
1123 if (c == *cp++) { OK = TRUE; break; }
1124 }
1125 break;
1126
1127 case PT_UCNC:
1128 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1129 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1130 c >= 0xe000;
1131 break;
1132
1133 /* Should never occur, but keep compilers from grumbling. */
1134
1135 default:
1136 OK = codevalue != OP_PROP;
1137 break;
1138 }
1139
1140 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1141 }
1142 break;
1143 #endif
1144
1145
1146
1147 /* ========================================================================== */
1148 /* These opcodes likewise inspect the subject character, but have an
1149 argument that is not a data character. It is one of these opcodes:
1150 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1151 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1152
1153 case OP_TYPEPLUS:
1154 case OP_TYPEMINPLUS:
1155 case OP_TYPEPOSPLUS:
1156 count = current_state->count; /* Already matched */
1157 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1158 if (clen > 0)
1159 {
1160 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1161 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1162 NLBLOCK->nltype == NLTYPE_FIXED &&
1163 NLBLOCK->nllen == 2 &&
1164 c == NLBLOCK->nl[0])
1165 {
1166 could_continue = partial_newline = TRUE;
1167 }
1168 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1169 (c < 256 &&
1170 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1171 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1172 {
1173 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1174 {
1175 active_count--; /* Remove non-match possibility */
1176 next_active_state--;
1177 }
1178 count++;
1179 ADD_NEW(state_offset, count);
1180 }
1181 }
1182 break;
1183
1184 /*-----------------------------------------------------------------*/
1185 case OP_TYPEQUERY:
1186 case OP_TYPEMINQUERY:
1187 case OP_TYPEPOSQUERY:
1188 ADD_ACTIVE(state_offset + 2, 0);
1189 if (clen > 0)
1190 {
1191 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1192 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1193 NLBLOCK->nltype == NLTYPE_FIXED &&
1194 NLBLOCK->nllen == 2 &&
1195 c == NLBLOCK->nl[0])
1196 {
1197 could_continue = partial_newline = TRUE;
1198 }
1199 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1200 (c < 256 &&
1201 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1202 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1203 {
1204 if (codevalue == OP_TYPEPOSQUERY)
1205 {
1206 active_count--; /* Remove non-match possibility */
1207 next_active_state--;
1208 }
1209 ADD_NEW(state_offset + 2, 0);
1210 }
1211 }
1212 break;
1213
1214 /*-----------------------------------------------------------------*/
1215 case OP_TYPESTAR:
1216 case OP_TYPEMINSTAR:
1217 case OP_TYPEPOSSTAR:
1218 ADD_ACTIVE(state_offset + 2, 0);
1219 if (clen > 0)
1220 {
1221 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1222 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1223 NLBLOCK->nltype == NLTYPE_FIXED &&
1224 NLBLOCK->nllen == 2 &&
1225 c == NLBLOCK->nl[0])
1226 {
1227 could_continue = partial_newline = TRUE;
1228 }
1229 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1230 (c < 256 &&
1231 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1232 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1233 {
1234 if (codevalue == OP_TYPEPOSSTAR)
1235 {
1236 active_count--; /* Remove non-match possibility */
1237 next_active_state--;
1238 }
1239 ADD_NEW(state_offset, 0);
1240 }
1241 }
1242 break;
1243
1244 /*-----------------------------------------------------------------*/
1245 case OP_TYPEEXACT:
1246 count = current_state->count; /* Number already matched */
1247 if (clen > 0)
1248 {
1249 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1250 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1251 NLBLOCK->nltype == NLTYPE_FIXED &&
1252 NLBLOCK->nllen == 2 &&
1253 c == NLBLOCK->nl[0])
1254 {
1255 could_continue = partial_newline = TRUE;
1256 }
1257 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1258 (c < 256 &&
1259 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1260 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1261 {
1262 if (++count >= (int)GET2(code, 1))
1263 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1264 else
1265 { ADD_NEW(state_offset, count); }
1266 }
1267 }
1268 break;
1269
1270 /*-----------------------------------------------------------------*/
1271 case OP_TYPEUPTO:
1272 case OP_TYPEMINUPTO:
1273 case OP_TYPEPOSUPTO:
1274 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1275 count = current_state->count; /* Number already matched */
1276 if (clen > 0)
1277 {
1278 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1279 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1280 NLBLOCK->nltype == NLTYPE_FIXED &&
1281 NLBLOCK->nllen == 2 &&
1282 c == NLBLOCK->nl[0])
1283 {
1284 could_continue = partial_newline = TRUE;
1285 }
1286 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1287 (c < 256 &&
1288 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1289 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1290 {
1291 if (codevalue == OP_TYPEPOSUPTO)
1292 {
1293 active_count--; /* Remove non-match possibility */
1294 next_active_state--;
1295 }
1296 if (++count >= (int)GET2(code, 1))
1297 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1298 else
1299 { ADD_NEW(state_offset, count); }
1300 }
1301 }
1302 break;
1303
1304 /* ========================================================================== */
1305 /* These are virtual opcodes that are used when something like
1306 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1307 OP_VSPACE/OP_HSPACE types as its argument. This keeps the code above fast
1308 for the other cases. The argument is in the d variable. */
1309
1310 #ifdef SUPPORT_UCP
1311 case OP_PROP_EXTRA + OP_TYPEPLUS:
1312 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1313 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1314 count = current_state->count; /* Already matched */
1315 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1316 if (clen > 0)
1317 {
1318 BOOL OK;
1319 const pcre_uint32 *cp;
1320 const ucd_record * prop = GET_UCD(c);
1321 switch(code[2])
1322 {
1323 case PT_ANY:
1324 OK = TRUE;
1325 break;
1326
1327 case PT_LAMP:
1328 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1329 prop->chartype == ucp_Lt;
1330 break;
1331
1332 case PT_GC:
1333 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1334 break;
1335
1336 case PT_PC:
1337 OK = prop->chartype == code[3];
1338 break;
1339
1340 case PT_SC:
1341 OK = prop->script == code[3];
1342 break;
1343
1344 /* These are specials for combination cases. */
1345
1346 case PT_ALNUM:
1347 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1348 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1349 break;
1350
1351 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1352 which means that Perl space and POSIX space are now identical. PCRE
1353 was changed at release 8.34. */
1354
1355 case PT_SPACE: /* Perl space */
1356 case PT_PXSPACE: /* POSIX space */
1357 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1358 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1359 c == CHAR_FF || c == CHAR_CR;
1360 break;
1361
1362 case PT_WORD:
1363 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1364 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1365 c == CHAR_UNDERSCORE;
1366 break;
1367
1368 case PT_CLIST:
1369 cp = PRIV(ucd_caseless_sets) + code[3];
1370 for (;;)
1371 {
1372 if (c < *cp) { OK = FALSE; break; }
1373 if (c == *cp++) { OK = TRUE; break; }
1374 }
1375 break;
1376
1377 case PT_UCNC:
1378 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1379 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1380 c >= 0xe000;
1381 break;
1382
1383 /* Should never occur, but keep compilers from grumbling. */
1384
1385 default:
1386 OK = codevalue != OP_PROP;
1387 break;
1388 }
1389
1390 if (OK == (d == OP_PROP))
1391 {
1392 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1393 {
1394 active_count--; /* Remove non-match possibility */
1395 next_active_state--;
1396 }
1397 count++;
1398 ADD_NEW(state_offset, count);
1399 }
1400 }
1401 break;
1402
1403 /*-----------------------------------------------------------------*/
1404 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1405 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1406 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1407 count = current_state->count; /* Already matched */
1408 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1409 if (clen > 0)
1410 {
1411 int lgb, rgb;
1412 const pcre_uchar *nptr = ptr + clen;
1413 int ncount = 0;
1414 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1415 {
1416 active_count--; /* Remove non-match possibility */
1417 next_active_state--;
1418 }
1419 lgb = UCD_GRAPHBREAK(c);
1420 while (nptr < end_subject)
1421 {
1422 dlen = 1;
1423 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1424 rgb = UCD_GRAPHBREAK(d);
1425 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1426 ncount++;
1427 lgb = rgb;
1428 nptr += dlen;
1429 }
1430 count++;
1431 ADD_NEW_DATA(-state_offset, count, ncount);
1432 }
1433 break;
1434 #endif
1435
1436 /*-----------------------------------------------------------------*/
1437 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1438 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1439 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1440 count = current_state->count; /* Already matched */
1441 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1442 if (clen > 0)
1443 {
1444 int ncount = 0;
1445 switch (c)
1446 {
1447 case CHAR_VT:
1448 case CHAR_FF:
1449 case CHAR_NEL:
1450 #ifndef EBCDIC
1451 case 0x2028:
1452 case 0x2029:
1453 #endif /* Not EBCDIC */
1454 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1455 goto ANYNL01;
1456
1457 case CHAR_CR:
1458 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1459 /* Fall through */
1460
1461 ANYNL01:
1462 case CHAR_LF:
1463 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1464 {
1465 active_count--; /* Remove non-match possibility */
1466 next_active_state--;
1467 }
1468 count++;
1469 ADD_NEW_DATA(-state_offset, count, ncount);
1470 break;
1471
1472 default:
1473 break;
1474 }
1475 }
1476 break;
1477
1478 /*-----------------------------------------------------------------*/
1479 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1480 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1481 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1482 count = current_state->count; /* Already matched */
1483 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1484 if (clen > 0)
1485 {
1486 BOOL OK;
1487 switch (c)
1488 {
1489 VSPACE_CASES:
1490 OK = TRUE;
1491 break;
1492
1493 default:
1494 OK = FALSE;
1495 break;
1496 }
1497
1498 if (OK == (d == OP_VSPACE))
1499 {
1500 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1501 {
1502 active_count--; /* Remove non-match possibility */
1503 next_active_state--;
1504 }
1505 count++;
1506 ADD_NEW_DATA(-state_offset, count, 0);
1507 }
1508 }
1509 break;
1510
1511 /*-----------------------------------------------------------------*/
1512 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1513 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1514 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1515 count = current_state->count; /* Already matched */
1516 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1517 if (clen > 0)
1518 {
1519 BOOL OK;
1520 switch (c)
1521 {
1522 HSPACE_CASES:
1523 OK = TRUE;
1524 break;
1525
1526 default:
1527 OK = FALSE;
1528 break;
1529 }
1530
1531 if (OK == (d == OP_HSPACE))
1532 {
1533 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1534 {
1535 active_count--; /* Remove non-match possibility */
1536 next_active_state--;
1537 }
1538 count++;
1539 ADD_NEW_DATA(-state_offset, count, 0);
1540 }
1541 }
1542 break;
1543
1544 /*-----------------------------------------------------------------*/
1545 #ifdef SUPPORT_UCP
1546 case OP_PROP_EXTRA + OP_TYPEQUERY:
1547 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1548 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1549 count = 4;
1550 goto QS1;
1551
1552 case OP_PROP_EXTRA + OP_TYPESTAR:
1553 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1554 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1555 count = 0;
1556
1557 QS1:
1558
1559 ADD_ACTIVE(state_offset + 4, 0);
1560 if (clen > 0)
1561 {
1562 BOOL OK;
1563 const pcre_uint32 *cp;
1564 const ucd_record * prop = GET_UCD(c);
1565 switch(code[2])
1566 {
1567 case PT_ANY:
1568 OK = TRUE;
1569 break;
1570
1571 case PT_LAMP:
1572 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1573 prop->chartype == ucp_Lt;
1574 break;
1575
1576 case PT_GC:
1577 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1578 break;
1579
1580 case PT_PC:
1581 OK = prop->chartype == code[3];
1582 break;
1583
1584 case PT_SC:
1585 OK = prop->script == code[3];
1586 break;
1587
1588 /* These are specials for combination cases. */
1589
1590 case PT_ALNUM:
1591 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1593 break;
1594
1595 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1596 which means that Perl space and POSIX space are now identical. PCRE
1597 was changed at release 8.34. */
1598
1599 case PT_SPACE: /* Perl space */
1600 case PT_PXSPACE: /* POSIX space */
1601 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1602 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1603 c == CHAR_FF || c == CHAR_CR;
1604 break;
1605
1606 case PT_WORD:
1607 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1608 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1609 c == CHAR_UNDERSCORE;
1610 break;
1611
1612 case PT_CLIST:
1613 cp = PRIV(ucd_caseless_sets) + code[3];
1614 for (;;)
1615 {
1616 if (c < *cp) { OK = FALSE; break; }
1617 if (c == *cp++) { OK = TRUE; break; }
1618 }
1619 break;
1620
1621 case PT_UCNC:
1622 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1623 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1624 c >= 0xe000;
1625 break;
1626
1627 /* Should never occur, but keep compilers from grumbling. */
1628
1629 default:
1630 OK = codevalue != OP_PROP;
1631 break;
1632 }
1633
1634 if (OK == (d == OP_PROP))
1635 {
1636 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1637 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1638 {
1639 active_count--; /* Remove non-match possibility */
1640 next_active_state--;
1641 }
1642 ADD_NEW(state_offset + count, 0);
1643 }
1644 }
1645 break;
1646
1647 /*-----------------------------------------------------------------*/
1648 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1649 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1650 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1651 count = 2;
1652 goto QS2;
1653
1654 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1655 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1656 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1657 count = 0;
1658
1659 QS2:
1660
1661 ADD_ACTIVE(state_offset + 2, 0);
1662 if (clen > 0)
1663 {
1664 int lgb, rgb;
1665 const pcre_uchar *nptr = ptr + clen;
1666 int ncount = 0;
1667 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1668 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1669 {
1670 active_count--; /* Remove non-match possibility */
1671 next_active_state--;
1672 }
1673 lgb = UCD_GRAPHBREAK(c);
1674 while (nptr < end_subject)
1675 {
1676 dlen = 1;
1677 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1678 rgb = UCD_GRAPHBREAK(d);
1679 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1680 ncount++;
1681 lgb = rgb;
1682 nptr += dlen;
1683 }
1684 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1685 }
1686 break;
1687 #endif
1688
1689 /*-----------------------------------------------------------------*/
1690 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1691 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1692 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1693 count = 2;
1694 goto QS3;
1695
1696 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1697 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1698 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1699 count = 0;
1700
1701 QS3:
1702 ADD_ACTIVE(state_offset + 2, 0);
1703 if (clen > 0)
1704 {
1705 int ncount = 0;
1706 switch (c)
1707 {
1708 case CHAR_VT:
1709 case CHAR_FF:
1710 case CHAR_NEL:
1711 #ifndef EBCDIC
1712 case 0x2028:
1713 case 0x2029:
1714 #endif /* Not EBCDIC */
1715 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1716 goto ANYNL02;
1717
1718 case CHAR_CR:
1719 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1720 /* Fall through */
1721
1722 ANYNL02:
1723 case CHAR_LF:
1724 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1725 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1726 {
1727 active_count--; /* Remove non-match possibility */
1728 next_active_state--;
1729 }
1730 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1731 break;
1732
1733 default:
1734 break;
1735 }
1736 }
1737 break;
1738
1739 /*-----------------------------------------------------------------*/
1740 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1741 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1742 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1743 count = 2;
1744 goto QS4;
1745
1746 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1747 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1748 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1749 count = 0;
1750
1751 QS4:
1752 ADD_ACTIVE(state_offset + 2, 0);
1753 if (clen > 0)
1754 {
1755 BOOL OK;
1756 switch (c)
1757 {
1758 VSPACE_CASES:
1759 OK = TRUE;
1760 break;
1761
1762 default:
1763 OK = FALSE;
1764 break;
1765 }
1766 if (OK == (d == OP_VSPACE))
1767 {
1768 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1769 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1770 {
1771 active_count--; /* Remove non-match possibility */
1772 next_active_state--;
1773 }
1774 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1775 }
1776 }
1777 break;
1778
1779 /*-----------------------------------------------------------------*/
1780 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1781 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1782 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1783 count = 2;
1784 goto QS5;
1785
1786 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1787 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1788 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1789 count = 0;
1790
1791 QS5:
1792 ADD_ACTIVE(state_offset + 2, 0);
1793 if (clen > 0)
1794 {
1795 BOOL OK;
1796 switch (c)
1797 {
1798 HSPACE_CASES:
1799 OK = TRUE;
1800 break;
1801
1802 default:
1803 OK = FALSE;
1804 break;
1805 }
1806
1807 if (OK == (d == OP_HSPACE))
1808 {
1809 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1810 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1811 {
1812 active_count--; /* Remove non-match possibility */
1813 next_active_state--;
1814 }
1815 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1816 }
1817 }
1818 break;
1819
1820 /*-----------------------------------------------------------------*/
1821 #ifdef SUPPORT_UCP
1822 case OP_PROP_EXTRA + OP_TYPEEXACT:
1823 case OP_PROP_EXTRA + OP_TYPEUPTO:
1824 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1825 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1826 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1827 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1828 count = current_state->count; /* Number already matched */
1829 if (clen > 0)
1830 {
1831 BOOL OK;
1832 const pcre_uint32 *cp;
1833 const ucd_record * prop = GET_UCD(c);
1834 switch(code[1 + IMM2_SIZE + 1])
1835 {
1836 case PT_ANY:
1837 OK = TRUE;
1838 break;
1839
1840 case PT_LAMP:
1841 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1842 prop->chartype == ucp_Lt;
1843 break;
1844
1845 case PT_GC:
1846 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1847 break;
1848
1849 case PT_PC:
1850 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1851 break;
1852
1853 case PT_SC:
1854 OK = prop->script == code[1 + IMM2_SIZE + 2];
1855 break;
1856
1857 /* These are specials for combination cases. */
1858
1859 case PT_ALNUM:
1860 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1861 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1862 break;
1863
1864 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1865 which means that Perl space and POSIX space are now identical. PCRE
1866 was changed at release 8.34. */
1867
1868 case PT_SPACE: /* Perl space */
1869 case PT_PXSPACE: /* POSIX space */
1870 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1871 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1872 c == CHAR_FF || c == CHAR_CR;
1873 break;
1874
1875 case PT_WORD:
1876 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1877 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1878 c == CHAR_UNDERSCORE;
1879 break;
1880
1881 case PT_CLIST:
1882 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1883 for (;;)
1884 {
1885 if (c < *cp) { OK = FALSE; break; }
1886 if (c == *cp++) { OK = TRUE; break; }
1887 }
1888 break;
1889
1890 case PT_UCNC:
1891 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1892 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1893 c >= 0xe000;
1894 break;
1895
1896 /* Should never occur, but keep compilers from grumbling. */
1897
1898 default:
1899 OK = codevalue != OP_PROP;
1900 break;
1901 }
1902
1903 if (OK == (d == OP_PROP))
1904 {
1905 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1906 {
1907 active_count--; /* Remove non-match possibility */
1908 next_active_state--;
1909 }
1910 if (++count >= (int)GET2(code, 1))
1911 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1912 else
1913 { ADD_NEW(state_offset, count); }
1914 }
1915 }
1916 break;
1917
1918 /*-----------------------------------------------------------------*/
1919 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1920 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1921 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1922 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1923 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1924 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925 count = current_state->count; /* Number already matched */
1926 if (clen > 0)
1927 {
1928 int lgb, rgb;
1929 const pcre_uchar *nptr = ptr + clen;
1930 int ncount = 0;
1931 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1932 {
1933 active_count--; /* Remove non-match possibility */
1934 next_active_state--;
1935 }
1936 lgb = UCD_GRAPHBREAK(c);
1937 while (nptr < end_subject)
1938 {
1939 dlen = 1;
1940 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1941 rgb = UCD_GRAPHBREAK(d);
1942 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1943 ncount++;
1944 lgb = rgb;
1945 nptr += dlen;
1946 }
1947 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1948 reset_could_continue = TRUE;
1949 if (++count >= (int)GET2(code, 1))
1950 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1951 else
1952 { ADD_NEW_DATA(-state_offset, count, ncount); }
1953 }
1954 break;
1955 #endif
1956
1957 /*-----------------------------------------------------------------*/
1958 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1959 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1960 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1961 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1962 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1963 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1964 count = current_state->count; /* Number already matched */
1965 if (clen > 0)
1966 {
1967 int ncount = 0;
1968 switch (c)
1969 {
1970 case CHAR_VT:
1971 case CHAR_FF:
1972 case CHAR_NEL:
1973 #ifndef EBCDIC
1974 case 0x2028:
1975 case 0x2029:
1976 #endif /* Not EBCDIC */
1977 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1978 goto ANYNL03;
1979
1980 case CHAR_CR:
1981 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1982 /* Fall through */
1983
1984 ANYNL03:
1985 case CHAR_LF:
1986 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1987 {
1988 active_count--; /* Remove non-match possibility */
1989 next_active_state--;
1990 }
1991 if (++count >= (int)GET2(code, 1))
1992 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1993 else
1994 { ADD_NEW_DATA(-state_offset, count, ncount); }
1995 break;
1996
1997 default:
1998 break;
1999 }
2000 }
2001 break;
2002
2003 /*-----------------------------------------------------------------*/
2004 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2005 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2006 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2007 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2008 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2009 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2010 count = current_state->count; /* Number already matched */
2011 if (clen > 0)
2012 {
2013 BOOL OK;
2014 switch (c)
2015 {
2016 VSPACE_CASES:
2017 OK = TRUE;
2018 break;
2019
2020 default:
2021 OK = FALSE;
2022 }
2023
2024 if (OK == (d == OP_VSPACE))
2025 {
2026 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2027 {
2028 active_count--; /* Remove non-match possibility */
2029 next_active_state--;
2030 }
2031 if (++count >= (int)GET2(code, 1))
2032 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2033 else
2034 { ADD_NEW_DATA(-state_offset, count, 0); }
2035 }
2036 }
2037 break;
2038
2039 /*-----------------------------------------------------------------*/
2040 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2041 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2042 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2043 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2044 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2045 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2046 count = current_state->count; /* Number already matched */
2047 if (clen > 0)
2048 {
2049 BOOL OK;
2050 switch (c)
2051 {
2052 HSPACE_CASES:
2053 OK = TRUE;
2054 break;
2055
2056 default:
2057 OK = FALSE;
2058 break;
2059 }
2060
2061 if (OK == (d == OP_HSPACE))
2062 {
2063 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2064 {
2065 active_count--; /* Remove non-match possibility */
2066 next_active_state--;
2067 }
2068 if (++count >= (int)GET2(code, 1))
2069 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2070 else
2071 { ADD_NEW_DATA(-state_offset, count, 0); }
2072 }
2073 }
2074 break;
2075
2076 /* ========================================================================== */
2077 /* These opcodes are followed by a character that is usually compared
2078 to the current subject character; it is loaded into d. We still get
2079 here even if there is no subject character, because in some cases zero
2080 repetitions are permitted. */
2081
2082 /*-----------------------------------------------------------------*/
2083 case OP_CHAR:
2084 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2085 break;
2086
2087 /*-----------------------------------------------------------------*/
2088 case OP_CHARI:
2089 if (clen == 0) break;
2090
2091 #ifdef SUPPORT_UTF
2092 if (utf)
2093 {
2094 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2095 {
2096 unsigned int othercase;
2097 if (c < 128)
2098 othercase = fcc[c];
2099 else
2100 /* If we have Unicode property support, we can use it to test the
2101 other case of the character. */
2102 #ifdef SUPPORT_UCP
2103 othercase = UCD_OTHERCASE(c);
2104 #else
2105 othercase = NOTACHAR;
2106 #endif
2107
2108 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2109 }
2110 }
2111 else
2112 #endif /* SUPPORT_UTF */
2113 /* Not UTF mode */
2114 {
2115 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2116 { ADD_NEW(state_offset + 2, 0); }
2117 }
2118 break;
2119
2120
2121 #ifdef SUPPORT_UCP
2122 /*-----------------------------------------------------------------*/
2123 /* This is a tricky one because it can match more than one character.
2124 Find out how many characters to skip, and then set up a negative state
2125 to wait for them to pass before continuing. */
2126
2127 case OP_EXTUNI:
2128 if (clen > 0)
2129 {
2130 int lgb, rgb;
2131 const pcre_uchar *nptr = ptr + clen;
2132 int ncount = 0;
2133 lgb = UCD_GRAPHBREAK(c);
2134 while (nptr < end_subject)
2135 {
2136 dlen = 1;
2137 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2138 rgb = UCD_GRAPHBREAK(d);
2139 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2140 ncount++;
2141 lgb = rgb;
2142 nptr += dlen;
2143 }
2144 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2145 reset_could_continue = TRUE;
2146 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2147 }
2148 break;
2149 #endif
2150
2151 /*-----------------------------------------------------------------*/
2152 /* This is tricky, like EXTUNI, because it too can match more than one
2153 character (when CR is followed by LF). In this case, set up a negative
2154 state to wait for one character to pass before continuing. */
2155
2156 case OP_ANYNL:
2157 if (clen > 0) switch(c)
2158 {
2159 case CHAR_VT:
2160 case CHAR_FF:
2161 case CHAR_NEL:
2162 #ifndef EBCDIC
2163 case 0x2028:
2164 case 0x2029:
2165 #endif /* Not EBCDIC */
2166 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2167
2168 case CHAR_LF:
2169 ADD_NEW(state_offset + 1, 0);
2170 break;
2171
2172 case CHAR_CR:
2173 if (ptr + 1 >= end_subject)
2174 {
2175 ADD_NEW(state_offset + 1, 0);
2176 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2177 reset_could_continue = TRUE;
2178 }
2179 else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2180 {
2181 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2182 }
2183 else
2184 {
2185 ADD_NEW(state_offset + 1, 0);
2186 }
2187 break;
2188 }
2189 break;
2190
2191 /*-----------------------------------------------------------------*/
2192 case OP_NOT_VSPACE:
2193 if (clen > 0) switch(c)
2194 {
2195 VSPACE_CASES:
2196 break;
2197
2198 default:
2199 ADD_NEW(state_offset + 1, 0);
2200 break;
2201 }
2202 break;
2203
2204 /*-----------------------------------------------------------------*/
2205 case OP_VSPACE:
2206 if (clen > 0) switch(c)
2207 {
2208 VSPACE_CASES:
2209 ADD_NEW(state_offset + 1, 0);
2210 break;
2211
2212 default:
2213 break;
2214 }
2215 break;
2216
2217 /*-----------------------------------------------------------------*/
2218 case OP_NOT_HSPACE:
2219 if (clen > 0) switch(c)
2220 {
2221 HSPACE_CASES:
2222 break;
2223
2224 default:
2225 ADD_NEW(state_offset + 1, 0);
2226 break;
2227 }
2228 break;
2229
2230 /*-----------------------------------------------------------------*/
2231 case OP_HSPACE:
2232 if (clen > 0) switch(c)
2233 {
2234 HSPACE_CASES:
2235 ADD_NEW(state_offset + 1, 0);
2236 break;
2237
2238 default:
2239 break;
2240 }
2241 break;
2242
2243 /*-----------------------------------------------------------------*/
2244 /* Match a negated single character casefully. */
2245
2246 case OP_NOT:
2247 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2248 break;
2249
2250 /*-----------------------------------------------------------------*/
2251 /* Match a negated single character caselessly. */
2252
2253 case OP_NOTI:
2254 if (clen > 0)
2255 {
2256 unsigned int otherd;
2257 #ifdef SUPPORT_UTF
2258 if (utf && d >= 128)
2259 {
2260 #ifdef SUPPORT_UCP
2261 otherd = UCD_OTHERCASE(d);
2262 #endif /* SUPPORT_UCP */
2263 }
2264 else
2265 #endif /* SUPPORT_UTF */
2266 otherd = TABLE_GET(d, fcc, d);
2267 if (c != d && c != otherd)
2268 { ADD_NEW(state_offset + dlen + 1, 0); }
2269 }
2270 break;
2271
2272 /*-----------------------------------------------------------------*/
2273 case OP_PLUSI:
2274 case OP_MINPLUSI:
2275 case OP_POSPLUSI:
2276 case OP_NOTPLUSI:
2277 case OP_NOTMINPLUSI:
2278 case OP_NOTPOSPLUSI:
2279 caseless = TRUE;
2280 codevalue -= OP_STARI - OP_STAR;
2281
2282 /* Fall through */
2283 case OP_PLUS:
2284 case OP_MINPLUS:
2285 case OP_POSPLUS:
2286 case OP_NOTPLUS:
2287 case OP_NOTMINPLUS:
2288 case OP_NOTPOSPLUS:
2289 count = current_state->count; /* Already matched */
2290 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2291 if (clen > 0)
2292 {
2293 pcre_uint32 otherd = NOTACHAR;
2294 if (caseless)
2295 {
2296 #ifdef SUPPORT_UTF
2297 if (utf && d >= 128)
2298 {
2299 #ifdef SUPPORT_UCP
2300 otherd = UCD_OTHERCASE(d);
2301 #endif /* SUPPORT_UCP */
2302 }
2303 else
2304 #endif /* SUPPORT_UTF */
2305 otherd = TABLE_GET(d, fcc, d);
2306 }
2307 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2308 {
2309 if (count > 0 &&
2310 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2311 {
2312 active_count--; /* Remove non-match possibility */
2313 next_active_state--;
2314 }
2315 count++;
2316 ADD_NEW(state_offset, count);
2317 }
2318 }
2319 break;
2320
2321 /*-----------------------------------------------------------------*/
2322 case OP_QUERYI:
2323 case OP_MINQUERYI:
2324 case OP_POSQUERYI:
2325 case OP_NOTQUERYI:
2326 case OP_NOTMINQUERYI:
2327 case OP_NOTPOSQUERYI:
2328 caseless = TRUE;
2329 codevalue -= OP_STARI - OP_STAR;
2330 /* Fall through */
2331 case OP_QUERY:
2332 case OP_MINQUERY:
2333 case OP_POSQUERY:
2334 case OP_NOTQUERY:
2335 case OP_NOTMINQUERY:
2336 case OP_NOTPOSQUERY:
2337 ADD_ACTIVE(state_offset + dlen + 1, 0);
2338 if (clen > 0)
2339 {
2340 pcre_uint32 otherd = NOTACHAR;
2341 if (caseless)
2342 {
2343 #ifdef SUPPORT_UTF
2344 if (utf && d >= 128)
2345 {
2346 #ifdef SUPPORT_UCP
2347 otherd = UCD_OTHERCASE(d);
2348 #endif /* SUPPORT_UCP */
2349 }
2350 else
2351 #endif /* SUPPORT_UTF */
2352 otherd = TABLE_GET(d, fcc, d);
2353 }
2354 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2355 {
2356 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2357 {
2358 active_count--; /* Remove non-match possibility */
2359 next_active_state--;
2360 }
2361 ADD_NEW(state_offset + dlen + 1, 0);
2362 }
2363 }
2364 break;
2365
2366 /*-----------------------------------------------------------------*/
2367 case OP_STARI:
2368 case OP_MINSTARI:
2369 case OP_POSSTARI:
2370 case OP_NOTSTARI:
2371 case OP_NOTMINSTARI:
2372 case OP_NOTPOSSTARI:
2373 caseless = TRUE;
2374 codevalue -= OP_STARI - OP_STAR;
2375 /* Fall through */
2376 case OP_STAR:
2377 case OP_MINSTAR:
2378 case OP_POSSTAR:
2379 case OP_NOTSTAR:
2380 case OP_NOTMINSTAR:
2381 case OP_NOTPOSSTAR:
2382 ADD_ACTIVE(state_offset + dlen + 1, 0);
2383 if (clen > 0)
2384 {
2385 pcre_uint32 otherd = NOTACHAR;
2386 if (caseless)
2387 {
2388 #ifdef SUPPORT_UTF
2389 if (utf && d >= 128)
2390 {
2391 #ifdef SUPPORT_UCP
2392 otherd = UCD_OTHERCASE(d);
2393 #endif /* SUPPORT_UCP */
2394 }
2395 else
2396 #endif /* SUPPORT_UTF */
2397 otherd = TABLE_GET(d, fcc, d);
2398 }
2399 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2400 {
2401 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2402 {
2403 active_count--; /* Remove non-match possibility */
2404 next_active_state--;
2405 }
2406 ADD_NEW(state_offset, 0);
2407 }
2408 }
2409 break;
2410
2411 /*-----------------------------------------------------------------*/
2412 case OP_EXACTI:
2413 case OP_NOTEXACTI:
2414 caseless = TRUE;
2415 codevalue -= OP_STARI - OP_STAR;
2416 /* Fall through */
2417 case OP_EXACT:
2418 case OP_NOTEXACT:
2419 count = current_state->count; /* Number already matched */
2420 if (clen > 0)
2421 {
2422 pcre_uint32 otherd = NOTACHAR;
2423 if (caseless)
2424 {
2425 #ifdef SUPPORT_UTF
2426 if (utf && d >= 128)
2427 {
2428 #ifdef SUPPORT_UCP
2429 otherd = UCD_OTHERCASE(d);
2430 #endif /* SUPPORT_UCP */
2431 }
2432 else
2433 #endif /* SUPPORT_UTF */
2434 otherd = TABLE_GET(d, fcc, d);
2435 }
2436 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2437 {
2438 if (++count >= (int)GET2(code, 1))
2439 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2440 else
2441 { ADD_NEW(state_offset, count); }
2442 }
2443 }
2444 break;
2445
2446 /*-----------------------------------------------------------------*/
2447 case OP_UPTOI:
2448 case OP_MINUPTOI:
2449 case OP_POSUPTOI:
2450 case OP_NOTUPTOI:
2451 case OP_NOTMINUPTOI:
2452 case OP_NOTPOSUPTOI:
2453 caseless = TRUE;
2454 codevalue -= OP_STARI - OP_STAR;
2455 /* Fall through */
2456 case OP_UPTO:
2457 case OP_MINUPTO:
2458 case OP_POSUPTO:
2459 case OP_NOTUPTO:
2460 case OP_NOTMINUPTO:
2461 case OP_NOTPOSUPTO:
2462 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2463 count = current_state->count; /* Number already matched */
2464 if (clen > 0)
2465 {
2466 pcre_uint32 otherd = NOTACHAR;
2467 if (caseless)
2468 {
2469 #ifdef SUPPORT_UTF
2470 if (utf && d >= 128)
2471 {
2472 #ifdef SUPPORT_UCP
2473 otherd = UCD_OTHERCASE(d);
2474 #endif /* SUPPORT_UCP */
2475 }
2476 else
2477 #endif /* SUPPORT_UTF */
2478 otherd = TABLE_GET(d, fcc, d);
2479 }
2480 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2481 {
2482 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2483 {
2484 active_count--; /* Remove non-match possibility */
2485 next_active_state--;
2486 }
2487 if (++count >= (int)GET2(code, 1))
2488 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2489 else
2490 { ADD_NEW(state_offset, count); }
2491 }
2492 }
2493 break;
2494
2495
2496 /* ========================================================================== */
2497 /* These are the class-handling opcodes */
2498
2499 case OP_CLASS:
2500 case OP_NCLASS:
2501 case OP_XCLASS:
2502 {
2503 BOOL isinclass = FALSE;
2504 int next_state_offset;
2505 const pcre_uchar *ecode;
2506
2507 /* For a simple class, there is always just a 32-byte table, and we
2508 can set isinclass from it. */
2509
2510 if (codevalue != OP_XCLASS)
2511 {
2512 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2513 if (clen > 0)
2514 {
2515 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2516 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2517 }
2518 }
2519
2520 /* An extended class may have a table or a list of single characters,
2521 ranges, or both, and it may be positive or negative. There's a
2522 function that sorts all this out. */
2523
2524 else
2525 {
2526 ecode = code + GET(code, 1);
2527 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2528 }
2529
2530 /* At this point, isinclass is set for all kinds of class, and ecode
2531 points to the byte after the end of the class. If there is a
2532 quantifier, this is where it will be. */
2533
2534 next_state_offset = (int)(ecode - start_code);
2535
2536 switch (*ecode)
2537 {
2538 case OP_CRSTAR:
2539 case OP_CRMINSTAR:
2540 ADD_ACTIVE(next_state_offset + 1, 0);
2541 if (isinclass) { ADD_NEW(state_offset, 0); }
2542 break;
2543
2544 case OP_CRPLUS:
2545 case OP_CRMINPLUS:
2546 count = current_state->count; /* Already matched */
2547 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2548 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2549 break;
2550
2551 case OP_CRQUERY:
2552 case OP_CRMINQUERY:
2553 ADD_ACTIVE(next_state_offset + 1, 0);
2554 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2555 break;
2556
2557 case OP_CRRANGE:
2558 case OP_CRMINRANGE:
2559 count = current_state->count; /* Already matched */
2560 if (count >= (int)GET2(ecode, 1))
2561 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2562 if (isinclass)
2563 {
2564 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2565 if (++count >= max && max != 0) /* Max 0 => no limit */
2566 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2567 else
2568 { ADD_NEW(state_offset, count); }
2569 }
2570 break;
2571
2572 default:
2573 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2574 break;
2575 }
2576 }
2577 break;
2578
2579 /* ========================================================================== */
2580 /* These are the opcodes for fancy brackets of various kinds. We have
2581 to use recursion in order to handle them. The "always failing" assertion
2582 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2583 though the other "backtracking verbs" are not supported. */
2584
2585 case OP_FAIL:
2586 forced_fail++; /* Count FAILs for multiple states */
2587 break;
2588
2589 case OP_ASSERT:
2590 case OP_ASSERT_NOT:
2591 case OP_ASSERTBACK:
2592 case OP_ASSERTBACK_NOT:
2593 {
2594 int rc;
2595 int local_offsets[2];
2596 int local_workspace[1000];
2597 const pcre_uchar *endasscode = code + GET(code, 1);
2598
2599 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2600
2601 rc = internal_dfa_exec(
2602 md, /* static match data */
2603 code, /* this subexpression's code */
2604 ptr, /* where we currently are */
2605 (int)(ptr - start_subject), /* start offset */
2606 local_offsets, /* offset vector */
2607 sizeof(local_offsets)/sizeof(int), /* size of same */
2608 local_workspace, /* workspace vector */
2609 sizeof(local_workspace)/sizeof(int), /* size of same */
2610 rlevel); /* function recursion level */
2611
2612 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2613 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2614 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2615 }
2616 break;
2617
2618 /*-----------------------------------------------------------------*/
2619 case OP_COND:
2620 case OP_SCOND:
2621 {
2622 int local_offsets[1000];
2623 int local_workspace[1000];
2624 int codelink = GET(code, 1);
2625 int condcode;
2626
2627 /* Because of the way auto-callout works during compile, a callout item
2628 is inserted between OP_COND and an assertion condition. This does not
2629 happen for the other conditions. */
2630
2631 if (code[LINK_SIZE+1] == OP_CALLOUT)
2632 {
2633 rrc = 0;
2634 if (PUBL(callout) != NULL)
2635 {
2636 PUBL(callout_block) cb;
2637 cb.version = 1; /* Version 1 of the callout block */
2638 cb.callout_number = code[LINK_SIZE+2];
2639 cb.offset_vector = offsets;
2640 #if defined COMPILE_PCRE8
2641 cb.subject = (PCRE_SPTR)start_subject;
2642 #elif defined COMPILE_PCRE16
2643 cb.subject = (PCRE_SPTR16)start_subject;
2644 #elif defined COMPILE_PCRE32
2645 cb.subject = (PCRE_SPTR32)start_subject;
2646 #endif
2647 cb.subject_length = (int)(end_subject - start_subject);
2648 cb.start_match = (int)(current_subject - start_subject);
2649 cb.current_position = (int)(ptr - start_subject);
2650 cb.pattern_position = GET(code, LINK_SIZE + 3);
2651 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2652 cb.capture_top = 1;
2653 cb.capture_last = -1;
2654 cb.callout_data = md->callout_data;
2655 cb.mark = NULL; /* No (*MARK) support */
2656 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2657 }
2658 if (rrc > 0) break; /* Fail this thread */
2659 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2660 }
2661
2662 condcode = code[LINK_SIZE+1];
2663
2664 /* Back reference conditions and duplicate named recursion conditions
2665 are not supported */
2666
2667 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2668 condcode == OP_DNRREF)
2669 return PCRE_ERROR_DFA_UCOND;
2670
2671 /* The DEFINE condition is always false */
2672
2673 if (condcode == OP_DEF)
2674 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2675
2676 /* The only supported version of OP_RREF is for the value RREF_ANY,
2677 which means "test if in any recursion". We can't test for specifically
2678 recursed groups. */
2679
2680 else if (condcode == OP_RREF)
2681 {
2682 int value = GET2(code, LINK_SIZE + 2);
2683 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2684 if (md->recursive != NULL)
2685 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2686 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2687 }
2688
2689 /* Otherwise, the condition is an assertion */
2690
2691 else
2692 {
2693 int rc;
2694 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2695 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2696
2697 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2698
2699 rc = internal_dfa_exec(
2700 md, /* fixed match data */
2701 asscode, /* this subexpression's code */
2702 ptr, /* where we currently are */
2703 (int)(ptr - start_subject), /* start offset */
2704 local_offsets, /* offset vector */
2705 sizeof(local_offsets)/sizeof(int), /* size of same */
2706 local_workspace, /* workspace vector */
2707 sizeof(local_workspace)/sizeof(int), /* size of same */
2708 rlevel); /* function recursion level */
2709
2710 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2711 if ((rc >= 0) ==
2712 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2713 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2714 else
2715 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2716 }
2717 }
2718 break;
2719
2720 /*-----------------------------------------------------------------*/
2721 case OP_RECURSE:
2722 {
2723 dfa_recursion_info *ri;
2724 int local_offsets[1000];
2725 int local_workspace[1000];
2726 const pcre_uchar *callpat = start_code + GET(code, 1);
2727 int recno = (callpat == md->start_code)? 0 :
2728 GET2(callpat, 1 + LINK_SIZE);
2729 int rc;
2730
2731 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2732
2733 /* Check for repeating a recursion without advancing the subject
2734 pointer. This should catch convoluted mutual recursions. (Some simple
2735 cases are caught at compile time.) */
2736
2737 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2738 if (recno == ri->group_num && ptr == ri->subject_position)
2739 return PCRE_ERROR_RECURSELOOP;
2740
2741 /* Remember this recursion and where we started it so as to
2742 catch infinite loops. */
2743
2744 new_recursive.group_num = recno;
2745 new_recursive.subject_position = ptr;
2746 new_recursive.prevrec = md->recursive;
2747 md->recursive = &new_recursive;
2748
2749 rc = internal_dfa_exec(
2750 md, /* fixed match data */
2751 callpat, /* this subexpression's code */
2752 ptr, /* where we currently are */
2753 (int)(ptr - start_subject), /* start offset */
2754 local_offsets, /* offset vector */
2755 sizeof(local_offsets)/sizeof(int), /* size of same */
2756 local_workspace, /* workspace vector */
2757 sizeof(local_workspace)/sizeof(int), /* size of same */
2758 rlevel); /* function recursion level */
2759
2760 md->recursive = new_recursive.prevrec; /* Done this recursion */
2761
2762 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2763 rc));
2764
2765 /* Ran out of internal offsets */
2766
2767 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2768
2769 /* For each successful matched substring, set up the next state with a
2770 count of characters to skip before trying it. Note that the count is in
2771 characters, not bytes. */
2772
2773 if (rc > 0)
2774 {
2775 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2776 {
2777 int charcount = local_offsets[rc+1] - local_offsets[rc];
2778 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2779 if (utf)
2780 {
2781 const pcre_uchar *p = start_subject + local_offsets[rc];
2782 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2783 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2784 }
2785 #endif
2786 if (charcount > 0)
2787 {
2788 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2789 }
2790 else
2791 {
2792 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2793 }
2794 }
2795 }
2796 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2797 }
2798 break;
2799
2800 /*-----------------------------------------------------------------*/
2801 case OP_BRAPOS:
2802 case OP_SBRAPOS:
2803 case OP_CBRAPOS:
2804 case OP_SCBRAPOS:
2805 case OP_BRAPOSZERO:
2806 {
2807 int charcount, matched_count;
2808 const pcre_uchar *local_ptr = ptr;
2809 BOOL allow_zero;
2810
2811 if (codevalue == OP_BRAPOSZERO)
2812 {
2813 allow_zero = TRUE;
2814 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2815 }
2816 else allow_zero = FALSE;
2817
2818 /* Loop to match the subpattern as many times as possible as if it were
2819 a complete pattern. */
2820
2821 for (matched_count = 0;; matched_count++)
2822 {
2823 int local_offsets[2];
2824 int local_workspace[1000];
2825
2826 int rc = internal_dfa_exec(
2827 md, /* fixed match data */
2828 code, /* this subexpression's code */
2829 local_ptr, /* where we currently are */
2830 (int)(ptr - start_subject), /* start offset */
2831 local_offsets, /* offset vector */
2832 sizeof(local_offsets)/sizeof(int), /* size of same */
2833 local_workspace, /* workspace vector */
2834 sizeof(local_workspace)/sizeof(int), /* size of same */
2835 rlevel); /* function recursion level */
2836
2837 /* Failed to match */
2838
2839 if (rc < 0)
2840 {
2841 if (rc != PCRE_ERROR_NOMATCH) return rc;
2842 break;
2843 }
2844
2845 /* Matched: break the loop if zero characters matched. */
2846
2847 charcount = local_offsets[1] - local_offsets[0];
2848 if (charcount == 0) break;
2849 local_ptr += charcount; /* Advance temporary position ptr */
2850 }
2851
2852 /* At this point we have matched the subpattern matched_count
2853 times, and local_ptr is pointing to the character after the end of the
2854 last match. */
2855
2856 if (matched_count > 0 || allow_zero)
2857 {
2858 const pcre_uchar *end_subpattern = code;
2859 int next_state_offset;
2860
2861 do { end_subpattern += GET(end_subpattern, 1); }
2862 while (*end_subpattern == OP_ALT);
2863 next_state_offset =
2864 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2865
2866 /* Optimization: if there are no more active states, and there
2867 are no new states yet set up, then skip over the subject string
2868 right here, to save looping. Otherwise, set up the new state to swing
2869 into action when the end of the matched substring is reached. */
2870
2871 if (i + 1 >= active_count && new_count == 0)
2872 {
2873 ptr = local_ptr;
2874 clen = 0;
2875 ADD_NEW(next_state_offset, 0);
2876 }
2877 else
2878 {
2879 const pcre_uchar *p = ptr;
2880 const pcre_uchar *pp = local_ptr;
2881 charcount = (int)(pp - p);
2882 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2883 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2884 #endif
2885 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2886 }
2887 }
2888 }
2889 break;
2890
2891 /*-----------------------------------------------------------------*/
2892 case OP_ONCE:
2893 case OP_ONCE_NC:
2894 {
2895 int local_offsets[2];
2896 int local_workspace[1000];
2897
2898 int rc = internal_dfa_exec(
2899 md, /* fixed match data */
2900 code, /* this subexpression's code */
2901 ptr, /* where we currently are */
2902 (int)(ptr - start_subject), /* start offset */
2903 local_offsets, /* offset vector */
2904 sizeof(local_offsets)/sizeof(int), /* size of same */
2905 local_workspace, /* workspace vector */
2906 sizeof(local_workspace)/sizeof(int), /* size of same */
2907 rlevel); /* function recursion level */
2908
2909 if (rc >= 0)
2910 {
2911 const pcre_uchar *end_subpattern = code;
2912 int charcount = local_offsets[1] - local_offsets[0];
2913 int next_state_offset, repeat_state_offset;
2914
2915 do { end_subpattern += GET(end_subpattern, 1); }
2916 while (*end_subpattern == OP_ALT);
2917 next_state_offset =
2918 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2919
2920 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2921 arrange for the repeat state also to be added to the relevant list.
2922 Calculate the offset, or set -1 for no repeat. */
2923
2924 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2925 *end_subpattern == OP_KETRMIN)?
2926 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2927
2928 /* If we have matched an empty string, add the next state at the
2929 current character pointer. This is important so that the duplicate
2930 checking kicks in, which is what breaks infinite loops that match an
2931 empty string. */
2932
2933 if (charcount == 0)
2934 {
2935 ADD_ACTIVE(next_state_offset, 0);
2936 }
2937
2938 /* Optimization: if there are no more active states, and there
2939 are no new states yet set up, then skip over the subject string
2940 right here, to save looping. Otherwise, set up the new state to swing
2941 into action when the end of the matched substring is reached. */
2942
2943 else if (i + 1 >= active_count && new_count == 0)
2944 {
2945 ptr += charcount;
2946 clen = 0;
2947 ADD_NEW(next_state_offset, 0);
2948
2949 /* If we are adding a repeat state at the new character position,
2950 we must fudge things so that it is the only current state.
2951 Otherwise, it might be a duplicate of one we processed before, and
2952 that would cause it to be skipped. */
2953
2954 if (repeat_state_offset >= 0)
2955 {
2956 next_active_state = active_states;
2957 active_count = 0;
2958 i = -1;
2959 ADD_ACTIVE(repeat_state_offset, 0);
2960 }
2961 }
2962 else
2963 {
2964 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2965 if (utf)
2966 {
2967 const pcre_uchar *p = start_subject + local_offsets[0];
2968 const pcre_uchar *pp = start_subject + local_offsets[1];
2969 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2970 }
2971 #endif
2972 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2973 if (repeat_state_offset >= 0)
2974 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2975 }
2976 }
2977 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2978 }
2979 break;
2980
2981
2982 /* ========================================================================== */
2983 /* Handle callouts */
2984
2985 case OP_CALLOUT:
2986 rrc = 0;
2987 if (PUBL(callout) != NULL)
2988 {
2989 PUBL(callout_block) cb;
2990 cb.version = 1; /* Version 1 of the callout block */
2991 cb.callout_number = code[1];
2992 cb.offset_vector = offsets;
2993 #if defined COMPILE_PCRE8
2994 cb.subject = (PCRE_SPTR)start_subject;
2995 #elif defined COMPILE_PCRE16
2996 cb.subject = (PCRE_SPTR16)start_subject;
2997 #elif defined COMPILE_PCRE32
2998 cb.subject = (PCRE_SPTR32)start_subject;
2999 #endif
3000 cb.subject_length = (int)(end_subject - start_subject);
3001 cb.start_match = (int)(current_subject - start_subject);
3002 cb.current_position = (int)(ptr - start_subject);
3003 cb.pattern_position = GET(code, 2);
3004 cb.next_item_length = GET(code, 2 + LINK_SIZE);
3005 cb.capture_top = 1;
3006 cb.capture_last = -1;
3007 cb.callout_data = md->callout_data;
3008 cb.mark = NULL; /* No (*MARK) support */
3009 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3010 }
3011 if (rrc == 0)
3012 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3013 break;
3014
3015
3016 /* ========================================================================== */
3017 default: /* Unsupported opcode */
3018 return PCRE_ERROR_DFA_UITEM;
3019 }
3020
3021 NEXT_ACTIVE_STATE: continue;
3022
3023 } /* End of loop scanning active states */
3024
3025 /* We have finished the processing at the current subject character. If no
3026 new states have been set for the next character, we have found all the
3027 matches that we are going to find. If we are at the top level and partial
3028 matching has been requested, check for appropriate conditions.
3029
3030 The "forced_fail" variable counts the number of (*F) encountered for the
3031 character. If it is equal to the original active_count (saved in
3032 workspace[1]) it means that (*F) was found on every active state. In this
3033 case we don't want to give a partial match.
3034
3035 The "could_continue" variable is true if a state could have continued but
3036 for the fact that the end of the subject was reached. */
3037
3038 if (new_count <= 0)
3039 {
3040 if (rlevel == 1 && /* Top level, and */
3041 could_continue && /* Some could go on, and */
3042 forced_fail != workspace[1] && /* Not all forced fail & */
3043 ( /* either... */
3044 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3045 || /* or... */
3046 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3047 match_count < 0) /* no matches */
3048 ) && /* And... */
3049 (
3050 partial_newline || /* Either partial NL */
3051 ( /* or ... */
3052 ptr >= end_subject && /* End of subject and */
3053 ptr > md->start_used_ptr) /* Inspected non-empty string */
3054 )
3055 )
3056 match_count = PCRE_ERROR_PARTIAL;
3057 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3058 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3059 rlevel*2-2, SP));
3060 break; /* In effect, "return", but see the comment below */
3061 }
3062
3063 /* One or more states are active for the next character. */
3064
3065 ptr += clen; /* Advance to next subject character */
3066 } /* Loop to move along the subject string */
3067
3068 /* Control gets here from "break" a few lines above. We do it this way because
3069 if we use "return" above, we have compiler trouble. Some compilers warn if
3070 there's nothing here because they think the function doesn't return a value. On
3071 the other hand, if we put a dummy statement here, some more clever compilers
3072 complain that it can't be reached. Sigh. */
3073
3074 return match_count;
3075 }
3076
3077
3078
3079
3080 /*************************************************
3081 * Execute a Regular Expression - DFA engine *
3082 *************************************************/
3083
3084 /* This external function applies a compiled re to a subject string using a DFA
3085 engine. This function calls the internal function multiple times if the pattern
3086 is not anchored.
3087
3088 Arguments:
3089 argument_re points to the compiled expression
3090 extra_data points to extra data or is NULL
3091 subject points to the subject string
3092 length length of subject string (may contain binary zeros)
3093 start_offset where to start in the subject string
3094 options option bits
3095 offsets vector of match offsets
3096 offsetcount size of same
3097 workspace workspace vector
3098 wscount size of same
3099
3100 Returns: > 0 => number of match offset pairs placed in offsets
3101 = 0 => offsets overflowed; longest matches are present
3102 -1 => failed to match
3103 < -1 => some kind of unexpected problem
3104 */
/* Additional notes on the return value, as implemented below:

The plausibility checks are made in this order and each returns immediately:
unknown option bits give PCRE_ERROR_BADOPTION; a NULL argument_re, subject or
workspace, or NULL offsets with offsetcount > 0, gives PCRE_ERROR_NULL;
offsetcount < 0 gives PCRE_ERROR_BADCOUNT; wscount < 20 gives
PCRE_ERROR_DFA_WSSIZE; length < 0 gives PCRE_ERROR_BADLENGTH; and a
start_offset outside 0..length gives PCRE_ERROR_BADOFFSET.

When the subject fails the UTF validity check and offsetcount >= 2, offsets[0]
and offsets[1] receive the error offset and the error code produced by the
validity checker. When the result is PCRE_ERROR_PARTIAL and offsetcount >= 2,
offsets[0] is set from md->start_used_ptr, offsets[1] to the end of the
subject, and, if offsetcount > 2, offsets[2] to the position at which the
match attempt that gave the partial result was started. */
3105
3106 #if defined COMPILE_PCRE8
3107 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3108 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3109 const char *subject, int length, int start_offset, int options, int *offsets,
3110 int offsetcount, int *workspace, int wscount)
3111 #elif defined COMPILE_PCRE16
3112 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3113 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3114 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3115 int offsetcount, int *workspace, int wscount)
3116 #elif defined COMPILE_PCRE32
3117 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3118 pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3119 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3120 int offsetcount, int *workspace, int wscount)
3121 #endif
3122 {
3123 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3124 dfa_match_data match_block;
3125 dfa_match_data *md = &match_block;
3126 BOOL utf, anchored, startline, firstline;
3127 const pcre_uchar *current_subject, *end_subject;
3128 const pcre_study_data *study = NULL;
3129
3130 const pcre_uchar *req_char_ptr;
3131 const pcre_uint8 *start_bits = NULL;
3132 BOOL has_first_char = FALSE;
3133 BOOL has_req_char = FALSE;
3134 pcre_uchar first_char = 0;
3135 pcre_uchar first_char2 = 0;
3136 pcre_uchar req_char = 0;
3137 pcre_uchar req_char2 = 0;
3138 int newline;
3139
3140 /* Plausibility checks */
3141
3142 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3143 if (re == NULL || subject == NULL || workspace == NULL ||
3144 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3145 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3146 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3147 if (length < 0) return PCRE_ERROR_BADLENGTH;
3148 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3149
3150 /* Check that the first field in the block is the magic number. If it is not,
3151 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3152 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3153 means that the pattern is likely compiled with different endianness. */
3154
3155 if (re->magic_number != MAGIC_NUMBER)
3156 return re->magic_number == REVERSED_MAGIC_NUMBER?
3157 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
/* NOTE(review): PCRE_MODE presumably identifies the code unit width that this
library was built for, so that a pattern compiled by a library of a different
width is rejected here - confirm against pcre_internal.h. */
3158 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3159
3160 /* If restarting after a partial match, do some sanity checks on the contents
3161 of the workspace. */
3162
3163 if ((options & PCRE_DFA_RESTART) != 0)
3164 {
/* workspace[0] may have only its lowest bit set, and workspace[1] must be at
least 1 and no more than the number of INTS_PER_STATEBLOCK-sized blocks that
fit in the workspace after its first two ints. NOTE(review): these two ints
are presumably a flag word and an active state count saved by the previous,
partially matching, call - confirm in internal_dfa_exec. */
3165 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3166 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3167 return PCRE_ERROR_DFA_BADRESTART;
3168 }
3169
3170 /* Set up study, callout, and table data */
3171
3172 md->tables = re->tables;
3173 md->callout_data = NULL;
3174
3175 if (extra_data != NULL)
3176 {
3177 unsigned int flags = extra_data->flags;
3178 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3179 study = (const pcre_study_data *)extra_data->study_data;
/* Match limits are rejected by this function: a caller that sets either of
the limit flags gets PCRE_ERROR_DFA_UMLIMIT. */
3180 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3181 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3182 return PCRE_ERROR_DFA_UMLIMIT;
3183 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3184 md->callout_data = extra_data->callout_data;
3185 if ((flags & PCRE_EXTRA_TABLES) != 0)
3186 md->tables = extra_data->tables;
3187 }
3188
3189 /* Set some local values */
3190
3191 current_subject = (const pcre_uchar *)subject + start_offset;
3192 end_subject = (const pcre_uchar *)subject + length;
/* req_char_ptr remembers where the required character was last found. It
starts one code unit before the first match position so that the first
"p > req_char_ptr" test in the main loop succeeds. It is only ever compared
and assigned, never dereferenced. NOTE(review): when start_offset is 0 this
forms a pointer before the start of the subject, which ISO C does not define
even without a dereference. */
3193 req_char_ptr = current_subject - 1;
3194
3195 #ifdef SUPPORT_UTF
3196 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3197 utf = (re->options & PCRE_UTF8) != 0;
3198 #else
3199 utf = FALSE;
3200 #endif
3201
/* A restart after a partial match is treated as anchored, as is an explicit
PCRE_ANCHORED given either at match time or at compile time. */
3202 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3203 (re->options & PCRE_ANCHORED) != 0;
3204
3205 /* The remaining fixed data for passing around. */
3206
/* The start of the code is computed as the address of the compiled block plus
the name table offset plus the size of the name table (entry count times entry
size). */
3207 md->start_code = (const pcre_uchar *)argument_re +
3208 re->name_table_offset + re->name_count * re->name_entry_size;
3209 md->start_subject = (const pcre_uchar *)subject;
3210 md->end_subject = end_subject;
3211 md->start_offset = start_offset;
3212 md->moptions = options;
3213 md->poptions = re->options;
3214
3215 /* If the BSR option is not set at match time, copy what was set
3216 at compile time. */
3217
3218 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3219 {
3220 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3221 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3222 #ifdef BSR_ANYCRLF
3223 else md->moptions |= PCRE_BSR_ANYCRLF;
3224 #endif
3225 }
3226
3227 /* Handle different types of newline. The three bits give eight cases. If
3228 nothing is set at run time, whatever was used at compile time applies. */
3229
3230 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3231 PCRE_NEWLINE_BITS)
3232 {
3233 case 0: newline = NEWLINE; break; /* Compile-time default */
3234 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3235 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3236 case PCRE_NEWLINE_CR+
3237 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3238 case PCRE_NEWLINE_ANY: newline = -1; break;
3239 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3240 default: return PCRE_ERROR_BADNEWLINE;
3241 }
3242
/* The value of newline is now decoded into the match block: -2 selects
NLTYPE_ANYCRLF, any other negative value NLTYPE_ANY, and anything else is a
fixed newline. A fixed value above 255 is two characters packed as
(first << 8) | second; otherwise it is a single character. */
3243 if (newline == -2)
3244 {
3245 md->nltype = NLTYPE_ANYCRLF;
3246 }
3247 else if (newline < 0)
3248 {
3249 md->nltype = NLTYPE_ANY;
3250 }
3251 else
3252 {
3253 md->nltype = NLTYPE_FIXED;
3254 if (newline > 255)
3255 {
3256 md->nllen = 2;
3257 md->nl[0] = (newline >> 8) & 255;
3258 md->nl[1] = newline & 255;
3259 }
3260 else
3261 {
3262 md->nllen = 1;
3263 md->nl[0] = newline;
3264 }
3265 }
3266
3267 /* Check a UTF string if required. If the check fails and the offsets vector
3268 has room for at least two items, the error offset and the error code that the
validity checker produced are passed back in offsets[0] and offsets[1]. */
3269
3270 #ifdef SUPPORT_UTF
3271 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3272 {
3273 int erroroffset;
3274 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3275 if (errorcode != 0)
3276 {
3277 if (offsetcount >= 2)
3278 {
3279 offsets[0] = erroroffset;
3280 offsets[1] = errorcode;
3281 }
/* In the 8-bit and 16-bit libraries, error codes up to PCRE_UTF8_ERR5
(respectively PCRE_UTF16_ERR1) are reported as a "short" string instead of a
bad one, but only when PCRE_PARTIAL_HARD is set. NOTE(review): presumably
these codes are the "subject ends in mid-character" cases - confirm in the
valid_utf source. The 32-bit library always reports PCRE_ERROR_BADUTF32. */
3282 #if defined COMPILE_PCRE8
3283 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3284 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3285 #elif defined COMPILE_PCRE16
3286 return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3287 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3288 #elif defined COMPILE_PCRE32
3289 return PCRE_ERROR_BADUTF32;
3290 #endif
3291 }
/* In the 8-bit and 16-bit libraries, a start_offset that lies strictly inside
the subject is rejected if NOT_FIRSTCHAR is true for the code unit it points
at (presumably a unit in the middle of a character). Like the validity check,
this test is skipped when PCRE_NO_UTF8_CHECK is set. */
3292 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3293 if (start_offset > 0 && start_offset < length &&
3294 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3295 return PCRE_ERROR_BADUTF8_OFFSET;
3296 #endif
3297 }
3298 #endif
3299
3300 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3301 is a feature that makes it possible to save compiled regex and re-use them
3302 in other programs later. */
3303
3304 if (md->tables == NULL) md->tables = PRIV(default_tables);
3305
3306 /* The "must be at the start of a line" flags are used in a loop when finding
3307 where to start. */
3308
3309 startline = (re->flags & PCRE_STARTLINE) != 0;
3310 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3311
3312 /* Set up the first character to match, if available. The first_byte value is
3313 never set for an anchored regular expression, but the anchoring may be forced
3314 at run time, so we have to test for anchoring. The first char may be unset for
3315 an unanchored pattern, of course. If there's no first char and the pattern was
3316 studied, there may be a bitmap of possible first characters. */
3317
3318 if (!anchored)
3319 {
3320 if ((re->flags & PCRE_FIRSTSET) != 0)
3321 {
3322 has_first_char = TRUE;
3323 first_char = first_char2 = (pcre_uchar)(re->first_char);
3324 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3325 {
3326 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
/* Outside the 8-bit library, when UCP support is compiled in, a first
character above 127 in UTF mode takes its other case from UCD_OTHERCASE,
replacing the result of the table lookup. */
3327 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3328 if (utf && first_char > 127)
3329 first_char2 = UCD_OTHERCASE(first_char);
3330 #endif
3331 }
3332 }
3333 else
3334 {
3335 if (!startline && study != NULL &&
3336 (study->flags & PCRE_STUDY_MAPPED) != 0)
3337 start_bits = study->start_bits;
3338 }
3339 }
3340
3341 /* For anchored or unanchored matches, there may be a "last known required
3342 character" set. */
3343
3344 if ((re->flags & PCRE_REQCHSET) != 0)
3345 {
3346 has_req_char = TRUE;
3347 req_char = req_char2 = (pcre_uchar)(re->req_char);
3348 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3349 {
3350 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3351 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3352 if (utf && req_char > 127)
3353 req_char2 = UCD_OTHERCASE(req_char);
3354 #endif
3355 }
3356 }
3357
3358 /* Call the main matching function, looping for a non-anchored regex after a
3359 failed match. If not restarting, perform certain optimizations at the start of
3360 a match. */
3361
3362 for (;;)
3363 {
3364 int rc;
3365
3366 if ((options & PCRE_DFA_RESTART) == 0)
3367 {
3368 const pcre_uchar *save_end_subject = end_subject;
3369
3370 /* If firstline is TRUE, the start of the match is constrained to the first
3371 line of a multiline string. Implement this by temporarily adjusting
3372 end_subject so that we stop scanning at a newline. If the match fails at
3373 the newline, later code breaks this loop. */
3374
3375 if (firstline)
3376 {
3377 PCRE_PUCHAR t = current_subject;
3378 #ifdef SUPPORT_UTF
3379 if (utf)
3380 {
3381 while (t < md->end_subject && !IS_NEWLINE(t))
3382 {
3383 t++;
3384 ACROSSCHAR(t < end_subject, *t, t++);
3385 }
3386 }
3387 else
3388 #endif
3389 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3390 end_subject = t;
3391 }
3392
3393 /* There are some optimizations that avoid running the match if a known
3394 starting point is not found. However, there is an option that disables
3395 these, for testing and for ensuring that all callouts do actually occur.
3396 The option can be set in the regex by (*NO_START_OPT) or passed in
3397 match-time options. */
3398
3399 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3400 {
3401 /* Advance to a known first char. */
3402
3403 if (has_first_char)
3404 {
/* first_char2 differs from first_char only when a caseless alternative was
set up above, in which case both values have to be tested. */
3405 if (first_char != first_char2)
3406 {
3407 pcre_uchar csc;
3408 while (current_subject < end_subject &&
3409 (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3410 current_subject++;
3411 }
3412 else
3413 while (current_subject < end_subject &&
3414 RAWUCHARTEST(current_subject) != first_char)
3415 current_subject++;
3416 }
3417
3418 /* Or to just after a linebreak for a multiline match if possible */
3419
3420 else if (startline)
3421 {
/* Nothing is skipped while current_subject is still at the starting offset
(the first time round the loop). After that, move forward until WAS_NEWLINE
is true for current_subject or end_subject is reached. */
3422 if (current_subject > md->start_subject + start_offset)
3423 {
3424 #ifdef SUPPORT_UTF
3425 if (utf)
3426 {
3427 while (current_subject < end_subject &&
3428 !WAS_NEWLINE(current_subject))
3429 {
3430 current_subject++;
3431 ACROSSCHAR(current_subject < end_subject, *current_subject,
3432 current_subject++);
3433 }
3434 }
3435 else
3436 #endif
3437 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3438 current_subject++;
3439
3440 /* If we have just passed a CR and the newline option is ANY or
3441 ANYCRLF, and we are now at a LF, advance the match position by one
3442 more character. */
3443
3444 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3445 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3446 current_subject < end_subject &&
3447 RAWUCHARTEST(current_subject) == CHAR_NL)
3448 current_subject++;
3449 }
3450 }
3451
3452 /* Or to a non-unique first char after study */
3453
3454 else if (start_bits != NULL)
3455 {
3456 while (current_subject < end_subject)
3457 {
3458 register pcre_uint32 c = RAWUCHARTEST(current_subject);
/* Outside the 8-bit library, values above 255 are all tested against the bit
for 255. */
3459 #ifndef COMPILE_PCRE8
3460 if (c > 255) c = 255;
3461 #endif
/* start_bits is indexed by character value: byte c/8, bit c&7. A clear bit
means that c is not one of the possible first characters, so it is skipped. */
3462 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3463 {
3464 current_subject++;
3465 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3466 /* In non 8-bit mode, the iteration will stop for
3467 characters > 255 at the beginning or not stop at all. */
3468 if (utf)
3469 ACROSSCHAR(current_subject < end_subject, *current_subject,
3470 current_subject++);
3471 #endif
3472 }
3473 else break;
3474 }
3475 }
3476 }
3477
3478 /* Restore fudged end_subject */
3479
3480 end_subject = save_end_subject;
3481
3482 /* The following two optimizations are disabled for partial matching or if
3483 disabling is explicitly requested (and of course, by the test above, this
3484 code is not obeyed when restarting after a partial match). */
3485
3486 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3487 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3488 {
3489 /* If the pattern was studied, a minimum subject length may be set. This
3490 is a lower bound; no actual string of that length may actually match the
3491 pattern. Although the value is, strictly, in characters, we treat it as
3492 bytes to avoid spending too much time in this optimization. */
3493
3494 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3495 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3496 return PCRE_ERROR_NOMATCH;
3497
3498 /* If req_char is set, we know that that character must appear in the
3499 subject for the match to succeed. If the first character is set, req_char
3500 must be later in the subject; otherwise the test starts at the match
3501 point. This optimization can save a huge amount of work in patterns with
3502 nested unlimited repeats that aren't going to match. Writing separate
3503 code for cased/caseless versions makes it go faster, as does using an
3504 autoincrement and backing off on a match.
3505
3506 HOWEVER: when the subject string is very, very long, searching to its end
3507 can take a long time, and give bad performance on quite ordinary
3508 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3509 string... so we don't do this when the string is sufficiently long. */
3510
3511 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3512 {
3513 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3514
3515 /* We don't need to repeat the search if we haven't yet reached the
3516 place we found it at last time. */
3517
3518 if (p > req_char_ptr)
3519 {
/* In both loops p has already been moved past the character that was tested,
so on a hit it is stepped back to point at the required character. If nothing
is found, p ends up at end_subject or beyond. */
3520 if (req_char != req_char2)
3521 {
3522 while (p < end_subject)
3523 {
3524 register pcre_uint32 pp = RAWUCHARINCTEST(p);
3525 if (pp == req_char || pp == req_char2) { p--; break; }
3526 }
3527 }
3528 else
3529 {
3530 while (p < end_subject)
3531 {
3532 if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3533 }
3534 }
3535
3536 /* If we can't find the required character, break the matching loop,
3537 which will cause a return of PCRE_ERROR_NOMATCH. */
3538
3539 if (p >= end_subject) break;
3540
3541 /* If we have found the required character, save the point where we
3542 found it, so that we don't search again next time round the loop if
3543 the start hasn't passed this character yet. */
3544
3545 req_char_ptr = p;
3546 }
3547 }
3548 }
3549 } /* End of optimizations that are done when not restarting */
3550
3551 /* OK, now we can do the business */
3552
/* start_used_ptr begins at the starting point of this attempt. NOTE(review):
internal_dfa_exec presumably moves it back when earlier characters are
inspected (for example by a lookbehind) - confirm there. Its value is reported
in offsets[0] below for a partial match. */
3553 md->start_used_ptr = current_subject;
3554 md->recursive = NULL;
3555
3556 rc = internal_dfa_exec(
3557 md, /* fixed match data */
3558 md->start_code, /* this subexpression's code */
3559 current_subject, /* where we currently are */
3560 start_offset, /* start offset in subject */
3561 offsets, /* offset vector */
3562 offsetcount, /* size of same */
3563 workspace, /* workspace vector */
3564 wscount, /* size of same */
3565 0); /* function recurse level */
3566
3567 /* Anything other than "no match" means we are done, always; otherwise, carry
3568 on only if not anchored. */
3569
3570 if (rc != PCRE_ERROR_NOMATCH || anchored)
3571 {
/* For a partial match, the first two offsets are overwritten with the offset
of md->start_used_ptr and the offset of the end of the subject; a third slot,
when available, receives the offset at which this attempt started. */
3572 if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3573 {
3574 offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3575 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3576 if (offsetcount > 2)
3577 offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3578 }
3579 return rc;
3580 }
3581
3582 /* Advance to the next subject character unless we are at the end of a line
3583 and firstline is set. */
3584
3585 if (firstline && IS_NEWLINE(current_subject)) break;
3586 current_subject++;
3587 #ifdef SUPPORT_UTF
3588 if (utf)
3589 {
3590 ACROSSCHAR(current_subject < end_subject, *current_subject,
3591 current_subject++);
3592 }
3593 #endif
/* Note the strict comparison: a match attempt is still made with
current_subject equal to end_subject. */
3594 if (current_subject > end_subject) break;
3595
3596 /* If we have just passed a CR and we are now at a LF, and the pattern does
3597 not contain any explicit matches for \r or \n, and the newline option is CRLF
3598 or ANY or ANYCRLF, advance the match position by one more character. */
3599
3600 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3601 current_subject < end_subject &&
3602 RAWUCHARTEST(current_subject) == CHAR_NL &&
3603 (re->flags & PCRE_HASCRORLF) == 0 &&
3604 (md->nltype == NLTYPE_ANY ||
3605 md->nltype == NLTYPE_ANYCRLF ||
3606 md->nllen == 2))
3607 current_subject++;
3608
3609 } /* "Bumpalong" loop */
3610
3611 return PCRE_ERROR_NOMATCH;
3612 }
3613
3614 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5