/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 801 - (show annotations)
Mon Dec 12 16:23:37 2011 UTC (8 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 121406 byte(s)
Merge changes from trunk r755 to r800 into the 16-bit branch.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2011 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75
76 #ifdef HAVE_CONFIG_H
77 #include "config.h"
78 #endif
79
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
83
84 #include "pcre_internal.h"
85
86
87 /* For use to indent debugging output */
88
89 #define SP " "
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
/* Offsets added to OP_TYPESTAR and the opcodes that follow it, turning them
into distinct pseudo-opcodes when their argument needs special handling. The
groups are spaced 20 apart, which should leave enough room between them. The
resulting values are never stored, so they need not be less than 256; starting
at 300 keeps them well clear of the normal opcodes. */

#define OP_PROP_EXTRA    300
#define OP_EXTUNI_EXTRA  (OP_PROP_EXTRA + 20)
#define OP_ANYNL_EXTRA   (OP_PROP_EXTRA + 40)
#define OP_HSPACE_EXTRA  (OP_PROP_EXTRA + 60)
#define OP_VSPACE_EXTRA  (OP_PROP_EXTRA + 80)
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
116 static const pcre_uint8 coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132 1+IMM2_SIZE, /* exact */
133 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136 1+IMM2_SIZE, /* exact I */
137 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138 /* Negative single-char repeats - only for chars < 256 */
139 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141 1+IMM2_SIZE, /* NOT exact */
142 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145 1+IMM2_SIZE, /* NOT exact I */
146 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147 /* Positive type repeats */
148 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150 1+IMM2_SIZE, /* Type exact */
151 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152 /* Character class & ref repeats */
153 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154 0, 0, /* CRRANGE, CRMINRANGE */
155 0, /* CLASS */
156 0, /* NCLASS */
157 0, /* XCLASS - variable length */
158 0, /* REF */
159 0, /* REFI */
160 0, /* RECURSE */
161 0, /* CALLOUT */
162 0, /* Alt */
163 0, /* Ket */
164 0, /* KetRmax */
165 0, /* KetRmin */
166 0, /* KetRpos */
167 0, /* Reverse */
168 0, /* Assert */
169 0, /* Assert not */
170 0, /* Assert behind */
171 0, /* Assert behind not */
172 0, 0, /* ONCE, ONCE_NC */
173 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
174 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
175 0, 0, /* CREF, NCREF */
176 0, 0, /* RREF, NRREF */
177 0, /* DEF */
178 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
179 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
180 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
181 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
182 0, 0 /* CLOSE, SKIPZERO */
183 };
184
185 /* This table identifies those opcodes that inspect a character. It is used to
186 remember the fact that a character could have been inspected when the end of
187 the subject is reached. ***NOTE*** If the start of this table is modified, the
188 two tables that follow must also be modified. */
189
190 static const pcre_uint8 poptable[] = {
191 0, /* End */
192 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
193 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
194 1, 1, 1, /* Any, AllAny, Anybyte */
195 1, 1, /* \P, \p */
196 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
197 1, /* \X */
198 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
199 1, /* Char */
200 1, /* Chari */
201 1, /* not */
202 1, /* noti */
203 /* Positive single-char repeats */
204 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
205 1, 1, 1, /* upto, minupto, exact */
206 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
207 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
208 1, 1, 1, /* upto I, minupto I, exact I */
209 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
210 /* Negative single-char repeats - only for chars < 256 */
211 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
212 1, 1, 1, /* NOT upto, minupto, exact */
213 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
214 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
215 1, 1, 1, /* NOT upto I, minupto I, exact I */
216 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
217 /* Positive type repeats */
218 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
219 1, 1, 1, /* Type upto, minupto, exact */
220 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
221 /* Character class & ref repeats */
222 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
223 1, 1, /* CRRANGE, CRMINRANGE */
224 1, /* CLASS */
225 1, /* NCLASS */
226 1, /* XCLASS - variable length */
227 0, /* REF */
228 0, /* REFI */
229 0, /* RECURSE */
230 0, /* CALLOUT */
231 0, /* Alt */
232 0, /* Ket */
233 0, /* KetRmax */
234 0, /* KetRmin */
235 0, /* KetRpos */
236 0, /* Reverse */
237 0, /* Assert */
238 0, /* Assert not */
239 0, /* Assert behind */
240 0, /* Assert behind not */
241 0, 0, /* ONCE, ONCE_NC */
242 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
243 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
244 0, 0, /* CREF, NCREF */
245 0, 0, /* RREF, NRREF */
246 0, /* DEF */
247 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
248 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
249 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
250 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
251 0, 0 /* CLOSE, SKIPZERO */
252 };
253
254 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
255 and \w */
256
257 static const pcre_uint8 toptable1[] = {
258 0, 0, 0, 0, 0, 0,
259 ctype_digit, ctype_digit,
260 ctype_space, ctype_space,
261 ctype_word, ctype_word,
262 0, 0 /* OP_ANY, OP_ALLANY */
263 };
264
265 static const pcre_uint8 toptable2[] = {
266 0, 0, 0, 0, 0, 0,
267 ctype_digit, 0,
268 ctype_space, 0,
269 ctype_word, 0,
270 1, 1 /* OP_ANY, OP_ALLANY */
271 };
272
273
/* One active path through the match tree is described by a stateblock. These
blocks are stored in the caller-supplied working vector, which is a vector of
ints, so every member here must be an int. */

typedef struct stateblock {
  int offset;   /* Offset to opcode */
  int count;    /* Count for repeats */
  int data;     /* Extra data, used by some states */
} stateblock;

/* How many ints of workspace a single stateblock takes up */

#define INTS_PER_STATEBLOCK  (sizeof(struct stateblock) / sizeof(int))
286
287
288 #ifdef PCRE_DEBUG
289 /*************************************************
290 * Print character string *
291 *************************************************/
292
293 /* Character string printing function for debugging.
294
295 Arguments:
296 p points to string
297 length number of bytes
298 f where to print
299
300 Returns: nothing
301 */
302
303 static void
304 pchars(const pcre_uchar *p, int length, FILE *f)
305 {
306 int c;
307 while (length-- > 0)
308 {
309 if (isprint(c = *(p++)))
310 fprintf(f, "%c", c);
311 else
312 fprintf(f, "\\x%02x", c);
313 }
314 }
315 #endif
316
317
318
319 /*************************************************
320 * Execute a Regular Expression - DFA engine *
321 *************************************************/
322
323 /* This internal function applies a compiled pattern to a subject string,
324 starting at a given point, using a DFA engine. This function is called from the
325 external one, possibly multiple times if the pattern is not anchored. The
326 function calls itself recursively for some kinds of subpattern.
327
328 Arguments:
329 md the match_data block with fixed information
330 this_start_code the opening bracket of this subexpression's code
331 current_subject where we currently are in the subject string
332 start_offset start offset in the subject string
333 offsets vector to contain the matching string offsets
334 offsetcount size of same
335 workspace vector of workspace
336 wscount size of same
337 rlevel function call recursion level
338
339 Returns: > 0 => number of match offset pairs placed in offsets
340 = 0 => offsets overflowed; longest matches are present
341 -1 => failed to match
342 < -1 => some kind of unexpected problem
343
344 The following macros are used for adding states to the two state vectors (one
345 for the current character, one for the following character). */
346
/* State-adding macros. ADD_ACTIVE and ADD_ACTIVE_DATA append a state to the
active list; ADD_NEW and ADD_NEW_DATA append one to the new list. The _DATA
forms also fill in the data field. The relevant counter is always incremented;
if the list already held wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE and nothing is stored. The macros use the local
variables active_count/new_count, next_active_state/next_new_state, wscount
and rlevel of the function they appear in, and each use must be followed by a
semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
388
389 /* And now, here is the code */
390
391 static int
392 internal_dfa_exec(
393 dfa_match_data *md,
394 const pcre_uchar *this_start_code,
395 const pcre_uchar *current_subject,
396 int start_offset,
397 int *offsets,
398 int offsetcount,
399 int *workspace,
400 int wscount,
401 int rlevel)
402 {
403 stateblock *active_states, *new_states, *temp_states;
404 stateblock *next_active_state, *next_new_state;
405
406 const pcre_uint8 *ctypes, *lcc, *fcc;
407 const pcre_uchar *ptr;
408 const pcre_uchar *end_code, *first_op;
409
410 dfa_recursion_info new_recursive;
411
412 int active_count, new_count, match_count;
413
414 /* Some fields in the md block are frequently referenced, so we load them into
415 independent variables in the hope that this will perform better. */
416
417 const pcre_uchar *start_subject = md->start_subject;
418 const pcre_uchar *end_subject = md->end_subject;
419 const pcre_uchar *start_code = md->start_code;
420
421 #ifdef SUPPORT_UTF
422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423 #else
424 BOOL utf = FALSE;
425 #endif
426
427 rlevel++;
428 offsetcount &= (-2);
429
430 wscount -= 2;
431 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
432 (2 * INTS_PER_STATEBLOCK);
433
434 DPRINTF(("\n%.*s---------------------\n"
435 "%.*sCall to internal_dfa_exec f=%d\n",
436 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
437
438 ctypes = md->tables + ctypes_offset;
439 lcc = md->tables + lcc_offset;
440 fcc = md->tables + fcc_offset;
441
442 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
443
444 active_states = (stateblock *)(workspace + 2);
445 next_new_state = new_states = active_states + wscount;
446 new_count = 0;
447
448 first_op = this_start_code + 1 + LINK_SIZE +
449 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
450 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
451 ? IMM2_SIZE:0);
452
453 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
454 the alternative states onto the list, and find out where the end is. This
455 makes it possible to use this function recursively, when we want to stop at a
456 matching internal ket rather than at the end.
457
458 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
459 a backward assertion. In that case, we have to find out the maximum amount to
460 move back, and set up each alternative appropriately. */
461
462 if (*first_op == OP_REVERSE)
463 {
464 int max_back = 0;
465 int gone_back;
466
467 end_code = this_start_code;
468 do
469 {
470 int back = GET(end_code, 2+LINK_SIZE);
471 if (back > max_back) max_back = back;
472 end_code += GET(end_code, 1);
473 }
474 while (*end_code == OP_ALT);
475
476 /* If we can't go back the amount required for the longest lookbehind
477 pattern, go back as far as we can; some alternatives may still be viable. */
478
479 #ifdef SUPPORT_UTF
480 /* In character mode we have to step back character by character */
481
482 if (utf)
483 {
484 for (gone_back = 0; gone_back < max_back; gone_back++)
485 {
486 if (current_subject <= start_subject) break;
487 current_subject--;
488 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
489 }
490 }
491 else
492 #endif
493
494 /* In byte-mode we can do this quickly. */
495
496 {
497 gone_back = (current_subject - max_back < start_subject)?
498 (int)(current_subject - start_subject) : max_back;
499 current_subject -= gone_back;
500 }
501
502 /* Save the earliest consulted character */
503
504 if (current_subject < md->start_used_ptr)
505 md->start_used_ptr = current_subject;
506
507 /* Now we can process the individual branches. */
508
509 end_code = this_start_code;
510 do
511 {
512 int back = GET(end_code, 2+LINK_SIZE);
513 if (back <= gone_back)
514 {
515 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
516 ADD_NEW_DATA(-bstate, 0, gone_back - back);
517 }
518 end_code += GET(end_code, 1);
519 }
520 while (*end_code == OP_ALT);
521 }
522
523 /* This is the code for a "normal" subpattern (not a backward assertion). The
524 start of a whole pattern is always one of these. If we are at the top level,
525 we may be asked to restart matching from the same point that we reached for a
526 previous partial match. We still have to scan through the top-level branches to
527 find the end state. */
528
529 else
530 {
531 end_code = this_start_code;
532
533 /* Restarting */
534
535 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
536 {
537 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
538 new_count = workspace[1];
539 if (!workspace[0])
540 memcpy(new_states, active_states, new_count * sizeof(stateblock));
541 }
542
543 /* Not restarting */
544
545 else
546 {
547 int length = 1 + LINK_SIZE +
548 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
549 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
550 ? IMM2_SIZE:0);
551 do
552 {
553 ADD_NEW((int)(end_code - start_code + length), 0);
554 end_code += GET(end_code, 1);
555 length = 1 + LINK_SIZE;
556 }
557 while (*end_code == OP_ALT);
558 }
559 }
560
561 workspace[0] = 0; /* Bit indicating which vector is current */
562
563 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
564
565 /* Loop for scanning the subject */
566
567 ptr = current_subject;
568 for (;;)
569 {
570 int i, j;
571 int clen, dlen;
572 unsigned int c, d;
573 int forced_fail = 0;
574 BOOL could_continue = FALSE;
575
576 /* Make the new state list into the active state list and empty the
577 new state list. */
578
579 temp_states = active_states;
580 active_states = new_states;
581 new_states = temp_states;
582 active_count = new_count;
583 new_count = 0;
584
585 workspace[0] ^= 1; /* Remember for the restarting feature */
586 workspace[1] = active_count;
587
588 #ifdef PCRE_DEBUG
589 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
590 pchars(ptr, STRLEN_UC(ptr), stdout);
591 printf("\"\n");
592
593 printf("%.*sActive states: ", rlevel*2-2, SP);
594 for (i = 0; i < active_count; i++)
595 printf("%d/%d ", active_states[i].offset, active_states[i].count);
596 printf("\n");
597 #endif
598
599 /* Set the pointers for adding new states */
600
601 next_active_state = active_states + active_count;
602 next_new_state = new_states;
603
604 /* Load the current character from the subject outside the loop, as many
605 different states may want to look at it, and we assume that at least one
606 will. */
607
608 if (ptr < end_subject)
609 {
610 clen = 1; /* Number of bytes in the character */
611 #ifdef SUPPORT_UTF
612 if (utf) { GETCHARLEN(c, ptr, clen); } else
613 #endif /* SUPPORT_UTF */
614 c = *ptr;
615 }
616 else
617 {
618 clen = 0; /* This indicates the end of the subject */
619 c = NOTACHAR; /* This value should never actually be used */
620 }
621
622 /* Scan up the active states and act on each one. The result of an action
623 may be to add more states to the currently active list (e.g. on hitting a
624 parenthesis) or it may be to put states on the new list, for considering
625 when we move the character pointer on. */
626
627 for (i = 0; i < active_count; i++)
628 {
629 stateblock *current_state = active_states + i;
630 BOOL caseless = FALSE;
631 const pcre_uchar *code;
632 int state_offset = current_state->offset;
633 int count, codevalue, rrc;
634
635 #ifdef PCRE_DEBUG
636 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
637 if (clen == 0) printf("EOL\n");
638 else if (c > 32 && c < 127) printf("'%c'\n", c);
639 else printf("0x%02x\n", c);
640 #endif
641
642 /* A negative offset is a special case meaning "hold off going to this
643 (negated) state until the number of characters in the data field have
644 been skipped". */
645
646 if (state_offset < 0)
647 {
648 if (current_state->data > 0)
649 {
650 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
651 ADD_NEW_DATA(state_offset, current_state->count,
652 current_state->data - 1);
653 continue;
654 }
655 else
656 {
657 current_state->offset = state_offset = -state_offset;
658 }
659 }
660
661 /* Check for a duplicate state with the same count, and skip if found.
662 See the note at the head of this module about the possibility of improving
663 performance here. */
664
665 for (j = 0; j < i; j++)
666 {
667 if (active_states[j].offset == state_offset &&
668 active_states[j].count == current_state->count)
669 {
670 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
671 goto NEXT_ACTIVE_STATE;
672 }
673 }
674
675 /* The state offset is the offset to the opcode */
676
677 code = start_code + state_offset;
678 codevalue = *code;
679
680 /* If this opcode inspects a character, but we are at the end of the
681 subject, remember the fact for use when testing for a partial match. */
682
683 if (clen == 0 && poptable[codevalue] != 0)
684 could_continue = TRUE;
685
686 /* If this opcode is followed by an inline character, load it. It is
687 tempting to test for the presence of a subject character here, but that
688 is wrong, because sometimes zero repetitions of the subject are
689 permitted.
690
691 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
692 argument that is not a data character - but is always one byte long. We
693 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
694 this case. To keep the other cases fast, convert these ones to new opcodes.
695 */
696
697 if (coptable[codevalue] > 0)
698 {
699 dlen = 1;
700 #ifdef SUPPORT_UTF
701 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
702 #endif /* SUPPORT_UTF */
703 d = code[coptable[codevalue]];
704 if (codevalue >= OP_TYPESTAR)
705 {
706 switch(d)
707 {
708 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
709 case OP_NOTPROP:
710 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
711 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
712 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
713 case OP_NOT_HSPACE:
714 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
715 case OP_NOT_VSPACE:
716 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
717 default: break;
718 }
719 }
720 }
721 else
722 {
723 dlen = 0; /* Not strictly necessary, but compilers moan */
724 d = NOTACHAR; /* if these variables are not set. */
725 }
726
727
728 /* Now process the individual opcodes */
729
730 switch (codevalue)
731 {
732 /* ========================================================================== */
733 /* These cases are never obeyed. This is a fudge that causes a compile-
734 time error if the vectors coptable or poptable, which are indexed by
735 opcode, are not the correct length. It seems to be the only way to do
736 such a check at compile time, as the sizeof() operator does not work
737 in the C preprocessor. */
738
739 case OP_TABLE_LENGTH:
740 case OP_TABLE_LENGTH +
741 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
742 (sizeof(poptable) == OP_TABLE_LENGTH)):
743 break;
744
745 /* ========================================================================== */
746 /* Reached a closing bracket. If not at the end of the pattern, carry
747 on with the next opcode. For repeating opcodes, also add the repeat
748 state. Note that KETRPOS will always be encountered at the end of the
749 subpattern, because the possessive subpattern repeats are always handled
750 using recursive calls. Thus, it never adds any new states.
751
752 At the end of the (sub)pattern, unless we have an empty string and
753 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
754 start of the subject, save the match data, shifting up all previous
755 matches so we always have the longest first. */
756
757 case OP_KET:
758 case OP_KETRMIN:
759 case OP_KETRMAX:
760 case OP_KETRPOS:
761 if (code != end_code)
762 {
763 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
764 if (codevalue != OP_KET)
765 {
766 ADD_ACTIVE(state_offset - GET(code, 1), 0);
767 }
768 }
769 else
770 {
771 if (ptr > current_subject ||
772 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
773 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
774 current_subject > start_subject + md->start_offset)))
775 {
776 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
777 else if (match_count > 0 && ++match_count * 2 > offsetcount)
778 match_count = 0;
779 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
780 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
781 if (offsetcount >= 2)
782 {
783 offsets[0] = (int)(current_subject - start_subject);
784 offsets[1] = (int)(ptr - start_subject);
785 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
786 offsets[1] - offsets[0], current_subject));
787 }
788 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
789 {
790 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
791 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
792 match_count, rlevel*2-2, SP));
793 return match_count;
794 }
795 }
796 }
797 break;
798
799 /* ========================================================================== */
800 /* These opcodes add to the current list of states without looking
801 at the current character. */
802
803 /*-----------------------------------------------------------------*/
804 case OP_ALT:
805 do { code += GET(code, 1); } while (*code == OP_ALT);
806 ADD_ACTIVE((int)(code - start_code), 0);
807 break;
808
809 /*-----------------------------------------------------------------*/
810 case OP_BRA:
811 case OP_SBRA:
812 do
813 {
814 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
815 code += GET(code, 1);
816 }
817 while (*code == OP_ALT);
818 break;
819
820 /*-----------------------------------------------------------------*/
821 case OP_CBRA:
822 case OP_SCBRA:
823 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
824 code += GET(code, 1);
825 while (*code == OP_ALT)
826 {
827 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
828 code += GET(code, 1);
829 }
830 break;
831
832 /*-----------------------------------------------------------------*/
833 case OP_BRAZERO:
834 case OP_BRAMINZERO:
835 ADD_ACTIVE(state_offset + 1, 0);
836 code += 1 + GET(code, 2);
837 while (*code == OP_ALT) code += GET(code, 1);
838 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
839 break;
840
841 /*-----------------------------------------------------------------*/
842 case OP_SKIPZERO:
843 code += 1 + GET(code, 2);
844 while (*code == OP_ALT) code += GET(code, 1);
845 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
846 break;
847
848 /*-----------------------------------------------------------------*/
849 case OP_CIRC:
850 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
851 { ADD_ACTIVE(state_offset + 1, 0); }
852 break;
853
854 /*-----------------------------------------------------------------*/
855 case OP_CIRCM:
856 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
857 (ptr != end_subject && WAS_NEWLINE(ptr)))
858 { ADD_ACTIVE(state_offset + 1, 0); }
859 break;
860
861 /*-----------------------------------------------------------------*/
862 case OP_EOD:
863 if (ptr >= end_subject)
864 {
865 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
866 could_continue = TRUE;
867 else { ADD_ACTIVE(state_offset + 1, 0); }
868 }
869 break;
870
871 /*-----------------------------------------------------------------*/
872 case OP_SOD:
873 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
874 break;
875
876 /*-----------------------------------------------------------------*/
877 case OP_SOM:
878 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
879 break;
880
881
882 /* ========================================================================== */
883 /* These opcodes inspect the next subject character, and sometimes
884 the previous one as well, but do not have an argument. The variable
885 clen contains the length of the current character and is zero if we are
886 at the end of the subject. */
887
888 /*-----------------------------------------------------------------*/
889 case OP_ANY:
890 if (clen > 0 && !IS_NEWLINE(ptr))
891 { ADD_NEW(state_offset + 1, 0); }
892 break;
893
894 /*-----------------------------------------------------------------*/
895 case OP_ALLANY:
896 if (clen > 0)
897 { ADD_NEW(state_offset + 1, 0); }
898 break;
899
900 /*-----------------------------------------------------------------*/
901 case OP_EODN:
902 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
903 could_continue = TRUE;
904 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
905 { ADD_ACTIVE(state_offset + 1, 0); }
906 break;
907
908 /*-----------------------------------------------------------------*/
909 case OP_DOLL:
910 if ((md->moptions & PCRE_NOTEOL) == 0)
911 {
912 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
913 could_continue = TRUE;
914 else if (clen == 0 ||
915 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
916 (ptr == end_subject - md->nllen)
917 ))
918 { ADD_ACTIVE(state_offset + 1, 0); }
919 }
920 break;
921
922 /*-----------------------------------------------------------------*/
923 case OP_DOLLM:
924 if ((md->moptions & PCRE_NOTEOL) == 0)
925 {
926 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
927 could_continue = TRUE;
928 else if (clen == 0 ||
929 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
930 { ADD_ACTIVE(state_offset + 1, 0); }
931 }
932 else if (IS_NEWLINE(ptr))
933 { ADD_ACTIVE(state_offset + 1, 0); }
934 break;
935
936 /*-----------------------------------------------------------------*/
937
938 case OP_DIGIT:
939 case OP_WHITESPACE:
940 case OP_WORDCHAR:
941 if (clen > 0 && c < 256 &&
942 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
943 { ADD_NEW(state_offset + 1, 0); }
944 break;
945
946 /*-----------------------------------------------------------------*/
947 case OP_NOT_DIGIT:
948 case OP_NOT_WHITESPACE:
949 case OP_NOT_WORDCHAR:
950 if (clen > 0 && (c >= 256 ||
951 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
952 { ADD_NEW(state_offset + 1, 0); }
953 break;
954
955 /*-----------------------------------------------------------------*/
956 case OP_WORD_BOUNDARY:
957 case OP_NOT_WORD_BOUNDARY:
958 {
959 int left_word, right_word;
960
961 if (ptr > start_subject)
962 {
963 const pcre_uchar *temp = ptr - 1;
964 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
965 #ifdef SUPPORT_UTF
966 if (utf) { BACKCHAR(temp); }
967 #endif
968 GETCHARTEST(d, temp);
969 #ifdef SUPPORT_UCP
970 if ((md->poptions & PCRE_UCP) != 0)
971 {
972 if (d == '_') left_word = TRUE; else
973 {
974 int cat = UCD_CATEGORY(d);
975 left_word = (cat == ucp_L || cat == ucp_N);
976 }
977 }
978 else
979 #endif
980 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
981 }
982 else left_word = FALSE;
983
984 if (clen > 0)
985 {
986 #ifdef SUPPORT_UCP
987 if ((md->poptions & PCRE_UCP) != 0)
988 {
989 if (c == '_') right_word = TRUE; else
990 {
991 int cat = UCD_CATEGORY(c);
992 right_word = (cat == ucp_L || cat == ucp_N);
993 }
994 }
995 else
996 #endif
997 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
998 }
999 else right_word = FALSE;
1000
1001 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1002 { ADD_ACTIVE(state_offset + 1, 0); }
1003 }
1004 break;
1005
1006
1007 /*-----------------------------------------------------------------*/
1008 /* Check the next character by Unicode property. We will get here only
1009 if the support is in the binary; otherwise a compile-time error occurs.
1010 */
1011
1012 #ifdef SUPPORT_UCP
1013 case OP_PROP:
1014 case OP_NOTPROP:
1015 if (clen > 0)
1016 {
1017 BOOL OK;
1018 const ucd_record * prop = GET_UCD(c);
1019 switch(code[1])
1020 {
1021 case PT_ANY:
1022 OK = TRUE;
1023 break;
1024
1025 case PT_LAMP:
1026 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1027 prop->chartype == ucp_Lt;
1028 break;
1029
1030 case PT_GC:
1031 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1032 break;
1033
1034 case PT_PC:
1035 OK = prop->chartype == code[2];
1036 break;
1037
1038 case PT_SC:
1039 OK = prop->script == code[2];
1040 break;
1041
1042 /* These are specials for combination cases. */
1043
1044 case PT_ALNUM:
1045 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1046 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1047 break;
1048
1049 case PT_SPACE: /* Perl space */
1050 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1051 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1052 break;
1053
1054 case PT_PXSPACE: /* POSIX space */
1055 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1056 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1057 c == CHAR_FF || c == CHAR_CR;
1058 break;
1059
1060 case PT_WORD:
1061 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1062 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1063 c == CHAR_UNDERSCORE;
1064 break;
1065
1066 /* Should never occur, but keep compilers from grumbling. */
1067
1068 default:
1069 OK = codevalue != OP_PROP;
1070 break;
1071 }
1072
1073 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1074 }
1075 break;
1076 #endif
1077
1078
1079
1080 /* ========================================================================== */
1081 /* These opcodes likewise inspect the subject character, but have an
1082 argument that is not a data character. It is one of these opcodes:
1083 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
1084 OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1085
1086 case OP_TYPEPLUS:
1087 case OP_TYPEMINPLUS:
1088 case OP_TYPEPOSPLUS:
1089 count = current_state->count; /* Already matched */
1090 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1091 if (clen > 0)
1092 {
1093 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1094 (c < 256 &&
1095 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1096 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1097 {
1098 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1099 {
1100 active_count--; /* Remove non-match possibility */
1101 next_active_state--;
1102 }
1103 count++;
1104 ADD_NEW(state_offset, count);
1105 }
1106 }
1107 break;
1108
1109 /*-----------------------------------------------------------------*/
1110 case OP_TYPEQUERY:
1111 case OP_TYPEMINQUERY:
1112 case OP_TYPEPOSQUERY:
1113 ADD_ACTIVE(state_offset + 2, 0);
1114 if (clen > 0)
1115 {
1116 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1117 (c < 256 &&
1118 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1119 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1120 {
1121 if (codevalue == OP_TYPEPOSQUERY)
1122 {
1123 active_count--; /* Remove non-match possibility */
1124 next_active_state--;
1125 }
1126 ADD_NEW(state_offset + 2, 0);
1127 }
1128 }
1129 break;
1130
1131 /*-----------------------------------------------------------------*/
1132 case OP_TYPESTAR:
1133 case OP_TYPEMINSTAR:
1134 case OP_TYPEPOSSTAR:
1135 ADD_ACTIVE(state_offset + 2, 0);
1136 if (clen > 0)
1137 {
1138 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1139 (c < 256 &&
1140 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1141 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1142 {
1143 if (codevalue == OP_TYPEPOSSTAR)
1144 {
1145 active_count--; /* Remove non-match possibility */
1146 next_active_state--;
1147 }
1148 ADD_NEW(state_offset, 0);
1149 }
1150 }
1151 break;
1152
1153 /*-----------------------------------------------------------------*/
1154 case OP_TYPEEXACT:
1155 count = current_state->count; /* Number already matched */
1156 if (clen > 0)
1157 {
1158 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1159 (c < 256 &&
1160 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1161 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1162 {
1163 if (++count >= GET2(code, 1))
1164 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1165 else
1166 { ADD_NEW(state_offset, count); }
1167 }
1168 }
1169 break;
1170
1171 /*-----------------------------------------------------------------*/
1172 case OP_TYPEUPTO:
1173 case OP_TYPEMINUPTO:
1174 case OP_TYPEPOSUPTO:
1175 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1176 count = current_state->count; /* Number already matched */
1177 if (clen > 0)
1178 {
1179 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1180 (c < 256 &&
1181 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1182 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1183 {
1184 if (codevalue == OP_TYPEPOSUPTO)
1185 {
1186 active_count--; /* Remove non-match possibility */
1187 next_active_state--;
1188 }
1189 if (++count >= GET2(code, 1))
1190 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1191 else
1192 { ADD_NEW(state_offset, count); }
1193 }
1194 }
1195 break;
1196
1197 /* ========================================================================== */
1198 /* These are virtual opcodes that are used when something like
1199 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1200 OP_VSPACE/OP_HSPACE opcodes as its argument. It keeps the code above fast
1201 for the other cases. The argument is in the d variable. */
1202
1203 #ifdef SUPPORT_UCP
1204 case OP_PROP_EXTRA + OP_TYPEPLUS:
1205 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1206 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1207 count = current_state->count; /* Already matched */
1208 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1209 if (clen > 0)
1210 {
1211 BOOL OK;
1212 const ucd_record * prop = GET_UCD(c);
1213 switch(code[2])
1214 {
1215 case PT_ANY:
1216 OK = TRUE;
1217 break;
1218
1219 case PT_LAMP:
1220 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1221 prop->chartype == ucp_Lt;
1222 break;
1223
1224 case PT_GC:
1225 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1226 break;
1227
1228 case PT_PC:
1229 OK = prop->chartype == code[3];
1230 break;
1231
1232 case PT_SC:
1233 OK = prop->script == code[3];
1234 break;
1235
1236 /* These are specials for combination cases. */
1237
1238 case PT_ALNUM:
1239 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1240 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1241 break;
1242
1243 case PT_SPACE: /* Perl space */
1244 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1245 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1246 break;
1247
1248 case PT_PXSPACE: /* POSIX space */
1249 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1250 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1251 c == CHAR_FF || c == CHAR_CR;
1252 break;
1253
1254 case PT_WORD:
1255 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1256 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1257 c == CHAR_UNDERSCORE;
1258 break;
1259
1260 /* Should never occur, but keep compilers from grumbling. */
1261
1262 default:
1263 OK = codevalue != OP_PROP;
1264 break;
1265 }
1266
1267 if (OK == (d == OP_PROP))
1268 {
1269 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1270 {
1271 active_count--; /* Remove non-match possibility */
1272 next_active_state--;
1273 }
1274 count++;
1275 ADD_NEW(state_offset, count);
1276 }
1277 }
1278 break;
1279
1280 /*-----------------------------------------------------------------*/
1281 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1282 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1283 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1284 count = current_state->count; /* Already matched */
1285 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1286 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1287 {
1288 const pcre_uchar *nptr = ptr + clen;
1289 int ncount = 0;
1290 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1291 {
1292 active_count--; /* Remove non-match possibility */
1293 next_active_state--;
1294 }
1295 while (nptr < end_subject)
1296 {
1297 int nd;
1298 int ndlen = 1;
1299 GETCHARLEN(nd, nptr, ndlen);
1300 if (UCD_CATEGORY(nd) != ucp_M) break;
1301 ncount++;
1302 nptr += ndlen;
1303 }
1304 count++;
1305 ADD_NEW_DATA(-state_offset, count, ncount);
1306 }
1307 break;
1308 #endif
1309
1310 /*-----------------------------------------------------------------*/
1311 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1312 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1313 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1314 count = current_state->count; /* Already matched */
1315 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1316 if (clen > 0)
1317 {
1318 int ncount = 0;
1319 switch (c)
1320 {
1321 case 0x000b:
1322 case 0x000c:
1323 case 0x0085:
1324 case 0x2028:
1325 case 0x2029:
1326 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1327 goto ANYNL01;
1328
1329 case 0x000d:
1330 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1331 /* Fall through */
1332
1333 ANYNL01:
1334 case 0x000a:
1335 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1336 {
1337 active_count--; /* Remove non-match possibility */
1338 next_active_state--;
1339 }
1340 count++;
1341 ADD_NEW_DATA(-state_offset, count, ncount);
1342 break;
1343
1344 default:
1345 break;
1346 }
1347 }
1348 break;
1349
1350 /*-----------------------------------------------------------------*/
1351 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1352 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1353 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1354 count = current_state->count; /* Already matched */
1355 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1356 if (clen > 0)
1357 {
1358 BOOL OK;
1359 switch (c)
1360 {
1361 case 0x000a:
1362 case 0x000b:
1363 case 0x000c:
1364 case 0x000d:
1365 case 0x0085:
1366 case 0x2028:
1367 case 0x2029:
1368 OK = TRUE;
1369 break;
1370
1371 default:
1372 OK = FALSE;
1373 break;
1374 }
1375
1376 if (OK == (d == OP_VSPACE))
1377 {
1378 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1379 {
1380 active_count--; /* Remove non-match possibility */
1381 next_active_state--;
1382 }
1383 count++;
1384 ADD_NEW_DATA(-state_offset, count, 0);
1385 }
1386 }
1387 break;
1388
1389 /*-----------------------------------------------------------------*/
1390 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1391 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1392 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1393 count = current_state->count; /* Already matched */
1394 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1395 if (clen > 0)
1396 {
1397 BOOL OK;
1398 switch (c)
1399 {
1400 case 0x09: /* HT */
1401 case 0x20: /* SPACE */
1402 case 0xa0: /* NBSP */
1403 case 0x1680: /* OGHAM SPACE MARK */
1404 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1405 case 0x2000: /* EN QUAD */
1406 case 0x2001: /* EM QUAD */
1407 case 0x2002: /* EN SPACE */
1408 case 0x2003: /* EM SPACE */
1409 case 0x2004: /* THREE-PER-EM SPACE */
1410 case 0x2005: /* FOUR-PER-EM SPACE */
1411 case 0x2006: /* SIX-PER-EM SPACE */
1412 case 0x2007: /* FIGURE SPACE */
1413 case 0x2008: /* PUNCTUATION SPACE */
1414 case 0x2009: /* THIN SPACE */
1415 case 0x200A: /* HAIR SPACE */
1416 case 0x202f: /* NARROW NO-BREAK SPACE */
1417 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1418 case 0x3000: /* IDEOGRAPHIC SPACE */
1419 OK = TRUE;
1420 break;
1421
1422 default:
1423 OK = FALSE;
1424 break;
1425 }
1426
1427 if (OK == (d == OP_HSPACE))
1428 {
1429 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1430 {
1431 active_count--; /* Remove non-match possibility */
1432 next_active_state--;
1433 }
1434 count++;
1435 ADD_NEW_DATA(-state_offset, count, 0);
1436 }
1437 }
1438 break;
1439
1440 /*-----------------------------------------------------------------*/
1441 #ifdef SUPPORT_UCP
1442 case OP_PROP_EXTRA + OP_TYPEQUERY:
1443 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1444 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1445 count = 4;
1446 goto QS1;
1447
1448 case OP_PROP_EXTRA + OP_TYPESTAR:
1449 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1450 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1451 count = 0;
1452
1453 QS1:
1454
1455 ADD_ACTIVE(state_offset + 4, 0);
1456 if (clen > 0)
1457 {
1458 BOOL OK;
1459 const ucd_record * prop = GET_UCD(c);
1460 switch(code[2])
1461 {
1462 case PT_ANY:
1463 OK = TRUE;
1464 break;
1465
1466 case PT_LAMP:
1467 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1468 prop->chartype == ucp_Lt;
1469 break;
1470
1471 case PT_GC:
1472 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1473 break;
1474
1475 case PT_PC:
1476 OK = prop->chartype == code[3];
1477 break;
1478
1479 case PT_SC:
1480 OK = prop->script == code[3];
1481 break;
1482
1483 /* These are specials for combination cases. */
1484
1485 case PT_ALNUM:
1486 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1487 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1488 break;
1489
1490 case PT_SPACE: /* Perl space */
1491 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1492 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1493 break;
1494
1495 case PT_PXSPACE: /* POSIX space */
1496 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1497 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1498 c == CHAR_FF || c == CHAR_CR;
1499 break;
1500
1501 case PT_WORD:
1502 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1503 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1504 c == CHAR_UNDERSCORE;
1505 break;
1506
1507 /* Should never occur, but keep compilers from grumbling. */
1508
1509 default:
1510 OK = codevalue != OP_PROP;
1511 break;
1512 }
1513
1514 if (OK == (d == OP_PROP))
1515 {
1516 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1517 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1518 {
1519 active_count--; /* Remove non-match possibility */
1520 next_active_state--;
1521 }
1522 ADD_NEW(state_offset + count, 0);
1523 }
1524 }
1525 break;
1526
1527 /*-----------------------------------------------------------------*/
1528 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1529 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1530 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1531 count = 2;
1532 goto QS2;
1533
1534 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1535 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1536 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1537 count = 0;
1538
1539 QS2:
1540
1541 ADD_ACTIVE(state_offset + 2, 0);
1542 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1543 {
1544 const pcre_uchar *nptr = ptr + clen;
1545 int ncount = 0;
1546 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1547 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1548 {
1549 active_count--; /* Remove non-match possibility */
1550 next_active_state--;
1551 }
1552 while (nptr < end_subject)
1553 {
1554 int nd;
1555 int ndlen = 1;
1556 GETCHARLEN(nd, nptr, ndlen);
1557 if (UCD_CATEGORY(nd) != ucp_M) break;
1558 ncount++;
1559 nptr += ndlen;
1560 }
1561 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1562 }
1563 break;
1564 #endif
1565
1566 /*-----------------------------------------------------------------*/
1567 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1568 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1569 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1570 count = 2;
1571 goto QS3;
1572
1573 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1574 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1575 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1576 count = 0;
1577
1578 QS3:
1579 ADD_ACTIVE(state_offset + 2, 0);
1580 if (clen > 0)
1581 {
1582 int ncount = 0;
1583 switch (c)
1584 {
1585 case 0x000b:
1586 case 0x000c:
1587 case 0x0085:
1588 case 0x2028:
1589 case 0x2029:
1590 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1591 goto ANYNL02;
1592
1593 case 0x000d:
1594 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1595 /* Fall through */
1596
1597 ANYNL02:
1598 case 0x000a:
1599 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1600 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1601 {
1602 active_count--; /* Remove non-match possibility */
1603 next_active_state--;
1604 }
1605 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1606 break;
1607
1608 default:
1609 break;
1610 }
1611 }
1612 break;
1613
1614 /*-----------------------------------------------------------------*/
1615 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1616 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1617 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1618 count = 2;
1619 goto QS4;
1620
1621 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1622 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1623 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1624 count = 0;
1625
1626 QS4:
1627 ADD_ACTIVE(state_offset + 2, 0);
1628 if (clen > 0)
1629 {
1630 BOOL OK;
1631 switch (c)
1632 {
1633 case 0x000a:
1634 case 0x000b:
1635 case 0x000c:
1636 case 0x000d:
1637 case 0x0085:
1638 case 0x2028:
1639 case 0x2029:
1640 OK = TRUE;
1641 break;
1642
1643 default:
1644 OK = FALSE;
1645 break;
1646 }
1647 if (OK == (d == OP_VSPACE))
1648 {
1649 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1650 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1651 {
1652 active_count--; /* Remove non-match possibility */
1653 next_active_state--;
1654 }
1655 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1656 }
1657 }
1658 break;
1659
1660 /*-----------------------------------------------------------------*/
1661 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1662 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1663 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1664 count = 2;
1665 goto QS5;
1666
1667 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1668 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1669 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1670 count = 0;
1671
1672 QS5:
1673 ADD_ACTIVE(state_offset + 2, 0);
1674 if (clen > 0)
1675 {
1676 BOOL OK;
1677 switch (c)
1678 {
1679 case 0x09: /* HT */
1680 case 0x20: /* SPACE */
1681 case 0xa0: /* NBSP */
1682 case 0x1680: /* OGHAM SPACE MARK */
1683 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1684 case 0x2000: /* EN QUAD */
1685 case 0x2001: /* EM QUAD */
1686 case 0x2002: /* EN SPACE */
1687 case 0x2003: /* EM SPACE */
1688 case 0x2004: /* THREE-PER-EM SPACE */
1689 case 0x2005: /* FOUR-PER-EM SPACE */
1690 case 0x2006: /* SIX-PER-EM SPACE */
1691 case 0x2007: /* FIGURE SPACE */
1692 case 0x2008: /* PUNCTUATION SPACE */
1693 case 0x2009: /* THIN SPACE */
1694 case 0x200A: /* HAIR SPACE */
1695 case 0x202f: /* NARROW NO-BREAK SPACE */
1696 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1697 case 0x3000: /* IDEOGRAPHIC SPACE */
1698 OK = TRUE;
1699 break;
1700
1701 default:
1702 OK = FALSE;
1703 break;
1704 }
1705
1706 if (OK == (d == OP_HSPACE))
1707 {
1708 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1709 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1710 {
1711 active_count--; /* Remove non-match possibility */
1712 next_active_state--;
1713 }
1714 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1715 }
1716 }
1717 break;
1718
1719 /*-----------------------------------------------------------------*/
1720 #ifdef SUPPORT_UCP
1721 case OP_PROP_EXTRA + OP_TYPEEXACT:
1722 case OP_PROP_EXTRA + OP_TYPEUPTO:
1723 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1724 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1725 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1726 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1727 count = current_state->count; /* Number already matched */
1728 if (clen > 0)
1729 {
1730 BOOL OK;
1731 const ucd_record * prop = GET_UCD(c);
1732 switch(code[1 + IMM2_SIZE + 1])
1733 {
1734 case PT_ANY:
1735 OK = TRUE;
1736 break;
1737
1738 case PT_LAMP:
1739 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1740 prop->chartype == ucp_Lt;
1741 break;
1742
1743 case PT_GC:
1744 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1745 break;
1746
1747 case PT_PC:
1748 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1749 break;
1750
1751 case PT_SC:
1752 OK = prop->script == code[1 + IMM2_SIZE + 2];
1753 break;
1754
1755 /* These are specials for combination cases. */
1756
1757 case PT_ALNUM:
1758 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1759 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1760 break;
1761
1762 case PT_SPACE: /* Perl space */
1763 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1764 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1765 break;
1766
1767 case PT_PXSPACE: /* POSIX space */
1768 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1769 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1770 c == CHAR_FF || c == CHAR_CR;
1771 break;
1772
1773 case PT_WORD:
1774 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1775 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1776 c == CHAR_UNDERSCORE;
1777 break;
1778
1779 /* Should never occur, but keep compilers from grumbling. */
1780
1781 default:
1782 OK = codevalue != OP_PROP;
1783 break;
1784 }
1785
1786 if (OK == (d == OP_PROP))
1787 {
1788 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1789 {
1790 active_count--; /* Remove non-match possibility */
1791 next_active_state--;
1792 }
1793 if (++count >= GET2(code, 1))
1794 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1795 else
1796 { ADD_NEW(state_offset, count); }
1797 }
1798 }
1799 break;
1800
1801 /*-----------------------------------------------------------------*/
1802 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1803 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1804 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1805 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1806 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1807 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1808 count = current_state->count; /* Number already matched */
1809 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1810 {
1811 const pcre_uchar *nptr = ptr + clen;
1812 int ncount = 0;
1813 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1814 {
1815 active_count--; /* Remove non-match possibility */
1816 next_active_state--;
1817 }
1818 while (nptr < end_subject)
1819 {
1820 int nd;
1821 int ndlen = 1;
1822 GETCHARLEN(nd, nptr, ndlen);
1823 if (UCD_CATEGORY(nd) != ucp_M) break;
1824 ncount++;
1825 nptr += ndlen;
1826 }
1827 if (++count >= GET2(code, 1))
1828 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1829 else
1830 { ADD_NEW_DATA(-state_offset, count, ncount); }
1831 }
1832 break;
1833 #endif
1834
1835 /*-----------------------------------------------------------------*/
1836 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1837 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1838 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1839 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1840 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1841 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1842 count = current_state->count; /* Number already matched */
1843 if (clen > 0)
1844 {
1845 int ncount = 0;
1846 switch (c)
1847 {
1848 case 0x000b:
1849 case 0x000c:
1850 case 0x0085:
1851 case 0x2028:
1852 case 0x2029:
1853 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1854 goto ANYNL03;
1855
1856 case 0x000d:
1857 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1858 /* Fall through */
1859
1860 ANYNL03:
1861 case 0x000a:
1862 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1863 {
1864 active_count--; /* Remove non-match possibility */
1865 next_active_state--;
1866 }
1867 if (++count >= GET2(code, 1))
1868 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1869 else
1870 { ADD_NEW_DATA(-state_offset, count, ncount); }
1871 break;
1872
1873 default:
1874 break;
1875 }
1876 }
1877 break;
1878
1879 /*-----------------------------------------------------------------*/
1880 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1881 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1882 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1883 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1884 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1885 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1886 count = current_state->count; /* Number already matched */
1887 if (clen > 0)
1888 {
1889 BOOL OK;
1890 switch (c)
1891 {
1892 case 0x000a:
1893 case 0x000b:
1894 case 0x000c:
1895 case 0x000d:
1896 case 0x0085:
1897 case 0x2028:
1898 case 0x2029:
1899 OK = TRUE;
1900 break;
1901
1902 default:
1903 OK = FALSE;
1904 }
1905
1906 if (OK == (d == OP_VSPACE))
1907 {
1908 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1909 {
1910 active_count--; /* Remove non-match possibility */
1911 next_active_state--;
1912 }
1913 if (++count >= GET2(code, 1))
1914 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1915 else
1916 { ADD_NEW_DATA(-state_offset, count, 0); }
1917 }
1918 }
1919 break;
1920
1921 /*-----------------------------------------------------------------*/
1922 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1923 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1924 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1925 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1926 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1927 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1928 count = current_state->count; /* Number already matched */
1929 if (clen > 0)
1930 {
1931 BOOL OK;
1932 switch (c)
1933 {
1934 case 0x09: /* HT */
1935 case 0x20: /* SPACE */
1936 case 0xa0: /* NBSP */
1937 case 0x1680: /* OGHAM SPACE MARK */
1938 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1939 case 0x2000: /* EN QUAD */
1940 case 0x2001: /* EM QUAD */
1941 case 0x2002: /* EN SPACE */
1942 case 0x2003: /* EM SPACE */
1943 case 0x2004: /* THREE-PER-EM SPACE */
1944 case 0x2005: /* FOUR-PER-EM SPACE */
1945 case 0x2006: /* SIX-PER-EM SPACE */
1946 case 0x2007: /* FIGURE SPACE */
1947 case 0x2008: /* PUNCTUATION SPACE */
1948 case 0x2009: /* THIN SPACE */
1949 case 0x200A: /* HAIR SPACE */
1950 case 0x202f: /* NARROW NO-BREAK SPACE */
1951 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1952 case 0x3000: /* IDEOGRAPHIC SPACE */
1953 OK = TRUE;
1954 break;
1955
1956 default:
1957 OK = FALSE;
1958 break;
1959 }
1960
1961 if (OK == (d == OP_HSPACE))
1962 {
1963 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1964 {
1965 active_count--; /* Remove non-match possibility */
1966 next_active_state--;
1967 }
1968 if (++count >= GET2(code, 1))
1969 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1970 else
1971 { ADD_NEW_DATA(-state_offset, count, 0); }
1972 }
1973 }
1974 break;
1975
1976 /* ========================================================================== */
1977 /* These opcodes are followed by a character that is usually compared
1978 to the current subject character; it is loaded into d. We still get
1979 here even if there is no subject character, because in some cases zero
1980 repetitions are permitted. */
1981
1982 /*-----------------------------------------------------------------*/
1983 case OP_CHAR:
1984 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1985 break;
1986
1987 /*-----------------------------------------------------------------*/
1988 case OP_CHARI:
1989 if (clen == 0) break;
1990
1991 #ifdef SUPPORT_UTF
1992 if (utf)
1993 {
1994 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1995 {
1996 unsigned int othercase;
1997 if (c < 128)
1998 othercase = fcc[c];
1999 else
2000 /* If we have Unicode property support, we can use it to test the
2001 other case of the character. */
2002 #ifdef SUPPORT_UCP
2003 othercase = UCD_OTHERCASE(c);
2004 #else
2005 othercase = NOTACHAR;
2006 #endif
2007
2008 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2009 }
2010 }
2011 else
2012 #endif /* SUPPORT_UTF */
2013 /* Not UTF mode */
2014 {
2015 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2016 }
2017 break;
2018
2019
2020 #ifdef SUPPORT_UCP
2021 /*-----------------------------------------------------------------*/
2022 /* This is a tricky one because it can match more than one character.
2023 Find out how many characters to skip, and then set up a negative state
2024 to wait for them to pass before continuing. */
2025
2026 case OP_EXTUNI:
2027 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2028 {
2029 const pcre_uchar *nptr = ptr + clen;
2030 int ncount = 0;
2031 while (nptr < end_subject)
2032 {
2033 int nclen = 1;
2034 GETCHARLEN(c, nptr, nclen);
2035 if (UCD_CATEGORY(c) != ucp_M) break;
2036 ncount++;
2037 nptr += nclen;
2038 }
2039 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2040 }
2041 break;
2042 #endif
2043
2044 /*-----------------------------------------------------------------*/
2045 /* This is tricky, like EXTUNI, because it too can match more than one
2046 character (when CR is followed by LF). In this case, set up a negative
2047 state to wait for one character to pass before continuing. */
2048
2049 case OP_ANYNL:
2050 if (clen > 0) switch(c)
2051 {
2052 case 0x000b:
2053 case 0x000c:
2054 case 0x0085:
2055 case 0x2028:
2056 case 0x2029:
2057 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2058
2059 case 0x000a:
2060 ADD_NEW(state_offset + 1, 0);
2061 break;
2062
2063 case 0x000d:
2064 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2065 {
2066 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2067 }
2068 else
2069 {
2070 ADD_NEW(state_offset + 1, 0);
2071 }
2072 break;
2073 }
2074 break;
2075
2076 /*-----------------------------------------------------------------*/
2077 case OP_NOT_VSPACE:
2078 if (clen > 0) switch(c)
2079 {
2080 case 0x000a:
2081 case 0x000b:
2082 case 0x000c:
2083 case 0x000d:
2084 case 0x0085:
2085 case 0x2028:
2086 case 0x2029:
2087 break;
2088
2089 default:
2090 ADD_NEW(state_offset + 1, 0);
2091 break;
2092 }
2093 break;
2094
2095 /*-----------------------------------------------------------------*/
2096 case OP_VSPACE:
2097 if (clen > 0) switch(c)
2098 {
2099 case 0x000a:
2100 case 0x000b:
2101 case 0x000c:
2102 case 0x000d:
2103 case 0x0085:
2104 case 0x2028:
2105 case 0x2029:
2106 ADD_NEW(state_offset + 1, 0);
2107 break;
2108
2109 default: break;
2110 }
2111 break;
2112
2113 /*-----------------------------------------------------------------*/
2114 case OP_NOT_HSPACE:
2115 if (clen > 0) switch(c)
2116 {
2117 case 0x09: /* HT */
2118 case 0x20: /* SPACE */
2119 case 0xa0: /* NBSP */
2120 case 0x1680: /* OGHAM SPACE MARK */
2121 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2122 case 0x2000: /* EN QUAD */
2123 case 0x2001: /* EM QUAD */
2124 case 0x2002: /* EN SPACE */
2125 case 0x2003: /* EM SPACE */
2126 case 0x2004: /* THREE-PER-EM SPACE */
2127 case 0x2005: /* FOUR-PER-EM SPACE */
2128 case 0x2006: /* SIX-PER-EM SPACE */
2129 case 0x2007: /* FIGURE SPACE */
2130 case 0x2008: /* PUNCTUATION SPACE */
2131 case 0x2009: /* THIN SPACE */
2132 case 0x200A: /* HAIR SPACE */
2133 case 0x202f: /* NARROW NO-BREAK SPACE */
2134 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2135 case 0x3000: /* IDEOGRAPHIC SPACE */
2136 break;
2137
2138 default:
2139 ADD_NEW(state_offset + 1, 0);
2140 break;
2141 }
2142 break;
2143
2144 /*-----------------------------------------------------------------*/
2145 case OP_HSPACE:
2146 if (clen > 0) switch(c)
2147 {
2148 case 0x09: /* HT */
2149 case 0x20: /* SPACE */
2150 case 0xa0: /* NBSP */
2151 case 0x1680: /* OGHAM SPACE MARK */
2152 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2153 case 0x2000: /* EN QUAD */
2154 case 0x2001: /* EM QUAD */
2155 case 0x2002: /* EN SPACE */
2156 case 0x2003: /* EM SPACE */
2157 case 0x2004: /* THREE-PER-EM SPACE */
2158 case 0x2005: /* FOUR-PER-EM SPACE */
2159 case 0x2006: /* SIX-PER-EM SPACE */
2160 case 0x2007: /* FIGURE SPACE */
2161 case 0x2008: /* PUNCTUATION SPACE */
2162 case 0x2009: /* THIN SPACE */
2163 case 0x200A: /* HAIR SPACE */
2164 case 0x202f: /* NARROW NO-BREAK SPACE */
2165 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2166 case 0x3000: /* IDEOGRAPHIC SPACE */
2167 ADD_NEW(state_offset + 1, 0);
2168 break;
2169 }
2170 break;
2171
2172 /*-----------------------------------------------------------------*/
2173 /* Match a negated single character casefully. This is only used for
2174 one-byte characters, that is, we know that d < 256. The character we are
2175 checking (c) can be multibyte. */
2176
2177 case OP_NOT:
2178 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2179 break;
2180
2181 /*-----------------------------------------------------------------*/
2182 /* Match a negated single character caselessly. This is only used for
2183 one-byte characters, that is, we know that d < 256. The character we are
2184 checking (c) can be multibyte. */
2185
2186 case OP_NOTI:
2187 if (clen > 0 && c != d && c != fcc[d])
2188 { ADD_NEW(state_offset + dlen + 1, 0); }
2189 break;
2190
2191 /*-----------------------------------------------------------------*/
2192 case OP_PLUSI:
2193 case OP_MINPLUSI:
2194 case OP_POSPLUSI:
2195 case OP_NOTPLUSI:
2196 case OP_NOTMINPLUSI:
2197 case OP_NOTPOSPLUSI:
2198 caseless = TRUE;
2199 codevalue -= OP_STARI - OP_STAR;
2200
2201 /* Fall through */
2202 case OP_PLUS:
2203 case OP_MINPLUS:
2204 case OP_POSPLUS:
2205 case OP_NOTPLUS:
2206 case OP_NOTMINPLUS:
2207 case OP_NOTPOSPLUS:
2208 count = current_state->count; /* Already matched */
2209 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2210 if (clen > 0)
2211 {
2212 unsigned int otherd = NOTACHAR;
2213 if (caseless)
2214 {
2215 #ifdef SUPPORT_UTF
2216 if (utf && d >= 128)
2217 {
2218 #ifdef SUPPORT_UCP
2219 otherd = UCD_OTHERCASE(d);
2220 #endif /* SUPPORT_UCP */
2221 }
2222 else
2223 #endif /* SUPPORT_UTF */
2224 otherd = fcc[d];
2225 }
2226 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2227 {
2228 if (count > 0 &&
2229 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2230 {
2231 active_count--; /* Remove non-match possibility */
2232 next_active_state--;
2233 }
2234 count++;
2235 ADD_NEW(state_offset, count);
2236 }
2237 }
2238 break;
2239
2240 /*-----------------------------------------------------------------*/
2241 case OP_QUERYI:
2242 case OP_MINQUERYI:
2243 case OP_POSQUERYI:
2244 case OP_NOTQUERYI:
2245 case OP_NOTMINQUERYI:
2246 case OP_NOTPOSQUERYI:
2247 caseless = TRUE;
2248 codevalue -= OP_STARI - OP_STAR;
2249 /* Fall through */
2250 case OP_QUERY:
2251 case OP_MINQUERY:
2252 case OP_POSQUERY:
2253 case OP_NOTQUERY:
2254 case OP_NOTMINQUERY:
2255 case OP_NOTPOSQUERY:
2256 ADD_ACTIVE(state_offset + dlen + 1, 0);
2257 if (clen > 0)
2258 {
2259 unsigned int otherd = NOTACHAR;
2260 if (caseless)
2261 {
2262 #ifdef SUPPORT_UTF
2263 if (utf && d >= 128)
2264 {
2265 #ifdef SUPPORT_UCP
2266 otherd = UCD_OTHERCASE(d);
2267 #endif /* SUPPORT_UCP */
2268 }
2269 else
2270 #endif /* SUPPORT_UTF */
2271 otherd = fcc[d];
2272 }
2273 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2274 {
2275 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2276 {
2277 active_count--; /* Remove non-match possibility */
2278 next_active_state--;
2279 }
2280 ADD_NEW(state_offset + dlen + 1, 0);
2281 }
2282 }
2283 break;
2284
2285 /*-----------------------------------------------------------------*/
2286 case OP_STARI:
2287 case OP_MINSTARI:
2288 case OP_POSSTARI:
2289 case OP_NOTSTARI:
2290 case OP_NOTMINSTARI:
2291 case OP_NOTPOSSTARI:
2292 caseless = TRUE;
2293 codevalue -= OP_STARI - OP_STAR;
2294 /* Fall through */
2295 case OP_STAR:
2296 case OP_MINSTAR:
2297 case OP_POSSTAR:
2298 case OP_NOTSTAR:
2299 case OP_NOTMINSTAR:
2300 case OP_NOTPOSSTAR:
2301 ADD_ACTIVE(state_offset + dlen + 1, 0);
2302 if (clen > 0)
2303 {
2304 unsigned int otherd = NOTACHAR;
2305 if (caseless)
2306 {
2307 #ifdef SUPPORT_UTF
2308 if (utf && d >= 128)
2309 {
2310 #ifdef SUPPORT_UCP
2311 otherd = UCD_OTHERCASE(d);
2312 #endif /* SUPPORT_UCP */
2313 }
2314 else
2315 #endif /* SUPPORT_UTF */
2316 otherd = fcc[d];
2317 }
2318 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2319 {
2320 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2321 {
2322 active_count--; /* Remove non-match possibility */
2323 next_active_state--;
2324 }
2325 ADD_NEW(state_offset, 0);
2326 }
2327 }
2328 break;
2329
2330 /*-----------------------------------------------------------------*/
2331 case OP_EXACTI:
2332 case OP_NOTEXACTI:
2333 caseless = TRUE;
2334 codevalue -= OP_STARI - OP_STAR;
2335 /* Fall through */
2336 case OP_EXACT:
2337 case OP_NOTEXACT:
2338 count = current_state->count; /* Number already matched */
2339 if (clen > 0)
2340 {
2341 unsigned int otherd = NOTACHAR;
2342 if (caseless)
2343 {
2344 #ifdef SUPPORT_UTF
2345 if (utf && d >= 128)
2346 {
2347 #ifdef SUPPORT_UCP
2348 otherd = UCD_OTHERCASE(d);
2349 #endif /* SUPPORT_UCP */
2350 }
2351 else
2352 #endif /* SUPPORT_UTF */
2353 otherd = fcc[d];
2354 }
2355 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2356 {
2357 if (++count >= GET2(code, 1))
2358 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2359 else
2360 { ADD_NEW(state_offset, count); }
2361 }
2362 }
2363 break;
2364
2365 /*-----------------------------------------------------------------*/
2366 case OP_UPTOI:
2367 case OP_MINUPTOI:
2368 case OP_POSUPTOI:
2369 case OP_NOTUPTOI:
2370 case OP_NOTMINUPTOI:
2371 case OP_NOTPOSUPTOI:
2372 caseless = TRUE;
2373 codevalue -= OP_STARI - OP_STAR;
2374 /* Fall through */
2375 case OP_UPTO:
2376 case OP_MINUPTO:
2377 case OP_POSUPTO:
2378 case OP_NOTUPTO:
2379 case OP_NOTMINUPTO:
2380 case OP_NOTPOSUPTO:
2381 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2382 count = current_state->count; /* Number already matched */
2383 if (clen > 0)
2384 {
2385 unsigned int otherd = NOTACHAR;
2386 if (caseless)
2387 {
2388 #ifdef SUPPORT_UTF
2389 if (utf && d >= 128)
2390 {
2391 #ifdef SUPPORT_UCP
2392 otherd = UCD_OTHERCASE(d);
2393 #endif /* SUPPORT_UCP */
2394 }
2395 else
2396 #endif /* SUPPORT_UTF */
2397 otherd = fcc[d];
2398 }
2399 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2400 {
2401 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2402 {
2403 active_count--; /* Remove non-match possibility */
2404 next_active_state--;
2405 }
2406 if (++count >= GET2(code, 1))
2407 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2408 else
2409 { ADD_NEW(state_offset, count); }
2410 }
2411 }
2412 break;
2413
2414
2415 /* ========================================================================== */
2416 /* These are the class-handling opcodes */
2417
2418 case OP_CLASS:
2419 case OP_NCLASS:
2420 case OP_XCLASS:
2421 {
2422 BOOL isinclass = FALSE;
2423 int next_state_offset;
2424 const pcre_uchar *ecode;
2425
2426 /* For a simple class, there is always just a 32-byte table, and we
2427 can set isinclass from it. */
2428
2429 if (codevalue != OP_XCLASS)
2430 {
2431 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2432 if (clen > 0)
2433 {
2434 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2435 ((code[1 + c/8] & (1 << (c&7))) != 0);
2436 }
2437 }
2438
2439 /* An extended class may have a table or a list of single characters,
2440 ranges, or both, and it may be positive or negative. There's a
2441 function that sorts all this out. */
2442
2443 else
2444 {
2445 ecode = code + GET(code, 1);
2446 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2447 }
2448
2449 /* At this point, isinclass is set for all kinds of class, and ecode
2450 points to the byte after the end of the class. If there is a
2451 quantifier, this is where it will be. */
2452
2453 next_state_offset = (int)(ecode - start_code);
2454
2455 switch (*ecode)
2456 {
2457 case OP_CRSTAR:
2458 case OP_CRMINSTAR:
2459 ADD_ACTIVE(next_state_offset + 1, 0);
2460 if (isinclass) { ADD_NEW(state_offset, 0); }
2461 break;
2462
2463 case OP_CRPLUS:
2464 case OP_CRMINPLUS:
2465 count = current_state->count; /* Already matched */
2466 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2467 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2468 break;
2469
2470 case OP_CRQUERY:
2471 case OP_CRMINQUERY:
2472 ADD_ACTIVE(next_state_offset + 1, 0);
2473 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2474 break;
2475
2476 case OP_CRRANGE:
2477 case OP_CRMINRANGE:
2478 count = current_state->count; /* Already matched */
2479 if (count >= GET2(ecode, 1))
2480 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2481 if (isinclass)
2482 {
2483 int max = GET2(ecode, 3);
2484 if (++count >= max && max != 0) /* Max 0 => no limit */
2485 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2486 else
2487 { ADD_NEW(state_offset, count); }
2488 }
2489 break;
2490
2491 default:
2492 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2493 break;
2494 }
2495 }
2496 break;
2497
2498 /* ========================================================================== */
2499 /* These are the opcodes for fancy brackets of various kinds. We have
2500 to use recursion in order to handle them. The "always failing" assertion
2501 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2502 though the other "backtracking verbs" are not supported. */
2503
2504 case OP_FAIL:
2505 forced_fail++; /* Count FAILs for multiple states */
2506 break;
2507
2508 case OP_ASSERT:
2509 case OP_ASSERT_NOT:
2510 case OP_ASSERTBACK:
2511 case OP_ASSERTBACK_NOT:
2512 {
2513 int rc;
2514 int local_offsets[2];
2515 int local_workspace[1000];
2516 const pcre_uchar *endasscode = code + GET(code, 1);
2517
2518 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2519
2520 rc = internal_dfa_exec(
2521 md, /* static match data */
2522 code, /* this subexpression's code */
2523 ptr, /* where we currently are */
2524 (int)(ptr - start_subject), /* start offset */
2525 local_offsets, /* offset vector */
2526 sizeof(local_offsets)/sizeof(int), /* size of same */
2527 local_workspace, /* workspace vector */
2528 sizeof(local_workspace)/sizeof(int), /* size of same */
2529 rlevel); /* function recursion level */
2530
2531 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2532 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2533 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2534 }
2535 break;
2536
2537 /*-----------------------------------------------------------------*/
2538 case OP_COND:
2539 case OP_SCOND:
2540 {
2541 int local_offsets[1000];
2542 int local_workspace[1000];
2543 int codelink = GET(code, 1);
2544 int condcode;
2545
2546 /* Because of the way auto-callout works during compile, a callout item
2547 is inserted between OP_COND and an assertion condition. This does not
2548 happen for the other conditions. */
2549
2550 if (code[LINK_SIZE+1] == OP_CALLOUT)
2551 {
2552 rrc = 0;
2553 if (pcre_callout != NULL)
2554 {
2555 pcre_callout_block cb;
2556 cb.version = 1; /* Version 1 of the callout block */
2557 cb.callout_number = code[LINK_SIZE+2];
2558 cb.offset_vector = offsets;
2559 cb.subject = (PCRE_SPTR)start_subject;
2560 cb.subject_length = (int)(end_subject - start_subject);
2561 cb.start_match = (int)(current_subject - start_subject);
2562 cb.current_position = (int)(ptr - start_subject);
2563 cb.pattern_position = GET(code, LINK_SIZE + 3);
2564 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2565 cb.capture_top = 1;
2566 cb.capture_last = -1;
2567 cb.callout_data = md->callout_data;
2568 cb.mark = NULL; /* No (*MARK) support */
2569 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2570 }
2571 if (rrc > 0) break; /* Fail this thread */
2572 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2573 }
2574
2575 condcode = code[LINK_SIZE+1];
2576
2577 /* Back reference conditions are not supported */
2578
2579 if (condcode == OP_CREF || condcode == OP_NCREF)
2580 return PCRE_ERROR_DFA_UCOND;
2581
2582 /* The DEFINE condition is always false */
2583
2584 if (condcode == OP_DEF)
2585 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2586
2587 /* The only supported version of OP_RREF is for the value RREF_ANY,
2588 which means "test if in any recursion". We can't test for specifically
2589 recursed groups. */
2590
2591 else if (condcode == OP_RREF || condcode == OP_NRREF)
2592 {
2593 int value = GET2(code, LINK_SIZE+2);
2594 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2595 if (md->recursive != NULL)
2596 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2597 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2598 }
2599
2600 /* Otherwise, the condition is an assertion */
2601
2602 else
2603 {
2604 int rc;
2605 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2606 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2607
2608 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2609
2610 rc = internal_dfa_exec(
2611 md, /* fixed match data */
2612 asscode, /* this subexpression's code */
2613 ptr, /* where we currently are */
2614 (int)(ptr - start_subject), /* start offset */
2615 local_offsets, /* offset vector */
2616 sizeof(local_offsets)/sizeof(int), /* size of same */
2617 local_workspace, /* workspace vector */
2618 sizeof(local_workspace)/sizeof(int), /* size of same */
2619 rlevel); /* function recursion level */
2620
2621 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2622 if ((rc >= 0) ==
2623 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2624 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2625 else
2626 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2627 }
2628 }
2629 break;
2630
2631 /*-----------------------------------------------------------------*/
2632 case OP_RECURSE:
2633 {
2634 dfa_recursion_info *ri;
2635 int local_offsets[1000];
2636 int local_workspace[1000];
2637 const pcre_uchar *callpat = start_code + GET(code, 1);
2638 int recno = (callpat == md->start_code)? 0 :
2639 GET2(callpat, 1 + LINK_SIZE);
2640 int rc;
2641
2642 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2643
2644 /* Check for repeating a recursion without advancing the subject
2645 pointer. This should catch convoluted mutual recursions. (Some simple
2646 cases are caught at compile time.) */
2647
2648 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2649 if (recno == ri->group_num && ptr == ri->subject_position)
2650 return PCRE_ERROR_RECURSELOOP;
2651
2652 /* Remember this recursion and where we started it so as to
2653 catch infinite loops. */
2654
2655 new_recursive.group_num = recno;
2656 new_recursive.subject_position = ptr;
2657 new_recursive.prevrec = md->recursive;
2658 md->recursive = &new_recursive;
2659
2660 rc = internal_dfa_exec(
2661 md, /* fixed match data */
2662 callpat, /* this subexpression's code */
2663 ptr, /* where we currently are */
2664 (int)(ptr - start_subject), /* start offset */
2665 local_offsets, /* offset vector */
2666 sizeof(local_offsets)/sizeof(int), /* size of same */
2667 local_workspace, /* workspace vector */
2668 sizeof(local_workspace)/sizeof(int), /* size of same */
2669 rlevel); /* function recursion level */
2670
2671 md->recursive = new_recursive.prevrec; /* Done this recursion */
2672
2673 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2674 rc));
2675
2676 /* Ran out of internal offsets */
2677
2678 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2679
2680 /* For each successful matched substring, set up the next state with a
2681 count of characters to skip before trying it. Note that the count is in
2682 characters, not bytes. */
2683
2684 if (rc > 0)
2685 {
2686 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2687 {
2688 const pcre_uchar *p = start_subject + local_offsets[rc];
2689 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2690 int charcount = local_offsets[rc+1] - local_offsets[rc];
2691 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2692 if (charcount > 0)
2693 {
2694 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2695 }
2696 else
2697 {
2698 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2699 }
2700 }
2701 }
2702 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2703 }
2704 break;
2705
2706 /*-----------------------------------------------------------------*/
2707 case OP_BRAPOS:
2708 case OP_SBRAPOS:
2709 case OP_CBRAPOS:
2710 case OP_SCBRAPOS:
2711 case OP_BRAPOSZERO:
2712 {
2713 int charcount, matched_count;
2714 const pcre_uchar *local_ptr = ptr;
2715 BOOL allow_zero;
2716
2717 if (codevalue == OP_BRAPOSZERO)
2718 {
2719 allow_zero = TRUE;
2720 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2721 }
2722 else allow_zero = FALSE;
2723
2724 /* Loop to match the subpattern as many times as possible as if it were
2725 a complete pattern. */
2726
2727 for (matched_count = 0;; matched_count++)
2728 {
2729 int local_offsets[2];
2730 int local_workspace[1000];
2731
2732 int rc = internal_dfa_exec(
2733 md, /* fixed match data */
2734 code, /* this subexpression's code */
2735 local_ptr, /* where we currently are */
2736 (int)(ptr - start_subject), /* start offset */
2737 local_offsets, /* offset vector */
2738 sizeof(local_offsets)/sizeof(int), /* size of same */
2739 local_workspace, /* workspace vector */
2740 sizeof(local_workspace)/sizeof(int), /* size of same */
2741 rlevel); /* function recursion level */
2742
2743 /* Failed to match */
2744
2745 if (rc < 0)
2746 {
2747 if (rc != PCRE_ERROR_NOMATCH) return rc;
2748 break;
2749 }
2750
2751 /* Matched: break the loop if zero characters matched. */
2752
2753 charcount = local_offsets[1] - local_offsets[0];
2754 if (charcount == 0) break;
2755 local_ptr += charcount; /* Advance temporary position ptr */
2756 }
2757
2758 /* At this point we have matched the subpattern matched_count
2759 times, and local_ptr is pointing to the character after the end of the
2760 last match. */
2761
2762 if (matched_count > 0 || allow_zero)
2763 {
2764 const pcre_uchar *end_subpattern = code;
2765 int next_state_offset;
2766
2767 do { end_subpattern += GET(end_subpattern, 1); }
2768 while (*end_subpattern == OP_ALT);
2769 next_state_offset =
2770 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2771
2772 /* Optimization: if there are no more active states, and there
2773 are no new states yet set up, then skip over the subject string
2774 right here, to save looping. Otherwise, set up the new state to swing
2775 into action when the end of the matched substring is reached. */
2776
2777 if (i + 1 >= active_count && new_count == 0)
2778 {
2779 ptr = local_ptr;
2780 clen = 0;
2781 ADD_NEW(next_state_offset, 0);
2782 }
2783 else
2784 {
2785 const pcre_uchar *p = ptr;
2786 const pcre_uchar *pp = local_ptr;
2787 charcount = (int)(pp - p);
2788 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2789 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2790 }
2791 }
2792 }
2793 break;
2794
2795 /*-----------------------------------------------------------------*/
2796 case OP_ONCE:
2797 case OP_ONCE_NC:
2798 {
2799 int local_offsets[2];
2800 int local_workspace[1000];
2801
2802 int rc = internal_dfa_exec(
2803 md, /* fixed match data */
2804 code, /* this subexpression's code */
2805 ptr, /* where we currently are */
2806 (int)(ptr - start_subject), /* start offset */
2807 local_offsets, /* offset vector */
2808 sizeof(local_offsets)/sizeof(int), /* size of same */
2809 local_workspace, /* workspace vector */
2810 sizeof(local_workspace)/sizeof(int), /* size of same */
2811 rlevel); /* function recursion level */
2812
2813 if (rc >= 0)
2814 {
2815 const pcre_uchar *end_subpattern = code;
2816 int charcount = local_offsets[1] - local_offsets[0];
2817 int next_state_offset, repeat_state_offset;
2818
2819 do { end_subpattern += GET(end_subpattern, 1); }
2820 while (*end_subpattern == OP_ALT);
2821 next_state_offset =
2822 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2823
2824 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2825 arrange for the repeat state also to be added to the relevant list.
2826 Calculate the offset, or set -1 for no repeat. */
2827
2828 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2829 *end_subpattern == OP_KETRMIN)?
2830 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2831
2832 /* If we have matched an empty string, add the next state at the
2833 current character pointer. This is important so that the duplicate
2834 checking kicks in, which is what breaks infinite loops that match an
2835 empty string. */
2836
2837 if (charcount == 0)
2838 {
2839 ADD_ACTIVE(next_state_offset, 0);
2840 }
2841
2842 /* Optimization: if there are no more active states, and there
2843 are no new states yet set up, then skip over the subject string
2844 right here, to save looping. Otherwise, set up the new state to swing
2845 into action when the end of the matched substring is reached. */
2846
2847 else if (i + 1 >= active_count && new_count == 0)
2848 {
2849 ptr += charcount;
2850 clen = 0;
2851 ADD_NEW(next_state_offset, 0);
2852
2853 /* If we are adding a repeat state at the new character position,
2854 we must fudge things so that it is the only current state.
2855 Otherwise, it might be a duplicate of one we processed before, and
2856 that would cause it to be skipped. */
2857
2858 if (repeat_state_offset >= 0)
2859 {
2860 next_active_state = active_states;
2861 active_count = 0;
2862 i = -1;
2863 ADD_ACTIVE(repeat_state_offset, 0);
2864 }
2865 }
2866 else
2867 {
2868 const pcre_uchar *p = start_subject + local_offsets[0];
2869 const pcre_uchar *pp = start_subject + local_offsets[1];
2870 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2871 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2872 if (repeat_state_offset >= 0)
2873 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2874 }
2875 }
2876 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2877 }
2878 break;
2879
2880
2881 /* ========================================================================== */
2882 /* Handle callouts */
2883
2884 case OP_CALLOUT:
2885 rrc = 0;
2886 if (pcre_callout != NULL)
2887 {
2888 pcre_callout_block cb;
2889 cb.version = 1; /* Version 1 of the callout block */
2890 cb.callout_number = code[1];
2891 cb.offset_vector = offsets;
2892 cb.subject = (PCRE_SPTR)start_subject;
2893 cb.subject_length = (int)(end_subject - start_subject);
2894 cb.start_match = (int)(current_subject - start_subject);
2895 cb.current_position = (int)(ptr - start_subject);
2896 cb.pattern_position = GET(code, 2);
2897 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2898 cb.capture_top = 1;
2899 cb.capture_last = -1;
2900 cb.callout_data = md->callout_data;
2901 cb.mark = NULL; /* No (*MARK) support */
2902 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2903 }
2904 if (rrc == 0)
2905 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2906 break;
2907
2908
2909 /* ========================================================================== */
2910 default: /* Unsupported opcode */
2911 return PCRE_ERROR_DFA_UITEM;
2912 }
2913
2914 NEXT_ACTIVE_STATE: continue;
2915
2916 } /* End of loop scanning active states */
2917
2918 /* We have finished the processing at the current subject character. If no
2919 new states have been set for the next character, we have found all the
2920 matches that we are going to find. If we are at the top level and partial
2921 matching has been requested, check for appropriate conditions.
2922
2923 The "forced_fail" variable counts the number of (*F) encountered for the
2924 character. If it is equal to the original active_count (saved in
2925 workspace[1]) it means that (*F) was found on every active state. In this
2926 case we don't want to give a partial match.
2927
2928 The "could_continue" variable is true if a state could have continued but
2929 for the fact that the end of the subject was reached. */
2930
2931 if (new_count <= 0)
2932 {
2933 if (rlevel == 1 && /* Top level, and */
2934 could_continue && /* Some could go on */
2935 forced_fail != workspace[1] && /* Not all forced fail & */
2936 ( /* either... */
2937 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2938 || /* or... */
2939 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2940 match_count < 0) /* no matches */
2941 ) && /* And... */
2942 ptr >= end_subject && /* Reached end of subject */
2943 ptr > md->start_used_ptr) /* Inspected non-empty string */
2944 {
2945 if (offsetcount >= 2)
2946 {
2947 offsets[0] = (int)(md->start_used_ptr - start_subject);
2948 offsets[1] = (int)(end_subject - start_subject);
2949 }
2950 match_count = PCRE_ERROR_PARTIAL;
2951 }
2952
2953 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2954 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2955 rlevel*2-2, SP));
2956 break; /* In effect, "return", but see the comment below */
2957 }
2958
2959 /* One or more states are active for the next character. */
2960
2961 ptr += clen; /* Advance to next subject character */
2962 } /* Loop to move along the subject string */
2963
2964 /* Control gets here from "break" a few lines above. We do it this way because
2965 if we use "return" above, we have compiler trouble. Some compilers warn if
2966 there's nothing here because they think the function doesn't return a value. On
2967 the other hand, if we put a dummy statement here, some more clever compilers
2968 complain that it can't be reached. Sigh. */
2969
2970 return match_count;
2971 }
2972
2973
2974
2975
2976 /*************************************************
2977 * Execute a Regular Expression - DFA engine *
2978 *************************************************/
2979
2980 /* This external function applies a compiled re to a subject string using a DFA
2981 engine. This function calls the internal function multiple times if the pattern
2982 is not anchored.
2983
2984 Arguments:
2985 argument_re points to the compiled expression
2986 extra_data points to extra data or is NULL
2987 subject points to the subject string
2988 length length of subject string (may contain binary zeros)
2989 start_offset where to start in the subject string
2990 options option bits
2991 offsets vector of match offsets
2992 offsetcount size of same
2993 workspace workspace vector
2994 wscount size of same
2995
2996 Returns: > 0 => number of match offset pairs placed in offsets
2997 = 0 => offsets overflowed; longest matches are present
2998 -1 => failed to match
2999 < -1 => some kind of unexpected problem
3000 */
3001
3002 #ifdef COMPILE_PCRE8
3003 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3004 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3005 const char *subject, int length, int start_offset, int options, int *offsets,
3006 int offsetcount, int *workspace, int wscount)
3007 #else
3008 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3009 pcre16_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3010 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3011 int offsetcount, int *workspace, int wscount)
3012 #endif
3013 {
3014 real_pcre *re = (real_pcre *)argument_re;
3015 dfa_match_data match_block;
3016 dfa_match_data *md = &match_block;
3017 BOOL utf, anchored, startline, firstline;
3018 const pcre_uchar *current_subject, *end_subject;
3019 const pcre_uint8 *lcc;
3020
3021 pcre_study_data internal_study;
3022 const pcre_study_data *study = NULL;
3023 real_pcre internal_re;
3024
3025 const pcre_uchar *req_char_ptr;
3026 const pcre_uint8 *start_bits = NULL;
3027 BOOL has_first_char = FALSE;
3028 BOOL has_req_char = FALSE;
3029 pcre_uchar first_char = 0;
3030 pcre_uchar first_char2 = 0;
3031 pcre_uchar req_char = 0;
3032 pcre_uchar req_char2 = 0;
3033 int newline;
3034
3035 /* Plausibility checks */
3036
3037 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3038 if (re == NULL || subject == NULL || workspace == NULL ||
3039 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3040 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3041 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3042 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3043
3044 /* We need to find the pointer to any study data before we test for byte
3045 flipping, so we scan the extra_data block first. This may set two fields in the
3046 match block, so we must initialize them beforehand. However, the other fields
3047 in the match block must not be set until after the byte flipping. */
3048
3049 md->tables = re->tables;
3050 md->callout_data = NULL;
3051
3052 if (extra_data != NULL)
3053 {
3054 unsigned int flags = extra_data->flags;
3055 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3056 study = (const pcre_study_data *)extra_data->study_data;
3057 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3058 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3059 return PCRE_ERROR_DFA_UMLIMIT;
3060 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3061 md->callout_data = extra_data->callout_data;
3062 if ((flags & PCRE_EXTRA_TABLES) != 0)
3063 md->tables = extra_data->tables;
3064 }
3065
3066 /* Check that the first field in the block is the magic number. If it is not,
3067 test for a regex that was compiled on a host of opposite endianness. If this is
3068 the case, flipped values are put in internal_re and internal_study if there was
3069 study data too. */
3070
3071 if (re->magic_number != MAGIC_NUMBER)
3072 {
3073 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
3074 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3075 if (study != NULL) study = &internal_study;
3076 }
3077 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3078
3079 /* Set some local values */
3080
3081 current_subject = (const pcre_uchar *)subject + start_offset;
3082 end_subject = (const pcre_uchar *)subject + length;
3083 req_char_ptr = current_subject - 1;
3084
3085 #ifdef SUPPORT_UTF
3086 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3087 utf = (re->options & PCRE_UTF8) != 0;
3088 #else
3089 utf = FALSE;
3090 #endif
3091
3092 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3093 (re->options & PCRE_ANCHORED) != 0;
3094
3095 /* The remaining fixed data for passing around. */
3096
3097 md->start_code = (const pcre_uchar *)argument_re +
3098 re->name_table_offset + re->name_count * re->name_entry_size;
3099 md->start_subject = (const pcre_uchar *)subject;
3100 md->end_subject = end_subject;
3101 md->start_offset = start_offset;
3102 md->moptions = options;
3103 md->poptions = re->options;
3104
3105 /* If the BSR option is not set at match time, copy what was set
3106 at compile time. */
3107
3108 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3109 {
3110 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3111 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3112 #ifdef BSR_ANYCRLF
3113 else md->moptions |= PCRE_BSR_ANYCRLF;
3114 #endif
3115 }
3116
3117 /* Handle different types of newline. The three bits give eight cases. If
3118 nothing is set at run time, whatever was used at compile time applies. */
3119
3120 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3121 PCRE_NEWLINE_BITS)
3122 {
3123 case 0: newline = NEWLINE; break; /* Compile-time default */
3124 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3125 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3126 case PCRE_NEWLINE_CR+
3127 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3128 case PCRE_NEWLINE_ANY: newline = -1; break;
3129 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3130 default: return PCRE_ERROR_BADNEWLINE;
3131 }
3132
3133 if (newline == -2)
3134 {
3135 md->nltype = NLTYPE_ANYCRLF;
3136 }
3137 else if (newline < 0)
3138 {
3139 md->nltype = NLTYPE_ANY;
3140 }
3141 else
3142 {
3143 md->nltype = NLTYPE_FIXED;
3144 if (newline > 255)
3145 {
3146 md->nllen = 2;
3147 md->nl[0] = (newline >> 8) & 255;
3148 md->nl[1] = newline & 255;
3149 }
3150 else
3151 {
3152 md->nllen = 1;
3153 md->nl[0] = newline;
3154 }
3155 }
3156
3157 /* Check a UTF string if required. If it is invalid, the error offset and code
3158 are passed back in offsets[0] and offsets[1] when offsetcount is at least 2. */
3159
3160 #ifdef SUPPORT_UTF
3161 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3162 {
3163 int erroroffset;
3164 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3165 if (errorcode != 0)
3166 {
3167 if (offsetcount >= 2)
3168 {
3169 offsets[0] = erroroffset;
3170 offsets[1] = errorcode;
3171 }
3172 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3173 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3174 }
3175 if (start_offset > 0 && start_offset < length &&
3176 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3177 return PCRE_ERROR_BADUTF8_OFFSET;
3178 }
3179 #endif
3180
3181 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3182 is a feature that makes it possible to save compiled regex and re-use them
3183 in other programs later. */
3184
3185 if (md->tables == NULL) md->tables = PRIV(default_tables);
3186
3187 /* The lower casing table and the "must be at the start of a line" flag are
3188 used in a loop when finding where to start. */
3189
3190 lcc = md->tables + lcc_offset;
3191 startline = (re->flags & PCRE_STARTLINE) != 0;
3192 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3193
3194 /* Set up the first character to match, if available. The first_char value is
3195 never set for an anchored regular expression, but the anchoring may be forced
3196 at run time, so we have to test for anchoring. The first char may be unset for
3197 an unanchored pattern, of course. If there's no first char and the pattern was
3198 studied, there may be a bitmap of possible first characters. */
3199
3200 if (!anchored)
3201 {
3202 if ((re->flags & PCRE_FIRSTSET) != 0)
3203 {
3204 has_first_char = TRUE;
3205 first_char = first_char2 = re->first_char;
3206 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3207 {
3208 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3209 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3210 if (utf && first_char > 127)
3211 first_char2 = UCD_OTHERCASE(first_char);
3212 #endif
3213 }
3214 }
3215 else
3216 {
3217 if (!startline && study != NULL &&
3218 (study->flags & PCRE_STUDY_MAPPED) != 0)
3219 start_bits = study->start_bits;
3220 }
3221 }
3222
3223 /* For anchored or unanchored matches, there may be a "last known required
3224 character" set. */
3225
3226 if ((re->flags & PCRE_REQCHSET) != 0)
3227 {
3228 has_req_char = TRUE;
3229 req_char = req_char2 = re->req_char;
3230 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3231 {
3232 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3233 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3234 if (utf && req_char > 127)
3235 req_char2 = UCD_OTHERCASE(req_char);
3236 #endif
3237 }
3238 }
3239
3240 /* Call the main matching function, looping for a non-anchored regex after a
3241 failed match. If not restarting, perform certain optimizations at the start of
3242 a match. */
3243
3244 for (;;)
3245 {
3246 int rc;
3247
3248 if ((options & PCRE_DFA_RESTART) == 0)
3249 {
3250 const pcre_uchar *save_end_subject = end_subject;
3251
3252 /* If firstline is TRUE, the start of the match is constrained to the first
3253 line of a multiline string. Implement this by temporarily adjusting
3254 end_subject so that we stop scanning at a newline. If the match fails at
3255 the newline, later code breaks this loop. */
3256
3257 if (firstline)
3258 {
3259 PCRE_PUCHAR t = current_subject;
3260 #ifdef SUPPORT_UTF
3261 if (utf)
3262 {
3263 while (t < md->end_subject && !IS_NEWLINE(t))
3264 {
3265 t++;
3266 ACROSSCHAR(t < end_subject, *t, t++);
3267 }
3268 }
3269 else
3270 #endif
3271 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3272 end_subject = t;
3273 }
3274
3275 /* There are some optimizations that avoid running the match if a known
3276 starting point is not found. However, there is an option that disables
3277 these, for testing and for ensuring that all callouts do actually occur.
3278 The option can be set in the regex by (*NO_START_OPT) or passed in
3279 match-time options. */
3280
3281 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3282 {
3283 /* Advance to a known first char. */
3284
3285 if (has_first_char)
3286 {
3287 if (first_char != first_char2)
3288 while (current_subject < end_subject &&
3289 *current_subject != first_char && *current_subject != first_char2)
3290 current_subject++;
3291 else
3292 while (current_subject < end_subject &&
3293 *current_subject != first_char)
3294 current_subject++;
3295 }
3296
3297 /* Or to just after a linebreak for a multiline match if possible */
3298
3299 else if (startline)
3300 {
3301 if (current_subject > md->start_subject + start_offset)
3302 {
3303 #ifdef SUPPORT_UTF
3304 if (utf)
3305 {
3306 while (current_subject < end_subject &&
3307 !WAS_NEWLINE(current_subject))
3308 {
3309 current_subject++;
3310 ACROSSCHAR(current_subject < end_subject, *current_subject,
3311 current_subject++);
3312 }
3313 }
3314 else
3315 #endif
3316 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3317 current_subject++;
3318
3319 /* If we have just passed a CR and the newline option is ANY or
3320 ANYCRLF, and we are now at a LF, advance the match position by one
3321 more character. */
3322
3323 if (current_subject[-1] == CHAR_CR &&
3324 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3325 current_subject < end_subject &&
3326 *current_subject == CHAR_NL)
3327 current_subject++;
3328 }
3329 }
3330
3331 /* Or to a non-unique first char after study */
3332
3333 else if (start_bits != NULL)
3334 {
3335 while (current_subject < end_subject)
3336 {
3337 register unsigned int c = *current_subject;
3338 #ifndef COMPILE_PCRE8
3339 if (c > 255) c = 255;
3340 #endif
3341 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3342 {
3343 current_subject++;
3344 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3345 /* In non 8-bit mode, the iteration will stop for
3346 characters > 255 at the beginning or not stop at all. */
3347 if (utf)
3348 ACROSSCHAR(current_subject < end_subject, *current_subject,
3349 current_subject++);
3350 #endif
3351 }
3352 else break;
3353 }
3354 }
3355 }
3356
3357 /* Restore fudged end_subject */
3358
3359 end_subject = save_end_subject;
3360
3361 /* The following two optimizations are disabled for partial matching or if
3362 disabling is explicitly requested (and of course, by the test above, this
3363 code is not obeyed when restarting after a partial match). */
3364
3365 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3366 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3367 {
3368 /* If the pattern was studied, a minimum subject length may be set. This
3369 is a lower bound; there may be no string of that length that actually
3370 matches the pattern. Although the value is, strictly, in characters, we treat
3371 it as bytes to avoid spending too much time in this optimization. */
3372
3373 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3374 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3375 return PCRE_ERROR_NOMATCH;
3376
3377 /* If req_char is set, we know that that character must appear in the
3378 subject for the match to succeed. If the first character is set, req_char
3379 must be later in the subject; otherwise the test starts at the match
3380 point. This optimization can save a huge amount of work in patterns with
3381 nested unlimited repeats that aren't going to match. Writing separate
3382 code for cased/caseless versions makes it go faster, as does using an
3383 autoincrement and backing off on a match.
3384
3385 HOWEVER: when the subject string is very, very long, searching to its end
3386 can take a long time, and give bad performance on quite ordinary
3387 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3388 string... so we don't do this when the string is sufficiently long. */
3389
3390 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3391 {
3392 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3393
3394 /* We don't need to repeat the search if we haven't yet reached the
3395 place we found it at last time. */
3396
3397 if (p > req_char_ptr)
3398 {
3399 if (req_char != req_char2)
3400 {
3401 while (p < end_subject)
3402 {
3403 register int pp = *p++;
3404 if (pp == req_char || pp == req_char2) { p--; break; }
3405 }
3406 }
3407 else
3408 {
3409 while (p < end_subject)
3410 {
3411 if (*p++ == req_char) { p--; break; }
3412 }
3413 }
3414
3415 /* If we can't find the required character, break the matching loop,
3416 which will cause a return of PCRE_ERROR_NOMATCH. */
3417
3418 if (p >= end_subject) break;
3419
3420 /* If we have found the required character, save the point where we
3421 found it, so that we don't search again next time round the loop if
3422 the start hasn't passed this character yet. */
3423
3424 req_char_ptr = p;
3425 }
3426 }
3427 }
3428 } /* End of optimizations that are done when not restarting */
3429
3430 /* OK, now we can do the business */
3431
3432 md->start_used_ptr = current_subject;
3433 md->recursive = NULL;
3434
3435 rc = internal_dfa_exec(
3436 md, /* fixed match data */
3437 md->start_code, /* this subexpression's code */
3438 current_subject, /* where we currently are */
3439 start_offset, /* start offset in subject */
3440 offsets, /* offset vector */
3441 offsetcount, /* size of same */
3442 workspace, /* workspace vector */
3443 wscount, /* size of same */
3444 0); /* function recurse level */
3445
3446 /* Anything other than "no match" means we are done, always; otherwise, carry
3447 on only if not anchored. */
3448
3449 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3450
3451 /* Advance to the next subject character unless we are at the end of a line
3452 and firstline is set. */
3453
3454 if (firstline && IS_NEWLINE(current_subject)) break;
3455 current_subject++;
3456 #ifdef SUPPORT_UTF
3457 if (utf)
3458 {
3459 ACROSSCHAR(current_subject < end_subject, *current_subject,
3460 current_subject++);
3461 }
3462 #endif
3463 if (current_subject > end_subject) break;
3464
3465 /* If we have just passed a CR and we are now at a LF, and the pattern does
3466 not contain any explicit matches for \r or \n, and the newline option is CRLF
3467 or ANY or ANYCRLF, advance the match position by one more character. */
3468
3469 if (current_subject[-1] == CHAR_CR &&
3470 current_subject < end_subject &&
3471 *current_subject == CHAR_NL &&
3472 (re->flags & PCRE_HASCRORLF) == 0 &&
3473 (md->nltype == NLTYPE_ANY ||
3474 md->nltype == NLTYPE_ANYCRLF ||
3475 md->nllen == 2))
3476 current_subject++;
3477
3478 } /* "Bumpalong" loop */
3479
3480 return PCRE_ERROR_NOMATCH;
3481 }
3482
3483 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5