/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 806 - (show annotations)
Thu Dec 15 11:57:39 2011 UTC (8 years, 2 months ago) by zherczeg
File MIME type: text/plain
File size: 121448 byte(s)
lcc and inline printint.c fixes
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2011 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75
76 #ifdef HAVE_CONFIG_H
77 #include "config.h"
78 #endif
79
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
83
84 #include "pcre_internal.h"
85
86
87 /* For use to indent debugging output */
88
89 #define SP " "
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100
101 #define OP_PROP_EXTRA 300 /* added to a type-repeat opcode whose operand is OP_PROP or OP_NOTPROP */
102 #define OP_EXTUNI_EXTRA 320 /* added to a type-repeat opcode whose operand is OP_EXTUNI */
103 #define OP_ANYNL_EXTRA 340 /* added to a type-repeat opcode whose operand is OP_ANYNL */
104 #define OP_HSPACE_EXTRA 360 /* added to a type-repeat opcode whose operand is OP_HSPACE or OP_NOT_HSPACE */
105 #define OP_VSPACE_EXTRA 380 /* added to a type-repeat opcode whose operand is OP_VSPACE or OP_NOT_VSPACE */
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
/* coptable is indexed by opcode value. A zero entry means that no inline
operand is loaded for that opcode. A non-zero entry is the offset, counted from
the opcode itself, at which internal_dfa_exec() loads the operand into its
variable d before dispatching on the opcode. The number of entries is checked
against OP_TABLE_LENGTH by a dummy case label in internal_dfa_exec(), so an
entry must be added here whenever an opcode is added. */
116 static const pcre_uint8 coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats: the operand is the repeated character */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto (operand is IMM2_SIZE further on; presumably after the repeat count - confirm) */
132 1+IMM2_SIZE, /* exact */
133 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136 1+IMM2_SIZE, /* exact I */
137 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138 /* Negative single-char repeats - only for chars < 256 */
139 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141 1+IMM2_SIZE, /* NOT exact */
142 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145 1+IMM2_SIZE, /* NOT exact I */
146 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147 /* Positive type repeats: the "character" loaded is the opcode of the type */
148 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150 1+IMM2_SIZE, /* Type exact */
151 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152 /* Character class & ref repeats: no inline operand is loaded */
153 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154 0, 0, /* CRRANGE, CRMINRANGE */
155 0, /* CLASS */
156 0, /* NCLASS */
157 0, /* XCLASS - variable length */
158 0, /* REF */
159 0, /* REFI */
160 0, /* RECURSE */
161 0, /* CALLOUT */
162 0, /* Alt */
163 0, /* Ket */
164 0, /* KetRmax */
165 0, /* KetRmin */
166 0, /* KetRpos */
167 0, /* Reverse */
168 0, /* Assert */
169 0, /* Assert not */
170 0, /* Assert behind */
171 0, /* Assert behind not */
172 0, 0, /* ONCE, ONCE_NC */
173 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
174 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
175 0, 0, /* CREF, NCREF */
176 0, 0, /* RREF, NRREF */
177 0, /* DEF */
178 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
179 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
180 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
181 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
182 0, 0 /* CLOSE, SKIPZERO */
183 };
184
185 /* This table identifies those opcodes that inspect a character. It is used to
186 remember the fact that a character could have been inspected when the end of
187 the subject is reached. ***NOTE*** If the start of this table is modified, the
188 two tables that follow must also be modified. */
189
/* poptable is indexed by opcode value. A non-zero entry marks an opcode that
inspects a subject character. internal_dfa_exec() consults it only when the end
of the subject has been reached (clen == 0), and sets could_continue when the
entry is non-zero. The number of entries is checked against OP_TABLE_LENGTH by
a dummy case label in internal_dfa_exec(). */
190 static const pcre_uint8 poptable[] = {
191 0, /* End */
192 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
193 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
194 1, 1, 1, /* Any, AllAny, Anybyte */
195 1, 1, /* \P, \p */
196 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
197 1, /* \X */
198 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
199 1, /* Char */
200 1, /* Chari */
201 1, /* not */
202 1, /* noti */
203 /* Positive single-char repeats */
204 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
205 1, 1, 1, /* upto, minupto, exact */
206 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
207 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
208 1, 1, 1, /* upto I, minupto I, exact I */
209 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
210 /* Negative single-char repeats - only for chars < 256 */
211 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
212 1, 1, 1, /* NOT upto, minupto, exact */
213 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
214 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
215 1, 1, 1, /* NOT upto I, minupto I, exact I */
216 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
217 /* Positive type repeats */
218 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
219 1, 1, 1, /* Type upto, minupto, exact */
220 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
221 /* Character class & ref repeats */
222 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
223 1, 1, /* CRRANGE, CRMINRANGE */
224 1, /* CLASS */
225 1, /* NCLASS */
226 1, /* XCLASS - variable length */
227 0, /* REF */
228 0, /* REFI */
229 0, /* RECURSE */
230 0, /* CALLOUT */
231 0, /* Alt */
232 0, /* Ket */
233 0, /* KetRmax */
234 0, /* KetRmin */
235 0, /* KetRpos */
236 0, /* Reverse */
237 0, /* Assert */
238 0, /* Assert not */
239 0, /* Assert behind */
240 0, /* Assert behind not */
241 0, 0, /* ONCE, ONCE_NC */
242 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
243 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
244 0, 0, /* CREF, NCREF */
245 0, 0, /* RREF, NRREF */
246 0, /* DEF */
247 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
248 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
249 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
250 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
251 0, 0 /* CLOSE, SKIPZERO */
252 };
253
254 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
255 and \w */
256
257 static const pcre_uint8 toptable1[] = {
258 0, 0, 0, 0, 0, 0,
259 ctype_digit, ctype_digit,
260 ctype_space, ctype_space,
261 ctype_word, ctype_word,
262 0, 0 /* OP_ANY, OP_ALLANY */
263 };
264
265 static const pcre_uint8 toptable2[] = {
266 0, 0, 0, 0, 0, 0,
267 ctype_digit, 0,
268 ctype_space, 0,
269 ctype_word, 0,
270 1, 1 /* OP_ANY, OP_ALLANY */
271 };
272
273
274 /* Structure for holding data about a particular state, which is in effect the
275 current data for an active path through the match tree. It must consist
276 entirely of ints because the working vector we are passed, and which we put
277 these structures in, is a vector of ints. */
278
/* One record in a state list: the data carried by a single active path through
the match. Every member has to be an int, because these records are stored
directly in the int workspace vector supplied by the caller. */

struct stateblock {
  int offset;   /* Code offset of the opcode for this state; held negated
                   while the state is waiting for characters to be skipped */
  int count;    /* Repeat counter */
  int data;     /* Extra value needed by some states */
};

typedef struct stateblock stateblock;

/* Number of ints of workspace occupied by one stateblock. */

#define INTS_PER_STATEBLOCK (sizeof(stateblock) / sizeof(int))
286
287
288 #ifdef PCRE_DEBUG
289 /*************************************************
290 * Print character string *
291 *************************************************/
292
293 /* Character string printing function for debugging.
294
295 Arguments:
296 p points to string
297 length number of bytes
298 f where to print
299
300 Returns: nothing
301 */
302
303 static void
304 pchars(const pcre_uchar *p, int length, FILE *f)
305 {
306 int c;
307 while (length-- > 0)
308 {
309 if (isprint(c = *(p++)))
310 fprintf(f, "%c", c);
311 else
312 fprintf(f, "\\x%02x", c);
313 }
314 }
315 #endif
316
317
318
319 /*************************************************
320 * Execute a Regular Expression - DFA engine *
321 *************************************************/
322
323 /* This internal function applies a compiled pattern to a subject string,
324 starting at a given point, using a DFA engine. This function is called from the
325 external one, possibly multiple times if the pattern is not anchored. The
326 function calls itself recursively for some kinds of subpattern.
327
328 Arguments:
329 md the match_data block with fixed information
330 this_start_code the opening bracket of this subexpression's code
331 current_subject where we currently are in the subject string
332 start_offset start offset in the subject string
333 offsets vector to contain the matching string offsets
334 offsetcount size of same
335 workspace vector of workspace
336 wscount size of same
337 rlevel function call recursion level
338
339 Returns: > 0 => number of match offset pairs placed in offsets
340 = 0 => offsets overflowed; longest matches are present
341 -1 => failed to match
342 < -1 => some kind of unexpected problem
343
344 The following macros are used for adding states to the two state vectors (one
345 for the current character, one for the following character). */
346
/* Append a state (x = code offset, y = repeat count) to the list of states
for the current character. When the list already holds wscount entries the
enclosing function returns PCRE_ERROR_DFA_WSSIZE instead; active_count is
incremented in either case. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
356
/* As ADD_ACTIVE, but also stores z in the data field of the appended state.
When the list already holds wscount entries the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead; active_count is incremented in either case. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
367
/* Append a state (x = code offset, y = repeat count) to the list of states
for the following character. When the list already holds wscount entries the
enclosing function returns PCRE_ERROR_DFA_WSSIZE instead; new_count is
incremented in either case. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
377
/* As ADD_NEW, but also stores z in the data field of the appended state. When
the list already holds wscount entries the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead; new_count is incremented in either case. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
388
389 /* And now, here is the code */
390
391 static int
392 internal_dfa_exec(
393 dfa_match_data *md,
394 const pcre_uchar *this_start_code,
395 const pcre_uchar *current_subject,
396 int start_offset,
397 int *offsets,
398 int offsetcount,
399 int *workspace,
400 int wscount,
401 int rlevel)
402 {
403 stateblock *active_states, *new_states, *temp_states;
404 stateblock *next_active_state, *next_new_state;
405
406 const pcre_uint8 *ctypes, *lcc, *fcc;
407 const pcre_uchar *ptr;
408 const pcre_uchar *end_code, *first_op;
409
410 dfa_recursion_info new_recursive;
411
412 int active_count, new_count, match_count;
413
414 /* Some fields in the md block are frequently referenced, so we load them into
415 independent variables in the hope that this will perform better. */
416
417 const pcre_uchar *start_subject = md->start_subject;
418 const pcre_uchar *end_subject = md->end_subject;
419 const pcre_uchar *start_code = md->start_code;
420
421 #ifdef SUPPORT_UTF
422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423 #else
424 BOOL utf = FALSE;
425 #endif
426
427 rlevel++;
428 offsetcount &= (-2);
429
430 wscount -= 2;
431 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
432 (2 * INTS_PER_STATEBLOCK);
433
434 DPRINTF(("\n%.*s---------------------\n"
435 "%.*sCall to internal_dfa_exec f=%d\n",
436 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
437
438 ctypes = md->tables + ctypes_offset;
439 lcc = md->tables + lcc_offset;
440 fcc = md->tables + fcc_offset;
441
442 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
443
444 active_states = (stateblock *)(workspace + 2);
445 next_new_state = new_states = active_states + wscount;
446 new_count = 0;
447
448 first_op = this_start_code + 1 + LINK_SIZE +
449 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
450 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
451 ? IMM2_SIZE:0);
452
453 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
454 the alternative states onto the list, and find out where the end is. This
455 makes it possible to use this function recursively, when we want to stop at a
456 matching internal ket rather than at the end.
457
458 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
459 a backward assertion. In that case, we have to find out the maximum amount to
460 move back, and set up each alternative appropriately. */
461
462 if (*first_op == OP_REVERSE)
463 {
464 int max_back = 0;
465 int gone_back;
466
467 end_code = this_start_code;
468 do
469 {
470 int back = GET(end_code, 2+LINK_SIZE);
471 if (back > max_back) max_back = back;
472 end_code += GET(end_code, 1);
473 }
474 while (*end_code == OP_ALT);
475
476 /* If we can't go back the amount required for the longest lookbehind
477 pattern, go back as far as we can; some alternatives may still be viable. */
478
479 #ifdef SUPPORT_UTF
480 /* In character mode we have to step back character by character */
481
482 if (utf)
483 {
484 for (gone_back = 0; gone_back < max_back; gone_back++)
485 {
486 if (current_subject <= start_subject) break;
487 current_subject--;
488 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
489 }
490 }
491 else
492 #endif
493
494 /* In byte-mode we can do this quickly. */
495
496 {
497 gone_back = (current_subject - max_back < start_subject)?
498 (int)(current_subject - start_subject) : max_back;
499 current_subject -= gone_back;
500 }
501
502 /* Save the earliest consulted character */
503
504 if (current_subject < md->start_used_ptr)
505 md->start_used_ptr = current_subject;
506
507 /* Now we can process the individual branches. */
508
509 end_code = this_start_code;
510 do
511 {
512 int back = GET(end_code, 2+LINK_SIZE);
513 if (back <= gone_back)
514 {
515 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
516 ADD_NEW_DATA(-bstate, 0, gone_back - back);
517 }
518 end_code += GET(end_code, 1);
519 }
520 while (*end_code == OP_ALT);
521 }
522
523 /* This is the code for a "normal" subpattern (not a backward assertion). The
524 start of a whole pattern is always one of these. If we are at the top level,
525 we may be asked to restart matching from the same point that we reached for a
526 previous partial match. We still have to scan through the top-level branches to
527 find the end state. */
528
529 else
530 {
531 end_code = this_start_code;
532
533 /* Restarting */
534
535 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
536 {
537 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
538 new_count = workspace[1];
539 if (!workspace[0])
540 memcpy(new_states, active_states, new_count * sizeof(stateblock));
541 }
542
543 /* Not restarting */
544
545 else
546 {
547 int length = 1 + LINK_SIZE +
548 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
549 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
550 ? IMM2_SIZE:0);
551 do
552 {
553 ADD_NEW((int)(end_code - start_code + length), 0);
554 end_code += GET(end_code, 1);
555 length = 1 + LINK_SIZE;
556 }
557 while (*end_code == OP_ALT);
558 }
559 }
560
561 workspace[0] = 0; /* Bit indicating which vector is current */
562
563 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
564
565 /* Loop for scanning the subject */
566
567 ptr = current_subject;
568 for (;;)
569 {
570 int i, j;
571 int clen, dlen;
572 unsigned int c, d;
573 int forced_fail = 0;
574 BOOL could_continue = FALSE;
575
576 /* Make the new state list into the active state list and empty the
577 new state list. */
578
579 temp_states = active_states;
580 active_states = new_states;
581 new_states = temp_states;
582 active_count = new_count;
583 new_count = 0;
584
585 workspace[0] ^= 1; /* Remember for the restarting feature */
586 workspace[1] = active_count;
587
588 #ifdef PCRE_DEBUG
589 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
590 pchars(ptr, STRLEN_UC(ptr), stdout);
591 printf("\"\n");
592
593 printf("%.*sActive states: ", rlevel*2-2, SP);
594 for (i = 0; i < active_count; i++)
595 printf("%d/%d ", active_states[i].offset, active_states[i].count);
596 printf("\n");
597 #endif
598
599 /* Set the pointers for adding new states */
600
601 next_active_state = active_states + active_count;
602 next_new_state = new_states;
603
604 /* Load the current character from the subject outside the loop, as many
605 different states may want to look at it, and we assume that at least one
606 will. */
607
608 if (ptr < end_subject)
609 {
610 clen = 1; /* Number of bytes in the character */
611 #ifdef SUPPORT_UTF
612 if (utf) { GETCHARLEN(c, ptr, clen); } else
613 #endif /* SUPPORT_UTF */
614 c = *ptr;
615 }
616 else
617 {
618 clen = 0; /* This indicates the end of the subject */
619 c = NOTACHAR; /* This value should never actually be used */
620 }
621
622 /* Scan up the active states and act on each one. The result of an action
623 may be to add more states to the currently active list (e.g. on hitting a
624 parenthesis) or it may be to put states on the new list, for considering
625 when we move the character pointer on. */
626
627 for (i = 0; i < active_count; i++)
628 {
629 stateblock *current_state = active_states + i;
630 BOOL caseless = FALSE;
631 const pcre_uchar *code;
632 int state_offset = current_state->offset;
633 int count, codevalue, rrc;
634
635 #ifdef PCRE_DEBUG
636 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
637 if (clen == 0) printf("EOL\n");
638 else if (c > 32 && c < 127) printf("'%c'\n", c);
639 else printf("0x%02x\n", c);
640 #endif
641
642 /* A negative offset is a special case meaning "hold off going to this
643 (negated) state until the number of characters in the data field have
644 been skipped". */
645
646 if (state_offset < 0)
647 {
648 if (current_state->data > 0)
649 {
650 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
651 ADD_NEW_DATA(state_offset, current_state->count,
652 current_state->data - 1);
653 continue;
654 }
655 else
656 {
657 current_state->offset = state_offset = -state_offset;
658 }
659 }
660
661 /* Check for a duplicate state with the same count, and skip if found.
662 See the note at the head of this module about the possibility of improving
663 performance here. */
664
665 for (j = 0; j < i; j++)
666 {
667 if (active_states[j].offset == state_offset &&
668 active_states[j].count == current_state->count)
669 {
670 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
671 goto NEXT_ACTIVE_STATE;
672 }
673 }
674
675 /* The state offset is the offset to the opcode */
676
677 code = start_code + state_offset;
678 codevalue = *code;
679
680 /* If this opcode inspects a character, but we are at the end of the
681 subject, remember the fact for use when testing for a partial match. */
682
683 if (clen == 0 && poptable[codevalue] != 0)
684 could_continue = TRUE;
685
686 /* If this opcode is followed by an inline character, load it. It is
687 tempting to test for the presence of a subject character here, but that
688 is wrong, because sometimes zero repetitions of the subject are
689 permitted.
690
691 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
692 argument that is not a data character - but is always one byte long. We
693 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
694 this case. To keep the other cases fast, convert these ones to new opcodes.
695 */
696
697 if (coptable[codevalue] > 0)
698 {
699 dlen = 1;
700 #ifdef SUPPORT_UTF
701 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
702 #endif /* SUPPORT_UTF */
703 d = code[coptable[codevalue]];
704 if (codevalue >= OP_TYPESTAR)
705 {
706 switch(d)
707 {
708 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
709 case OP_NOTPROP:
710 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
711 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
712 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
713 case OP_NOT_HSPACE:
714 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
715 case OP_NOT_VSPACE:
716 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
717 default: break;
718 }
719 }
720 }
721 else
722 {
723 dlen = 0; /* Not strictly necessary, but compilers moan */
724 d = NOTACHAR; /* if these variables are not set. */
725 }
726
727
728 /* Now process the individual opcodes */
729
730 switch (codevalue)
731 {
732 /* ========================================================================== */
733 /* These cases are never obeyed. This is a fudge that causes a compile-
734 time error if the vectors coptable or poptable, which are indexed by
735 opcode, are not the correct length. It seems to be the only way to do
736 such a check at compile time, as the sizeof() operator does not work
737 in the C preprocessor. */
738
739 case OP_TABLE_LENGTH:
740 case OP_TABLE_LENGTH +
741 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
742 (sizeof(poptable) == OP_TABLE_LENGTH)):
743 break;
744
745 /* ========================================================================== */
746 /* Reached a closing bracket. If not at the end of the pattern, carry
747 on with the next opcode. For repeating opcodes, also add the repeat
748 state. Note that KETRPOS will always be encountered at the end of the
749 subpattern, because the possessive subpattern repeats are always handled
750 using recursive calls. Thus, it never adds any new states.
751
752 At the end of the (sub)pattern, unless we have an empty string and
753 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
754 start of the subject, save the match data, shifting up all previous
755 matches so we always have the longest first. */
756
757 case OP_KET:
758 case OP_KETRMIN:
759 case OP_KETRMAX:
760 case OP_KETRPOS:
761 if (code != end_code)
762 {
763 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
764 if (codevalue != OP_KET)
765 {
766 ADD_ACTIVE(state_offset - GET(code, 1), 0);
767 }
768 }
769 else
770 {
771 if (ptr > current_subject ||
772 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
773 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
774 current_subject > start_subject + md->start_offset)))
775 {
776 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
777 else if (match_count > 0 && ++match_count * 2 > offsetcount)
778 match_count = 0;
779 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
780 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
781 if (offsetcount >= 2)
782 {
783 offsets[0] = (int)(current_subject - start_subject);
784 offsets[1] = (int)(ptr - start_subject);
785 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
786 offsets[1] - offsets[0], current_subject));
787 }
788 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
789 {
790 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
791 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
792 match_count, rlevel*2-2, SP));
793 return match_count;
794 }
795 }
796 }
797 break;
798
799 /* ========================================================================== */
800 /* These opcodes add to the current list of states without looking
801 at the current character. */
802
803 /*-----------------------------------------------------------------*/
804 case OP_ALT:
805 do { code += GET(code, 1); } while (*code == OP_ALT);
806 ADD_ACTIVE((int)(code - start_code), 0);
807 break;
808
809 /*-----------------------------------------------------------------*/
810 case OP_BRA:
811 case OP_SBRA:
812 do
813 {
814 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
815 code += GET(code, 1);
816 }
817 while (*code == OP_ALT);
818 break;
819
820 /*-----------------------------------------------------------------*/
821 case OP_CBRA:
822 case OP_SCBRA:
823 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
824 code += GET(code, 1);
825 while (*code == OP_ALT)
826 {
827 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
828 code += GET(code, 1);
829 }
830 break;
831
832 /*-----------------------------------------------------------------*/
833 case OP_BRAZERO:
834 case OP_BRAMINZERO:
835 ADD_ACTIVE(state_offset + 1, 0);
836 code += 1 + GET(code, 2);
837 while (*code == OP_ALT) code += GET(code, 1);
838 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
839 break;
840
841 /*-----------------------------------------------------------------*/
842 case OP_SKIPZERO:
843 code += 1 + GET(code, 2);
844 while (*code == OP_ALT) code += GET(code, 1);
845 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
846 break;
847
848 /*-----------------------------------------------------------------*/
849 case OP_CIRC:
850 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
851 { ADD_ACTIVE(state_offset + 1, 0); }
852 break;
853
854 /*-----------------------------------------------------------------*/
855 case OP_CIRCM:
856 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
857 (ptr != end_subject && WAS_NEWLINE(ptr)))
858 { ADD_ACTIVE(state_offset + 1, 0); }
859 break;
860
861 /*-----------------------------------------------------------------*/
862 case OP_EOD:
863 if (ptr >= end_subject)
864 {
865 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
866 could_continue = TRUE;
867 else { ADD_ACTIVE(state_offset + 1, 0); }
868 }
869 break;
870
871 /*-----------------------------------------------------------------*/
872 case OP_SOD:
873 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
874 break;
875
876 /*-----------------------------------------------------------------*/
877 case OP_SOM:
878 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
879 break;
880
881
882 /* ========================================================================== */
883 /* These opcodes inspect the next subject character, and sometimes
884 the previous one as well, but do not have an argument. The variable
885 clen contains the length of the current character and is zero if we are
886 at the end of the subject. */
887
888 /*-----------------------------------------------------------------*/
889 case OP_ANY:
890 if (clen > 0 && !IS_NEWLINE(ptr))
891 { ADD_NEW(state_offset + 1, 0); }
892 break;
893
894 /*-----------------------------------------------------------------*/
895 case OP_ALLANY:
896 if (clen > 0)
897 { ADD_NEW(state_offset + 1, 0); }
898 break;
899
900 /*-----------------------------------------------------------------*/
901 case OP_EODN:
902 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
903 could_continue = TRUE;
904 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
905 { ADD_ACTIVE(state_offset + 1, 0); }
906 break;
907
908 /*-----------------------------------------------------------------*/
909 case OP_DOLL:
910 if ((md->moptions & PCRE_NOTEOL) == 0)
911 {
912 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
913 could_continue = TRUE;
914 else if (clen == 0 ||
915 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
916 (ptr == end_subject - md->nllen)
917 ))
918 { ADD_ACTIVE(state_offset + 1, 0); }
919 }
920 break;
921
922 /*-----------------------------------------------------------------*/
923 case OP_DOLLM:
924 if ((md->moptions & PCRE_NOTEOL) == 0)
925 {
926 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
927 could_continue = TRUE;
928 else if (clen == 0 ||
929 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
930 { ADD_ACTIVE(state_offset + 1, 0); }
931 }
932 else if (IS_NEWLINE(ptr))
933 { ADD_ACTIVE(state_offset + 1, 0); }
934 break;
935
936 /*-----------------------------------------------------------------*/
937
938 case OP_DIGIT:
939 case OP_WHITESPACE:
940 case OP_WORDCHAR:
941 if (clen > 0 && c < 256 &&
942 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
943 { ADD_NEW(state_offset + 1, 0); }
944 break;
945
946 /*-----------------------------------------------------------------*/
947 case OP_NOT_DIGIT:
948 case OP_NOT_WHITESPACE:
949 case OP_NOT_WORDCHAR:
950 if (clen > 0 && (c >= 256 ||
951 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
952 { ADD_NEW(state_offset + 1, 0); }
953 break;
954
955 /*-----------------------------------------------------------------*/
956 case OP_WORD_BOUNDARY:
957 case OP_NOT_WORD_BOUNDARY:
958 {
959 int left_word, right_word;
960
961 if (ptr > start_subject)
962 {
963 const pcre_uchar *temp = ptr - 1;
964 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
965 #ifdef SUPPORT_UTF
966 if (utf) { BACKCHAR(temp); }
967 #endif
968 GETCHARTEST(d, temp);
969 #ifdef SUPPORT_UCP
970 if ((md->poptions & PCRE_UCP) != 0)
971 {
972 if (d == '_') left_word = TRUE; else
973 {
974 int cat = UCD_CATEGORY(d);
975 left_word = (cat == ucp_L || cat == ucp_N);
976 }
977 }
978 else
979 #endif
980 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
981 }
982 else left_word = FALSE;
983
984 if (clen > 0)
985 {
986 #ifdef SUPPORT_UCP
987 if ((md->poptions & PCRE_UCP) != 0)
988 {
989 if (c == '_') right_word = TRUE; else
990 {
991 int cat = UCD_CATEGORY(c);
992 right_word = (cat == ucp_L || cat == ucp_N);
993 }
994 }
995 else
996 #endif
997 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
998 }
999 else right_word = FALSE;
1000
1001 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1002 { ADD_ACTIVE(state_offset + 1, 0); }
1003 }
1004 break;
1005
1006
1007 /*-----------------------------------------------------------------*/
1008 /* Check the next character by Unicode property. We will get here only
1009 if the support is in the binary; otherwise a compile-time error occurs.
1010 */
1011
1012 #ifdef SUPPORT_UCP
1013 case OP_PROP:
1014 case OP_NOTPROP:
1015 if (clen > 0)
1016 {
1017 BOOL OK;
1018 const ucd_record * prop = GET_UCD(c);
1019 switch(code[1])
1020 {
1021 case PT_ANY:
1022 OK = TRUE;
1023 break;
1024
1025 case PT_LAMP:
1026 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1027 prop->chartype == ucp_Lt;
1028 break;
1029
1030 case PT_GC:
1031 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1032 break;
1033
1034 case PT_PC:
1035 OK = prop->chartype == code[2];
1036 break;
1037
1038 case PT_SC:
1039 OK = prop->script == code[2];
1040 break;
1041
1042 /* These are specials for combination cases. */
1043
1044 case PT_ALNUM:
1045 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1046 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1047 break;
1048
1049 case PT_SPACE: /* Perl space */
1050 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1051 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1052 break;
1053
1054 case PT_PXSPACE: /* POSIX space */
1055 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1056 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1057 c == CHAR_FF || c == CHAR_CR;
1058 break;
1059
1060 case PT_WORD:
1061 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1062 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1063 c == CHAR_UNDERSCORE;
1064 break;
1065
1066 /* Should never occur, but keep compilers from grumbling. */
1067
1068 default:
1069 OK = codevalue != OP_PROP;
1070 break;
1071 }
1072
1073 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1074 }
1075 break;
1076 #endif
1077
1078
1079
1080 /* ========================================================================== */
1081 /* These opcodes likewise inspect the subject character, but have an
1082 argument that is not a data character. It is one of these opcodes:
1083 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE,
1084 OP_NOT_WHITESPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1085
1086 case OP_TYPEPLUS:
1087 case OP_TYPEMINPLUS:
1088 case OP_TYPEPOSPLUS:
1089 count = current_state->count; /* Already matched */
1090 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1091 if (clen > 0)
1092 {
1093 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1094 (c < 256 &&
1095 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1096 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1097 {
1098 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1099 {
1100 active_count--; /* Remove non-match possibility */
1101 next_active_state--;
1102 }
1103 count++;
1104 ADD_NEW(state_offset, count);
1105 }
1106 }
1107 break;
1108
1109 /*-----------------------------------------------------------------*/
1110 case OP_TYPEQUERY:
1111 case OP_TYPEMINQUERY:
1112 case OP_TYPEPOSQUERY:
1113 ADD_ACTIVE(state_offset + 2, 0);
1114 if (clen > 0)
1115 {
1116 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1117 (c < 256 &&
1118 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1119 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1120 {
1121 if (codevalue == OP_TYPEPOSQUERY)
1122 {
1123 active_count--; /* Remove non-match possibility */
1124 next_active_state--;
1125 }
1126 ADD_NEW(state_offset + 2, 0);
1127 }
1128 }
1129 break;
1130
1131 /*-----------------------------------------------------------------*/
1132 case OP_TYPESTAR:
1133 case OP_TYPEMINSTAR:
1134 case OP_TYPEPOSSTAR:
1135 ADD_ACTIVE(state_offset + 2, 0);
1136 if (clen > 0)
1137 {
1138 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1139 (c < 256 &&
1140 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1141 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1142 {
1143 if (codevalue == OP_TYPEPOSSTAR)
1144 {
1145 active_count--; /* Remove non-match possibility */
1146 next_active_state--;
1147 }
1148 ADD_NEW(state_offset, 0);
1149 }
1150 }
1151 break;
1152
1153 /*-----------------------------------------------------------------*/
1154 case OP_TYPEEXACT:
1155 count = current_state->count; /* Number already matched */
1156 if (clen > 0)
1157 {
1158 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1159 (c < 256 &&
1160 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1161 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1162 {
1163 if (++count >= GET2(code, 1))
1164 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1165 else
1166 { ADD_NEW(state_offset, count); }
1167 }
1168 }
1169 break;
1170
1171 /*-----------------------------------------------------------------*/
1172 case OP_TYPEUPTO:
1173 case OP_TYPEMINUPTO:
1174 case OP_TYPEPOSUPTO:
1175 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1176 count = current_state->count; /* Number already matched */
1177 if (clen > 0)
1178 {
1179 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1180 (c < 256 &&
1181 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1182 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1183 {
1184 if (codevalue == OP_TYPEPOSUPTO)
1185 {
1186 active_count--; /* Remove non-match possibility */
1187 next_active_state--;
1188 }
1189 if (++count >= GET2(code, 1))
1190 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1191 else
1192 { ADD_NEW(state_offset, count); }
1193 }
1194 }
1195 break;
1196
1197 /* ========================================================================== */
1198 /* These are virtual opcodes that are used when something like
1199 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE,
1200 OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its argument. It keeps the
1201 code above fast for the other cases. The argument is in the d variable. */
1202
1203 #ifdef SUPPORT_UCP
1204 case OP_PROP_EXTRA + OP_TYPEPLUS:
1205 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1206 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1207 count = current_state->count; /* Already matched */
1208 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1209 if (clen > 0)
1210 {
1211 BOOL OK;
1212 const ucd_record * prop = GET_UCD(c);
1213 switch(code[2])
1214 {
1215 case PT_ANY:
1216 OK = TRUE;
1217 break;
1218
1219 case PT_LAMP:
1220 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1221 prop->chartype == ucp_Lt;
1222 break;
1223
1224 case PT_GC:
1225 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1226 break;
1227
1228 case PT_PC:
1229 OK = prop->chartype == code[3];
1230 break;
1231
1232 case PT_SC:
1233 OK = prop->script == code[3];
1234 break;
1235
1236 /* These are specials for combination cases. */
1237
1238 case PT_ALNUM:
1239 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1240 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1241 break;
1242
1243 case PT_SPACE: /* Perl space */
1244 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1245 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1246 break;
1247
1248 case PT_PXSPACE: /* POSIX space */
1249 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1250 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1251 c == CHAR_FF || c == CHAR_CR;
1252 break;
1253
1254 case PT_WORD:
1255 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1256 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1257 c == CHAR_UNDERSCORE;
1258 break;
1259
1260 /* Should never occur, but keep compilers from grumbling. */
1261
1262 default:
1263 OK = codevalue != OP_PROP;
1264 break;
1265 }
1266
1267 if (OK == (d == OP_PROP))
1268 {
1269 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1270 {
1271 active_count--; /* Remove non-match possibility */
1272 next_active_state--;
1273 }
1274 count++;
1275 ADD_NEW(state_offset, count);
1276 }
1277 }
1278 break;
1279
1280 /*-----------------------------------------------------------------*/
1281 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1282 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1283 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1284 count = current_state->count; /* Already matched */
1285 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1286 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1287 {
1288 const pcre_uchar *nptr = ptr + clen;
1289 int ncount = 0;
1290 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1291 {
1292 active_count--; /* Remove non-match possibility */
1293 next_active_state--;
1294 }
1295 while (nptr < end_subject)
1296 {
1297 int nd;
1298 int ndlen = 1;
1299 GETCHARLEN(nd, nptr, ndlen);
1300 if (UCD_CATEGORY(nd) != ucp_M) break;
1301 ncount++;
1302 nptr += ndlen;
1303 }
1304 count++;
1305 ADD_NEW_DATA(-state_offset, count, ncount);
1306 }
1307 break;
1308 #endif
1309
1310 /*-----------------------------------------------------------------*/
1311 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1312 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1313 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1314 count = current_state->count; /* Already matched */
1315 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1316 if (clen > 0)
1317 {
1318 int ncount = 0;
1319 switch (c)
1320 {
1321 case 0x000b:
1322 case 0x000c:
1323 case 0x0085:
1324 case 0x2028:
1325 case 0x2029:
1326 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1327 goto ANYNL01;
1328
1329 case 0x000d:
1330 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1331 /* Fall through */
1332
1333 ANYNL01:
1334 case 0x000a:
1335 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1336 {
1337 active_count--; /* Remove non-match possibility */
1338 next_active_state--;
1339 }
1340 count++;
1341 ADD_NEW_DATA(-state_offset, count, ncount);
1342 break;
1343
1344 default:
1345 break;
1346 }
1347 }
1348 break;
1349
1350 /*-----------------------------------------------------------------*/
1351 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1352 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1353 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1354 count = current_state->count; /* Already matched */
1355 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1356 if (clen > 0)
1357 {
1358 BOOL OK;
1359 switch (c)
1360 {
1361 case 0x000a:
1362 case 0x000b:
1363 case 0x000c:
1364 case 0x000d:
1365 case 0x0085:
1366 case 0x2028:
1367 case 0x2029:
1368 OK = TRUE;
1369 break;
1370
1371 default:
1372 OK = FALSE;
1373 break;
1374 }
1375
1376 if (OK == (d == OP_VSPACE))
1377 {
1378 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1379 {
1380 active_count--; /* Remove non-match possibility */
1381 next_active_state--;
1382 }
1383 count++;
1384 ADD_NEW_DATA(-state_offset, count, 0);
1385 }
1386 }
1387 break;
1388
1389 /*-----------------------------------------------------------------*/
1390 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1391 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1392 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1393 count = current_state->count; /* Already matched */
1394 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1395 if (clen > 0)
1396 {
1397 BOOL OK;
1398 switch (c)
1399 {
1400 case 0x09: /* HT */
1401 case 0x20: /* SPACE */
1402 case 0xa0: /* NBSP */
1403 case 0x1680: /* OGHAM SPACE MARK */
1404 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1405 case 0x2000: /* EN QUAD */
1406 case 0x2001: /* EM QUAD */
1407 case 0x2002: /* EN SPACE */
1408 case 0x2003: /* EM SPACE */
1409 case 0x2004: /* THREE-PER-EM SPACE */
1410 case 0x2005: /* FOUR-PER-EM SPACE */
1411 case 0x2006: /* SIX-PER-EM SPACE */
1412 case 0x2007: /* FIGURE SPACE */
1413 case 0x2008: /* PUNCTUATION SPACE */
1414 case 0x2009: /* THIN SPACE */
1415 case 0x200A: /* HAIR SPACE */
1416 case 0x202f: /* NARROW NO-BREAK SPACE */
1417 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1418 case 0x3000: /* IDEOGRAPHIC SPACE */
1419 OK = TRUE;
1420 break;
1421
1422 default:
1423 OK = FALSE;
1424 break;
1425 }
1426
1427 if (OK == (d == OP_HSPACE))
1428 {
1429 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1430 {
1431 active_count--; /* Remove non-match possibility */
1432 next_active_state--;
1433 }
1434 count++;
1435 ADD_NEW_DATA(-state_offset, count, 0);
1436 }
1437 }
1438 break;
1439
1440 /*-----------------------------------------------------------------*/
1441 #ifdef SUPPORT_UCP
1442 case OP_PROP_EXTRA + OP_TYPEQUERY:
1443 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1444 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1445 count = 4;
1446 goto QS1;
1447
1448 case OP_PROP_EXTRA + OP_TYPESTAR:
1449 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1450 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1451 count = 0;
1452
1453 QS1:
1454
1455 ADD_ACTIVE(state_offset + 4, 0);
1456 if (clen > 0)
1457 {
1458 BOOL OK;
1459 const ucd_record * prop = GET_UCD(c);
1460 switch(code[2])
1461 {
1462 case PT_ANY:
1463 OK = TRUE;
1464 break;
1465
1466 case PT_LAMP:
1467 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1468 prop->chartype == ucp_Lt;
1469 break;
1470
1471 case PT_GC:
1472 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1473 break;
1474
1475 case PT_PC:
1476 OK = prop->chartype == code[3];
1477 break;
1478
1479 case PT_SC:
1480 OK = prop->script == code[3];
1481 break;
1482
1483 /* These are specials for combination cases. */
1484
1485 case PT_ALNUM:
1486 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1487 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1488 break;
1489
1490 case PT_SPACE: /* Perl space */
1491 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1492 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1493 break;
1494
1495 case PT_PXSPACE: /* POSIX space */
1496 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1497 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1498 c == CHAR_FF || c == CHAR_CR;
1499 break;
1500
1501 case PT_WORD:
1502 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1503 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1504 c == CHAR_UNDERSCORE;
1505 break;
1506
1507 /* Should never occur, but keep compilers from grumbling. */
1508
1509 default:
1510 OK = codevalue != OP_PROP;
1511 break;
1512 }
1513
1514 if (OK == (d == OP_PROP))
1515 {
1516 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1517 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1518 {
1519 active_count--; /* Remove non-match possibility */
1520 next_active_state--;
1521 }
1522 ADD_NEW(state_offset + count, 0);
1523 }
1524 }
1525 break;
1526
1527 /*-----------------------------------------------------------------*/
1528 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1529 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1530 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1531 count = 2;
1532 goto QS2;
1533
1534 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1535 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1536 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1537 count = 0;
1538
1539 QS2:
1540
1541 ADD_ACTIVE(state_offset + 2, 0);
1542 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1543 {
1544 const pcre_uchar *nptr = ptr + clen;
1545 int ncount = 0;
1546 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1547 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1548 {
1549 active_count--; /* Remove non-match possibility */
1550 next_active_state--;
1551 }
1552 while (nptr < end_subject)
1553 {
1554 int nd;
1555 int ndlen = 1;
1556 GETCHARLEN(nd, nptr, ndlen);
1557 if (UCD_CATEGORY(nd) != ucp_M) break;
1558 ncount++;
1559 nptr += ndlen;
1560 }
1561 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1562 }
1563 break;
1564 #endif
1565
1566 /*-----------------------------------------------------------------*/
1567 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1568 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1569 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1570 count = 2;
1571 goto QS3;
1572
1573 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1574 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1575 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1576 count = 0;
1577
1578 QS3:
1579 ADD_ACTIVE(state_offset + 2, 0);
1580 if (clen > 0)
1581 {
1582 int ncount = 0;
1583 switch (c)
1584 {
1585 case 0x000b:
1586 case 0x000c:
1587 case 0x0085:
1588 case 0x2028:
1589 case 0x2029:
1590 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1591 goto ANYNL02;
1592
1593 case 0x000d:
1594 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1595 /* Fall through */
1596
1597 ANYNL02:
1598 case 0x000a:
1599 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1600 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1601 {
1602 active_count--; /* Remove non-match possibility */
1603 next_active_state--;
1604 }
1605 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1606 break;
1607
1608 default:
1609 break;
1610 }
1611 }
1612 break;
1613
1614 /*-----------------------------------------------------------------*/
1615 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1616 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1617 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1618 count = 2;
1619 goto QS4;
1620
1621 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1622 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1623 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1624 count = 0;
1625
1626 QS4:
1627 ADD_ACTIVE(state_offset + 2, 0);
1628 if (clen > 0)
1629 {
1630 BOOL OK;
1631 switch (c)
1632 {
1633 case 0x000a:
1634 case 0x000b:
1635 case 0x000c:
1636 case 0x000d:
1637 case 0x0085:
1638 case 0x2028:
1639 case 0x2029:
1640 OK = TRUE;
1641 break;
1642
1643 default:
1644 OK = FALSE;
1645 break;
1646 }
1647 if (OK == (d == OP_VSPACE))
1648 {
1649 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1650 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1651 {
1652 active_count--; /* Remove non-match possibility */
1653 next_active_state--;
1654 }
1655 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1656 }
1657 }
1658 break;
1659
1660 /*-----------------------------------------------------------------*/
1661 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1662 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1663 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1664 count = 2;
1665 goto QS5;
1666
1667 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1668 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1669 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1670 count = 0;
1671
1672 QS5:
1673 ADD_ACTIVE(state_offset + 2, 0);
1674 if (clen > 0)
1675 {
1676 BOOL OK;
1677 switch (c)
1678 {
1679 case 0x09: /* HT */
1680 case 0x20: /* SPACE */
1681 case 0xa0: /* NBSP */
1682 case 0x1680: /* OGHAM SPACE MARK */
1683 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1684 case 0x2000: /* EN QUAD */
1685 case 0x2001: /* EM QUAD */
1686 case 0x2002: /* EN SPACE */
1687 case 0x2003: /* EM SPACE */
1688 case 0x2004: /* THREE-PER-EM SPACE */
1689 case 0x2005: /* FOUR-PER-EM SPACE */
1690 case 0x2006: /* SIX-PER-EM SPACE */
1691 case 0x2007: /* FIGURE SPACE */
1692 case 0x2008: /* PUNCTUATION SPACE */
1693 case 0x2009: /* THIN SPACE */
1694 case 0x200A: /* HAIR SPACE */
1695 case 0x202f: /* NARROW NO-BREAK SPACE */
1696 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1697 case 0x3000: /* IDEOGRAPHIC SPACE */
1698 OK = TRUE;
1699 break;
1700
1701 default:
1702 OK = FALSE;
1703 break;
1704 }
1705
1706 if (OK == (d == OP_HSPACE))
1707 {
1708 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1709 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1710 {
1711 active_count--; /* Remove non-match possibility */
1712 next_active_state--;
1713 }
1714 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1715 }
1716 }
1717 break;
1718
1719 /*-----------------------------------------------------------------*/
1720 #ifdef SUPPORT_UCP
1721 case OP_PROP_EXTRA + OP_TYPEEXACT:
1722 case OP_PROP_EXTRA + OP_TYPEUPTO:
1723 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1724 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1725 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1726 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1727 count = current_state->count; /* Number already matched */
1728 if (clen > 0)
1729 {
1730 BOOL OK;
1731 const ucd_record * prop = GET_UCD(c);
1732 switch(code[1 + IMM2_SIZE + 1])
1733 {
1734 case PT_ANY:
1735 OK = TRUE;
1736 break;
1737
1738 case PT_LAMP:
1739 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1740 prop->chartype == ucp_Lt;
1741 break;
1742
1743 case PT_GC:
1744 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1745 break;
1746
1747 case PT_PC:
1748 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1749 break;
1750
1751 case PT_SC:
1752 OK = prop->script == code[1 + IMM2_SIZE + 2];
1753 break;
1754
1755 /* These are specials for combination cases. */
1756
1757 case PT_ALNUM:
1758 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1759 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1760 break;
1761
1762 case PT_SPACE: /* Perl space */
1763 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1764 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1765 break;
1766
1767 case PT_PXSPACE: /* POSIX space */
1768 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1769 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1770 c == CHAR_FF || c == CHAR_CR;
1771 break;
1772
1773 case PT_WORD:
1774 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1775 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1776 c == CHAR_UNDERSCORE;
1777 break;
1778
1779 /* Should never occur, but keep compilers from grumbling. */
1780
1781 default:
1782 OK = codevalue != OP_PROP;
1783 break;
1784 }
1785
1786 if (OK == (d == OP_PROP))
1787 {
1788 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1789 {
1790 active_count--; /* Remove non-match possibility */
1791 next_active_state--;
1792 }
1793 if (++count >= GET2(code, 1))
1794 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1795 else
1796 { ADD_NEW(state_offset, count); }
1797 }
1798 }
1799 break;
1800
1801 /*-----------------------------------------------------------------*/
1802 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1803 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1804 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1805 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1806 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1807 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1808 count = current_state->count; /* Number already matched */
1809 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1810 {
1811 const pcre_uchar *nptr = ptr + clen;
1812 int ncount = 0;
1813 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1814 {
1815 active_count--; /* Remove non-match possibility */
1816 next_active_state--;
1817 }
1818 while (nptr < end_subject)
1819 {
1820 int nd;
1821 int ndlen = 1;
1822 GETCHARLEN(nd, nptr, ndlen);
1823 if (UCD_CATEGORY(nd) != ucp_M) break;
1824 ncount++;
1825 nptr += ndlen;
1826 }
1827 if (++count >= GET2(code, 1))
1828 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1829 else
1830 { ADD_NEW_DATA(-state_offset, count, ncount); }
1831 }
1832 break;
1833 #endif
1834
1835 /*-----------------------------------------------------------------*/
1836 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1837 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1838 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1839 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1840 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1841 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1842 count = current_state->count; /* Number already matched */
1843 if (clen > 0)
1844 {
1845 int ncount = 0;
1846 switch (c)
1847 {
1848 case 0x000b:
1849 case 0x000c:
1850 case 0x0085:
1851 case 0x2028:
1852 case 0x2029:
1853 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1854 goto ANYNL03;
1855
1856 case 0x000d:
1857 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1858 /* Fall through */
1859
1860 ANYNL03:
1861 case 0x000a:
1862 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1863 {
1864 active_count--; /* Remove non-match possibility */
1865 next_active_state--;
1866 }
1867 if (++count >= GET2(code, 1))
1868 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1869 else
1870 { ADD_NEW_DATA(-state_offset, count, ncount); }
1871 break;
1872
1873 default:
1874 break;
1875 }
1876 }
1877 break;
1878
1879 /*-----------------------------------------------------------------*/
1880 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1881 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1882 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1883 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1884 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1885 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1886 count = current_state->count; /* Number already matched */
1887 if (clen > 0)
1888 {
1889 BOOL OK;
1890 switch (c)
1891 {
1892 case 0x000a:
1893 case 0x000b:
1894 case 0x000c:
1895 case 0x000d:
1896 case 0x0085:
1897 case 0x2028:
1898 case 0x2029:
1899 OK = TRUE;
1900 break;
1901
1902 default:
1903 OK = FALSE;
1904 }
1905
1906 if (OK == (d == OP_VSPACE))
1907 {
1908 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1909 {
1910 active_count--; /* Remove non-match possibility */
1911 next_active_state--;
1912 }
1913 if (++count >= GET2(code, 1))
1914 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1915 else
1916 { ADD_NEW_DATA(-state_offset, count, 0); }
1917 }
1918 }
1919 break;
1920
1921 /*-----------------------------------------------------------------*/
1922 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1923 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1924 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1925 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1926 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1927 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1928 count = current_state->count; /* Number already matched */
1929 if (clen > 0)
1930 {
1931 BOOL OK;
1932 switch (c)
1933 {
1934 case 0x09: /* HT */
1935 case 0x20: /* SPACE */
1936 case 0xa0: /* NBSP */
1937 case 0x1680: /* OGHAM SPACE MARK */
1938 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1939 case 0x2000: /* EN QUAD */
1940 case 0x2001: /* EM QUAD */
1941 case 0x2002: /* EN SPACE */
1942 case 0x2003: /* EM SPACE */
1943 case 0x2004: /* THREE-PER-EM SPACE */
1944 case 0x2005: /* FOUR-PER-EM SPACE */
1945 case 0x2006: /* SIX-PER-EM SPACE */
1946 case 0x2007: /* FIGURE SPACE */
1947 case 0x2008: /* PUNCTUATION SPACE */
1948 case 0x2009: /* THIN SPACE */
1949 case 0x200A: /* HAIR SPACE */
1950 case 0x202f: /* NARROW NO-BREAK SPACE */
1951 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1952 case 0x3000: /* IDEOGRAPHIC SPACE */
1953 OK = TRUE;
1954 break;
1955
1956 default:
1957 OK = FALSE;
1958 break;
1959 }
1960
1961 if (OK == (d == OP_HSPACE))
1962 {
1963 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1964 {
1965 active_count--; /* Remove non-match possibility */
1966 next_active_state--;
1967 }
1968 if (++count >= GET2(code, 1))
1969 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1970 else
1971 { ADD_NEW_DATA(-state_offset, count, 0); }
1972 }
1973 }
1974 break;
1975
1976 /* ========================================================================== */
1977 /* These opcodes are followed by a character that is usually compared
1978 to the current subject character; it is loaded into d. We still get
1979 here even if there is no subject character, because in some cases zero
1980 repetitions are permitted. */
1981
1982 /*-----------------------------------------------------------------*/
1983 case OP_CHAR:
1984 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1985 break;
1986
1987 /*-----------------------------------------------------------------*/
1988 case OP_CHARI:
1989 if (clen == 0) break;
1990
1991 #ifdef SUPPORT_UTF
1992 if (utf)
1993 {
1994 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1995 {
1996 unsigned int othercase;
1997 if (c < 128)
1998 othercase = fcc[c];
1999 else
2000 /* If we have Unicode property support, we can use it to test the
2001 other case of the character. */
2002 #ifdef SUPPORT_UCP
2003 othercase = UCD_OTHERCASE(c);
2004 #else
2005 othercase = NOTACHAR;
2006 #endif
2007
2008 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2009 }
2010 }
2011 else
2012 #endif /* SUPPORT_UTF */
2013 /* Not UTF mode */
2014 {
2015 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2016 { ADD_NEW(state_offset + 2, 0); }
2017 }
2018 break;
2019
2020
2021 #ifdef SUPPORT_UCP
2022 /*-----------------------------------------------------------------*/
2023 /* This is a tricky one because it can match more than one character.
2024 Find out how many characters to skip, and then set up a negative state
2025 to wait for them to pass before continuing. */
2026
2027 case OP_EXTUNI:
2028 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2029 {
2030 const pcre_uchar *nptr = ptr + clen;
2031 int ncount = 0;
2032 while (nptr < end_subject)
2033 {
2034 int nclen = 1;
2035 GETCHARLEN(c, nptr, nclen);
2036 if (UCD_CATEGORY(c) != ucp_M) break;
2037 ncount++;
2038 nptr += nclen;
2039 }
2040 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2041 }
2042 break;
2043 #endif
2044
2045 /*-----------------------------------------------------------------*/
2046 /* This is tricky, like EXTUNI, because it too can match more than one
2047 character (when CR is followed by LF). In this case, set up a negative
2048 state to wait for one character to pass before continuing. */
2049
2050 case OP_ANYNL:
2051 if (clen > 0) switch(c)
2052 {
2053 case 0x000b:
2054 case 0x000c:
2055 case 0x0085:
2056 case 0x2028:
2057 case 0x2029:
2058 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2059
2060 case 0x000a:
2061 ADD_NEW(state_offset + 1, 0);
2062 break;
2063
2064 case 0x000d:
2065 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2066 {
2067 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2068 }
2069 else
2070 {
2071 ADD_NEW(state_offset + 1, 0);
2072 }
2073 break;
2074 }
2075 break;
2076
2077 /*-----------------------------------------------------------------*/
2078 case OP_NOT_VSPACE:
2079 if (clen > 0) switch(c)
2080 {
2081 case 0x000a:
2082 case 0x000b:
2083 case 0x000c:
2084 case 0x000d:
2085 case 0x0085:
2086 case 0x2028:
2087 case 0x2029:
2088 break;
2089
2090 default:
2091 ADD_NEW(state_offset + 1, 0);
2092 break;
2093 }
2094 break;
2095
2096 /*-----------------------------------------------------------------*/
2097 case OP_VSPACE:
2098 if (clen > 0) switch(c)
2099 {
2100 case 0x000a:
2101 case 0x000b:
2102 case 0x000c:
2103 case 0x000d:
2104 case 0x0085:
2105 case 0x2028:
2106 case 0x2029:
2107 ADD_NEW(state_offset + 1, 0);
2108 break;
2109
2110 default: break;
2111 }
2112 break;
2113
2114 /*-----------------------------------------------------------------*/
2115 case OP_NOT_HSPACE:
2116 if (clen > 0) switch(c)
2117 {
2118 case 0x09: /* HT */
2119 case 0x20: /* SPACE */
2120 case 0xa0: /* NBSP */
2121 case 0x1680: /* OGHAM SPACE MARK */
2122 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2123 case 0x2000: /* EN QUAD */
2124 case 0x2001: /* EM QUAD */
2125 case 0x2002: /* EN SPACE */
2126 case 0x2003: /* EM SPACE */
2127 case 0x2004: /* THREE-PER-EM SPACE */
2128 case 0x2005: /* FOUR-PER-EM SPACE */
2129 case 0x2006: /* SIX-PER-EM SPACE */
2130 case 0x2007: /* FIGURE SPACE */
2131 case 0x2008: /* PUNCTUATION SPACE */
2132 case 0x2009: /* THIN SPACE */
2133 case 0x200A: /* HAIR SPACE */
2134 case 0x202f: /* NARROW NO-BREAK SPACE */
2135 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2136 case 0x3000: /* IDEOGRAPHIC SPACE */
2137 break;
2138
2139 default:
2140 ADD_NEW(state_offset + 1, 0);
2141 break;
2142 }
2143 break;
2144
2145 /*-----------------------------------------------------------------*/
2146 case OP_HSPACE:
2147 if (clen > 0) switch(c)
2148 {
2149 case 0x09: /* HT */
2150 case 0x20: /* SPACE */
2151 case 0xa0: /* NBSP */
2152 case 0x1680: /* OGHAM SPACE MARK */
2153 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2154 case 0x2000: /* EN QUAD */
2155 case 0x2001: /* EM QUAD */
2156 case 0x2002: /* EN SPACE */
2157 case 0x2003: /* EM SPACE */
2158 case 0x2004: /* THREE-PER-EM SPACE */
2159 case 0x2005: /* FOUR-PER-EM SPACE */
2160 case 0x2006: /* SIX-PER-EM SPACE */
2161 case 0x2007: /* FIGURE SPACE */
2162 case 0x2008: /* PUNCTUATION SPACE */
2163 case 0x2009: /* THIN SPACE */
2164 case 0x200A: /* HAIR SPACE */
2165 case 0x202f: /* NARROW NO-BREAK SPACE */
2166 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2167 case 0x3000: /* IDEOGRAPHIC SPACE */
2168 ADD_NEW(state_offset + 1, 0);
2169 break;
2170 }
2171 break;
2172
2173 /*-----------------------------------------------------------------*/
2174 /* Match a negated single character casefully. This is only used for
2175 one-byte characters, that is, we know that d < 256. The character we are
2176 checking (c) can be multibyte. */
2177
2178 case OP_NOT:
2179 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2180 break;
2181
2182 /*-----------------------------------------------------------------*/
2183 /* Match a negated single character caselessly. This is only used for
2184 one-byte characters, that is, we know that d < 256. The character we are
2185 checking (c) can be multibyte. */
2186
2187 case OP_NOTI:
2188 if (clen > 0 && c != d && c != fcc[d])
2189 { ADD_NEW(state_offset + dlen + 1, 0); }
2190 break;
2191
2192 /*-----------------------------------------------------------------*/
2193 case OP_PLUSI:
2194 case OP_MINPLUSI:
2195 case OP_POSPLUSI:
2196 case OP_NOTPLUSI:
2197 case OP_NOTMINPLUSI:
2198 case OP_NOTPOSPLUSI:
2199 caseless = TRUE;
2200 codevalue -= OP_STARI - OP_STAR;
2201
2202 /* Fall through */
2203 case OP_PLUS:
2204 case OP_MINPLUS:
2205 case OP_POSPLUS:
2206 case OP_NOTPLUS:
2207 case OP_NOTMINPLUS:
2208 case OP_NOTPOSPLUS:
2209 count = current_state->count; /* Already matched */
2210 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2211 if (clen > 0)
2212 {
2213 unsigned int otherd = NOTACHAR;
2214 if (caseless)
2215 {
2216 #ifdef SUPPORT_UTF
2217 if (utf && d >= 128)
2218 {
2219 #ifdef SUPPORT_UCP
2220 otherd = UCD_OTHERCASE(d);
2221 #endif /* SUPPORT_UCP */
2222 }
2223 else
2224 #endif /* SUPPORT_UTF */
2225 otherd = fcc[d];
2226 }
2227 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2228 {
2229 if (count > 0 &&
2230 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2231 {
2232 active_count--; /* Remove non-match possibility */
2233 next_active_state--;
2234 }
2235 count++;
2236 ADD_NEW(state_offset, count);
2237 }
2238 }
2239 break;
2240
2241 /*-----------------------------------------------------------------*/
2242 case OP_QUERYI:
2243 case OP_MINQUERYI:
2244 case OP_POSQUERYI:
2245 case OP_NOTQUERYI:
2246 case OP_NOTMINQUERYI:
2247 case OP_NOTPOSQUERYI:
2248 caseless = TRUE;
2249 codevalue -= OP_STARI - OP_STAR;
2250 /* Fall through */
2251 case OP_QUERY:
2252 case OP_MINQUERY:
2253 case OP_POSQUERY:
2254 case OP_NOTQUERY:
2255 case OP_NOTMINQUERY:
2256 case OP_NOTPOSQUERY:
2257 ADD_ACTIVE(state_offset + dlen + 1, 0);
2258 if (clen > 0)
2259 {
2260 unsigned int otherd = NOTACHAR;
2261 if (caseless)
2262 {
2263 #ifdef SUPPORT_UTF
2264 if (utf && d >= 128)
2265 {
2266 #ifdef SUPPORT_UCP
2267 otherd = UCD_OTHERCASE(d);
2268 #endif /* SUPPORT_UCP */
2269 }
2270 else
2271 #endif /* SUPPORT_UTF */
2272 otherd = fcc[d];
2273 }
2274 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2275 {
2276 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2277 {
2278 active_count--; /* Remove non-match possibility */
2279 next_active_state--;
2280 }
2281 ADD_NEW(state_offset + dlen + 1, 0);
2282 }
2283 }
2284 break;
2285
2286 /*-----------------------------------------------------------------*/
2287 case OP_STARI:
2288 case OP_MINSTARI:
2289 case OP_POSSTARI:
2290 case OP_NOTSTARI:
2291 case OP_NOTMINSTARI:
2292 case OP_NOTPOSSTARI:
2293 caseless = TRUE;
2294 codevalue -= OP_STARI - OP_STAR;
2295 /* Fall through */
2296 case OP_STAR:
2297 case OP_MINSTAR:
2298 case OP_POSSTAR:
2299 case OP_NOTSTAR:
2300 case OP_NOTMINSTAR:
2301 case OP_NOTPOSSTAR:
2302 ADD_ACTIVE(state_offset + dlen + 1, 0);
2303 if (clen > 0)
2304 {
2305 unsigned int otherd = NOTACHAR;
2306 if (caseless)
2307 {
2308 #ifdef SUPPORT_UTF
2309 if (utf && d >= 128)
2310 {
2311 #ifdef SUPPORT_UCP
2312 otherd = UCD_OTHERCASE(d);
2313 #endif /* SUPPORT_UCP */
2314 }
2315 else
2316 #endif /* SUPPORT_UTF */
2317 otherd = fcc[d];
2318 }
2319 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2320 {
2321 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2322 {
2323 active_count--; /* Remove non-match possibility */
2324 next_active_state--;
2325 }
2326 ADD_NEW(state_offset, 0);
2327 }
2328 }
2329 break;
2330
2331 /*-----------------------------------------------------------------*/
2332 case OP_EXACTI:
2333 case OP_NOTEXACTI:
2334 caseless = TRUE;
2335 codevalue -= OP_STARI - OP_STAR;
2336 /* Fall through */
2337 case OP_EXACT:
2338 case OP_NOTEXACT:
2339 count = current_state->count; /* Number already matched */
2340 if (clen > 0)
2341 {
2342 unsigned int otherd = NOTACHAR;
2343 if (caseless)
2344 {
2345 #ifdef SUPPORT_UTF
2346 if (utf && d >= 128)
2347 {
2348 #ifdef SUPPORT_UCP
2349 otherd = UCD_OTHERCASE(d);
2350 #endif /* SUPPORT_UCP */
2351 }
2352 else
2353 #endif /* SUPPORT_UTF */
2354 otherd = fcc[d];
2355 }
2356 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2357 {
2358 if (++count >= GET2(code, 1))
2359 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2360 else
2361 { ADD_NEW(state_offset, count); }
2362 }
2363 }
2364 break;
2365
2366 /*-----------------------------------------------------------------*/
2367 case OP_UPTOI:
2368 case OP_MINUPTOI:
2369 case OP_POSUPTOI:
2370 case OP_NOTUPTOI:
2371 case OP_NOTMINUPTOI:
2372 case OP_NOTPOSUPTOI:
2373 caseless = TRUE;
2374 codevalue -= OP_STARI - OP_STAR;
2375 /* Fall through */
2376 case OP_UPTO:
2377 case OP_MINUPTO:
2378 case OP_POSUPTO:
2379 case OP_NOTUPTO:
2380 case OP_NOTMINUPTO:
2381 case OP_NOTPOSUPTO:
2382 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2383 count = current_state->count; /* Number already matched */
2384 if (clen > 0)
2385 {
2386 unsigned int otherd = NOTACHAR;
2387 if (caseless)
2388 {
2389 #ifdef SUPPORT_UTF
2390 if (utf && d >= 128)
2391 {
2392 #ifdef SUPPORT_UCP
2393 otherd = UCD_OTHERCASE(d);
2394 #endif /* SUPPORT_UCP */
2395 }
2396 else
2397 #endif /* SUPPORT_UTF */
2398 otherd = fcc[d];
2399 }
2400 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2401 {
2402 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2403 {
2404 active_count--; /* Remove non-match possibility */
2405 next_active_state--;
2406 }
2407 if (++count >= GET2(code, 1))
2408 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2409 else
2410 { ADD_NEW(state_offset, count); }
2411 }
2412 }
2413 break;
2414
2415
2416 /* ========================================================================== */
2417 /* These are the class-handling opcodes */
2418
2419 case OP_CLASS:
2420 case OP_NCLASS:
2421 case OP_XCLASS:
2422 {
2423 BOOL isinclass = FALSE;
2424 int next_state_offset;
2425 const pcre_uchar *ecode;
2426
2427 /* For a simple class, there is always just a 32-byte table, and we
2428 can set isinclass from it. */
2429
2430 if (codevalue != OP_XCLASS)
2431 {
2432 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2433 if (clen > 0)
2434 {
2435 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2436 ((code[1 + c/8] & (1 << (c&7))) != 0);
2437 }
2438 }
2439
2440 /* An extended class may have a table or a list of single characters,
2441 ranges, or both, and it may be positive or negative. There's a
2442 function that sorts all this out. */
2443
2444 else
2445 {
2446 ecode = code + GET(code, 1);
2447 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2448 }
2449
2450 /* At this point, isinclass is set for all kinds of class, and ecode
2451 points to the byte after the end of the class. If there is a
2452 quantifier, this is where it will be. */
2453
2454 next_state_offset = (int)(ecode - start_code);
2455
2456 switch (*ecode)
2457 {
2458 case OP_CRSTAR:
2459 case OP_CRMINSTAR:
2460 ADD_ACTIVE(next_state_offset + 1, 0);
2461 if (isinclass) { ADD_NEW(state_offset, 0); }
2462 break;
2463
2464 case OP_CRPLUS:
2465 case OP_CRMINPLUS:
2466 count = current_state->count; /* Already matched */
2467 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2468 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2469 break;
2470
2471 case OP_CRQUERY:
2472 case OP_CRMINQUERY:
2473 ADD_ACTIVE(next_state_offset + 1, 0);
2474 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2475 break;
2476
2477 case OP_CRRANGE:
2478 case OP_CRMINRANGE:
2479 count = current_state->count; /* Already matched */
2480 if (count >= GET2(ecode, 1))
2481 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2482 if (isinclass)
2483 {
2484 int max = GET2(ecode, 3);
2485 if (++count >= max && max != 0) /* Max 0 => no limit */
2486 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2487 else
2488 { ADD_NEW(state_offset, count); }
2489 }
2490 break;
2491
2492 default:
2493 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2494 break;
2495 }
2496 }
2497 break;
2498
2499 /* ========================================================================== */
2500 /* These are the opcodes for fancy brackets of various kinds. We have
2501 to use recursion in order to handle them. The "always failing" assertion
2502 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2503 though the other "backtracking verbs" are not supported. */
2504
2505 case OP_FAIL:
2506 forced_fail++; /* Count FAILs for multiple states */
2507 break;
2508
2509 case OP_ASSERT:
2510 case OP_ASSERT_NOT:
2511 case OP_ASSERTBACK:
2512 case OP_ASSERTBACK_NOT:
2513 {
2514 int rc;
2515 int local_offsets[2];
2516 int local_workspace[1000];
2517 const pcre_uchar *endasscode = code + GET(code, 1);
2518
2519 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2520
2521 rc = internal_dfa_exec(
2522 md, /* static match data */
2523 code, /* this subexpression's code */
2524 ptr, /* where we currently are */
2525 (int)(ptr - start_subject), /* start offset */
2526 local_offsets, /* offset vector */
2527 sizeof(local_offsets)/sizeof(int), /* size of same */
2528 local_workspace, /* workspace vector */
2529 sizeof(local_workspace)/sizeof(int), /* size of same */
2530 rlevel); /* function recursion level */
2531
2532 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2533 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2534 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2535 }
2536 break;
2537
2538 /*-----------------------------------------------------------------*/
2539 case OP_COND:
2540 case OP_SCOND:
2541 {
2542 int local_offsets[1000];
2543 int local_workspace[1000];
2544 int codelink = GET(code, 1);
2545 int condcode;
2546
2547 /* Because of the way auto-callout works during compile, a callout item
2548 is inserted between OP_COND and an assertion condition. This does not
2549 happen for the other conditions. */
2550
2551 if (code[LINK_SIZE+1] == OP_CALLOUT)
2552 {
2553 rrc = 0;
2554 if (PUBL(callout) != NULL)
2555 {
2556 pcre_callout_block cb;
2557 cb.version = 1; /* Version 1 of the callout block */
2558 cb.callout_number = code[LINK_SIZE+2];
2559 cb.offset_vector = offsets;
2560 cb.subject = (PCRE_SPTR)start_subject;
2561 cb.subject_length = (int)(end_subject - start_subject);
2562 cb.start_match = (int)(current_subject - start_subject);
2563 cb.current_position = (int)(ptr - start_subject);
2564 cb.pattern_position = GET(code, LINK_SIZE + 3);
2565 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2566 cb.capture_top = 1;
2567 cb.capture_last = -1;
2568 cb.callout_data = md->callout_data;
2569 cb.mark = NULL; /* No (*MARK) support */
2570 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2571 }
2572 if (rrc > 0) break; /* Fail this thread */
2573 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2574 }
2575
2576 condcode = code[LINK_SIZE+1];
2577
2578 /* Back reference conditions are not supported */
2579
2580 if (condcode == OP_CREF || condcode == OP_NCREF)
2581 return PCRE_ERROR_DFA_UCOND;
2582
2583 /* The DEFINE condition is always false */
2584
2585 if (condcode == OP_DEF)
2586 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2587
2588 /* The only supported version of OP_RREF is for the value RREF_ANY,
2589 which means "test if in any recursion". We can't test for specifically
2590 recursed groups. */
2591
2592 else if (condcode == OP_RREF || condcode == OP_NRREF)
2593 {
2594 int value = GET2(code, LINK_SIZE+2);
2595 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2596 if (md->recursive != NULL)
2597 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2598 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2599 }
2600
2601 /* Otherwise, the condition is an assertion */
2602
2603 else
2604 {
2605 int rc;
2606 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2607 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2608
2609 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2610
2611 rc = internal_dfa_exec(
2612 md, /* fixed match data */
2613 asscode, /* this subexpression's code */
2614 ptr, /* where we currently are */
2615 (int)(ptr - start_subject), /* start offset */
2616 local_offsets, /* offset vector */
2617 sizeof(local_offsets)/sizeof(int), /* size of same */
2618 local_workspace, /* workspace vector */
2619 sizeof(local_workspace)/sizeof(int), /* size of same */
2620 rlevel); /* function recursion level */
2621
2622 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2623 if ((rc >= 0) ==
2624 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2625 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2626 else
2627 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2628 }
2629 }
2630 break;
2631
2632 /*-----------------------------------------------------------------*/
2633 case OP_RECURSE:
2634 {
2635 dfa_recursion_info *ri;
2636 int local_offsets[1000];
2637 int local_workspace[1000];
2638 const pcre_uchar *callpat = start_code + GET(code, 1);
2639 int recno = (callpat == md->start_code)? 0 :
2640 GET2(callpat, 1 + LINK_SIZE);
2641 int rc;
2642
2643 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2644
2645 /* Check for repeating a recursion without advancing the subject
2646 pointer. This should catch convoluted mutual recursions. (Some simple
2647 cases are caught at compile time.) */
2648
2649 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2650 if (recno == ri->group_num && ptr == ri->subject_position)
2651 return PCRE_ERROR_RECURSELOOP;
2652
2653 /* Remember this recursion and where we started it so as to
2654 catch infinite loops. */
2655
2656 new_recursive.group_num = recno;
2657 new_recursive.subject_position = ptr;
2658 new_recursive.prevrec = md->recursive;
2659 md->recursive = &new_recursive;
2660
2661 rc = internal_dfa_exec(
2662 md, /* fixed match data */
2663 callpat, /* this subexpression's code */
2664 ptr, /* where we currently are */
2665 (int)(ptr - start_subject), /* start offset */
2666 local_offsets, /* offset vector */
2667 sizeof(local_offsets)/sizeof(int), /* size of same */
2668 local_workspace, /* workspace vector */
2669 sizeof(local_workspace)/sizeof(int), /* size of same */
2670 rlevel); /* function recursion level */
2671
2672 md->recursive = new_recursive.prevrec; /* Done this recursion */
2673
2674 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2675 rc));
2676
2677 /* Ran out of internal offsets */
2678
2679 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2680
2681 /* For each successfully matched substring, set up the next state with a
2682 count of characters to skip before trying it. Note that the count is in
2683 characters, not bytes. */
2684
2685 if (rc > 0)
2686 {
2687 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2688 {
2689 const pcre_uchar *p = start_subject + local_offsets[rc];
2690 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2691 int charcount = local_offsets[rc+1] - local_offsets[rc];
2692 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2693 if (charcount > 0)
2694 {
2695 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2696 }
2697 else
2698 {
2699 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2700 }
2701 }
2702 }
2703 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2704 }
2705 break;
2706
2707 /*-----------------------------------------------------------------*/
2708 case OP_BRAPOS:
2709 case OP_SBRAPOS:
2710 case OP_CBRAPOS:
2711 case OP_SCBRAPOS:
2712 case OP_BRAPOSZERO:
2713 {
2714 int charcount, matched_count;
2715 const pcre_uchar *local_ptr = ptr;
2716 BOOL allow_zero;
2717
2718 if (codevalue == OP_BRAPOSZERO)
2719 {
2720 allow_zero = TRUE;
2721 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2722 }
2723 else allow_zero = FALSE;
2724
2725 /* Loop to match the subpattern as many times as possible as if it were
2726 a complete pattern. */
2727
2728 for (matched_count = 0;; matched_count++)
2729 {
2730 int local_offsets[2];
2731 int local_workspace[1000];
2732
2733 int rc = internal_dfa_exec(
2734 md, /* fixed match data */
2735 code, /* this subexpression's code */
2736 local_ptr, /* where we currently are */
2737 (int)(ptr - start_subject), /* start offset */
2738 local_offsets, /* offset vector */
2739 sizeof(local_offsets)/sizeof(int), /* size of same */
2740 local_workspace, /* workspace vector */
2741 sizeof(local_workspace)/sizeof(int), /* size of same */
2742 rlevel); /* function recursion level */
2743
2744 /* Failed to match */
2745
2746 if (rc < 0)
2747 {
2748 if (rc != PCRE_ERROR_NOMATCH) return rc;
2749 break;
2750 }
2751
2752 /* Matched: break the loop if zero characters matched. */
2753
2754 charcount = local_offsets[1] - local_offsets[0];
2755 if (charcount == 0) break;
2756 local_ptr += charcount; /* Advance temporary position ptr */
2757 }
2758
2759 /* At this point we have matched the subpattern matched_count
2760 times, and local_ptr is pointing to the character after the end of the
2761 last match. */
2762
2763 if (matched_count > 0 || allow_zero)
2764 {
2765 const pcre_uchar *end_subpattern = code;
2766 int next_state_offset;
2767
2768 do { end_subpattern += GET(end_subpattern, 1); }
2769 while (*end_subpattern == OP_ALT);
2770 next_state_offset =
2771 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2772
2773 /* Optimization: if there are no more active states, and there
2774 are no new states yet set up, then skip over the subject string
2775 right here, to save looping. Otherwise, set up the new state to swing
2776 into action when the end of the matched substring is reached. */
2777
2778 if (i + 1 >= active_count && new_count == 0)
2779 {
2780 ptr = local_ptr;
2781 clen = 0;
2782 ADD_NEW(next_state_offset, 0);
2783 }
2784 else
2785 {
2786 const pcre_uchar *p = ptr;
2787 const pcre_uchar *pp = local_ptr;
2788 charcount = (int)(pp - p);
2789 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2790 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2791 }
2792 }
2793 }
2794 break;
2795
2796 /*-----------------------------------------------------------------*/
2797 case OP_ONCE:
2798 case OP_ONCE_NC:
2799 {
2800 int local_offsets[2];
2801 int local_workspace[1000];
2802
2803 int rc = internal_dfa_exec(
2804 md, /* fixed match data */
2805 code, /* this subexpression's code */
2806 ptr, /* where we currently are */
2807 (int)(ptr - start_subject), /* start offset */
2808 local_offsets, /* offset vector */
2809 sizeof(local_offsets)/sizeof(int), /* size of same */
2810 local_workspace, /* workspace vector */
2811 sizeof(local_workspace)/sizeof(int), /* size of same */
2812 rlevel); /* function recursion level */
2813
2814 if (rc >= 0)
2815 {
2816 const pcre_uchar *end_subpattern = code;
2817 int charcount = local_offsets[1] - local_offsets[0];
2818 int next_state_offset, repeat_state_offset;
2819
2820 do { end_subpattern += GET(end_subpattern, 1); }
2821 while (*end_subpattern == OP_ALT);
2822 next_state_offset =
2823 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2824
2825 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2826 arrange for the repeat state also to be added to the relevant list.
2827 Calculate the offset, or set -1 for no repeat. */
2828
2829 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2830 *end_subpattern == OP_KETRMIN)?
2831 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2832
2833 /* If we have matched an empty string, add the next state at the
2834 current character pointer. This is important so that the duplicate
2835 checking kicks in, which is what breaks infinite loops that match an
2836 empty string. */
2837
2838 if (charcount == 0)
2839 {
2840 ADD_ACTIVE(next_state_offset, 0);
2841 }
2842
2843 /* Optimization: if there are no more active states, and there
2844 are no new states yet set up, then skip over the subject string
2845 right here, to save looping. Otherwise, set up the new state to swing
2846 into action when the end of the matched substring is reached. */
2847
2848 else if (i + 1 >= active_count && new_count == 0)
2849 {
2850 ptr += charcount;
2851 clen = 0;
2852 ADD_NEW(next_state_offset, 0);
2853
2854 /* If we are adding a repeat state at the new character position,
2855 we must fudge things so that it is the only current state.
2856 Otherwise, it might be a duplicate of one we processed before, and
2857 that would cause it to be skipped. */
2858
2859 if (repeat_state_offset >= 0)
2860 {
2861 next_active_state = active_states;
2862 active_count = 0;
2863 i = -1;
2864 ADD_ACTIVE(repeat_state_offset, 0);
2865 }
2866 }
2867 else
2868 {
2869 const pcre_uchar *p = start_subject + local_offsets[0];
2870 const pcre_uchar *pp = start_subject + local_offsets[1];
2871 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2872 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2873 if (repeat_state_offset >= 0)
2874 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2875 }
2876 }
2877 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2878 }
2879 break;
2880
2881
2882 /* ========================================================================== */
2883 /* Handle callouts */
2884
2885 case OP_CALLOUT:
2886 rrc = 0;
2887 if (PUBL(callout) != NULL)
2888 {
2889 pcre_callout_block cb;
2890 cb.version = 1; /* Version 1 of the callout block */
2891 cb.callout_number = code[1];
2892 cb.offset_vector = offsets;
2893 cb.subject = (PCRE_SPTR)start_subject;
2894 cb.subject_length = (int)(end_subject - start_subject);
2895 cb.start_match = (int)(current_subject - start_subject);
2896 cb.current_position = (int)(ptr - start_subject);
2897 cb.pattern_position = GET(code, 2);
2898 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2899 cb.capture_top = 1;
2900 cb.capture_last = -1;
2901 cb.callout_data = md->callout_data;
2902 cb.mark = NULL; /* No (*MARK) support */
2903 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2904 }
2905 if (rrc == 0)
2906 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2907 break;
2908
2909
2910 /* ========================================================================== */
2911 default: /* Unsupported opcode */
2912 return PCRE_ERROR_DFA_UITEM;
2913 }
2914
2915 NEXT_ACTIVE_STATE: continue;
2916
2917 } /* End of loop scanning active states */
2918
2919 /* We have finished the processing at the current subject character. If no
2920 new states have been set for the next character, we have found all the
2921 matches that we are going to find. If we are at the top level and partial
2922 matching has been requested, check for appropriate conditions.
2923
2924 The "forced_fail" variable counts the number of (*F) encountered for the
2925 character. If it is equal to the original active_count (saved in
2926 workspace[1]) it means that (*F) was found on every active state. In this
2927 case we don't want to give a partial match.
2928
2929 The "could_continue" variable is true if a state could have continued but
2930 for the fact that the end of the subject was reached. */
2931
2932 if (new_count <= 0)
2933 {
2934 if (rlevel == 1 && /* Top level, and */
2935 could_continue && /* Some could go on */
2936 forced_fail != workspace[1] && /* Not all forced fail & */
2937 ( /* either... */
2938 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2939 || /* or... */
2940 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2941 match_count < 0) /* no matches */
2942 ) && /* And... */
2943 ptr >= end_subject && /* Reached end of subject */
2944 ptr > md->start_used_ptr) /* Inspected non-empty string */
2945 {
2946 if (offsetcount >= 2)
2947 {
2948 offsets[0] = (int)(md->start_used_ptr - start_subject);
2949 offsets[1] = (int)(end_subject - start_subject);
2950 }
2951 match_count = PCRE_ERROR_PARTIAL;
2952 }
2953
2954 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2955 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2956 rlevel*2-2, SP));
2957 break; /* In effect, "return", but see the comment below */
2958 }
2959
2960 /* One or more states are active for the next character. */
2961
2962 ptr += clen; /* Advance to next subject character */
2963 } /* Loop to move along the subject string */
2964
2965 /* Control gets here from "break" a few lines above. We do it this way because
2966 if we use "return" above, we have compiler trouble. Some compilers warn if
2967 there's nothing here because they think the function doesn't return a value. On
2968 the other hand, if we put a dummy statement here, some more clever compilers
2969 complain that it can't be reached. Sigh. */
2970
2971 return match_count;
2972 }
2973
2974
2975
2976
2977 /*************************************************
2978 * Execute a Regular Expression - DFA engine *
2979 *************************************************/
2980
2981 /* This external function applies a compiled re to a subject string using a DFA
2982 engine. This function calls the internal function multiple times if the pattern
2983 is not anchored.
2984
2985 Arguments:
2986 argument_re points to the compiled expression
2987 extra_data points to extra data or is NULL
2988 subject points to the subject string
2989 length length of subject string (may contain binary zeros)
2990 start_offset where to start in the subject string
2991 options option bits
2992 offsets vector of match offsets
2993 offsetcount size of same
2994 workspace workspace vector
2995 wscount size of same
2996
2997 Returns: > 0 => number of match offset pairs placed in offsets
2998 = 0 => offsets overflowed; longest matches are present
2999 -1 => failed to match
3000 < -1 => some kind of unexpected problem
3001 */
3002
3003 #ifdef COMPILE_PCRE8
3004 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3005 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3006 const char *subject, int length, int start_offset, int options, int *offsets,
3007 int offsetcount, int *workspace, int wscount)
3008 #else
3009 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3010 pcre16_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3011 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3012 int offsetcount, int *workspace, int wscount)
3013 #endif
3014 {
3015 real_pcre *re = (real_pcre *)argument_re;
3016 dfa_match_data match_block;
3017 dfa_match_data *md = &match_block;
3018 BOOL utf, anchored, startline, firstline;
3019 const pcre_uchar *current_subject, *end_subject;
3020 const pcre_uint8 *lcc;
3021
3022 pcre_study_data internal_study;
3023 const pcre_study_data *study = NULL;
3024 real_pcre internal_re;
3025
3026 const pcre_uchar *req_char_ptr;
3027 const pcre_uint8 *start_bits = NULL;
3028 BOOL has_first_char = FALSE;
3029 BOOL has_req_char = FALSE;
3030 pcre_uchar first_char = 0;
3031 pcre_uchar first_char2 = 0;
3032 pcre_uchar req_char = 0;
3033 pcre_uchar req_char2 = 0;
3034 int newline;
3035
3036 /* Plausibility checks */
3037
3038 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3039 if (re == NULL || subject == NULL || workspace == NULL ||
3040 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3041 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3042 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3043 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3044
3045 /* We need to find the pointer to any study data before we test for byte
3046 flipping, so we scan the extra_data block first. This may set two fields in the
3047 match block, so we must initialize them beforehand. However, the other fields
3048 in the match block must not be set until after the byte flipping. */
3049
3050 md->tables = re->tables;
3051 md->callout_data = NULL;
3052
3053 if (extra_data != NULL)
3054 {
3055 unsigned int flags = extra_data->flags;
3056 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3057 study = (const pcre_study_data *)extra_data->study_data;
3058 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3059 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3060 return PCRE_ERROR_DFA_UMLIMIT;
3061 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3062 md->callout_data = extra_data->callout_data;
3063 if ((flags & PCRE_EXTRA_TABLES) != 0)
3064 md->tables = extra_data->tables;
3065 }
3066
3067 /* Check that the first field in the block is the magic number. If it is not,
3068 test for a regex that was compiled on a host of opposite endianness. If this is
3069 the case, flipped values are put in internal_re and internal_study if there was
3070 study data too. */
3071
3072 if (re->magic_number != MAGIC_NUMBER)
3073 {
3074 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
3075 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3076 if (study != NULL) study = &internal_study;
3077 }
3078 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3079
3080 /* Set some local values */
3081
3082 current_subject = (const pcre_uchar *)subject + start_offset;
3083 end_subject = (const pcre_uchar *)subject + length;
3084 req_char_ptr = current_subject - 1;
3085
3086 #ifdef SUPPORT_UTF
3087 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3088 utf = (re->options & PCRE_UTF8) != 0;
3089 #else
3090 utf = FALSE;
3091 #endif
3092
3093 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3094 (re->options & PCRE_ANCHORED) != 0;
3095
3096 /* The remaining fixed data for passing around. */
3097
3098 md->start_code = (const pcre_uchar *)argument_re +
3099 re->name_table_offset + re->name_count * re->name_entry_size;
3100 md->start_subject = (const pcre_uchar *)subject;
3101 md->end_subject = end_subject;
3102 md->start_offset = start_offset;
3103 md->moptions = options;
3104 md->poptions = re->options;
3105
3106 /* If the BSR option is not set at match time, copy what was set
3107 at compile time. */
3108
3109 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3110 {
3111 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3112 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3113 #ifdef BSR_ANYCRLF
3114 else md->moptions |= PCRE_BSR_ANYCRLF;
3115 #endif
3116 }
3117
3118 /* Handle different types of newline. The three bits give eight cases. If
3119 nothing is set at run time, whatever was used at compile time applies. */
3120
3121 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3122 PCRE_NEWLINE_BITS)
3123 {
3124 case 0: newline = NEWLINE; break; /* Compile-time default */
3125 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3126 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3127 case PCRE_NEWLINE_CR+
3128 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3129 case PCRE_NEWLINE_ANY: newline = -1; break;
3130 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3131 default: return PCRE_ERROR_BADNEWLINE;
3132 }
3133
3134 if (newline == -2)
3135 {
3136 md->nltype = NLTYPE_ANYCRLF;
3137 }
3138 else if (newline < 0)
3139 {
3140 md->nltype = NLTYPE_ANY;
3141 }
3142 else
3143 {
3144 md->nltype = NLTYPE_FIXED;
3145 if (newline > 255)
3146 {
3147 md->nllen = 2;
3148 md->nl[0] = (newline >> 8) & 255;
3149 md->nl[1] = newline & 255;
3150 }
3151 else
3152 {
3153 md->nllen = 1;
3154 md->nl[0] = newline;
3155 }
3156 }
3157
3158 /* Check a UTF string if required. If it is invalid and offsetcount is at
3159 least 2, the error offset and error code are passed back in offsets[0..1]. */
3160
3161 #ifdef SUPPORT_UTF
3162 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3163 {
3164 int erroroffset;
3165 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3166 if (errorcode != 0)
3167 {
3168 if (offsetcount >= 2)
3169 {
3170 offsets[0] = erroroffset;
3171 offsets[1] = errorcode;
3172 }
3173 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3174 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3175 }
3176 if (start_offset > 0 && start_offset < length &&
3177 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3178 return PCRE_ERROR_BADUTF8_OFFSET;
3179 }
3180 #endif
3181
3182 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3183 is a feature that makes it possible to save compiled regex and re-use them
3184 in other programs later. */
3185
3186 if (md->tables == NULL) md->tables = PRIV(default_tables);
3187
3188 /* The lower casing table and the "must be at the start of a line" flag are
3189 used in a loop when finding where to start. */
3190
3191 lcc = md->tables + lcc_offset;
3192 startline = (re->flags & PCRE_STARTLINE) != 0;
3193 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3194
3195 /* Set up the first character to match, if available. The first_char value is
3196 never set for an anchored regular expression, but the anchoring may be forced
3197 at run time, so we have to test for anchoring. The first char may be unset for
3198 an unanchored pattern, of course. If there's no first char and the pattern was
3199 studied, there may be a bitmap of possible first characters. */
3200
3201 if (!anchored)
3202 {
3203 if ((re->flags & PCRE_FIRSTSET) != 0)
3204 {
3205 has_first_char = TRUE;
3206 first_char = first_char2 = re->first_char;
3207 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3208 {
3209 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3210 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3211 if (utf && first_char > 127)
3212 first_char2 = UCD_OTHERCASE(first_char);
3213 #endif
3214 }
3215 }
3216 else
3217 {
3218 if (!startline && study != NULL &&
3219 (study->flags & PCRE_STUDY_MAPPED) != 0)
3220 start_bits = study->start_bits;
3221 }
3222 }
3223
3224 /* For anchored or unanchored matches, there may be a "last known required
3225 character" set. */
3226
3227 if ((re->flags & PCRE_REQCHSET) != 0)
3228 {
3229 has_req_char = TRUE;
3230 req_char = req_char2 = re->req_char;
3231 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3232 {
3233 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3234 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3235 if (utf && req_char > 127)
3236 req_char2 = UCD_OTHERCASE(req_char);
3237 #endif
3238 }
3239 }
3240
3241 /* Call the main matching function, looping for a non-anchored regex after a
3242 failed match. If not restarting, perform certain optimizations at the start of
3243 a match. */
3244
3245 for (;;)
3246 {
3247 int rc;
3248
3249 if ((options & PCRE_DFA_RESTART) == 0)
3250 {
3251 const pcre_uchar *save_end_subject = end_subject;
3252
3253 /* If firstline is TRUE, the start of the match is constrained to the first
3254 line of a multiline string. Implement this by temporarily adjusting
3255 end_subject so that we stop scanning at a newline. If the match fails at
3256 the newline, later code breaks this loop. */
3257
3258 if (firstline)
3259 {
3260 PCRE_PUCHAR t = current_subject;
3261 #ifdef SUPPORT_UTF
3262 if (utf)
3263 {
3264 while (t < md->end_subject && !IS_NEWLINE(t))
3265 {
3266 t++;
3267 ACROSSCHAR(t < end_subject, *t, t++);
3268 }
3269 }
3270 else
3271 #endif
3272 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3273 end_subject = t;
3274 }
3275
3276 /* There are some optimizations that avoid running the match if a known
3277 starting point is not found. However, there is an option that disables
3278 these, for testing and for ensuring that all callouts do actually occur.
3279 The option can be set in the regex by (*NO_START_OPT) or passed in
3280 match-time options. */
3281
3282 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3283 {
3284 /* Advance to a known first char. */
3285
3286 if (has_first_char)
3287 {
3288 if (first_char != first_char2)
3289 while (current_subject < end_subject &&
3290 *current_subject != first_char && *current_subject != first_char2)
3291 current_subject++;
3292 else
3293 while (current_subject < end_subject &&
3294 *current_subject != first_char)
3295 current_subject++;
3296 }
3297
3298 /* Or to just after a linebreak for a multiline match if possible */
3299
3300 else if (startline)
3301 {
3302 if (current_subject > md->start_subject + start_offset)
3303 {
3304 #ifdef SUPPORT_UTF
3305 if (utf)
3306 {
3307 while (current_subject < end_subject &&
3308 !WAS_NEWLINE(current_subject))
3309 {
3310 current_subject++;
3311 ACROSSCHAR(current_subject < end_subject, *current_subject,
3312 current_subject++);
3313 }
3314 }
3315 else
3316 #endif
3317 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3318 current_subject++;
3319
3320 /* If we have just passed a CR and the newline option is ANY or
3321 ANYCRLF, and we are now at a LF, advance the match position by one
3322 more character. */
3323
3324 if (current_subject[-1] == CHAR_CR &&
3325 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3326 current_subject < end_subject &&
3327 *current_subject == CHAR_NL)
3328 current_subject++;
3329 }
3330 }
3331
3332 /* Or to a non-unique first char after study */
3333
3334 else if (start_bits != NULL)
3335 {
3336 while (current_subject < end_subject)
3337 {
3338 register unsigned int c = *current_subject;
3339 #ifndef COMPILE_PCRE8
3340 if (c > 255) c = 255;
3341 #endif
3342 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3343 {
3344 current_subject++;
3345 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3346 /* In non 8-bit mode, the iteration will stop for
3347 characters > 255 at the beginning or not stop at all. */
3348 if (utf)
3349 ACROSSCHAR(current_subject < end_subject, *current_subject,
3350 current_subject++);
3351 #endif
3352 }
3353 else break;
3354 }
3355 }
3356 }
3357
3358 /* Restore fudged end_subject */
3359
3360 end_subject = save_end_subject;
3361
3362 /* The following two optimizations are disabled for partial matching or if
3363 disabling is explicitly requested (and of course, by the test above, this
3364 code is not obeyed when restarting after a partial match). */
3365
3366 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3367 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3368 {
3369 /* If the pattern was studied, a minimum subject length may be set. This
3370 is a lower bound; no actual string of that length may actually match the
3371 pattern. Although the value is, strictly, in characters, we treat it as
3372 bytes to avoid spending too much time in this optimization. */
3373
3374 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3375 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3376 return PCRE_ERROR_NOMATCH;
3377
3378 /* If req_char is set, we know that that character must appear in the
3379 subject for the match to succeed. If the first character is set, req_char
3380 must be later in the subject; otherwise the test starts at the match
3381 point. This optimization can save a huge amount of work in patterns with
3382 nested unlimited repeats that aren't going to match. Writing separate
3383 code for cased/caseless versions makes it go faster, as does using an
3384 autoincrement and backing off on a match.
3385
3386 HOWEVER: when the subject string is very, very long, searching to its end
3387 can take a long time, and give bad performance on quite ordinary
3388 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3389 string... so we don't do this when the string is sufficiently long. */
3390
3391 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3392 {
3393 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3394
3395 /* We don't need to repeat the search if we haven't yet reached the
3396 place we found it at last time. */
3397
3398 if (p > req_char_ptr)
3399 {
3400 if (req_char != req_char2)
3401 {
3402 while (p < end_subject)
3403 {
3404 register int pp = *p++;
3405 if (pp == req_char || pp == req_char2) { p--; break; }
3406 }
3407 }
3408 else
3409 {
3410 while (p < end_subject)
3411 {
3412 if (*p++ == req_char) { p--; break; }
3413 }
3414 }
3415
3416 /* If we can't find the required character, break the matching loop,
3417 which will cause a return of PCRE_ERROR_NOMATCH. */
3418
3419 if (p >= end_subject) break;
3420
3421 /* If we have found the required character, save the point where we
3422 found it, so that we don't search again next time round the loop if
3423 the start hasn't passed this character yet. */
3424
3425 req_char_ptr = p;
3426 }
3427 }
3428 }
3429 } /* End of optimizations that are done when not restarting */
3430
3431 /* OK, now we can do the business */
3432
3433 md->start_used_ptr = current_subject;
3434 md->recursive = NULL;
3435
3436 rc = internal_dfa_exec(
3437 md, /* fixed match data */
3438 md->start_code, /* this subexpression's code */
3439 current_subject, /* where we currently are */
3440 start_offset, /* start offset in subject */
3441 offsets, /* offset vector */
3442 offsetcount, /* size of same */
3443 workspace, /* workspace vector */
3444 wscount, /* size of same */
3445 0); /* function recurse level */
3446
3447 /* Anything other than "no match" means we are done, always; otherwise, carry
3448 on only if not anchored. */
3449
3450 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3451
3452 /* Advance to the next subject character unless we are at the end of a line
3453 and firstline is set. */
3454
3455 if (firstline && IS_NEWLINE(current_subject)) break;
3456 current_subject++;
3457 #ifdef SUPPORT_UTF
3458 if (utf)
3459 {
3460 ACROSSCHAR(current_subject < end_subject, *current_subject,
3461 current_subject++);
3462 }
3463 #endif
3464 if (current_subject > end_subject) break;
3465
3466 /* If we have just passed a CR and we are now at a LF, and the pattern does
3467 not contain any explicit matches for \r or \n, and the newline option is CRLF
3468 or ANY or ANYCRLF, advance the match position by one more character. */
3469
3470 if (current_subject[-1] == CHAR_CR &&
3471 current_subject < end_subject &&
3472 *current_subject == CHAR_NL &&
3473 (re->flags & PCRE_HASCRORLF) == 0 &&
3474 (md->nltype == NLTYPE_ANY ||
3475 md->nltype == NLTYPE_ANYCRLF ||
3476 md->nllen == 2))
3477 current_subject++;
3478
3479 } /* "Bumpalong" loop */
3480
3481 return PCRE_ERROR_NOMATCH;
3482 }
3483
3484 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5