/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 774 - (show annotations)
Thu Dec 1 06:08:45 2011 UTC (3 years, 5 months ago) by zherczeg
File MIME type: text/plain
File size: 120231 byte(s)
Error occurred while calculating annotation data.
better digit parsing, first_byte, req_byte are renamed to first_char req_char respectively
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2011 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75
76 #ifdef HAVE_CONFIG_H
77 #include "config.h"
78 #endif
79
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
83
84 #include "pcre_internal.h"
85
86
87 /* For use to indent debugging output */
88
89 #define SP " "
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100
101 #define OP_PROP_EXTRA 300
102 #define OP_EXTUNI_EXTRA 320
103 #define OP_ANYNL_EXTRA 340
104 #define OP_HSPACE_EXTRA 360
105 #define OP_VSPACE_EXTRA 380
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
116 static const pcre_uint8 coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 3, 3, 3, /* upto, minupto, exact */
132 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 3, 3, 3, /* upto I, minupto I, exact I */
135 1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 /* Negative single-char repeats - only for chars < 256 */
137 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138 3, 3, 3, /* NOT upto, minupto, exact */
139 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141 3, 3, 3, /* NOT upto I, minupto I, exact I */
142 1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 /* Positive type repeats */
144 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145 3, 3, 3, /* Type upto, minupto, exact */
146 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 /* Character class & ref repeats */
148 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149 0, 0, /* CRRANGE, CRMINRANGE */
150 0, /* CLASS */
151 0, /* NCLASS */
152 0, /* XCLASS - variable length */
153 0, /* REF */
154 0, /* REFI */
155 0, /* RECURSE */
156 0, /* CALLOUT */
157 0, /* Alt */
158 0, /* Ket */
159 0, /* KetRmax */
160 0, /* KetRmin */
161 0, /* KetRpos */
162 0, /* Reverse */
163 0, /* Assert */
164 0, /* Assert not */
165 0, /* Assert behind */
166 0, /* Assert behind not */
167 0, 0, /* ONCE, ONCE_NC */
168 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
169 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
170 0, 0, /* CREF, NCREF */
171 0, 0, /* RREF, NRREF */
172 0, /* DEF */
173 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
174 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
175 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
176 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
177 0, 0 /* CLOSE, SKIPZERO */
178 };
179
180 /* This table identifies those opcodes that inspect a character. It is used to
181 remember the fact that a character could have been inspected when the end of
182 the subject is reached. ***NOTE*** If the start of this table is modified, the
183 two tables that follow must also be modified. */
184
185 static const pcre_uint8 poptable[] = {
186 0, /* End */
187 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
188 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
189 1, 1, 1, /* Any, AllAny, Anybyte */
190 1, 1, /* \P, \p */
191 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
192 1, /* \X */
193 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
194 1, /* Char */
195 1, /* Chari */
196 1, /* not */
197 1, /* noti */
198 /* Positive single-char repeats */
199 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200 1, 1, 1, /* upto, minupto, exact */
201 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
202 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
203 1, 1, 1, /* upto I, minupto I, exact I */
204 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
205 /* Negative single-char repeats - only for chars < 256 */
206 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
207 1, 1, 1, /* NOT upto, minupto, exact */
208 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
209 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
210 1, 1, 1, /* NOT upto I, minupto I, exact I */
211 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
212 /* Positive type repeats */
213 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
214 1, 1, 1, /* Type upto, minupto, exact */
215 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
216 /* Character class & ref repeats */
217 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
218 1, 1, /* CRRANGE, CRMINRANGE */
219 1, /* CLASS */
220 1, /* NCLASS */
221 1, /* XCLASS - variable length */
222 0, /* REF */
223 0, /* REFI */
224 0, /* RECURSE */
225 0, /* CALLOUT */
226 0, /* Alt */
227 0, /* Ket */
228 0, /* KetRmax */
229 0, /* KetRmin */
230 0, /* KetRpos */
231 0, /* Reverse */
232 0, /* Assert */
233 0, /* Assert not */
234 0, /* Assert behind */
235 0, /* Assert behind not */
236 0, 0, /* ONCE, ONCE_NC */
237 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
238 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
239 0, 0, /* CREF, NCREF */
240 0, 0, /* RREF, NRREF */
241 0, /* DEF */
242 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
243 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
244 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
245 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
246 0, 0 /* CLOSE, SKIPZERO */
247 };
248
249 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250 and \w */
251
252 static const pcre_uint8 toptable1[] = {
253 0, 0, 0, 0, 0, 0,
254 ctype_digit, ctype_digit,
255 ctype_space, ctype_space,
256 ctype_word, ctype_word,
257 0, 0 /* OP_ANY, OP_ALLANY */
258 };
259
260 static const pcre_uint8 toptable2[] = {
261 0, 0, 0, 0, 0, 0,
262 ctype_digit, 0,
263 ctype_space, 0,
264 ctype_word, 0,
265 1, 1 /* OP_ANY, OP_ALLANY */
266 };
267
268
269 /* Structure for holding data about a particular state, which is in effect the
270 current data for an active path through the match tree. It must consist
271 entirely of ints because the working vector we are passed, and which we put
272 these structures in, is a vector of ints. */
273
struct stateblock {
  int offset;                    /* Where the opcode is, as an offset */
  int count;                     /* Repeat counter */
  int data;                      /* Extra data; only some states use it */
};

typedef struct stateblock stateblock;

/* Number of ints that one stateblock takes up in the int workspace vector. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock) / sizeof(int))
281
282
283 #ifdef PCRE_DEBUG
284 /*************************************************
285 * Print character string *
286 *************************************************/
287
/* Character string printing function for debugging. A byte for which
isprint() is true is written unchanged; any other byte is written as a
backslash, the letter x, and the byte value as two lower-case hex digits.

Arguments:
  p           points to string; it is only read, never written through
  length      number of bytes; nothing is printed when this is <= 0
  f           where to print

Returns:      nothing

NOTE(review): the debug call site passes a pcre_uchar pointer cast for this
parameter; if pcre_uchar is wider than one byte the output is of raw bytes,
not characters - confirm before relying on it in a 16-bit build.
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  /* The value comes from an unsigned char, so it is always a valid argument
  for isprint(). */
  int c = *p++;
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
310 #endif
311
312
313
314 /*************************************************
315 * Execute a Regular Expression - DFA engine *
316 *************************************************/
317
318 /* This internal function applies a compiled pattern to a subject string,
319 starting at a given point, using a DFA engine. This function is called from the
320 external one, possibly multiple times if the pattern is not anchored. The
321 function calls itself recursively for some kinds of subpattern.
322
323 Arguments:
324 md the match_data block with fixed information
325 this_start_code the opening bracket of this subexpression's code
326 current_subject where we currently are in the subject string
327 start_offset start offset in the subject string
328 offsets vector to contain the matching string offsets
329 offsetcount size of same
330 workspace vector of workspace
331 wscount size of same
332 rlevel function call recursion level
333
334 Returns: > 0 => number of match offset pairs placed in offsets
335 = 0 => offsets overflowed; longest matches are present
336 -1 => failed to match
337 < -1 => some kind of unexpected problem
338
339 The following macros are used for adding states to the two state vectors (one
340 for the current character, one for the following character). */
341
/* Each macro appends one state to a list if the per-list counter is still
below wscount, and otherwise makes the enclosing function return
PCRE_ERROR_DFA_WSSIZE. The counter is incremented in both cases. Every use must
be followed by a semicolon. */

/* Append (offset x, count y) to the active list. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

/* Append (offset x, count y, data z) to the active list. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

/* Append (offset x, count y) to the list for the following character. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

/* Append (offset x, count y, data z) to the list for the following
character. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
383
384 /* And now, here is the code */
385
386 static int
387 internal_dfa_exec(
388 dfa_match_data *md,
389 const pcre_uchar *this_start_code,
390 const pcre_uchar *current_subject,
391 int start_offset,
392 int *offsets,
393 int offsetcount,
394 int *workspace,
395 int wscount,
396 int rlevel)
397 {
398 stateblock *active_states, *new_states, *temp_states;
399 stateblock *next_active_state, *next_new_state;
400
401 const pcre_uint8 *ctypes, *lcc, *fcc;
402 const pcre_uchar *ptr;
403 const pcre_uchar *end_code, *first_op;
404
405 dfa_recursion_info new_recursive;
406
407 int active_count, new_count, match_count;
408
409 /* Some fields in the md block are frequently referenced, so we load them into
410 independent variables in the hope that this will perform better. */
411
412 const pcre_uchar *start_subject = md->start_subject;
413 const pcre_uchar *end_subject = md->end_subject;
414 const pcre_uchar *start_code = md->start_code;
415
416 #ifdef SUPPORT_UTF8
417 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
418 #else
419 BOOL utf8 = FALSE;
420 #endif
421
422 rlevel++;
423 offsetcount &= (-2);
424
425 wscount -= 2;
426 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
427 (2 * INTS_PER_STATEBLOCK);
428
429 DPRINTF(("\n%.*s---------------------\n"
430 "%.*sCall to internal_dfa_exec f=%d\n",
431 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432
433 ctypes = md->tables + ctypes_offset;
434 lcc = md->tables + lcc_offset;
435 fcc = md->tables + fcc_offset;
436
437 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
438
439 active_states = (stateblock *)(workspace + 2);
440 next_new_state = new_states = active_states + wscount;
441 new_count = 0;
442
443 first_op = this_start_code + 1 + LINK_SIZE +
444 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
446 ? IMM2_SIZE:0);
447
448 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
449 the alternative states onto the list, and find out where the end is. This
450 makes it possible to use this function recursively, when we want to stop at a
451 matching internal ket rather than at the end.
452
453 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
454 a backward assertion. In that case, we have to find out the maximum amount to
455 move back, and set up each alternative appropriately. */
456
457 if (*first_op == OP_REVERSE)
458 {
459 int max_back = 0;
460 int gone_back;
461
462 end_code = this_start_code;
463 do
464 {
465 int back = GET(end_code, 2+LINK_SIZE);
466 if (back > max_back) max_back = back;
467 end_code += GET(end_code, 1);
468 }
469 while (*end_code == OP_ALT);
470
471 /* If we can't go back the amount required for the longest lookbehind
472 pattern, go back as far as we can; some alternatives may still be viable. */
473
474 #ifdef SUPPORT_UTF8
475 /* In character mode we have to step back character by character */
476
477 if (utf8)
478 {
479 for (gone_back = 0; gone_back < max_back; gone_back++)
480 {
481 if (current_subject <= start_subject) break;
482 current_subject--;
483 while (current_subject > start_subject &&
484 (*current_subject & 0xc0) == 0x80)
485 current_subject--;
486 }
487 }
488 else
489 #endif
490
491 /* In byte-mode we can do this quickly. */
492
493 {
494 gone_back = (current_subject - max_back < start_subject)?
495 (int)(current_subject - start_subject) : max_back;
496 current_subject -= gone_back;
497 }
498
499 /* Save the earliest consulted character */
500
501 if (current_subject < md->start_used_ptr)
502 md->start_used_ptr = current_subject;
503
504 /* Now we can process the individual branches. */
505
506 end_code = this_start_code;
507 do
508 {
509 int back = GET(end_code, 2+LINK_SIZE);
510 if (back <= gone_back)
511 {
512 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
513 ADD_NEW_DATA(-bstate, 0, gone_back - back);
514 }
515 end_code += GET(end_code, 1);
516 }
517 while (*end_code == OP_ALT);
518 }
519
520 /* This is the code for a "normal" subpattern (not a backward assertion). The
521 start of a whole pattern is always one of these. If we are at the top level,
522 we may be asked to restart matching from the same point that we reached for a
523 previous partial match. We still have to scan through the top-level branches to
524 find the end state. */
525
526 else
527 {
528 end_code = this_start_code;
529
530 /* Restarting */
531
532 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
533 {
534 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
535 new_count = workspace[1];
536 if (!workspace[0])
537 memcpy(new_states, active_states, new_count * sizeof(stateblock));
538 }
539
540 /* Not restarting */
541
542 else
543 {
544 int length = 1 + LINK_SIZE +
545 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
546 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
547 ? IMM2_SIZE:0);
548 do
549 {
550 ADD_NEW((int)(end_code - start_code + length), 0);
551 end_code += GET(end_code, 1);
552 length = 1 + LINK_SIZE;
553 }
554 while (*end_code == OP_ALT);
555 }
556 }
557
558 workspace[0] = 0; /* Bit indicating which vector is current */
559
560 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
561
562 /* Loop for scanning the subject */
563
564 ptr = current_subject;
565 for (;;)
566 {
567 int i, j;
568 int clen, dlen;
569 unsigned int c, d;
570 int forced_fail = 0;
571 BOOL could_continue = FALSE;
572
573 /* Make the new state list into the active state list and empty the
574 new state list. */
575
576 temp_states = active_states;
577 active_states = new_states;
578 new_states = temp_states;
579 active_count = new_count;
580 new_count = 0;
581
582 workspace[0] ^= 1; /* Remember for the restarting feature */
583 workspace[1] = active_count;
584
585 #ifdef PCRE_DEBUG
586 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
587 pchars((pcre_uchar *)ptr, strlen((char *)ptr), stdout);
588 printf("\"\n");
589
590 printf("%.*sActive states: ", rlevel*2-2, SP);
591 for (i = 0; i < active_count; i++)
592 printf("%d/%d ", active_states[i].offset, active_states[i].count);
593 printf("\n");
594 #endif
595
596 /* Set the pointers for adding new states */
597
598 next_active_state = active_states + active_count;
599 next_new_state = new_states;
600
601 /* Load the current character from the subject outside the loop, as many
602 different states may want to look at it, and we assume that at least one
603 will. */
604
605 if (ptr < end_subject)
606 {
607 clen = 1; /* Number of bytes in the character */
608 #ifdef SUPPORT_UTF8
609 if (utf8) { GETCHARLEN(c, ptr, clen); } else
610 #endif /* SUPPORT_UTF8 */
611 c = *ptr;
612 }
613 else
614 {
615 clen = 0; /* This indicates the end of the subject */
616 c = NOTACHAR; /* This value should never actually be used */
617 }
618
619 /* Scan up the active states and act on each one. The result of an action
620 may be to add more states to the currently active list (e.g. on hitting a
621 parenthesis) or it may be to put states on the new list, for considering
622 when we move the character pointer on. */
623
624 for (i = 0; i < active_count; i++)
625 {
626 stateblock *current_state = active_states + i;
627 BOOL caseless = FALSE;
628 const pcre_uchar *code;
629 int state_offset = current_state->offset;
630 int count, codevalue, rrc;
631
632 #ifdef PCRE_DEBUG
633 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
634 if (clen == 0) printf("EOL\n");
635 else if (c > 32 && c < 127) printf("'%c'\n", c);
636 else printf("0x%02x\n", c);
637 #endif
638
639 /* A negative offset is a special case meaning "hold off going to this
640 (negated) state until the number of characters in the data field have
641 been skipped". */
642
643 if (state_offset < 0)
644 {
645 if (current_state->data > 0)
646 {
647 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
648 ADD_NEW_DATA(state_offset, current_state->count,
649 current_state->data - 1);
650 continue;
651 }
652 else
653 {
654 current_state->offset = state_offset = -state_offset;
655 }
656 }
657
658 /* Check for a duplicate state with the same count, and skip if found.
659 See the note at the head of this module about the possibility of improving
660 performance here. */
661
662 for (j = 0; j < i; j++)
663 {
664 if (active_states[j].offset == state_offset &&
665 active_states[j].count == current_state->count)
666 {
667 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
668 goto NEXT_ACTIVE_STATE;
669 }
670 }
671
672 /* The state offset is the offset to the opcode */
673
674 code = start_code + state_offset;
675 codevalue = *code;
676
677 /* If this opcode inspects a character, but we are at the end of the
678 subject, remember the fact for use when testing for a partial match. */
679
680 if (clen == 0 && poptable[codevalue] != 0)
681 could_continue = TRUE;
682
683 /* If this opcode is followed by an inline character, load it. It is
684 tempting to test for the presence of a subject character here, but that
685 is wrong, because sometimes zero repetitions of the subject are
686 permitted.
687
688 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
689 argument that is not a data character - but is always one byte long. We
690 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
691 this case. To keep the other cases fast, convert these ones to new opcodes.
692 */
693
694 if (coptable[codevalue] > 0)
695 {
696 dlen = 1;
697 #ifdef SUPPORT_UTF8
698 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
699 #endif /* SUPPORT_UTF8 */
700 d = code[coptable[codevalue]];
701 if (codevalue >= OP_TYPESTAR)
702 {
703 switch(d)
704 {
705 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
706 case OP_NOTPROP:
707 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
708 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
709 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
710 case OP_NOT_HSPACE:
711 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
712 case OP_NOT_VSPACE:
713 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
714 default: break;
715 }
716 }
717 }
718 else
719 {
720 dlen = 0; /* Not strictly necessary, but compilers moan */
721 d = NOTACHAR; /* if these variables are not set. */
722 }
723
724
725 /* Now process the individual opcodes */
726
727 switch (codevalue)
728 {
729 /* ========================================================================== */
730 /* These cases are never obeyed. This is a fudge that causes a compile-
731 time error if the vectors coptable or poptable, which are indexed by
732 opcode, are not the correct length. It seems to be the only way to do
733 such a check at compile time, as the sizeof() operator does not work
734 in the C preprocessor. */
735
736 case OP_TABLE_LENGTH:
737 case OP_TABLE_LENGTH +
738 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
739 (sizeof(poptable) == OP_TABLE_LENGTH)):
740 break;
741
742 /* ========================================================================== */
743 /* Reached a closing bracket. If not at the end of the pattern, carry
744 on with the next opcode. For repeating opcodes, also add the repeat
745 state. Note that KETRPOS will always be encountered at the end of the
746 subpattern, because the possessive subpattern repeats are always handled
747 using recursive calls. Thus, it never adds any new states.
748
749 At the end of the (sub)pattern, unless we have an empty string and
750 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
751 start of the subject, save the match data, shifting up all previous
752 matches so we always have the longest first. */
753
754 case OP_KET:
755 case OP_KETRMIN:
756 case OP_KETRMAX:
757 case OP_KETRPOS:
758 if (code != end_code)
759 {
760 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
761 if (codevalue != OP_KET)
762 {
763 ADD_ACTIVE(state_offset - GET(code, 1), 0);
764 }
765 }
766 else
767 {
768 if (ptr > current_subject ||
769 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
770 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
771 current_subject > start_subject + md->start_offset)))
772 {
773 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
774 else if (match_count > 0 && ++match_count * 2 > offsetcount)
775 match_count = 0;
776 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
777 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
778 if (offsetcount >= 2)
779 {
780 offsets[0] = (int)(current_subject - start_subject);
781 offsets[1] = (int)(ptr - start_subject);
782 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
783 offsets[1] - offsets[0], current_subject));
784 }
785 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
786 {
787 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
788 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
789 match_count, rlevel*2-2, SP));
790 return match_count;
791 }
792 }
793 }
794 break;
795
796 /* ========================================================================== */
797 /* These opcodes add to the current list of states without looking
798 at the current character. */
799
800 /*-----------------------------------------------------------------*/
801 case OP_ALT:
802 do { code += GET(code, 1); } while (*code == OP_ALT);
803 ADD_ACTIVE((int)(code - start_code), 0);
804 break;
805
806 /*-----------------------------------------------------------------*/
807 case OP_BRA:
808 case OP_SBRA:
809 do
810 {
811 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
812 code += GET(code, 1);
813 }
814 while (*code == OP_ALT);
815 break;
816
817 /*-----------------------------------------------------------------*/
818 case OP_CBRA:
819 case OP_SCBRA:
820 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
821 code += GET(code, 1);
822 while (*code == OP_ALT)
823 {
824 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
825 code += GET(code, 1);
826 }
827 break;
828
829 /*-----------------------------------------------------------------*/
830 case OP_BRAZERO:
831 case OP_BRAMINZERO:
832 ADD_ACTIVE(state_offset + 1, 0);
833 code += 1 + GET(code, 2);
834 while (*code == OP_ALT) code += GET(code, 1);
835 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
836 break;
837
838 /*-----------------------------------------------------------------*/
839 case OP_SKIPZERO:
840 code += 1 + GET(code, 2);
841 while (*code == OP_ALT) code += GET(code, 1);
842 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
843 break;
844
845 /*-----------------------------------------------------------------*/
846 case OP_CIRC:
847 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
848 { ADD_ACTIVE(state_offset + 1, 0); }
849 break;
850
851 /*-----------------------------------------------------------------*/
852 case OP_CIRCM:
853 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
854 (ptr != end_subject && WAS_NEWLINE(ptr)))
855 { ADD_ACTIVE(state_offset + 1, 0); }
856 break;
857
858 /*-----------------------------------------------------------------*/
859 case OP_EOD:
860 if (ptr >= end_subject)
861 {
862 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
863 could_continue = TRUE;
864 else { ADD_ACTIVE(state_offset + 1, 0); }
865 }
866 break;
867
868 /*-----------------------------------------------------------------*/
869 case OP_SOD:
870 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
871 break;
872
873 /*-----------------------------------------------------------------*/
874 case OP_SOM:
875 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
876 break;
877
878
879 /* ========================================================================== */
880 /* These opcodes inspect the next subject character, and sometimes
881 the previous one as well, but do not have an argument. The variable
882 clen contains the length of the current character and is zero if we are
883 at the end of the subject. */
884
885 /*-----------------------------------------------------------------*/
886 case OP_ANY:
887 if (clen > 0 && !IS_NEWLINE(ptr))
888 { ADD_NEW(state_offset + 1, 0); }
889 break;
890
891 /*-----------------------------------------------------------------*/
892 case OP_ALLANY:
893 if (clen > 0)
894 { ADD_NEW(state_offset + 1, 0); }
895 break;
896
897 /*-----------------------------------------------------------------*/
898 case OP_EODN:
899 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
900 could_continue = TRUE;
901 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
902 { ADD_ACTIVE(state_offset + 1, 0); }
903 break;
904
905 /*-----------------------------------------------------------------*/
906 case OP_DOLL:
907 if ((md->moptions & PCRE_NOTEOL) == 0)
908 {
909 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
910 could_continue = TRUE;
911 else if (clen == 0 ||
912 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
913 (ptr == end_subject - md->nllen)
914 ))
915 { ADD_ACTIVE(state_offset + 1, 0); }
916 }
917 break;
918
919 /*-----------------------------------------------------------------*/
920 case OP_DOLLM:
921 if ((md->moptions & PCRE_NOTEOL) == 0)
922 {
923 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
924 could_continue = TRUE;
925 else if (clen == 0 ||
926 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
927 { ADD_ACTIVE(state_offset + 1, 0); }
928 }
929 else if (IS_NEWLINE(ptr))
930 { ADD_ACTIVE(state_offset + 1, 0); }
931 break;
932
933 /*-----------------------------------------------------------------*/
934
935 case OP_DIGIT:
936 case OP_WHITESPACE:
937 case OP_WORDCHAR:
938 if (clen > 0 && c < 256 &&
939 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
940 { ADD_NEW(state_offset + 1, 0); }
941 break;
942
943 /*-----------------------------------------------------------------*/
944 case OP_NOT_DIGIT:
945 case OP_NOT_WHITESPACE:
946 case OP_NOT_WORDCHAR:
947 if (clen > 0 && (c >= 256 ||
948 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
949 { ADD_NEW(state_offset + 1, 0); }
950 break;
951
952 /*-----------------------------------------------------------------*/
953 case OP_WORD_BOUNDARY:
954 case OP_NOT_WORD_BOUNDARY:
955 {
956 int left_word, right_word;
957
958 if (ptr > start_subject)
959 {
960 const pcre_uchar *temp = ptr - 1;
961 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
962 #ifdef SUPPORT_UTF8
963 if (utf8) BACKCHAR(temp);
964 #endif
965 GETCHARTEST(d, temp);
966 #ifdef SUPPORT_UCP
967 if ((md->poptions & PCRE_UCP) != 0)
968 {
969 if (d == '_') left_word = TRUE; else
970 {
971 int cat = UCD_CATEGORY(d);
972 left_word = (cat == ucp_L || cat == ucp_N);
973 }
974 }
975 else
976 #endif
977 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
978 }
979 else left_word = FALSE;
980
981 if (clen > 0)
982 {
983 #ifdef SUPPORT_UCP
984 if ((md->poptions & PCRE_UCP) != 0)
985 {
986 if (c == '_') right_word = TRUE; else
987 {
988 int cat = UCD_CATEGORY(c);
989 right_word = (cat == ucp_L || cat == ucp_N);
990 }
991 }
992 else
993 #endif
994 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
995 }
996 else right_word = FALSE;
997
998 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
999 { ADD_ACTIVE(state_offset + 1, 0); }
1000 }
1001 break;
1002
1003
1004 /*-----------------------------------------------------------------*/
1005 /* Check the next character by Unicode property. We will get here only
1006 if the support is in the binary; otherwise a compile-time error occurs.
1007 */
1008
1009 #ifdef SUPPORT_UCP
1010 case OP_PROP:
1011 case OP_NOTPROP:
1012 if (clen > 0)
1013 {
1014 BOOL OK;
1015 const ucd_record * prop = GET_UCD(c);
1016 switch(code[1])
1017 {
1018 case PT_ANY:
1019 OK = TRUE;
1020 break;
1021
1022 case PT_LAMP:
1023 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1024 prop->chartype == ucp_Lt;
1025 break;
1026
1027 case PT_GC:
1028 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1029 break;
1030
1031 case PT_PC:
1032 OK = prop->chartype == code[2];
1033 break;
1034
1035 case PT_SC:
1036 OK = prop->script == code[2];
1037 break;
1038
1039 /* These are specials for combination cases. */
1040
1041 case PT_ALNUM:
1042 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1043 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1044 break;
1045
1046 case PT_SPACE: /* Perl space */
1047 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1048 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1049 break;
1050
1051 case PT_PXSPACE: /* POSIX space */
1052 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1053 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1054 c == CHAR_FF || c == CHAR_CR;
1055 break;
1056
1057 case PT_WORD:
1058 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1059 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1060 c == CHAR_UNDERSCORE;
1061 break;
1062
1063 /* Should never occur, but keep compilers from grumbling. */
1064
1065 default:
1066 OK = codevalue != OP_PROP;
1067 break;
1068 }
1069
1070 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1071 }
1072 break;
1073 #endif
1074
1075
1076
1077 /* ========================================================================== */
1078 /* These opcodes likewise inspect the subject character, but have an
1079 argument that is not a data character. It is one of these opcodes:
1080 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1081 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1082
1083 case OP_TYPEPLUS:
1084 case OP_TYPEMINPLUS:
1085 case OP_TYPEPOSPLUS:
1086 count = current_state->count; /* Already matched */
1087 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1088 if (clen > 0)
1089 {
1090 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1091 (c < 256 &&
1092 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1093 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1094 {
1095 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1096 {
1097 active_count--; /* Remove non-match possibility */
1098 next_active_state--;
1099 }
1100 count++;
1101 ADD_NEW(state_offset, count);
1102 }
1103 }
1104 break;
1105
1106 /*-----------------------------------------------------------------*/
1107 case OP_TYPEQUERY:
1108 case OP_TYPEMINQUERY:
1109 case OP_TYPEPOSQUERY:
1110 ADD_ACTIVE(state_offset + 2, 0);
1111 if (clen > 0)
1112 {
1113 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1114 (c < 256 &&
1115 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1116 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1117 {
1118 if (codevalue == OP_TYPEPOSQUERY)
1119 {
1120 active_count--; /* Remove non-match possibility */
1121 next_active_state--;
1122 }
1123 ADD_NEW(state_offset + 2, 0);
1124 }
1125 }
1126 break;
1127
1128 /*-----------------------------------------------------------------*/
1129 case OP_TYPESTAR:
1130 case OP_TYPEMINSTAR:
1131 case OP_TYPEPOSSTAR:
1132 ADD_ACTIVE(state_offset + 2, 0);
1133 if (clen > 0)
1134 {
1135 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1136 (c < 256 &&
1137 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1138 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1139 {
1140 if (codevalue == OP_TYPEPOSSTAR)
1141 {
1142 active_count--; /* Remove non-match possibility */
1143 next_active_state--;
1144 }
1145 ADD_NEW(state_offset, 0);
1146 }
1147 }
1148 break;
1149
1150 /*-----------------------------------------------------------------*/
1151 case OP_TYPEEXACT:
1152 count = current_state->count; /* Number already matched */
1153 if (clen > 0)
1154 {
1155 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1156 (c < 256 &&
1157 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1158 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1159 {
1160 if (++count >= GET2(code, 1))
1161 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1162 else
1163 { ADD_NEW(state_offset, count); }
1164 }
1165 }
1166 break;
1167
1168 /*-----------------------------------------------------------------*/
1169 case OP_TYPEUPTO:
1170 case OP_TYPEMINUPTO:
1171 case OP_TYPEPOSUPTO:
1172 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1173 count = current_state->count; /* Number already matched */
1174 if (clen > 0)
1175 {
1176 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1177 (c < 256 &&
1178 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1179 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1180 {
1181 if (codevalue == OP_TYPEPOSUPTO)
1182 {
1183 active_count--; /* Remove non-match possibility */
1184 next_active_state--;
1185 }
1186 if (++count >= GET2(code, 1))
1187 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1188 else
1189 { ADD_NEW(state_offset, count); }
1190 }
1191 }
1192 break;
1193
1194 /* ========================================================================== */
1195 /* These are virtual opcodes that are used when something like
1196 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or
1197 OP_HSPACE as its argument. This keeps the code above fast for the other
1198 cases. The argument is in the d variable. */
1199
1200 #ifdef SUPPORT_UCP
1201 case OP_PROP_EXTRA + OP_TYPEPLUS:
1202 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1203 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1204 count = current_state->count; /* Already matched */
1205 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1206 if (clen > 0)
1207 {
1208 BOOL OK;
1209 const ucd_record * prop = GET_UCD(c);
1210 switch(code[2])
1211 {
1212 case PT_ANY:
1213 OK = TRUE;
1214 break;
1215
1216 case PT_LAMP:
1217 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1218 prop->chartype == ucp_Lt;
1219 break;
1220
1221 case PT_GC:
1222 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1223 break;
1224
1225 case PT_PC:
1226 OK = prop->chartype == code[3];
1227 break;
1228
1229 case PT_SC:
1230 OK = prop->script == code[3];
1231 break;
1232
1233 /* These are specials for combination cases. */
1234
1235 case PT_ALNUM:
1236 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1237 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1238 break;
1239
1240 case PT_SPACE: /* Perl space */
1241 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1242 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1243 break;
1244
1245 case PT_PXSPACE: /* POSIX space */
1246 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1247 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1248 c == CHAR_FF || c == CHAR_CR;
1249 break;
1250
1251 case PT_WORD:
1252 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1253 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1254 c == CHAR_UNDERSCORE;
1255 break;
1256
1257 /* Should never occur, but keep compilers from grumbling. */
1258
1259 default:
1260 OK = codevalue != OP_PROP;
1261 break;
1262 }
1263
1264 if (OK == (d == OP_PROP))
1265 {
1266 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1267 {
1268 active_count--; /* Remove non-match possibility */
1269 next_active_state--;
1270 }
1271 count++;
1272 ADD_NEW(state_offset, count);
1273 }
1274 }
1275 break;
1276
1277 /*-----------------------------------------------------------------*/
1278 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1279 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1280 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1281 count = current_state->count; /* Already matched */
1282 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1283 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1284 {
1285 const pcre_uchar *nptr = ptr + clen;
1286 int ncount = 0;
1287 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1288 {
1289 active_count--; /* Remove non-match possibility */
1290 next_active_state--;
1291 }
1292 while (nptr < end_subject)
1293 {
1294 int nd;
1295 int ndlen = 1;
1296 GETCHARLEN(nd, nptr, ndlen);
1297 if (UCD_CATEGORY(nd) != ucp_M) break;
1298 ncount++;
1299 nptr += ndlen;
1300 }
1301 count++;
1302 ADD_NEW_DATA(-state_offset, count, ncount);
1303 }
1304 break;
1305 #endif
1306
1307 /*-----------------------------------------------------------------*/
1308 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1309 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1310 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1311 count = current_state->count; /* Already matched */
1312 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1313 if (clen > 0)
1314 {
1315 int ncount = 0;
1316 switch (c)
1317 {
1318 case 0x000b:
1319 case 0x000c:
1320 case 0x0085:
1321 case 0x2028:
1322 case 0x2029:
1323 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1324 goto ANYNL01;
1325
1326 case 0x000d:
1327 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1328 /* Fall through */
1329
1330 ANYNL01:
1331 case 0x000a:
1332 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1333 {
1334 active_count--; /* Remove non-match possibility */
1335 next_active_state--;
1336 }
1337 count++;
1338 ADD_NEW_DATA(-state_offset, count, ncount);
1339 break;
1340
1341 default:
1342 break;
1343 }
1344 }
1345 break;
1346
1347 /*-----------------------------------------------------------------*/
1348 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1349 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1350 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1351 count = current_state->count; /* Already matched */
1352 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1353 if (clen > 0)
1354 {
1355 BOOL OK;
1356 switch (c)
1357 {
1358 case 0x000a:
1359 case 0x000b:
1360 case 0x000c:
1361 case 0x000d:
1362 case 0x0085:
1363 case 0x2028:
1364 case 0x2029:
1365 OK = TRUE;
1366 break;
1367
1368 default:
1369 OK = FALSE;
1370 break;
1371 }
1372
1373 if (OK == (d == OP_VSPACE))
1374 {
1375 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1376 {
1377 active_count--; /* Remove non-match possibility */
1378 next_active_state--;
1379 }
1380 count++;
1381 ADD_NEW_DATA(-state_offset, count, 0);
1382 }
1383 }
1384 break;
1385
1386 /*-----------------------------------------------------------------*/
1387 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1388 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1389 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1390 count = current_state->count; /* Already matched */
1391 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1392 if (clen > 0)
1393 {
1394 BOOL OK;
1395 switch (c)
1396 {
1397 case 0x09: /* HT */
1398 case 0x20: /* SPACE */
1399 case 0xa0: /* NBSP */
1400 case 0x1680: /* OGHAM SPACE MARK */
1401 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1402 case 0x2000: /* EN QUAD */
1403 case 0x2001: /* EM QUAD */
1404 case 0x2002: /* EN SPACE */
1405 case 0x2003: /* EM SPACE */
1406 case 0x2004: /* THREE-PER-EM SPACE */
1407 case 0x2005: /* FOUR-PER-EM SPACE */
1408 case 0x2006: /* SIX-PER-EM SPACE */
1409 case 0x2007: /* FIGURE SPACE */
1410 case 0x2008: /* PUNCTUATION SPACE */
1411 case 0x2009: /* THIN SPACE */
1412 case 0x200A: /* HAIR SPACE */
1413 case 0x202f: /* NARROW NO-BREAK SPACE */
1414 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1415 case 0x3000: /* IDEOGRAPHIC SPACE */
1416 OK = TRUE;
1417 break;
1418
1419 default:
1420 OK = FALSE;
1421 break;
1422 }
1423
1424 if (OK == (d == OP_HSPACE))
1425 {
1426 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1427 {
1428 active_count--; /* Remove non-match possibility */
1429 next_active_state--;
1430 }
1431 count++;
1432 ADD_NEW_DATA(-state_offset, count, 0);
1433 }
1434 }
1435 break;
1436
1437 /*-----------------------------------------------------------------*/
1438 #ifdef SUPPORT_UCP
1439 case OP_PROP_EXTRA + OP_TYPEQUERY:
1440 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1441 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1442 count = 4;
1443 goto QS1;
1444
1445 case OP_PROP_EXTRA + OP_TYPESTAR:
1446 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1447 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1448 count = 0;
1449
1450 QS1:
1451
1452 ADD_ACTIVE(state_offset + 4, 0);
1453 if (clen > 0)
1454 {
1455 BOOL OK;
1456 const ucd_record * prop = GET_UCD(c);
1457 switch(code[2])
1458 {
1459 case PT_ANY:
1460 OK = TRUE;
1461 break;
1462
1463 case PT_LAMP:
1464 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1465 prop->chartype == ucp_Lt;
1466 break;
1467
1468 case PT_GC:
1469 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1470 break;
1471
1472 case PT_PC:
1473 OK = prop->chartype == code[3];
1474 break;
1475
1476 case PT_SC:
1477 OK = prop->script == code[3];
1478 break;
1479
1480 /* These are specials for combination cases. */
1481
1482 case PT_ALNUM:
1483 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1484 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1485 break;
1486
1487 case PT_SPACE: /* Perl space */
1488 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1489 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1490 break;
1491
1492 case PT_PXSPACE: /* POSIX space */
1493 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1494 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1495 c == CHAR_FF || c == CHAR_CR;
1496 break;
1497
1498 case PT_WORD:
1499 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1500 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1501 c == CHAR_UNDERSCORE;
1502 break;
1503
1504 /* Should never occur, but keep compilers from grumbling. */
1505
1506 default:
1507 OK = codevalue != OP_PROP;
1508 break;
1509 }
1510
1511 if (OK == (d == OP_PROP))
1512 {
1513 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1514 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1515 {
1516 active_count--; /* Remove non-match possibility */
1517 next_active_state--;
1518 }
1519 ADD_NEW(state_offset + count, 0);
1520 }
1521 }
1522 break;
1523
1524 /*-----------------------------------------------------------------*/
1525 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1526 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1527 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1528 count = 2;
1529 goto QS2;
1530
1531 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1532 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1533 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1534 count = 0;
1535
1536 QS2:
1537
1538 ADD_ACTIVE(state_offset + 2, 0);
1539 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1540 {
1541 const pcre_uchar *nptr = ptr + clen;
1542 int ncount = 0;
1543 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1544 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1545 {
1546 active_count--; /* Remove non-match possibility */
1547 next_active_state--;
1548 }
1549 while (nptr < end_subject)
1550 {
1551 int nd;
1552 int ndlen = 1;
1553 GETCHARLEN(nd, nptr, ndlen);
1554 if (UCD_CATEGORY(nd) != ucp_M) break;
1555 ncount++;
1556 nptr += ndlen;
1557 }
1558 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1559 }
1560 break;
1561 #endif
1562
1563 /*-----------------------------------------------------------------*/
1564 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1565 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1566 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1567 count = 2;
1568 goto QS3;
1569
1570 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1571 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1572 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1573 count = 0;
1574
1575 QS3:
1576 ADD_ACTIVE(state_offset + 2, 0);
1577 if (clen > 0)
1578 {
1579 int ncount = 0;
1580 switch (c)
1581 {
1582 case 0x000b:
1583 case 0x000c:
1584 case 0x0085:
1585 case 0x2028:
1586 case 0x2029:
1587 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1588 goto ANYNL02;
1589
1590 case 0x000d:
1591 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1592 /* Fall through */
1593
1594 ANYNL02:
1595 case 0x000a:
1596 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1597 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1598 {
1599 active_count--; /* Remove non-match possibility */
1600 next_active_state--;
1601 }
1602 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1603 break;
1604
1605 default:
1606 break;
1607 }
1608 }
1609 break;
1610
1611 /*-----------------------------------------------------------------*/
1612 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1613 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1614 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1615 count = 2;
1616 goto QS4;
1617
1618 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1619 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1620 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1621 count = 0;
1622
1623 QS4:
1624 ADD_ACTIVE(state_offset + 2, 0);
1625 if (clen > 0)
1626 {
1627 BOOL OK;
1628 switch (c)
1629 {
1630 case 0x000a:
1631 case 0x000b:
1632 case 0x000c:
1633 case 0x000d:
1634 case 0x0085:
1635 case 0x2028:
1636 case 0x2029:
1637 OK = TRUE;
1638 break;
1639
1640 default:
1641 OK = FALSE;
1642 break;
1643 }
1644 if (OK == (d == OP_VSPACE))
1645 {
1646 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1647 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1648 {
1649 active_count--; /* Remove non-match possibility */
1650 next_active_state--;
1651 }
1652 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1653 }
1654 }
1655 break;
1656
1657 /*-----------------------------------------------------------------*/
1658 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1659 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1660 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1661 count = 2;
1662 goto QS5;
1663
1664 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1665 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1666 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1667 count = 0;
1668
1669 QS5:
1670 ADD_ACTIVE(state_offset + 2, 0);
1671 if (clen > 0)
1672 {
1673 BOOL OK;
1674 switch (c)
1675 {
1676 case 0x09: /* HT */
1677 case 0x20: /* SPACE */
1678 case 0xa0: /* NBSP */
1679 case 0x1680: /* OGHAM SPACE MARK */
1680 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1681 case 0x2000: /* EN QUAD */
1682 case 0x2001: /* EM QUAD */
1683 case 0x2002: /* EN SPACE */
1684 case 0x2003: /* EM SPACE */
1685 case 0x2004: /* THREE-PER-EM SPACE */
1686 case 0x2005: /* FOUR-PER-EM SPACE */
1687 case 0x2006: /* SIX-PER-EM SPACE */
1688 case 0x2007: /* FIGURE SPACE */
1689 case 0x2008: /* PUNCTUATION SPACE */
1690 case 0x2009: /* THIN SPACE */
1691 case 0x200A: /* HAIR SPACE */
1692 case 0x202f: /* NARROW NO-BREAK SPACE */
1693 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1694 case 0x3000: /* IDEOGRAPHIC SPACE */
1695 OK = TRUE;
1696 break;
1697
1698 default:
1699 OK = FALSE;
1700 break;
1701 }
1702
1703 if (OK == (d == OP_HSPACE))
1704 {
1705 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1706 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1707 {
1708 active_count--; /* Remove non-match possibility */
1709 next_active_state--;
1710 }
1711 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1712 }
1713 }
1714 break;
1715
1716 /*-----------------------------------------------------------------*/
1717 #ifdef SUPPORT_UCP
1718 case OP_PROP_EXTRA + OP_TYPEEXACT:
1719 case OP_PROP_EXTRA + OP_TYPEUPTO:
1720 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1721 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1722 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1723 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1724 count = current_state->count; /* Number already matched */
1725 if (clen > 0)
1726 {
1727 BOOL OK;
1728 const ucd_record * prop = GET_UCD(c);
1729 switch(code[1 + IMM2_SIZE + 1])
1730 {
1731 case PT_ANY:
1732 OK = TRUE;
1733 break;
1734
1735 case PT_LAMP:
1736 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1737 prop->chartype == ucp_Lt;
1738 break;
1739
1740 case PT_GC:
1741 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1742 break;
1743
1744 case PT_PC:
1745 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1746 break;
1747
1748 case PT_SC:
1749 OK = prop->script == code[1 + IMM2_SIZE + 2];
1750 break;
1751
1752 /* These are specials for combination cases. */
1753
1754 case PT_ALNUM:
1755 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1756 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1757 break;
1758
1759 case PT_SPACE: /* Perl space */
1760 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1761 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1762 break;
1763
1764 case PT_PXSPACE: /* POSIX space */
1765 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1766 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1767 c == CHAR_FF || c == CHAR_CR;
1768 break;
1769
1770 case PT_WORD:
1771 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1772 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1773 c == CHAR_UNDERSCORE;
1774 break;
1775
1776 /* Should never occur, but keep compilers from grumbling. */
1777
1778 default:
1779 OK = codevalue != OP_PROP;
1780 break;
1781 }
1782
1783 if (OK == (d == OP_PROP))
1784 {
1785 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1786 {
1787 active_count--; /* Remove non-match possibility */
1788 next_active_state--;
1789 }
1790 if (++count >= GET2(code, 1))
1791 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1792 else
1793 { ADD_NEW(state_offset, count); }
1794 }
1795 }
1796 break;
1797
1798 /*-----------------------------------------------------------------*/
1799 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1800 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1801 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1802 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1803 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1804 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1805 count = current_state->count; /* Number already matched */
1806 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1807 {
1808 const pcre_uchar *nptr = ptr + clen;
1809 int ncount = 0;
1810 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1811 {
1812 active_count--; /* Remove non-match possibility */
1813 next_active_state--;
1814 }
1815 while (nptr < end_subject)
1816 {
1817 int nd;
1818 int ndlen = 1;
1819 GETCHARLEN(nd, nptr, ndlen);
1820 if (UCD_CATEGORY(nd) != ucp_M) break;
1821 ncount++;
1822 nptr += ndlen;
1823 }
1824 if (++count >= GET2(code, 1))
1825 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1826 else
1827 { ADD_NEW_DATA(-state_offset, count, ncount); }
1828 }
1829 break;
1830 #endif
1831
1832 /*-----------------------------------------------------------------*/
1833 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1834 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1835 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1836 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1837 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1838 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1839 count = current_state->count; /* Number already matched */
1840 if (clen > 0)
1841 {
1842 int ncount = 0;
1843 switch (c)
1844 {
1845 case 0x000b:
1846 case 0x000c:
1847 case 0x0085:
1848 case 0x2028:
1849 case 0x2029:
1850 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1851 goto ANYNL03;
1852
1853 case 0x000d:
1854 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1855 /* Fall through */
1856
1857 ANYNL03:
1858 case 0x000a:
1859 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1860 {
1861 active_count--; /* Remove non-match possibility */
1862 next_active_state--;
1863 }
1864 if (++count >= GET2(code, 1))
1865 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1866 else
1867 { ADD_NEW_DATA(-state_offset, count, ncount); }
1868 break;
1869
1870 default:
1871 break;
1872 }
1873 }
1874 break;
1875
1876 /*-----------------------------------------------------------------*/
1877 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1878 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1879 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1880 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1881 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1882 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1883 count = current_state->count; /* Number already matched */
1884 if (clen > 0)
1885 {
1886 BOOL OK;
1887 switch (c)
1888 {
1889 case 0x000a:
1890 case 0x000b:
1891 case 0x000c:
1892 case 0x000d:
1893 case 0x0085:
1894 case 0x2028:
1895 case 0x2029:
1896 OK = TRUE;
1897 break;
1898
1899 default:
1900 OK = FALSE;
1901 }
1902
1903 if (OK == (d == OP_VSPACE))
1904 {
1905 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1906 {
1907 active_count--; /* Remove non-match possibility */
1908 next_active_state--;
1909 }
1910 if (++count >= GET2(code, 1))
1911 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1912 else
1913 { ADD_NEW_DATA(-state_offset, count, 0); }
1914 }
1915 }
1916 break;
1917
1918 /*-----------------------------------------------------------------*/
1919 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1920 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1921 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1922 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1923 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1924 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925 count = current_state->count; /* Number already matched */
1926 if (clen > 0)
1927 {
1928 BOOL OK;
1929 switch (c)
1930 {
1931 case 0x09: /* HT */
1932 case 0x20: /* SPACE */
1933 case 0xa0: /* NBSP */
1934 case 0x1680: /* OGHAM SPACE MARK */
1935 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1936 case 0x2000: /* EN QUAD */
1937 case 0x2001: /* EM QUAD */
1938 case 0x2002: /* EN SPACE */
1939 case 0x2003: /* EM SPACE */
1940 case 0x2004: /* THREE-PER-EM SPACE */
1941 case 0x2005: /* FOUR-PER-EM SPACE */
1942 case 0x2006: /* SIX-PER-EM SPACE */
1943 case 0x2007: /* FIGURE SPACE */
1944 case 0x2008: /* PUNCTUATION SPACE */
1945 case 0x2009: /* THIN SPACE */
1946 case 0x200A: /* HAIR SPACE */
1947 case 0x202f: /* NARROW NO-BREAK SPACE */
1948 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1949 case 0x3000: /* IDEOGRAPHIC SPACE */
1950 OK = TRUE;
1951 break;
1952
1953 default:
1954 OK = FALSE;
1955 break;
1956 }
1957
1958 if (OK == (d == OP_HSPACE))
1959 {
1960 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1961 {
1962 active_count--; /* Remove non-match possibility */
1963 next_active_state--;
1964 }
1965 if (++count >= GET2(code, 1))
1966 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1967 else
1968 { ADD_NEW_DATA(-state_offset, count, 0); }
1969 }
1970 }
1971 break;
1972
1973 /* ========================================================================== */
1974 /* These opcodes are followed by a character that is usually compared
1975 to the current subject character; it is loaded into d. We still get
1976 here even if there is no subject character, because in some cases zero
1977 repetitions are permitted. */
1978
1979 /*-----------------------------------------------------------------*/
1980 case OP_CHAR:
1981 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1982 break;
1983
1984 /*-----------------------------------------------------------------*/
1985 case OP_CHARI:
1986 if (clen == 0) break;
1987
1988 #ifdef SUPPORT_UTF8
1989 if (utf8)
1990 {
1991 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1992 {
1993 unsigned int othercase;
1994 if (c < 128) othercase = fcc[c]; else
1995
1996 /* If we have Unicode property support, we can use it to test the
1997 other case of the character. */
1998
1999 #ifdef SUPPORT_UCP
2000 othercase = UCD_OTHERCASE(c);
2001 #else
2002 othercase = NOTACHAR;
2003 #endif
2004
2005 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2006 }
2007 }
2008 else
2009 #endif /* SUPPORT_UTF8 */
2010
2011 /* Non-UTF-8 mode */
2012 {
2013 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2014 }
2015 break;
2016
2017
2018 #ifdef SUPPORT_UCP
2019 /*-----------------------------------------------------------------*/
2020 /* This is a tricky one because it can match more than one character.
2021 Find out how many characters to skip, and then set up a negative state
2022 to wait for them to pass before continuing. */
2023
2024 case OP_EXTUNI:
2025 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2026 {
2027 const pcre_uchar *nptr = ptr + clen;
2028 int ncount = 0;
2029 while (nptr < end_subject)
2030 {
2031 int nclen = 1;
2032 GETCHARLEN(c, nptr, nclen);
2033 if (UCD_CATEGORY(c) != ucp_M) break;
2034 ncount++;
2035 nptr += nclen;
2036 }
2037 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2038 }
2039 break;
2040 #endif
2041
2042 /*-----------------------------------------------------------------*/
2043 /* This is tricky, like EXTUNI, because it too can match more than one
2044 character (when CR is followed by LF). In this case, set up a negative
2045 state to wait for one character to pass before continuing. */
2046
2047 case OP_ANYNL:
2048 if (clen > 0) switch(c)
2049 {
2050 case 0x000b:
2051 case 0x000c:
2052 case 0x0085:
2053 case 0x2028:
2054 case 0x2029:
2055 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2056
2057 case 0x000a:
2058 ADD_NEW(state_offset + 1, 0);
2059 break;
2060
2061 case 0x000d:
2062 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2063 {
2064 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2065 }
2066 else
2067 {
2068 ADD_NEW(state_offset + 1, 0);
2069 }
2070 break;
2071 }
2072 break;
2073
2074 /*-----------------------------------------------------------------*/
2075 case OP_NOT_VSPACE:
2076 if (clen > 0) switch(c)
2077 {
2078 case 0x000a:
2079 case 0x000b:
2080 case 0x000c:
2081 case 0x000d:
2082 case 0x0085:
2083 case 0x2028:
2084 case 0x2029:
2085 break;
2086
2087 default:
2088 ADD_NEW(state_offset + 1, 0);
2089 break;
2090 }
2091 break;
2092
2093 /*-----------------------------------------------------------------*/
2094 case OP_VSPACE:
2095 if (clen > 0) switch(c)
2096 {
2097 case 0x000a:
2098 case 0x000b:
2099 case 0x000c:
2100 case 0x000d:
2101 case 0x0085:
2102 case 0x2028:
2103 case 0x2029:
2104 ADD_NEW(state_offset + 1, 0);
2105 break;
2106
2107 default: break;
2108 }
2109 break;
2110
2111 /*-----------------------------------------------------------------*/
2112 case OP_NOT_HSPACE:
2113 if (clen > 0) switch(c)
2114 {
2115 case 0x09: /* HT */
2116 case 0x20: /* SPACE */
2117 case 0xa0: /* NBSP */
2118 case 0x1680: /* OGHAM SPACE MARK */
2119 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2120 case 0x2000: /* EN QUAD */
2121 case 0x2001: /* EM QUAD */
2122 case 0x2002: /* EN SPACE */
2123 case 0x2003: /* EM SPACE */
2124 case 0x2004: /* THREE-PER-EM SPACE */
2125 case 0x2005: /* FOUR-PER-EM SPACE */
2126 case 0x2006: /* SIX-PER-EM SPACE */
2127 case 0x2007: /* FIGURE SPACE */
2128 case 0x2008: /* PUNCTUATION SPACE */
2129 case 0x2009: /* THIN SPACE */
2130 case 0x200A: /* HAIR SPACE */
2131 case 0x202f: /* NARROW NO-BREAK SPACE */
2132 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2133 case 0x3000: /* IDEOGRAPHIC SPACE */
2134 break;
2135
2136 default:
2137 ADD_NEW(state_offset + 1, 0);
2138 break;
2139 }
2140 break;
2141
2142 /*-----------------------------------------------------------------*/
2143 case OP_HSPACE:
2144 if (clen > 0) switch(c)
2145 {
2146 case 0x09: /* HT */
2147 case 0x20: /* SPACE */
2148 case 0xa0: /* NBSP */
2149 case 0x1680: /* OGHAM SPACE MARK */
2150 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2151 case 0x2000: /* EN QUAD */
2152 case 0x2001: /* EM QUAD */
2153 case 0x2002: /* EN SPACE */
2154 case 0x2003: /* EM SPACE */
2155 case 0x2004: /* THREE-PER-EM SPACE */
2156 case 0x2005: /* FOUR-PER-EM SPACE */
2157 case 0x2006: /* SIX-PER-EM SPACE */
2158 case 0x2007: /* FIGURE SPACE */
2159 case 0x2008: /* PUNCTUATION SPACE */
2160 case 0x2009: /* THIN SPACE */
2161 case 0x200A: /* HAIR SPACE */
2162 case 0x202f: /* NARROW NO-BREAK SPACE */
2163 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2164 case 0x3000: /* IDEOGRAPHIC SPACE */
2165 ADD_NEW(state_offset + 1, 0);
2166 break;
2167 }
2168 break;
2169
2170 /*-----------------------------------------------------------------*/
2171 /* Match a negated single character casefully. This is only used for
2172 one-byte characters, that is, we know that d < 256. The character we are
2173 checking (c) can be multibyte. */
2174
2175 case OP_NOT:
2176 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2177 break;
2178
2179 /*-----------------------------------------------------------------*/
2180 /* Match a negated single character caselessly. This is only used for
2181 one-byte characters, that is, we know that d < 256. The character we are
2182 checking (c) can be multibyte. */
2183
2184 case OP_NOTI:
2185 if (clen > 0 && c != d && c != fcc[d])
2186 { ADD_NEW(state_offset + dlen + 1, 0); }
2187 break;
2188
2189 /*-----------------------------------------------------------------*/
2190 case OP_PLUSI:
2191 case OP_MINPLUSI:
2192 case OP_POSPLUSI:
2193 case OP_NOTPLUSI:
2194 case OP_NOTMINPLUSI:
2195 case OP_NOTPOSPLUSI:
2196 caseless = TRUE;
2197 codevalue -= OP_STARI - OP_STAR;
2198
2199 /* Fall through */
2200 case OP_PLUS:
2201 case OP_MINPLUS:
2202 case OP_POSPLUS:
2203 case OP_NOTPLUS:
2204 case OP_NOTMINPLUS:
2205 case OP_NOTPOSPLUS:
2206 count = current_state->count; /* Already matched */
2207 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2208 if (clen > 0)
2209 {
2210 unsigned int otherd = NOTACHAR;
2211 if (caseless)
2212 {
2213 #ifdef SUPPORT_UTF8
2214 if (utf8 && d >= 128)
2215 {
2216 #ifdef SUPPORT_UCP
2217 otherd = UCD_OTHERCASE(d);
2218 #endif /* SUPPORT_UCP */
2219 }
2220 else
2221 #endif /* SUPPORT_UTF8 */
2222 otherd = fcc[d];
2223 }
2224 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2225 {
2226 if (count > 0 &&
2227 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2228 {
2229 active_count--; /* Remove non-match possibility */
2230 next_active_state--;
2231 }
2232 count++;
2233 ADD_NEW(state_offset, count);
2234 }
2235 }
2236 break;
2237
2238 /*-----------------------------------------------------------------*/
2239 case OP_QUERYI:
2240 case OP_MINQUERYI:
2241 case OP_POSQUERYI:
2242 case OP_NOTQUERYI:
2243 case OP_NOTMINQUERYI:
2244 case OP_NOTPOSQUERYI:
2245 caseless = TRUE;
2246 codevalue -= OP_STARI - OP_STAR;
2247 /* Fall through */
2248 case OP_QUERY:
2249 case OP_MINQUERY:
2250 case OP_POSQUERY:
2251 case OP_NOTQUERY:
2252 case OP_NOTMINQUERY:
2253 case OP_NOTPOSQUERY:
2254 ADD_ACTIVE(state_offset + dlen + 1, 0);
2255 if (clen > 0)
2256 {
2257 unsigned int otherd = NOTACHAR;
2258 if (caseless)
2259 {
2260 #ifdef SUPPORT_UTF8
2261 if (utf8 && d >= 128)
2262 {
2263 #ifdef SUPPORT_UCP
2264 otherd = UCD_OTHERCASE(d);
2265 #endif /* SUPPORT_UCP */
2266 }
2267 else
2268 #endif /* SUPPORT_UTF8 */
2269 otherd = fcc[d];
2270 }
2271 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2272 {
2273 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2274 {
2275 active_count--; /* Remove non-match possibility */
2276 next_active_state--;
2277 }
2278 ADD_NEW(state_offset + dlen + 1, 0);
2279 }
2280 }
2281 break;
2282
2283 /*-----------------------------------------------------------------*/
2284 case OP_STARI:
2285 case OP_MINSTARI:
2286 case OP_POSSTARI:
2287 case OP_NOTSTARI:
2288 case OP_NOTMINSTARI:
2289 case OP_NOTPOSSTARI:
2290 caseless = TRUE;
2291 codevalue -= OP_STARI - OP_STAR;
2292 /* Fall through */
2293 case OP_STAR:
2294 case OP_MINSTAR:
2295 case OP_POSSTAR:
2296 case OP_NOTSTAR:
2297 case OP_NOTMINSTAR:
2298 case OP_NOTPOSSTAR:
2299 ADD_ACTIVE(state_offset + dlen + 1, 0);
2300 if (clen > 0)
2301 {
2302 unsigned int otherd = NOTACHAR;
2303 if (caseless)
2304 {
2305 #ifdef SUPPORT_UTF8
2306 if (utf8 && d >= 128)
2307 {
2308 #ifdef SUPPORT_UCP
2309 otherd = UCD_OTHERCASE(d);
2310 #endif /* SUPPORT_UCP */
2311 }
2312 else
2313 #endif /* SUPPORT_UTF8 */
2314 otherd = fcc[d];
2315 }
2316 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2317 {
2318 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2319 {
2320 active_count--; /* Remove non-match possibility */
2321 next_active_state--;
2322 }
2323 ADD_NEW(state_offset, 0);
2324 }
2325 }
2326 break;
2327
2328 /*-----------------------------------------------------------------*/
2329 case OP_EXACTI:
2330 case OP_NOTEXACTI:
2331 caseless = TRUE;
2332 codevalue -= OP_STARI - OP_STAR;
2333 /* Fall through */
2334 case OP_EXACT:
2335 case OP_NOTEXACT:
2336 count = current_state->count; /* Number already matched */
2337 if (clen > 0)
2338 {
2339 unsigned int otherd = NOTACHAR;
2340 if (caseless)
2341 {
2342 #ifdef SUPPORT_UTF8
2343 if (utf8 && d >= 128)
2344 {
2345 #ifdef SUPPORT_UCP
2346 otherd = UCD_OTHERCASE(d);
2347 #endif /* SUPPORT_UCP */
2348 }
2349 else
2350 #endif /* SUPPORT_UTF8 */
2351 otherd = fcc[d];
2352 }
2353 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2354 {
2355 if (++count >= GET2(code, 1))
2356 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2357 else
2358 { ADD_NEW(state_offset, count); }
2359 }
2360 }
2361 break;
2362
2363 /*-----------------------------------------------------------------*/
2364 case OP_UPTOI:
2365 case OP_MINUPTOI:
2366 case OP_POSUPTOI:
2367 case OP_NOTUPTOI:
2368 case OP_NOTMINUPTOI:
2369 case OP_NOTPOSUPTOI:
2370 caseless = TRUE;
2371 codevalue -= OP_STARI - OP_STAR;
2372 /* Fall through */
2373 case OP_UPTO:
2374 case OP_MINUPTO:
2375 case OP_POSUPTO:
2376 case OP_NOTUPTO:
2377 case OP_NOTMINUPTO:
2378 case OP_NOTPOSUPTO:
2379 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2380 count = current_state->count; /* Number already matched */
2381 if (clen > 0)
2382 {
2383 unsigned int otherd = NOTACHAR;
2384 if (caseless)
2385 {
2386 #ifdef SUPPORT_UTF8
2387 if (utf8 && d >= 128)
2388 {
2389 #ifdef SUPPORT_UCP
2390 otherd = UCD_OTHERCASE(d);
2391 #endif /* SUPPORT_UCP */
2392 }
2393 else
2394 #endif /* SUPPORT_UTF8 */
2395 otherd = fcc[d];
2396 }
2397 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2398 {
2399 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2400 {
2401 active_count--; /* Remove non-match possibility */
2402 next_active_state--;
2403 }
2404 if (++count >= GET2(code, 1))
2405 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2406 else
2407 { ADD_NEW(state_offset, count); }
2408 }
2409 }
2410 break;
2411
2412
2413 /* ========================================================================== */
2414 /* These are the class-handling opcodes */
2415
2416 case OP_CLASS:
2417 case OP_NCLASS:
2418 case OP_XCLASS:
2419 {
2420 BOOL isinclass = FALSE;
2421 int next_state_offset;
2422 const pcre_uchar *ecode;
2423
2424 /* For a simple class, there is always just a 32-byte table, and we
2425 can set isinclass from it. */
2426
2427 if (codevalue != OP_XCLASS)
2428 {
2429 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2430 if (clen > 0)
2431 {
2432 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2433 ((code[1 + c/8] & (1 << (c&7))) != 0);
2434 }
2435 }
2436
2437 /* An extended class may have a table or a list of single characters,
2438 ranges, or both, and it may be positive or negative. There's a
2439 function that sorts all this out. */
2440
2441 else
2442 {
2443 ecode = code + GET(code, 1);
2444 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE);
2445 }
2446
2447 /* At this point, isinclass is set for all kinds of class, and ecode
2448 points to the byte after the end of the class. If there is a
2449 quantifier, this is where it will be. */
2450
2451 next_state_offset = (int)(ecode - start_code);
2452
2453 switch (*ecode)
2454 {
2455 case OP_CRSTAR:
2456 case OP_CRMINSTAR:
2457 ADD_ACTIVE(next_state_offset + 1, 0);
2458 if (isinclass) { ADD_NEW(state_offset, 0); }
2459 break;
2460
2461 case OP_CRPLUS:
2462 case OP_CRMINPLUS:
2463 count = current_state->count; /* Already matched */
2464 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2465 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2466 break;
2467
2468 case OP_CRQUERY:
2469 case OP_CRMINQUERY:
2470 ADD_ACTIVE(next_state_offset + 1, 0);
2471 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2472 break;
2473
2474 case OP_CRRANGE:
2475 case OP_CRMINRANGE:
2476 count = current_state->count; /* Already matched */
2477 if (count >= GET2(ecode, 1))
2478 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2479 if (isinclass)
2480 {
2481 int max = GET2(ecode, 3);
2482 if (++count >= max && max != 0) /* Max 0 => no limit */
2483 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2484 else
2485 { ADD_NEW(state_offset, count); }
2486 }
2487 break;
2488
2489 default:
2490 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2491 break;
2492 }
2493 }
2494 break;
2495
2496 /* ========================================================================== */
2497 /* These are the opcodes for fancy brackets of various kinds. We have
2498 to use recursion in order to handle them. The "always failing" assertion
2499 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2500 though the other "backtracking verbs" are not supported. */
2501
2502 case OP_FAIL:
2503 forced_fail++; /* Count FAILs for multiple states */
2504 break;
2505
2506 case OP_ASSERT:
2507 case OP_ASSERT_NOT:
2508 case OP_ASSERTBACK:
2509 case OP_ASSERTBACK_NOT:
2510 {
2511 int rc;
2512 int local_offsets[2];
2513 int local_workspace[1000];
2514 const pcre_uchar *endasscode = code + GET(code, 1);
2515
2516 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2517
2518 rc = internal_dfa_exec(
2519 md, /* static match data */
2520 code, /* this subexpression's code */
2521 ptr, /* where we currently are */
2522 (int)(ptr - start_subject), /* start offset */
2523 local_offsets, /* offset vector */
2524 sizeof(local_offsets)/sizeof(int), /* size of same */
2525 local_workspace, /* workspace vector */
2526 sizeof(local_workspace)/sizeof(int), /* size of same */
2527 rlevel); /* function recursion level */
2528
2529 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2530 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2531 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2532 }
2533 break;
2534
2535 /*-----------------------------------------------------------------*/
2536 case OP_COND:
2537 case OP_SCOND:
2538 {
2539 int local_offsets[1000];
2540 int local_workspace[1000];
2541 int codelink = GET(code, 1);
2542 int condcode;
2543
2544 /* Because of the way auto-callout works during compile, a callout item
2545 is inserted between OP_COND and an assertion condition. This does not
2546 happen for the other conditions. */
2547
2548 if (code[LINK_SIZE+1] == OP_CALLOUT)
2549 {
2550 rrc = 0;
2551 if (pcre_callout != NULL)
2552 {
2553 pcre_callout_block cb;
2554 cb.version = 1; /* Version 1 of the callout block */
2555 cb.callout_number = code[LINK_SIZE+2];
2556 cb.offset_vector = offsets;
2557 cb.subject = (PCRE_SPTR)start_subject;
2558 cb.subject_length = (int)(end_subject - start_subject);
2559 cb.start_match = (int)(current_subject - start_subject);
2560 cb.current_position = (int)(ptr - start_subject);
2561 cb.pattern_position = GET(code, LINK_SIZE + 3);
2562 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2563 cb.capture_top = 1;
2564 cb.capture_last = -1;
2565 cb.callout_data = md->callout_data;
2566 cb.mark = NULL; /* No (*MARK) support */
2567 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2568 }
2569 if (rrc > 0) break; /* Fail this thread */
2570 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2571 }
2572
2573 condcode = code[LINK_SIZE+1];
2574
2575 /* Back reference conditions are not supported */
2576
2577 if (condcode == OP_CREF || condcode == OP_NCREF)
2578 return PCRE_ERROR_DFA_UCOND;
2579
2580 /* The DEFINE condition is always false */
2581
2582 if (condcode == OP_DEF)
2583 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2584
2585 /* The only supported version of OP_RREF is for the value RREF_ANY,
2586 which means "test if in any recursion". We can't test for specifically
2587 recursed groups. */
2588
2589 else if (condcode == OP_RREF || condcode == OP_NRREF)
2590 {
2591 int value = GET2(code, LINK_SIZE+2);
2592 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2593 if (md->recursive != NULL)
2594 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2595 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2596 }
2597
2598 /* Otherwise, the condition is an assertion */
2599
2600 else
2601 {
2602 int rc;
2603 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2604 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2605
2606 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2607
2608 rc = internal_dfa_exec(
2609 md, /* fixed match data */
2610 asscode, /* this subexpression's code */
2611 ptr, /* where we currently are */
2612 (int)(ptr - start_subject), /* start offset */
2613 local_offsets, /* offset vector */
2614 sizeof(local_offsets)/sizeof(int), /* size of same */
2615 local_workspace, /* workspace vector */
2616 sizeof(local_workspace)/sizeof(int), /* size of same */
2617 rlevel); /* function recursion level */
2618
2619 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2620 if ((rc >= 0) ==
2621 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2622 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2623 else
2624 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2625 }
2626 }
2627 break;
2628
2629 /*-----------------------------------------------------------------*/
2630 case OP_RECURSE:
2631 {
2632 dfa_recursion_info *ri;
2633 int local_offsets[1000];
2634 int local_workspace[1000];
2635 const pcre_uchar *callpat = start_code + GET(code, 1);
2636 int recno = (callpat == md->start_code)? 0 :
2637 GET2(callpat, 1 + LINK_SIZE);
2638 int rc;
2639
2640 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2641
2642 /* Check for repeating a recursion without advancing the subject
2643 pointer. This should catch convoluted mutual recursions. (Some simple
2644 cases are caught at compile time.) */
2645
2646 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2647 if (recno == ri->group_num && ptr == ri->subject_position)
2648 return PCRE_ERROR_RECURSELOOP;
2649
2650 /* Remember this recursion and where we started it so as to
2651 catch infinite loops. */
2652
2653 new_recursive.group_num = recno;
2654 new_recursive.subject_position = ptr;
2655 new_recursive.prevrec = md->recursive;
2656 md->recursive = &new_recursive;
2657
2658 rc = internal_dfa_exec(
2659 md, /* fixed match data */
2660 callpat, /* this subexpression's code */
2661 ptr, /* where we currently are */
2662 (int)(ptr - start_subject), /* start offset */
2663 local_offsets, /* offset vector */
2664 sizeof(local_offsets)/sizeof(int), /* size of same */
2665 local_workspace, /* workspace vector */
2666 sizeof(local_workspace)/sizeof(int), /* size of same */
2667 rlevel); /* function recursion level */
2668
2669 md->recursive = new_recursive.prevrec; /* Done this recursion */
2670
2671 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2672 rc));
2673
2674 /* Ran out of internal offsets */
2675
2676 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2677
2678 /* For each successfully matched substring, set up the next state with a
2679 count of characters to skip before trying it. Note that the count is in
2680 characters, not bytes. */
2681
2682 if (rc > 0)
2683 {
2684 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2685 {
2686 const pcre_uchar *p = start_subject + local_offsets[rc];
2687 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2688 int charcount = local_offsets[rc+1] - local_offsets[rc];
2689 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2690 if (charcount > 0)
2691 {
2692 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2693 }
2694 else
2695 {
2696 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2697 }
2698 }
2699 }
2700 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2701 }
2702 break;
2703
2704 /*-----------------------------------------------------------------*/
2705 case OP_BRAPOS:
2706 case OP_SBRAPOS:
2707 case OP_CBRAPOS:
2708 case OP_SCBRAPOS:
2709 case OP_BRAPOSZERO:
2710 {
2711 int charcount, matched_count;
2712 const pcre_uchar *local_ptr = ptr;
2713 BOOL allow_zero;
2714
2715 if (codevalue == OP_BRAPOSZERO)
2716 {
2717 allow_zero = TRUE;
2718 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2719 }
2720 else allow_zero = FALSE;
2721
2722 /* Loop to match the subpattern as many times as possible as if it were
2723 a complete pattern. */
2724
2725 for (matched_count = 0;; matched_count++)
2726 {
2727 int local_offsets[2];
2728 int local_workspace[1000];
2729
2730 int rc = internal_dfa_exec(
2731 md, /* fixed match data */
2732 code, /* this subexpression's code */
2733 local_ptr, /* where we currently are */
2734 (int)(ptr - start_subject), /* start offset */
2735 local_offsets, /* offset vector */
2736 sizeof(local_offsets)/sizeof(int), /* size of same */
2737 local_workspace, /* workspace vector */
2738 sizeof(local_workspace)/sizeof(int), /* size of same */
2739 rlevel); /* function recursion level */
2740
2741 /* Failed to match */
2742
2743 if (rc < 0)
2744 {
2745 if (rc != PCRE_ERROR_NOMATCH) return rc;
2746 break;
2747 }
2748
2749 /* Matched: break the loop if zero characters matched. */
2750
2751 charcount = local_offsets[1] - local_offsets[0];
2752 if (charcount == 0) break;
2753 local_ptr += charcount; /* Advance temporary position ptr */
2754 }
2755
2756 /* At this point we have matched the subpattern matched_count
2757 times, and local_ptr is pointing to the character after the end of the
2758 last match. */
2759
2760 if (matched_count > 0 || allow_zero)
2761 {
2762 const pcre_uchar *end_subpattern = code;
2763 int next_state_offset;
2764
2765 do { end_subpattern += GET(end_subpattern, 1); }
2766 while (*end_subpattern == OP_ALT);
2767 next_state_offset =
2768 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2769
2770 /* Optimization: if there are no more active states, and there
2771 are no new states yet set up, then skip over the subject string
2772 right here, to save looping. Otherwise, set up the new state to swing
2773 into action when the end of the matched substring is reached. */
2774
2775 if (i + 1 >= active_count && new_count == 0)
2776 {
2777 ptr = local_ptr;
2778 clen = 0;
2779 ADD_NEW(next_state_offset, 0);
2780 }
2781 else
2782 {
2783 const pcre_uchar *p = ptr;
2784 const pcre_uchar *pp = local_ptr;
2785 charcount = pp - p;
2786 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2787 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2788 }
2789 }
2790 }
2791 break;
2792
2793 /*-----------------------------------------------------------------*/
2794 case OP_ONCE:
2795 case OP_ONCE_NC:
2796 {
2797 int local_offsets[2];
2798 int local_workspace[1000];
2799
2800 int rc = internal_dfa_exec(
2801 md, /* fixed match data */
2802 code, /* this subexpression's code */
2803 ptr, /* where we currently are */
2804 (int)(ptr - start_subject), /* start offset */
2805 local_offsets, /* offset vector */
2806 sizeof(local_offsets)/sizeof(int), /* size of same */
2807 local_workspace, /* workspace vector */
2808 sizeof(local_workspace)/sizeof(int), /* size of same */
2809 rlevel); /* function recursion level */
2810
2811 if (rc >= 0)
2812 {
2813 const pcre_uchar *end_subpattern = code;
2814 int charcount = local_offsets[1] - local_offsets[0];
2815 int next_state_offset, repeat_state_offset;
2816
2817 do { end_subpattern += GET(end_subpattern, 1); }
2818 while (*end_subpattern == OP_ALT);
2819 next_state_offset =
2820 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2821
2822 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2823 arrange for the repeat state also to be added to the relevant list.
2824 Calculate the offset, or set -1 for no repeat. */
2825
2826 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2827 *end_subpattern == OP_KETRMIN)?
2828 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2829
2830 /* If we have matched an empty string, add the next state at the
2831 current character pointer. This is important so that the duplicate
2832 checking kicks in, which is what breaks infinite loops that match an
2833 empty string. */
2834
2835 if (charcount == 0)
2836 {
2837 ADD_ACTIVE(next_state_offset, 0);
2838 }
2839
2840 /* Optimization: if there are no more active states, and there
2841 are no new states yet set up, then skip over the subject string
2842 right here, to save looping. Otherwise, set up the new state to swing
2843 into action when the end of the matched substring is reached. */
2844
2845 else if (i + 1 >= active_count && new_count == 0)
2846 {
2847 ptr += charcount;
2848 clen = 0;
2849 ADD_NEW(next_state_offset, 0);
2850
2851 /* If we are adding a repeat state at the new character position,
2852 we must fudge things so that it is the only current state.
2853 Otherwise, it might be a duplicate of one we processed before, and
2854 that would cause it to be skipped. */
2855
2856 if (repeat_state_offset >= 0)
2857 {
2858 next_active_state = active_states;
2859 active_count = 0;
2860 i = -1;
2861 ADD_ACTIVE(repeat_state_offset, 0);
2862 }
2863 }
2864 else
2865 {
2866 const pcre_uchar *p = start_subject + local_offsets[0];
2867 const pcre_uchar *pp = start_subject + local_offsets[1];
2868 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2869 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2870 if (repeat_state_offset >= 0)
2871 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2872 }
2873 }
2874 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2875 }
2876 break;
2877
2878
2879 /* ========================================================================== */
2880 /* Handle callouts */
2881
2882 case OP_CALLOUT:
2883 rrc = 0;
2884 if (pcre_callout != NULL)
2885 {
2886 pcre_callout_block cb;
2887 cb.version = 1; /* Version 1 of the callout block */
2888 cb.callout_number = code[1];
2889 cb.offset_vector = offsets;
2890 cb.subject = (PCRE_SPTR)start_subject;
2891 cb.subject_length = (int)(end_subject - start_subject);
2892 cb.start_match = (int)(current_subject - start_subject);
2893 cb.current_position = (int)(ptr - start_subject);
2894 cb.pattern_position = GET(code, 2);
2895 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2896 cb.capture_top = 1;
2897 cb.capture_last = -1;
2898 cb.callout_data = md->callout_data;
2899 cb.mark = NULL; /* No (*MARK) support */
2900 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2901 }
2902 if (rrc == 0)
2903 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2904 break;
2905
2906
2907 /* ========================================================================== */
2908 default: /* Unsupported opcode */
2909 return PCRE_ERROR_DFA_UITEM;
2910 }
2911
2912 NEXT_ACTIVE_STATE: continue;
2913
2914 } /* End of loop scanning active states */
2915
2916 /* We have finished the processing at the current subject character. If no
2917 new states have been set for the next character, we have found all the
2918 matches that we are going to find. If we are at the top level and partial
2919 matching has been requested, check for appropriate conditions.
2920
2921 The "forced_fail" variable counts the number of (*F) encountered for the
2922 character. If it is equal to the original active_count (saved in
2923 workspace[1]) it means that (*F) was found on every active state. In this
2924 case we don't want to give a partial match.
2925
2926 The "could_continue" variable is true if a state could have continued but
2927 for the fact that the end of the subject was reached. */
2928
2929 if (new_count <= 0)
2930 {
2931 if (rlevel == 1 && /* Top level, and */
2932 could_continue && /* Some could go on */
2933 forced_fail != workspace[1] && /* Not all forced fail & */
2934 ( /* either... */
2935 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2936 || /* or... */
2937 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2938 match_count < 0) /* no matches */
2939 ) && /* And... */
2940 ptr >= end_subject && /* Reached end of subject */
2941 ptr > md->start_used_ptr) /* Inspected non-empty string */
2942 {
2943 if (offsetcount >= 2)
2944 {
2945 offsets[0] = (int)(md->start_used_ptr - start_subject);
2946 offsets[1] = (int)(end_subject - start_subject);
2947 }
2948 match_count = PCRE_ERROR_PARTIAL;
2949 }
2950
2951 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2952 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2953 rlevel*2-2, SP));
2954 break; /* In effect, "return", but see the comment below */
2955 }
2956
2957 /* One or more states are active for the next character. */
2958
2959 ptr += clen; /* Advance to next subject character */
2960 } /* Loop to move along the subject string */
2961
2962 /* Control gets here from "break" a few lines above. We do it this way because
2963 if we use "return" above, we have compiler trouble. Some compilers warn if
2964 there's nothing here because they think the function doesn't return a value. On
2965 the other hand, if we put a dummy statement here, some more clever compilers
2966 complain that it can't be reached. Sigh. */
2967
2968 return match_count;
2969 }
2970
2971
2972
2973
2974 /*************************************************
2975 * Execute a Regular Expression - DFA engine *
2976 *************************************************/
2977
2978 /* This external function applies a compiled re to a subject string using a DFA
2979 engine. This function calls the internal function multiple times if the pattern
2980 is not anchored.
2981
2982 Arguments:
2983 argument_re points to the compiled expression
2984 extra_data points to extra data or is NULL
2985 subject points to the subject string
2986 length length of subject string (may contain binary zeros)
2987 start_offset where to start in the subject string
2988 options option bits
2989 offsets vector of match offsets
2990 offsetcount size of same
2991 workspace workspace vector
2992 wscount size of same
2993
2994 Returns: > 0 => number of match offset pairs placed in offsets
2995 = 0 => offsets overflowed; longest matches are present
2996 -1 => failed to match
2997 < -1 => some kind of unexpected problem
2998 */
2999
3000 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3001 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3002 const char *subject, int length, int start_offset, int options, int *offsets,
3003 int offsetcount, int *workspace, int wscount)
3004 {
3005 real_pcre *re = (real_pcre *)argument_re;
3006 dfa_match_data match_block;
3007 dfa_match_data *md = &match_block;
3008 BOOL utf8, anchored, startline, firstline;
3009 const pcre_uchar *current_subject, *end_subject;
3010 const pcre_uint8 *lcc;
3011
3012 pcre_study_data internal_study;
3013 const pcre_study_data *study = NULL;
3014 real_pcre internal_re;
3015
3016 const pcre_uchar *req_char_ptr;
3017 const pcre_uint8 *start_bits = NULL;
3018 BOOL has_first_char = FALSE;
3019 BOOL has_req_char = FALSE;
3020 pcre_uchar first_char = 0;
3021 pcre_uchar first_char2 = 0;
3022 pcre_uchar req_char = 0;
3023 pcre_uchar req_char2 = 0;
3024 int newline;
3025
3026 /* Plausibility checks */
3027
3028 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3029 if (re == NULL || subject == NULL || workspace == NULL ||
3030 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3031 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3032 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3033 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3034
3035 /* We need to find the pointer to any study data before we test for byte
3036 flipping, so we scan the extra_data block first. This may set two fields in the
3037 match block, so we must initialize them beforehand. However, the other fields
3038 in the match block must not be set until after the byte flipping. */
3039
3040 md->tables = re->tables;
3041 md->callout_data = NULL;
3042
3043 if (extra_data != NULL)
3044 {
3045 unsigned int flags = extra_data->flags;
3046 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3047 study = (const pcre_study_data *)extra_data->study_data;
3048 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3049 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3050 return PCRE_ERROR_DFA_UMLIMIT;
3051 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3052 md->callout_data = extra_data->callout_data;
3053 if ((flags & PCRE_EXTRA_TABLES) != 0)
3054 md->tables = extra_data->tables;
3055 }
3056
3057 /* Check that the first field in the block is the magic number. If it is not,
3058 test for a regex that was compiled on a host of opposite endianness. If this is
3059 the case, flipped values are put in internal_re and internal_study if there was
3060 study data too. */
3061
3062 if (re->magic_number != MAGIC_NUMBER)
3063 {
3064 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
3065 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3066 if (study != NULL) study = &internal_study;
3067 }
3068
3069 /* Set some local values */
3070
3071 current_subject = (const unsigned char *)subject + start_offset;
3072 end_subject = (const unsigned char *)subject + length;
3073 req_char_ptr = current_subject - 1;
3074
3075 #ifdef SUPPORT_UTF8
3076 utf8 = (re->options & PCRE_UTF8) != 0;
3077 #else
3078 utf8 = FALSE;
3079 #endif
3080
3081 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3082 (re->options & PCRE_ANCHORED) != 0;
3083
3084 /* The remaining fixed data for passing around. */
3085
3086 md->start_code = (const pcre_uchar *)argument_re +
3087 re->name_table_offset + re->name_count * re->name_entry_size;
3088 md->start_subject = (const unsigned char *)subject;
3089 md->end_subject = end_subject;
3090 md->start_offset = start_offset;
3091 md->moptions = options;
3092 md->poptions = re->options;
3093
3094 /* If the BSR option is not set at match time, copy what was set
3095 at compile time. */
3096
3097 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3098 {
3099 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3100 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3101 #ifdef BSR_ANYCRLF
3102 else md->moptions |= PCRE_BSR_ANYCRLF;
3103 #endif
3104 }
3105
3106 /* Handle different types of newline. The three bits give eight cases. If
3107 nothing is set at run time, whatever was used at compile time applies. */
3108
3109 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3110 PCRE_NEWLINE_BITS)
3111 {
3112 case 0: newline = NEWLINE; break; /* Compile-time default */
3113 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3114 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3115 case PCRE_NEWLINE_CR+
3116 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3117 case PCRE_NEWLINE_ANY: newline = -1; break;
3118 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3119 default: return PCRE_ERROR_BADNEWLINE;
3120 }
3121
3122 if (newline == -2)
3123 {
3124 md->nltype = NLTYPE_ANYCRLF;
3125 }
3126 else if (newline < 0)
3127 {
3128 md->nltype = NLTYPE_ANY;
3129 }
3130 else
3131 {
3132 md->nltype = NLTYPE_FIXED;
3133 if (newline > 255)
3134 {
3135 md->nllen = 2;
3136 md->nl[0] = (newline >> 8) & 255;
3137 md->nl[1] = newline & 255;
3138 }
3139 else
3140 {
3141 md->nllen = 1;
3142 md->nl[0] = newline;
3143 }
3144 }
3145
3146 /* Check a UTF-8 string if required. On failure, the error offset and error
3147 code are passed back in offsets[0] and offsets[1] if there is room for them. */
3148
3149 #ifdef SUPPORT_UTF8
3150 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3151 {
3152 int erroroffset;
3153 int errorcode = PRIV(valid_utf8)((pcre_uchar *)subject, length, &erroroffset);
3154 if (errorcode != 0)
3155 {
3156 if (offsetcount >= 2)
3157 {
3158 offsets[0] = erroroffset;
3159 offsets[1] = errorcode;
3160 }
3161 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3162 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3163 }
3164 if (start_offset > 0 && start_offset < length &&
3165 (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
3166 return PCRE_ERROR_BADUTF8_OFFSET;
3167 }
3168 #endif
3169
3170 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3171 is a feature that makes it possible to save compiled regex and re-use them
3172 in other programs later. */
3173
3174 if (md->tables == NULL) md->tables = PRIV(default_tables);
3175
3176 /* The lower casing table and the "must be at the start of a line" flag are
3177 used in a loop when finding where to start. */
3178
3179 lcc = md->tables + lcc_offset;
3180 startline = (re->flags & PCRE_STARTLINE) != 0;
3181 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3182
3183 /* Set up the first character to match, if available. The first_char value is
3184 never set for an anchored regular expression, but the anchoring may be forced
3185 at run time, so we have to test for anchoring. The first char may be unset for
3186 an unanchored pattern, of course. If there's no first char and the pattern was
3187 studied, there may be a bitmap of possible first characters. */
3188
3189 if (!anchored)
3190 {
3191 if ((re->flags & PCRE_FIRSTSET) != 0)
3192 {
3193 has_first_char = TRUE;
3194 first_char = first_char2 = re->first_char;
3195 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3196 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3197 }
3198 else
3199 {
3200 if (!startline && study != NULL &&
3201 (study->flags & PCRE_STUDY_MAPPED) != 0)
3202 start_bits = study->start_bits;
3203 }
3204 }
3205
3206 /* For anchored or unanchored matches, there may be a "last known required
3207 character" set. */
3208
3209 if ((re->flags & PCRE_REQCHSET) != 0)
3210 {
3211 has_req_char = TRUE;
3212 req_char = req_char2 = re->req_char;
3213 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3214 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3215 }
3216
3217 /* Call the main matching function, looping for a non-anchored regex after a
3218 failed match. If not restarting, perform certain optimizations at the start of
3219 a match. */
3220
3221 for (;;)
3222 {
3223 int rc;
3224
3225 if ((options & PCRE_DFA_RESTART) == 0)
3226 {
3227 const pcre_uchar *save_end_subject = end_subject;
3228
3229 /* If firstline is TRUE, the start of the match is constrained to the first
3230 line of a multiline string. Implement this by temporarily adjusting
3231 end_subject so that we stop scanning at a newline. If the match fails at
3232 the newline, later code breaks this loop. */
3233
3234 if (firstline)
3235 {
3236 PCRE_PUCHAR t = current_subject;
3237 #ifdef SUPPORT_UTF8
3238 if (utf8)
3239 {
3240 while (t < md->end_subject && !IS_NEWLINE(t))
3241 {
3242 t++;
3243 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3244 }
3245 }
3246 else
3247 #endif
3248 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3249 end_subject = t;
3250 }
3251
3252 /* There are some optimizations that avoid running the match if a known
3253 starting point is not found. However, there is an option that disables
3254 these, for testing and for ensuring that all callouts do actually occur.
3255 The option can be set in the regex by (*NO_START_OPT) or passed in
3256 match-time options. */
3257
3258 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3259 {
3260 /* Advance to a known first char. */
3261
3262 if (has_first_char)
3263 {
3264 if (first_char != first_char2)
3265 while (current_subject < end_subject &&
3266 *current_subject != first_char && *current_subject != first_char2)
3267 current_subject++;
3268 else
3269 while (current_subject < end_subject &&
3270 *current_subject != first_char)
3271 current_subject++;
3272 }
3273
3274 /* Or to just after a linebreak for a multiline match if possible */
3275
3276 else if (startline)
3277 {
3278 if (current_subject > md->start_subject + start_offset)
3279 {
3280 #ifdef SUPPORT_UTF8
3281 if (utf8)
3282 {
3283 while (current_subject < end_subject &&
3284 !WAS_NEWLINE(current_subject))
3285 {
3286 current_subject++;
3287 while(current_subject < end_subject &&
3288 (*current_subject & 0xc0) == 0x80)
3289 current_subject++;
3290 }
3291 }
3292 else
3293 #endif
3294 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3295 current_subject++;
3296
3297 /* If we have just passed a CR and the newline option is ANY or
3298 ANYCRLF, and we are now at a LF, advance the match position by one
3299 more character. */
3300
3301 if (current_subject[-1] == CHAR_CR &&
3302 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3303 current_subject < end_subject &&
3304 *current_subject == CHAR_NL)
3305 current_subject++;
3306 }
3307 }
3308
3309 /* Or to a non-unique first char after study */
3310
3311 else if (start_bits != NULL)
3312 {
3313 while (current_subject < end_subject)
3314 {
3315 register unsigned int c = *current_subject;
3316 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3317 {
3318 current_subject++;
3319 #ifdef SUPPORT_UTF8
3320 if (utf8)
3321 while(current_subject < end_subject &&
3322 (*current_subject & 0xc0) == 0x80) current_subject++;
3323 #endif
3324 }
3325 else break;
3326 }
3327 }
3328 }
3329
3330 /* Restore fudged end_subject */
3331
3332 end_subject = save_end_subject;
3333
3334 /* The following two optimizations are disabled for partial matching or if
3335 disabling is explicitly requested (and of course, by the test above, this
3336 code is not obeyed when restarting after a partial match). */
3337
3338 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3339 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3340 {
3341 /* If the pattern was studied, a minimum subject length may be set. This
3342 is a lower bound; no actual string of that length may actually match the
3343 pattern. Although the value is, strictly, in characters, we treat it as
3344 bytes to avoid spending too much time in this optimization. */
3345
3346 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3347 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3348 return PCRE_ERROR_NOMATCH;
3349
3350 /* If req_char is set, we know that that character must appear in the
3351 subject for the match to succeed. If the first character is set, req_char
3352 must be later in the subject; otherwise the test starts at the match
3353 point. This optimization can save a huge amount of work in patterns with
3354 nested unlimited repeats that aren't going to match. Writing separate
3355 code for cased/caseless versions makes it go faster, as does using an
3356 autoincrement and backing off on a match.
3357
3358 HOWEVER: when the subject string is very, very long, searching to its end
3359 can take a long time, and give bad performance on quite ordinary
3360 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3361 string... so we don't do this when the string is sufficiently long. */
3362
3363 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3364 {
3365 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3366
3367 /* We don't need to repeat the search if we haven't yet reached the
3368 place we found it at last time. */
3369
3370 if (p > req_char_ptr)
3371 {
3372 if (req_char != req_char2)
3373 {
3374 while (p < end_subject)
3375 {
3376 register int pp = *p++;
3377 if (pp == req_char || pp == req_char2) { p--; break; }
3378 }
3379 }
3380 else
3381 {
3382 while (p < end_subject)
3383 {
3384 if (*p++ == req_char) { p--; break; }
3385 }
3386 }
3387
3388 /* If we can't find the required character, break the matching loop,
3389 which will cause a return of PCRE_ERROR_NOMATCH. */
3390
3391 if (p >= end_subject) break;
3392
3393 /* If we have found the required character, save the point where we
3394 found it, so that we don't search again next time round the loop if
3395 the start hasn't passed this character yet. */
3396
3397 req_char_ptr = p;
3398 }
3399 }
3400 }
3401 } /* End of optimizations that are done when not restarting */
3402
3403 /* OK, now we can do the business */
3404
3405 md->start_used_ptr = current_subject;
3406 md->recursive = NULL;
3407
3408 rc = internal_dfa_exec(
3409 md, /* fixed match data */
3410 md->start_code, /* this subexpression's code */
3411 current_subject, /* where we currently are */
3412 start_offset, /* start offset in subject */
3413 offsets, /* offset vector */
3414 offsetcount, /* size of same */
3415 workspace, /* workspace vector */
3416 wscount, /* size of same */
3417 0); /* function recurse level */
3418
3419 /* Anything other than "no match" means we are done, always; otherwise, carry
3420 on only if not anchored. */
3421
3422 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3423
3424 /* Advance to the next subject character unless we are at the end of a line
3425 and firstline is set. */
3426
3427 if (firstline && IS_NEWLINE(current_subject)) break;
3428 current_subject++;
3429 if (utf8)
3430 {
3431 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3432 current_subject++;
3433 }
3434 if (current_subject > end_subject) break;
3435
3436 /* If we have just passed a CR and we are now at a LF, and the pattern does
3437 not contain any explicit matches for \r or \n, and the newline option is CRLF
3438 or ANY or ANYCRLF, advance the match position by one more character. */
3439
3440 if (current_subject[-1] == CHAR_CR &&
3441 current_subject < end_subject &&
3442 *current_subject == CHAR_NL &&
3443 (re->flags & PCRE_HASCRORLF) == 0 &&
3444 (md->nltype == NLTYPE_ANY ||
3445 md->nltype == NLTYPE_ANYCRLF ||
3446 md->nllen == 2))
3447 current_subject++;
3448
3449 } /* "Bumpalong" loop */
3450
3451 return PCRE_ERROR_NOMATCH;
3452 }
3453
3454 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5