/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 781 - (show annotations)
Sat Dec 3 07:58:30 2011 UTC (9 years, 4 months ago) by zherczeg
File MIME type: text/plain
File size: 120257 byte(s)
renaming utf8 to utf, JIT compiler update, disallowing invalid utf chars
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2011 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75
76 #ifdef HAVE_CONFIG_H
77 #include "config.h"
78 #endif
79
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
83
84 #include "pcre_internal.h"
85
86
87 /* For use to indent debugging output */
88
89 #define SP " "
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100
101 #define OP_PROP_EXTRA 300 /* added when the repeated item is OP_PROP or OP_NOTPROP */
102 #define OP_EXTUNI_EXTRA 320 /* added when the repeated item is OP_EXTUNI */
103 #define OP_ANYNL_EXTRA 340 /* added when the repeated item is OP_ANYNL */
104 #define OP_HSPACE_EXTRA 360 /* added when the repeated item is OP_HSPACE or OP_NOT_HSPACE */
105 #define OP_VSPACE_EXTRA 380 /* added when the repeated item is OP_VSPACE or OP_NOT_VSPACE */
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
116 static const pcre_uint8 coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 3, 3, 3, /* upto, minupto, exact */
132 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 3, 3, 3, /* upto I, minupto I, exact I */
135 1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 /* Negative single-char repeats - only for chars < 256 */
137 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138 3, 3, 3, /* NOT upto, minupto, exact */
139 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141 3, 3, 3, /* NOT upto I, minupto I, exact I */
142 1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 /* Positive type repeats */
144 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145 3, 3, 3, /* Type upto, minupto, exact */
146 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 /* Character class & ref repeats */
148 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149 0, 0, /* CRRANGE, CRMINRANGE */
150 0, /* CLASS */
151 0, /* NCLASS */
152 0, /* XCLASS - variable length */
153 0, /* REF */
154 0, /* REFI */
155 0, /* RECURSE */
156 0, /* CALLOUT */
157 0, /* Alt */
158 0, /* Ket */
159 0, /* KetRmax */
160 0, /* KetRmin */
161 0, /* KetRpos */
162 0, /* Reverse */
163 0, /* Assert */
164 0, /* Assert not */
165 0, /* Assert behind */
166 0, /* Assert behind not */
167 0, 0, /* ONCE, ONCE_NC */
168 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
169 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
170 0, 0, /* CREF, NCREF */
171 0, 0, /* RREF, NRREF */
172 0, /* DEF */
173 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
174 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
175 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
176 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
177 0, 0 /* CLOSE, SKIPZERO */
178 };
179
180 /* This table identifies those opcodes that inspect a character. It is used to
181 remember the fact that a character could have been inspected when the end of
182 the subject is reached. ***NOTE*** If the start of this table is modified, the
183 two tables that follow must also be modified. */
184
/* poptable[op] is non-zero for opcodes that look at a subject character.
internal_dfa_exec() consults it only when the end of the subject has been
reached (clen == 0), to set could_continue for partial-match testing. Like
coptable, it is indexed by opcode, and the compile-time fudge in the main
switch requires sizeof(poptable) to equal OP_TABLE_LENGTH. */
185 static const pcre_uint8 poptable[] = {
186 0, /* End */
187 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
188 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
189 1, 1, 1, /* Any, AllAny, Anybyte */
190 1, 1, /* \P, \p */
191 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
192 1, /* \X */
193 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
194 1, /* Char */
195 1, /* Chari */
196 1, /* not */
197 1, /* noti */
198 /* Positive single-char repeats */
199 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200 1, 1, 1, /* upto, minupto, exact */
201 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
202 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
203 1, 1, 1, /* upto I, minupto I, exact I */
204 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
205 /* Negative single-char repeats - only for chars < 256 */
206 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
207 1, 1, 1, /* NOT upto, minupto, exact */
208 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
209 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
210 1, 1, 1, /* NOT upto I, minupto I, exact I */
211 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
212 /* Positive type repeats */
213 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
214 1, 1, 1, /* Type upto, minupto, exact */
215 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
216 /* Character class & ref repeats */
217 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
218 1, 1, /* CRRANGE, CRMINRANGE */
219 1, /* CLASS */
220 1, /* NCLASS */
221 1, /* XCLASS - variable length */
222 0, /* REF */
223 0, /* REFI */
224 0, /* RECURSE */
225 0, /* CALLOUT */
226 0, /* Alt */
227 0, /* Ket */
228 0, /* KetRmax */
229 0, /* KetRmin */
230 0, /* KetRpos */
231 0, /* Reverse */
232 0, /* Assert */
233 0, /* Assert not */
234 0, /* Assert behind */
235 0, /* Assert behind not */
236 0, 0, /* ONCE, ONCE_NC */
237 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
238 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
239 0, 0, /* CREF, NCREF */
240 0, 0, /* RREF, NRREF */
241 0, /* DEF */
242 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
243 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
244 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
245 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
246 0, 0 /* CLOSE, SKIPZERO */
247 };
248
249 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250 and \w */
251
/* toptable1[op] is the ctypes bit mask for the character-type opcode op. The
matcher evaluates ((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0, so this
table selects which class bit of the character is examined. A zero mask means
no bit is examined and the result comes from toptable2 alone. */
252 static const pcre_uint8 toptable1[] = {
253 0, 0, 0, 0, 0, 0, /* six opcodes ahead of \D in opcode order */
254 ctype_digit, ctype_digit, /* \D, \d */
255 ctype_space, ctype_space, /* \S, \s */
256 ctype_word, ctype_word, /* \W, \w */
257 0, 0 /* OP_ANY, OP_ALLANY */
258 };
259
/* toptable2[op] is XORed with the masked ctypes value selected by
toptable1[op]. For the negated types (\D, \S, \W) it repeats the mask bit,
which makes the test non-zero when the character lacks that class. For the
positive types it is zero, leaving the masked bit unchanged. The final two
entries are 1 with a zero mask, so the test is always non-zero for them. */
260 static const pcre_uint8 toptable2[] = {
261 0, 0, 0, 0, 0, 0, /* six opcodes ahead of \D in opcode order */
262 ctype_digit, 0, /* \D, \d */
263 ctype_space, 0, /* \S, \s */
264 ctype_word, 0, /* \W, \w */
265 1, 1 /* OP_ANY, OP_ALLANY */
266 };
267
268
269 /* Structure for holding data about a particular state, which is in effect the
270 current data for an active path through the match tree. It must consist
271 entirely of ints because the working vector we are passed, and which we put
272 these structures in, is a vector of ints. */
273
typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value marks
                                     a state that is being held off */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data; for a held-off state
                                     it is the number of characters still to
                                     be skipped */
} stateblock;

/* Number of ints taken up by one stateblock. The value is cast to int so that
expressions combining it with the signed workspace size are evaluated in
signed arithmetic; a bare sizeof quotient has type size_t and would convert a
negative operand into a huge unsigned value. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
281
282
283 #ifdef PCRE_DEBUG
284 /*************************************************
285 * Print character string *
286 *************************************************/
287
288 /* Character string printing function for debugging.
289
290 Arguments:
291 p points to string
292 length number of bytes
293 f where to print
294
295 Returns: nothing
296 */
297
/* Write a counted byte string for debugging. Bytes for which isprint() is
true are written unchanged; every other byte is written as a backslash, an
"x", and at least two lower-case hex digits.

Arguments:
  p        points to the first byte; the data is only read, never modified
  length   number of bytes to write; zero or a negative value writes nothing
  f        stream that receives the output

Returns:   nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;   /* unsigned char value, so always valid for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
310 #endif
311
312
313
314 /*************************************************
315 * Execute a Regular Expression - DFA engine *
316 *************************************************/
317
318 /* This internal function applies a compiled pattern to a subject string,
319 starting at a given point, using a DFA engine. This function is called from the
320 external one, possibly multiple times if the pattern is not anchored. The
321 function calls itself recursively for some kinds of subpattern.
322
323 Arguments:
324 md the match_data block with fixed information
325 this_start_code the opening bracket of this subexpression's code
326 current_subject where we currently are in the subject string
327 start_offset start offset in the subject string
328 offsets vector to contain the matching string offsets
329 offsetcount size of same
330 workspace vector of workspace
331 wscount size of same
332 rlevel function call recursion level
333
334 Returns: > 0 => number of match offset pairs placed in offsets
335 = 0 => offsets overflowed; longest matches are present
336 -1 => failed to match
337 < -1 => some kind of unexpected problem
338
339 The following macros are used for adding states to the two state vectors (one
340 for the current character, one for the following character). */
341
/* Append a state with opcode offset x and repeat count y to the active state
list. active_count is incremented unconditionally; if it had already reached
wscount, the enclosing function returns PCRE_ERROR_DFA_WSSIZE and nothing is
stored. The data field of the new state is left untouched. The body is wrapped
in do ... while (0) so that the macro plus its trailing semicolon is exactly
one statement. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
351
/* Append a state with opcode offset x, repeat count y and extra data z to the
active state list. active_count is incremented unconditionally; if it had
already reached wscount, the enclosing function returns PCRE_ERROR_DFA_WSSIZE
and nothing is stored. The body is wrapped in do ... while (0) so that the
macro plus its trailing semicolon is exactly one statement. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
362
/* Append a state with opcode offset x and repeat count y to the new state
list, the one used for the following character. new_count is incremented
unconditionally; if it had already reached wscount, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE and nothing is stored. The data field of the new
state is left untouched. The body is wrapped in do ... while (0) so that the
macro plus its trailing semicolon is exactly one statement. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
372
/* Append a state with opcode offset x, repeat count y and extra data z to the
new state list, the one used for the following character. new_count is
incremented unconditionally; if it had already reached wscount, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE and nothing is stored. The body is
wrapped in do ... while (0) so that the macro plus its trailing semicolon is
exactly one statement. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
383
384 /* And now, here is the code */
385
386 static int
387 internal_dfa_exec(
388 dfa_match_data *md,
389 const pcre_uchar *this_start_code,
390 const pcre_uchar *current_subject,
391 int start_offset,
392 int *offsets,
393 int offsetcount,
394 int *workspace,
395 int wscount,
396 int rlevel)
397 {
398 stateblock *active_states, *new_states, *temp_states;
399 stateblock *next_active_state, *next_new_state;
400
401 const pcre_uint8 *ctypes, *lcc, *fcc;
402 const pcre_uchar *ptr;
403 const pcre_uchar *end_code, *first_op;
404
405 dfa_recursion_info new_recursive;
406
407 int active_count, new_count, match_count;
408
409 /* Some fields in the md block are frequently referenced, so we load them into
410 independent variables in the hope that this will perform better. */
411
412 const pcre_uchar *start_subject = md->start_subject;
413 const pcre_uchar *end_subject = md->end_subject;
414 const pcre_uchar *start_code = md->start_code;
415
416 #ifdef SUPPORT_UTF8
417 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
418 #else
419 BOOL utf = FALSE;
420 #endif
421
422 rlevel++;
423 offsetcount &= (-2);
424
425 wscount -= 2;
426 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
427 (2 * INTS_PER_STATEBLOCK);
428
429 DPRINTF(("\n%.*s---------------------\n"
430 "%.*sCall to internal_dfa_exec f=%d\n",
431 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432
433 ctypes = md->tables + ctypes_offset;
434 lcc = md->tables + lcc_offset;
435 fcc = md->tables + fcc_offset;
436
437 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
438
439 active_states = (stateblock *)(workspace + 2);
440 next_new_state = new_states = active_states + wscount;
441 new_count = 0;
442
443 first_op = this_start_code + 1 + LINK_SIZE +
444 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
446 ? IMM2_SIZE:0);
447
448 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
449 the alternative states onto the list, and find out where the end is. This
450 makes it possible to use this function recursively, when we want to stop at a
451 matching internal ket rather than at the end.
452
453 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
454 a backward assertion. In that case, we have to find out the maximum amount to
455 move back, and set up each alternative appropriately. */
456
457 if (*first_op == OP_REVERSE)
458 {
459 int max_back = 0;
460 int gone_back;
461
462 end_code = this_start_code;
463 do
464 {
465 int back = GET(end_code, 2+LINK_SIZE);
466 if (back > max_back) max_back = back;
467 end_code += GET(end_code, 1);
468 }
469 while (*end_code == OP_ALT);
470
471 /* If we can't go back the amount required for the longest lookbehind
472 pattern, go back as far as we can; some alternatives may still be viable. */
473
474 #ifdef SUPPORT_UTF8
475 /* In character mode we have to step back character by character */
476
477 if (utf)
478 {
479 for (gone_back = 0; gone_back < max_back; gone_back++)
480 {
481 if (current_subject <= start_subject) break;
482 current_subject--;
483 while (current_subject > start_subject &&
484 (*current_subject & 0xc0) == 0x80)
485 current_subject--;
486 }
487 }
488 else
489 #endif
490
491 /* In byte-mode we can do this quickly. */
492
493 {
494 gone_back = (current_subject - max_back < start_subject)?
495 (int)(current_subject - start_subject) : max_back;
496 current_subject -= gone_back;
497 }
498
499 /* Save the earliest consulted character */
500
501 if (current_subject < md->start_used_ptr)
502 md->start_used_ptr = current_subject;
503
504 /* Now we can process the individual branches. */
505
506 end_code = this_start_code;
507 do
508 {
509 int back = GET(end_code, 2+LINK_SIZE);
510 if (back <= gone_back)
511 {
512 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
513 ADD_NEW_DATA(-bstate, 0, gone_back - back);
514 }
515 end_code += GET(end_code, 1);
516 }
517 while (*end_code == OP_ALT);
518 }
519
520 /* This is the code for a "normal" subpattern (not a backward assertion). The
521 start of a whole pattern is always one of these. If we are at the top level,
522 we may be asked to restart matching from the same point that we reached for a
523 previous partial match. We still have to scan through the top-level branches to
524 find the end state. */
525
526 else
527 {
528 end_code = this_start_code;
529
530 /* Restarting */
531
532 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
533 {
534 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
535 new_count = workspace[1];
536 if (!workspace[0])
537 memcpy(new_states, active_states, new_count * sizeof(stateblock));
538 }
539
540 /* Not restarting */
541
542 else
543 {
544 int length = 1 + LINK_SIZE +
545 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
546 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
547 ? IMM2_SIZE:0);
548 do
549 {
550 ADD_NEW((int)(end_code - start_code + length), 0);
551 end_code += GET(end_code, 1);
552 length = 1 + LINK_SIZE;
553 }
554 while (*end_code == OP_ALT);
555 }
556 }
557
558 workspace[0] = 0; /* Bit indicating which vector is current */
559
560 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
561
562 /* Loop for scanning the subject */
563
564 ptr = current_subject;
565 for (;;)
566 {
567 int i, j;
568 int clen, dlen;
569 unsigned int c, d;
570 int forced_fail = 0;
571 BOOL could_continue = FALSE;
572
573 /* Make the new state list into the active state list and empty the
574 new state list. */
575
576 temp_states = active_states;
577 active_states = new_states;
578 new_states = temp_states;
579 active_count = new_count;
580 new_count = 0;
581
582 workspace[0] ^= 1; /* Remember for the restarting feature */
583 workspace[1] = active_count;
584
585 #ifdef PCRE_DEBUG
586 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
587 pchars((pcre_uchar *)ptr, strlen((char *)ptr), stdout);
588 printf("\"\n");
589
590 printf("%.*sActive states: ", rlevel*2-2, SP);
591 for (i = 0; i < active_count; i++)
592 printf("%d/%d ", active_states[i].offset, active_states[i].count);
593 printf("\n");
594 #endif
595
596 /* Set the pointers for adding new states */
597
598 next_active_state = active_states + active_count;
599 next_new_state = new_states;
600
601 /* Load the current character from the subject outside the loop, as many
602 different states may want to look at it, and we assume that at least one
603 will. */
604
605 if (ptr < end_subject)
606 {
607 clen = 1; /* Number of bytes in the character */
608 #ifdef SUPPORT_UTF8
609 if (utf) { GETCHARLEN(c, ptr, clen); } else
610 #endif /* SUPPORT_UTF8 */
611 c = *ptr;
612 }
613 else
614 {
615 clen = 0; /* This indicates the end of the subject */
616 c = NOTACHAR; /* This value should never actually be used */
617 }
618
619 /* Scan up the active states and act on each one. The result of an action
620 may be to add more states to the currently active list (e.g. on hitting a
621 parenthesis) or it may be to put states on the new list, for considering
622 when we move the character pointer on. */
623
624 for (i = 0; i < active_count; i++)
625 {
626 stateblock *current_state = active_states + i;
627 BOOL caseless = FALSE;
628 const pcre_uchar *code;
629 int state_offset = current_state->offset;
630 int count, codevalue, rrc;
631
632 #ifdef PCRE_DEBUG
633 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
634 if (clen == 0) printf("EOL\n");
635 else if (c > 32 && c < 127) printf("'%c'\n", c);
636 else printf("0x%02x\n", c);
637 #endif
638
639 /* A negative offset is a special case meaning "hold off going to this
640 (negated) state until the number of characters in the data field have
641 been skipped". */
642
643 if (state_offset < 0)
644 {
645 if (current_state->data > 0)
646 {
647 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
648 ADD_NEW_DATA(state_offset, current_state->count,
649 current_state->data - 1);
650 continue;
651 }
652 else
653 {
654 current_state->offset = state_offset = -state_offset;
655 }
656 }
657
658 /* Check for a duplicate state with the same count, and skip if found.
659 See the note at the head of this module about the possibility of improving
660 performance here. */
661
662 for (j = 0; j < i; j++)
663 {
664 if (active_states[j].offset == state_offset &&
665 active_states[j].count == current_state->count)
666 {
667 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
668 goto NEXT_ACTIVE_STATE;
669 }
670 }
671
672 /* The state offset is the offset to the opcode */
673
674 code = start_code + state_offset;
675 codevalue = *code;
676
677 /* If this opcode inspects a character, but we are at the end of the
678 subject, remember the fact for use when testing for a partial match. */
679
680 if (clen == 0 && poptable[codevalue] != 0)
681 could_continue = TRUE;
682
683 /* If this opcode is followed by an inline character, load it. It is
684 tempting to test for the presence of a subject character here, but that
685 is wrong, because sometimes zero repetitions of the subject are
686 permitted.
687
688 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
689 argument that is not a data character - but is always one byte long. We
690 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
691 this case. To keep the other cases fast, convert these ones to new opcodes.
692 */
693
694 if (coptable[codevalue] > 0)
695 {
696 dlen = 1;
697 #ifdef SUPPORT_UTF8
698 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
699 #endif /* SUPPORT_UTF8 */
700 d = code[coptable[codevalue]];
701 if (codevalue >= OP_TYPESTAR)
702 {
703 switch(d)
704 {
705 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
706 case OP_NOTPROP:
707 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
708 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
709 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
710 case OP_NOT_HSPACE:
711 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
712 case OP_NOT_VSPACE:
713 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
714 default: break;
715 }
716 }
717 }
718 else
719 {
720 dlen = 0; /* Not strictly necessary, but compilers moan */
721 d = NOTACHAR; /* if these variables are not set. */
722 }
723
724
725 /* Now process the individual opcodes */
726
727 switch (codevalue)
728 {
729 /* ========================================================================== */
730 /* These cases are never obeyed. This is a fudge that causes a compile-
731 time error if the vectors coptable or poptable, which are indexed by
732 opcode, are not the correct length. It seems to be the only way to do
733 such a check at compile time, as the sizeof() operator does not work
734 in the C preprocessor. */
735
736 case OP_TABLE_LENGTH:
737 case OP_TABLE_LENGTH +
738 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
739 (sizeof(poptable) == OP_TABLE_LENGTH)):
740 break;
741
742 /* ========================================================================== */
743 /* Reached a closing bracket. If not at the end of the pattern, carry
744 on with the next opcode. For repeating opcodes, also add the repeat
745 state. Note that KETRPOS will always be encountered at the end of the
746 subpattern, because the possessive subpattern repeats are always handled
747 using recursive calls. Thus, it never adds any new states.
748
749 At the end of the (sub)pattern, unless we have an empty string and
750 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
751 start of the subject, save the match data, shifting up all previous
752 matches so we always have the longest first. */
753
754 case OP_KET:
755 case OP_KETRMIN:
756 case OP_KETRMAX:
757 case OP_KETRPOS:
758 if (code != end_code)
759 {
760 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
761 if (codevalue != OP_KET)
762 {
763 ADD_ACTIVE(state_offset - GET(code, 1), 0);
764 }
765 }
766 else
767 {
768 if (ptr > current_subject ||
769 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
770 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
771 current_subject > start_subject + md->start_offset)))
772 {
773 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
774 else if (match_count > 0 && ++match_count * 2 > offsetcount)
775 match_count = 0;
776 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
777 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
778 if (offsetcount >= 2)
779 {
780 offsets[0] = (int)(current_subject - start_subject);
781 offsets[1] = (int)(ptr - start_subject);
782 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
783 offsets[1] - offsets[0], current_subject));
784 }
785 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
786 {
787 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
788 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
789 match_count, rlevel*2-2, SP));
790 return match_count;
791 }
792 }
793 }
794 break;
795
796 /* ========================================================================== */
797 /* These opcodes add to the current list of states without looking
798 at the current character. */
799
800 /*-----------------------------------------------------------------*/
801 case OP_ALT:
802 do { code += GET(code, 1); } while (*code == OP_ALT);
803 ADD_ACTIVE((int)(code - start_code), 0);
804 break;
805
806 /*-----------------------------------------------------------------*/
807 case OP_BRA:
808 case OP_SBRA:
809 do
810 {
811 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
812 code += GET(code, 1);
813 }
814 while (*code == OP_ALT);
815 break;
816
817 /*-----------------------------------------------------------------*/
818 case OP_CBRA:
819 case OP_SCBRA:
820 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
821 code += GET(code, 1);
822 while (*code == OP_ALT)
823 {
824 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
825 code += GET(code, 1);
826 }
827 break;
828
829 /*-----------------------------------------------------------------*/
830 case OP_BRAZERO:
831 case OP_BRAMINZERO:
832 ADD_ACTIVE(state_offset + 1, 0);
833 code += 1 + GET(code, 2);
834 while (*code == OP_ALT) code += GET(code, 1);
835 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
836 break;
837
838 /*-----------------------------------------------------------------*/
839 case OP_SKIPZERO:
840 code += 1 + GET(code, 2);
841 while (*code == OP_ALT) code += GET(code, 1);
842 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
843 break;
844
845 /*-----------------------------------------------------------------*/
846 case OP_CIRC:
847 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
848 { ADD_ACTIVE(state_offset + 1, 0); }
849 break;
850
851 /*-----------------------------------------------------------------*/
852 case OP_CIRCM:
853 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
854 (ptr != end_subject && WAS_NEWLINE(ptr)))
855 { ADD_ACTIVE(state_offset + 1, 0); }
856 break;
857
858 /*-----------------------------------------------------------------*/
859 case OP_EOD:
860 if (ptr >= end_subject)
861 {
862 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
863 could_continue = TRUE;
864 else { ADD_ACTIVE(state_offset + 1, 0); }
865 }
866 break;
867
868 /*-----------------------------------------------------------------*/
869 case OP_SOD:
870 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
871 break;
872
873 /*-----------------------------------------------------------------*/
874 case OP_SOM:
875 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
876 break;
877
878
879 /* ========================================================================== */
880 /* These opcodes inspect the next subject character, and sometimes
881 the previous one as well, but do not have an argument. The variable
882 clen contains the length of the current character and is zero if we are
883 at the end of the subject. */
884
885 /*-----------------------------------------------------------------*/
886 case OP_ANY:
887 if (clen > 0 && !IS_NEWLINE(ptr))
888 { ADD_NEW(state_offset + 1, 0); }
889 break;
890
891 /*-----------------------------------------------------------------*/
892 case OP_ALLANY:
893 if (clen > 0)
894 { ADD_NEW(state_offset + 1, 0); }
895 break;
896
897 /*-----------------------------------------------------------------*/
898 case OP_EODN:
899 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
900 could_continue = TRUE;
901 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
902 { ADD_ACTIVE(state_offset + 1, 0); }
903 break;
904
905 /*-----------------------------------------------------------------*/
906 case OP_DOLL:
907 if ((md->moptions & PCRE_NOTEOL) == 0)
908 {
909 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
910 could_continue = TRUE;
911 else if (clen == 0 ||
912 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
913 (ptr == end_subject - md->nllen)
914 ))
915 { ADD_ACTIVE(state_offset + 1, 0); }
916 }
917 break;
918
919 /*-----------------------------------------------------------------*/
920 case OP_DOLLM:
921 if ((md->moptions & PCRE_NOTEOL) == 0)
922 {
923 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
924 could_continue = TRUE;
925 else if (clen == 0 ||
926 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
927 { ADD_ACTIVE(state_offset + 1, 0); }
928 }
929 else if (IS_NEWLINE(ptr))
930 { ADD_ACTIVE(state_offset + 1, 0); }
931 break;
932
933 /*-----------------------------------------------------------------*/
934
935 case OP_DIGIT:
936 case OP_WHITESPACE:
937 case OP_WORDCHAR:
938 if (clen > 0 && c < 256 &&
939 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
940 { ADD_NEW(state_offset + 1, 0); }
941 break;
942
943 /*-----------------------------------------------------------------*/
944 case OP_NOT_DIGIT:
945 case OP_NOT_WHITESPACE:
946 case OP_NOT_WORDCHAR:
947 if (clen > 0 && (c >= 256 ||
948 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
949 { ADD_NEW(state_offset + 1, 0); }
950 break;
951
952 /*-----------------------------------------------------------------*/
953 case OP_WORD_BOUNDARY:
954 case OP_NOT_WORD_BOUNDARY:
955 {
956 int left_word, right_word;
957
958 if (ptr > start_subject)
959 {
960 const pcre_uchar *temp = ptr - 1;
961 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
962 #ifdef SUPPORT_UTF8
963 if (utf) BACKCHAR(temp);
964 #endif
965 GETCHARTEST(d, temp);
966 #ifdef SUPPORT_UCP
967 if ((md->poptions & PCRE_UCP) != 0)
968 {
969 if (d == '_') left_word = TRUE; else
970 {
971 int cat = UCD_CATEGORY(d);
972 left_word = (cat == ucp_L || cat == ucp_N);
973 }
974 }
975 else
976 #endif
977 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
978 }
979 else left_word = FALSE;
980
981 if (clen > 0)
982 {
983 #ifdef SUPPORT_UCP
984 if ((md->poptions & PCRE_UCP) != 0)
985 {
986 if (c == '_') right_word = TRUE; else
987 {
988 int cat = UCD_CATEGORY(c);
989 right_word = (cat == ucp_L || cat == ucp_N);
990 }
991 }
992 else
993 #endif
994 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
995 }
996 else right_word = FALSE;
997
998 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
999 { ADD_ACTIVE(state_offset + 1, 0); }
1000 }
1001 break;
1002
1003
1004 /*-----------------------------------------------------------------*/
1005 /* Check the next character by Unicode property. We will get here only
1006 if the support is in the binary; otherwise a compile-time error occurs.
1007 */
1008
1009 #ifdef SUPPORT_UCP
1010 case OP_PROP:
1011 case OP_NOTPROP:
1012 if (clen > 0)
1013 {
1014 BOOL OK;
1015 const ucd_record * prop = GET_UCD(c);
1016 switch(code[1])
1017 {
1018 case PT_ANY:
1019 OK = TRUE;
1020 break;
1021
1022 case PT_LAMP:
1023 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1024 prop->chartype == ucp_Lt;
1025 break;
1026
1027 case PT_GC:
1028 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1029 break;
1030
1031 case PT_PC:
1032 OK = prop->chartype == code[2];
1033 break;
1034
1035 case PT_SC:
1036 OK = prop->script == code[2];
1037 break;
1038
1039 /* These are specials for combination cases. */
1040
1041 case PT_ALNUM:
1042 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1043 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1044 break;
1045
1046 case PT_SPACE: /* Perl space */
1047 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1048 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1049 break;
1050
1051 case PT_PXSPACE: /* POSIX space */
1052 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1053 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1054 c == CHAR_FF || c == CHAR_CR;
1055 break;
1056
1057 case PT_WORD:
1058 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1059 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1060 c == CHAR_UNDERSCORE;
1061 break;
1062
1063 /* Should never occur, but keep compilers from grumbling. */
1064
1065 default:
1066 OK = codevalue != OP_PROP;
1067 break;
1068 }
1069
1070 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1071 }
1072 break;
1073 #endif
1074
1075
1076
1077 /* ========================================================================== */
1078 /* These opcodes likewise inspect the subject character, but have an
1079 argument that is not a data character. It is one of these opcodes:
1080 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1081 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1082
1083 case OP_TYPEPLUS:
1084 case OP_TYPEMINPLUS:
1085 case OP_TYPEPOSPLUS:
1086 count = current_state->count; /* Already matched */
1087 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1088 if (clen > 0)
1089 {
1090 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1091 (c < 256 &&
1092 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1093 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1094 {
1095 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1096 {
1097 active_count--; /* Remove non-match possibility */
1098 next_active_state--;
1099 }
1100 count++;
1101 ADD_NEW(state_offset, count);
1102 }
1103 }
1104 break;
1105
1106 /*-----------------------------------------------------------------*/
1107 case OP_TYPEQUERY:
1108 case OP_TYPEMINQUERY:
1109 case OP_TYPEPOSQUERY:
1110 ADD_ACTIVE(state_offset + 2, 0);
1111 if (clen > 0)
1112 {
1113 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1114 (c < 256 &&
1115 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1116 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1117 {
1118 if (codevalue == OP_TYPEPOSQUERY)
1119 {
1120 active_count--; /* Remove non-match possibility */
1121 next_active_state--;
1122 }
1123 ADD_NEW(state_offset + 2, 0);
1124 }
1125 }
1126 break;
1127
1128 /*-----------------------------------------------------------------*/
1129 case OP_TYPESTAR:
1130 case OP_TYPEMINSTAR:
1131 case OP_TYPEPOSSTAR:
1132 ADD_ACTIVE(state_offset + 2, 0);
1133 if (clen > 0)
1134 {
1135 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1136 (c < 256 &&
1137 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1138 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1139 {
1140 if (codevalue == OP_TYPEPOSSTAR)
1141 {
1142 active_count--; /* Remove non-match possibility */
1143 next_active_state--;
1144 }
1145 ADD_NEW(state_offset, 0);
1146 }
1147 }
1148 break;
1149
1150 /*-----------------------------------------------------------------*/
1151 case OP_TYPEEXACT:
1152 count = current_state->count; /* Number already matched */
1153 if (clen > 0)
1154 {
1155 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1156 (c < 256 &&
1157 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1158 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1159 {
1160 if (++count >= GET2(code, 1))
1161 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1162 else
1163 { ADD_NEW(state_offset, count); }
1164 }
1165 }
1166 break;
1167
1168 /*-----------------------------------------------------------------*/
1169 case OP_TYPEUPTO:
1170 case OP_TYPEMINUPTO:
1171 case OP_TYPEPOSUPTO:
1172 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1173 count = current_state->count; /* Number already matched */
1174 if (clen > 0)
1175 {
1176 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1177 (c < 256 &&
1178 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1179 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1180 {
1181 if (codevalue == OP_TYPEPOSUPTO)
1182 {
1183 active_count--; /* Remove non-match possibility */
1184 next_active_state--;
1185 }
1186 if (++count >= GET2(code, 1))
1187 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1188 else
1189 { ADD_NEW(state_offset, count); }
1190 }
1191 }
1192 break;
1193
1194 /* ========================================================================== */
1195 /* These are virtual opcodes that are used when something like OP_TYPEPLUS
1196 has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a horizontal/vertical space
1197 opcode (e.g. OP_HSPACE, OP_VSPACE) as its argument. This keeps the code above
1198 fast for the other cases. The argument is in the d variable. */
1199
1200 #ifdef SUPPORT_UCP
1201 case OP_PROP_EXTRA + OP_TYPEPLUS:
1202 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1203 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1204 count = current_state->count; /* Already matched */
1205 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1206 if (clen > 0)
1207 {
1208 BOOL OK;
1209 const ucd_record * prop = GET_UCD(c);
1210 switch(code[2])
1211 {
1212 case PT_ANY:
1213 OK = TRUE;
1214 break;
1215
1216 case PT_LAMP:
1217 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1218 prop->chartype == ucp_Lt;
1219 break;
1220
1221 case PT_GC:
1222 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1223 break;
1224
1225 case PT_PC:
1226 OK = prop->chartype == code[3];
1227 break;
1228
1229 case PT_SC:
1230 OK = prop->script == code[3];
1231 break;
1232
1233 /* These are specials for combination cases. */
1234
1235 case PT_ALNUM:
1236 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1237 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1238 break;
1239
1240 case PT_SPACE: /* Perl space */
1241 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1242 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1243 break;
1244
1245 case PT_PXSPACE: /* POSIX space */
1246 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1247 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1248 c == CHAR_FF || c == CHAR_CR;
1249 break;
1250
1251 case PT_WORD:
1252 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1253 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1254 c == CHAR_UNDERSCORE;
1255 break;
1256
1257 /* Should never occur, but keep compilers from grumbling. */
1258
1259 default:
1260 OK = codevalue != OP_PROP;
1261 break;
1262 }
1263
1264 if (OK == (d == OP_PROP))
1265 {
1266 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1267 {
1268 active_count--; /* Remove non-match possibility */
1269 next_active_state--;
1270 }
1271 count++;
1272 ADD_NEW(state_offset, count);
1273 }
1274 }
1275 break;
1276
1277 /*-----------------------------------------------------------------*/
1278 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1279 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1280 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1281 count = current_state->count; /* Already matched */
1282 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1283 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1284 {
1285 const pcre_uchar *nptr = ptr + clen;
1286 int ncount = 0;
1287 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1288 {
1289 active_count--; /* Remove non-match possibility */
1290 next_active_state--;
1291 }
1292 while (nptr < end_subject)
1293 {
1294 int nd;
1295 int ndlen = 1;
1296 GETCHARLEN(nd, nptr, ndlen);
1297 if (UCD_CATEGORY(nd) != ucp_M) break;
1298 ncount++;
1299 nptr += ndlen;
1300 }
1301 count++;
1302 ADD_NEW_DATA(-state_offset, count, ncount);
1303 }
1304 break;
1305 #endif
1306
1307 /*-----------------------------------------------------------------*/
1308 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1309 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1310 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1311 count = current_state->count; /* Already matched */
1312 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1313 if (clen > 0)
1314 {
1315 int ncount = 0;
1316 switch (c)
1317 {
1318 case 0x000b:
1319 case 0x000c:
1320 case 0x0085:
1321 case 0x2028:
1322 case 0x2029:
1323 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1324 goto ANYNL01;
1325
1326 case 0x000d:
1327 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1328 /* Fall through */
1329
1330 ANYNL01:
1331 case 0x000a:
1332 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1333 {
1334 active_count--; /* Remove non-match possibility */
1335 next_active_state--;
1336 }
1337 count++;
1338 ADD_NEW_DATA(-state_offset, count, ncount);
1339 break;
1340
1341 default:
1342 break;
1343 }
1344 }
1345 break;
1346
1347 /*-----------------------------------------------------------------*/
1348 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1349 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1350 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1351 count = current_state->count; /* Already matched */
1352 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1353 if (clen > 0)
1354 {
1355 BOOL OK;
1356 switch (c)
1357 {
1358 case 0x000a:
1359 case 0x000b:
1360 case 0x000c:
1361 case 0x000d:
1362 case 0x0085:
1363 case 0x2028:
1364 case 0x2029:
1365 OK = TRUE;
1366 break;
1367
1368 default:
1369 OK = FALSE;
1370 break;
1371 }
1372
1373 if (OK == (d == OP_VSPACE))
1374 {
1375 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1376 {
1377 active_count--; /* Remove non-match possibility */
1378 next_active_state--;
1379 }
1380 count++;
1381 ADD_NEW_DATA(-state_offset, count, 0);
1382 }
1383 }
1384 break;
1385
1386 /*-----------------------------------------------------------------*/
1387 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1388 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1389 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1390 count = current_state->count; /* Already matched */
1391 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1392 if (clen > 0)
1393 {
1394 BOOL OK;
1395 switch (c)
1396 {
1397 case 0x09: /* HT */
1398 case 0x20: /* SPACE */
1399 case 0xa0: /* NBSP */
1400 case 0x1680: /* OGHAM SPACE MARK */
1401 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1402 case 0x2000: /* EN QUAD */
1403 case 0x2001: /* EM QUAD */
1404 case 0x2002: /* EN SPACE */
1405 case 0x2003: /* EM SPACE */
1406 case 0x2004: /* THREE-PER-EM SPACE */
1407 case 0x2005: /* FOUR-PER-EM SPACE */
1408 case 0x2006: /* SIX-PER-EM SPACE */
1409 case 0x2007: /* FIGURE SPACE */
1410 case 0x2008: /* PUNCTUATION SPACE */
1411 case 0x2009: /* THIN SPACE */
1412 case 0x200A: /* HAIR SPACE */
1413 case 0x202f: /* NARROW NO-BREAK SPACE */
1414 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1415 case 0x3000: /* IDEOGRAPHIC SPACE */
1416 OK = TRUE;
1417 break;
1418
1419 default:
1420 OK = FALSE;
1421 break;
1422 }
1423
1424 if (OK == (d == OP_HSPACE))
1425 {
1426 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1427 {
1428 active_count--; /* Remove non-match possibility */
1429 next_active_state--;
1430 }
1431 count++;
1432 ADD_NEW_DATA(-state_offset, count, 0);
1433 }
1434 }
1435 break;
1436
1437 /*-----------------------------------------------------------------*/
1438 #ifdef SUPPORT_UCP
1439 case OP_PROP_EXTRA + OP_TYPEQUERY:
1440 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1441 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1442 count = 4;
1443 goto QS1;
1444
1445 case OP_PROP_EXTRA + OP_TYPESTAR:
1446 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1447 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1448 count = 0;
1449
1450 QS1:
1451
1452 ADD_ACTIVE(state_offset + 4, 0);
1453 if (clen > 0)
1454 {
1455 BOOL OK;
1456 const ucd_record * prop = GET_UCD(c);
1457 switch(code[2])
1458 {
1459 case PT_ANY:
1460 OK = TRUE;
1461 break;
1462
1463 case PT_LAMP:
1464 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1465 prop->chartype == ucp_Lt;
1466 break;
1467
1468 case PT_GC:
1469 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1470 break;
1471
1472 case PT_PC:
1473 OK = prop->chartype == code[3];
1474 break;
1475
1476 case PT_SC:
1477 OK = prop->script == code[3];
1478 break;
1479
1480 /* These are specials for combination cases. */
1481
1482 case PT_ALNUM:
1483 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1484 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1485 break;
1486
1487 case PT_SPACE: /* Perl space */
1488 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1489 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1490 break;
1491
1492 case PT_PXSPACE: /* POSIX space */
1493 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1494 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1495 c == CHAR_FF || c == CHAR_CR;
1496 break;
1497
1498 case PT_WORD:
1499 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1500 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1501 c == CHAR_UNDERSCORE;
1502 break;
1503
1504 /* Should never occur, but keep compilers from grumbling. */
1505
1506 default:
1507 OK = codevalue != OP_PROP;
1508 break;
1509 }
1510
1511 if (OK == (d == OP_PROP))
1512 {
1513 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1514 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1515 {
1516 active_count--; /* Remove non-match possibility */
1517 next_active_state--;
1518 }
1519 ADD_NEW(state_offset + count, 0);
1520 }
1521 }
1522 break;
1523
1524 /*-----------------------------------------------------------------*/
1525 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1526 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1527 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1528 count = 2;
1529 goto QS2;
1530
1531 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1532 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1533 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1534 count = 0;
1535
1536 QS2:
1537
1538 ADD_ACTIVE(state_offset + 2, 0);
1539 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1540 {
1541 const pcre_uchar *nptr = ptr + clen;
1542 int ncount = 0;
1543 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1544 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1545 {
1546 active_count--; /* Remove non-match possibility */
1547 next_active_state--;
1548 }
1549 while (nptr < end_subject)
1550 {
1551 int nd;
1552 int ndlen = 1;
1553 GETCHARLEN(nd, nptr, ndlen);
1554 if (UCD_CATEGORY(nd) != ucp_M) break;
1555 ncount++;
1556 nptr += ndlen;
1557 }
1558 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1559 }
1560 break;
1561 #endif
1562
1563 /*-----------------------------------------------------------------*/
1564 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1565 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1566 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1567 count = 2;
1568 goto QS3;
1569
1570 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1571 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1572 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1573 count = 0;
1574
1575 QS3:
1576 ADD_ACTIVE(state_offset + 2, 0);
1577 if (clen > 0)
1578 {
1579 int ncount = 0;
1580 switch (c)
1581 {
1582 case 0x000b:
1583 case 0x000c:
1584 case 0x0085:
1585 case 0x2028:
1586 case 0x2029:
1587 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1588 goto ANYNL02;
1589
1590 case 0x000d:
1591 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1592 /* Fall through */
1593
1594 ANYNL02:
1595 case 0x000a:
1596 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1597 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1598 {
1599 active_count--; /* Remove non-match possibility */
1600 next_active_state--;
1601 }
1602 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1603 break;
1604
1605 default:
1606 break;
1607 }
1608 }
1609 break;
1610
1611 /*-----------------------------------------------------------------*/
1612 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1613 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1614 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1615 count = 2;
1616 goto QS4;
1617
1618 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1619 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1620 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1621 count = 0;
1622
1623 QS4:
1624 ADD_ACTIVE(state_offset + 2, 0);
1625 if (clen > 0)
1626 {
1627 BOOL OK;
1628 switch (c)
1629 {
1630 case 0x000a:
1631 case 0x000b:
1632 case 0x000c:
1633 case 0x000d:
1634 case 0x0085:
1635 case 0x2028:
1636 case 0x2029:
1637 OK = TRUE;
1638 break;
1639
1640 default:
1641 OK = FALSE;
1642 break;
1643 }
1644 if (OK == (d == OP_VSPACE))
1645 {
1646 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1647 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1648 {
1649 active_count--; /* Remove non-match possibility */
1650 next_active_state--;
1651 }
1652 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1653 }
1654 }
1655 break;
1656
1657 /*-----------------------------------------------------------------*/
1658 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1659 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1660 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1661 count = 2;
1662 goto QS5;
1663
1664 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1665 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1666 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1667 count = 0;
1668
1669 QS5:
1670 ADD_ACTIVE(state_offset + 2, 0);
1671 if (clen > 0)
1672 {
1673 BOOL OK;
1674 switch (c)
1675 {
1676 case 0x09: /* HT */
1677 case 0x20: /* SPACE */
1678 case 0xa0: /* NBSP */
1679 case 0x1680: /* OGHAM SPACE MARK */
1680 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1681 case 0x2000: /* EN QUAD */
1682 case 0x2001: /* EM QUAD */
1683 case 0x2002: /* EN SPACE */
1684 case 0x2003: /* EM SPACE */
1685 case 0x2004: /* THREE-PER-EM SPACE */
1686 case 0x2005: /* FOUR-PER-EM SPACE */
1687 case 0x2006: /* SIX-PER-EM SPACE */
1688 case 0x2007: /* FIGURE SPACE */
1689 case 0x2008: /* PUNCTUATION SPACE */
1690 case 0x2009: /* THIN SPACE */
1691 case 0x200A: /* HAIR SPACE */
1692 case 0x202f: /* NARROW NO-BREAK SPACE */
1693 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1694 case 0x3000: /* IDEOGRAPHIC SPACE */
1695 OK = TRUE;
1696 break;
1697
1698 default:
1699 OK = FALSE;
1700 break;
1701 }
1702
1703 if (OK == (d == OP_HSPACE))
1704 {
1705 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1706 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1707 {
1708 active_count--; /* Remove non-match possibility */
1709 next_active_state--;
1710 }
1711 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1712 }
1713 }
1714 break;
1715
1716 /*-----------------------------------------------------------------*/
1717 #ifdef SUPPORT_UCP
1718 case OP_PROP_EXTRA + OP_TYPEEXACT:
1719 case OP_PROP_EXTRA + OP_TYPEUPTO:
1720 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1721 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1722 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1723 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1724 count = current_state->count; /* Number already matched */
1725 if (clen > 0)
1726 {
1727 BOOL OK;
1728 const ucd_record * prop = GET_UCD(c);
1729 switch(code[1 + IMM2_SIZE + 1])
1730 {
1731 case PT_ANY:
1732 OK = TRUE;
1733 break;
1734
1735 case PT_LAMP:
1736 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1737 prop->chartype == ucp_Lt;
1738 break;
1739
1740 case PT_GC:
1741 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1742 break;
1743
1744 case PT_PC:
1745 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1746 break;
1747
1748 case PT_SC:
1749 OK = prop->script == code[1 + IMM2_SIZE + 2];
1750 break;
1751
1752 /* These are specials for combination cases. */
1753
1754 case PT_ALNUM:
1755 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1756 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1757 break;
1758
1759 case PT_SPACE: /* Perl space */
1760 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1761 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1762 break;
1763
1764 case PT_PXSPACE: /* POSIX space */
1765 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1766 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1767 c == CHAR_FF || c == CHAR_CR;
1768 break;
1769
1770 case PT_WORD:
1771 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1772 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1773 c == CHAR_UNDERSCORE;
1774 break;
1775
1776 /* Should never occur, but keep compilers from grumbling. */
1777
1778 default:
1779 OK = codevalue != OP_PROP;
1780 break;
1781 }
1782
1783 if (OK == (d == OP_PROP))
1784 {
1785 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1786 {
1787 active_count--; /* Remove non-match possibility */
1788 next_active_state--;
1789 }
1790 if (++count >= GET2(code, 1))
1791 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1792 else
1793 { ADD_NEW(state_offset, count); }
1794 }
1795 }
1796 break;
1797
1798 /*-----------------------------------------------------------------*/
1799 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1800 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1801 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1802 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1803 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1804 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1805 count = current_state->count; /* Number already matched */
1806 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1807 {
1808 const pcre_uchar *nptr = ptr + clen;
1809 int ncount = 0;
1810 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1811 {
1812 active_count--; /* Remove non-match possibility */
1813 next_active_state--;
1814 }
1815 while (nptr < end_subject)
1816 {
1817 int nd;
1818 int ndlen = 1;
1819 GETCHARLEN(nd, nptr, ndlen);
1820 if (UCD_CATEGORY(nd) != ucp_M) break;
1821 ncount++;
1822 nptr += ndlen;
1823 }
1824 if (++count >= GET2(code, 1))
1825 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1826 else
1827 { ADD_NEW_DATA(-state_offset, count, ncount); }
1828 }
1829 break;
1830 #endif
1831
1832 /*-----------------------------------------------------------------*/
1833 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1834 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1835 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1836 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1837 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1838 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1839 count = current_state->count; /* Number already matched */
1840 if (clen > 0)
1841 {
1842 int ncount = 0;
1843 switch (c)
1844 {
1845 case 0x000b:
1846 case 0x000c:
1847 case 0x0085:
1848 case 0x2028:
1849 case 0x2029:
1850 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1851 goto ANYNL03;
1852
1853 case 0x000d:
1854 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1855 /* Fall through */
1856
1857 ANYNL03:
1858 case 0x000a:
1859 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1860 {
1861 active_count--; /* Remove non-match possibility */
1862 next_active_state--;
1863 }
1864 if (++count >= GET2(code, 1))
1865 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1866 else
1867 { ADD_NEW_DATA(-state_offset, count, ncount); }
1868 break;
1869
1870 default:
1871 break;
1872 }
1873 }
1874 break;
1875
1876 /*-----------------------------------------------------------------*/
1877 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1878 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1879 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1880 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1881 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1882 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1883 count = current_state->count; /* Number already matched */
1884 if (clen > 0)
1885 {
1886 BOOL OK;
1887 switch (c)
1888 {
1889 case 0x000a:
1890 case 0x000b:
1891 case 0x000c:
1892 case 0x000d:
1893 case 0x0085:
1894 case 0x2028:
1895 case 0x2029:
1896 OK = TRUE;
1897 break;
1898
1899 default:
1900 OK = FALSE;
1901 }
1902
1903 if (OK == (d == OP_VSPACE))
1904 {
1905 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1906 {
1907 active_count--; /* Remove non-match possibility */
1908 next_active_state--;
1909 }
1910 if (++count >= GET2(code, 1))
1911 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1912 else
1913 { ADD_NEW_DATA(-state_offset, count, 0); }
1914 }
1915 }
1916 break;
1917
1918 /*-----------------------------------------------------------------*/
1919 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1920 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1921 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1922 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1923 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1924 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925 count = current_state->count; /* Number already matched */
1926 if (clen > 0)
1927 {
1928 BOOL OK;
1929 switch (c)
1930 {
1931 case 0x09: /* HT */
1932 case 0x20: /* SPACE */
1933 case 0xa0: /* NBSP */
1934 case 0x1680: /* OGHAM SPACE MARK */
1935 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1936 case 0x2000: /* EN QUAD */
1937 case 0x2001: /* EM QUAD */
1938 case 0x2002: /* EN SPACE */
1939 case 0x2003: /* EM SPACE */
1940 case 0x2004: /* THREE-PER-EM SPACE */
1941 case 0x2005: /* FOUR-PER-EM SPACE */
1942 case 0x2006: /* SIX-PER-EM SPACE */
1943 case 0x2007: /* FIGURE SPACE */
1944 case 0x2008: /* PUNCTUATION SPACE */
1945 case 0x2009: /* THIN SPACE */
1946 case 0x200A: /* HAIR SPACE */
1947 case 0x202f: /* NARROW NO-BREAK SPACE */
1948 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1949 case 0x3000: /* IDEOGRAPHIC SPACE */
1950 OK = TRUE;
1951 break;
1952
1953 default:
1954 OK = FALSE;
1955 break;
1956 }
1957
1958 if (OK == (d == OP_HSPACE))
1959 {
1960 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1961 {
1962 active_count--; /* Remove non-match possibility */
1963 next_active_state--;
1964 }
1965 if (++count >= GET2(code, 1))
1966 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1967 else
1968 { ADD_NEW_DATA(-state_offset, count, 0); }
1969 }
1970 }
1971 break;
1972
1973 /* ========================================================================== */
1974 /* These opcodes are followed by a character that is usually compared
1975 to the current subject character; it is loaded into d. We still get
1976 here even if there is no subject character, because in some cases zero
1977 repetitions are permitted. */
1978
1979 /*-----------------------------------------------------------------*/
1980 case OP_CHAR:
1981 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1982 break;
1983
1984 /*-----------------------------------------------------------------*/
1985 case OP_CHARI:
1986 if (clen == 0) break;
1987
1988 #ifdef SUPPORT_UTF8
1989 if (utf)
1990 {
1991 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1992 {
1993 unsigned int othercase;
1994 if (c < 128) othercase = fcc[c]; else
1995
1996 /* If we have Unicode property support, we can use it to test the
1997 other case of the character. */
1998
1999 #ifdef SUPPORT_UCP
2000 othercase = UCD_OTHERCASE(c);
2001 #else
2002 othercase = NOTACHAR;
2003 #endif
2004
2005 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2006 }
2007 }
2008 else
2009 #endif /* SUPPORT_UTF8 */
2010 /* Not UTF mode */
2011 {
2012 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2013 }
2014 break;
2015
2016
2017 #ifdef SUPPORT_UCP
2018 /*-----------------------------------------------------------------*/
2019 /* This is a tricky one because it can match more than one character.
2020 Find out how many characters to skip, and then set up a negative state
2021 to wait for them to pass before continuing. */
2022
2023 case OP_EXTUNI:
2024 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2025 {
2026 const pcre_uchar *nptr = ptr + clen;
2027 int ncount = 0;
2028 while (nptr < end_subject)
2029 {
2030 int nclen = 1;
2031 GETCHARLEN(c, nptr, nclen);
2032 if (UCD_CATEGORY(c) != ucp_M) break;
2033 ncount++;
2034 nptr += nclen;
2035 }
2036 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2037 }
2038 break;
2039 #endif
2040
2041 /*-----------------------------------------------------------------*/
2042 /* This is tricky like EXTUNI because it too can match more than one
2043 character (when CR is followed by LF). In this case, set up a negative
2044 state to wait for one character to pass before continuing. */
2045
2046 case OP_ANYNL:
2047 if (clen > 0) switch(c)
2048 {
2049 case 0x000b:
2050 case 0x000c:
2051 case 0x0085:
2052 case 0x2028:
2053 case 0x2029:
2054 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2055
2056 case 0x000a:
2057 ADD_NEW(state_offset + 1, 0);
2058 break;
2059
2060 case 0x000d:
2061 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2062 {
2063 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2064 }
2065 else
2066 {
2067 ADD_NEW(state_offset + 1, 0);
2068 }
2069 break;
2070 }
2071 break;
2072
2073 /*-----------------------------------------------------------------*/
2074 case OP_NOT_VSPACE:
2075 if (clen > 0) switch(c)
2076 {
2077 case 0x000a:
2078 case 0x000b:
2079 case 0x000c:
2080 case 0x000d:
2081 case 0x0085:
2082 case 0x2028:
2083 case 0x2029:
2084 break;
2085
2086 default:
2087 ADD_NEW(state_offset + 1, 0);
2088 break;
2089 }
2090 break;
2091
2092 /*-----------------------------------------------------------------*/
2093 case OP_VSPACE:
2094 if (clen > 0) switch(c)
2095 {
2096 case 0x000a:
2097 case 0x000b:
2098 case 0x000c:
2099 case 0x000d:
2100 case 0x0085:
2101 case 0x2028:
2102 case 0x2029:
2103 ADD_NEW(state_offset + 1, 0);
2104 break;
2105
2106 default: break;
2107 }
2108 break;
2109
2110 /*-----------------------------------------------------------------*/
2111 case OP_NOT_HSPACE:
2112 if (clen > 0) switch(c)
2113 {
2114 case 0x09: /* HT */
2115 case 0x20: /* SPACE */
2116 case 0xa0: /* NBSP */
2117 case 0x1680: /* OGHAM SPACE MARK */
2118 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2119 case 0x2000: /* EN QUAD */
2120 case 0x2001: /* EM QUAD */
2121 case 0x2002: /* EN SPACE */
2122 case 0x2003: /* EM SPACE */
2123 case 0x2004: /* THREE-PER-EM SPACE */
2124 case 0x2005: /* FOUR-PER-EM SPACE */
2125 case 0x2006: /* SIX-PER-EM SPACE */
2126 case 0x2007: /* FIGURE SPACE */
2127 case 0x2008: /* PUNCTUATION SPACE */
2128 case 0x2009: /* THIN SPACE */
2129 case 0x200A: /* HAIR SPACE */
2130 case 0x202f: /* NARROW NO-BREAK SPACE */
2131 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2132 case 0x3000: /* IDEOGRAPHIC SPACE */
2133 break;
2134
2135 default:
2136 ADD_NEW(state_offset + 1, 0);
2137 break;
2138 }
2139 break;
2140
2141 /*-----------------------------------------------------------------*/
2142 case OP_HSPACE:
2143 if (clen > 0) switch(c)
2144 {
2145 case 0x09: /* HT */
2146 case 0x20: /* SPACE */
2147 case 0xa0: /* NBSP */
2148 case 0x1680: /* OGHAM SPACE MARK */
2149 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2150 case 0x2000: /* EN QUAD */
2151 case 0x2001: /* EM QUAD */
2152 case 0x2002: /* EN SPACE */
2153 case 0x2003: /* EM SPACE */
2154 case 0x2004: /* THREE-PER-EM SPACE */
2155 case 0x2005: /* FOUR-PER-EM SPACE */
2156 case 0x2006: /* SIX-PER-EM SPACE */
2157 case 0x2007: /* FIGURE SPACE */
2158 case 0x2008: /* PUNCTUATION SPACE */
2159 case 0x2009: /* THIN SPACE */
2160 case 0x200A: /* HAIR SPACE */
2161 case 0x202f: /* NARROW NO-BREAK SPACE */
2162 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2163 case 0x3000: /* IDEOGRAPHIC SPACE */
2164 ADD_NEW(state_offset + 1, 0);
2165 break;
2166 }
2167 break;
2168
2169 /*-----------------------------------------------------------------*/
2170 /* Match a negated single character casefully. This is only used for
2171 one-byte characters, that is, we know that d < 256. The character we are
2172 checking (c) can be multibyte. */
2173
2174 case OP_NOT:
2175 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2176 break;
2177
2178 /*-----------------------------------------------------------------*/
2179 /* Match a negated single character caselessly. This is only used for
2180 one-byte characters, that is, we know that d < 256. The character we are
2181 checking (c) can be multibyte. */
2182
2183 case OP_NOTI:
2184 if (clen > 0 && c != d && c != fcc[d])
2185 { ADD_NEW(state_offset + dlen + 1, 0); }
2186 break;
2187
2188 /*-----------------------------------------------------------------*/
2189 case OP_PLUSI:
2190 case OP_MINPLUSI:
2191 case OP_POSPLUSI:
2192 case OP_NOTPLUSI:
2193 case OP_NOTMINPLUSI:
2194 case OP_NOTPOSPLUSI:
2195 caseless = TRUE;
2196 codevalue -= OP_STARI - OP_STAR;
2197
2198 /* Fall through */
2199 case OP_PLUS:
2200 case OP_MINPLUS:
2201 case OP_POSPLUS:
2202 case OP_NOTPLUS:
2203 case OP_NOTMINPLUS:
2204 case OP_NOTPOSPLUS:
2205 count = current_state->count; /* Already matched */
2206 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2207 if (clen > 0)
2208 {
2209 unsigned int otherd = NOTACHAR;
2210 if (caseless)
2211 {
2212 #ifdef SUPPORT_UTF8
2213 if (utf && d >= 128)
2214 {
2215 #ifdef SUPPORT_UCP
2216 otherd = UCD_OTHERCASE(d);
2217 #endif /* SUPPORT_UCP */
2218 }
2219 else
2220 #endif /* SUPPORT_UTF8 */
2221 otherd = fcc[d];
2222 }
2223 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2224 {
2225 if (count > 0 &&
2226 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2227 {
2228 active_count--; /* Remove non-match possibility */
2229 next_active_state--;
2230 }
2231 count++;
2232 ADD_NEW(state_offset, count);
2233 }
2234 }
2235 break;
2236
2237 /*-----------------------------------------------------------------*/
2238 case OP_QUERYI:
2239 case OP_MINQUERYI:
2240 case OP_POSQUERYI:
2241 case OP_NOTQUERYI:
2242 case OP_NOTMINQUERYI:
2243 case OP_NOTPOSQUERYI:
2244 caseless = TRUE;
2245 codevalue -= OP_STARI - OP_STAR;
2246 /* Fall through */
2247 case OP_QUERY:
2248 case OP_MINQUERY:
2249 case OP_POSQUERY:
2250 case OP_NOTQUERY:
2251 case OP_NOTMINQUERY:
2252 case OP_NOTPOSQUERY:
2253 ADD_ACTIVE(state_offset + dlen + 1, 0);
2254 if (clen > 0)
2255 {
2256 unsigned int otherd = NOTACHAR;
2257 if (caseless)
2258 {
2259 #ifdef SUPPORT_UTF8
2260 if (utf && d >= 128)
2261 {
2262 #ifdef SUPPORT_UCP
2263 otherd = UCD_OTHERCASE(d);
2264 #endif /* SUPPORT_UCP */
2265 }
2266 else
2267 #endif /* SUPPORT_UTF8 */
2268 otherd = fcc[d];
2269 }
2270 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2271 {
2272 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2273 {
2274 active_count--; /* Remove non-match possibility */
2275 next_active_state--;
2276 }
2277 ADD_NEW(state_offset + dlen + 1, 0);
2278 }
2279 }
2280 break;
2281
2282 /*-----------------------------------------------------------------*/
2283 case OP_STARI:
2284 case OP_MINSTARI:
2285 case OP_POSSTARI:
2286 case OP_NOTSTARI:
2287 case OP_NOTMINSTARI:
2288 case OP_NOTPOSSTARI:
2289 caseless = TRUE;
2290 codevalue -= OP_STARI - OP_STAR;
2291 /* Fall through */
2292 case OP_STAR:
2293 case OP_MINSTAR:
2294 case OP_POSSTAR:
2295 case OP_NOTSTAR:
2296 case OP_NOTMINSTAR:
2297 case OP_NOTPOSSTAR:
2298 ADD_ACTIVE(state_offset + dlen + 1, 0);
2299 if (clen > 0)
2300 {
2301 unsigned int otherd = NOTACHAR;
2302 if (caseless)
2303 {
2304 #ifdef SUPPORT_UTF8
2305 if (utf && d >= 128)
2306 {
2307 #ifdef SUPPORT_UCP
2308 otherd = UCD_OTHERCASE(d);
2309 #endif /* SUPPORT_UCP */
2310 }
2311 else
2312 #endif /* SUPPORT_UTF8 */
2313 otherd = fcc[d];
2314 }
2315 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2316 {
2317 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2318 {
2319 active_count--; /* Remove non-match possibility */
2320 next_active_state--;
2321 }
2322 ADD_NEW(state_offset, 0);
2323 }
2324 }
2325 break;
2326
2327 /*-----------------------------------------------------------------*/
2328 case OP_EXACTI:
2329 case OP_NOTEXACTI:
2330 caseless = TRUE;
2331 codevalue -= OP_STARI - OP_STAR;
2332 /* Fall through */
2333 case OP_EXACT:
2334 case OP_NOTEXACT:
2335 count = current_state->count; /* Number already matched */
2336 if (clen > 0)
2337 {
2338 unsigned int otherd = NOTACHAR;
2339 if (caseless)
2340 {
2341 #ifdef SUPPORT_UTF8
2342 if (utf && d >= 128)
2343 {
2344 #ifdef SUPPORT_UCP
2345 otherd = UCD_OTHERCASE(d);
2346 #endif /* SUPPORT_UCP */
2347 }
2348 else
2349 #endif /* SUPPORT_UTF8 */
2350 otherd = fcc[d];
2351 }
2352 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2353 {
2354 if (++count >= GET2(code, 1))
2355 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2356 else
2357 { ADD_NEW(state_offset, count); }
2358 }
2359 }
2360 break;
2361
2362 /*-----------------------------------------------------------------*/
2363 case OP_UPTOI:
2364 case OP_MINUPTOI:
2365 case OP_POSUPTOI:
2366 case OP_NOTUPTOI:
2367 case OP_NOTMINUPTOI:
2368 case OP_NOTPOSUPTOI:
2369 caseless = TRUE;
2370 codevalue -= OP_STARI - OP_STAR;
2371 /* Fall through */
2372 case OP_UPTO:
2373 case OP_MINUPTO:
2374 case OP_POSUPTO:
2375 case OP_NOTUPTO:
2376 case OP_NOTMINUPTO:
2377 case OP_NOTPOSUPTO:
2378 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2379 count = current_state->count; /* Number already matched */
2380 if (clen > 0)
2381 {
2382 unsigned int otherd = NOTACHAR;
2383 if (caseless)
2384 {
2385 #ifdef SUPPORT_UTF8
2386 if (utf && d >= 128)
2387 {
2388 #ifdef SUPPORT_UCP
2389 otherd = UCD_OTHERCASE(d);
2390 #endif /* SUPPORT_UCP */
2391 }
2392 else
2393 #endif /* SUPPORT_UTF8 */
2394 otherd = fcc[d];
2395 }
2396 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2397 {
2398 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2399 {
2400 active_count--; /* Remove non-match possibility */
2401 next_active_state--;
2402 }
2403 if (++count >= GET2(code, 1))
2404 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2405 else
2406 { ADD_NEW(state_offset, count); }
2407 }
2408 }
2409 break;
2410
2411
2412 /* ========================================================================== */
2413 /* These are the class-handling opcodes */
2414
2415 case OP_CLASS:
2416 case OP_NCLASS:
2417 case OP_XCLASS:
2418 {
2419 BOOL isinclass = FALSE;
2420 int next_state_offset;
2421 const pcre_uchar *ecode;
2422
2423 /* For a simple class, there is always just a 32-byte table, and we
2424 can set isinclass from it. */
2425
2426 if (codevalue != OP_XCLASS)
2427 {
2428 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2429 if (clen > 0)
2430 {
2431 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2432 ((code[1 + c/8] & (1 << (c&7))) != 0);
2433 }
2434 }
2435
2436 /* An extended class may have a table or a list of single characters,
2437 ranges, or both, and it may be positive or negative. There's a
2438 function that sorts all this out. */
2439
2440 else
2441 {
2442 ecode = code + GET(code, 1);
2443 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE);
2444 }
2445
2446 /* At this point, isinclass is set for all kinds of class, and ecode
2447 points to the byte after the end of the class. If there is a
2448 quantifier, this is where it will be. */
2449
2450 next_state_offset = (int)(ecode - start_code);
2451
2452 switch (*ecode)
2453 {
2454 case OP_CRSTAR:
2455 case OP_CRMINSTAR:
2456 ADD_ACTIVE(next_state_offset + 1, 0);
2457 if (isinclass) { ADD_NEW(state_offset, 0); }
2458 break;
2459
2460 case OP_CRPLUS:
2461 case OP_CRMINPLUS:
2462 count = current_state->count; /* Already matched */
2463 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2464 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2465 break;
2466
2467 case OP_CRQUERY:
2468 case OP_CRMINQUERY:
2469 ADD_ACTIVE(next_state_offset + 1, 0);
2470 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2471 break;
2472
2473 case OP_CRRANGE:
2474 case OP_CRMINRANGE:
2475 count = current_state->count; /* Already matched */
2476 if (count >= GET2(ecode, 1))
2477 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2478 if (isinclass)
2479 {
2480 int max = GET2(ecode, 3);
2481 if (++count >= max && max != 0) /* Max 0 => no limit */
2482 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2483 else
2484 { ADD_NEW(state_offset, count); }
2485 }
2486 break;
2487
2488 default:
2489 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2490 break;
2491 }
2492 }
2493 break;
2494
2495 /* ========================================================================== */
2496 /* These are the opcodes for fancy brackets of various kinds. We have
2497 to use recursion in order to handle them. The "always failing" assertion
2498 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2499 though the other "backtracking verbs" are not supported. */
2500
2501 case OP_FAIL:
2502 forced_fail++; /* Count FAILs for multiple states */
2503 break;
2504
2505 case OP_ASSERT:
2506 case OP_ASSERT_NOT:
2507 case OP_ASSERTBACK:
2508 case OP_ASSERTBACK_NOT:
2509 {
2510 int rc;
2511 int local_offsets[2];
2512 int local_workspace[1000];
2513 const pcre_uchar *endasscode = code + GET(code, 1);
2514
2515 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2516
2517 rc = internal_dfa_exec(
2518 md, /* static match data */
2519 code, /* this subexpression's code */
2520 ptr, /* where we currently are */
2521 (int)(ptr - start_subject), /* start offset */
2522 local_offsets, /* offset vector */
2523 sizeof(local_offsets)/sizeof(int), /* size of same */
2524 local_workspace, /* workspace vector */
2525 sizeof(local_workspace)/sizeof(int), /* size of same */
2526 rlevel); /* function recursion level */
2527
2528 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2529 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2531 }
2532 break;
2533
2534 /*-----------------------------------------------------------------*/
2535 case OP_COND:
2536 case OP_SCOND:
2537 {
2538 int local_offsets[1000];
2539 int local_workspace[1000];
2540 int codelink = GET(code, 1);
2541 int condcode;
2542
2543 /* Because of the way auto-callout works during compile, a callout item
2544 is inserted between OP_COND and an assertion condition. This does not
2545 happen for the other conditions. */
2546
2547 if (code[LINK_SIZE+1] == OP_CALLOUT)
2548 {
2549 rrc = 0;
2550 if (pcre_callout != NULL)
2551 {
2552 pcre_callout_block cb;
2553 cb.version = 1; /* Version 1 of the callout block */
2554 cb.callout_number = code[LINK_SIZE+2];
2555 cb.offset_vector = offsets;
2556 cb.subject = (PCRE_SPTR)start_subject;
2557 cb.subject_length = (int)(end_subject - start_subject);
2558 cb.start_match = (int)(current_subject - start_subject);
2559 cb.current_position = (int)(ptr - start_subject);
2560 cb.pattern_position = GET(code, LINK_SIZE + 3);
2561 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2562 cb.capture_top = 1;
2563 cb.capture_last = -1;
2564 cb.callout_data = md->callout_data;
2565 cb.mark = NULL; /* No (*MARK) support */
2566 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2567 }
2568 if (rrc > 0) break; /* Fail this thread */
2569 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2570 }
2571
2572 condcode = code[LINK_SIZE+1];
2573
2574 /* Back reference conditions are not supported */
2575
2576 if (condcode == OP_CREF || condcode == OP_NCREF)
2577 return PCRE_ERROR_DFA_UCOND;
2578
2579 /* The DEFINE condition is always false */
2580
2581 if (condcode == OP_DEF)
2582 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2583
2584 /* The only supported version of OP_RREF is for the value RREF_ANY,
2585 which means "test if in any recursion". We can't test for specifically
2586 recursed groups. */
2587
2588 else if (condcode == OP_RREF || condcode == OP_NRREF)
2589 {
2590 int value = GET2(code, LINK_SIZE+2);
2591 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2592 if (md->recursive != NULL)
2593 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2594 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2595 }
2596
2597 /* Otherwise, the condition is an assertion */
2598
2599 else
2600 {
2601 int rc;
2602 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2603 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2604
2605 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2606
2607 rc = internal_dfa_exec(
2608 md, /* fixed match data */
2609 asscode, /* this subexpression's code */
2610 ptr, /* where we currently are */
2611 (int)(ptr - start_subject), /* start offset */
2612 local_offsets, /* offset vector */
2613 sizeof(local_offsets)/sizeof(int), /* size of same */
2614 local_workspace, /* workspace vector */
2615 sizeof(local_workspace)/sizeof(int), /* size of same */
2616 rlevel); /* function recursion level */
2617
2618 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2619 if ((rc >= 0) ==
2620 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2621 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2622 else
2623 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2624 }
2625 }
2626 break;
2627
2628 /*-----------------------------------------------------------------*/
2629 case OP_RECURSE:
2630 {
2631 dfa_recursion_info *ri;
2632 int local_offsets[1000];
2633 int local_workspace[1000];
2634 const pcre_uchar *callpat = start_code + GET(code, 1);
2635 int recno = (callpat == md->start_code)? 0 :
2636 GET2(callpat, 1 + LINK_SIZE);
2637 int rc;
2638
2639 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2640
2641 /* Check for repeating a recursion without advancing the subject
2642 pointer. This should catch convoluted mutual recursions. (Some simple
2643 cases are caught at compile time.) */
2644
2645 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2646 if (recno == ri->group_num && ptr == ri->subject_position)
2647 return PCRE_ERROR_RECURSELOOP;
2648
2649 /* Remember this recursion and where we started it so as to
2650 catch infinite loops. */
2651
2652 new_recursive.group_num = recno;
2653 new_recursive.subject_position = ptr;
2654 new_recursive.prevrec = md->recursive;
2655 md->recursive = &new_recursive;
2656
2657 rc = internal_dfa_exec(
2658 md, /* fixed match data */
2659 callpat, /* this subexpression's code */
2660 ptr, /* where we currently are */
2661 (int)(ptr - start_subject), /* start offset */
2662 local_offsets, /* offset vector */
2663 sizeof(local_offsets)/sizeof(int), /* size of same */
2664 local_workspace, /* workspace vector */
2665 sizeof(local_workspace)/sizeof(int), /* size of same */
2666 rlevel); /* function recursion level */
2667
2668 md->recursive = new_recursive.prevrec; /* Done this recursion */
2669
2670 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2671 rc));
2672
2673 /* Ran out of internal offsets */
2674
2675 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2676
2677 /* For each successful matched substring, set up the next state with a
2678 count of characters to skip before trying it. Note that the count is in
2679 characters, not bytes. */
2680
2681 if (rc > 0)
2682 {
2683 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2684 {
2685 const pcre_uchar *p = start_subject + local_offsets[rc];
2686 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2687 int charcount = local_offsets[rc+1] - local_offsets[rc];
2688 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2689 if (charcount > 0)
2690 {
2691 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2692 }
2693 else
2694 {
2695 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2696 }
2697 }
2698 }
2699 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2700 }
2701 break;
2702
2703 /*-----------------------------------------------------------------*/
2704 case OP_BRAPOS:
2705 case OP_SBRAPOS:
2706 case OP_CBRAPOS:
2707 case OP_SCBRAPOS:
2708 case OP_BRAPOSZERO:
2709 {
2710 int charcount, matched_count;
2711 const pcre_uchar *local_ptr = ptr;
2712 BOOL allow_zero;
2713
2714 if (codevalue == OP_BRAPOSZERO)
2715 {
2716 allow_zero = TRUE;
2717 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2718 }
2719 else allow_zero = FALSE;
2720
2721 /* Loop to match the subpattern as many times as possible as if it were
2722 a complete pattern. */
2723
2724 for (matched_count = 0;; matched_count++)
2725 {
2726 int local_offsets[2];
2727 int local_workspace[1000];
2728
2729 int rc = internal_dfa_exec(
2730 md, /* fixed match data */
2731 code, /* this subexpression's code */
2732 local_ptr, /* where we currently are */
2733 (int)(ptr - start_subject), /* start offset */
2734 local_offsets, /* offset vector */
2735 sizeof(local_offsets)/sizeof(int), /* size of same */
2736 local_workspace, /* workspace vector */
2737 sizeof(local_workspace)/sizeof(int), /* size of same */
2738 rlevel); /* function recursion level */
2739
2740 /* Failed to match */
2741
2742 if (rc < 0)
2743 {
2744 if (rc != PCRE_ERROR_NOMATCH) return rc;
2745 break;
2746 }
2747
2748 /* Matched: break the loop if zero characters matched. */
2749
2750 charcount = local_offsets[1] - local_offsets[0];
2751 if (charcount == 0) break;
2752 local_ptr += charcount; /* Advance temporary position ptr */
2753 }
2754
2755 /* At this point we have matched the subpattern matched_count
2756 times, and local_ptr is pointing to the character after the end of the
2757 last match. */
2758
2759 if (matched_count > 0 || allow_zero)
2760 {
2761 const pcre_uchar *end_subpattern = code;
2762 int next_state_offset;
2763
2764 do { end_subpattern += GET(end_subpattern, 1); }
2765 while (*end_subpattern == OP_ALT);
2766 next_state_offset =
2767 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2768
2769 /* Optimization: if there are no more active states, and there
2770 are no new states yet set up, then skip over the subject string
2771 right here, to save looping. Otherwise, set up the new state to swing
2772 into action when the end of the matched substring is reached. */
2773
2774 if (i + 1 >= active_count && new_count == 0)
2775 {
2776 ptr = local_ptr;
2777 clen = 0;
2778 ADD_NEW(next_state_offset, 0);
2779 }
2780 else
2781 {
2782 const pcre_uchar *p = ptr;
2783 const pcre_uchar *pp = local_ptr;
2784 charcount = pp - p;
2785 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2786 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2787 }
2788 }
2789 }
2790 break;
2791
2792 /*-----------------------------------------------------------------*/
2793 case OP_ONCE:
2794 case OP_ONCE_NC:
2795 {
2796 int local_offsets[2];
2797 int local_workspace[1000];
2798
2799 int rc = internal_dfa_exec(
2800 md, /* fixed match data */
2801 code, /* this subexpression's code */
2802 ptr, /* where we currently are */
2803 (int)(ptr - start_subject), /* start offset */
2804 local_offsets, /* offset vector */
2805 sizeof(local_offsets)/sizeof(int), /* size of same */
2806 local_workspace, /* workspace vector */
2807 sizeof(local_workspace)/sizeof(int), /* size of same */
2808 rlevel); /* function recursion level */
2809
2810 if (rc >= 0)
2811 {
2812 const pcre_uchar *end_subpattern = code;
2813 int charcount = local_offsets[1] - local_offsets[0];
2814 int next_state_offset, repeat_state_offset;
2815
2816 do { end_subpattern += GET(end_subpattern, 1); }
2817 while (*end_subpattern == OP_ALT);
2818 next_state_offset =
2819 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2820
2821 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2822 arrange for the repeat state also to be added to the relevant list.
2823 Calculate the offset, or set -1 for no repeat. */
2824
2825 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2826 *end_subpattern == OP_KETRMIN)?
2827 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2828
2829 /* If we have matched an empty string, add the next state at the
2830 current character pointer. This is important so that the duplicate
2831 checking kicks in, which is what breaks infinite loops that match an
2832 empty string. */
2833
2834 if (charcount == 0)
2835 {
2836 ADD_ACTIVE(next_state_offset, 0);
2837 }
2838
2839 /* Optimization: if there are no more active states, and there
2840 are no new states yet set up, then skip over the subject string
2841 right here, to save looping. Otherwise, set up the new state to swing
2842 into action when the end of the matched substring is reached. */
2843
2844 else if (i + 1 >= active_count && new_count == 0)
2845 {
2846 ptr += charcount;
2847 clen = 0;
2848 ADD_NEW(next_state_offset, 0);
2849
2850 /* If we are adding a repeat state at the new character position,
2851 we must fudge things so that it is the only current state.
2852 Otherwise, it might be a duplicate of one we processed before, and
2853 that would cause it to be skipped. */
2854
2855 if (repeat_state_offset >= 0)
2856 {
2857 next_active_state = active_states;
2858 active_count = 0;
2859 i = -1;
2860 ADD_ACTIVE(repeat_state_offset, 0);
2861 }
2862 }
2863 else
2864 {
2865 const pcre_uchar *p = start_subject + local_offsets[0];
2866 const pcre_uchar *pp = start_subject + local_offsets[1];
2867 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2868 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2869 if (repeat_state_offset >= 0)
2870 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2871 }
2872 }
2873 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2874 }
2875 break;
2876
2877
2878 /* ========================================================================== */
2879 /* Handle callouts */
2880
2881 case OP_CALLOUT:
2882 rrc = 0;
2883 if (pcre_callout != NULL)
2884 {
2885 pcre_callout_block cb;
2886 cb.version = 1; /* Version 1 of the callout block */
2887 cb.callout_number = code[1];
2888 cb.offset_vector = offsets;
2889 cb.subject = (PCRE_SPTR)start_subject;
2890 cb.subject_length = (int)(end_subject - start_subject);
2891 cb.start_match = (int)(current_subject - start_subject);
2892 cb.current_position = (int)(ptr - start_subject);
2893 cb.pattern_position = GET(code, 2);
2894 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2895 cb.capture_top = 1;
2896 cb.capture_last = -1;
2897 cb.callout_data = md->callout_data;
2898 cb.mark = NULL; /* No (*MARK) support */
2899 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2900 }
2901 if (rrc == 0)
2902 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2903 break;
2904
2905
2906 /* ========================================================================== */
2907 default: /* Unsupported opcode */
2908 return PCRE_ERROR_DFA_UITEM;
2909 }
2910
2911 NEXT_ACTIVE_STATE: continue;
2912
2913 } /* End of loop scanning active states */
2914
2915 /* We have finished the processing at the current subject character. If no
2916 new states have been set for the next character, we have found all the
2917 matches that we are going to find. If we are at the top level and partial
2918 matching has been requested, check for appropriate conditions.
2919
2920 The "forced_fail" variable counts the number of (*F) encountered for the
2921 character. If it is equal to the original active_count (saved in
2922 workspace[1]) it means that (*F) was found on every active state. In this
2923 case we don't want to give a partial match.
2924
2925 The "could_continue" variable is true if a state could have continued but
2926 for the fact that the end of the subject was reached. */
2927
2928 if (new_count <= 0)
2929 {
2930 if (rlevel == 1 && /* Top level, and */
2931 could_continue && /* Some could go on */
2932 forced_fail != workspace[1] && /* Not all forced fail & */
2933 ( /* either... */
2934 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2935 || /* or... */
2936 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2937 match_count < 0) /* no matches */
2938 ) && /* And... */
2939 ptr >= end_subject && /* Reached end of subject */
2940 ptr > md->start_used_ptr) /* Inspected non-empty string */
2941 {
2942 if (offsetcount >= 2)
2943 {
2944 offsets[0] = (int)(md->start_used_ptr - start_subject);
2945 offsets[1] = (int)(end_subject - start_subject);
2946 }
2947 match_count = PCRE_ERROR_PARTIAL;
2948 }
2949
2950 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2951 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2952 rlevel*2-2, SP));
2953 break; /* In effect, "return", but see the comment below */
2954 }
2955
2956 /* One or more states are active for the next character. */
2957
2958 ptr += clen; /* Advance to next subject character */
2959 } /* Loop to move along the subject string */
2960
2961 /* Control gets here from "break" a few lines above. We do it this way because
2962 if we use "return" above, we have compiler trouble. Some compilers warn if
2963 there's nothing here because they think the function doesn't return a value. On
2964 the other hand, if we put a dummy statement here, some more clever compilers
2965 complain that it can't be reached. Sigh. */
2966
2967 return match_count;
2968 }
2969
2970
2971
2972
2973 /*************************************************
2974 * Execute a Regular Expression - DFA engine *
2975 *************************************************/
2976
2977 /* This external function applies a compiled re to a subject string using a DFA
2978 engine. This function calls the internal function multiple times if the pattern
2979 is not anchored.
2980
2981 Arguments:
2982 argument_re points to the compiled expression
2983 extra_data points to extra data or is NULL
2984 subject points to the subject string
2985 length length of subject string (may contain binary zeros)
2986 start_offset where to start in the subject string
2987 options option bits
2988 offsets vector of match offsets
2989 offsetcount size of same
2990 workspace workspace vector
2991 wscount size of same
2992
2993 Returns: > 0 => number of match offset pairs placed in offsets
2994 = 0 => offsets overflowed; longest matches are present
2995 -1 => failed to match
2996 < -1 => some kind of unexpected problem
2997 */
2998
2999 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3000 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3001 const char *subject, int length, int start_offset, int options, int *offsets,
3002 int offsetcount, int *workspace, int wscount)
3003 {
3004 real_pcre *re = (real_pcre *)argument_re;
3005 dfa_match_data match_block;
3006 dfa_match_data *md = &match_block;
3007 BOOL utf, anchored, startline, firstline;
3008 const pcre_uchar *current_subject, *end_subject;
3009 const pcre_uint8 *lcc;
3010
3011 pcre_study_data internal_study;
3012 const pcre_study_data *study = NULL;
3013 real_pcre internal_re;
3014
3015 const pcre_uchar *req_char_ptr;
3016 const pcre_uint8 *start_bits = NULL;
3017 BOOL has_first_char = FALSE;
3018 BOOL has_req_char = FALSE;
3019 pcre_uchar first_char = 0;
3020 pcre_uchar first_char2 = 0;
3021 pcre_uchar req_char = 0;
3022 pcre_uchar req_char2 = 0;
3023 int newline;
3024
3025 /* Plausibility checks */
3026
3027 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3028 if (re == NULL || subject == NULL || workspace == NULL ||
3029 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3030 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3031 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3032 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3033
3034 /* We need to find the pointer to any study data before we test for byte
3035 flipping, so we scan the extra_data block first. This may set two fields in the
3036 match block, so we must initialize them beforehand. However, the other fields
3037 in the match block must not be set until after the byte flipping. */
3038
3039 md->tables = re->tables;
3040 md->callout_data = NULL;
3041
3042 if (extra_data != NULL)
3043 {
3044 unsigned int flags = extra_data->flags;
3045 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3046 study = (const pcre_study_data *)extra_data->study_data;
3047 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3048 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3049 return PCRE_ERROR_DFA_UMLIMIT;
3050 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3051 md->callout_data = extra_data->callout_data;
3052 if ((flags & PCRE_EXTRA_TABLES) != 0)
3053 md->tables = extra_data->tables;
3054 }
3055
3056 /* Check that the first field in the block is the magic number. If it is not,
3057 test for a regex that was compiled on a host of opposite endianness. If this is
3058 the case, flipped values are put in internal_re and internal_study if there was
3059 study data too. */
3060
3061 if (re->magic_number != MAGIC_NUMBER)
3062 {
3063 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
3064 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3065 if (study != NULL) study = &internal_study;
3066 }
3067
3068 /* Set some local values */
3069
3070 current_subject = (const unsigned char *)subject + start_offset;
3071 end_subject = (const unsigned char *)subject + length;
3072 req_char_ptr = current_subject - 1;
3073
3074 #ifdef SUPPORT_UTF8
3075 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3076 utf = (re->options & PCRE_UTF8) != 0;
3077 #else
3078 utf = FALSE;
3079 #endif
3080
3081 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3082 (re->options & PCRE_ANCHORED) != 0;
3083
3084 /* The remaining fixed data for passing around. */
3085
3086 md->start_code = (const pcre_uchar *)argument_re +
3087 re->name_table_offset + re->name_count * re->name_entry_size;
3088 md->start_subject = (const unsigned char *)subject;
3089 md->end_subject = end_subject;
3090 md->start_offset = start_offset;
3091 md->moptions = options;
3092 md->poptions = re->options;
3093
3094 /* If the BSR option is not set at match time, copy what was set
3095 at compile time. */
3096
3097 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3098 {
3099 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3100 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3101 #ifdef BSR_ANYCRLF
3102 else md->moptions |= PCRE_BSR_ANYCRLF;
3103 #endif
3104 }
3105
3106 /* Handle different types of newline. The three bits give eight cases. If
3107 nothing is set at run time, whatever was used at compile time applies. */
3108
3109 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3110 PCRE_NEWLINE_BITS)
3111 {
3112 case 0: newline = NEWLINE; break; /* Compile-time default */
3113 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3114 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3115 case PCRE_NEWLINE_CR+
3116 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3117 case PCRE_NEWLINE_ANY: newline = -1; break;
3118 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3119 default: return PCRE_ERROR_BADNEWLINE;
3120 }
3121
3122 if (newline == -2)
3123 {
3124 md->nltype = NLTYPE_ANYCRLF;
3125 }
3126 else if (newline < 0)
3127 {
3128 md->nltype = NLTYPE_ANY;
3129 }
3130 else
3131 {
3132 md->nltype = NLTYPE_FIXED;
3133 if (newline > 255)
3134 {
3135 md->nllen = 2;
3136 md->nl[0] = (newline >> 8) & 255;
3137 md->nl[1] = newline & 255;
3138 }
3139 else
3140 {
3141 md->nllen = 1;
3142 md->nl[0] = newline;
3143 }
3144 }
3145
3146 /* Check a UTF-8 string if required. If the offsets vector has at least two
3147 slots, the error offset and the error code are passed back in them. */
3148
3149 #ifdef SUPPORT_UTF8
3150 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3151 {
3152 int erroroffset;
3153 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3154 if (errorcode != 0)
3155 {
3156 if (offsetcount >= 2)
3157 {
3158 offsets[0] = erroroffset;
3159 offsets[1] = errorcode;
3160 }
3161 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3162 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3163 }
3164 if (start_offset > 0 && start_offset < length &&
3165 (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
3166 return PCRE_ERROR_BADUTF8_OFFSET;
3167 }
3168 #endif
3169
3170 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3171 is a feature that makes it possible to save compiled regex and re-use them
3172 in other programs later. */
3173
3174 if (md->tables == NULL) md->tables = PRIV(default_tables);
3175
3176 /* The lower casing table and the "must be at the start of a line" flag are
3177 used in a loop when finding where to start. */
3178
3179 lcc = md->tables + lcc_offset;
3180 startline = (re->flags & PCRE_STARTLINE) != 0;
3181 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3182
3183 /* Set up the first character to match, if available. The first_char value is
3184 never set for an anchored regular expression, but the anchoring may be forced
3185 at run time, so we have to test for anchoring. The first char may be unset for
3186 an unanchored pattern, of course. If there's no first char and the pattern was
3187 studied, there may be a bitmap of possible first characters. */
3188
3189 if (!anchored)
3190 {
3191 if ((re->flags & PCRE_FIRSTSET) != 0)
3192 {
3193 has_first_char = TRUE;
3194 first_char = first_char2 = re->first_char;
3195 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3196 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3197 }
3198 else
3199 {
3200 if (!startline && study != NULL &&
3201 (study->flags & PCRE_STUDY_MAPPED) != 0)
3202 start_bits = study->start_bits;
3203 }
3204 }
3205
3206 /* For anchored or unanchored matches, there may be a "last known required
3207 character" set. */
3208
3209 if ((re->flags & PCRE_REQCHSET) != 0)
3210 {
3211 has_req_char = TRUE;
3212 req_char = req_char2 = re->req_char;
3213 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3214 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3215 }
3216
3217 /* Call the main matching function, looping for a non-anchored regex after a
3218 failed match. If not restarting, perform certain optimizations at the start of
3219 a match. */
3220
3221 for (;;)
3222 {
3223 int rc;
3224
3225 if ((options & PCRE_DFA_RESTART) == 0)
3226 {
3227 const pcre_uchar *save_end_subject = end_subject;
3228
3229 /* If firstline is TRUE, the start of the match is constrained to the first
3230 line of a multiline string. Implement this by temporarily adjusting
3231 end_subject so that we stop scanning at a newline. If the match fails at
3232 the newline, later code breaks this loop. */
3233
3234 if (firstline)
3235 {
3236 PCRE_PUCHAR t = current_subject;
3237 #ifdef SUPPORT_UTF8
3238 if (utf)
3239 {
3240 while (t < md->end_subject && !IS_NEWLINE(t))
3241 {
3242 t++;
3243 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3244 }
3245 }
3246 else
3247 #endif
3248 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3249 end_subject = t;
3250 }
3251
3252 /* There are some optimizations that avoid running the match if a known
3253 starting point is not found. However, there is an option that disables
3254 these, for testing and for ensuring that all callouts do actually occur.
3255 The option can be set in the regex by (*NO_START_OPT) or passed in
3256 match-time options. */
3257
3258 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3259 {
3260 /* Advance to a known first char. */
3261
3262 if (has_first_char)
3263 {
3264 if (first_char != first_char2)
3265 while (current_subject < end_subject &&
3266 *current_subject != first_char && *current_subject != first_char2)
3267 current_subject++;
3268 else
3269 while (current_subject < end_subject &&
3270 *current_subject != first_char)
3271 current_subject++;
3272 }
3273
3274 /* Or to just after a linebreak for a multiline match if possible */
3275
3276 else if (startline)
3277 {
3278 if (current_subject > md->start_subject + start_offset)
3279 {
3280 #ifdef SUPPORT_UTF8
3281 if (utf)
3282 {
3283 while (current_subject < end_subject &&
3284 !WAS_NEWLINE(current_subject))
3285 {
3286 current_subject++;
3287 while(current_subject < end_subject &&
3288 (*current_subject & 0xc0) == 0x80)
3289 current_subject++;
3290 }
3291 }
3292 else
3293 #endif
3294 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3295 current_subject++;
3296
3297 /* If we have just passed a CR and the newline option is ANY or
3298 ANYCRLF, and we are now at a LF, advance the match position by one
3299 more character. */
3300
3301 if (current_subject[-1] == CHAR_CR &&
3302 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3303 current_subject < end_subject &&
3304 *current_subject == CHAR_NL)
3305 current_subject++;
3306 }
3307 }
3308
3309 /* Or to a non-unique first char after study */
3310
3311 else if (start_bits != NULL)
3312 {
3313 while (current_subject < end_subject)
3314 {
3315 register unsigned int c = *current_subject;
3316 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3317 {
3318 current_subject++;
3319 #ifdef SUPPORT_UTF8
3320 if (utf)
3321 while(current_subject < end_subject &&
3322 (*current_subject & 0xc0) == 0x80) current_subject++;
3323 #endif
3324 }
3325 else break;
3326 }
3327 }
3328 }
3329
3330 /* Restore fudged end_subject */
3331
3332 end_subject = save_end_subject;
3333
3334 /* The following two optimizations are disabled for partial matching or if
3335 disabling is explicitly requested (and of course, by the test above, this
3336 code is not obeyed when restarting after a partial match). */
3337
3338 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3339 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3340 {
3341 /* If the pattern was studied, a minimum subject length may be set. This
3342 is a lower bound; no actual string of that length may actually match the
3343 pattern. Although the value is, strictly, in characters, we treat it as
3344 bytes to avoid spending too much time in this optimization. */
3345
3346 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3347 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3348 return PCRE_ERROR_NOMATCH;
3349
3350 /* If req_char is set, we know that that character must appear in the
3351 subject for the match to succeed. If the first character is set, req_char
3352 must be later in the subject; otherwise the test starts at the match
3353 point. This optimization can save a huge amount of work in patterns with
3354 nested unlimited repeats that aren't going to match. Writing separate
3355 code for cased/caseless versions makes it go faster, as does using an
3356 autoincrement and backing off on a match.
3357
3358 HOWEVER: when the subject string is very, very long, searching to its end
3359 can take a long time, and give bad performance on quite ordinary
3360 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3361 string... so we don't do this when the string is sufficiently long. */
3362
3363 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3364 {
3365 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3366
3367 /* We don't need to repeat the search if we haven't yet reached the
3368 place we found it at last time. */
3369
3370 if (p > req_char_ptr)
3371 {
3372 if (req_char != req_char2)
3373 {
3374 while (p < end_subject)
3375 {
3376 register int pp = *p++;
3377 if (pp == req_char || pp == req_char2) { p--; break; }
3378 }
3379 }
3380 else
3381 {
3382 while (p < end_subject)
3383 {
3384 if (*p++ == req_char) { p--; break; }
3385 }
3386 }
3387
3388 /* If we can't find the required character, break the matching loop,
3389 which will cause a return of PCRE_ERROR_NOMATCH. */
3390
3391 if (p >= end_subject) break;
3392
3393 /* If we have found the required character, save the point where we
3394 found it, so that we don't search again next time round the loop if
3395 the start hasn't passed this character yet. */
3396
3397 req_char_ptr = p;
3398 }
3399 }
3400 }
3401 } /* End of optimizations that are done when not restarting */
3402
3403 /* OK, now we can do the business */
3404
3405 md->start_used_ptr = current_subject;
3406 md->recursive = NULL;
3407
3408 rc = internal_dfa_exec(
3409 md, /* fixed match data */
3410 md->start_code, /* this subexpression's code */
3411 current_subject, /* where we currently are */
3412 start_offset, /* start offset in subject */
3413 offsets, /* offset vector */
3414 offsetcount, /* size of same */
3415 workspace, /* workspace vector */
3416 wscount, /* size of same */
3417 0); /* function recurse level */
3418
3419 /* Anything other than "no match" means we are done, always; otherwise, carry
3420 on only if not anchored. */
3421
3422 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3423
3424 /* Advance to the next subject character unless we are at the end of a line
3425 and firstline is set. */
3426
3427 if (firstline && IS_NEWLINE(current_subject)) break;
3428 current_subject++;
3429 if (utf)
3430 {
3431 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3432 current_subject++;
3433 }
3434 if (current_subject > end_subject) break;
3435
3436 /* If we have just passed a CR and we are now at a LF, and the pattern does
3437 not contain any explicit matches for \r or \n, and the newline option is CRLF
3438 or ANY or ANYCRLF, advance the match position by one more character. */
3439
3440 if (current_subject[-1] == CHAR_CR &&
3441 current_subject < end_subject &&
3442 *current_subject == CHAR_NL &&
3443 (re->flags & PCRE_HASCRORLF) == 0 &&
3444 (md->nltype == NLTYPE_ANY ||
3445 md->nltype == NLTYPE_ANYCRLF ||
3446 md->nllen == 2))
3447 current_subject++;
3448
3449 } /* "Bumpalong" loop */
3450
3451 return PCRE_ERROR_NOMATCH;
3452 }
3453
3454 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5