/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 794 - (show annotations)
Thu Dec 8 07:36:41 2011 UTC (7 years, 8 months ago) by zherczeg
File MIME type: text/plain
File size: 120684 byte(s)
Adding --enable-utf option rather than --enable-utf16. --enable-utf8 is kept for compatibility reasons. And fixing other, minor issues.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2011 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75
76 #ifdef HAVE_CONFIG_H
77 #include "config.h"
78 #endif
79
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
83
84 #include "pcre_internal.h"
85
86
87 /* For use to indent debugging output */
88
89 #define SP " "
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
96 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 into others, under special conditions. A gap of 20 between the blocks should be
98 enough. The resulting opcodes don't have to be less than 256 because they are
99 never stored, so we push them well clear of the normal opcodes. */
100
101 #define OP_PROP_EXTRA 300 /* added to a type-repeat opcode whose item is \P or \p */
102 #define OP_EXTUNI_EXTRA 320 /* added to a type-repeat opcode whose item is \X */
103 #define OP_ANYNL_EXTRA 340 /* added to a type-repeat opcode whose item is \R */
104 #define OP_HSPACE_EXTRA 360 /* added to a type-repeat opcode whose item is \H or \h */
105 #define OP_VSPACE_EXTRA 380 /* added to a type-repeat opcode whose item is \V or \v */
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes it possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
116 static const pcre_uint8 coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 3, 3, 3, /* upto, minupto, exact */
132 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 3, 3, 3, /* upto I, minupto I, exact I */
135 1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 /* Negative single-char repeats - only for chars < 256 */
137 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138 3, 3, 3, /* NOT upto, minupto, exact */
139 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141 3, 3, 3, /* NOT upto I, minupto I, exact I */
142 1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 /* Positive type repeats */
144 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145 3, 3, 3, /* Type upto, minupto, exact */
146 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 /* Character class & ref repeats */
148 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149 0, 0, /* CRRANGE, CRMINRANGE */
150 0, /* CLASS */
151 0, /* NCLASS */
152 0, /* XCLASS - variable length */
153 0, /* REF */
154 0, /* REFI */
155 0, /* RECURSE */
156 0, /* CALLOUT */
157 0, /* Alt */
158 0, /* Ket */
159 0, /* KetRmax */
160 0, /* KetRmin */
161 0, /* KetRpos */
162 0, /* Reverse */
163 0, /* Assert */
164 0, /* Assert not */
165 0, /* Assert behind */
166 0, /* Assert behind not */
167 0, 0, /* ONCE, ONCE_NC */
168 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
169 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
170 0, 0, /* CREF, NCREF */
171 0, 0, /* RREF, NRREF */
172 0, /* DEF */
173 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
174 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
175 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
176 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
177 0, 0 /* CLOSE, SKIPZERO */
178 };
179
180 /* This table identifies those opcodes that inspect a character. It is used to
181 remember the fact that a character could have been inspected when the end of
182 the subject is reached. ***NOTE*** If the start of this table is modified, the
183 two tables that follow must also be modified. */
184
185 static const pcre_uint8 poptable[] = { /* indexed by opcode; non-zero = opcode inspects a subject character */
186 0, /* End */
187 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b (the word-boundary tests look at the current character) */
188 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
189 1, 1, 1, /* Any, AllAny, Anybyte */
190 1, 1, /* \P, \p */
191 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
192 1, /* \X */
193 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
194 1, /* Char */
195 1, /* Chari */
196 1, /* not */
197 1, /* noti */
198 /* Positive single-char repeats */
199 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200 1, 1, 1, /* upto, minupto, exact */
201 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
202 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
203 1, 1, 1, /* upto I, minupto I, exact I */
204 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
205 /* Negative single-char repeats - only for chars < 256 */
206 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
207 1, 1, 1, /* NOT upto, minupto, exact */
208 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
209 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
210 1, 1, 1, /* NOT upto I, minupto I, exact I */
211 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
212 /* Positive type repeats */
213 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
214 1, 1, 1, /* Type upto, minupto, exact */
215 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
216 /* Character class & ref repeats */
217 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
218 1, 1, /* CRRANGE, CRMINRANGE */
219 1, /* CLASS */
220 1, /* NCLASS */
221 1, /* XCLASS - variable length */
222 0, /* REF */
223 0, /* REFI */
224 0, /* RECURSE */
225 0, /* CALLOUT */
226 0, /* Alt */
227 0, /* Ket */
228 0, /* KetRmax */
229 0, /* KetRmin */
230 0, /* KetRpos */
231 0, /* Reverse */
232 0, /* Assert */
233 0, /* Assert not */
234 0, /* Assert behind */
235 0, /* Assert behind not */
236 0, 0, /* ONCE, ONCE_NC */
237 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
238 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
239 0, 0, /* CREF, NCREF */
240 0, 0, /* RREF, NRREF */
241 0, /* DEF */
242 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
243 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
244 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
245 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
246 0, 0 /* CLOSE, SKIPZERO */
247 };
248
249 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250 and \w */
251
252 static const pcre_uint8 toptable1[] = { /* indexed by opcode: mask that is ANDed with ctypes[c] */
253 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b - not character types */
254 ctype_digit, ctype_digit, /* \D, \d */
255 ctype_space, ctype_space, /* \S, \s */
256 ctype_word, ctype_word, /* \W, \w */
257 0, 0 /* OP_ANY, OP_ALLANY */
258 };
259
260 static const pcre_uint8 toptable2[] = { /* indexed by opcode: value XORed with the masked ctype bits; a non-zero result is a match */
261 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b - not character types */
262 ctype_digit, 0, /* \D (bit must be absent), \d (bit must be present) */
263 ctype_space, 0, /* \S (bit must be absent), \s (bit must be present) */
264 ctype_word, 0, /* \W (bit must be absent), \w (bit must be present) */
265 1, 1 /* OP_ANY, OP_ALLANY - mask is 0, so the result is always non-zero */
266 };
267
268
269 /* Structure for holding data about a particular state, which is in effect the
270 current data for an active path through the match tree. It must consist
271 entirely of ints because the working vector we are passed, and which we put
272 these structures in, is a vector of ints. */
273
274 typedef struct stateblock {
275 int offset; /* Offset to opcode; a negative value means "hold off entering the negated state" */
276 int count; /* Count for repeats */
277 int data; /* Some use extra data, e.g. the number of characters still to skip while offset is negative */
278 } stateblock;
279
280 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) /* ints of workspace taken by one state; used to turn wscount into a number of states */
281
282
283 #ifdef PCRE_DEBUG
/*************************************************
*        Print character string                  *
*************************************************/

/* Debugging aid that writes a byte string in a readable form. A byte for
which isprint() is true is written as itself; any other byte is written as
\x followed by two lower-case hexadecimal digits.

Arguments:
  p           points to the first byte of the string
  length      number of bytes to write; nothing is written if it is <= 0
  f           stream that receives the output

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
int remaining;

for (remaining = length; remaining > 0; remaining--, p++)
  {
  int ch = *p;
  if (!isprint(ch))
    fprintf(f, "\\x%02x", ch);
  else
    fprintf(f, "%c", ch);
  }
}
310 #endif
311
312
313
314 /*************************************************
315 * Execute a Regular Expression - DFA engine *
316 *************************************************/
317
318 /* This internal function applies a compiled pattern to a subject string,
319 starting at a given point, using a DFA engine. This function is called from the
320 external one, possibly multiple times if the pattern is not anchored. The
321 function calls itself recursively for some kinds of subpattern.
322
323 Arguments:
324 md the match_data block with fixed information
325 this_start_code the opening bracket of this subexpression's code
326 current_subject where we currently are in the subject string
327 start_offset start offset in the subject string
328 offsets vector to contain the matching string offsets
329 offsetcount size of same
330 workspace vector of workspace
331 wscount size of same
332 rlevel function call recursion level
333
334 Returns: > 0 => number of match offset pairs placed in offsets
335 = 0 => offsets overflowed; longest matches are present
336 -1 => failed to match
337 < -1 => some kind of unexpected problem
338
339 The following macros are used for adding states to the two state vectors (one
340 for the current character, one for the following character). */
341
/* Macros that append one state to a state list. ADD_ACTIVE and
ADD_ACTIVE_DATA append at next_active_state and count with active_count;
ADD_NEW and ADD_NEW_DATA append at next_new_state and count with new_count.
The count is always incremented; if its previous value had already reached
wscount, the enclosing function returns PCRE_ERROR_DFA_WSSIZE without storing
anything. The plain forms set only the offset and count fields and leave the
data field untouched; the _DATA forms set all three. Each use must be followed
by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
383
384 /* And now, here is the code */
385
386 static int
387 internal_dfa_exec(
388 dfa_match_data *md,
389 const pcre_uchar *this_start_code,
390 const pcre_uchar *current_subject,
391 int start_offset,
392 int *offsets,
393 int offsetcount,
394 int *workspace,
395 int wscount,
396 int rlevel)
397 {
398 stateblock *active_states, *new_states, *temp_states;
399 stateblock *next_active_state, *next_new_state;
400
401 const pcre_uint8 *ctypes, *lcc, *fcc;
402 const pcre_uchar *ptr;
403 const pcre_uchar *end_code, *first_op;
404
405 dfa_recursion_info new_recursive;
406
407 int active_count, new_count, match_count;
408
409 /* Some fields in the md block are frequently referenced, so we load them into
410 independent variables in the hope that this will perform better. */
411
412 const pcre_uchar *start_subject = md->start_subject;
413 const pcre_uchar *end_subject = md->end_subject;
414 const pcre_uchar *start_code = md->start_code;
415
416 #ifdef SUPPORT_UTF8
417 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
418 #else
419 BOOL utf = FALSE;
420 #endif
421
422 rlevel++;
423 offsetcount &= (-2);
424
425 wscount -= 2;
426 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
427 (2 * INTS_PER_STATEBLOCK);
428
429 DPRINTF(("\n%.*s---------------------\n"
430 "%.*sCall to internal_dfa_exec f=%d\n",
431 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432
433 ctypes = md->tables + ctypes_offset;
434 lcc = md->tables + lcc_offset;
435 fcc = md->tables + fcc_offset;
436
437 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
438
439 active_states = (stateblock *)(workspace + 2);
440 next_new_state = new_states = active_states + wscount;
441 new_count = 0;
442
443 first_op = this_start_code + 1 + LINK_SIZE +
444 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
446 ? IMM2_SIZE:0);
447
448 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
449 the alternative states onto the list, and find out where the end is. This
450 makes it possible to use this function recursively, when we want to stop at a
451 matching internal ket rather than at the end.
452
453 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
454 a backward assertion. In that case, we have to find out the maximum amount to
455 move back, and set up each alternative appropriately. */
456
457 if (*first_op == OP_REVERSE)
458 {
459 int max_back = 0;
460 int gone_back;
461
462 end_code = this_start_code;
463 do
464 {
465 int back = GET(end_code, 2+LINK_SIZE);
466 if (back > max_back) max_back = back;
467 end_code += GET(end_code, 1);
468 }
469 while (*end_code == OP_ALT);
470
471 /* If we can't go back the amount required for the longest lookbehind
472 pattern, go back as far as we can; some alternatives may still be viable. */
473
474 #ifdef SUPPORT_UTF8
475 /* In character mode we have to step back character by character */
476
477 if (utf)
478 {
479 for (gone_back = 0; gone_back < max_back; gone_back++)
480 {
481 if (current_subject <= start_subject) break;
482 current_subject--;
483 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
484 }
485 }
486 else
487 #endif
488
489 /* In byte-mode we can do this quickly. */
490
491 {
492 gone_back = (current_subject - max_back < start_subject)?
493 (int)(current_subject - start_subject) : max_back;
494 current_subject -= gone_back;
495 }
496
497 /* Save the earliest consulted character */
498
499 if (current_subject < md->start_used_ptr)
500 md->start_used_ptr = current_subject;
501
502 /* Now we can process the individual branches. */
503
504 end_code = this_start_code;
505 do
506 {
507 int back = GET(end_code, 2+LINK_SIZE);
508 if (back <= gone_back)
509 {
510 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
511 ADD_NEW_DATA(-bstate, 0, gone_back - back);
512 }
513 end_code += GET(end_code, 1);
514 }
515 while (*end_code == OP_ALT);
516 }
517
518 /* This is the code for a "normal" subpattern (not a backward assertion). The
519 start of a whole pattern is always one of these. If we are at the top level,
520 we may be asked to restart matching from the same point that we reached for a
521 previous partial match. We still have to scan through the top-level branches to
522 find the end state. */
523
524 else
525 {
526 end_code = this_start_code;
527
528 /* Restarting */
529
530 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
531 {
532 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
533 new_count = workspace[1];
534 if (!workspace[0])
535 memcpy(new_states, active_states, new_count * sizeof(stateblock));
536 }
537
538 /* Not restarting */
539
540 else
541 {
542 int length = 1 + LINK_SIZE +
543 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
544 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
545 ? IMM2_SIZE:0);
546 do
547 {
548 ADD_NEW((int)(end_code - start_code + length), 0);
549 end_code += GET(end_code, 1);
550 length = 1 + LINK_SIZE;
551 }
552 while (*end_code == OP_ALT);
553 }
554 }
555
556 workspace[0] = 0; /* Bit indicating which vector is current */
557
558 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
559
560 /* Loop for scanning the subject */
561
562 ptr = current_subject;
563 for (;;)
564 {
565 int i, j;
566 int clen, dlen;
567 unsigned int c, d;
568 int forced_fail = 0;
569 BOOL could_continue = FALSE;
570
571 /* Make the new state list into the active state list and empty the
572 new state list. */
573
574 temp_states = active_states;
575 active_states = new_states;
576 new_states = temp_states;
577 active_count = new_count;
578 new_count = 0;
579
580 workspace[0] ^= 1; /* Remember for the restarting feature */
581 workspace[1] = active_count;
582
583 #ifdef PCRE_DEBUG
584 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
585 pchars((pcre_uchar *)ptr, strlen((char *)ptr), stdout);
586 printf("\"\n");
587
588 printf("%.*sActive states: ", rlevel*2-2, SP);
589 for (i = 0; i < active_count; i++)
590 printf("%d/%d ", active_states[i].offset, active_states[i].count);
591 printf("\n");
592 #endif
593
594 /* Set the pointers for adding new states */
595
596 next_active_state = active_states + active_count;
597 next_new_state = new_states;
598
599 /* Load the current character from the subject outside the loop, as many
600 different states may want to look at it, and we assume that at least one
601 will. */
602
603 if (ptr < end_subject)
604 {
605 clen = 1; /* Number of bytes in the character */
606 #ifdef SUPPORT_UTF8
607 if (utf) { GETCHARLEN(c, ptr, clen); } else
608 #endif /* SUPPORT_UTF8 */
609 c = *ptr;
610 }
611 else
612 {
613 clen = 0; /* This indicates the end of the subject */
614 c = NOTACHAR; /* This value should never actually be used */
615 }
616
617 /* Scan up the active states and act on each one. The result of an action
618 may be to add more states to the currently active list (e.g. on hitting a
619 parenthesis) or it may be to put states on the new list, for considering
620 when we move the character pointer on. */
621
622 for (i = 0; i < active_count; i++)
623 {
624 stateblock *current_state = active_states + i;
625 BOOL caseless = FALSE;
626 const pcre_uchar *code;
627 int state_offset = current_state->offset;
628 int count, codevalue, rrc;
629
630 #ifdef PCRE_DEBUG
631 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
632 if (clen == 0) printf("EOL\n");
633 else if (c > 32 && c < 127) printf("'%c'\n", c);
634 else printf("0x%02x\n", c);
635 #endif
636
637 /* A negative offset is a special case meaning "hold off going to this
638 (negated) state until the number of characters in the data field have
639 been skipped". */
640
641 if (state_offset < 0)
642 {
643 if (current_state->data > 0)
644 {
645 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
646 ADD_NEW_DATA(state_offset, current_state->count,
647 current_state->data - 1);
648 continue;
649 }
650 else
651 {
652 current_state->offset = state_offset = -state_offset;
653 }
654 }
655
656 /* Check for a duplicate state with the same count, and skip if found.
657 See the note at the head of this module about the possibility of improving
658 performance here. */
659
660 for (j = 0; j < i; j++)
661 {
662 if (active_states[j].offset == state_offset &&
663 active_states[j].count == current_state->count)
664 {
665 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
666 goto NEXT_ACTIVE_STATE;
667 }
668 }
669
670 /* The state offset is the offset to the opcode */
671
672 code = start_code + state_offset;
673 codevalue = *code;
674
675 /* If this opcode inspects a character, but we are at the end of the
676 subject, remember the fact for use when testing for a partial match. */
677
678 if (clen == 0 && poptable[codevalue] != 0)
679 could_continue = TRUE;
680
681 /* If this opcode is followed by an inline character, load it. It is
682 tempting to test for the presence of a subject character here, but that
683 is wrong, because sometimes zero repetitions of the subject are
684 permitted.
685
686 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
687 argument that is not a data character - but is always one byte long. We
688 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
689 this case. To keep the other cases fast, convert these ones to new opcodes.
690 */
691
692 if (coptable[codevalue] > 0)
693 {
694 dlen = 1;
695 #ifdef SUPPORT_UTF8
696 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
697 #endif /* SUPPORT_UTF8 */
698 d = code[coptable[codevalue]];
699 if (codevalue >= OP_TYPESTAR)
700 {
701 switch(d)
702 {
703 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
704 case OP_NOTPROP:
705 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
706 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
707 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
708 case OP_NOT_HSPACE:
709 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
710 case OP_NOT_VSPACE:
711 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
712 default: break;
713 }
714 }
715 }
716 else
717 {
718 dlen = 0; /* Not strictly necessary, but compilers moan */
719 d = NOTACHAR; /* if these variables are not set. */
720 }
721
722
723 /* Now process the individual opcodes */
724
725 switch (codevalue)
726 {
727 /* ========================================================================== */
728 /* These cases are never obeyed. This is a fudge that causes a compile-
729 time error if the vectors coptable or poptable, which are indexed by
730 opcode, are not the correct length. It seems to be the only way to do
731 such a check at compile time, as the sizeof() operator does not work
732 in the C preprocessor. */
733
734 case OP_TABLE_LENGTH:
735 case OP_TABLE_LENGTH +
736 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
737 (sizeof(poptable) == OP_TABLE_LENGTH)):
738 break;
739
740 /* ========================================================================== */
741 /* Reached a closing bracket. If not at the end of the pattern, carry
742 on with the next opcode. For repeating opcodes, also add the repeat
743 state. Note that KETRPOS will always be encountered at the end of the
744 subpattern, because the possessive subpattern repeats are always handled
745 using recursive calls. Thus, it never adds any new states.
746
747 At the end of the (sub)pattern, unless we have an empty string and
748 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
749 start of the subject, save the match data, shifting up all previous
750 matches so we always have the longest first. */
751
752 case OP_KET:
753 case OP_KETRMIN:
754 case OP_KETRMAX:
755 case OP_KETRPOS:
756 if (code != end_code)
757 {
758 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
759 if (codevalue != OP_KET)
760 {
761 ADD_ACTIVE(state_offset - GET(code, 1), 0);
762 }
763 }
764 else
765 {
766 if (ptr > current_subject ||
767 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
768 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
769 current_subject > start_subject + md->start_offset)))
770 {
771 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
772 else if (match_count > 0 && ++match_count * 2 > offsetcount)
773 match_count = 0;
774 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
775 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
776 if (offsetcount >= 2)
777 {
778 offsets[0] = (int)(current_subject - start_subject);
779 offsets[1] = (int)(ptr - start_subject);
780 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
781 offsets[1] - offsets[0], current_subject));
782 }
783 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
784 {
785 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
786 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
787 match_count, rlevel*2-2, SP));
788 return match_count;
789 }
790 }
791 }
792 break;
793
794 /* ========================================================================== */
795 /* These opcodes add to the current list of states without looking
796 at the current character. */
797
798 /*-----------------------------------------------------------------*/
799 case OP_ALT:
800 do { code += GET(code, 1); } while (*code == OP_ALT);
801 ADD_ACTIVE((int)(code - start_code), 0);
802 break;
803
804 /*-----------------------------------------------------------------*/
805 case OP_BRA:
806 case OP_SBRA:
807 do
808 {
809 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
810 code += GET(code, 1);
811 }
812 while (*code == OP_ALT);
813 break;
814
815 /*-----------------------------------------------------------------*/
816 case OP_CBRA:
817 case OP_SCBRA:
818 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
819 code += GET(code, 1);
820 while (*code == OP_ALT)
821 {
822 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
823 code += GET(code, 1);
824 }
825 break;
826
827 /*-----------------------------------------------------------------*/
828 case OP_BRAZERO:
829 case OP_BRAMINZERO:
830 ADD_ACTIVE(state_offset + 1, 0);
831 code += 1 + GET(code, 2);
832 while (*code == OP_ALT) code += GET(code, 1);
833 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
834 break;
835
836 /*-----------------------------------------------------------------*/
837 case OP_SKIPZERO:
838 code += 1 + GET(code, 2);
839 while (*code == OP_ALT) code += GET(code, 1);
840 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
841 break;
842
843 /*-----------------------------------------------------------------*/
844 case OP_CIRC:
845 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
846 { ADD_ACTIVE(state_offset + 1, 0); }
847 break;
848
849 /*-----------------------------------------------------------------*/
850 case OP_CIRCM:
851 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
852 (ptr != end_subject && WAS_NEWLINE(ptr)))
853 { ADD_ACTIVE(state_offset + 1, 0); }
854 break;
855
856 /*-----------------------------------------------------------------*/
857 case OP_EOD:
858 if (ptr >= end_subject)
859 {
860 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
861 could_continue = TRUE;
862 else { ADD_ACTIVE(state_offset + 1, 0); }
863 }
864 break;
865
866 /*-----------------------------------------------------------------*/
867 case OP_SOD:
868 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
869 break;
870
871 /*-----------------------------------------------------------------*/
872 case OP_SOM:
873 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
874 break;
875
876
877 /* ========================================================================== */
878 /* These opcodes inspect the next subject character, and sometimes
879 the previous one as well, but do not have an argument. The variable
880 clen contains the length of the current character and is zero if we are
881 at the end of the subject. */
882
883 /*-----------------------------------------------------------------*/
884 case OP_ANY:
885 if (clen > 0 && !IS_NEWLINE(ptr))
886 { ADD_NEW(state_offset + 1, 0); }
887 break;
888
889 /*-----------------------------------------------------------------*/
890 case OP_ALLANY:
891 if (clen > 0)
892 { ADD_NEW(state_offset + 1, 0); }
893 break;
894
895 /*-----------------------------------------------------------------*/
896 case OP_EODN:
897 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
898 could_continue = TRUE;
899 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
900 { ADD_ACTIVE(state_offset + 1, 0); }
901 break;
902
903 /*-----------------------------------------------------------------*/
904 case OP_DOLL:
905 if ((md->moptions & PCRE_NOTEOL) == 0)
906 {
907 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
908 could_continue = TRUE;
909 else if (clen == 0 ||
910 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
911 (ptr == end_subject - md->nllen)
912 ))
913 { ADD_ACTIVE(state_offset + 1, 0); }
914 }
915 break;
916
917 /*-----------------------------------------------------------------*/
918 case OP_DOLLM:
919 if ((md->moptions & PCRE_NOTEOL) == 0)
920 {
921 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922 could_continue = TRUE;
923 else if (clen == 0 ||
924 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
925 { ADD_ACTIVE(state_offset + 1, 0); }
926 }
927 else if (IS_NEWLINE(ptr))
928 { ADD_ACTIVE(state_offset + 1, 0); }
929 break;
930
931 /*-----------------------------------------------------------------*/
932
933 case OP_DIGIT:
934 case OP_WHITESPACE:
935 case OP_WORDCHAR:
936 if (clen > 0 && c < 256 &&
937 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
938 { ADD_NEW(state_offset + 1, 0); }
939 break;
940
941 /*-----------------------------------------------------------------*/
942 case OP_NOT_DIGIT:
943 case OP_NOT_WHITESPACE:
944 case OP_NOT_WORDCHAR:
945 if (clen > 0 && (c >= 256 ||
946 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
947 { ADD_NEW(state_offset + 1, 0); }
948 break;
949
950 /*-----------------------------------------------------------------*/
951 case OP_WORD_BOUNDARY:
952 case OP_NOT_WORD_BOUNDARY:
953 {
954 int left_word, right_word;
955
956 if (ptr > start_subject)
957 {
958 const pcre_uchar *temp = ptr - 1;
959 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
960 #ifdef SUPPORT_UTF8
961 if (utf) BACKCHAR(temp);
962 #endif
963 GETCHARTEST(d, temp);
964 #ifdef SUPPORT_UCP
965 if ((md->poptions & PCRE_UCP) != 0)
966 {
967 if (d == '_') left_word = TRUE; else
968 {
969 int cat = UCD_CATEGORY(d);
970 left_word = (cat == ucp_L || cat == ucp_N);
971 }
972 }
973 else
974 #endif
975 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
976 }
977 else left_word = FALSE;
978
979 if (clen > 0)
980 {
981 #ifdef SUPPORT_UCP
982 if ((md->poptions & PCRE_UCP) != 0)
983 {
984 if (c == '_') right_word = TRUE; else
985 {
986 int cat = UCD_CATEGORY(c);
987 right_word = (cat == ucp_L || cat == ucp_N);
988 }
989 }
990 else
991 #endif
992 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
993 }
994 else right_word = FALSE;
995
996 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
997 { ADD_ACTIVE(state_offset + 1, 0); }
998 }
999 break;
1000
1001
1002 /*-----------------------------------------------------------------*/
1003 /* Check the next character by Unicode property. We will get here only
1004 if the support is in the binary; otherwise a compile-time error occurs.
1005 */
1006
1007 #ifdef SUPPORT_UCP
1008 case OP_PROP:
1009 case OP_NOTPROP:
1010 if (clen > 0)
1011 {
1012 BOOL OK;
1013 const ucd_record * prop = GET_UCD(c);
1014 switch(code[1])
1015 {
1016 case PT_ANY:
1017 OK = TRUE;
1018 break;
1019
1020 case PT_LAMP:
1021 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1022 prop->chartype == ucp_Lt;
1023 break;
1024
1025 case PT_GC:
1026 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1027 break;
1028
1029 case PT_PC:
1030 OK = prop->chartype == code[2];
1031 break;
1032
1033 case PT_SC:
1034 OK = prop->script == code[2];
1035 break;
1036
1037 /* These are specials for combination cases. */
1038
1039 case PT_ALNUM:
1040 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1041 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1042 break;
1043
1044 case PT_SPACE: /* Perl space */
1045 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1046 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1047 break;
1048
1049 case PT_PXSPACE: /* POSIX space */
1050 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1051 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1052 c == CHAR_FF || c == CHAR_CR;
1053 break;
1054
1055 case PT_WORD:
1056 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1057 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1058 c == CHAR_UNDERSCORE;
1059 break;
1060
1061 /* Should never occur, but keep compilers from grumbling. */
1062
1063 default:
1064 OK = codevalue != OP_PROP;
1065 break;
1066 }
1067
1068 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1069 }
1070 break;
1071 #endif
1072
1073
1074
1075 /* ========================================================================== */
1076 /* These opcodes likewise inspect the subject character, but have an
1077 argument that is not a data character. It is one of these opcodes:
1078 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1079 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1080
1081 case OP_TYPEPLUS:
1082 case OP_TYPEMINPLUS:
1083 case OP_TYPEPOSPLUS:
1084 count = current_state->count; /* Already matched */
1085 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1086 if (clen > 0)
1087 {
1088 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1089 (c < 256 &&
1090 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1091 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1092 {
1093 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1094 {
1095 active_count--; /* Remove non-match possibility */
1096 next_active_state--;
1097 }
1098 count++;
1099 ADD_NEW(state_offset, count);
1100 }
1101 }
1102 break;
1103
1104 /*-----------------------------------------------------------------*/
1105 case OP_TYPEQUERY:
1106 case OP_TYPEMINQUERY:
1107 case OP_TYPEPOSQUERY:
1108 ADD_ACTIVE(state_offset + 2, 0);
1109 if (clen > 0)
1110 {
1111 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1112 (c < 256 &&
1113 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1114 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1115 {
1116 if (codevalue == OP_TYPEPOSQUERY)
1117 {
1118 active_count--; /* Remove non-match possibility */
1119 next_active_state--;
1120 }
1121 ADD_NEW(state_offset + 2, 0);
1122 }
1123 }
1124 break;
1125
1126 /*-----------------------------------------------------------------*/
1127 case OP_TYPESTAR:
1128 case OP_TYPEMINSTAR:
1129 case OP_TYPEPOSSTAR:
1130 ADD_ACTIVE(state_offset + 2, 0);
1131 if (clen > 0)
1132 {
1133 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1134 (c < 256 &&
1135 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1136 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1137 {
1138 if (codevalue == OP_TYPEPOSSTAR)
1139 {
1140 active_count--; /* Remove non-match possibility */
1141 next_active_state--;
1142 }
1143 ADD_NEW(state_offset, 0);
1144 }
1145 }
1146 break;
1147
1148 /*-----------------------------------------------------------------*/
1149 case OP_TYPEEXACT:
1150 count = current_state->count; /* Number already matched */
1151 if (clen > 0)
1152 {
1153 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1154 (c < 256 &&
1155 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1156 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1157 {
1158 if (++count >= GET2(code, 1))
1159 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1160 else
1161 { ADD_NEW(state_offset, count); }
1162 }
1163 }
1164 break;
1165
1166 /*-----------------------------------------------------------------*/
1167 case OP_TYPEUPTO:
1168 case OP_TYPEMINUPTO:
1169 case OP_TYPEPOSUPTO:
1170 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1171 count = current_state->count; /* Number already matched */
1172 if (clen > 0)
1173 {
1174 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1175 (c < 256 &&
1176 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1177 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1178 {
1179 if (codevalue == OP_TYPEPOSUPTO)
1180 {
1181 active_count--; /* Remove non-match possibility */
1182 next_active_state--;
1183 }
1184 if (++count >= GET2(code, 1))
1185 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1186 else
1187 { ADD_NEW(state_offset, count); }
1188 }
1189 }
1190 break;
1191
1192 /* ========================================================================== */
1193 /* These are virtual opcodes that are used when something like
1194 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1195 vertical/horizontal space types (such as OP_VSPACE or OP_HSPACE) as its
1196 argument. It keeps the code above fast for the other cases. The argument
1196 is in the d variable. */
1197
1198 #ifdef SUPPORT_UCP
1199 case OP_PROP_EXTRA + OP_TYPEPLUS:
1200 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1201 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1202 count = current_state->count; /* Already matched */
1203 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1204 if (clen > 0)
1205 {
1206 BOOL OK;
1207 const ucd_record * prop = GET_UCD(c);
1208 switch(code[2])
1209 {
1210 case PT_ANY:
1211 OK = TRUE;
1212 break;
1213
1214 case PT_LAMP:
1215 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1216 prop->chartype == ucp_Lt;
1217 break;
1218
1219 case PT_GC:
1220 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1221 break;
1222
1223 case PT_PC:
1224 OK = prop->chartype == code[3];
1225 break;
1226
1227 case PT_SC:
1228 OK = prop->script == code[3];
1229 break;
1230
1231 /* These are specials for combination cases. */
1232
1233 case PT_ALNUM:
1234 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1235 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1236 break;
1237
1238 case PT_SPACE: /* Perl space */
1239 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1240 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1241 break;
1242
1243 case PT_PXSPACE: /* POSIX space */
1244 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1245 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1246 c == CHAR_FF || c == CHAR_CR;
1247 break;
1248
1249 case PT_WORD:
1250 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1251 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1252 c == CHAR_UNDERSCORE;
1253 break;
1254
1255 /* Should never occur, but keep compilers from grumbling. */
1256
1257 default:
1258 OK = codevalue != OP_PROP;
1259 break;
1260 }
1261
1262 if (OK == (d == OP_PROP))
1263 {
1264 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1265 {
1266 active_count--; /* Remove non-match possibility */
1267 next_active_state--;
1268 }
1269 count++;
1270 ADD_NEW(state_offset, count);
1271 }
1272 }
1273 break;
1274
1275 /*-----------------------------------------------------------------*/
1276 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1277 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1278 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1279 count = current_state->count; /* Already matched */
1280 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1281 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1282 {
1283 const pcre_uchar *nptr = ptr + clen;
1284 int ncount = 0;
1285 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1286 {
1287 active_count--; /* Remove non-match possibility */
1288 next_active_state--;
1289 }
1290 while (nptr < end_subject)
1291 {
1292 int nd;
1293 int ndlen = 1;
1294 GETCHARLEN(nd, nptr, ndlen);
1295 if (UCD_CATEGORY(nd) != ucp_M) break;
1296 ncount++;
1297 nptr += ndlen;
1298 }
1299 count++;
1300 ADD_NEW_DATA(-state_offset, count, ncount);
1301 }
1302 break;
1303 #endif
1304
1305 /*-----------------------------------------------------------------*/
1306 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1307 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1308 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1309 count = current_state->count; /* Already matched */
1310 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1311 if (clen > 0)
1312 {
1313 int ncount = 0;
1314 switch (c)
1315 {
1316 case 0x000b:
1317 case 0x000c:
1318 case 0x0085:
1319 case 0x2028:
1320 case 0x2029:
1321 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1322 goto ANYNL01;
1323
1324 case 0x000d:
1325 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1326 /* Fall through */
1327
1328 ANYNL01:
1329 case 0x000a:
1330 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1331 {
1332 active_count--; /* Remove non-match possibility */
1333 next_active_state--;
1334 }
1335 count++;
1336 ADD_NEW_DATA(-state_offset, count, ncount);
1337 break;
1338
1339 default:
1340 break;
1341 }
1342 }
1343 break;
1344
1345 /*-----------------------------------------------------------------*/
1346 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1347 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1348 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1349 count = current_state->count; /* Already matched */
1350 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1351 if (clen > 0)
1352 {
1353 BOOL OK;
1354 switch (c)
1355 {
1356 case 0x000a:
1357 case 0x000b:
1358 case 0x000c:
1359 case 0x000d:
1360 case 0x0085:
1361 case 0x2028:
1362 case 0x2029:
1363 OK = TRUE;
1364 break;
1365
1366 default:
1367 OK = FALSE;
1368 break;
1369 }
1370
1371 if (OK == (d == OP_VSPACE))
1372 {
1373 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1374 {
1375 active_count--; /* Remove non-match possibility */
1376 next_active_state--;
1377 }
1378 count++;
1379 ADD_NEW_DATA(-state_offset, count, 0);
1380 }
1381 }
1382 break;
1383
1384 /*-----------------------------------------------------------------*/
1385 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1386 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1387 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1388 count = current_state->count; /* Already matched */
1389 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1390 if (clen > 0)
1391 {
1392 BOOL OK;
1393 switch (c)
1394 {
1395 case 0x09: /* HT */
1396 case 0x20: /* SPACE */
1397 case 0xa0: /* NBSP */
1398 case 0x1680: /* OGHAM SPACE MARK */
1399 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1400 case 0x2000: /* EN QUAD */
1401 case 0x2001: /* EM QUAD */
1402 case 0x2002: /* EN SPACE */
1403 case 0x2003: /* EM SPACE */
1404 case 0x2004: /* THREE-PER-EM SPACE */
1405 case 0x2005: /* FOUR-PER-EM SPACE */
1406 case 0x2006: /* SIX-PER-EM SPACE */
1407 case 0x2007: /* FIGURE SPACE */
1408 case 0x2008: /* PUNCTUATION SPACE */
1409 case 0x2009: /* THIN SPACE */
1410 case 0x200A: /* HAIR SPACE */
1411 case 0x202f: /* NARROW NO-BREAK SPACE */
1412 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1413 case 0x3000: /* IDEOGRAPHIC SPACE */
1414 OK = TRUE;
1415 break;
1416
1417 default:
1418 OK = FALSE;
1419 break;
1420 }
1421
1422 if (OK == (d == OP_HSPACE))
1423 {
1424 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1425 {
1426 active_count--; /* Remove non-match possibility */
1427 next_active_state--;
1428 }
1429 count++;
1430 ADD_NEW_DATA(-state_offset, count, 0);
1431 }
1432 }
1433 break;
1434
1435 /*-----------------------------------------------------------------*/
1436 #ifdef SUPPORT_UCP
1437 case OP_PROP_EXTRA + OP_TYPEQUERY:
1438 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1439 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1440 count = 4;
1441 goto QS1;
1442
1443 case OP_PROP_EXTRA + OP_TYPESTAR:
1444 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1445 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1446 count = 0;
1447
1448 QS1:
1449
1450 ADD_ACTIVE(state_offset + 4, 0);
1451 if (clen > 0)
1452 {
1453 BOOL OK;
1454 const ucd_record * prop = GET_UCD(c);
1455 switch(code[2])
1456 {
1457 case PT_ANY:
1458 OK = TRUE;
1459 break;
1460
1461 case PT_LAMP:
1462 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1463 prop->chartype == ucp_Lt;
1464 break;
1465
1466 case PT_GC:
1467 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1468 break;
1469
1470 case PT_PC:
1471 OK = prop->chartype == code[3];
1472 break;
1473
1474 case PT_SC:
1475 OK = prop->script == code[3];
1476 break;
1477
1478 /* These are specials for combination cases. */
1479
1480 case PT_ALNUM:
1481 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1482 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1483 break;
1484
1485 case PT_SPACE: /* Perl space */
1486 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1487 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1488 break;
1489
1490 case PT_PXSPACE: /* POSIX space */
1491 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1492 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1493 c == CHAR_FF || c == CHAR_CR;
1494 break;
1495
1496 case PT_WORD:
1497 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1498 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1499 c == CHAR_UNDERSCORE;
1500 break;
1501
1502 /* Should never occur, but keep compilers from grumbling. */
1503
1504 default:
1505 OK = codevalue != OP_PROP;
1506 break;
1507 }
1508
1509 if (OK == (d == OP_PROP))
1510 {
1511 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1512 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1513 {
1514 active_count--; /* Remove non-match possibility */
1515 next_active_state--;
1516 }
1517 ADD_NEW(state_offset + count, 0);
1518 }
1519 }
1520 break;
1521
1522 /*-----------------------------------------------------------------*/
1523 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1524 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1525 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1526 count = 2;
1527 goto QS2;
1528
1529 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1530 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1531 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1532 count = 0;
1533
1534 QS2:
1535
1536 ADD_ACTIVE(state_offset + 2, 0);
1537 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1538 {
1539 const pcre_uchar *nptr = ptr + clen;
1540 int ncount = 0;
1541 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1542 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1543 {
1544 active_count--; /* Remove non-match possibility */
1545 next_active_state--;
1546 }
1547 while (nptr < end_subject)
1548 {
1549 int nd;
1550 int ndlen = 1;
1551 GETCHARLEN(nd, nptr, ndlen);
1552 if (UCD_CATEGORY(nd) != ucp_M) break;
1553 ncount++;
1554 nptr += ndlen;
1555 }
1556 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1557 }
1558 break;
1559 #endif
1560
1561 /*-----------------------------------------------------------------*/
1562 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1563 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1564 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1565 count = 2;
1566 goto QS3;
1567
1568 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1569 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1570 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1571 count = 0;
1572
1573 QS3:
1574 ADD_ACTIVE(state_offset + 2, 0);
1575 if (clen > 0)
1576 {
1577 int ncount = 0;
1578 switch (c)
1579 {
1580 case 0x000b:
1581 case 0x000c:
1582 case 0x0085:
1583 case 0x2028:
1584 case 0x2029:
1585 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1586 goto ANYNL02;
1587
1588 case 0x000d:
1589 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1590 /* Fall through */
1591
1592 ANYNL02:
1593 case 0x000a:
1594 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1595 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1596 {
1597 active_count--; /* Remove non-match possibility */
1598 next_active_state--;
1599 }
1600 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1601 break;
1602
1603 default:
1604 break;
1605 }
1606 }
1607 break;
1608
1609 /*-----------------------------------------------------------------*/
1610 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1611 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1612 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1613 count = 2;
1614 goto QS4;
1615
1616 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1617 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1618 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1619 count = 0;
1620
1621 QS4:
1622 ADD_ACTIVE(state_offset + 2, 0);
1623 if (clen > 0)
1624 {
1625 BOOL OK;
1626 switch (c)
1627 {
1628 case 0x000a:
1629 case 0x000b:
1630 case 0x000c:
1631 case 0x000d:
1632 case 0x0085:
1633 case 0x2028:
1634 case 0x2029:
1635 OK = TRUE;
1636 break;
1637
1638 default:
1639 OK = FALSE;
1640 break;
1641 }
1642 if (OK == (d == OP_VSPACE))
1643 {
1644 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1645 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1646 {
1647 active_count--; /* Remove non-match possibility */
1648 next_active_state--;
1649 }
1650 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1651 }
1652 }
1653 break;
1654
1655 /*-----------------------------------------------------------------*/
1656 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1657 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1658 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1659 count = 2;
1660 goto QS5;
1661
1662 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1663 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1664 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1665 count = 0;
1666
1667 QS5:
1668 ADD_ACTIVE(state_offset + 2, 0);
1669 if (clen > 0)
1670 {
1671 BOOL OK;
1672 switch (c)
1673 {
1674 case 0x09: /* HT */
1675 case 0x20: /* SPACE */
1676 case 0xa0: /* NBSP */
1677 case 0x1680: /* OGHAM SPACE MARK */
1678 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1679 case 0x2000: /* EN QUAD */
1680 case 0x2001: /* EM QUAD */
1681 case 0x2002: /* EN SPACE */
1682 case 0x2003: /* EM SPACE */
1683 case 0x2004: /* THREE-PER-EM SPACE */
1684 case 0x2005: /* FOUR-PER-EM SPACE */
1685 case 0x2006: /* SIX-PER-EM SPACE */
1686 case 0x2007: /* FIGURE SPACE */
1687 case 0x2008: /* PUNCTUATION SPACE */
1688 case 0x2009: /* THIN SPACE */
1689 case 0x200A: /* HAIR SPACE */
1690 case 0x202f: /* NARROW NO-BREAK SPACE */
1691 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1692 case 0x3000: /* IDEOGRAPHIC SPACE */
1693 OK = TRUE;
1694 break;
1695
1696 default:
1697 OK = FALSE;
1698 break;
1699 }
1700
1701 if (OK == (d == OP_HSPACE))
1702 {
1703 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1704 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1705 {
1706 active_count--; /* Remove non-match possibility */
1707 next_active_state--;
1708 }
1709 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1710 }
1711 }
1712 break;
1713
1714 /*-----------------------------------------------------------------*/
1715 #ifdef SUPPORT_UCP
1716 case OP_PROP_EXTRA + OP_TYPEEXACT:
1717 case OP_PROP_EXTRA + OP_TYPEUPTO:
1718 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1719 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1720 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1721 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1722 count = current_state->count; /* Number already matched */
1723 if (clen > 0)
1724 {
1725 BOOL OK;
1726 const ucd_record * prop = GET_UCD(c);
1727 switch(code[1 + IMM2_SIZE + 1])
1728 {
1729 case PT_ANY:
1730 OK = TRUE;
1731 break;
1732
1733 case PT_LAMP:
1734 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1735 prop->chartype == ucp_Lt;
1736 break;
1737
1738 case PT_GC:
1739 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1740 break;
1741
1742 case PT_PC:
1743 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1744 break;
1745
1746 case PT_SC:
1747 OK = prop->script == code[1 + IMM2_SIZE + 2];
1748 break;
1749
1750 /* These are specials for combination cases. */
1751
1752 case PT_ALNUM:
1753 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1754 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1755 break;
1756
1757 case PT_SPACE: /* Perl space */
1758 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1759 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1760 break;
1761
1762 case PT_PXSPACE: /* POSIX space */
1763 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1764 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1765 c == CHAR_FF || c == CHAR_CR;
1766 break;
1767
1768 case PT_WORD:
1769 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1770 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1771 c == CHAR_UNDERSCORE;
1772 break;
1773
1774 /* Should never occur, but keep compilers from grumbling. */
1775
1776 default:
1777 OK = codevalue != OP_PROP;
1778 break;
1779 }
1780
1781 if (OK == (d == OP_PROP))
1782 {
1783 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1784 {
1785 active_count--; /* Remove non-match possibility */
1786 next_active_state--;
1787 }
1788 if (++count >= GET2(code, 1))
1789 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1790 else
1791 { ADD_NEW(state_offset, count); }
1792 }
1793 }
1794 break;
1795
1796 /*-----------------------------------------------------------------*/
1797 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1798 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1799 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1800 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1801 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1802 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1803 count = current_state->count; /* Number already matched */
1804 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1805 {
1806 const pcre_uchar *nptr = ptr + clen;
1807 int ncount = 0;
1808 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1809 {
1810 active_count--; /* Remove non-match possibility */
1811 next_active_state--;
1812 }
1813 while (nptr < end_subject)
1814 {
1815 int nd;
1816 int ndlen = 1;
1817 GETCHARLEN(nd, nptr, ndlen);
1818 if (UCD_CATEGORY(nd) != ucp_M) break;
1819 ncount++;
1820 nptr += ndlen;
1821 }
1822 if (++count >= GET2(code, 1))
1823 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1824 else
1825 { ADD_NEW_DATA(-state_offset, count, ncount); }
1826 }
1827 break;
1828 #endif
1829
1830 /*-----------------------------------------------------------------*/
1831 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1832 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1833 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1834 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1835 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1837 count = current_state->count; /* Number already matched */
1838 if (clen > 0)
1839 {
1840 int ncount = 0;
1841 switch (c)
1842 {
1843 case 0x000b:
1844 case 0x000c:
1845 case 0x0085:
1846 case 0x2028:
1847 case 0x2029:
1848 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1849 goto ANYNL03;
1850
1851 case 0x000d:
1852 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1853 /* Fall through */
1854
1855 ANYNL03:
1856 case 0x000a:
1857 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1858 {
1859 active_count--; /* Remove non-match possibility */
1860 next_active_state--;
1861 }
1862 if (++count >= GET2(code, 1))
1863 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1864 else
1865 { ADD_NEW_DATA(-state_offset, count, ncount); }
1866 break;
1867
1868 default:
1869 break;
1870 }
1871 }
1872 break;
1873
1874 /*-----------------------------------------------------------------*/
1875 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1876 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1877 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1878 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1879 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1880 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1881 count = current_state->count; /* Number already matched */
1882 if (clen > 0)
1883 {
1884 BOOL OK;
1885 switch (c)
1886 {
1887 case 0x000a:
1888 case 0x000b:
1889 case 0x000c:
1890 case 0x000d:
1891 case 0x0085:
1892 case 0x2028:
1893 case 0x2029:
1894 OK = TRUE;
1895 break;
1896
1897 default:
1898 OK = FALSE;
1899 }
1900
1901 if (OK == (d == OP_VSPACE))
1902 {
1903 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1904 {
1905 active_count--; /* Remove non-match possibility */
1906 next_active_state--;
1907 }
1908 if (++count >= GET2(code, 1))
1909 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1910 else
1911 { ADD_NEW_DATA(-state_offset, count, 0); }
1912 }
1913 }
1914 break;
1915
1916 /*-----------------------------------------------------------------*/
1917 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1918 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1919 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1920 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1921 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1922 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1923 count = current_state->count; /* Number already matched */
1924 if (clen > 0)
1925 {
1926 BOOL OK;
1927 switch (c)
1928 {
1929 case 0x09: /* HT */
1930 case 0x20: /* SPACE */
1931 case 0xa0: /* NBSP */
1932 case 0x1680: /* OGHAM SPACE MARK */
1933 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1934 case 0x2000: /* EN QUAD */
1935 case 0x2001: /* EM QUAD */
1936 case 0x2002: /* EN SPACE */
1937 case 0x2003: /* EM SPACE */
1938 case 0x2004: /* THREE-PER-EM SPACE */
1939 case 0x2005: /* FOUR-PER-EM SPACE */
1940 case 0x2006: /* SIX-PER-EM SPACE */
1941 case 0x2007: /* FIGURE SPACE */
1942 case 0x2008: /* PUNCTUATION SPACE */
1943 case 0x2009: /* THIN SPACE */
1944 case 0x200A: /* HAIR SPACE */
1945 case 0x202f: /* NARROW NO-BREAK SPACE */
1946 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1947 case 0x3000: /* IDEOGRAPHIC SPACE */
1948 OK = TRUE;
1949 break;
1950
1951 default:
1952 OK = FALSE;
1953 break;
1954 }
1955
1956 if (OK == (d == OP_HSPACE))
1957 {
1958 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1959 {
1960 active_count--; /* Remove non-match possibility */
1961 next_active_state--;
1962 }
1963 if (++count >= GET2(code, 1))
1964 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1965 else
1966 { ADD_NEW_DATA(-state_offset, count, 0); }
1967 }
1968 }
1969 break;
1970
1971 /* ========================================================================== */
1972 /* These opcodes are followed by a character that is usually compared
1973 to the current subject character; it is loaded into d. We still get
1974 here even if there is no subject character, because in some cases zero
1975 repetitions are permitted. */
1976
1977 /*-----------------------------------------------------------------*/
1978 case OP_CHAR:
1979 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1980 break;
1981
1982 /*-----------------------------------------------------------------*/
1983 case OP_CHARI:
1984 if (clen == 0) break;
1985
1986 #ifdef SUPPORT_UTF8
1987 if (utf)
1988 {
1989 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1990 {
1991 unsigned int othercase;
1992 if (c < 128) othercase = fcc[c]; else
1993
1994 /* If we have Unicode property support, we can use it to test the
1995 other case of the character. */
1996
1997 #ifdef SUPPORT_UCP
1998 othercase = UCD_OTHERCASE(c);
1999 #else
2000 othercase = NOTACHAR;
2001 #endif
2002
2003 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2004 }
2005 }
2006 else
2007 #endif /* SUPPORT_UTF8 */
2008 /* Not UTF mode */
2009 {
2010 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2011 }
2012 break;
2013
2014
2015 #ifdef SUPPORT_UCP
2016 /*-----------------------------------------------------------------*/
2017 /* This is a tricky one because it can match more than one character.
2018 Find out how many characters to skip, and then set up a negative state
2019 to wait for them to pass before continuing. */
2020
2021 case OP_EXTUNI:
2022 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2023 {
2024 const pcre_uchar *nptr = ptr + clen;
2025 int ncount = 0;
2026 while (nptr < end_subject)
2027 {
2028 int nclen = 1;
2029 GETCHARLEN(c, nptr, nclen);
2030 if (UCD_CATEGORY(c) != ucp_M) break;
2031 ncount++;
2032 nptr += nclen;
2033 }
2034 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2035 }
2036 break;
2037 #endif
2038
2039 /*-----------------------------------------------------------------*/
2040 /* This is tricky, like EXTUNI, because it too can match more than one
2041 character (when CR is followed by LF). In this case, set up a negative
2042 state to wait for one character to pass before continuing. */
2043
2044 case OP_ANYNL:
2045 if (clen > 0) switch(c)
2046 {
2047 case 0x000b:
2048 case 0x000c:
2049 case 0x0085:
2050 case 0x2028:
2051 case 0x2029:
2052 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2053
2054 case 0x000a:
2055 ADD_NEW(state_offset + 1, 0);
2056 break;
2057
2058 case 0x000d:
2059 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2060 {
2061 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2062 }
2063 else
2064 {
2065 ADD_NEW(state_offset + 1, 0);
2066 }
2067 break;
2068 }
2069 break;
2070
2071 /*-----------------------------------------------------------------*/
2072 case OP_NOT_VSPACE:
2073 if (clen > 0) switch(c)
2074 {
2075 case 0x000a:
2076 case 0x000b:
2077 case 0x000c:
2078 case 0x000d:
2079 case 0x0085:
2080 case 0x2028:
2081 case 0x2029:
2082 break;
2083
2084 default:
2085 ADD_NEW(state_offset + 1, 0);
2086 break;
2087 }
2088 break;
2089
2090 /*-----------------------------------------------------------------*/
2091 case OP_VSPACE:
2092 if (clen > 0) switch(c)
2093 {
2094 case 0x000a:
2095 case 0x000b:
2096 case 0x000c:
2097 case 0x000d:
2098 case 0x0085:
2099 case 0x2028:
2100 case 0x2029:
2101 ADD_NEW(state_offset + 1, 0);
2102 break;
2103
2104 default: break;
2105 }
2106 break;
2107
2108 /*-----------------------------------------------------------------*/
2109 case OP_NOT_HSPACE:
2110 if (clen > 0) switch(c)
2111 {
2112 case 0x09: /* HT */
2113 case 0x20: /* SPACE */
2114 case 0xa0: /* NBSP */
2115 case 0x1680: /* OGHAM SPACE MARK */
2116 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2117 case 0x2000: /* EN QUAD */
2118 case 0x2001: /* EM QUAD */
2119 case 0x2002: /* EN SPACE */
2120 case 0x2003: /* EM SPACE */
2121 case 0x2004: /* THREE-PER-EM SPACE */
2122 case 0x2005: /* FOUR-PER-EM SPACE */
2123 case 0x2006: /* SIX-PER-EM SPACE */
2124 case 0x2007: /* FIGURE SPACE */
2125 case 0x2008: /* PUNCTUATION SPACE */
2126 case 0x2009: /* THIN SPACE */
2127 case 0x200A: /* HAIR SPACE */
2128 case 0x202f: /* NARROW NO-BREAK SPACE */
2129 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2130 case 0x3000: /* IDEOGRAPHIC SPACE */
2131 break;
2132
2133 default:
2134 ADD_NEW(state_offset + 1, 0);
2135 break;
2136 }
2137 break;
2138
2139 /*-----------------------------------------------------------------*/
2140 case OP_HSPACE:
2141 if (clen > 0) switch(c)
2142 {
2143 case 0x09: /* HT */
2144 case 0x20: /* SPACE */
2145 case 0xa0: /* NBSP */
2146 case 0x1680: /* OGHAM SPACE MARK */
2147 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2148 case 0x2000: /* EN QUAD */
2149 case 0x2001: /* EM QUAD */
2150 case 0x2002: /* EN SPACE */
2151 case 0x2003: /* EM SPACE */
2152 case 0x2004: /* THREE-PER-EM SPACE */
2153 case 0x2005: /* FOUR-PER-EM SPACE */
2154 case 0x2006: /* SIX-PER-EM SPACE */
2155 case 0x2007: /* FIGURE SPACE */
2156 case 0x2008: /* PUNCTUATION SPACE */
2157 case 0x2009: /* THIN SPACE */
2158 case 0x200A: /* HAIR SPACE */
2159 case 0x202f: /* NARROW NO-BREAK SPACE */
2160 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2161 case 0x3000: /* IDEOGRAPHIC SPACE */
2162 ADD_NEW(state_offset + 1, 0);
2163 break;
2164 }
2165 break;
2166
2167 /*-----------------------------------------------------------------*/
2168 /* Match a negated single character casefully. This is only used for
2169 one-byte characters, that is, we know that d < 256. The character we are
2170 checking (c) can be multibyte. */
2171
2172 case OP_NOT:
2173 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2174 break;
2175
2176 /*-----------------------------------------------------------------*/
2177 /* Match a negated single character caselessly. This is only used for
2178 one-byte characters, that is, we know that d < 256. The character we are
2179 checking (c) can be multibyte. */
2180
2181 case OP_NOTI:
2182 if (clen > 0 && c != d && c != fcc[d])
2183 { ADD_NEW(state_offset + dlen + 1, 0); }
2184 break;
2185
2186 /*-----------------------------------------------------------------*/
2187 case OP_PLUSI:
2188 case OP_MINPLUSI:
2189 case OP_POSPLUSI:
2190 case OP_NOTPLUSI:
2191 case OP_NOTMINPLUSI:
2192 case OP_NOTPOSPLUSI:
2193 caseless = TRUE;
2194 codevalue -= OP_STARI - OP_STAR;
2195
2196 /* Fall through */
2197 case OP_PLUS:
2198 case OP_MINPLUS:
2199 case OP_POSPLUS:
2200 case OP_NOTPLUS:
2201 case OP_NOTMINPLUS:
2202 case OP_NOTPOSPLUS:
2203 count = current_state->count; /* Already matched */
2204 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2205 if (clen > 0)
2206 {
2207 unsigned int otherd = NOTACHAR;
2208 if (caseless)
2209 {
2210 #ifdef SUPPORT_UTF8
2211 if (utf && d >= 128)
2212 {
2213 #ifdef SUPPORT_UCP
2214 otherd = UCD_OTHERCASE(d);
2215 #endif /* SUPPORT_UCP */
2216 }
2217 else
2218 #endif /* SUPPORT_UTF8 */
2219 otherd = fcc[d];
2220 }
2221 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2222 {
2223 if (count > 0 &&
2224 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2225 {
2226 active_count--; /* Remove non-match possibility */
2227 next_active_state--;
2228 }
2229 count++;
2230 ADD_NEW(state_offset, count);
2231 }
2232 }
2233 break;
2234
2235 /*-----------------------------------------------------------------*/
2236 case OP_QUERYI:
2237 case OP_MINQUERYI:
2238 case OP_POSQUERYI:
2239 case OP_NOTQUERYI:
2240 case OP_NOTMINQUERYI:
2241 case OP_NOTPOSQUERYI:
2242 caseless = TRUE;
2243 codevalue -= OP_STARI - OP_STAR;
2244 /* Fall through */
2245 case OP_QUERY:
2246 case OP_MINQUERY:
2247 case OP_POSQUERY:
2248 case OP_NOTQUERY:
2249 case OP_NOTMINQUERY:
2250 case OP_NOTPOSQUERY:
2251 ADD_ACTIVE(state_offset + dlen + 1, 0);
2252 if (clen > 0)
2253 {
2254 unsigned int otherd = NOTACHAR;
2255 if (caseless)
2256 {
2257 #ifdef SUPPORT_UTF8
2258 if (utf && d >= 128)
2259 {
2260 #ifdef SUPPORT_UCP
2261 otherd = UCD_OTHERCASE(d);
2262 #endif /* SUPPORT_UCP */
2263 }
2264 else
2265 #endif /* SUPPORT_UTF8 */
2266 otherd = fcc[d];
2267 }
2268 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2269 {
2270 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2271 {
2272 active_count--; /* Remove non-match possibility */
2273 next_active_state--;
2274 }
2275 ADD_NEW(state_offset + dlen + 1, 0);
2276 }
2277 }
2278 break;
2279
2280 /*-----------------------------------------------------------------*/
2281 case OP_STARI:
2282 case OP_MINSTARI:
2283 case OP_POSSTARI:
2284 case OP_NOTSTARI:
2285 case OP_NOTMINSTARI:
2286 case OP_NOTPOSSTARI:
2287 caseless = TRUE;
2288 codevalue -= OP_STARI - OP_STAR;
2289 /* Fall through */
2290 case OP_STAR:
2291 case OP_MINSTAR:
2292 case OP_POSSTAR:
2293 case OP_NOTSTAR:
2294 case OP_NOTMINSTAR:
2295 case OP_NOTPOSSTAR:
2296 ADD_ACTIVE(state_offset + dlen + 1, 0);
2297 if (clen > 0)
2298 {
2299 unsigned int otherd = NOTACHAR;
2300 if (caseless)
2301 {
2302 #ifdef SUPPORT_UTF8
2303 if (utf && d >= 128)
2304 {
2305 #ifdef SUPPORT_UCP
2306 otherd = UCD_OTHERCASE(d);
2307 #endif /* SUPPORT_UCP */
2308 }
2309 else
2310 #endif /* SUPPORT_UTF8 */
2311 otherd = fcc[d];
2312 }
2313 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2314 {
2315 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2316 {
2317 active_count--; /* Remove non-match possibility */
2318 next_active_state--;
2319 }
2320 ADD_NEW(state_offset, 0);
2321 }
2322 }
2323 break;
2324
2325 /*-----------------------------------------------------------------*/
2326 case OP_EXACTI:
2327 case OP_NOTEXACTI:
2328 caseless = TRUE;
2329 codevalue -= OP_STARI - OP_STAR;
2330 /* Fall through */
2331 case OP_EXACT:
2332 case OP_NOTEXACT:
2333 count = current_state->count; /* Number already matched */
2334 if (clen > 0)
2335 {
2336 unsigned int otherd = NOTACHAR;
2337 if (caseless)
2338 {
2339 #ifdef SUPPORT_UTF8
2340 if (utf && d >= 128)
2341 {
2342 #ifdef SUPPORT_UCP
2343 otherd = UCD_OTHERCASE(d);
2344 #endif /* SUPPORT_UCP */
2345 }
2346 else
2347 #endif /* SUPPORT_UTF8 */
2348 otherd = fcc[d];
2349 }
2350 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2351 {
2352 if (++count >= GET2(code, 1))
2353 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2354 else
2355 { ADD_NEW(state_offset, count); }
2356 }
2357 }
2358 break;
2359
2360 /*-----------------------------------------------------------------*/
2361 case OP_UPTOI:
2362 case OP_MINUPTOI:
2363 case OP_POSUPTOI:
2364 case OP_NOTUPTOI:
2365 case OP_NOTMINUPTOI:
2366 case OP_NOTPOSUPTOI:
2367 caseless = TRUE;
2368 codevalue -= OP_STARI - OP_STAR;
2369 /* Fall through */
2370 case OP_UPTO:
2371 case OP_MINUPTO:
2372 case OP_POSUPTO:
2373 case OP_NOTUPTO:
2374 case OP_NOTMINUPTO:
2375 case OP_NOTPOSUPTO:
2376 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2377 count = current_state->count; /* Number already matched */
2378 if (clen > 0)
2379 {
2380 unsigned int otherd = NOTACHAR;
2381 if (caseless)
2382 {
2383 #ifdef SUPPORT_UTF8
2384 if (utf && d >= 128)
2385 {
2386 #ifdef SUPPORT_UCP
2387 otherd = UCD_OTHERCASE(d);
2388 #endif /* SUPPORT_UCP */
2389 }
2390 else
2391 #endif /* SUPPORT_UTF8 */
2392 otherd = fcc[d];
2393 }
2394 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2395 {
2396 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2397 {
2398 active_count--; /* Remove non-match possibility */
2399 next_active_state--;
2400 }
2401 if (++count >= GET2(code, 1))
2402 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2403 else
2404 { ADD_NEW(state_offset, count); }
2405 }
2406 }
2407 break;
2408
2409
2410 /* ========================================================================== */
2411 /* These are the class-handling opcodes */
2412
2413 case OP_CLASS:
2414 case OP_NCLASS:
2415 case OP_XCLASS:
2416 {
2417 BOOL isinclass = FALSE;
2418 int next_state_offset;
2419 const pcre_uchar *ecode;
2420
2421 /* For a simple class, there is always just a 32-byte table, and we
2422 can set isinclass from it. */
2423
2424 if (codevalue != OP_XCLASS)
2425 {
2426 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2427 if (clen > 0)
2428 {
2429 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2430 ((code[1 + c/8] & (1 << (c&7))) != 0);
2431 }
2432 }
2433
2434 /* An extended class may have a table or a list of single characters,
2435 ranges, or both, and it may be positive or negative. There's a
2436 function that sorts all this out. */
2437
2438 else
2439 {
2440 ecode = code + GET(code, 1);
2441 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE);
2442 }
2443
2444 /* At this point, isinclass is set for all kinds of class, and ecode
2445 points to the byte after the end of the class. If there is a
2446 quantifier, this is where it will be. */
2447
2448 next_state_offset = (int)(ecode - start_code);
2449
2450 switch (*ecode)
2451 {
2452 case OP_CRSTAR:
2453 case OP_CRMINSTAR:
2454 ADD_ACTIVE(next_state_offset + 1, 0);
2455 if (isinclass) { ADD_NEW(state_offset, 0); }
2456 break;
2457
2458 case OP_CRPLUS:
2459 case OP_CRMINPLUS:
2460 count = current_state->count; /* Already matched */
2461 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2462 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2463 break;
2464
2465 case OP_CRQUERY:
2466 case OP_CRMINQUERY:
2467 ADD_ACTIVE(next_state_offset + 1, 0);
2468 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2469 break;
2470
2471 case OP_CRRANGE:
2472 case OP_CRMINRANGE:
2473 count = current_state->count; /* Already matched */
2474 if (count >= GET2(ecode, 1))
2475 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2476 if (isinclass)
2477 {
2478 int max = GET2(ecode, 3);
2479 if (++count >= max && max != 0) /* Max 0 => no limit */
2480 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2481 else
2482 { ADD_NEW(state_offset, count); }
2483 }
2484 break;
2485
2486 default:
2487 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2488 break;
2489 }
2490 }
2491 break;
2492
2493 /* ========================================================================== */
2494 /* These are the opcodes for fancy brackets of various kinds. We have
2495 to use recursion in order to handle them. The "always failing" assertion
2496 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2497 though the other "backtracking verbs" are not supported. */
2498
2499 case OP_FAIL:
2500 forced_fail++; /* Count FAILs for multiple states */
2501 break;
2502
2503 case OP_ASSERT:
2504 case OP_ASSERT_NOT:
2505 case OP_ASSERTBACK:
2506 case OP_ASSERTBACK_NOT:
2507 {
2508 int rc;
2509 int local_offsets[2];
2510 int local_workspace[1000];
2511 const pcre_uchar *endasscode = code + GET(code, 1);
2512
2513 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2514
2515 rc = internal_dfa_exec(
2516 md, /* static match data */
2517 code, /* this subexpression's code */
2518 ptr, /* where we currently are */
2519 (int)(ptr - start_subject), /* start offset */
2520 local_offsets, /* offset vector */
2521 sizeof(local_offsets)/sizeof(int), /* size of same */
2522 local_workspace, /* workspace vector */
2523 sizeof(local_workspace)/sizeof(int), /* size of same */
2524 rlevel); /* function recursion level */
2525
2526 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2527 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2528 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2529 }
2530 break;
2531
2532 /*-----------------------------------------------------------------*/
2533 case OP_COND:
2534 case OP_SCOND:
2535 {
2536 int local_offsets[1000];
2537 int local_workspace[1000];
2538 int codelink = GET(code, 1);
2539 int condcode;
2540
2541 /* Because of the way auto-callout works during compile, a callout item
2542 is inserted between OP_COND and an assertion condition. This does not
2543 happen for the other conditions. */
2544
2545 if (code[LINK_SIZE+1] == OP_CALLOUT)
2546 {
2547 rrc = 0;
2548 if (pcre_callout != NULL)
2549 {
2550 pcre_callout_block cb;
2551 cb.version = 1; /* Version 1 of the callout block */
2552 cb.callout_number = code[LINK_SIZE+2];
2553 cb.offset_vector = offsets;
2554 cb.subject = (PCRE_SPTR)start_subject;
2555 cb.subject_length = (int)(end_subject - start_subject);
2556 cb.start_match = (int)(current_subject - start_subject);
2557 cb.current_position = (int)(ptr - start_subject);
2558 cb.pattern_position = GET(code, LINK_SIZE + 3);
2559 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2560 cb.capture_top = 1;
2561 cb.capture_last = -1;
2562 cb.callout_data = md->callout_data;
2563 cb.mark = NULL; /* No (*MARK) support */
2564 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2565 }
2566 if (rrc > 0) break; /* Fail this thread */
2567 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2568 }
2569
2570 condcode = code[LINK_SIZE+1];
2571
2572 /* Back reference conditions are not supported */
2573
2574 if (condcode == OP_CREF || condcode == OP_NCREF)
2575 return PCRE_ERROR_DFA_UCOND;
2576
2577 /* The DEFINE condition is always false */
2578
2579 if (condcode == OP_DEF)
2580 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2581
2582 /* The only supported version of OP_RREF is for the value RREF_ANY,
2583 which means "test if in any recursion". We can't test for specifically
2584 recursed groups. */
2585
2586 else if (condcode == OP_RREF || condcode == OP_NRREF)
2587 {
2588 int value = GET2(code, LINK_SIZE+2);
2589 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2590 if (md->recursive != NULL)
2591 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2592 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2593 }
2594
2595 /* Otherwise, the condition is an assertion */
2596
2597 else
2598 {
2599 int rc;
2600 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2601 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2602
2603 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2604
2605 rc = internal_dfa_exec(
2606 md, /* fixed match data */
2607 asscode, /* this subexpression's code */
2608 ptr, /* where we currently are */
2609 (int)(ptr - start_subject), /* start offset */
2610 local_offsets, /* offset vector */
2611 sizeof(local_offsets)/sizeof(int), /* size of same */
2612 local_workspace, /* workspace vector */
2613 sizeof(local_workspace)/sizeof(int), /* size of same */
2614 rlevel); /* function recursion level */
2615
2616 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2617 if ((rc >= 0) ==
2618 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2619 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2620 else
2621 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2622 }
2623 }
2624 break;
2625
2626 /*-----------------------------------------------------------------*/
2627 case OP_RECURSE:
2628 {
2629 dfa_recursion_info *ri;
2630 int local_offsets[1000];
2631 int local_workspace[1000];
2632 const pcre_uchar *callpat = start_code + GET(code, 1);
2633 int recno = (callpat == md->start_code)? 0 :
2634 GET2(callpat, 1 + LINK_SIZE);
2635 int rc;
2636
2637 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2638
2639 /* Check for repeating a recursion without advancing the subject
2640 pointer. This should catch convoluted mutual recursions. (Some simple
2641 cases are caught at compile time.) */
2642
2643 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2644 if (recno == ri->group_num && ptr == ri->subject_position)
2645 return PCRE_ERROR_RECURSELOOP;
2646
2647 /* Remember this recursion and where we started it so as to
2648 catch infinite loops. */
2649
2650 new_recursive.group_num = recno;
2651 new_recursive.subject_position = ptr;
2652 new_recursive.prevrec = md->recursive;
2653 md->recursive = &new_recursive;
2654
2655 rc = internal_dfa_exec(
2656 md, /* fixed match data */
2657 callpat, /* this subexpression's code */
2658 ptr, /* where we currently are */
2659 (int)(ptr - start_subject), /* start offset */
2660 local_offsets, /* offset vector */
2661 sizeof(local_offsets)/sizeof(int), /* size of same */
2662 local_workspace, /* workspace vector */
2663 sizeof(local_workspace)/sizeof(int), /* size of same */
2664 rlevel); /* function recursion level */
2665
2666 md->recursive = new_recursive.prevrec; /* Done this recursion */
2667
2668 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2669 rc));
2670
2671 /* Ran out of internal offsets */
2672
2673 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2674
2675 /* For each successful matched substring, set up the next state with a
2676 count of characters to skip before trying it. Note that the count is in
2677 characters, not bytes. */
2678
2679 if (rc > 0)
2680 {
2681 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2682 {
2683 const pcre_uchar *p = start_subject + local_offsets[rc];
2684 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2685 int charcount = local_offsets[rc+1] - local_offsets[rc];
2686 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2687 if (charcount > 0)
2688 {
2689 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2690 }
2691 else
2692 {
2693 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2694 }
2695 }
2696 }
2697 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2698 }
2699 break;
2700
2701 /*-----------------------------------------------------------------*/
2702 case OP_BRAPOS:
2703 case OP_SBRAPOS:
2704 case OP_CBRAPOS:
2705 case OP_SCBRAPOS:
2706 case OP_BRAPOSZERO:
2707 {
2708 int charcount, matched_count;
2709 const pcre_uchar *local_ptr = ptr;
2710 BOOL allow_zero;
2711
2712 if (codevalue == OP_BRAPOSZERO)
2713 {
2714 allow_zero = TRUE;
2715 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2716 }
2717 else allow_zero = FALSE;
2718
2719 /* Loop to match the subpattern as many times as possible as if it were
2720 a complete pattern. */
2721
2722 for (matched_count = 0;; matched_count++)
2723 {
2724 int local_offsets[2];
2725 int local_workspace[1000];
2726
2727 int rc = internal_dfa_exec(
2728 md, /* fixed match data */
2729 code, /* this subexpression's code */
2730 local_ptr, /* where we currently are */
2731 (int)(ptr - start_subject), /* start offset */
2732 local_offsets, /* offset vector */
2733 sizeof(local_offsets)/sizeof(int), /* size of same */
2734 local_workspace, /* workspace vector */
2735 sizeof(local_workspace)/sizeof(int), /* size of same */
2736 rlevel); /* function recursion level */
2737
2738 /* Failed to match */
2739
2740 if (rc < 0)
2741 {
2742 if (rc != PCRE_ERROR_NOMATCH) return rc;
2743 break;
2744 }
2745
2746 /* Matched: break the loop if zero characters matched. */
2747
2748 charcount = local_offsets[1] - local_offsets[0];
2749 if (charcount == 0) break;
2750 local_ptr += charcount; /* Advance temporary position ptr */
2751 }
2752
2753 /* At this point we have matched the subpattern matched_count
2754 times, and local_ptr is pointing to the character after the end of the
2755 last match. */
2756
2757 if (matched_count > 0 || allow_zero)
2758 {
2759 const pcre_uchar *end_subpattern = code;
2760 int next_state_offset;
2761
2762 do { end_subpattern += GET(end_subpattern, 1); }
2763 while (*end_subpattern == OP_ALT);
2764 next_state_offset =
2765 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2766
2767 /* Optimization: if there are no more active states, and there
2768 are no new states yet set up, then skip over the subject string
2769 right here, to save looping. Otherwise, set up the new state to swing
2770 into action when the end of the matched substring is reached. */
2771
2772 if (i + 1 >= active_count && new_count == 0)
2773 {
2774 ptr = local_ptr;
2775 clen = 0;
2776 ADD_NEW(next_state_offset, 0);
2777 }
2778 else
2779 {
2780 const pcre_uchar *p = ptr;
2781 const pcre_uchar *pp = local_ptr;
2782 charcount = pp - p;
2783 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2784 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2785 }
2786 }
2787 }
2788 break;
2789
2790 /*-----------------------------------------------------------------*/
2791 case OP_ONCE:
2792 case OP_ONCE_NC:
2793 {
2794 int local_offsets[2];
2795 int local_workspace[1000];
2796
2797 int rc = internal_dfa_exec(
2798 md, /* fixed match data */
2799 code, /* this subexpression's code */
2800 ptr, /* where we currently are */
2801 (int)(ptr - start_subject), /* start offset */
2802 local_offsets, /* offset vector */
2803 sizeof(local_offsets)/sizeof(int), /* size of same */
2804 local_workspace, /* workspace vector */
2805 sizeof(local_workspace)/sizeof(int), /* size of same */
2806 rlevel); /* function recursion level */
2807
2808 if (rc >= 0)
2809 {
2810 const pcre_uchar *end_subpattern = code;
2811 int charcount = local_offsets[1] - local_offsets[0];
2812 int next_state_offset, repeat_state_offset;
2813
2814 do { end_subpattern += GET(end_subpattern, 1); }
2815 while (*end_subpattern == OP_ALT);
2816 next_state_offset =
2817 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2818
2819 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2820 arrange for the repeat state also to be added to the relevant list.
2821 Calculate the offset, or set -1 for no repeat. */
2822
2823 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2824 *end_subpattern == OP_KETRMIN)?
2825 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2826
2827 /* If we have matched an empty string, add the next state at the
2828 current character pointer. This is important so that the duplicate
2829 checking kicks in, which is what breaks infinite loops that match an
2830 empty string. */
2831
2832 if (charcount == 0)
2833 {
2834 ADD_ACTIVE(next_state_offset, 0);
2835 }
2836
2837 /* Optimization: if there are no more active states, and there
2838 are no new states yet set up, then skip over the subject string
2839 right here, to save looping. Otherwise, set up the new state to swing
2840 into action when the end of the matched substring is reached. */
2841
2842 else if (i + 1 >= active_count && new_count == 0)
2843 {
2844 ptr += charcount;
2845 clen = 0;
2846 ADD_NEW(next_state_offset, 0);
2847
2848 /* If we are adding a repeat state at the new character position,
2849 we must fudge things so that it is the only current state.
2850 Otherwise, it might be a duplicate of one we processed before, and
2851 that would cause it to be skipped. */
2852
2853 if (repeat_state_offset >= 0)
2854 {
2855 next_active_state = active_states;
2856 active_count = 0;
2857 i = -1;
2858 ADD_ACTIVE(repeat_state_offset, 0);
2859 }
2860 }
2861 else
2862 {
2863 const pcre_uchar *p = start_subject + local_offsets[0];
2864 const pcre_uchar *pp = start_subject + local_offsets[1];
2865 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2866 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2867 if (repeat_state_offset >= 0)
2868 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2869 }
2870 }
2871 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2872 }
2873 break;
2874
2875
2876 /* ========================================================================== */
2877 /* Handle callouts */
2878
2879 case OP_CALLOUT:
2880 rrc = 0;
2881 if (pcre_callout != NULL)
2882 {
2883 pcre_callout_block cb;
2884 cb.version = 1; /* Version 1 of the callout block */
2885 cb.callout_number = code[1];
2886 cb.offset_vector = offsets;
2887 cb.subject = (PCRE_SPTR)start_subject;
2888 cb.subject_length = (int)(end_subject - start_subject);
2889 cb.start_match = (int)(current_subject - start_subject);
2890 cb.current_position = (int)(ptr - start_subject);
2891 cb.pattern_position = GET(code, 2);
2892 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2893 cb.capture_top = 1;
2894 cb.capture_last = -1;
2895 cb.callout_data = md->callout_data;
2896 cb.mark = NULL; /* No (*MARK) support */
2897 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2898 }
2899 if (rrc == 0)
2900 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2901 break;
2902
2903
2904 /* ========================================================================== */
2905 default: /* Unsupported opcode */
2906 return PCRE_ERROR_DFA_UITEM;
2907 }
2908
2909 NEXT_ACTIVE_STATE: continue;
2910
2911 } /* End of loop scanning active states */
2912
2913 /* We have finished the processing at the current subject character. If no
2914 new states have been set for the next character, we have found all the
2915 matches that we are going to find. If we are at the top level and partial
2916 matching has been requested, check for appropriate conditions.
2917
2918 The "forced_fail" variable counts the number of (*F) encountered for the
2919 character. If it is equal to the original active_count (saved in
2920 workspace[1]) it means that (*F) was found on every active state. In this
2921 case we don't want to give a partial match.
2922
2923 The "could_continue" variable is true if a state could have continued but
2924 for the fact that the end of the subject was reached. */
2925
2926 if (new_count <= 0)
2927 {
2928 if (rlevel == 1 && /* Top level, and */
2929 could_continue && /* Some could go on */
2930 forced_fail != workspace[1] && /* Not all forced fail & */
2931 ( /* either... */
2932 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2933 || /* or... */
2934 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2935 match_count < 0) /* no matches */
2936 ) && /* And... */
2937 ptr >= end_subject && /* Reached end of subject */
2938 ptr > md->start_used_ptr) /* Inspected non-empty string */
2939 {
2940 if (offsetcount >= 2)
2941 {
2942 offsets[0] = (int)(md->start_used_ptr - start_subject);
2943 offsets[1] = (int)(end_subject - start_subject);
2944 }
2945 match_count = PCRE_ERROR_PARTIAL;
2946 }
2947
2948 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2949 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2950 rlevel*2-2, SP));
2951 break; /* In effect, "return", but see the comment below */
2952 }
2953
2954 /* One or more states are active for the next character. */
2955
2956 ptr += clen; /* Advance to next subject character */
2957 } /* Loop to move along the subject string */
2958
2959 /* Control gets here from "break" a few lines above. We do it this way because
2960 if we use "return" above, we have compiler trouble. Some compilers warn if
2961 there's nothing here because they think the function doesn't return a value. On
2962 the other hand, if we put a dummy statement here, some more clever compilers
2963 complain that it can't be reached. Sigh. */
2964
2965 return match_count;
2966 }
2967
2968
2969
2970
2971 /*************************************************
2972 * Execute a Regular Expression - DFA engine *
2973 *************************************************/
2974
2975 /* This external function applies a compiled re to a subject string using a DFA
2976 engine. This function calls the internal function multiple times if the pattern
2977 is not anchored.
2978
2979 Arguments:
2980 argument_re points to the compiled expression
2981 extra_data points to extra data or is NULL
2982 subject points to the subject string
2983 length length of subject string (may contain binary zeros)
2984 start_offset where to start in the subject string
2985 options option bits
2986 offsets vector of match offsets
2987 offsetcount size of same
2988 workspace workspace vector
2989 wscount size of same
2990
2991 Returns: > 0 => number of match offset pairs placed in offsets
2992 = 0 => offsets overflowed; longest matches are present
2993 -1 => failed to match
2994 < -1 => some kind of unexpected problem
2995 */
2996
2997 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2998 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2999 const char *subject, int length, int start_offset, int options, int *offsets,
3000 int offsetcount, int *workspace, int wscount)
3001 {
3002 real_pcre *re = (real_pcre *)argument_re;
3003 dfa_match_data match_block;
3004 dfa_match_data *md = &match_block;
3005 BOOL utf, anchored, startline, firstline;
3006 const pcre_uchar *current_subject, *end_subject;
3007 const pcre_uint8 *lcc;
3008
3009 pcre_study_data internal_study;
3010 const pcre_study_data *study = NULL;
3011 real_pcre internal_re;
3012
3013 const pcre_uchar *req_char_ptr;
3014 const pcre_uint8 *start_bits = NULL;
3015 BOOL has_first_char = FALSE;
3016 BOOL has_req_char = FALSE;
3017 pcre_uchar first_char = 0;
3018 pcre_uchar first_char2 = 0;
3019 pcre_uchar req_char = 0;
3020 pcre_uchar req_char2 = 0;
3021 int newline;
3022
3023 /* Plausibility checks */
3024
3025 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3026 if (re == NULL || subject == NULL || workspace == NULL ||
3027 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3028 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3029 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3030 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3031
3032 /* We need to find the pointer to any study data before we test for byte
3033 flipping, so we scan the extra_data block first. This may set two fields in the
3034 match block, so we must initialize them beforehand. However, the other fields
3035 in the match block must not be set until after the byte flipping. */
3036
3037 md->tables = re->tables;
3038 md->callout_data = NULL;
3039
3040 if (extra_data != NULL)
3041 {
3042 unsigned int flags = extra_data->flags;
3043 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3044 study = (const pcre_study_data *)extra_data->study_data;
3045 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3046 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3047 return PCRE_ERROR_DFA_UMLIMIT;
3048 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3049 md->callout_data = extra_data->callout_data;
3050 if ((flags & PCRE_EXTRA_TABLES) != 0)
3051 md->tables = extra_data->tables;
3052 }
3053
3054 /* Check that the first field in the block is the magic number. If it is not,
3055 test for a regex that was compiled on a host of opposite endianness. If this is
3056 the case, flipped values are put in internal_re and internal_study if there was
3057 study data too. */
3058
3059 if (re->magic_number != MAGIC_NUMBER)
3060 {
3061 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
3062 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3063 if (study != NULL) study = &internal_study;
3064 }
3065
3066 /* Set some local values */
3067
3068 current_subject = (const unsigned char *)subject + start_offset;
3069 end_subject = (const unsigned char *)subject + length;
3070 req_char_ptr = current_subject - 1;
3071
3072 #ifdef SUPPORT_UTF8
3073 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3074 utf = (re->options & PCRE_UTF8) != 0;
3075 #else
3076 utf = FALSE;
3077 #endif
3078
3079 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3080 (re->options & PCRE_ANCHORED) != 0;
3081
3082 /* The remaining fixed data for passing around. */
3083
3084 md->start_code = (const pcre_uchar *)argument_re +
3085 re->name_table_offset + re->name_count * re->name_entry_size;
3086 md->start_subject = (const unsigned char *)subject;
3087 md->end_subject = end_subject;
3088 md->start_offset = start_offset;
3089 md->moptions = options;
3090 md->poptions = re->options;
3091
3092 /* If the BSR option is not set at match time, copy what was set
3093 at compile time. */
3094
3095 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3096 {
3097 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3098 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3099 #ifdef BSR_ANYCRLF
3100 else md->moptions |= PCRE_BSR_ANYCRLF;
3101 #endif
3102 }
3103
3104 /* Handle different types of newline. The three bits give eight cases. If
3105 nothing is set at run time, whatever was used at compile time applies. */
3106
3107 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3108 PCRE_NEWLINE_BITS)
3109 {
3110 case 0: newline = NEWLINE; break; /* Compile-time default */
3111 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3112 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3113 case PCRE_NEWLINE_CR+
3114 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3115 case PCRE_NEWLINE_ANY: newline = -1; break;
3116 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3117 default: return PCRE_ERROR_BADNEWLINE;
3118 }
3119
3120 if (newline == -2)
3121 {
3122 md->nltype = NLTYPE_ANYCRLF;
3123 }
3124 else if (newline < 0)
3125 {
3126 md->nltype = NLTYPE_ANY;
3127 }
3128 else
3129 {
3130 md->nltype = NLTYPE_FIXED;
3131 if (newline > 255)
3132 {
3133 md->nllen = 2;
3134 md->nl[0] = (newline >> 8) & 255;
3135 md->nl[1] = newline & 255;
3136 }
3137 else
3138 {
3139 md->nllen = 1;
3140 md->nl[0] = newline;
3141 }
3142 }
3143
3144 /* Check a UTF string if required. If a results vector is available, pass
3145 back the character offset and error code for an invalid string. */
3146
3147 #ifdef SUPPORT_UTF
3148 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3149 {
3150 int erroroffset;
3151 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3152 if (errorcode != 0)
3153 {
3154 if (offsetcount >= 2)
3155 {
3156 offsets[0] = erroroffset;
3157 offsets[1] = errorcode;
3158 }
3159 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3160 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3161 }
3162 if (start_offset > 0 && start_offset < length &&
3163 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3164 return PCRE_ERROR_BADUTF8_OFFSET;
3165 }
3166 #endif
3167
3168 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3169 is a feature that makes it possible to save compiled regex and re-use them
3170 in other programs later. */
3171
3172 if (md->tables == NULL) md->tables = PRIV(default_tables);
3173
3174 /* The lower casing table and the "must be at the start of a line" flag are
3175 used in a loop when finding where to start. */
3176
3177 lcc = md->tables + lcc_offset;
3178 startline = (re->flags & PCRE_STARTLINE) != 0;
3179 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3180
3181 /* Set up the first character to match, if available. The first_char value is
3182 never set for an anchored regular expression, but the anchoring may be forced
3183 at run time, so we have to test for anchoring. The first char may be unset for
3184 an unanchored pattern, of course. If there's no first char and the pattern was
3185 studied, there may be a bitmap of possible first characters. */
3186
3187 if (!anchored)
3188 {
3189 if ((re->flags & PCRE_FIRSTSET) != 0)
3190 {
3191 has_first_char = TRUE;
3192 first_char = first_char2 = re->first_char;
3193 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3194 {
3195 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3196 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3197 if (utf && first_char > 127)
3198 first_char2 = UCD_OTHERCASE(first_char);
3199 #endif
3200 }
3201 }
3202 else
3203 {
3204 if (!startline && study != NULL &&
3205 (study->flags & PCRE_STUDY_MAPPED) != 0)
3206 start_bits = study->start_bits;
3207 }
3208 }
3209
3210 /* For anchored or unanchored matches, there may be a "last known required
3211 character" set. */
3212
3213 if ((re->flags & PCRE_REQCHSET) != 0)
3214 {
3215 has_req_char = TRUE;
3216 req_char = req_char2 = re->req_char;
3217 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3218 {
3219 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3220 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3221 if (utf && req_char > 127)
3222 req_char2 = UCD_OTHERCASE(req_char);
3223 #endif
3224 }
3225 }
3226
3227 /* Call the main matching function, looping for a non-anchored regex after a
3228 failed match. If not restarting, perform certain optimizations at the start of
3229 a match. */
3230
3231 for (;;)
3232 {
3233 int rc;
3234
3235 if ((options & PCRE_DFA_RESTART) == 0)
3236 {
3237 const pcre_uchar *save_end_subject = end_subject;
3238
3239 /* If firstline is TRUE, the start of the match is constrained to the first
3240 line of a multiline string. Implement this by temporarily adjusting
3241 end_subject so that we stop scanning at a newline. If the match fails at
3242 the newline, later code breaks this loop. */
3243
3244 if (firstline)
3245 {
3246 PCRE_PUCHAR t = current_subject;
3247 #ifdef SUPPORT_UTF
3248 if (utf)
3249 {
3250 while (t < md->end_subject && !IS_NEWLINE(t))
3251 {
3252 t++;
3253 ACROSSCHAR(t < end_subject, *t, t++);
3254 }
3255 }
3256 else
3257 #endif
3258 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3259 end_subject = t;
3260 }
3261
3262 /* There are some optimizations that avoid running the match if a known
3263 starting point is not found. However, there is an option that disables
3264 these, for testing and for ensuring that all callouts do actually occur.
3265 The option can be set in the regex by (*NO_START_OPT) or passed in
3266 match-time options. */
3267
3268 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3269 {
3270 /* Advance to a known first char. */
3271
3272 if (has_first_char)
3273 {
3274 if (first_char != first_char2)
3275 while (current_subject < end_subject &&
3276 *current_subject != first_char && *current_subject != first_char2)
3277 current_subject++;
3278 else
3279 while (current_subject < end_subject &&
3280 *current_subject != first_char)
3281 current_subject++;
3282 }
3283
3284 /* Or to just after a linebreak for a multiline match if possible */
3285
3286 else if (startline)
3287 {
3288 if (current_subject > md->start_subject + start_offset)
3289 {
3290 #ifdef SUPPORT_UTF
3291 if (utf)
3292 {
3293 while (current_subject < end_subject &&
3294 !WAS_NEWLINE(current_subject))
3295 {
3296 current_subject++;
3297 ACROSSCHAR(current_subject < end_subject, *current_subject,
3298 current_subject++);
3299 }
3300 }
3301 else
3302 #endif
3303 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3304 current_subject++;
3305
3306 /* If we have just passed a CR and the newline option is ANY or
3307 ANYCRLF, and we are now at a LF, advance the match position by one
3308 more character. */
3309
3310 if (current_subject[-1] == CHAR_CR &&
3311 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3312 current_subject < end_subject &&
3313 *current_subject == CHAR_NL)
3314 current_subject++;
3315 }
3316 }
3317
3318 /* Or to a non-unique first char after study */
3319
3320 else if (start_bits != NULL)
3321 {
3322 while (current_subject < end_subject)
3323 {
3324 register unsigned int c = *current_subject;
3325 #ifndef COMPILE_PCRE8
3326 if (c > 255) c = 255;
3327 #endif
3328 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3329 {
3330 current_subject++;
3331 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3332 /* In non 8-bit mode, the iteration will stop for
3333 characters > 255 at the beginning or not stop at all. */
3334 if (utf)
3335 ACROSSCHAR(current_subject < end_subject, *current_subject,
3336 current_subject++);
3337 #endif
3338 }
3339 else break;
3340 }
3341 }
3342 }
3343
3344 /* Restore fudged end_subject */
3345
3346 end_subject = save_end_subject;
3347
3348 /* The following two optimizations are disabled for partial matching or if
3349 disabling is explicitly requested (and of course, by the test above, this
3350 code is not obeyed when restarting after a partial match). */
3351
3352 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3353 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3354 {
3355 /* If the pattern was studied, a minimum subject length may be set. This
3356 is a lower bound; no actual string of that length may actually match the
3357 pattern. Although the value is, strictly, in characters, we treat it as
3358 bytes to avoid spending too much time in this optimization. */
3359
3360 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3361 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3362 return PCRE_ERROR_NOMATCH;
3363
3364 /* If req_char is set, we know that that character must appear in the
3365 subject for the match to succeed. If the first character is set, req_char
3366 must be later in the subject; otherwise the test starts at the match
3367 point. This optimization can save a huge amount of work in patterns with
3368 nested unlimited repeats that aren't going to match. Writing separate
3369 code for cased/caseless versions makes it go faster, as does using an
3370 autoincrement and backing off on a match.
3371
3372 HOWEVER: when the subject string is very, very long, searching to its end
3373 can take a long time, and give bad performance on quite ordinary
3374 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3375 string... so we don't do this when the string is sufficiently long. */
3376
3377 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3378 {
3379 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3380
3381 /* We don't need to repeat the search if we haven't yet reached the
3382 place we found it at last time. */
3383
3384 if (p > req_char_ptr)
3385 {
3386 if (req_char != req_char2)
3387 {
3388 while (p < end_subject)
3389 {
3390 register int pp = *p++;
3391 if (pp == req_char || pp == req_char2) { p--; break; }
3392 }
3393 }
3394 else
3395 {
3396 while (p < end_subject)
3397 {
3398 if (*p++ == req_char) { p--; break; }
3399 }
3400 }
3401
3402 /* If we can't find the required character, break the matching loop,
3403 which will cause a return of PCRE_ERROR_NOMATCH. */
3404
3405 if (p >= end_subject) break;
3406
3407 /* If we have found the required character, save the point where we
3408 found it, so that we don't search again next time round the loop if
3409 the start hasn't passed this character yet. */
3410
3411 req_char_ptr = p;
3412 }
3413 }
3414 }
3415 } /* End of optimizations that are done when not restarting */
3416
3417 /* OK, now we can do the business */
3418
3419 md->start_used_ptr = current_subject;
3420 md->recursive = NULL;
3421
3422 rc = internal_dfa_exec(
3423 md, /* fixed match data */
3424 md->start_code, /* this subexpression's code */
3425 current_subject, /* where we currently are */
3426 start_offset, /* start offset in subject */
3427 offsets, /* offset vector */
3428 offsetcount, /* size of same */
3429 workspace, /* workspace vector */
3430 wscount, /* size of same */
3431 0); /* function recurse level */
3432
3433 /* Anything other than "no match" means we are done, always; otherwise, carry
3434 on only if not anchored. */
3435
3436 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3437
3438 /* Advance to the next subject character unless we are at the end of a line
3439 and firstline is set. */
3440
3441 if (firstline && IS_NEWLINE(current_subject)) break;
3442 current_subject++;
3443 #ifdef SUPPORT_UTF
3444 if (utf)
3445 {
3446 ACROSSCHAR(current_subject < end_subject, *current_subject,
3447 current_subject++);
3448 }
3449 #endif
3450 if (current_subject > end_subject) break;
3451
3452 /* If we have just passed a CR and we are now at a LF, and the pattern does
3453 not contain any explicit matches for \r or \n, and the newline option is CRLF
3454 or ANY or ANYCRLF, advance the match position by one more character. */
3455
3456 if (current_subject[-1] == CHAR_CR &&
3457 current_subject < end_subject &&
3458 *current_subject == CHAR_NL &&
3459 (re->flags & PCRE_HASCRORLF) == 0 &&
3460 (md->nltype == NLTYPE_ANY ||
3461 md->nltype == NLTYPE_ANYCRLF ||
3462 md->nllen == 2))
3463 current_subject++;
3464
3465 } /* "Bumpalong" loop */
3466
3467 return PCRE_ERROR_NOMATCH;
3468 }
3469
3470 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5