/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory | Revision Log


Revision 1376 - (show annotations)
Sat Oct 12 18:02:11 2013 UTC (6 years ago) by ph10
File MIME type: text/plain
File size: 125911 byte(s)
Error occurred while calculating annotation data.
Add U+0085 and U+180E to what \s matches in UCP mode, to match Perl.
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2013 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
51
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
60
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
69
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
72
73
74
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
78
79 #define NLBLOCK md /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
82
83 #include "pcre_internal.h"
84
85
86 /* For use to indent debugging output */
87
88 #define SP " "
89
90
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
94
95 /* These are offsets that are added to the OP_TYPESTAR and friends opcodes to
96 turn them into others, when the type argument is one of the special items named
97 below. A gap of 20 between the blocks should be enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
99
100 #define OP_PROP_EXTRA 300 /* added when the argument is OP_NOTPROP or OP_PROP */
101 #define OP_EXTUNI_EXTRA 320 /* added when the argument is OP_EXTUNI */
102 #define OP_ANYNL_EXTRA 340 /* added when the argument is OP_ANYNL */
103 #define OP_HSPACE_EXTRA 360 /* added when the argument is OP_NOT_HSPACE or OP_HSPACE */
104 #define OP_VSPACE_EXTRA 380 /* added when the argument is OP_NOT_VSPACE or OP_VSPACE */
105
106
107 /* coptable[] is indexed by opcode. A non-zero entry is the offset from the
108 opcode at which a following character that is to be tested in some way is to
109 be found; this makes it possible to centralize the loading of these characters.
110 A zero entry means that no character is loaded for that opcode. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
114
115 static const pcre_uint8 coptable[] = {
116 0, /* End */
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
120 0, 0, /* \P, \p */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 0, /* \X */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
124 1, /* Char */
125 1, /* Chari */
126 1, /* not */
127 1, /* noti */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131 1+IMM2_SIZE, /* exact */
132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135 1+IMM2_SIZE, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140 1+IMM2_SIZE, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144 1+IMM2_SIZE, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149 1+IMM2_SIZE, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
154 0, /* CLASS */
155 0, /* NCLASS */
156 0, /* XCLASS - variable length */
157 0, /* REF */
158 0, /* REFI */
159 0, /* DNREF */
160 0, /* DNREFI */
161 0, /* RECURSE */
162 0, /* CALLOUT */
163 0, /* Alt */
164 0, /* Ket */
165 0, /* KetRmax */
166 0, /* KetRmin */
167 0, /* KetRpos */
168 0, /* Reverse */
169 0, /* Assert */
170 0, /* Assert not */
171 0, /* Assert behind */
172 0, /* Assert behind not */
173 0, 0, /* ONCE, ONCE_NC */
174 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
175 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
176 0, 0, /* CREF, DNCREF */
177 0, 0, /* RREF, DNRREF */
178 0, /* DEF */
179 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
180 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
181 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
182 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
183 0, 0 /* CLOSE, SKIPZERO */
184 }; /* sizeof(coptable) is compared with OP_TABLE_LENGTH by a case label in internal_dfa_exec */
185
186 /* poptable[] is indexed by opcode; a non-zero entry identifies an opcode that
187 inspects a character. It is used to remember the fact that a character could
188 have been inspected when the end of the subject is reached (the could_continue
189 flag). ***NOTE*** If the start of this table is modified, the two tables that follow must also be modified. */
190
191 static const pcre_uint8 poptable[] = {
192 0, /* End */
193 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
194 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
195 1, 1, 1, /* Any, AllAny, Anybyte */
196 1, 1, /* \P, \p */
197 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
198 1, /* \X */
199 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
200 1, /* Char */
201 1, /* Chari */
202 1, /* not */
203 1, /* noti */
204 /* Positive single-char repeats */
205 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
206 1, 1, 1, /* upto, minupto, exact */
207 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
208 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
209 1, 1, 1, /* upto I, minupto I, exact I */
210 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
211 /* Negative single-char repeats - only for chars < 256 */
212 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
213 1, 1, 1, /* NOT upto, minupto, exact */
214 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
215 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
216 1, 1, 1, /* NOT upto I, minupto I, exact I */
217 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
218 /* Positive type repeats */
219 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
220 1, 1, 1, /* Type upto, minupto, exact */
221 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
222 /* Character class & ref repeats */
223 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
224 1, 1, /* CRRANGE, CRMINRANGE */
225 1, /* CLASS */
226 1, /* NCLASS */
227 1, /* XCLASS - variable length */
228 0, /* REF */
229 0, /* REFI */
230 0, /* DNREF */
231 0, /* DNREFI */
232 0, /* RECURSE */
233 0, /* CALLOUT */
234 0, /* Alt */
235 0, /* Ket */
236 0, /* KetRmax */
237 0, /* KetRmin */
238 0, /* KetRpos */
239 0, /* Reverse */
240 0, /* Assert */
241 0, /* Assert not */
242 0, /* Assert behind */
243 0, /* Assert behind not */
244 0, 0, /* ONCE, ONCE_NC */
245 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
246 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
247 0, 0, /* CREF, DNCREF */
248 0, 0, /* RREF, DNRREF */
249 0, /* DEF */
250 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
251 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
252 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
253 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
254 0, 0 /* CLOSE, SKIPZERO */
255 }; /* sizeof(poptable) is compared with OP_TABLE_LENGTH by a case label in internal_dfa_exec */
256
257 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
258 and \w. NOTE(review): the code that indexes them is not in this part of the file; rows appear to follow opcode order - confirm. */
259
260 static const pcre_uint8 toptable1[] = {
261 0, 0, 0, 0, 0, 0, /* six leading zero entries (presumably End, \A, \G, \K, \B, \b - confirm) */
262 ctype_digit, ctype_digit, /* presumably \D, \d - confirm */
263 ctype_space, ctype_space, /* presumably \S, \s - confirm */
264 ctype_word, ctype_word, /* presumably \W, \w - confirm */
265 0, 0 /* OP_ANY, OP_ALLANY */
266 };
267
268 static const pcre_uint8 toptable2[] = {
269 0, 0, 0, 0, 0, 0, /* same row layout as toptable1 */
270 ctype_digit, 0, /* differs from toptable1 only in the second entry of each pair */
271 ctype_space, 0,
272 ctype_word, 0,
273 1, 1 /* OP_ANY, OP_ALLANY */
274 };
275
276
277 /* Structure for holding data about a particular state, which is in effect the
278 current data for an active path through the match tree. It must consist
279 entirely of ints because the working vector we are passed, and which we put
280 these structures in, is a vector of ints. */
281
282 typedef struct stateblock {
283 int offset; /* Offset of the opcode from start_code; negative means "hold off" this state */
284 int count; /* Count for repeats */
285 int data; /* Extra data for some states, e.g. characters left to skip when offset < 0 */
286 } stateblock;
287
288 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int)) /* ints occupied by one stateblock */
289
290
291 #ifdef PCRE_DEBUG
292 /*************************************************
293 * Print character string *
294 *************************************************/
295
296 /* Character string printing function for debugging. Values for which
297 isprint() is true are written as-is; all others as a \x{..} hex escape.
298 Arguments:
299 p points to string
300 length number of pcre_uchar units to print (one is consumed per iteration)
301 f where to print
302
303 Returns: nothing
304 */
305
306 static void
307 pchars(const pcre_uchar *p, int length, FILE *f)
308 {
309 pcre_uint32 c;
310 while (length-- > 0)
311 {
312 if ((c = *(p++)) < 256 && isprint((int)c)) /* isprint() is only defined for unsigned char values and EOF */
313 fprintf(f, "%c", (int)c); /* %c takes an int */
314 else
315 fprintf(f, "\\x{%02x}", (unsigned int)c); /* %x takes an unsigned int */
316 }
317 }
318 #endif
319
320
321
322 /*************************************************
323 * Execute a Regular Expression - DFA engine *
324 *************************************************/
325
326 /* This internal function applies a compiled pattern to a subject string,
327 starting at a given point, using a DFA engine. This function is called from the
328 external one, possibly multiple times if the pattern is not anchored. The
329 function calls itself recursively for some kinds of subpattern.
330
331 Arguments:
332 md the match_data block with fixed information
333 this_start_code the opening bracket of this subexpression's code
334 current_subject where we currently are in the subject string
335 start_offset start offset in the subject string
336 offsets vector to contain the matching string offsets
337 offsetcount size of same
338 workspace vector of workspace
339 wscount size of same
340 rlevel function call recursion level
341
342 Returns: > 0 => number of match offset pairs placed in offsets
343 = 0 => offsets overflowed; longest matches are present
344 -1 => failed to match
345 < -1 => some kind of unexpected problem
346
347 The following macros are used for adding states to the two state vectors (one
348 for the current character, one for the following character). */
349
350 #define ADD_ACTIVE(x,y) /* append a state with offset (x), count (y) to the active list */ \
351 if (active_count++ < wscount) /* the counter is incremented even when the list is full */ \
352 { \
353 next_active_state->offset = (x); \
354 next_active_state->count = (y); /* the data field is left as it was */ \
355 next_active_state++; \
356 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
357 } \
358 else return PCRE_ERROR_DFA_WSSIZE /* no room: returns from the function that uses the macro */
359
360 #define ADD_ACTIVE_DATA(x,y,z) /* as ADD_ACTIVE, and also sets the data field to (z) */ \
361 if (active_count++ < wscount) \
362 { \
363 next_active_state->offset = (x); \
364 next_active_state->count = (y); \
365 next_active_state->data = (z); \
366 next_active_state++; \
367 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
368 } \
369 else return PCRE_ERROR_DFA_WSSIZE /* no room: returns from the function that uses the macro */
370
371 #define ADD_NEW(x,y) /* append a state with offset (x), count (y) to the new-state list */ \
372 if (new_count++ < wscount) /* the counter is incremented even when the list is full */ \
373 { \
374 next_new_state->offset = (x); \
375 next_new_state->count = (y); /* the data field is left as it was */ \
376 next_new_state++; \
377 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
378 } \
379 else return PCRE_ERROR_DFA_WSSIZE /* no room: returns from the function that uses the macro */
380
381 #define ADD_NEW_DATA(x,y,z) /* as ADD_NEW, and also sets the data field to (z) */ \
382 if (new_count++ < wscount) \
383 { \
384 next_new_state->offset = (x); \
385 next_new_state->count = (y); \
386 next_new_state->data = (z); \
387 next_new_state++; \
388 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
389 (x), (y), (z), __LINE__)); \
390 } \
391 else return PCRE_ERROR_DFA_WSSIZE /* no room: returns from the function that uses the macro */
392
393 /* And now, here is the code */
394
395 static int
396 internal_dfa_exec(
397 dfa_match_data *md,
398 const pcre_uchar *this_start_code,
399 const pcre_uchar *current_subject,
400 int start_offset,
401 int *offsets,
402 int offsetcount,
403 int *workspace,
404 int wscount,
405 int rlevel)
406 {
407 stateblock *active_states, *new_states, *temp_states;
408 stateblock *next_active_state, *next_new_state;
409
410 const pcre_uint8 *ctypes, *lcc, *fcc;
411 const pcre_uchar *ptr;
412 const pcre_uchar *end_code, *first_op;
413
414 dfa_recursion_info new_recursive;
415
416 int active_count, new_count, match_count;
417
418 /* Some fields in the md block are frequently referenced, so we load them into
419 independent variables in the hope that this will perform better. */
420
421 const pcre_uchar *start_subject = md->start_subject;
422 const pcre_uchar *end_subject = md->end_subject;
423 const pcre_uchar *start_code = md->start_code;
424
425 #ifdef SUPPORT_UTF
426 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
427 #else
428 BOOL utf = FALSE;
429 #endif
430
431 BOOL reset_could_continue = FALSE;
432
433 rlevel++;
434 offsetcount &= (-2);
435
436 wscount -= 2;
437 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
438 (2 * INTS_PER_STATEBLOCK);
439
440 DPRINTF(("\n%.*s---------------------\n"
441 "%.*sCall to internal_dfa_exec f=%d\n",
442 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
443
444 ctypes = md->tables + ctypes_offset;
445 lcc = md->tables + lcc_offset;
446 fcc = md->tables + fcc_offset;
447
448 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
449
450 active_states = (stateblock *)(workspace + 2);
451 next_new_state = new_states = active_states + wscount;
452 new_count = 0;
453
454 first_op = this_start_code + 1 + LINK_SIZE +
455 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
456 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
457 ? IMM2_SIZE:0);
458
459 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
460 the alternative states onto the list, and find out where the end is. This
461 makes it possible to use this function recursively, when we want to stop at a
462 matching internal ket rather than at the end.
463
464 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
465 a backward assertion. In that case, we have to find out the maximum amount to
466 move back, and set up each alternative appropriately. */
467
468 if (*first_op == OP_REVERSE)
469 {
470 int max_back = 0;
471 int gone_back;
472
473 end_code = this_start_code;
474 do
475 {
476 int back = GET(end_code, 2+LINK_SIZE);
477 if (back > max_back) max_back = back;
478 end_code += GET(end_code, 1);
479 }
480 while (*end_code == OP_ALT);
481
482 /* If we can't go back the amount required for the longest lookbehind
483 pattern, go back as far as we can; some alternatives may still be viable. */
484
485 #ifdef SUPPORT_UTF
486 /* In character mode we have to step back character by character */
487
488 if (utf)
489 {
490 for (gone_back = 0; gone_back < max_back; gone_back++)
491 {
492 if (current_subject <= start_subject) break;
493 current_subject--;
494 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
495 }
496 }
497 else
498 #endif
499
500 /* In byte-mode we can do this quickly. */
501
502 {
503 gone_back = (current_subject - max_back < start_subject)?
504 (int)(current_subject - start_subject) : max_back;
505 current_subject -= gone_back;
506 }
507
508 /* Save the earliest consulted character */
509
510 if (current_subject < md->start_used_ptr)
511 md->start_used_ptr = current_subject;
512
513 /* Now we can process the individual branches. */
514
515 end_code = this_start_code;
516 do
517 {
518 int back = GET(end_code, 2+LINK_SIZE);
519 if (back <= gone_back)
520 {
521 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
522 ADD_NEW_DATA(-bstate, 0, gone_back - back);
523 }
524 end_code += GET(end_code, 1);
525 }
526 while (*end_code == OP_ALT);
527 }
528
529 /* This is the code for a "normal" subpattern (not a backward assertion). The
530 start of a whole pattern is always one of these. If we are at the top level,
531 we may be asked to restart matching from the same point that we reached for a
532 previous partial match. We still have to scan through the top-level branches to
533 find the end state. */
534
535 else
536 {
537 end_code = this_start_code;
538
539 /* Restarting */
540
541 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
542 {
543 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
544 new_count = workspace[1];
545 if (!workspace[0])
546 memcpy(new_states, active_states, new_count * sizeof(stateblock));
547 }
548
549 /* Not restarting */
550
551 else
552 {
553 int length = 1 + LINK_SIZE +
554 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
555 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
556 ? IMM2_SIZE:0);
557 do
558 {
559 ADD_NEW((int)(end_code - start_code + length), 0);
560 end_code += GET(end_code, 1);
561 length = 1 + LINK_SIZE;
562 }
563 while (*end_code == OP_ALT);
564 }
565 }
566
567 workspace[0] = 0; /* Bit indicating which vector is current */
568
569 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
570
571 /* Loop for scanning the subject */
572
573 ptr = current_subject;
574 for (;;)
575 {
576 int i, j;
577 int clen, dlen;
578 pcre_uint32 c, d;
579 int forced_fail = 0;
580 BOOL partial_newline = FALSE;
581 BOOL could_continue = reset_could_continue;
582 reset_could_continue = FALSE;
583
584 /* Make the new state list into the active state list and empty the
585 new state list. */
586
587 temp_states = active_states;
588 active_states = new_states;
589 new_states = temp_states;
590 active_count = new_count;
591 new_count = 0;
592
593 workspace[0] ^= 1; /* Remember for the restarting feature */
594 workspace[1] = active_count;
595
596 #ifdef PCRE_DEBUG
597 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
598 pchars(ptr, STRLEN_UC(ptr), stdout);
599 printf("\"\n");
600
601 printf("%.*sActive states: ", rlevel*2-2, SP);
602 for (i = 0; i < active_count; i++)
603 printf("%d/%d ", active_states[i].offset, active_states[i].count);
604 printf("\n");
605 #endif
606
607 /* Set the pointers for adding new states */
608
609 next_active_state = active_states + active_count;
610 next_new_state = new_states;
611
612 /* Load the current character from the subject outside the loop, as many
613 different states may want to look at it, and we assume that at least one
614 will. */
615
616 if (ptr < end_subject)
617 {
618 clen = 1; /* Number of data items in the character */
619 #ifdef SUPPORT_UTF
620 GETCHARLENTEST(c, ptr, clen);
621 #else
622 c = *ptr;
623 #endif /* SUPPORT_UTF */
624 }
625 else
626 {
627 clen = 0; /* This indicates the end of the subject */
628 c = NOTACHAR; /* This value should never actually be used */
629 }
630
631 /* Scan up the active states and act on each one. The result of an action
632 may be to add more states to the currently active list (e.g. on hitting a
633 parenthesis) or it may be to put states on the new list, for considering
634 when we move the character pointer on. */
635
636 for (i = 0; i < active_count; i++)
637 {
638 stateblock *current_state = active_states + i;
639 BOOL caseless = FALSE;
640 const pcre_uchar *code;
641 int state_offset = current_state->offset;
642 int codevalue, rrc;
643 int count;
644
645 #ifdef PCRE_DEBUG
646 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
647 if (clen == 0) printf("EOL\n");
648 else if (c > 32 && c < 127) printf("'%c'\n", c);
649 else printf("0x%02x\n", c);
650 #endif
651
652 /* A negative offset is a special case meaning "hold off going to this
653 (negated) state until the number of characters in the data field have
654 been skipped". If the could_continue flag was passed over from a previous
655 state, arrange for it to be passed on. */
656
657 if (state_offset < 0)
658 {
659 if (current_state->data > 0)
660 {
661 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
662 ADD_NEW_DATA(state_offset, current_state->count,
663 current_state->data - 1);
664 if (could_continue) reset_could_continue = TRUE;
665 continue;
666 }
667 else
668 {
669 current_state->offset = state_offset = -state_offset;
670 }
671 }
672
673 /* Check for a duplicate state with the same count, and skip if found.
674 See the note at the head of this module about the possibility of improving
675 performance here. */
676
677 for (j = 0; j < i; j++)
678 {
679 if (active_states[j].offset == state_offset &&
680 active_states[j].count == current_state->count)
681 {
682 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
683 goto NEXT_ACTIVE_STATE;
684 }
685 }
686
687 /* The state offset is the offset to the opcode */
688
689 code = start_code + state_offset;
690 codevalue = *code;
691
692 /* If this opcode inspects a character, but we are at the end of the
693 subject, remember the fact for use when testing for a partial match. */
694
695 if (clen == 0 && poptable[codevalue] != 0)
696 could_continue = TRUE;
697
698 /* If this opcode is followed by an inline character, load it. It is
699 tempting to test for the presence of a subject character here, but that
700 is wrong, because sometimes zero repetitions of the subject are
701 permitted.
702
703 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
704 argument that is not a data character - but is always one byte long because
705 the values are small. We have to take special action to deal with \P, \p,
706 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
707 these ones to new opcodes. */
708
709 if (coptable[codevalue] > 0)
710 {
711 dlen = 1;
712 #ifdef SUPPORT_UTF
713 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
714 #endif /* SUPPORT_UTF */
715 d = code[coptable[codevalue]];
716 if (codevalue >= OP_TYPESTAR)
717 {
718 switch(d)
719 {
720 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
721 case OP_NOTPROP:
722 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
723 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
724 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
725 case OP_NOT_HSPACE:
726 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
727 case OP_NOT_VSPACE:
728 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
729 default: break;
730 }
731 }
732 }
733 else
734 {
735 dlen = 0; /* Not strictly necessary, but compilers moan */
736 d = NOTACHAR; /* if these variables are not set. */
737 }
738
739
740 /* Now process the individual opcodes */
741
742 switch (codevalue)
743 {
744 /* ========================================================================== */
745 /* These cases are never obeyed. This is a fudge that causes a compile-
746 time error if the vectors coptable or poptable, which are indexed by
747 opcode, are not the correct length. It seems to be the only way to do
748 such a check at compile time, as the sizeof() operator does not work
749 in the C preprocessor. */
750
751 case OP_TABLE_LENGTH:
752 case OP_TABLE_LENGTH +
753 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
754 (sizeof(poptable) == OP_TABLE_LENGTH)):
755 break;
756
757 /* ========================================================================== */
758 /* Reached a closing bracket. If not at the end of the pattern, carry
759 on with the next opcode. For repeating opcodes, also add the repeat
760 state. Note that KETRPOS will always be encountered at the end of the
761 subpattern, because the possessive subpattern repeats are always handled
762 using recursive calls. Thus, it never adds any new states.
763
764 At the end of the (sub)pattern, unless we have an empty string and
765 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
766 start of the subject, save the match data, shifting up all previous
767 matches so we always have the longest first. */
768
769 case OP_KET:
770 case OP_KETRMIN:
771 case OP_KETRMAX:
772 case OP_KETRPOS:
773 if (code != end_code)
774 {
775 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
776 if (codevalue != OP_KET)
777 {
778 ADD_ACTIVE(state_offset - GET(code, 1), 0);
779 }
780 }
781 else
782 {
783 if (ptr > current_subject ||
784 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
785 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
786 current_subject > start_subject + md->start_offset)))
787 {
788 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
789 else if (match_count > 0 && ++match_count * 2 > offsetcount)
790 match_count = 0;
791 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
792 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
793 if (offsetcount >= 2)
794 {
795 offsets[0] = (int)(current_subject - start_subject);
796 offsets[1] = (int)(ptr - start_subject);
797 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
798 offsets[1] - offsets[0], (char *)current_subject));
799 }
800 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
801 {
802 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
803 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
804 match_count, rlevel*2-2, SP));
805 return match_count;
806 }
807 }
808 }
809 break;
810
811 /* ========================================================================== */
812 /* These opcodes add to the current list of states without looking
813 at the current character. */
814
815 /*-----------------------------------------------------------------*/
816 case OP_ALT:
817 do { code += GET(code, 1); } while (*code == OP_ALT);
818 ADD_ACTIVE((int)(code - start_code), 0);
819 break;
820
821 /*-----------------------------------------------------------------*/
822 case OP_BRA:
823 case OP_SBRA:
824 do
825 {
826 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
827 code += GET(code, 1);
828 }
829 while (*code == OP_ALT);
830 break;
831
832 /*-----------------------------------------------------------------*/
833 case OP_CBRA:
834 case OP_SCBRA:
835 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
836 code += GET(code, 1);
837 while (*code == OP_ALT)
838 {
839 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
840 code += GET(code, 1);
841 }
842 break;
843
844 /*-----------------------------------------------------------------*/
845 case OP_BRAZERO:
846 case OP_BRAMINZERO:
847 ADD_ACTIVE(state_offset + 1, 0);
848 code += 1 + GET(code, 2);
849 while (*code == OP_ALT) code += GET(code, 1);
850 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
851 break;
852
853 /*-----------------------------------------------------------------*/
854 case OP_SKIPZERO:
855 code += 1 + GET(code, 2);
856 while (*code == OP_ALT) code += GET(code, 1);
857 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
858 break;
859
860 /*-----------------------------------------------------------------*/
861 case OP_CIRC:
862 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
863 { ADD_ACTIVE(state_offset + 1, 0); }
864 break;
865
866 /*-----------------------------------------------------------------*/
867 case OP_CIRCM:
868 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
869 (ptr != end_subject && WAS_NEWLINE(ptr)))
870 { ADD_ACTIVE(state_offset + 1, 0); }
871 break;
872
873 /*-----------------------------------------------------------------*/
874 case OP_EOD:
875 if (ptr >= end_subject)
876 {
877 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
878 could_continue = TRUE;
879 else { ADD_ACTIVE(state_offset + 1, 0); }
880 }
881 break;
882
883 /*-----------------------------------------------------------------*/
884 case OP_SOD:
885 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
886 break;
887
888 /*-----------------------------------------------------------------*/
889 case OP_SOM:
890 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
891 break;
892
893
894 /* ========================================================================== */
895 /* These opcodes inspect the next subject character, and sometimes
896 the previous one as well, but do not have an argument. The variable
897 clen contains the length of the current character and is zero if we are
898 at the end of the subject. */
899
900 /*-----------------------------------------------------------------*/
901 case OP_ANY:
902 if (clen > 0 && !IS_NEWLINE(ptr))
903 {
904 if (ptr + 1 >= md->end_subject &&
905 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
906 NLBLOCK->nltype == NLTYPE_FIXED &&
907 NLBLOCK->nllen == 2 &&
908 c == NLBLOCK->nl[0])
909 {
910 could_continue = partial_newline = TRUE;
911 }
912 else
913 {
914 ADD_NEW(state_offset + 1, 0);
915 }
916 }
917 break;
918
919 /*-----------------------------------------------------------------*/
920 case OP_ALLANY:
921 if (clen > 0)
922 { ADD_NEW(state_offset + 1, 0); }
923 break;
924
925 /*-----------------------------------------------------------------*/
926 case OP_EODN:
927 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
928 could_continue = TRUE;
929 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
930 { ADD_ACTIVE(state_offset + 1, 0); }
931 break;
932
933 /*-----------------------------------------------------------------*/
934 case OP_DOLL:
935 if ((md->moptions & PCRE_NOTEOL) == 0)
936 {
937 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
938 could_continue = TRUE;
939 else if (clen == 0 ||
940 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
941 (ptr == end_subject - md->nllen)
942 ))
943 { ADD_ACTIVE(state_offset + 1, 0); }
944 else if (ptr + 1 >= md->end_subject &&
945 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
946 NLBLOCK->nltype == NLTYPE_FIXED &&
947 NLBLOCK->nllen == 2 &&
948 c == NLBLOCK->nl[0])
949 {
950 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
951 {
952 reset_could_continue = TRUE;
953 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
954 }
955 else could_continue = partial_newline = TRUE;
956 }
957 }
958 break;
959
960 /*-----------------------------------------------------------------*/
961 case OP_DOLLM:
962 if ((md->moptions & PCRE_NOTEOL) == 0)
963 {
964 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
965 could_continue = TRUE;
966 else if (clen == 0 ||
967 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
968 { ADD_ACTIVE(state_offset + 1, 0); }
969 else if (ptr + 1 >= md->end_subject &&
970 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
971 NLBLOCK->nltype == NLTYPE_FIXED &&
972 NLBLOCK->nllen == 2 &&
973 c == NLBLOCK->nl[0])
974 {
975 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
976 {
977 reset_could_continue = TRUE;
978 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
979 }
980 else could_continue = partial_newline = TRUE;
981 }
982 }
983 else if (IS_NEWLINE(ptr))
984 { ADD_ACTIVE(state_offset + 1, 0); }
985 break;
986
987 /*-----------------------------------------------------------------*/
988
989 case OP_DIGIT:
990 case OP_WHITESPACE:
991 case OP_WORDCHAR:
992 if (clen > 0 && c < 256 &&
993 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
994 { ADD_NEW(state_offset + 1, 0); }
995 break;
996
997 /*-----------------------------------------------------------------*/
998 case OP_NOT_DIGIT:
999 case OP_NOT_WHITESPACE:
1000 case OP_NOT_WORDCHAR:
1001 if (clen > 0 && (c >= 256 ||
1002 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1003 { ADD_NEW(state_offset + 1, 0); }
1004 break;
1005
1006 /*-----------------------------------------------------------------*/
1007 case OP_WORD_BOUNDARY:
1008 case OP_NOT_WORD_BOUNDARY:
1009 {
1010 int left_word, right_word;
1011
1012 if (ptr > start_subject)
1013 {
1014 const pcre_uchar *temp = ptr - 1;
1015 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1016 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1017 if (utf) { BACKCHAR(temp); }
1018 #endif
1019 GETCHARTEST(d, temp);
1020 #ifdef SUPPORT_UCP
1021 if ((md->poptions & PCRE_UCP) != 0)
1022 {
1023 if (d == '_') left_word = TRUE; else
1024 {
1025 int cat = UCD_CATEGORY(d);
1026 left_word = (cat == ucp_L || cat == ucp_N);
1027 }
1028 }
1029 else
1030 #endif
1031 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1032 }
1033 else left_word = FALSE;
1034
1035 if (clen > 0)
1036 {
1037 #ifdef SUPPORT_UCP
1038 if ((md->poptions & PCRE_UCP) != 0)
1039 {
1040 if (c == '_') right_word = TRUE; else
1041 {
1042 int cat = UCD_CATEGORY(c);
1043 right_word = (cat == ucp_L || cat == ucp_N);
1044 }
1045 }
1046 else
1047 #endif
1048 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1049 }
1050 else right_word = FALSE;
1051
1052 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1053 { ADD_ACTIVE(state_offset + 1, 0); }
1054 }
1055 break;
1056
1057
1058 /*-----------------------------------------------------------------*/
1059 /* Check the next character by Unicode property. We will get here only
1060 if the support is in the binary; otherwise a compile-time error occurs.
1061 */
1062
1063 #ifdef SUPPORT_UCP
1064 case OP_PROP:
1065 case OP_NOTPROP:
1066 if (clen > 0)
1067 {
1068 BOOL OK;
1069 const pcre_uint32 *cp;
1070 const ucd_record * prop = GET_UCD(c);
1071 switch(code[1])
1072 {
1073 case PT_ANY:
1074 OK = TRUE;
1075 break;
1076
1077 case PT_LAMP:
1078 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1079 prop->chartype == ucp_Lt;
1080 break;
1081
1082 case PT_GC:
1083 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1084 break;
1085
1086 case PT_PC:
1087 OK = prop->chartype == code[2];
1088 break;
1089
1090 case PT_SC:
1091 OK = prop->script == code[2];
1092 break;
1093
1094 /* These are specials for combination cases. */
1095
1096 case PT_ALNUM:
1097 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1098 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1099 break;
1100
1101 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1102 which means that Perl space and POSIX space are now identical. PCRE
1103 was changed at release 8.34. */
1104
1105 case PT_SPACE: /* Perl space */
1106 case PT_PXSPACE: /* POSIX space */
1107 switch(c)
1108 {
1109 HSPACE_CASES:
1110 VSPACE_CASES:
1111 OK = TRUE;
1112 break;
1113
1114 default:
1115 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1116 break;
1117 }
1118 break;
1119
1120 case PT_WORD:
1121 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1122 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1123 c == CHAR_UNDERSCORE;
1124 break;
1125
1126 case PT_CLIST:
1127 cp = PRIV(ucd_caseless_sets) + code[2];
1128 for (;;)
1129 {
1130 if (c < *cp) { OK = FALSE; break; }
1131 if (c == *cp++) { OK = TRUE; break; }
1132 }
1133 break;
1134
1135 case PT_UCNC:
1136 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1137 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1138 c >= 0xe000;
1139 break;
1140
1141 /* Should never occur, but keep compilers from grumbling. */
1142
1143 default:
1144 OK = codevalue != OP_PROP;
1145 break;
1146 }
1147
1148 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1149 }
1150 break;
1151 #endif
1152
1153
1154
1155 /* ========================================================================== */
1156 /* These opcodes likewise inspect the subject character, but have an
1157 argument that is not a data character. It is one of these opcodes:
1158 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1159 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1160
1161 case OP_TYPEPLUS:
1162 case OP_TYPEMINPLUS:
1163 case OP_TYPEPOSPLUS:
1164 count = current_state->count; /* Already matched */
1165 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1166 if (clen > 0)
1167 {
1168 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1169 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1170 NLBLOCK->nltype == NLTYPE_FIXED &&
1171 NLBLOCK->nllen == 2 &&
1172 c == NLBLOCK->nl[0])
1173 {
1174 could_continue = partial_newline = TRUE;
1175 }
1176 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1177 (c < 256 &&
1178 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1179 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1180 {
1181 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1182 {
1183 active_count--; /* Remove non-match possibility */
1184 next_active_state--;
1185 }
1186 count++;
1187 ADD_NEW(state_offset, count);
1188 }
1189 }
1190 break;
1191
1192 /*-----------------------------------------------------------------*/
1193 case OP_TYPEQUERY:
1194 case OP_TYPEMINQUERY:
1195 case OP_TYPEPOSQUERY:
1196 ADD_ACTIVE(state_offset + 2, 0);
1197 if (clen > 0)
1198 {
1199 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201 NLBLOCK->nltype == NLTYPE_FIXED &&
1202 NLBLOCK->nllen == 2 &&
1203 c == NLBLOCK->nl[0])
1204 {
1205 could_continue = partial_newline = TRUE;
1206 }
1207 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208 (c < 256 &&
1209 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1210 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211 {
1212 if (codevalue == OP_TYPEPOSQUERY)
1213 {
1214 active_count--; /* Remove non-match possibility */
1215 next_active_state--;
1216 }
1217 ADD_NEW(state_offset + 2, 0);
1218 }
1219 }
1220 break;
1221
1222 /*-----------------------------------------------------------------*/
1223 case OP_TYPESTAR:
1224 case OP_TYPEMINSTAR:
1225 case OP_TYPEPOSSTAR:
1226 ADD_ACTIVE(state_offset + 2, 0);
1227 if (clen > 0)
1228 {
1229 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1230 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1231 NLBLOCK->nltype == NLTYPE_FIXED &&
1232 NLBLOCK->nllen == 2 &&
1233 c == NLBLOCK->nl[0])
1234 {
1235 could_continue = partial_newline = TRUE;
1236 }
1237 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1238 (c < 256 &&
1239 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1240 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1241 {
1242 if (codevalue == OP_TYPEPOSSTAR)
1243 {
1244 active_count--; /* Remove non-match possibility */
1245 next_active_state--;
1246 }
1247 ADD_NEW(state_offset, 0);
1248 }
1249 }
1250 break;
1251
1252 /*-----------------------------------------------------------------*/
1253 case OP_TYPEEXACT:
1254 count = current_state->count; /* Number already matched */
1255 if (clen > 0)
1256 {
1257 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1258 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1259 NLBLOCK->nltype == NLTYPE_FIXED &&
1260 NLBLOCK->nllen == 2 &&
1261 c == NLBLOCK->nl[0])
1262 {
1263 could_continue = partial_newline = TRUE;
1264 }
1265 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1266 (c < 256 &&
1267 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1268 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1269 {
1270 if (++count >= (int)GET2(code, 1))
1271 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1272 else
1273 { ADD_NEW(state_offset, count); }
1274 }
1275 }
1276 break;
1277
1278 /*-----------------------------------------------------------------*/
1279 case OP_TYPEUPTO:
1280 case OP_TYPEMINUPTO:
1281 case OP_TYPEPOSUPTO:
1282 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1283 count = current_state->count; /* Number already matched */
1284 if (clen > 0)
1285 {
1286 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1287 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1288 NLBLOCK->nltype == NLTYPE_FIXED &&
1289 NLBLOCK->nllen == 2 &&
1290 c == NLBLOCK->nl[0])
1291 {
1292 could_continue = partial_newline = TRUE;
1293 }
1294 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1295 (c < 256 &&
1296 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1297 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1298 {
1299 if (codevalue == OP_TYPEPOSUPTO)
1300 {
1301 active_count--; /* Remove non-match possibility */
1302 next_active_state--;
1303 }
1304 if (++count >= (int)GET2(code, 1))
1305 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1306 else
1307 { ADD_NEW(state_offset, count); }
1308 }
1309 }
1310 break;
1311
1312 /* ========================================================================== */
1313 /* These are virtual opcodes that are used when something like
1314 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1315 horizontal/vertical space opcodes (see the OP_HSPACE_EXTRA and OP_VSPACE_EXTRA
1316 cases) as its argument. It keeps the code above fast for the other cases. The argument is in the d variable. */
1317
1318 #ifdef SUPPORT_UCP
1319 case OP_PROP_EXTRA + OP_TYPEPLUS:
1320 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1321 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1322 count = current_state->count; /* Already matched */
1323 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1324 if (clen > 0)
1325 {
1326 BOOL OK;
1327 const pcre_uint32 *cp;
1328 const ucd_record * prop = GET_UCD(c);
1329 switch(code[2])
1330 {
1331 case PT_ANY:
1332 OK = TRUE;
1333 break;
1334
1335 case PT_LAMP:
1336 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1337 prop->chartype == ucp_Lt;
1338 break;
1339
1340 case PT_GC:
1341 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1342 break;
1343
1344 case PT_PC:
1345 OK = prop->chartype == code[3];
1346 break;
1347
1348 case PT_SC:
1349 OK = prop->script == code[3];
1350 break;
1351
1352 /* These are specials for combination cases. */
1353
1354 case PT_ALNUM:
1355 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1356 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1357 break;
1358
1359 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1360 which means that Perl space and POSIX space are now identical. PCRE
1361 was changed at release 8.34. */
1362
1363 case PT_SPACE: /* Perl space */
1364 case PT_PXSPACE: /* POSIX space */
1365 switch(c)
1366 {
1367 HSPACE_CASES:
1368 VSPACE_CASES:
1369 OK = TRUE;
1370 break;
1371
1372 default:
1373 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1374 break;
1375 }
1376 break;
1377
1378 case PT_WORD:
1379 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1380 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1381 c == CHAR_UNDERSCORE;
1382 break;
1383
1384 case PT_CLIST:
1385 cp = PRIV(ucd_caseless_sets) + code[3];
1386 for (;;)
1387 {
1388 if (c < *cp) { OK = FALSE; break; }
1389 if (c == *cp++) { OK = TRUE; break; }
1390 }
1391 break;
1392
1393 case PT_UCNC:
1394 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1395 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1396 c >= 0xe000;
1397 break;
1398
1399 /* Should never occur, but keep compilers from grumbling. */
1400
1401 default:
1402 OK = codevalue != OP_PROP;
1403 break;
1404 }
1405
1406 if (OK == (d == OP_PROP))
1407 {
1408 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1409 {
1410 active_count--; /* Remove non-match possibility */
1411 next_active_state--;
1412 }
1413 count++;
1414 ADD_NEW(state_offset, count);
1415 }
1416 }
1417 break;
1418
1419 /*-----------------------------------------------------------------*/
1420 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1421 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1422 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1423 count = current_state->count; /* Already matched */
1424 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1425 if (clen > 0)
1426 {
1427 int lgb, rgb;
1428 const pcre_uchar *nptr = ptr + clen;
1429 int ncount = 0;
1430 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1431 {
1432 active_count--; /* Remove non-match possibility */
1433 next_active_state--;
1434 }
1435 lgb = UCD_GRAPHBREAK(c);
1436 while (nptr < end_subject)
1437 {
1438 dlen = 1;
1439 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1440 rgb = UCD_GRAPHBREAK(d);
1441 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1442 ncount++;
1443 lgb = rgb;
1444 nptr += dlen;
1445 }
1446 count++;
1447 ADD_NEW_DATA(-state_offset, count, ncount);
1448 }
1449 break;
1450 #endif
1451
1452 /*-----------------------------------------------------------------*/
1453 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1454 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1455 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1456 count = current_state->count; /* Already matched */
1457 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1458 if (clen > 0)
1459 {
1460 int ncount = 0;
1461 switch (c)
1462 {
1463 case CHAR_VT:
1464 case CHAR_FF:
1465 case CHAR_NEL:
1466 #ifndef EBCDIC
1467 case 0x2028:
1468 case 0x2029:
1469 #endif /* Not EBCDIC */
1470 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1471 goto ANYNL01;
1472
1473 case CHAR_CR:
1474 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1475 /* Fall through */
1476
1477 ANYNL01:
1478 case CHAR_LF:
1479 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1480 {
1481 active_count--; /* Remove non-match possibility */
1482 next_active_state--;
1483 }
1484 count++;
1485 ADD_NEW_DATA(-state_offset, count, ncount);
1486 break;
1487
1488 default:
1489 break;
1490 }
1491 }
1492 break;
1493
1494 /*-----------------------------------------------------------------*/
1495 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1496 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1497 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1498 count = current_state->count; /* Already matched */
1499 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1500 if (clen > 0)
1501 {
1502 BOOL OK;
1503 switch (c)
1504 {
1505 VSPACE_CASES:
1506 OK = TRUE;
1507 break;
1508
1509 default:
1510 OK = FALSE;
1511 break;
1512 }
1513
1514 if (OK == (d == OP_VSPACE))
1515 {
1516 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1517 {
1518 active_count--; /* Remove non-match possibility */
1519 next_active_state--;
1520 }
1521 count++;
1522 ADD_NEW_DATA(-state_offset, count, 0);
1523 }
1524 }
1525 break;
1526
1527 /*-----------------------------------------------------------------*/
1528 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1529 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1530 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1531 count = current_state->count; /* Already matched */
1532 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1533 if (clen > 0)
1534 {
1535 BOOL OK;
1536 switch (c)
1537 {
1538 HSPACE_CASES:
1539 OK = TRUE;
1540 break;
1541
1542 default:
1543 OK = FALSE;
1544 break;
1545 }
1546
1547 if (OK == (d == OP_HSPACE))
1548 {
1549 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1550 {
1551 active_count--; /* Remove non-match possibility */
1552 next_active_state--;
1553 }
1554 count++;
1555 ADD_NEW_DATA(-state_offset, count, 0);
1556 }
1557 }
1558 break;
1559
1560 /*-----------------------------------------------------------------*/
1561 #ifdef SUPPORT_UCP
1562 case OP_PROP_EXTRA + OP_TYPEQUERY:
1563 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1564 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1565 count = 4;
1566 goto QS1;
1567
1568 case OP_PROP_EXTRA + OP_TYPESTAR:
1569 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1570 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1571 count = 0;
1572
1573 QS1:
1574
1575 ADD_ACTIVE(state_offset + 4, 0);
1576 if (clen > 0)
1577 {
1578 BOOL OK;
1579 const pcre_uint32 *cp;
1580 const ucd_record * prop = GET_UCD(c);
1581 switch(code[2])
1582 {
1583 case PT_ANY:
1584 OK = TRUE;
1585 break;
1586
1587 case PT_LAMP:
1588 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1589 prop->chartype == ucp_Lt;
1590 break;
1591
1592 case PT_GC:
1593 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1594 break;
1595
1596 case PT_PC:
1597 OK = prop->chartype == code[3];
1598 break;
1599
1600 case PT_SC:
1601 OK = prop->script == code[3];
1602 break;
1603
1604 /* These are specials for combination cases. */
1605
1606 case PT_ALNUM:
1607 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1608 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1609 break;
1610
1611 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1612 which means that Perl space and POSIX space are now identical. PCRE
1613 was changed at release 8.34. */
1614
1615 case PT_SPACE: /* Perl space */
1616 case PT_PXSPACE: /* POSIX space */
1617 switch(c)
1618 {
1619 HSPACE_CASES:
1620 VSPACE_CASES:
1621 OK = TRUE;
1622 break;
1623
1624 default:
1625 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1626 break;
1627 }
1628 break;
1629
1630 case PT_WORD:
1631 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1632 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1633 c == CHAR_UNDERSCORE;
1634 break;
1635
1636 case PT_CLIST:
1637 cp = PRIV(ucd_caseless_sets) + code[3];
1638 for (;;)
1639 {
1640 if (c < *cp) { OK = FALSE; break; }
1641 if (c == *cp++) { OK = TRUE; break; }
1642 }
1643 break;
1644
1645 case PT_UCNC:
1646 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1647 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1648 c >= 0xe000;
1649 break;
1650
1651 /* Should never occur, but keep compilers from grumbling. */
1652
1653 default:
1654 OK = codevalue != OP_PROP;
1655 break;
1656 }
1657
1658 if (OK == (d == OP_PROP))
1659 {
1660 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1661 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1662 {
1663 active_count--; /* Remove non-match possibility */
1664 next_active_state--;
1665 }
1666 ADD_NEW(state_offset + count, 0);
1667 }
1668 }
1669 break;
1670
1671 /*-----------------------------------------------------------------*/
1672 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1673 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1674 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1675 count = 2;
1676 goto QS2;
1677
1678 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1679 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1680 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1681 count = 0;
1682
1683 QS2:
1684
1685 ADD_ACTIVE(state_offset + 2, 0);
1686 if (clen > 0)
1687 {
1688 int lgb, rgb;
1689 const pcre_uchar *nptr = ptr + clen;
1690 int ncount = 0;
1691 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1692 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1693 {
1694 active_count--; /* Remove non-match possibility */
1695 next_active_state--;
1696 }
1697 lgb = UCD_GRAPHBREAK(c);
1698 while (nptr < end_subject)
1699 {
1700 dlen = 1;
1701 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1702 rgb = UCD_GRAPHBREAK(d);
1703 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1704 ncount++;
1705 lgb = rgb;
1706 nptr += dlen;
1707 }
1708 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1709 }
1710 break;
1711 #endif
1712
1713 /*-----------------------------------------------------------------*/
1714 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1715 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1716 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1717 count = 2;
1718 goto QS3;
1719
1720 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1721 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1722 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1723 count = 0;
1724
1725 QS3:
1726 ADD_ACTIVE(state_offset + 2, 0);
1727 if (clen > 0)
1728 {
1729 int ncount = 0;
1730 switch (c)
1731 {
1732 case CHAR_VT:
1733 case CHAR_FF:
1734 case CHAR_NEL:
1735 #ifndef EBCDIC
1736 case 0x2028:
1737 case 0x2029:
1738 #endif /* Not EBCDIC */
1739 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1740 goto ANYNL02;
1741
1742 case CHAR_CR:
1743 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1744 /* Fall through */
1745
1746 ANYNL02:
1747 case CHAR_LF:
1748 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1749 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1750 {
1751 active_count--; /* Remove non-match possibility */
1752 next_active_state--;
1753 }
1754 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1755 break;
1756
1757 default:
1758 break;
1759 }
1760 }
1761 break;
1762
1763 /*-----------------------------------------------------------------*/
1764 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1765 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1766 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1767 count = 2;
1768 goto QS4;
1769
1770 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1771 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1772 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1773 count = 0;
1774
1775 QS4:
1776 ADD_ACTIVE(state_offset + 2, 0);
1777 if (clen > 0)
1778 {
1779 BOOL OK;
1780 switch (c)
1781 {
1782 VSPACE_CASES:
1783 OK = TRUE;
1784 break;
1785
1786 default:
1787 OK = FALSE;
1788 break;
1789 }
1790 if (OK == (d == OP_VSPACE))
1791 {
1792 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1793 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1794 {
1795 active_count--; /* Remove non-match possibility */
1796 next_active_state--;
1797 }
1798 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1799 }
1800 }
1801 break;
1802
1803 /*-----------------------------------------------------------------*/
1804 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1805 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1806 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1807 count = 2;
1808 goto QS5;
1809
1810 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1811 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1812 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1813 count = 0;
1814
1815 QS5:
1816 ADD_ACTIVE(state_offset + 2, 0);
1817 if (clen > 0)
1818 {
1819 BOOL OK;
1820 switch (c)
1821 {
1822 HSPACE_CASES:
1823 OK = TRUE;
1824 break;
1825
1826 default:
1827 OK = FALSE;
1828 break;
1829 }
1830
1831 if (OK == (d == OP_HSPACE))
1832 {
1833 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1834 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1835 {
1836 active_count--; /* Remove non-match possibility */
1837 next_active_state--;
1838 }
1839 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1840 }
1841 }
1842 break;
1843
1844 /*-----------------------------------------------------------------*/
1845 #ifdef SUPPORT_UCP
1846 case OP_PROP_EXTRA + OP_TYPEEXACT:
1847 case OP_PROP_EXTRA + OP_TYPEUPTO:
1848 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1849 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1850 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1851 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1852 count = current_state->count; /* Number already matched */
1853 if (clen > 0)
1854 {
1855 BOOL OK;
1856 const pcre_uint32 *cp;
1857 const ucd_record * prop = GET_UCD(c);
1858 switch(code[1 + IMM2_SIZE + 1])
1859 {
1860 case PT_ANY:
1861 OK = TRUE;
1862 break;
1863
1864 case PT_LAMP:
1865 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1866 prop->chartype == ucp_Lt;
1867 break;
1868
1869 case PT_GC:
1870 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1871 break;
1872
1873 case PT_PC:
1874 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1875 break;
1876
1877 case PT_SC:
1878 OK = prop->script == code[1 + IMM2_SIZE + 2];
1879 break;
1880
1881 /* These are specials for combination cases. */
1882
1883 case PT_ALNUM:
1884 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1885 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1886 break;
1887
1888 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1889 which means that Perl space and POSIX space are now identical. PCRE
1890 was changed at release 8.34. */
1891
1892 case PT_SPACE: /* Perl space */
1893 case PT_PXSPACE: /* POSIX space */
1894 switch(c)
1895 {
1896 HSPACE_CASES:
1897 VSPACE_CASES:
1898 OK = TRUE;
1899 break;
1900
1901 default:
1902 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1903 break;
1904 }
1905 break;
1906
1907 case PT_WORD:
1908 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1909 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1910 c == CHAR_UNDERSCORE;
1911 break;
1912
1913 case PT_CLIST:
1914 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1915 for (;;)
1916 {
1917 if (c < *cp) { OK = FALSE; break; }
1918 if (c == *cp++) { OK = TRUE; break; }
1919 }
1920 break;
1921
1922 case PT_UCNC:
1923 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1924 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1925 c >= 0xe000;
1926 break;
1927
1928 /* Should never occur, but keep compilers from grumbling. */
1929
1930 default:
1931 OK = codevalue != OP_PROP;
1932 break;
1933 }
1934
1935 if (OK == (d == OP_PROP))
1936 {
1937 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1938 {
1939 active_count--; /* Remove non-match possibility */
1940 next_active_state--;
1941 }
1942 if (++count >= (int)GET2(code, 1))
1943 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1944 else
1945 { ADD_NEW(state_offset, count); }
1946 }
1947 }
1948 break;
1949
1950 /*-----------------------------------------------------------------*/
1951 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1952 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1953 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1954 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1955 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1956 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1957 count = current_state->count; /* Number already matched */
1958 if (clen > 0)
1959 {
1960 int lgb, rgb;
1961 const pcre_uchar *nptr = ptr + clen;
1962 int ncount = 0;
1963 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1964 {
1965 active_count--; /* Remove non-match possibility */
1966 next_active_state--;
1967 }
1968 lgb = UCD_GRAPHBREAK(c);
1969 while (nptr < end_subject)
1970 {
1971 dlen = 1;
1972 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1973 rgb = UCD_GRAPHBREAK(d);
1974 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1975 ncount++;
1976 lgb = rgb;
1977 nptr += dlen;
1978 }
1979 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1980 reset_could_continue = TRUE;
1981 if (++count >= (int)GET2(code, 1))
1982 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1983 else
1984 { ADD_NEW_DATA(-state_offset, count, ncount); }
1985 }
1986 break;
1987 #endif
1988
1989 /*-----------------------------------------------------------------*/
1990 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1991 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1992 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1993 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1994 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1995 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1996 count = current_state->count; /* Number already matched */
1997 if (clen > 0)
1998 {
1999 int ncount = 0;
2000 switch (c)
2001 {
2002 case CHAR_VT:
2003 case CHAR_FF:
2004 case CHAR_NEL:
2005 #ifndef EBCDIC
2006 case 0x2028:
2007 case 0x2029:
2008 #endif /* Not EBCDIC */
2009 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2010 goto ANYNL03;
2011
2012 case CHAR_CR:
2013 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
2014 /* Fall through */
2015
2016 ANYNL03:
2017 case CHAR_LF:
2018 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2019 {
2020 active_count--; /* Remove non-match possibility */
2021 next_active_state--;
2022 }
2023 if (++count >= (int)GET2(code, 1))
2024 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2025 else
2026 { ADD_NEW_DATA(-state_offset, count, ncount); }
2027 break;
2028
2029 default:
2030 break;
2031 }
2032 }
2033 break;
2034
2035 /*-----------------------------------------------------------------*/
2036 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2037 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2038 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2039 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2040 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2041 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2042 count = current_state->count; /* Number already matched */
2043 if (clen > 0)
2044 {
2045 BOOL OK;
2046 switch (c)
2047 {
2048 VSPACE_CASES:
2049 OK = TRUE;
2050 break;
2051
2052 default:
2053 OK = FALSE;
2054 }
2055
2056 if (OK == (d == OP_VSPACE))
2057 {
2058 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2059 {
2060 active_count--; /* Remove non-match possibility */
2061 next_active_state--;
2062 }
2063 if (++count >= (int)GET2(code, 1))
2064 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2065 else
2066 { ADD_NEW_DATA(-state_offset, count, 0); }
2067 }
2068 }
2069 break;
2070
2071 /*-----------------------------------------------------------------*/
2072 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2073 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2074 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2075 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2076 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2077 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2078 count = current_state->count; /* Number already matched */
2079 if (clen > 0)
2080 {
2081 BOOL OK;
2082 switch (c)
2083 {
2084 HSPACE_CASES:
2085 OK = TRUE;
2086 break;
2087
2088 default:
2089 OK = FALSE;
2090 break;
2091 }
2092
2093 if (OK == (d == OP_HSPACE))
2094 {
2095 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2096 {
2097 active_count--; /* Remove non-match possibility */
2098 next_active_state--;
2099 }
2100 if (++count >= (int)GET2(code, 1))
2101 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2102 else
2103 { ADD_NEW_DATA(-state_offset, count, 0); }
2104 }
2105 }
2106 break;
2107
2108 /* ========================================================================== */
2109 /* These opcodes are followed by a character that is usually compared
2110 to the current subject character; it is loaded into d. We still get
2111 here even if there is no subject character, because in some cases zero
2112 repetitions are permitted. */
2113
2114 /*-----------------------------------------------------------------*/
2115 case OP_CHAR:
2116 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2117 break;
2118
2119 /*-----------------------------------------------------------------*/
2120 case OP_CHARI:
2121 if (clen == 0) break;
2122
2123 #ifdef SUPPORT_UTF
2124 if (utf)
2125 {
2126 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2127 {
2128 unsigned int othercase;
2129 if (c < 128)
2130 othercase = fcc[c];
2131 else
2132 /* If we have Unicode property support, we can use it to test the
2133 other case of the character. */
2134 #ifdef SUPPORT_UCP
2135 othercase = UCD_OTHERCASE(c);
2136 #else
2137 othercase = NOTACHAR;
2138 #endif
2139
2140 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2141 }
2142 }
2143 else
2144 #endif /* SUPPORT_UTF */
2145 /* Not UTF mode */
2146 {
2147 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2148 { ADD_NEW(state_offset + 2, 0); }
2149 }
2150 break;
2151
2152
2153 #ifdef SUPPORT_UCP
2154 /*-----------------------------------------------------------------*/
2155 /* This is a tricky one because it can match more than one character.
2156 Find out how many characters to skip, and then set up a negative state
2157 to wait for them to pass before continuing. */
2158
2159 case OP_EXTUNI:
2160 if (clen > 0)
2161 {
2162 int lgb, rgb;
2163 const pcre_uchar *nptr = ptr + clen;
2164 int ncount = 0;
2165 lgb = UCD_GRAPHBREAK(c);
2166 while (nptr < end_subject)
2167 {
2168 dlen = 1;
2169 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2170 rgb = UCD_GRAPHBREAK(d);
2171 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2172 ncount++;
2173 lgb = rgb;
2174 nptr += dlen;
2175 }
2176 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2177 reset_could_continue = TRUE;
2178 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2179 }
2180 break;
2181 #endif
2182
2183 /*-----------------------------------------------------------------*/
2184 /* This is tricky, like EXTUNI, because it too can match more than one
2185 character (when CR is followed by LF). In this case, set up a negative
2186 state to wait for one character to pass before continuing. */
2187
2188 case OP_ANYNL:
2189 if (clen > 0) switch(c)
2190 {
2191 case CHAR_VT:
2192 case CHAR_FF:
2193 case CHAR_NEL:
2194 #ifndef EBCDIC
2195 case 0x2028:
2196 case 0x2029:
2197 #endif /* Not EBCDIC */
2198 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2199
2200 case CHAR_LF:
2201 ADD_NEW(state_offset + 1, 0);
2202 break;
2203
2204 case CHAR_CR:
2205 if (ptr + 1 >= end_subject)
2206 {
2207 ADD_NEW(state_offset + 1, 0);
2208 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2209 reset_could_continue = TRUE;
2210 }
2211 else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2212 {
2213 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2214 }
2215 else
2216 {
2217 ADD_NEW(state_offset + 1, 0);
2218 }
2219 break;
2220 }
2221 break;
2222
2223 /*-----------------------------------------------------------------*/
2224 case OP_NOT_VSPACE:
2225 if (clen > 0) switch(c)
2226 {
2227 VSPACE_CASES:
2228 break;
2229
2230 default:
2231 ADD_NEW(state_offset + 1, 0);
2232 break;
2233 }
2234 break;
2235
2236 /*-----------------------------------------------------------------*/
2237 case OP_VSPACE:
2238 if (clen > 0) switch(c)
2239 {
2240 VSPACE_CASES:
2241 ADD_NEW(state_offset + 1, 0);
2242 break;
2243
2244 default:
2245 break;
2246 }
2247 break;
2248
2249 /*-----------------------------------------------------------------*/
2250 case OP_NOT_HSPACE:
2251 if (clen > 0) switch(c)
2252 {
2253 HSPACE_CASES:
2254 break;
2255
2256 default:
2257 ADD_NEW(state_offset + 1, 0);
2258 break;
2259 }
2260 break;
2261
2262 /*-----------------------------------------------------------------*/
2263 case OP_HSPACE:
2264 if (clen > 0) switch(c)
2265 {
2266 HSPACE_CASES:
2267 ADD_NEW(state_offset + 1, 0);
2268 break;
2269
2270 default:
2271 break;
2272 }
2273 break;
2274
2275 /*-----------------------------------------------------------------*/
2276 /* Match a negated single character casefully. */
2277
2278 case OP_NOT:
2279 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2280 break;
2281
2282 /*-----------------------------------------------------------------*/
2283 /* Match a negated single character caselessly. */
2284
2285 case OP_NOTI:
2286 if (clen > 0)
2287 {
2288 unsigned int otherd;
2289 #ifdef SUPPORT_UTF
2290 if (utf && d >= 128)
2291 {
2292 #ifdef SUPPORT_UCP
2293 otherd = UCD_OTHERCASE(d);
2294 #endif /* SUPPORT_UCP */
2295 }
2296 else
2297 #endif /* SUPPORT_UTF */
2298 otherd = TABLE_GET(d, fcc, d);
2299 if (c != d && c != otherd)
2300 { ADD_NEW(state_offset + dlen + 1, 0); }
2301 }
2302 break;
2303
2304 /*-----------------------------------------------------------------*/
2305 case OP_PLUSI:
2306 case OP_MINPLUSI:
2307 case OP_POSPLUSI:
2308 case OP_NOTPLUSI:
2309 case OP_NOTMINPLUSI:
2310 case OP_NOTPOSPLUSI:
2311 caseless = TRUE;
2312 codevalue -= OP_STARI - OP_STAR;
2313
2314 /* Fall through */
2315 case OP_PLUS:
2316 case OP_MINPLUS:
2317 case OP_POSPLUS:
2318 case OP_NOTPLUS:
2319 case OP_NOTMINPLUS:
2320 case OP_NOTPOSPLUS:
2321 count = current_state->count; /* Already matched */
2322 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2323 if (clen > 0)
2324 {
2325 pcre_uint32 otherd = NOTACHAR;
2326 if (caseless)
2327 {
2328 #ifdef SUPPORT_UTF
2329 if (utf && d >= 128)
2330 {
2331 #ifdef SUPPORT_UCP
2332 otherd = UCD_OTHERCASE(d);
2333 #endif /* SUPPORT_UCP */
2334 }
2335 else
2336 #endif /* SUPPORT_UTF */
2337 otherd = TABLE_GET(d, fcc, d);
2338 }
2339 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2340 {
2341 if (count > 0 &&
2342 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2343 {
2344 active_count--; /* Remove non-match possibility */
2345 next_active_state--;
2346 }
2347 count++;
2348 ADD_NEW(state_offset, count);
2349 }
2350 }
2351 break;
2352
2353 /*-----------------------------------------------------------------*/
2354 case OP_QUERYI:
2355 case OP_MINQUERYI:
2356 case OP_POSQUERYI:
2357 case OP_NOTQUERYI:
2358 case OP_NOTMINQUERYI:
2359 case OP_NOTPOSQUERYI:
2360 caseless = TRUE;
2361 codevalue -= OP_STARI - OP_STAR;
2362 /* Fall through */
2363 case OP_QUERY:
2364 case OP_MINQUERY:
2365 case OP_POSQUERY:
2366 case OP_NOTQUERY:
2367 case OP_NOTMINQUERY:
2368 case OP_NOTPOSQUERY:
2369 ADD_ACTIVE(state_offset + dlen + 1, 0);
2370 if (clen > 0)
2371 {
2372 pcre_uint32 otherd = NOTACHAR;
2373 if (caseless)
2374 {
2375 #ifdef SUPPORT_UTF
2376 if (utf && d >= 128)
2377 {
2378 #ifdef SUPPORT_UCP
2379 otherd = UCD_OTHERCASE(d);
2380 #endif /* SUPPORT_UCP */
2381 }
2382 else
2383 #endif /* SUPPORT_UTF */
2384 otherd = TABLE_GET(d, fcc, d);
2385 }
2386 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2387 {
2388 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2389 {
2390 active_count--; /* Remove non-match possibility */
2391 next_active_state--;
2392 }
2393 ADD_NEW(state_offset + dlen + 1, 0);
2394 }
2395 }
2396 break;
2397
2398 /*-----------------------------------------------------------------*/
2399 case OP_STARI:
2400 case OP_MINSTARI:
2401 case OP_POSSTARI:
2402 case OP_NOTSTARI:
2403 case OP_NOTMINSTARI:
2404 case OP_NOTPOSSTARI:
2405 caseless = TRUE;
2406 codevalue -= OP_STARI - OP_STAR;
2407 /* Fall through */
2408 case OP_STAR:
2409 case OP_MINSTAR:
2410 case OP_POSSTAR:
2411 case OP_NOTSTAR:
2412 case OP_NOTMINSTAR:
2413 case OP_NOTPOSSTAR:
2414 ADD_ACTIVE(state_offset + dlen + 1, 0);
2415 if (clen > 0)
2416 {
2417 pcre_uint32 otherd = NOTACHAR;
2418 if (caseless)
2419 {
2420 #ifdef SUPPORT_UTF
2421 if (utf && d >= 128)
2422 {
2423 #ifdef SUPPORT_UCP
2424 otherd = UCD_OTHERCASE(d);
2425 #endif /* SUPPORT_UCP */
2426 }
2427 else
2428 #endif /* SUPPORT_UTF */
2429 otherd = TABLE_GET(d, fcc, d);
2430 }
2431 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2432 {
2433 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2434 {
2435 active_count--; /* Remove non-match possibility */
2436 next_active_state--;
2437 }
2438 ADD_NEW(state_offset, 0);
2439 }
2440 }
2441 break;
2442
2443 /*-----------------------------------------------------------------*/
2444 case OP_EXACTI:
2445 case OP_NOTEXACTI:
2446 caseless = TRUE;
2447 codevalue -= OP_STARI - OP_STAR;
2448 /* Fall through */
2449 case OP_EXACT:
2450 case OP_NOTEXACT:
2451 count = current_state->count; /* Number already matched */
2452 if (clen > 0)
2453 {
2454 pcre_uint32 otherd = NOTACHAR;
2455 if (caseless)
2456 {
2457 #ifdef SUPPORT_UTF
2458 if (utf && d >= 128)
2459 {
2460 #ifdef SUPPORT_UCP
2461 otherd = UCD_OTHERCASE(d);
2462 #endif /* SUPPORT_UCP */
2463 }
2464 else
2465 #endif /* SUPPORT_UTF */
2466 otherd = TABLE_GET(d, fcc, d);
2467 }
2468 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2469 {
2470 if (++count >= (int)GET2(code, 1))
2471 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2472 else
2473 { ADD_NEW(state_offset, count); }
2474 }
2475 }
2476 break;
2477
2478 /*-----------------------------------------------------------------*/
2479 case OP_UPTOI:
2480 case OP_MINUPTOI:
2481 case OP_POSUPTOI:
2482 case OP_NOTUPTOI:
2483 case OP_NOTMINUPTOI:
2484 case OP_NOTPOSUPTOI:
2485 caseless = TRUE;
2486 codevalue -= OP_STARI - OP_STAR;
2487 /* Fall through */
2488 case OP_UPTO:
2489 case OP_MINUPTO:
2490 case OP_POSUPTO:
2491 case OP_NOTUPTO:
2492 case OP_NOTMINUPTO:
2493 case OP_NOTPOSUPTO:
2494 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2495 count = current_state->count; /* Number already matched */
2496 if (clen > 0)
2497 {
2498 pcre_uint32 otherd = NOTACHAR;
2499 if (caseless)
2500 {
2501 #ifdef SUPPORT_UTF
2502 if (utf && d >= 128)
2503 {
2504 #ifdef SUPPORT_UCP
2505 otherd = UCD_OTHERCASE(d);
2506 #endif /* SUPPORT_UCP */
2507 }
2508 else
2509 #endif /* SUPPORT_UTF */
2510 otherd = TABLE_GET(d, fcc, d);
2511 }
2512 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2513 {
2514 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2515 {
2516 active_count--; /* Remove non-match possibility */
2517 next_active_state--;
2518 }
2519 if (++count >= (int)GET2(code, 1))
2520 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2521 else
2522 { ADD_NEW(state_offset, count); }
2523 }
2524 }
2525 break;
2526
2527
2528 /* ========================================================================== */
2529 /* These are the class-handling opcodes */
2530
2531 case OP_CLASS:
2532 case OP_NCLASS:
2533 case OP_XCLASS:
2534 {
2535 BOOL isinclass = FALSE;
2536 int next_state_offset;
2537 const pcre_uchar *ecode;
2538
2539 /* For a simple class, there is always just a 32-byte table, and we
2540 can set isinclass from it. */
2541
2542 if (codevalue != OP_XCLASS)
2543 {
2544 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2545 if (clen > 0)
2546 {
2547 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2548 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2549 }
2550 }
2551
2552 /* An extended class may have a table or a list of single characters,
2553 ranges, or both, and it may be positive or negative. There's a
2554 function that sorts all this out. */
2555
2556 else
2557 {
2558 ecode = code + GET(code, 1);
2559 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2560 }
2561
2562 /* At this point, isinclass is set for all kinds of class, and ecode
2563 points to the byte after the end of the class. If there is a
2564 quantifier, this is where it will be. */
2565
2566 next_state_offset = (int)(ecode - start_code);
2567
2568 switch (*ecode)
2569 {
2570 case OP_CRSTAR:
2571 case OP_CRMINSTAR:
2572 ADD_ACTIVE(next_state_offset + 1, 0);
2573 if (isinclass) { ADD_NEW(state_offset, 0); }
2574 break;
2575
2576 case OP_CRPLUS:
2577 case OP_CRMINPLUS:
2578 count = current_state->count; /* Already matched */
2579 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2580 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2581 break;
2582
2583 case OP_CRQUERY:
2584 case OP_CRMINQUERY:
2585 ADD_ACTIVE(next_state_offset + 1, 0);
2586 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2587 break;
2588
2589 case OP_CRRANGE:
2590 case OP_CRMINRANGE:
2591 count = current_state->count; /* Already matched */
2592 if (count >= (int)GET2(ecode, 1))
2593 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2594 if (isinclass)
2595 {
2596 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2597 if (++count >= max && max != 0) /* Max 0 => no limit */
2598 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2599 else
2600 { ADD_NEW(state_offset, count); }
2601 }
2602 break;
2603
2604 default:
2605 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2606 break;
2607 }
2608 }
2609 break;
2610
2611 /* ========================================================================== */
2612 /* These are the opcodes for fancy brackets of various kinds. We have
2613 to use recursion in order to handle them. The "always failing" assertion
2614 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2615 though the other "backtracking verbs" are not supported. */
2616
2617 case OP_FAIL:
2618 forced_fail++; /* Count FAILs for multiple states */
2619 break;
2620
2621 case OP_ASSERT:
2622 case OP_ASSERT_NOT:
2623 case OP_ASSERTBACK:
2624 case OP_ASSERTBACK_NOT:
2625 {
2626 int rc;
2627 int local_offsets[2];
2628 int local_workspace[1000];
2629 const pcre_uchar *endasscode = code + GET(code, 1);
2630
2631 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2632
2633 rc = internal_dfa_exec(
2634 md, /* static match data */
2635 code, /* this subexpression's code */
2636 ptr, /* where we currently are */
2637 (int)(ptr - start_subject), /* start offset */
2638 local_offsets, /* offset vector */
2639 sizeof(local_offsets)/sizeof(int), /* size of same */
2640 local_workspace, /* workspace vector */
2641 sizeof(local_workspace)/sizeof(int), /* size of same */
2642 rlevel); /* function recursion level */
2643
2644 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2645 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2646 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2647 }
2648 break;
2649
2650 /*-----------------------------------------------------------------*/
2651 case OP_COND:
2652 case OP_SCOND:
2653 {
2654 int local_offsets[1000];
2655 int local_workspace[1000];
2656 int codelink = GET(code, 1);
2657 int condcode;
2658
2659 /* Because of the way auto-callout works during compile, a callout item
2660 is inserted between OP_COND and an assertion condition. This does not
2661 happen for the other conditions. */
2662
2663 if (code[LINK_SIZE+1] == OP_CALLOUT)
2664 {
2665 rrc = 0;
2666 if (PUBL(callout) != NULL)
2667 {
2668 PUBL(callout_block) cb;
2669 cb.version = 1; /* Version 1 of the callout block */
2670 cb.callout_number = code[LINK_SIZE+2];
2671 cb.offset_vector = offsets;
2672 #if defined COMPILE_PCRE8
2673 cb.subject = (PCRE_SPTR)start_subject;
2674 #elif defined COMPILE_PCRE16
2675 cb.subject = (PCRE_SPTR16)start_subject;
2676 #elif defined COMPILE_PCRE32
2677 cb.subject = (PCRE_SPTR32)start_subject;
2678 #endif
2679 cb.subject_length = (int)(end_subject - start_subject);
2680 cb.start_match = (int)(current_subject - start_subject);
2681 cb.current_position = (int)(ptr - start_subject);
2682 cb.pattern_position = GET(code, LINK_SIZE + 3);
2683 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2684 cb.capture_top = 1;
2685 cb.capture_last = -1;
2686 cb.callout_data = md->callout_data;
2687 cb.mark = NULL; /* No (*MARK) support */
2688 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2689 }
2690 if (rrc > 0) break; /* Fail this thread */
2691 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2692 }
2693
2694 condcode = code[LINK_SIZE+1];
2695
2696 /* Back reference conditions and duplicate named recursion conditions
2697 are not supported */
2698
2699 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2700 condcode == OP_DNRREF)
2701 return PCRE_ERROR_DFA_UCOND;
2702
2703 /* The DEFINE condition is always false */
2704
2705 if (condcode == OP_DEF)
2706 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2707
2708 /* The only supported version of OP_RREF is for the value RREF_ANY,
2709 which means "test if in any recursion". We can't test for specifically
2710 recursed groups. */
2711
2712 else if (condcode == OP_RREF)
2713 {
2714 int value = GET2(code, LINK_SIZE + 2);
2715 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2716 if (md->recursive != NULL)
2717 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2718 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2719 }
2720
2721 /* Otherwise, the condition is an assertion */
2722
2723 else
2724 {
2725 int rc;
2726 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2727 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2728
2729 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2730
2731 rc = internal_dfa_exec(
2732 md, /* fixed match data */
2733 asscode, /* this subexpression's code */
2734 ptr, /* where we currently are */
2735 (int)(ptr - start_subject), /* start offset */
2736 local_offsets, /* offset vector */
2737 sizeof(local_offsets)/sizeof(int), /* size of same */
2738 local_workspace, /* workspace vector */
2739 sizeof(local_workspace)/sizeof(int), /* size of same */
2740 rlevel); /* function recursion level */
2741
2742 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2743 if ((rc >= 0) ==
2744 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2745 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2746 else
2747 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2748 }
2749 }
2750 break;
2751
2752 /*-----------------------------------------------------------------*/
2753 case OP_RECURSE:
2754 {
2755 dfa_recursion_info *ri;
2756 int local_offsets[1000];
2757 int local_workspace[1000];
2758 const pcre_uchar *callpat = start_code + GET(code, 1);
2759 int recno = (callpat == md->start_code)? 0 :
2760 GET2(callpat, 1 + LINK_SIZE);
2761 int rc;
2762
2763 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2764
2765 /* Check for repeating a recursion without advancing the subject
2766 pointer. This should catch convoluted mutual recursions. (Some simple
2767 cases are caught at compile time.) */
2768
2769 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2770 if (recno == ri->group_num && ptr == ri->subject_position)
2771 return PCRE_ERROR_RECURSELOOP;
2772
2773 /* Remember this recursion and where we started it so as to
2774 catch infinite loops. */
2775
2776 new_recursive.group_num = recno;
2777 new_recursive.subject_position = ptr;
2778 new_recursive.prevrec = md->recursive;
2779 md->recursive = &new_recursive;
2780
2781 rc = internal_dfa_exec(
2782 md, /* fixed match data */
2783 callpat, /* this subexpression's code */
2784 ptr, /* where we currently are */
2785 (int)(ptr - start_subject), /* start offset */
2786 local_offsets, /* offset vector */
2787 sizeof(local_offsets)/sizeof(int), /* size of same */
2788 local_workspace, /* workspace vector */
2789 sizeof(local_workspace)/sizeof(int), /* size of same */
2790 rlevel); /* function recursion level */
2791
2792 md->recursive = new_recursive.prevrec; /* Done this recursion */
2793
2794 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2795 rc));
2796
2797 /* Ran out of internal offsets */
2798
2799 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2800
2801 /* For each successful matched substring, set up the next state with a
2802 count of characters to skip before trying it. Note that the count is in
2803 characters, not bytes. */
2804
2805 if (rc > 0)
2806 {
2807 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2808 {
2809 int charcount = local_offsets[rc+1] - local_offsets[rc];
2810 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2811 if (utf)
2812 {
2813 const pcre_uchar *p = start_subject + local_offsets[rc];
2814 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2815 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2816 }
2817 #endif
2818 if (charcount > 0)
2819 {
2820 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2821 }
2822 else
2823 {
2824 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2825 }
2826 }
2827 }
2828 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2829 }
2830 break;
2831
2832 /*-----------------------------------------------------------------*/
2833 case OP_BRAPOS:
2834 case OP_SBRAPOS:
2835 case OP_CBRAPOS:
2836 case OP_SCBRAPOS:
2837 case OP_BRAPOSZERO:
2838 {
2839 int charcount, matched_count;
2840 const pcre_uchar *local_ptr = ptr;
2841 BOOL allow_zero;
2842
2843 if (codevalue == OP_BRAPOSZERO)
2844 {
2845 allow_zero = TRUE;
2846 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2847 }
2848 else allow_zero = FALSE;
2849
2850 /* Loop to match the subpattern as many times as possible as if it were
2851 a complete pattern. */
2852
2853 for (matched_count = 0;; matched_count++)
2854 {
2855 int local_offsets[2];
2856 int local_workspace[1000];
2857
2858 int rc = internal_dfa_exec(
2859 md, /* fixed match data */
2860 code, /* this subexpression's code */
2861 local_ptr, /* where we currently are */
2862 (int)(ptr - start_subject), /* start offset */
2863 local_offsets, /* offset vector */
2864 sizeof(local_offsets)/sizeof(int), /* size of same */
2865 local_workspace, /* workspace vector */
2866 sizeof(local_workspace)/sizeof(int), /* size of same */
2867 rlevel); /* function recursion level */
2868
2869 /* Failed to match */
2870
2871 if (rc < 0)
2872 {
2873 if (rc != PCRE_ERROR_NOMATCH) return rc;
2874 break;
2875 }
2876
2877 /* Matched: break the loop if zero characters matched. */
2878
2879 charcount = local_offsets[1] - local_offsets[0];
2880 if (charcount == 0) break;
2881 local_ptr += charcount; /* Advance temporary position ptr */
2882 }
2883
2884 /* At this point we have matched the subpattern matched_count
2885 times, and local_ptr is pointing to the character after the end of the
2886 last match. */
2887
2888 if (matched_count > 0 || allow_zero)
2889 {
2890 const pcre_uchar *end_subpattern = code;
2891 int next_state_offset;
2892
2893 do { end_subpattern += GET(end_subpattern, 1); }
2894 while (*end_subpattern == OP_ALT);
2895 next_state_offset =
2896 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2897
2898 /* Optimization: if there are no more active states, and there
2899 are no new states yet set up, then skip over the subject string
2900 right here, to save looping. Otherwise, set up the new state to swing
2901 into action when the end of the matched substring is reached. */
2902
2903 if (i + 1 >= active_count && new_count == 0)
2904 {
2905 ptr = local_ptr;
2906 clen = 0;
2907 ADD_NEW(next_state_offset, 0);
2908 }
2909 else
2910 {
2911 const pcre_uchar *p = ptr;
2912 const pcre_uchar *pp = local_ptr;
2913 charcount = (int)(pp - p);
2914 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2915 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2916 #endif
2917 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2918 }
2919 }
2920 }
2921 break;
2922
2923 /*-----------------------------------------------------------------*/
2924 case OP_ONCE:
2925 case OP_ONCE_NC:
2926 {
2927 int local_offsets[2];
2928 int local_workspace[1000];
2929
2930 int rc = internal_dfa_exec(
2931 md, /* fixed match data */
2932 code, /* this subexpression's code */
2933 ptr, /* where we currently are */
2934 (int)(ptr - start_subject), /* start offset */
2935 local_offsets, /* offset vector */
2936 sizeof(local_offsets)/sizeof(int), /* size of same */
2937 local_workspace, /* workspace vector */
2938 sizeof(local_workspace)/sizeof(int), /* size of same */
2939 rlevel); /* function recursion level */
2940
2941 if (rc >= 0)
2942 {
2943 const pcre_uchar *end_subpattern = code;
2944 int charcount = local_offsets[1] - local_offsets[0];
2945 int next_state_offset, repeat_state_offset;
2946
2947 do { end_subpattern += GET(end_subpattern, 1); }
2948 while (*end_subpattern == OP_ALT);
2949 next_state_offset =
2950 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2951
2952 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2953 arrange for the repeat state also to be added to the relevant list.
2954 Calculate the offset, or set -1 for no repeat. */
2955
2956 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2957 *end_subpattern == OP_KETRMIN)?
2958 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2959
2960 /* If we have matched an empty string, add the next state at the
2961 current character pointer. This is important so that the duplicate
2962 checking kicks in, which is what breaks infinite loops that match an
2963 empty string. */
2964
2965 if (charcount == 0)
2966 {
2967 ADD_ACTIVE(next_state_offset, 0);
2968 }
2969
2970 /* Optimization: if there are no more active states, and there
2971 are no new states yet set up, then skip over the subject string
2972 right here, to save looping. Otherwise, set up the new state to swing
2973 into action when the end of the matched substring is reached. */
2974
2975 else if (i + 1 >= active_count && new_count == 0)
2976 {
2977 ptr += charcount;
2978 clen = 0;
2979 ADD_NEW(next_state_offset, 0);
2980
2981 /* If we are adding a repeat state at the new character position,
2982 we must fudge things so that it is the only current state.
2983 Otherwise, it might be a duplicate of one we processed before, and
2984 that would cause it to be skipped. */
2985
2986 if (repeat_state_offset >= 0)
2987 {
2988 next_active_state = active_states;
2989 active_count = 0;
2990 i = -1;
2991 ADD_ACTIVE(repeat_state_offset, 0);
2992 }
2993 }
2994 else
2995 {
2996 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2997 if (utf)
2998 {
2999 const pcre_uchar *p = start_subject + local_offsets[0];
3000 const pcre_uchar *pp = start_subject + local_offsets[1];
3001 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3002 }
3003 #endif
3004 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
3005 if (repeat_state_offset >= 0)
3006 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
3007 }
3008 }
3009 else if (rc != PCRE_ERROR_NOMATCH) return rc;
3010 }
3011 break;
3012
3013
3014 /* ========================================================================== */
3015 /* Handle callouts */
3016
3017 case OP_CALLOUT:
3018 rrc = 0;
3019 if (PUBL(callout) != NULL)
3020 {
3021 PUBL(callout_block) cb;
3022 cb.version = 1; /* Version 1 of the callout block */
3023 cb.callout_number = code[1];
3024 cb.offset_vector = offsets;
3025 #if defined COMPILE_PCRE8
3026 cb.subject = (PCRE_SPTR)start_subject;
3027 #elif defined COMPILE_PCRE16
3028 cb.subject = (PCRE_SPTR16)start_subject;
3029 #elif defined COMPILE_PCRE32
3030 cb.subject = (PCRE_SPTR32)start_subject;
3031 #endif
3032 cb.subject_length = (int)(end_subject - start_subject);
3033 cb.start_match = (int)(current_subject - start_subject);
3034 cb.current_position = (int)(ptr - start_subject);
3035 cb.pattern_position = GET(code, 2);
3036 cb.next_item_length = GET(code, 2 + LINK_SIZE);
3037 cb.capture_top = 1;
3038 cb.capture_last = -1;
3039 cb.callout_data = md->callout_data;
3040 cb.mark = NULL; /* No (*MARK) support */
3041 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3042 }
3043 if (rrc == 0)
3044 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3045 break;
3046
3047
3048 /* ========================================================================== */
3049 default: /* Unsupported opcode */
3050 return PCRE_ERROR_DFA_UITEM;
3051 }
3052
3053 NEXT_ACTIVE_STATE: continue;
3054
3055 } /* End of loop scanning active states */
3056
3057 /* We have finished the processing at the current subject character. If no
3058 new states have been set for the next character, we have found all the
3059 matches that we are going to find. If we are at the top level and partial
3060 matching has been requested, check for appropriate conditions.
3061
3062 The "forced_fail" variable counts the number of (*F) encountered for the
3063 character. If it is equal to the original active_count (saved in
3064 workspace[1]) it means that (*F) was found on every active state. In this
3065 case we don't want to give a partial match.
3066
3067 The "could_continue" variable is true if a state could have continued but
3068 for the fact that the end of the subject was reached. */
3069
3070 if (new_count <= 0)
3071 {
3072 if (rlevel == 1 && /* Top level, and */
3073 could_continue && /* Some could go on, and */
3074 forced_fail != workspace[1] && /* Not all forced fail & */
3075 ( /* either... */
3076 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3077 || /* or... */
3078 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3079 match_count < 0) /* no matches */
3080 ) && /* And... */
3081 (
3082 partial_newline || /* Either partial NL */
3083 ( /* or ... */
3084 ptr >= end_subject && /* End of subject and */
3085 ptr > md->start_used_ptr) /* Inspected non-empty string */
3086 )
3087 )
3088 match_count = PCRE_ERROR_PARTIAL;
3089 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3090 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3091 rlevel*2-2, SP));
3092 break; /* In effect, "return", but see the comment below */
3093 }
3094
3095 /* One or more states are active for the next character. */
3096
3097 ptr += clen; /* Advance to next subject character */
3098 } /* Loop to move along the subject string */
3099
3100 /* Control gets here from "break" a few lines above. We do it this way because
3101 if we use "return" above, we have compiler trouble. Some compilers warn if
3102 there's nothing here because they think the function doesn't return a value. On
3103 the other hand, if we put a dummy statement here, some more clever compilers
3104 complain that it can't be reached. Sigh. */
3105
3106 return match_count;
3107 }
3108
3109
3110
3111
3112 /*************************************************
3113 * Execute a Regular Expression - DFA engine *
3114 *************************************************/
3115
3116 /* This external function applies a compiled re to a subject string using a DFA
3117 engine. It checks its arguments, sets up the fixed match data, and then calls the
3118 internal function, doing so multiple times if the match is not anchored.
3119
3120 Arguments:
3121 argument_re points to the compiled expression; must not be NULL
3122 extra_data points to extra data (study data, callout data, tables) or is NULL
3123 subject points to the subject string; must not be NULL
3124 length length of subject string in code units (may contain binary zeros)
3125 start_offset where to start in the subject string, in code units (0 <= start_offset <= length)
3126 options option bits; any bit outside PUBLIC_DFA_EXEC_OPTIONS is an error
3127 offsets vector of match offsets; may be NULL only if offsetcount is 0
3128 offsetcount size of same; must not be negative
3129 workspace workspace vector; must not be NULL
3130 wscount size of same; must be at least 20
3131
3132 Returns: > 0 => number of match offset pairs placed in offsets
3133 = 0 => offsets overflowed; longest matches are present
3134 -1 => failed to match
3135 < -1 => some kind of unexpected problem
3136 */
3137
3138 #if defined COMPILE_PCRE8
3139 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3140 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3141 const char *subject, int length, int start_offset, int options, int *offsets,
3142 int offsetcount, int *workspace, int wscount)
3143 #elif defined COMPILE_PCRE16
3144 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3145 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3146 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3147 int offsetcount, int *workspace, int wscount)
3148 #elif defined COMPILE_PCRE32
3149 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3150 pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3151 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3152 int offsetcount, int *workspace, int wscount)
3153 #endif
3154 {
3155 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3156 dfa_match_data match_block;
3157 dfa_match_data *md = &match_block;
3158 BOOL utf, anchored, startline, firstline;
3159 const pcre_uchar *current_subject, *end_subject;
3160 const pcre_study_data *study = NULL;
3161
3162 const pcre_uchar *req_char_ptr;
3163 const pcre_uint8 *start_bits = NULL;
3164 BOOL has_first_char = FALSE;
3165 BOOL has_req_char = FALSE;
3166 pcre_uchar first_char = 0;
3167 pcre_uchar first_char2 = 0;
3168 pcre_uchar req_char = 0;
3169 pcre_uchar req_char2 = 0;
3170 int newline;
3171
3172 /* Plausibility checks on the arguments; each kind of failure has its own error code. */
3173
3174 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3175 if (re == NULL || subject == NULL || workspace == NULL ||
3176 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3177 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3178 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3179 if (length < 0) return PCRE_ERROR_BADLENGTH;
3180 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3181
3182 /* Check that the first field in the block is the magic number. If it is not,
3183 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3184 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3185 means that the pattern was probably compiled with different endianness. A pattern whose flags have no PCRE_MODE bit set gives PCRE_ERROR_BADMODE. */
3186
3187 if (re->magic_number != MAGIC_NUMBER)
3188 return re->magic_number == REVERSED_MAGIC_NUMBER?
3189 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3190 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3191
3192 /* If restarting after a partial match, do some sanity checks on the contents
3193 of the workspace: workspace[0] must be 0 or 1, and workspace[1] must lie between 1 and (wscount - 2)/INTS_PER_STATEBLOCK inclusive. */
3194
3195 if ((options & PCRE_DFA_RESTART) != 0)
3196 {
3197 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3198 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3199 return PCRE_ERROR_DFA_BADRESTART;
3200 }
3201
3202 /* Set up study, callout, and table data. Setting either match limit flag in extra_data is an error for this function. */
3203
3204 md->tables = re->tables;
3205 md->callout_data = NULL;
3206
3207 if (extra_data != NULL)
3208 {
3209 unsigned int flags = extra_data->flags;
3210 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3211 study = (const pcre_study_data *)extra_data->study_data;
3212 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3213 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3214 return PCRE_ERROR_DFA_UMLIMIT;
3215 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3216 md->callout_data = extra_data->callout_data;
3217 if ((flags & PCRE_EXTRA_TABLES) != 0)
3218 md->tables = extra_data->tables;
3219 }
3220
3221 /* Set some local values. req_char_ptr starts one before the starting point so that the first required-character search is always done. */
3222
3223 current_subject = (const pcre_uchar *)subject + start_offset;
3224 end_subject = (const pcre_uchar *)subject + length;
3225 req_char_ptr = current_subject - 1;
3226
3227 #ifdef SUPPORT_UTF
3228 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3229 utf = (re->options & PCRE_UTF8) != 0;
3230 #else
3231 utf = FALSE;
3232 #endif
3233
3234 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3235 (re->options & PCRE_ANCHORED) != 0; /* a restart is also treated as anchored */
3236
3237 /* The remaining fixed data for passing around. */
3238
3239 md->start_code = (const pcre_uchar *)argument_re +
3240 re->name_table_offset + re->name_count * re->name_entry_size;
3241 md->start_subject = (const pcre_uchar *)subject;
3242 md->end_subject = end_subject;
3243 md->start_offset = start_offset;
3244 md->moptions = options;
3245 md->poptions = re->options;
3246
3247 /* If the BSR option is not set at match time, copy what was set
3248 at compile time. If nothing was set then either, use PCRE_BSR_ANYCRLF when BSR_ANYCRLF is defined. */
3249
3250 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3251 {
3252 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3253 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3254 #ifdef BSR_ANYCRLF
3255 else md->moptions |= PCRE_BSR_ANYCRLF;
3256 #endif
3257 }
3258
3259 /* Handle different types of newline. The three bits give eight cases. If
3260 nothing is set at run time, whatever was used at compile time applies. */
3261
3262 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3263 PCRE_NEWLINE_BITS)
3264 {
3265 case 0: newline = NEWLINE; break; /* Compile-time default */
3266 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3267 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3268 case PCRE_NEWLINE_CR+
3269 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3270 case PCRE_NEWLINE_ANY: newline = -1; break;
3271 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3272 default: return PCRE_ERROR_BADNEWLINE;
3273 }
3274
3275 if (newline == -2)
3276 {
3277 md->nltype = NLTYPE_ANYCRLF;
3278 }
3279 else if (newline < 0)
3280 {
3281 md->nltype = NLTYPE_ANY;
3282 }
3283 else
3284 {
3285 md->nltype = NLTYPE_FIXED;
3286 if (newline > 255) /* two-character newline; the first character is in the high byte */
3287 {
3288 md->nllen = 2;
3289 md->nl[0] = (newline >> 8) & 255;
3290 md->nl[1] = newline & 255;
3291 }
3292 else
3293 {
3294 md->nllen = 1;
3295 md->nl[0] = newline;
3296 }
3297 }
3298
3299 /* Check a UTF string if required. If it is invalid and the offsets vector has room
3300 for at least two values, pass back the error offset in offsets[0] and the error code in offsets[1]. */
3301
3302 #ifdef SUPPORT_UTF
3303 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3304 {
3305 int erroroffset;
3306 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3307 if (errorcode != 0)
3308 {
3309 if (offsetcount >= 2)
3310 {
3311 offsets[0] = erroroffset;
3312 offsets[1] = errorcode;
3313 }
3314 #if defined COMPILE_PCRE8
3315 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3316 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3317 #elif defined COMPILE_PCRE16
3318 return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3319 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3320 #elif defined COMPILE_PCRE32
3321 return PCRE_ERROR_BADUTF32;
3322 #endif
3323 }
3324 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3325 if (start_offset > 0 && start_offset < length &&
3326 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3327 return PCRE_ERROR_BADUTF8_OFFSET;
3328 #endif
3329 }
3330 #endif
3331
3332 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3333 is a feature that makes it possible to save compiled regex and re-use them
3334 in other programs later. */
3335
3336 if (md->tables == NULL) md->tables = PRIV(default_tables);
3337
3338 /* The "must be at the start of a line" flags are used in a loop when finding
3339 where to start. */
3340
3341 startline = (re->flags & PCRE_STARTLINE) != 0;
3342 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3343
3344 /* Set up the first character to match, if available. The first_char value is
3345 never set for an anchored regular expression, but the anchoring may be forced
3346 at run time, so we have to test for anchoring. The first char may be unset for
3347 an unanchored pattern, of course. If there's no first char and the pattern was
3348 studied, there may be a bitmap of possible first characters. */
3349
3350 if (!anchored)
3351 {
3352 if ((re->flags & PCRE_FIRSTSET) != 0)
3353 {
3354 has_first_char = TRUE;
3355 first_char = first_char2 = (pcre_uchar)(re->first_char);
3356 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3357 {
3358 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3359 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3360 if (utf && first_char > 127)
3361 first_char2 = UCD_OTHERCASE(first_char);
3362 #endif
3363 }
3364 }
3365 else
3366 {
3367 if (!startline && study != NULL &&
3368 (study->flags & PCRE_STUDY_MAPPED) != 0)
3369 start_bits = study->start_bits;
3370 }
3371 }
3372
3373 /* For anchored or unanchored matches, there may be a "last known required
3374 character" set. */
3375
3376 if ((re->flags & PCRE_REQCHSET) != 0)
3377 {
3378 has_req_char = TRUE;
3379 req_char = req_char2 = (pcre_uchar)(re->req_char);
3380 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3381 {
3382 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3383 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3384 if (utf && req_char > 127)
3385 req_char2 = UCD_OTHERCASE(req_char);
3386 #endif
3387 }
3388 }
3389
3390 /* Call the main matching function, looping for a non-anchored regex after a
3391 failed match. If not restarting, perform certain optimizations at the start of
3392 a match. */
3393
3394 for (;;)
3395 {
3396 int rc;
3397
3398 if ((options & PCRE_DFA_RESTART) == 0)
3399 {
3400 const pcre_uchar *save_end_subject = end_subject;
3401
3402 /* If firstline is TRUE, the start of the match is constrained to the first
3403 line of a multiline string. Implement this by temporarily adjusting
3404 end_subject so that we stop scanning at a newline. If the match fails at
3405 the newline, later code breaks this loop. */
3406
3407 if (firstline)
3408 {
3409 PCRE_PUCHAR t = current_subject;
3410 #ifdef SUPPORT_UTF
3411 if (utf)
3412 {
3413 while (t < md->end_subject && !IS_NEWLINE(t))
3414 {
3415 t++;
3416 ACROSSCHAR(t < end_subject, *t, t++);
3417 }
3418 }
3419 else
3420 #endif
3421 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3422 end_subject = t;
3423 }
3424
3425 /* There are some optimizations that avoid running the match if a known
3426 starting point is not found. However, there is an option that disables
3427 these, for testing and for ensuring that all callouts do actually occur.
3428 The option can be set in the regex by (*NO_START_OPT) or passed in
3429 match-time options. */
3430
3431 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3432 {
3433 /* Advance to a known first char. */
3434
3435 if (has_first_char)
3436 {
3437 if (first_char != first_char2)
3438 {
3439 pcre_uchar csc;
3440 while (current_subject < end_subject &&
3441 (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3442 current_subject++;
3443 }
3444 else
3445 while (current_subject < end_subject &&
3446 RAWUCHARTEST(current_subject) != first_char)
3447 current_subject++;
3448 }
3449
3450 /* Or to just after a linebreak for a multiline match if possible */
3451
3452 else if (startline)
3453 {
3454 if (current_subject > md->start_subject + start_offset)
3455 {
3456 #ifdef SUPPORT_UTF
3457 if (utf)
3458 {
3459 while (current_subject < end_subject &&
3460 !WAS_NEWLINE(current_subject))
3461 {
3462 current_subject++;
3463 ACROSSCHAR(current_subject < end_subject, *current_subject,
3464 current_subject++);
3465 }
3466 }
3467 else
3468 #endif
3469 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3470 current_subject++;
3471
3472 /* If we have just passed a CR and the newline option is ANY or
3473 ANYCRLF, and we are now at a LF, advance the match position by one
3474 more character. */
3475
3476 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3477 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3478 current_subject < end_subject &&
3479 RAWUCHARTEST(current_subject) == CHAR_NL)
3480 current_subject++;
3481 }
3482 }
3483
3484 /* Or to a non-unique first char after study */
3485
3486 else if (start_bits != NULL)
3487 {
3488 while (current_subject < end_subject)
3489 {
3490 register pcre_uint32 c = RAWUCHARTEST(current_subject);
3491 #ifndef COMPILE_PCRE8
3492 if (c > 255) c = 255; /* keeps the bitmap index c/8 below 32 */
3493 #endif
3494 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3495 {
3496 current_subject++;
3497 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3498 /* In non 8-bit mode, the iteration will stop for
3499 characters > 255 at the beginning or not stop at all. */
3500 if (utf)
3501 ACROSSCHAR(current_subject < end_subject, *current_subject,
3502 current_subject++);
3503 #endif
3504 }
3505 else break;
3506 }
3507 }
3508 }
3509
3510 /* Restore fudged end_subject */
3511
3512 end_subject = save_end_subject;
3513
3514 /* The following two optimizations are disabled for partial matching or if
3515 disabling is explicitly requested (and of course, by the test above, this
3516 code is not obeyed when restarting after a partial match). */
3517
3518 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3519 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3520 {
3521 /* If the pattern was studied, a minimum subject length may be set. This
3522 is a lower bound; no actual string of that length may actually match the
3523 pattern. Although the value is, strictly, in characters, we treat it as
3524 code units to avoid spending too much time in this optimization. */
3525
3526 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3527 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3528 return PCRE_ERROR_NOMATCH;
3529
3530 /* If req_char is set, we know that that character must appear in the
3531 subject for the match to succeed. If the first character is set, req_char
3532 must be later in the subject; otherwise the test starts at the match
3533 point. This optimization can save a huge amount of work in patterns with
3534 nested unlimited repeats that aren't going to match. Writing separate
3535 code for cased/caseless versions makes it go faster, as does using an
3536 autoincrement and backing off on a match.
3537
3538 HOWEVER: when the subject string is very, very long, searching to its end
3539 can take a long time, and give bad performance on quite ordinary
3540 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3541 string... so we don't do this when the string is sufficiently long. */
3542
3543 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3544 {
3545 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3546
3547 /* We don't need to repeat the search if we haven't yet reached the
3548 place we found it at last time. */
3549
3550 if (p > req_char_ptr)
3551 {
3552 if (req_char != req_char2)
3553 {
3554 while (p < end_subject)
3555 {
3556 register pcre_uint32 pp = RAWUCHARINCTEST(p);
3557 if (pp == req_char || pp == req_char2) { p--; break; }
3558 }
3559 }
3560 else
3561 {
3562 while (p < end_subject)
3563 {
3564 if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3565 }
3566 }
3567
3568 /* If we can't find the required character, break the matching loop,
3569 which will cause a return of PCRE_ERROR_NOMATCH. */
3570
3571 if (p >= end_subject) break;
3572
3573 /* If we have found the required character, save the point where we
3574 found it, so that we don't search again next time round the loop if
3575 the start hasn't passed this character yet. */
3576
3577 req_char_ptr = p;
3578 }
3579 }
3580 }
3581 } /* End of optimizations that are done when not restarting */
3582
3583 /* OK, now we can do the business */
3584
3585 md->start_used_ptr = current_subject;
3586 md->recursive = NULL;
3587
3588 rc = internal_dfa_exec(
3589 md, /* fixed match data */
3590 md->start_code, /* this subexpression's code */
3591 current_subject, /* where we currently are */
3592 start_offset, /* start offset in subject */
3593 offsets, /* offset vector */
3594 offsetcount, /* size of same */
3595 workspace, /* workspace vector */
3596 wscount, /* size of same */
3597 0); /* function recurse level */
3598
3599 /* Anything other than "no match" means we are done, always; otherwise, carry
3600 on only if not anchored. For a partial match, when there is room, offsets[0] and offsets[1] are set from md->start_used_ptr and the end of the subject, and offsets[2], if present, from the current starting point. */
3601
3602 if (rc != PCRE_ERROR_NOMATCH || anchored)
3603 {
3604 if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3605 {
3606 offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3607 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3608 if (offsetcount > 2)
3609 offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3610 }
3611 return rc;
3612 }
3613
3614 /* Advance to the next subject character unless we are at the end of a line
3615 and firstline is set. */
3616
3617 if (firstline && IS_NEWLINE(current_subject)) break;
3618 current_subject++;
3619 #ifdef SUPPORT_UTF
3620 if (utf)
3621 {
3622 ACROSSCHAR(current_subject < end_subject, *current_subject,
3623 current_subject++);
3624 }
3625 #endif
3626 if (current_subject > end_subject) break;
3627
3628 /* If we have just passed a CR and we are now at a LF, and the pattern does
3629 not contain any explicit matches for \r or \n, and the newline option is CRLF
3630 or ANY or ANYCRLF, advance the match position by one more character. */
3631
3632 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3633 current_subject < end_subject &&
3634 RAWUCHARTEST(current_subject) == CHAR_NL &&
3635 (re->flags & PCRE_HASCRORLF) == 0 &&
3636 (md->nltype == NLTYPE_ANY ||
3637 md->nltype == NLTYPE_ANYCRLF ||
3638 md->nllen == 2))
3639 current_subject++;
3640
3641 } /* "Bumpalong" loop */
3642
3643 return PCRE_ERROR_NOMATCH; /* reached only by breaking out of the loop above */
3644 }
3645
3646 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

  ViewVC Help
Powered by ViewVC 1.1.5