1 |
/*************************************************
|
2 |
* Perl-Compatible Regular Expressions *
|
3 |
*************************************************/
|
4 |
|
5 |
/* PCRE is a library of functions to support regular expressions whose syntax
|
6 |
and semantics are as close as possible to those of the Perl 5 language.
|
7 |
|
8 |
Written by Philip Hazel
|
9 |
Copyright (c) 1997-2007 University of Cambridge
|
10 |
|
11 |
-----------------------------------------------------------------------------
|
12 |
Redistribution and use in source and binary forms, with or without
|
13 |
modification, are permitted provided that the following conditions are met:
|
14 |
|
15 |
* Redistributions of source code must retain the above copyright notice,
|
16 |
this list of conditions and the following disclaimer.
|
17 |
|
18 |
* Redistributions in binary form must reproduce the above copyright
|
19 |
notice, this list of conditions and the following disclaimer in the
|
20 |
documentation and/or other materials provided with the distribution.
|
21 |
|
22 |
* Neither the name of the University of Cambridge nor the names of its
|
23 |
contributors may be used to endorse or promote products derived from
|
24 |
this software without specific prior written permission.
|
25 |
|
26 |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
27 |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
28 |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
29 |
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
30 |
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
31 |
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
32 |
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
33 |
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
34 |
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
35 |
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
36 |
POSSIBILITY OF SUCH DAMAGE.
|
37 |
-----------------------------------------------------------------------------
|
38 |
*/
|
39 |
|
40 |
|
41 |
/* This module contains the external function pcre_dfa_exec(), which is an
|
42 |
alternative matching function that uses a sort of DFA algorithm (not a true
|
43 |
FSM). This is NOT Perl-compatible, but it has advantages in certain
|
44 |
applications. */
|
45 |
|
46 |
|
47 |
#define NLBLOCK md /* Block containing newline information */
|
48 |
#define PSSTART start_subject /* Field containing processed string start */
|
49 |
#define PSEND end_subject /* Field containing processed string end */
|
50 |
|
51 |
#include "pcre_internal.h"
|
52 |
|
53 |
|
54 |
/* For use to indent debugging output */
|
55 |
|
56 |
#define SP " "
|
57 |
|
58 |
|
59 |
|
60 |
/*************************************************
|
61 |
* Code parameters and static tables *
|
62 |
*************************************************/
|
63 |
|
64 |
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
|
65 |
into others, under special conditions. A gap of 20 between the blocks should be
|
66 |
enough. */
|
67 |
|
68 |
#define OP_PROP_EXTRA 100
|
69 |
#define OP_EXTUNI_EXTRA 120
|
70 |
#define OP_ANYNL_EXTRA 140
|
71 |
|
72 |
|
73 |
/* This table identifies those opcodes that are followed immediately by a
|
74 |
character that is to be tested in some way. This makes it possible to
|
75 |
centralize the loading of these characters. In the case of Type * etc, the
|
76 |
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
|
77 |
small value. */
|
78 |
|
79 |
/* Table indexed by opcode. A zero entry means the opcode is not followed by an
inline character. A non-zero entry is the offset from the opcode byte to that
character: 1 when it follows immediately, 3 when a two-byte repeat count comes
first (upto, minupto, exact and their possessive forms). The table is only ever
read, so it is declared const. */

static const uschar coptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
  0, 0,                          /* Any, Anybyte                           */
  0, 0, 0, 0,                    /* NOTPROP, PROP, EXTUNI, ANYNL           */
  0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
  1,                             /* Char                                   */
  1,                             /* Charnc                                 */
  1,                             /* not                                    */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  3, 3, 3,                       /* upto, minupto, exact                   */
  1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  3, 3, 3,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  3, 3, 3,                       /* Type upto, minupto, exact              */
  1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  0, 0,                          /* CRRANGE, CRMINRANGE                    */
  0,                             /* CLASS                                  */
  0,                             /* NCLASS                                 */
  0,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* Reverse                                */
  0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
  0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
  0,                             /* CREF                                   */
  0,                             /* RREF                                   */
  0,                             /* DEF                                    */
  0, 0                           /* BRAZERO, BRAMINZERO                    */
};
|
125 |
|
126 |
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
|
127 |
and \w */
|
128 |
|
129 |
static uschar toptable1[] = {
|
130 |
0, 0, 0, 0, 0,
|
131 |
ctype_digit, ctype_digit,
|
132 |
ctype_space, ctype_space,
|
133 |
ctype_word, ctype_word,
|
134 |
0 /* OP_ANY */
|
135 |
};
|
136 |
|
137 |
static uschar toptable2[] = {
|
138 |
0, 0, 0, 0, 0,
|
139 |
ctype_digit, 0,
|
140 |
ctype_space, 0,
|
141 |
ctype_word, 0,
|
142 |
1 /* OP_ANY */
|
143 |
};
|
144 |
|
145 |
|
146 |
/* Structure for holding data about a particular state, which is in effect the
|
147 |
current data for an active path through the match tree. It must consist
|
148 |
entirely of ints because the working vector we are passed, and which we put
|
149 |
these structures in, is a vector of ints. */
|
150 |
|
151 |
/* One entry in a state list. Every member must be an int, because these
blocks are laid out directly in the caller-supplied int workspace vector. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means
                                     "hold off" until data characters have
                                     been skipped */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of workspace ints occupied by one stateblock. The value is cast to
int so that arithmetic mixing it with the signed workspace size stays signed
instead of being silently converted to size_t. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
|
159 |
|
160 |
|
161 |
#ifdef DEBUG
|
162 |
/*************************************************
|
163 |
* Print character string *
|
164 |
*************************************************/
|
165 |
|
166 |
/* Character string printing function for debugging.
|
167 |
|
168 |
Arguments:
|
169 |
p points to string
|
170 |
length number of bytes
|
171 |
f where to print
|
172 |
|
173 |
Returns: nothing
|
174 |
*/
|
175 |
|
176 |
/* Debugging aid: write length bytes, starting at p, to the stream f. Bytes
for which isprint() is true are written as themselves; every other byte is
written as a backslash, an 'x', and two lower-case hex digits.

Arguments:
  p           points to the first byte
  length      number of bytes to print (nothing is printed if <= 0)
  f           where to print

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;                     /* Always 0..UCHAR_MAX, safe for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", (unsigned int)c);
  }
}
|
188 |
#endif
|
189 |
|
190 |
|
191 |
|
192 |
/*************************************************
|
193 |
* Execute a Regular Expression - DFA engine *
|
194 |
*************************************************/
|
195 |
|
196 |
/* This internal function applies a compiled pattern to a subject string,
|
197 |
starting at a given point, using a DFA engine. This function is called from the
|
198 |
external one, possibly multiple times if the pattern is not anchored. The
|
199 |
function calls itself recursively for some kinds of subpattern.
|
200 |
|
201 |
Arguments:
|
202 |
md the match_data block with fixed information
|
203 |
this_start_code the opening bracket of this subexpression's code
|
204 |
current_subject where we currently are in the subject string
|
205 |
start_offset start offset in the subject string
|
206 |
offsets vector to contain the matching string offsets
|
207 |
offsetcount size of same
|
208 |
workspace vector of workspace
|
209 |
wscount size of same
|
210 |
ims the current ims flags
|
211 |
rlevel function call recursion level
|
212 |
recursing regex recursive call level
|
213 |
|
214 |
Returns: > 0 => number of match offset pairs placed in offsets
|
215 |
= 0 => offsets vector too small for every match; the longest are present
|
216 |
-1 => failed to match
|
217 |
< -1 => some kind of unexpected problem
|
218 |
|
219 |
The following macros are used for adding states to the two state vectors (one
|
220 |
for the current character, one for the following character). */
|
221 |
|
222 |
/* Append a state (opcode offset x, repeat count y, current ims) to the active
list, or make the enclosing function return PCRE_ERROR_DFA_WSSIZE when the
list is full. The macro refers implicitly to the locals active_count, wscount,
next_active_state, ims and rlevel. It is wrapped in do/while(0) so that it
behaves as one statement, and the debug trace prints the stored fields so that
x and y are evaluated exactly once. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, \
      next_active_state->offset, next_active_state->count)); \
    next_active_state++; \
    } \
  while (0)
|
232 |
|
233 |
/* As ADD_ACTIVE, but also stores z in the data field of the new state. The
macro refers implicitly to the locals active_count, wscount,
next_active_state, ims and rlevel, and makes the enclosing function return
PCRE_ERROR_DFA_WSSIZE when the active list is full. Each argument is
evaluated exactly once. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, \
      next_active_state->offset, next_active_state->count, \
      next_active_state->data)); \
    next_active_state++; \
    } \
  while (0)
|
244 |
|
245 |
/* Append a state (opcode offset x, repeat count y, current ims) to the new
list, i.e. the list for the following character, or make the enclosing
function return PCRE_ERROR_DFA_WSSIZE when the list is full. The macro refers
implicitly to the locals new_count, wscount, next_new_state, ims and rlevel.
It is wrapped in do/while(0) so that it behaves as one statement, and the
debug trace prints the stored fields so that x and y are evaluated exactly
once. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, \
      next_new_state->offset, next_new_state->count)); \
    next_new_state++; \
    } \
  while (0)
|
255 |
|
256 |
/* As ADD_NEW, but also stores z in the data field of the new state. The macro
refers implicitly to the locals new_count, wscount, next_new_state, ims and
rlevel, and makes the enclosing function return PCRE_ERROR_DFA_WSSIZE when
the new list is full. Each argument is evaluated exactly once. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, \
      next_new_state->offset, next_new_state->count, \
      next_new_state->data)); \
    next_new_state++; \
    } \
  while (0)
|
267 |
|
268 |
/* And now, here is the code */
|
269 |
|
270 |
static int
|
271 |
internal_dfa_exec(
|
272 |
dfa_match_data *md,
|
273 |
const uschar *this_start_code,
|
274 |
const uschar *current_subject,
|
275 |
int start_offset,
|
276 |
int *offsets,
|
277 |
int offsetcount,
|
278 |
int *workspace,
|
279 |
int wscount,
|
280 |
int ims,
|
281 |
int rlevel,
|
282 |
int recursing)
|
283 |
{
|
284 |
stateblock *active_states, *new_states, *temp_states;
|
285 |
stateblock *next_active_state, *next_new_state;
|
286 |
|
287 |
const uschar *ctypes, *lcc, *fcc;
|
288 |
const uschar *ptr;
|
289 |
const uschar *end_code, *first_op;
|
290 |
|
291 |
int active_count, new_count, match_count;
|
292 |
|
293 |
/* Some fields in the md block are frequently referenced, so we load them into
|
294 |
independent variables in the hope that this will perform better. */
|
295 |
|
296 |
const uschar *start_subject = md->start_subject;
|
297 |
const uschar *end_subject = md->end_subject;
|
298 |
const uschar *start_code = md->start_code;
|
299 |
|
300 |
#ifdef SUPPORT_UTF8
|
301 |
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
|
302 |
#else
|
303 |
BOOL utf8 = FALSE;
|
304 |
#endif
|
305 |
|
306 |
rlevel++;
|
307 |
offsetcount &= (-2);
|
308 |
|
309 |
wscount -= 2;
|
310 |
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
|
311 |
(2 * INTS_PER_STATEBLOCK);
|
312 |
|
313 |
DPRINTF(("\n%.*s---------------------\n"
|
314 |
"%.*sCall to internal_dfa_exec f=%d r=%d\n",
|
315 |
rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
|
316 |
|
317 |
ctypes = md->tables + ctypes_offset;
|
318 |
lcc = md->tables + lcc_offset;
|
319 |
fcc = md->tables + fcc_offset;
|
320 |
|
321 |
match_count = PCRE_ERROR_NOMATCH; /* A negative number */
|
322 |
|
323 |
active_states = (stateblock *)(workspace + 2);
|
324 |
next_new_state = new_states = active_states + wscount;
|
325 |
new_count = 0;
|
326 |
|
327 |
first_op = this_start_code + 1 + LINK_SIZE +
|
328 |
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
|
329 |
|
330 |
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
|
331 |
the alternative states onto the list, and find out where the end is. This
|
332 |
makes it possible to use this function recursively, when we want to stop at a
|
333 |
matching internal ket rather than at the end.
|
334 |
|
335 |
If the first opcode in the first alternative is OP_REVERSE, we are dealing with
|
336 |
a backward assertion. In that case, we have to find out the maximum amount to
|
337 |
move back, and set up each alternative appropriately. */
|
338 |
|
339 |
if (*first_op == OP_REVERSE)
|
340 |
{
|
341 |
int max_back = 0;
|
342 |
int gone_back;
|
343 |
|
344 |
end_code = this_start_code;
|
345 |
do
|
346 |
{
|
347 |
int back = GET(end_code, 2+LINK_SIZE);
|
348 |
if (back > max_back) max_back = back;
|
349 |
end_code += GET(end_code, 1);
|
350 |
}
|
351 |
while (*end_code == OP_ALT);
|
352 |
|
353 |
/* If we can't go back the amount required for the longest lookbehind
|
354 |
pattern, go back as far as we can; some alternatives may still be viable. */
|
355 |
|
356 |
#ifdef SUPPORT_UTF8
|
357 |
/* In character mode we have to step back character by character */
|
358 |
|
359 |
if (utf8)
|
360 |
{
|
361 |
for (gone_back = 0; gone_back < max_back; gone_back++)
|
362 |
{
|
363 |
if (current_subject <= start_subject) break;
|
364 |
current_subject--;
|
365 |
while (current_subject > start_subject &&
|
366 |
(*current_subject & 0xc0) == 0x80)
|
367 |
current_subject--;
|
368 |
}
|
369 |
}
|
370 |
else
|
371 |
#endif
|
372 |
|
373 |
/* In byte-mode we can do this quickly. */
|
374 |
|
375 |
{
|
376 |
gone_back = (current_subject - max_back < start_subject)?
|
377 |
current_subject - start_subject : max_back;
|
378 |
current_subject -= gone_back;
|
379 |
}
|
380 |
|
381 |
/* Now we can process the individual branches. */
|
382 |
|
383 |
end_code = this_start_code;
|
384 |
do
|
385 |
{
|
386 |
int back = GET(end_code, 2+LINK_SIZE);
|
387 |
if (back <= gone_back)
|
388 |
{
|
389 |
int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
|
390 |
ADD_NEW_DATA(-bstate, 0, gone_back - back);
|
391 |
}
|
392 |
end_code += GET(end_code, 1);
|
393 |
}
|
394 |
while (*end_code == OP_ALT);
|
395 |
}
|
396 |
|
397 |
/* This is the code for a "normal" subpattern (not a backward assertion). The
|
398 |
start of a whole pattern is always one of these. If we are at the top level,
|
399 |
we may be asked to restart matching from the same point that we reached for a
|
400 |
previous partial match. We still have to scan through the top-level branches to
|
401 |
find the end state. */
|
402 |
|
403 |
else
|
404 |
{
|
405 |
end_code = this_start_code;
|
406 |
|
407 |
/* Restarting */
|
408 |
|
409 |
if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
|
410 |
{
|
411 |
do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
|
412 |
new_count = workspace[1];
|
413 |
if (!workspace[0])
|
414 |
memcpy(new_states, active_states, new_count * sizeof(stateblock));
|
415 |
}
|
416 |
|
417 |
/* Not restarting */
|
418 |
|
419 |
else
|
420 |
{
|
421 |
int length = 1 + LINK_SIZE +
|
422 |
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
|
423 |
do
|
424 |
{
|
425 |
ADD_NEW(end_code - start_code + length, 0);
|
426 |
end_code += GET(end_code, 1);
|
427 |
length = 1 + LINK_SIZE;
|
428 |
}
|
429 |
while (*end_code == OP_ALT);
|
430 |
}
|
431 |
}
|
432 |
|
433 |
workspace[0] = 0; /* Bit indicating which vector is current */
|
434 |
|
435 |
DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
|
436 |
|
437 |
/* Loop for scanning the subject */
|
438 |
|
439 |
ptr = current_subject;
|
440 |
for (;;)
|
441 |
{
|
442 |
int i, j;
|
443 |
int clen, dlen;
|
444 |
unsigned int c, d;
|
445 |
|
446 |
/* Make the new state list into the active state list and empty the
|
447 |
new state list. */
|
448 |
|
449 |
temp_states = active_states;
|
450 |
active_states = new_states;
|
451 |
new_states = temp_states;
|
452 |
active_count = new_count;
|
453 |
new_count = 0;
|
454 |
|
455 |
workspace[0] ^= 1; /* Remember for the restarting feature */
|
456 |
workspace[1] = active_count;
|
457 |
|
458 |
#ifdef DEBUG
|
459 |
printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
|
460 |
pchars((uschar *)ptr, strlen((char *)ptr), stdout);
|
461 |
printf("\"\n");
|
462 |
|
463 |
printf("%.*sActive states: ", rlevel*2-2, SP);
|
464 |
for (i = 0; i < active_count; i++)
|
465 |
printf("%d/%d ", active_states[i].offset, active_states[i].count);
|
466 |
printf("\n");
|
467 |
#endif
|
468 |
|
469 |
/* Set the pointers for adding new states */
|
470 |
|
471 |
next_active_state = active_states + active_count;
|
472 |
next_new_state = new_states;
|
473 |
|
474 |
/* Load the current character from the subject outside the loop, as many
|
475 |
different states may want to look at it, and we assume that at least one
|
476 |
will. */
|
477 |
|
478 |
if (ptr < end_subject)
|
479 |
{
|
480 |
clen = 1; /* Number of bytes in the character */
|
481 |
#ifdef SUPPORT_UTF8
|
482 |
if (utf8) { GETCHARLEN(c, ptr, clen); } else
|
483 |
#endif /* SUPPORT_UTF8 */
|
484 |
c = *ptr;
|
485 |
}
|
486 |
else
|
487 |
{
|
488 |
clen = 0; /* This indicates the end of the subject */
|
489 |
c = NOTACHAR; /* This value should never actually be used */
|
490 |
}
|
491 |
|
492 |
/* Scan up the active states and act on each one. The result of an action
|
493 |
may be to add more states to the currently active list (e.g. on hitting a
|
494 |
parenthesis) or it may be to put states on the new list, for considering
|
495 |
when we move the character pointer on. */
|
496 |
|
497 |
for (i = 0; i < active_count; i++)
|
498 |
{
|
499 |
stateblock *current_state = active_states + i;
|
500 |
const uschar *code;
|
501 |
int state_offset = current_state->offset;
|
502 |
int count, codevalue;
|
503 |
int chartype, script;
|
504 |
|
505 |
#ifdef DEBUG
|
506 |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
|
507 |
if (clen == 0) printf("EOL\n");
|
508 |
else if (c > 32 && c < 127) printf("'%c'\n", c);
|
509 |
else printf("0x%02x\n", c);
|
510 |
#endif
|
511 |
|
512 |
/* This variable is referred to implicitly in the ADD_xxx macros. */
|
513 |
|
514 |
ims = current_state->ims;
|
515 |
|
516 |
/* A negative offset is a special case meaning "hold off going to this
|
517 |
(negated) state until the number of characters in the data field have
|
518 |
been skipped". */
|
519 |
|
520 |
if (state_offset < 0)
|
521 |
{
|
522 |
if (current_state->data > 0)
|
523 |
{
|
524 |
DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
|
525 |
ADD_NEW_DATA(state_offset, current_state->count,
|
526 |
current_state->data - 1);
|
527 |
continue;
|
528 |
}
|
529 |
else
|
530 |
{
|
531 |
current_state->offset = state_offset = -state_offset;
|
532 |
}
|
533 |
}
|
534 |
|
535 |
/* Check for a duplicate state with the same count, and skip if found. */
|
536 |
|
537 |
for (j = 0; j < i; j++)
|
538 |
{
|
539 |
if (active_states[j].offset == state_offset &&
|
540 |
active_states[j].count == current_state->count)
|
541 |
{
|
542 |
DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
|
543 |
goto NEXT_ACTIVE_STATE;
|
544 |
}
|
545 |
}
|
546 |
|
547 |
/* The state offset is the offset to the opcode */
|
548 |
|
549 |
code = start_code + state_offset;
|
550 |
codevalue = *code;
|
551 |
|
552 |
/* If this opcode is followed by an inline character, load it. It is
|
553 |
tempting to test for the presence of a subject character here, but that
|
554 |
is wrong, because sometimes zero repetitions of the subject are
|
555 |
permitted.
|
556 |
|
557 |
We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
|
558 |
argument that is not a data character - but is always one byte long.
|
559 |
Unfortunately, we have to take special action to deal with \P, \p, and
|
560 |
\X in this case. To keep the other cases fast, convert these ones to new
|
561 |
opcodes. */
|
562 |
|
563 |
if (coptable[codevalue] > 0)
|
564 |
{
|
565 |
dlen = 1;
|
566 |
#ifdef SUPPORT_UTF8
|
567 |
if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
|
568 |
#endif /* SUPPORT_UTF8 */
|
569 |
d = code[coptable[codevalue]];
|
570 |
if (codevalue >= OP_TYPESTAR)
|
571 |
{
|
572 |
switch(d)
|
573 |
{
|
574 |
case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
|
575 |
case OP_NOTPROP:
|
576 |
case OP_PROP: codevalue += OP_PROP_EXTRA; break;
|
577 |
case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
|
578 |
case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
|
579 |
default: break;
|
580 |
}
|
581 |
}
|
582 |
}
|
583 |
else
|
584 |
{
|
585 |
dlen = 0; /* Not strictly necessary, but compilers moan */
|
586 |
d = NOTACHAR; /* if these variables are not set. */
|
587 |
}
|
588 |
|
589 |
|
590 |
/* Now process the individual opcodes */
|
591 |
|
592 |
switch (codevalue)
|
593 |
{
|
594 |
|
595 |
/* ========================================================================== */
|
596 |
/* Reached a closing bracket. If not at the end of the pattern, carry
|
597 |
on with the next opcode. Otherwise, unless we have an empty string and
|
598 |
PCRE_NOTEMPTY is set, save the match data, shifting up all previous
|
599 |
matches so we always have the longest first. */
|
600 |
|
601 |
case OP_KET:
|
602 |
case OP_KETRMIN:
|
603 |
case OP_KETRMAX:
|
604 |
if (code != end_code)
|
605 |
{
|
606 |
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
|
607 |
if (codevalue != OP_KET)
|
608 |
{
|
609 |
ADD_ACTIVE(state_offset - GET(code, 1), 0);
|
610 |
}
|
611 |
}
|
612 |
else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
|
613 |
{
|
614 |
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
|
615 |
else if (match_count > 0 && ++match_count * 2 >= offsetcount)
|
616 |
match_count = 0;
|
617 |
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
|
618 |
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
|
619 |
if (offsetcount >= 2)
|
620 |
{
|
621 |
offsets[0] = current_subject - start_subject;
|
622 |
offsets[1] = ptr - start_subject;
|
623 |
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
|
624 |
offsets[1] - offsets[0], current_subject));
|
625 |
}
|
626 |
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
|
627 |
{
|
628 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
|
629 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
|
630 |
match_count, rlevel*2-2, SP));
|
631 |
return match_count;
|
632 |
}
|
633 |
}
|
634 |
break;
|
635 |
|
636 |
/* ========================================================================== */
|
637 |
/* These opcodes add to the current list of states without looking
|
638 |
at the current character. */
|
639 |
|
640 |
/*-----------------------------------------------------------------*/
|
641 |
case OP_ALT:
|
642 |
do { code += GET(code, 1); } while (*code == OP_ALT);
|
643 |
ADD_ACTIVE(code - start_code, 0);
|
644 |
break;
|
645 |
|
646 |
/*-----------------------------------------------------------------*/
|
647 |
case OP_BRA:
|
648 |
case OP_SBRA:
|
649 |
do
|
650 |
{
|
651 |
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
652 |
code += GET(code, 1);
|
653 |
}
|
654 |
while (*code == OP_ALT);
|
655 |
break;
|
656 |
|
657 |
/*-----------------------------------------------------------------*/
|
658 |
case OP_CBRA:
|
659 |
case OP_SCBRA:
|
660 |
ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
|
661 |
code += GET(code, 1);
|
662 |
while (*code == OP_ALT)
|
663 |
{
|
664 |
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
665 |
code += GET(code, 1);
|
666 |
}
|
667 |
break;
|
668 |
|
669 |
/*-----------------------------------------------------------------*/
|
670 |
case OP_BRAZERO:
|
671 |
case OP_BRAMINZERO:
|
672 |
ADD_ACTIVE(state_offset + 1, 0);
|
673 |
code += 1 + GET(code, 2);
|
674 |
while (*code == OP_ALT) code += GET(code, 1);
|
675 |
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
676 |
break;
|
677 |
|
678 |
/*-----------------------------------------------------------------*/
|
679 |
case OP_CIRC:
|
680 |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
|
681 |
((ims & PCRE_MULTILINE) != 0 &&
|
682 |
ptr != end_subject &&
|
683 |
WAS_NEWLINE(ptr)))
|
684 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
685 |
break;
|
686 |
|
687 |
/*-----------------------------------------------------------------*/
|
688 |
case OP_EOD:
|
689 |
if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
|
690 |
break;
|
691 |
|
692 |
/*-----------------------------------------------------------------*/
|
693 |
case OP_OPT:
|
694 |
ims = code[1];
|
695 |
ADD_ACTIVE(state_offset + 2, 0);
|
696 |
break;
|
697 |
|
698 |
/*-----------------------------------------------------------------*/
|
699 |
case OP_SOD:
|
700 |
if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
|
701 |
break;
|
702 |
|
703 |
/*-----------------------------------------------------------------*/
|
704 |
case OP_SOM:
|
705 |
if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
|
706 |
break;
|
707 |
|
708 |
|
709 |
/* ========================================================================== */
|
710 |
/* These opcodes inspect the next subject character, and sometimes
|
711 |
the previous one as well, but do not have an argument. The variable
|
712 |
clen contains the length of the current character and is zero if we are
|
713 |
at the end of the subject. */
|
714 |
|
715 |
/*-----------------------------------------------------------------*/
|
716 |
case OP_ANY:
|
717 |
if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
|
718 |
{ ADD_NEW(state_offset + 1, 0); }
|
719 |
break;
|
720 |
|
721 |
/*-----------------------------------------------------------------*/
|
722 |
case OP_EODN:
|
723 |
if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
|
724 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
725 |
break;
|
726 |
|
727 |
/*-----------------------------------------------------------------*/
|
728 |
case OP_DOLL:
|
729 |
if ((md->moptions & PCRE_NOTEOL) == 0)
|
730 |
{
|
731 |
if (clen == 0 ||
|
732 |
(IS_NEWLINE(ptr) &&
|
733 |
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
|
734 |
))
|
735 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
736 |
}
|
737 |
else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
|
738 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
739 |
break;
|
740 |
|
741 |
/*-----------------------------------------------------------------*/
|
742 |
|
743 |
case OP_DIGIT:
|
744 |
case OP_WHITESPACE:
|
745 |
case OP_WORDCHAR:
|
746 |
if (clen > 0 && c < 256 &&
|
747 |
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
|
748 |
{ ADD_NEW(state_offset + 1, 0); }
|
749 |
break;
|
750 |
|
751 |
/*-----------------------------------------------------------------*/
|
752 |
case OP_NOT_DIGIT:
|
753 |
case OP_NOT_WHITESPACE:
|
754 |
case OP_NOT_WORDCHAR:
|
755 |
if (clen > 0 && (c >= 256 ||
|
756 |
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
|
757 |
{ ADD_NEW(state_offset + 1, 0); }
|
758 |
break;
|
759 |
|
760 |
/*-----------------------------------------------------------------*/
|
761 |
case OP_WORD_BOUNDARY:
|
762 |
case OP_NOT_WORD_BOUNDARY:
|
763 |
{
|
764 |
int left_word, right_word;
|
765 |
|
766 |
if (ptr > start_subject)
|
767 |
{
|
768 |
const uschar *temp = ptr - 1;
|
769 |
#ifdef SUPPORT_UTF8
|
770 |
if (utf8) BACKCHAR(temp);
|
771 |
#endif
|
772 |
GETCHARTEST(d, temp);
|
773 |
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
|
774 |
}
|
775 |
else left_word = 0;
|
776 |
|
777 |
if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
|
778 |
else right_word = 0;
|
779 |
|
780 |
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
|
781 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
782 |
}
|
783 |
break;
|
784 |
|
785 |
|
786 |
#ifdef SUPPORT_UCP
|
787 |
|
788 |
/*-----------------------------------------------------------------*/
|
789 |
/* Check the next character by Unicode property. We will get here only
|
790 |
if the support is in the binary; otherwise a compile-time error occurs.
|
791 |
*/
|
792 |
|
793 |
case OP_PROP:
|
794 |
case OP_NOTPROP:
|
795 |
if (clen > 0)
|
796 |
{
|
797 |
BOOL OK;
|
798 |
int category = _pcre_ucp_findprop(c, &chartype, &script);
|
799 |
switch(code[1])
|
800 |
{
|
801 |
case PT_ANY:
|
802 |
OK = TRUE;
|
803 |
break;
|
804 |
|
805 |
case PT_LAMP:
|
806 |
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
807 |
break;
|
808 |
|
809 |
case PT_GC:
|
810 |
OK = category == code[2];
|
811 |
break;
|
812 |
|
813 |
case PT_PC:
|
814 |
OK = chartype == code[2];
|
815 |
break;
|
816 |
|
817 |
case PT_SC:
|
818 |
OK = script == code[2];
|
819 |
break;
|
820 |
|
821 |
/* Should never occur, but keep compilers from grumbling. */
|
822 |
|
823 |
default:
|
824 |
OK = codevalue != OP_PROP;
|
825 |
break;
|
826 |
}
|
827 |
|
828 |
if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
|
829 |
}
|
830 |
break;
|
831 |
#endif
|
832 |
|
833 |
|
834 |
|
835 |
/* ========================================================================== */
|
836 |
/* These opcodes likewise inspect the subject character, but have an
|
837 |
argument that is not a data character. It is one of these opcodes:
|
838 |
OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
|
839 |
OP_NOT_WORDCHAR. The value is loaded into d. */
|
840 |
|
841 |
case OP_TYPEPLUS:
|
842 |
case OP_TYPEMINPLUS:
|
843 |
case OP_TYPEPOSPLUS:
|
844 |
count = current_state->count; /* Already matched */
|
845 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
846 |
if (clen > 0)
|
847 |
{
|
848 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
849 |
(c < 256 &&
|
850 |
(d != OP_ANY ||
|
851 |
(ims & PCRE_DOTALL) != 0 ||
|
852 |
!IS_NEWLINE(ptr)
|
853 |
) &&
|
854 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
855 |
{
|
856 |
if (count > 0 && codevalue == OP_TYPEPOSPLUS)
|
857 |
{
|
858 |
active_count--; /* Remove non-match possibility */
|
859 |
next_active_state--;
|
860 |
}
|
861 |
count++;
|
862 |
ADD_NEW(state_offset, count);
|
863 |
}
|
864 |
}
|
865 |
break;
|
866 |
|
867 |
/*-----------------------------------------------------------------*/
|
868 |
case OP_TYPEQUERY:
|
869 |
case OP_TYPEMINQUERY:
|
870 |
case OP_TYPEPOSQUERY:
|
871 |
ADD_ACTIVE(state_offset + 2, 0);
|
872 |
if (clen > 0)
|
873 |
{
|
874 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
875 |
(c < 256 &&
|
876 |
(d != OP_ANY ||
|
877 |
(ims & PCRE_DOTALL) != 0 ||
|
878 |
!IS_NEWLINE(ptr)
|
879 |
) &&
|
880 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
881 |
{
|
882 |
if (codevalue == OP_TYPEPOSQUERY)
|
883 |
{
|
884 |
active_count--; /* Remove non-match possibility */
|
885 |
next_active_state--;
|
886 |
}
|
887 |
ADD_NEW(state_offset + 2, 0);
|
888 |
}
|
889 |
}
|
890 |
break;
|
891 |
|
892 |
/*-----------------------------------------------------------------*/
|
893 |
case OP_TYPESTAR:
|
894 |
case OP_TYPEMINSTAR:
|
895 |
case OP_TYPEPOSSTAR:
|
896 |
ADD_ACTIVE(state_offset + 2, 0);
|
897 |
if (clen > 0)
|
898 |
{
|
899 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
900 |
(c < 256 &&
|
901 |
(d != OP_ANY ||
|
902 |
(ims & PCRE_DOTALL) != 0 ||
|
903 |
!IS_NEWLINE(ptr)
|
904 |
) &&
|
905 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
906 |
{
|
907 |
if (codevalue == OP_TYPEPOSSTAR)
|
908 |
{
|
909 |
active_count--; /* Remove non-match possibility */
|
910 |
next_active_state--;
|
911 |
}
|
912 |
ADD_NEW(state_offset, 0);
|
913 |
}
|
914 |
}
|
915 |
break;
|
916 |
|
917 |
/*-----------------------------------------------------------------*/
|
918 |
case OP_TYPEEXACT:
|
919 |
count = current_state->count; /* Number already matched */
|
920 |
if (clen > 0)
|
921 |
{
|
922 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
923 |
(c < 256 &&
|
924 |
(d != OP_ANY ||
|
925 |
(ims & PCRE_DOTALL) != 0 ||
|
926 |
!IS_NEWLINE(ptr)
|
927 |
) &&
|
928 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
929 |
{
|
930 |
if (++count >= GET2(code, 1))
|
931 |
{ ADD_NEW(state_offset + 4, 0); }
|
932 |
else
|
933 |
{ ADD_NEW(state_offset, count); }
|
934 |
}
|
935 |
}
|
936 |
break;
|
937 |
|
938 |
/*-----------------------------------------------------------------*/
|
939 |
case OP_TYPEUPTO:
|
940 |
case OP_TYPEMINUPTO:
|
941 |
case OP_TYPEPOSUPTO:
|
942 |
ADD_ACTIVE(state_offset + 4, 0);
|
943 |
count = current_state->count; /* Number already matched */
|
944 |
if (clen > 0)
|
945 |
{
|
946 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
947 |
(c < 256 &&
|
948 |
(d != OP_ANY ||
|
949 |
(ims & PCRE_DOTALL) != 0 ||
|
950 |
!IS_NEWLINE(ptr)
|
951 |
) &&
|
952 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
953 |
{
|
954 |
if (codevalue == OP_TYPEPOSUPTO)
|
955 |
{
|
956 |
active_count--; /* Remove non-match possibility */
|
957 |
next_active_state--;
|
958 |
}
|
959 |
if (++count >= GET2(code, 1))
|
960 |
{ ADD_NEW(state_offset + 4, 0); }
|
961 |
else
|
962 |
{ ADD_NEW(state_offset, count); }
|
963 |
}
|
964 |
}
|
965 |
break;
|
966 |
|
967 |
/* ========================================================================== */
|
968 |
/* These are virtual opcodes that are used when something like
|
969 |
OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
|
970 |
argument. It keeps the code above fast for the other cases. The argument
|
971 |
is in the d variable. */
|
972 |
|
973 |
case OP_PROP_EXTRA + OP_TYPEPLUS:
|
974 |
case OP_PROP_EXTRA + OP_TYPEMINPLUS:
|
975 |
case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
|
976 |
count = current_state->count; /* Already matched */
|
977 |
if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
|
978 |
if (clen > 0)
|
979 |
{
|
980 |
BOOL OK;
|
981 |
int category = _pcre_ucp_findprop(c, &chartype, &script);
|
982 |
switch(code[2])
|
983 |
{
|
984 |
case PT_ANY:
|
985 |
OK = TRUE;
|
986 |
break;
|
987 |
|
988 |
case PT_LAMP:
|
989 |
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
990 |
break;
|
991 |
|
992 |
case PT_GC:
|
993 |
OK = category == code[3];
|
994 |
break;
|
995 |
|
996 |
case PT_PC:
|
997 |
OK = chartype == code[3];
|
998 |
break;
|
999 |
|
1000 |
case PT_SC:
|
1001 |
OK = script == code[3];
|
1002 |
break;
|
1003 |
|
1004 |
/* Should never occur, but keep compilers from grumbling. */
|
1005 |
|
1006 |
default:
|
1007 |
OK = codevalue != OP_PROP;
|
1008 |
break;
|
1009 |
}
|
1010 |
|
1011 |
if (OK == (d == OP_PROP))
|
1012 |
{
|
1013 |
if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
|
1014 |
{
|
1015 |
active_count--; /* Remove non-match possibility */
|
1016 |
next_active_state--;
|
1017 |
}
|
1018 |
count++;
|
1019 |
ADD_NEW(state_offset, count);
|
1020 |
}
|
1021 |
}
|
1022 |
break;
|
1023 |
|
1024 |
/*-----------------------------------------------------------------*/
|
1025 |
case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
|
1026 |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
|
1027 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
|
1028 |
count = current_state->count; /* Already matched */
|
1029 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1030 |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
|
1031 |
{
|
1032 |
const uschar *nptr = ptr + clen;
|
1033 |
int ncount = 0;
|
1034 |
if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
|
1035 |
{
|
1036 |
active_count--; /* Remove non-match possibility */
|
1037 |
next_active_state--;
|
1038 |
}
|
1039 |
while (nptr < end_subject)
|
1040 |
{
|
1041 |
int nd;
|
1042 |
int ndlen = 1;
|
1043 |
GETCHARLEN(nd, nptr, ndlen);
|
1044 |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
|
1045 |
ncount++;
|
1046 |
nptr += ndlen;
|
1047 |
}
|
1048 |
count++;
|
1049 |
ADD_NEW_DATA(-state_offset, count, ncount);
|
1050 |
}
|
1051 |
break;
|
1052 |
|
1053 |
/*-----------------------------------------------------------------*/
|
1054 |
case OP_ANYNL_EXTRA + OP_TYPEPLUS:
|
1055 |
case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
|
1056 |
case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
|
1057 |
count = current_state->count; /* Already matched */
|
1058 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1059 |
if (clen > 0)
|
1060 |
{
|
1061 |
int ncount = 0;
|
1062 |
switch (c)
|
1063 |
{
|
1064 |
case 0x000d:
|
1065 |
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
|
1066 |
/* Fall through */
|
1067 |
case 0x000a:
|
1068 |
case 0x000b:
|
1069 |
case 0x000c:
|
1070 |
case 0x0085:
|
1071 |
case 0x2028:
|
1072 |
case 0x2029:
|
1073 |
if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
|
1074 |
{
|
1075 |
active_count--; /* Remove non-match possibility */
|
1076 |
next_active_state--;
|
1077 |
}
|
1078 |
count++;
|
1079 |
ADD_NEW_DATA(-state_offset, count, ncount);
|
1080 |
break;
|
1081 |
default:
|
1082 |
break;
|
1083 |
}
|
1084 |
}
|
1085 |
break;
|
1086 |
|
1087 |
/*-----------------------------------------------------------------*/
|
1088 |
case OP_PROP_EXTRA + OP_TYPEQUERY:
|
1089 |
case OP_PROP_EXTRA + OP_TYPEMINQUERY:
|
1090 |
case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
|
1091 |
count = 4;
|
1092 |
goto QS1;
|
1093 |
|
1094 |
case OP_PROP_EXTRA + OP_TYPESTAR:
|
1095 |
case OP_PROP_EXTRA + OP_TYPEMINSTAR:
|
1096 |
case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
|
1097 |
count = 0;
|
1098 |
|
1099 |
QS1:
|
1100 |
|
1101 |
ADD_ACTIVE(state_offset + 4, 0);
|
1102 |
if (clen > 0)
|
1103 |
{
|
1104 |
BOOL OK;
|
1105 |
int category = _pcre_ucp_findprop(c, &chartype, &script);
|
1106 |
switch(code[2])
|
1107 |
{
|
1108 |
case PT_ANY:
|
1109 |
OK = TRUE;
|
1110 |
break;
|
1111 |
|
1112 |
case PT_LAMP:
|
1113 |
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
1114 |
break;
|
1115 |
|
1116 |
case PT_GC:
|
1117 |
OK = category == code[3];
|
1118 |
break;
|
1119 |
|
1120 |
case PT_PC:
|
1121 |
OK = chartype == code[3];
|
1122 |
break;
|
1123 |
|
1124 |
case PT_SC:
|
1125 |
OK = script == code[3];
|
1126 |
break;
|
1127 |
|
1128 |
/* Should never occur, but keep compilers from grumbling. */
|
1129 |
|
1130 |
default:
|
1131 |
OK = codevalue != OP_PROP;
|
1132 |
break;
|
1133 |
}
|
1134 |
|
1135 |
if (OK == (d == OP_PROP))
|
1136 |
{
|
1137 |
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
|
1138 |
codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
|
1139 |
{
|
1140 |
active_count--; /* Remove non-match possibility */
|
1141 |
next_active_state--;
|
1142 |
}
|
1143 |
ADD_NEW(state_offset + count, 0);
|
1144 |
}
|
1145 |
}
|
1146 |
break;
|
1147 |
|
1148 |
/*-----------------------------------------------------------------*/
|
1149 |
case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
|
1150 |
case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
|
1151 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
|
1152 |
count = 2;
|
1153 |
goto QS2;
|
1154 |
|
1155 |
case OP_EXTUNI_EXTRA + OP_TYPESTAR:
|
1156 |
case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
|
1157 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
|
1158 |
count = 0;
|
1159 |
|
1160 |
QS2:
|
1161 |
|
1162 |
ADD_ACTIVE(state_offset + 2, 0);
|
1163 |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
|
1164 |
{
|
1165 |
const uschar *nptr = ptr + clen;
|
1166 |
int ncount = 0;
|
1167 |
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
|
1168 |
codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
|
1169 |
{
|
1170 |
active_count--; /* Remove non-match possibility */
|
1171 |
next_active_state--;
|
1172 |
}
|
1173 |
while (nptr < end_subject)
|
1174 |
{
|
1175 |
int nd;
|
1176 |
int ndlen = 1;
|
1177 |
GETCHARLEN(nd, nptr, ndlen);
|
1178 |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
|
1179 |
ncount++;
|
1180 |
nptr += ndlen;
|
1181 |
}
|
1182 |
ADD_NEW_DATA(-(state_offset + count), 0, ncount);
|
1183 |
}
|
1184 |
break;
|
1185 |
|
1186 |
/*-----------------------------------------------------------------*/
|
1187 |
case OP_ANYNL_EXTRA + OP_TYPEQUERY:
|
1188 |
case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
|
1189 |
case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
|
1190 |
count = 2;
|
1191 |
goto QS3;
|
1192 |
|
1193 |
case OP_ANYNL_EXTRA + OP_TYPESTAR:
|
1194 |
case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
|
1195 |
case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
|
1196 |
count = 0;
|
1197 |
|
1198 |
QS3:
|
1199 |
ADD_ACTIVE(state_offset + 2, 0);
|
1200 |
if (clen > 0)
|
1201 |
{
|
1202 |
int ncount = 0;
|
1203 |
switch (c)
|
1204 |
{
|
1205 |
case 0x000d:
|
1206 |
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
|
1207 |
/* Fall through */
|
1208 |
case 0x000a:
|
1209 |
case 0x000b:
|
1210 |
case 0x000c:
|
1211 |
case 0x0085:
|
1212 |
case 0x2028:
|
1213 |
case 0x2029:
|
1214 |
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
|
1215 |
codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
|
1216 |
{
|
1217 |
active_count--; /* Remove non-match possibility */
|
1218 |
next_active_state--;
|
1219 |
}
|
1220 |
ADD_NEW_DATA(-(state_offset + count), 0, ncount);
|
1221 |
break;
|
1222 |
default:
|
1223 |
break;
|
1224 |
}
|
1225 |
}
|
1226 |
break;
|
1227 |
|
1228 |
/*-----------------------------------------------------------------*/
|
1229 |
case OP_PROP_EXTRA + OP_TYPEEXACT:
|
1230 |
case OP_PROP_EXTRA + OP_TYPEUPTO:
|
1231 |
case OP_PROP_EXTRA + OP_TYPEMINUPTO:
|
1232 |
case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
|
1233 |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
|
1234 |
{ ADD_ACTIVE(state_offset + 6, 0); }
|
1235 |
count = current_state->count; /* Number already matched */
|
1236 |
if (clen > 0)
|
1237 |
{
|
1238 |
BOOL OK;
|
1239 |
int category = _pcre_ucp_findprop(c, &chartype, &script);
|
1240 |
switch(code[4])
|
1241 |
{
|
1242 |
case PT_ANY:
|
1243 |
OK = TRUE;
|
1244 |
break;
|
1245 |
|
1246 |
case PT_LAMP:
|
1247 |
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
1248 |
break;
|
1249 |
|
1250 |
case PT_GC:
|
1251 |
OK = category == code[5];
|
1252 |
break;
|
1253 |
|
1254 |
case PT_PC:
|
1255 |
OK = chartype == code[5];
|
1256 |
break;
|
1257 |
|
1258 |
case PT_SC:
|
1259 |
OK = script == code[5];
|
1260 |
break;
|
1261 |
|
1262 |
/* Should never occur, but keep compilers from grumbling. */
|
1263 |
|
1264 |
default:
|
1265 |
OK = codevalue != OP_PROP;
|
1266 |
break;
|
1267 |
}
|
1268 |
|
1269 |
if (OK == (d == OP_PROP))
|
1270 |
{
|
1271 |
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
|
1272 |
{
|
1273 |
active_count--; /* Remove non-match possibility */
|
1274 |
next_active_state--;
|
1275 |
}
|
1276 |
if (++count >= GET2(code, 1))
|
1277 |
{ ADD_NEW(state_offset + 6, 0); }
|
1278 |
else
|
1279 |
{ ADD_NEW(state_offset, count); }
|
1280 |
}
|
1281 |
}
|
1282 |
break;
|
1283 |
|
1284 |
/*-----------------------------------------------------------------*/
|
1285 |
case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
|
1286 |
case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
|
1287 |
case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
|
1288 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
|
1289 |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
|
1290 |
{ ADD_ACTIVE(state_offset + 4, 0); }
|
1291 |
count = current_state->count; /* Number already matched */
|
1292 |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
|
1293 |
{
|
1294 |
const uschar *nptr = ptr + clen;
|
1295 |
int ncount = 0;
|
1296 |
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
|
1297 |
{
|
1298 |
active_count--; /* Remove non-match possibility */
|
1299 |
next_active_state--;
|
1300 |
}
|
1301 |
while (nptr < end_subject)
|
1302 |
{
|
1303 |
int nd;
|
1304 |
int ndlen = 1;
|
1305 |
GETCHARLEN(nd, nptr, ndlen);
|
1306 |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
|
1307 |
ncount++;
|
1308 |
nptr += ndlen;
|
1309 |
}
|
1310 |
if (++count >= GET2(code, 1))
|
1311 |
{ ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
|
1312 |
else
|
1313 |
{ ADD_NEW_DATA(-state_offset, count, ncount); }
|
1314 |
}
|
1315 |
break;
|
1316 |
|
1317 |
/*-----------------------------------------------------------------*/
|
1318 |
case OP_ANYNL_EXTRA + OP_TYPEEXACT:
|
1319 |
case OP_ANYNL_EXTRA + OP_TYPEUPTO:
|
1320 |
case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
|
1321 |
case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
|
1322 |
if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
|
1323 |
{ ADD_ACTIVE(state_offset + 4, 0); }
|
1324 |
count = current_state->count; /* Number already matched */
|
1325 |
if (clen > 0)
|
1326 |
{
|
1327 |
int ncount = 0;
|
1328 |
switch (c)
|
1329 |
{
|
1330 |
case 0x000d:
|
1331 |
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
|
1332 |
/* Fall through */
|
1333 |
case 0x000a:
|
1334 |
case 0x000b:
|
1335 |
case 0x000c:
|
1336 |
case 0x0085:
|
1337 |
case 0x2028:
|
1338 |
case 0x2029:
|
1339 |
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
|
1340 |
{
|
1341 |
active_count--; /* Remove non-match possibility */
|
1342 |
next_active_state--;
|
1343 |
}
|
1344 |
if (++count >= GET2(code, 1))
|
1345 |
{ ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
|
1346 |
else
|
1347 |
{ ADD_NEW_DATA(-state_offset, count, ncount); }
|
1348 |
break;
|
1349 |
default:
|
1350 |
break;
|
1351 |
}
|
1352 |
}
|
1353 |
break;
|
1354 |
|
1355 |
/* ========================================================================== */
|
1356 |
/* These opcodes are followed by a character that is usually compared
|
1357 |
to the current subject character; it is loaded into d. We still get
|
1358 |
here even if there is no subject character, because in some cases zero
|
1359 |
repetitions are permitted. */
|
1360 |
|
1361 |
/*-----------------------------------------------------------------*/
|
1362 |
case OP_CHAR:
|
1363 |
if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
|
1364 |
break;
|
1365 |
|
1366 |
/*-----------------------------------------------------------------*/
|
1367 |
case OP_CHARNC:
|
1368 |
if (clen == 0) break;
|
1369 |
|
1370 |
#ifdef SUPPORT_UTF8
|
1371 |
if (utf8)
|
1372 |
{
|
1373 |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
|
1374 |
{
|
1375 |
unsigned int othercase;
|
1376 |
if (c < 128) othercase = fcc[c]; else
|
1377 |
|
1378 |
/* If we have Unicode property support, we can use it to test the
|
1379 |
other case of the character. */
|
1380 |
|
1381 |
#ifdef SUPPORT_UCP
|
1382 |
othercase = _pcre_ucp_othercase(c);
|
1383 |
#else
|
1384 |
othercase = NOTACHAR;
|
1385 |
#endif
|
1386 |
|
1387 |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
|
1388 |
}
|
1389 |
}
|
1390 |
else
|
1391 |
#endif /* SUPPORT_UTF8 */
|
1392 |
|
1393 |
/* Non-UTF-8 mode */
|
1394 |
{
|
1395 |
if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
|
1396 |
}
|
1397 |
break;
|
1398 |
|
1399 |
|
1400 |
#ifdef SUPPORT_UCP
|
1401 |
/*-----------------------------------------------------------------*/
|
1402 |
/* This is a tricky one because it can match more than one character.
|
1403 |
Find out how many characters to skip, and then set up a negative state
|
1404 |
to wait for them to pass before continuing. */
|
1405 |
|
1406 |
case OP_EXTUNI:
|
1407 |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
|
1408 |
{
|
1409 |
const uschar *nptr = ptr + clen;
|
1410 |
int ncount = 0;
|
1411 |
while (nptr < end_subject)
|
1412 |
{
|
1413 |
int nclen = 1;
|
1414 |
GETCHARLEN(c, nptr, nclen);
|
1415 |
if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
|
1416 |
ncount++;
|
1417 |
nptr += nclen;
|
1418 |
}
|
1419 |
ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
|
1420 |
}
|
1421 |
break;
|
1422 |
#endif
|
1423 |
|
1424 |
/*-----------------------------------------------------------------*/
|
1425 |
/* This is tricky like EXTUNI because it too can match more than one
|
1426 |
character (when CR is followed by LF). In this case, set up a negative
|
1427 |
state to wait for one character to pass before continuing. */
|
1428 |
|
1429 |
case OP_ANYNL:
|
1430 |
if (clen > 0) switch(c)
|
1431 |
{
|
1432 |
case 0x000a:
|
1433 |
case 0x000b:
|
1434 |
case 0x000c:
|
1435 |
case 0x0085:
|
1436 |
case 0x2028:
|
1437 |
case 0x2029:
|
1438 |
ADD_NEW(state_offset + 1, 0);
|
1439 |
break;
|
1440 |
case 0x000d:
|
1441 |
if (ptr + 1 < end_subject && ptr[1] == 0x0a)
|
1442 |
{
|
1443 |
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
|
1444 |
}
|
1445 |
else
|
1446 |
{
|
1447 |
ADD_NEW(state_offset + 1, 0);
|
1448 |
}
|
1449 |
break;
|
1450 |
}
|
1451 |
break;
|
1452 |
|
1453 |
/*-----------------------------------------------------------------*/
|
1454 |
/* Match a negated single character. This is only used for one-byte
|
1455 |
characters, that is, we know that d < 256. The character we are
|
1456 |
checking (c) can be multibyte. */
|
1457 |
|
1458 |
case OP_NOT:
|
1459 |
if (clen > 0)
|
1460 |
{
|
1461 |
unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
|
1462 |
if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
|
1463 |
}
|
1464 |
break;
|
1465 |
|
1466 |
/*-----------------------------------------------------------------*/
|
1467 |
case OP_PLUS:
|
1468 |
case OP_MINPLUS:
|
1469 |
case OP_POSPLUS:
|
1470 |
case OP_NOTPLUS:
|
1471 |
case OP_NOTMINPLUS:
|
1472 |
case OP_NOTPOSPLUS:
|
1473 |
count = current_state->count; /* Already matched */
|
1474 |
if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
|
1475 |
if (clen > 0)
|
1476 |
{
|
1477 |
unsigned int otherd = NOTACHAR;
|
1478 |
if ((ims & PCRE_CASELESS) != 0)
|
1479 |
{
|
1480 |
#ifdef SUPPORT_UTF8
|
1481 |
if (utf8 && d >= 128)
|
1482 |
{
|
1483 |
#ifdef SUPPORT_UCP
|
1484 |
otherd = _pcre_ucp_othercase(d);
|
1485 |
#endif /* SUPPORT_UCP */
|
1486 |
}
|
1487 |
else
|
1488 |
#endif /* SUPPORT_UTF8 */
|
1489 |
otherd = fcc[d];
|
1490 |
}
|
1491 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
1492 |
{
|
1493 |
if (count > 0 &&
|
1494 |
(codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
|
1495 |
{
|
1496 |
active_count--; /* Remove non-match possibility */
|
1497 |
next_active_state--;
|
1498 |
}
|
1499 |
count++;
|
1500 |
ADD_NEW(state_offset, count);
|
1501 |
}
|
1502 |
}
|
1503 |
break;
|
1504 |
|
1505 |
/*-----------------------------------------------------------------*/
|
1506 |
case OP_QUERY:
|
1507 |
case OP_MINQUERY:
|
1508 |
case OP_POSQUERY:
|
1509 |
case OP_NOTQUERY:
|
1510 |
case OP_NOTMINQUERY:
|
1511 |
case OP_NOTPOSQUERY:
|
1512 |
ADD_ACTIVE(state_offset + dlen + 1, 0);
|
1513 |
if (clen > 0)
|
1514 |
{
|
1515 |
unsigned int otherd = NOTACHAR;
|
1516 |
if ((ims & PCRE_CASELESS) != 0)
|
1517 |
{
|
1518 |
#ifdef SUPPORT_UTF8
|
1519 |
if (utf8 && d >= 128)
|
1520 |
{
|
1521 |
#ifdef SUPPORT_UCP
|
1522 |
otherd = _pcre_ucp_othercase(d);
|
1523 |
#endif /* SUPPORT_UCP */
|
1524 |
}
|
1525 |
else
|
1526 |
#endif /* SUPPORT_UTF8 */
|
1527 |
otherd = fcc[d];
|
1528 |
}
|
1529 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
1530 |
{
|
1531 |
if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
|
1532 |
{
|
1533 |
active_count--; /* Remove non-match possibility */
|
1534 |
next_active_state--;
|
1535 |
}
|
1536 |
ADD_NEW(state_offset + dlen + 1, 0);
|
1537 |
}
|
1538 |
}
|
1539 |
break;
|
1540 |
|
1541 |
/*-----------------------------------------------------------------*/
|
1542 |
case OP_STAR:
|
1543 |
case OP_MINSTAR:
|
1544 |
case OP_POSSTAR:
|
1545 |
case OP_NOTSTAR:
|
1546 |
case OP_NOTMINSTAR:
|
1547 |
case OP_NOTPOSSTAR:
|
1548 |
ADD_ACTIVE(state_offset + dlen + 1, 0);
|
1549 |
if (clen > 0)
|
1550 |
{
|
1551 |
unsigned int otherd = NOTACHAR;
|
1552 |
if ((ims & PCRE_CASELESS) != 0)
|
1553 |
{
|
1554 |
#ifdef SUPPORT_UTF8
|
1555 |
if (utf8 && d >= 128)
|
1556 |
{
|
1557 |
#ifdef SUPPORT_UCP
|
1558 |
otherd = _pcre_ucp_othercase(d);
|
1559 |
#endif /* SUPPORT_UCP */
|
1560 |
}
|
1561 |
else
|
1562 |
#endif /* SUPPORT_UTF8 */
|
1563 |
otherd = fcc[d];
|
1564 |
}
|
1565 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
1566 |
{
|
1567 |
if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
|
1568 |
{
|
1569 |
active_count--; /* Remove non-match possibility */
|
1570 |
next_active_state--;
|
1571 |
}
|
1572 |
ADD_NEW(state_offset, 0);
|
1573 |
}
|
1574 |
}
|
1575 |
break;
|
1576 |
|
1577 |
/*-----------------------------------------------------------------*/
|
1578 |
case OP_EXACT:
|
1579 |
case OP_NOTEXACT:
|
1580 |
count = current_state->count; /* Number already matched */
|
1581 |
if (clen > 0)
|
1582 |
{
|
1583 |
unsigned int otherd = NOTACHAR;
|
1584 |
if ((ims & PCRE_CASELESS) != 0)
|
1585 |
{
|
1586 |
#ifdef SUPPORT_UTF8
|
1587 |
if (utf8 && d >= 128)
|
1588 |
{
|
1589 |
#ifdef SUPPORT_UCP
|
1590 |
otherd = _pcre_ucp_othercase(d);
|
1591 |
#endif /* SUPPORT_UCP */
|
1592 |
}
|
1593 |
else
|
1594 |
#endif /* SUPPORT_UTF8 */
|
1595 |
otherd = fcc[d];
|
1596 |
}
|
1597 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
1598 |
{
|
1599 |
if (++count >= GET2(code, 1))
|
1600 |
{ ADD_NEW(state_offset + dlen + 3, 0); }
|
1601 |
else
|
1602 |
{ ADD_NEW(state_offset, count); }
|
1603 |
}
|
1604 |
}
|
1605 |
break;
|
1606 |
|
1607 |
/*-----------------------------------------------------------------*/
|
1608 |
case OP_UPTO:
|
1609 |
case OP_MINUPTO:
|
1610 |
case OP_POSUPTO:
|
1611 |
case OP_NOTUPTO:
|
1612 |
case OP_NOTMINUPTO:
|
1613 |
case OP_NOTPOSUPTO:
|
1614 |
ADD_ACTIVE(state_offset + dlen + 3, 0);
|
1615 |
count = current_state->count; /* Number already matched */
|
1616 |
if (clen > 0)
|
1617 |
{
|
1618 |
unsigned int otherd = NOTACHAR;
|
1619 |
if ((ims & PCRE_CASELESS) != 0)
|
1620 |
{
|
1621 |
#ifdef SUPPORT_UTF8
|
1622 |
if (utf8 && d >= 128)
|
1623 |
{
|
1624 |
#ifdef SUPPORT_UCP
|
1625 |
otherd = _pcre_ucp_othercase(d);
|
1626 |
#endif /* SUPPORT_UCP */
|
1627 |
}
|
1628 |
else
|
1629 |
#endif /* SUPPORT_UTF8 */
|
1630 |
otherd = fcc[d];
|
1631 |
}
|
1632 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
1633 |
{
|
1634 |
if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
|
1635 |
{
|
1636 |
active_count--; /* Remove non-match possibility */
|
1637 |
next_active_state--;
|
1638 |
}
|
1639 |
if (++count >= GET2(code, 1))
|
1640 |
{ ADD_NEW(state_offset + dlen + 3, 0); }
|
1641 |
else
|
1642 |
{ ADD_NEW(state_offset, count); }
|
1643 |
}
|
1644 |
}
|
1645 |
break;
|
1646 |
|
1647 |
|
1648 |
/* ========================================================================== */
|
1649 |
/* These are the class-handling opcodes */
|
1650 |
|
1651 |
case OP_CLASS:
|
1652 |
case OP_NCLASS:
|
1653 |
case OP_XCLASS:
|
1654 |
{
|
1655 |
BOOL isinclass = FALSE;
|
1656 |
int next_state_offset;
|
1657 |
const uschar *ecode;
|
1658 |
|
1659 |
/* For a simple class, there is always just a 32-byte table, and we
|
1660 |
can set isinclass from it. */
|
1661 |
|
1662 |
if (codevalue != OP_XCLASS)
|
1663 |
{
|
1664 |
ecode = code + 33;
|
1665 |
if (clen > 0)
|
1666 |
{
|
1667 |
isinclass = (c > 255)? (codevalue == OP_NCLASS) :
|
1668 |
((code[1 + c/8] & (1 << (c&7))) != 0);
|
1669 |
}
|
1670 |
}
|
1671 |
|
1672 |
/* An extended class may have a table or a list of single characters,
|
1673 |
ranges, or both, and it may be positive or negative. There's a
|
1674 |
function that sorts all this out. */
|
1675 |
|
1676 |
else
|
1677 |
{
|
1678 |
ecode = code + GET(code, 1);
|
1679 |
if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
|
1680 |
}
|
1681 |
|
1682 |
/* At this point, isinclass is set for all kinds of class, and ecode
|
1683 |
points to the byte after the end of the class. If there is a
|
1684 |
quantifier, this is where it will be. */
|
1685 |
|
1686 |
next_state_offset = ecode - start_code;
|
1687 |
|
1688 |
switch (*ecode)
|
1689 |
{
|
1690 |
case OP_CRSTAR:
|
1691 |
case OP_CRMINSTAR:
|
1692 |
ADD_ACTIVE(next_state_offset + 1, 0);
|
1693 |
if (isinclass) { ADD_NEW(state_offset, 0); }
|
1694 |
break;
|
1695 |
|
1696 |
case OP_CRPLUS:
|
1697 |
case OP_CRMINPLUS:
|
1698 |
count = current_state->count; /* Already matched */
|
1699 |
if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
|
1700 |
if (isinclass) { count++; ADD_NEW(state_offset, count); }
|
1701 |
break;
|
1702 |
|
1703 |
case OP_CRQUERY:
|
1704 |
case OP_CRMINQUERY:
|
1705 |
ADD_ACTIVE(next_state_offset + 1, 0);
|
1706 |
if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
|
1707 |
break;
|
1708 |
|
1709 |
case OP_CRRANGE:
|
1710 |
case OP_CRMINRANGE:
|
1711 |
count = current_state->count; /* Already matched */
|
1712 |
if (count >= GET2(ecode, 1))
|
1713 |
{ ADD_ACTIVE(next_state_offset + 5, 0); }
|
1714 |
if (isinclass)
|
1715 |
{
|
1716 |
int max = GET2(ecode, 3);
|
1717 |
if (++count >= max && max != 0) /* Max 0 => no limit */
|
1718 |
{ ADD_NEW(next_state_offset + 5, 0); }
|
1719 |
else
|
1720 |
{ ADD_NEW(state_offset, count); }
|
1721 |
}
|
1722 |
break;
|
1723 |
|
1724 |
default:
|
1725 |
if (isinclass) { ADD_NEW(next_state_offset, 0); }
|
1726 |
break;
|
1727 |
}
|
1728 |
}
|
1729 |
break;
|
1730 |
|
1731 |
/* ========================================================================== */
|
1732 |
/* These are the opcodes for fancy brackets of various kinds. We have
|
1733 |
to use recursion in order to handle them. */
|
1734 |
|
1735 |
case OP_ASSERT:
|
1736 |
case OP_ASSERT_NOT:
|
1737 |
case OP_ASSERTBACK:
|
1738 |
case OP_ASSERTBACK_NOT:
|
1739 |
{
|
1740 |
int rc;
|
1741 |
int local_offsets[2];
|
1742 |
int local_workspace[1000];
|
1743 |
const uschar *endasscode = code + GET(code, 1);
|
1744 |
|
1745 |
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
|
1746 |
|
1747 |
rc = internal_dfa_exec(
|
1748 |
md, /* static match data */
|
1749 |
code, /* this subexpression's code */
|
1750 |
ptr, /* where we currently are */
|
1751 |
ptr - start_subject, /* start offset */
|
1752 |
local_offsets, /* offset vector */
|
1753 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
1754 |
local_workspace, /* workspace vector */
|
1755 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
1756 |
ims, /* the current ims flags */
|
1757 |
rlevel, /* function recursion level */
|
1758 |
recursing); /* pass on regex recursion */
|
1759 |
|
1760 |
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
|
1761 |
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
|
1762 |
}
|
1763 |
break;
|
1764 |
|
1765 |
/*-----------------------------------------------------------------*/
|
1766 |
case OP_COND:
|
1767 |
case OP_SCOND:
|
1768 |
{
|
1769 |
int local_offsets[1000];
|
1770 |
int local_workspace[1000];
|
1771 |
int condcode = code[LINK_SIZE+1];
|
1772 |
|
1773 |
/* Back reference conditions are not supported */
|
1774 |
|
1775 |
if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
|
1776 |
|
1777 |
/* The DEFINE condition is always false */
|
1778 |
|
1779 |
if (condcode == OP_DEF)
|
1780 |
{
|
1781 |
ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
|
1782 |
}
|
1783 |
|
1784 |
/* The only supported version of OP_RREF is for the value RREF_ANY,
|
1785 |
which means "test if in any recursion". We can't test for specifically
|
1786 |
recursed groups. */
|
1787 |
|
1788 |
else if (condcode == OP_RREF)
|
1789 |
{
|
1790 |
int value = GET2(code, LINK_SIZE+2);
|
1791 |
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
|
1792 |
if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
|
1793 |
else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
|
1794 |
}
|
1795 |
|
1796 |
/* Otherwise, the condition is an assertion */
|
1797 |
|
1798 |
else
|
1799 |
{
|
1800 |
int rc;
|
1801 |
const uschar *asscode = code + LINK_SIZE + 1;
|
1802 |
const uschar *endasscode = asscode + GET(asscode, 1);
|
1803 |
|
1804 |
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
|
1805 |
|
1806 |
rc = internal_dfa_exec(
|
1807 |
md, /* fixed match data */
|
1808 |
asscode, /* this subexpression's code */
|
1809 |
ptr, /* where we currently are */
|
1810 |
ptr - start_subject, /* start offset */
|
1811 |
local_offsets, /* offset vector */
|
1812 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
1813 |
local_workspace, /* workspace vector */
|
1814 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
1815 |
ims, /* the current ims flags */
|
1816 |
rlevel, /* function recursion level */
|
1817 |
recursing); /* pass on regex recursion */
|
1818 |
|
1819 |
if ((rc >= 0) ==
|
1820 |
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
|
1821 |
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
|
1822 |
else
|
1823 |
{ ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
|
1824 |
}
|
1825 |
}
|
1826 |
break;
|
1827 |
|
1828 |
/*-----------------------------------------------------------------*/
|
1829 |
case OP_RECURSE:
|
1830 |
{
|
1831 |
int local_offsets[1000];
|
1832 |
int local_workspace[1000];
|
1833 |
int rc;
|
1834 |
|
1835 |
DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
|
1836 |
recursing + 1));
|
1837 |
|
1838 |
rc = internal_dfa_exec(
|
1839 |
md, /* fixed match data */
|
1840 |
start_code + GET(code, 1), /* this subexpression's code */
|
1841 |
ptr, /* where we currently are */
|
1842 |
ptr - start_subject, /* start offset */
|
1843 |
local_offsets, /* offset vector */
|
1844 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
1845 |
local_workspace, /* workspace vector */
|
1846 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
1847 |
ims, /* the current ims flags */
|
1848 |
rlevel, /* function recursion level */
|
1849 |
recursing + 1); /* regex recurse level */
|
1850 |
|
1851 |
DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
|
1852 |
recursing + 1, rc));
|
1853 |
|
1854 |
/* Ran out of internal offsets */
|
1855 |
|
1856 |
if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
|
1857 |
|
1858 |
/* For each successful matched substring, set up the next state with a
|
1859 |
count of characters to skip before trying it. Note that the count is in
|
1860 |
characters, not bytes. */
|
1861 |
|
1862 |
if (rc > 0)
|
1863 |
{
|
1864 |
for (rc = rc*2 - 2; rc >= 0; rc -= 2)
|
1865 |
{
|
1866 |
const uschar *p = start_subject + local_offsets[rc];
|
1867 |
const uschar *pp = start_subject + local_offsets[rc+1];
|
1868 |
int charcount = local_offsets[rc+1] - local_offsets[rc];
|
1869 |
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
|
1870 |
if (charcount > 0)
|
1871 |
{
|
1872 |
ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
|
1873 |
}
|
1874 |
else
|
1875 |
{
|
1876 |
ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
|
1877 |
}
|
1878 |
}
|
1879 |
}
|
1880 |
else if (rc != PCRE_ERROR_NOMATCH) return rc;
|
1881 |
}
|
1882 |
break;
|
1883 |
|
1884 |
/*-----------------------------------------------------------------*/
|
1885 |
case OP_ONCE:
|
1886 |
{
|
1887 |
int local_offsets[2];
|
1888 |
int local_workspace[1000];
|
1889 |
|
1890 |
int rc = internal_dfa_exec(
|
1891 |
md, /* fixed match data */
|
1892 |
code, /* this subexpression's code */
|
1893 |
ptr, /* where we currently are */
|
1894 |
ptr - start_subject, /* start offset */
|
1895 |
local_offsets, /* offset vector */
|
1896 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
1897 |
local_workspace, /* workspace vector */
|
1898 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
1899 |
ims, /* the current ims flags */
|
1900 |
rlevel, /* function recursion level */
|
1901 |
recursing); /* pass on regex recursion */
|
1902 |
|
1903 |
if (rc >= 0)
|
1904 |
{
|
1905 |
const uschar *end_subpattern = code;
|
1906 |
int charcount = local_offsets[1] - local_offsets[0];
|
1907 |
int next_state_offset, repeat_state_offset;
|
1908 |
|
1909 |
do { end_subpattern += GET(end_subpattern, 1); }
|
1910 |
while (*end_subpattern == OP_ALT);
|
1911 |
next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
|
1912 |
|
1913 |
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
|
1914 |
arrange for the repeat state also to be added to the relevant list.
|
1915 |
Calculate the offset, or set -1 for no repeat. */
|
1916 |
|
1917 |
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
|
1918 |
*end_subpattern == OP_KETRMIN)?
|
1919 |
end_subpattern - start_code - GET(end_subpattern, 1) : -1;
|
1920 |
|
1921 |
/* If we have matched an empty string, add the next state at the
|
1922 |
current character pointer. This is important so that the duplicate
|
1923 |
checking kicks in, which is what breaks infinite loops that match an
|
1924 |
empty string. */
|
1925 |
|
1926 |
if (charcount == 0)
|
1927 |
{
|
1928 |
ADD_ACTIVE(next_state_offset, 0);
|
1929 |
}
|
1930 |
|
1931 |
/* Optimization: if there are no more active states, and there
|
1932 |
are no new states yet set up, then skip over the subject string
|
1933 |
right here, to save looping. Otherwise, set up the new state to swing
|
1934 |
into action when the end of the substring is reached. */
|
1935 |
|
1936 |
else if (i + 1 >= active_count && new_count == 0)
|
1937 |
{
|
1938 |
ptr += charcount;
|
1939 |
clen = 0;
|
1940 |
ADD_NEW(next_state_offset, 0);
|
1941 |
|
1942 |
/* If we are adding a repeat state at the new character position,
|
1943 |
we must fudge things so that it is the only current state.
|
1944 |
Otherwise, it might be a duplicate of one we processed before, and
|
1945 |
that would cause it to be skipped. */
|
1946 |
|
1947 |
if (repeat_state_offset >= 0)
|
1948 |
{
|
1949 |
next_active_state = active_states;
|
1950 |
active_count = 0;
|
1951 |
i = -1;
|
1952 |
ADD_ACTIVE(repeat_state_offset, 0);
|
1953 |
}
|
1954 |
}
|
1955 |
else
|
1956 |
{
|
1957 |
const uschar *p = start_subject + local_offsets[0];
|
1958 |
const uschar *pp = start_subject + local_offsets[1];
|
1959 |
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
|
1960 |
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
|
1961 |
if (repeat_state_offset >= 0)
|
1962 |
{ ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
|
1963 |
}
|
1964 |
|
1965 |
}
|
1966 |
else if (rc != PCRE_ERROR_NOMATCH) return rc;
|
1967 |
}
|
1968 |
break;
|
1969 |
|
1970 |
|
1971 |
/* ========================================================================== */
|
1972 |
/* Handle callouts */
|
1973 |
|
1974 |
case OP_CALLOUT:
|
1975 |
if (pcre_callout != NULL)
|
1976 |
{
|
1977 |
int rrc;
|
1978 |
pcre_callout_block cb;
|
1979 |
cb.version = 1; /* Version 1 of the callout block */
|
1980 |
cb.callout_number = code[1];
|
1981 |
cb.offset_vector = offsets;
|
1982 |
cb.subject = (PCRE_SPTR)start_subject;
|
1983 |
cb.subject_length = end_subject - start_subject;
|
1984 |
cb.start_match = current_subject - start_subject;
|
1985 |
cb.current_position = ptr - start_subject;
|
1986 |
cb.pattern_position = GET(code, 2);
|
1987 |
cb.next_item_length = GET(code, 2 + LINK_SIZE);
|
1988 |
cb.capture_top = 1;
|
1989 |
cb.capture_last = -1;
|
1990 |
cb.callout_data = md->callout_data;
|
1991 |
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
|
1992 |
if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
|
1993 |
}
|
1994 |
break;
|
1995 |
|
1996 |
|
1997 |
/* ========================================================================== */
|
1998 |
default: /* Unsupported opcode */
|
1999 |
return PCRE_ERROR_DFA_UITEM;
|
2000 |
}
|
2001 |
|
2002 |
NEXT_ACTIVE_STATE: continue;
|
2003 |
|
2004 |
} /* End of loop scanning active states */
|
2005 |
|
2006 |
/* We have finished the processing at the current subject character. If no
|
2007 |
new states have been set for the next character, we have found all the
|
2008 |
matches that we are going to find. If we are at the top level and partial
|
2009 |
matching has been requested, check for appropriate conditions. */
|
2010 |
|
2011 |
if (new_count <= 0)
|
2012 |
{
|
2013 |
if (match_count < 0 && /* No matches found */
|
2014 |
rlevel == 1 && /* Top level match function */
|
2015 |
(md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
|
2016 |
ptr >= end_subject && /* Reached end of subject */
|
2017 |
ptr > current_subject) /* Matched non-empty string */
|
2018 |
{
|
2019 |
if (offsetcount >= 2)
|
2020 |
{
|
2021 |
offsets[0] = current_subject - start_subject;
|
2022 |
offsets[1] = end_subject - start_subject;
|
2023 |
}
|
2024 |
match_count = PCRE_ERROR_PARTIAL;
|
2025 |
}
|
2026 |
|
2027 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
|
2028 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
|
2029 |
rlevel*2-2, SP));
|
2030 |
break; /* In effect, "return", but see the comment below */
|
2031 |
}
|
2032 |
|
2033 |
/* One or more states are active for the next character. */
|
2034 |
|
2035 |
ptr += clen; /* Advance to next subject character */
|
2036 |
} /* Loop to move along the subject string */
|
2037 |
|
2038 |
/* Control gets here from "break" a few lines above. We do it this way because
|
2039 |
if we use "return" above, we have compiler trouble. Some compilers warn if
|
2040 |
there's nothing here because they think the function doesn't return a value. On
|
2041 |
the other hand, if we put a dummy statement here, some more clever compilers
|
2042 |
complain that it can't be reached. Sigh. */
|
2043 |
|
2044 |
return match_count;
|
2045 |
}
|
2046 |
|
2047 |
|
2048 |
|
2049 |
|
2050 |
/*************************************************
|
2051 |
* Execute a Regular Expression - DFA engine *
|
2052 |
*************************************************/
|
2053 |
|
2054 |
/* This external function applies a compiled re to a subject string using a DFA
|
2055 |
engine. This function calls the internal function multiple times if the pattern
|
2056 |
is not anchored.
|
2057 |
|
2058 |
Arguments:
|
2059 |
argument_re points to the compiled expression
|
2060 |
extra_data points to extra data or is NULL
|
2061 |
subject points to the subject string
|
2062 |
length length of subject string (may contain binary zeros)
|
2063 |
start_offset where to start in the subject string
|
2064 |
options option bits
|
2065 |
offsets vector of match offsets
|
2066 |
offsetcount size of same
|
2067 |
workspace workspace vector
|
2068 |
wscount size of same
|
2069 |
|
2070 |
Returns: > 0 => number of match offset pairs placed in offsets
|
2071 |
= 0 => offsets overflowed; longest matches are present
|
2072 |
-1 => failed to match
|
2073 |
< -1 => some kind of unexpected problem
|
2074 |
*/
|
2075 |
|
2076 |
PCRE_EXP_DEFN int
|
2077 |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
|
2078 |
const char *subject, int length, int start_offset, int options, int *offsets,
|
2079 |
int offsetcount, int *workspace, int wscount)
|
2080 |
{
|
2081 |
real_pcre *re = (real_pcre *)argument_re;
|
2082 |
dfa_match_data match_block;
|
2083 |
dfa_match_data *md = &match_block;
|
2084 |
BOOL utf8, anchored, startline, firstline;
|
2085 |
const uschar *current_subject, *end_subject, *lcc;
|
2086 |
|
2087 |
pcre_study_data internal_study;
|
2088 |
const pcre_study_data *study = NULL;
|
2089 |
real_pcre internal_re;
|
2090 |
|
2091 |
const uschar *req_byte_ptr;
|
2092 |
const uschar *start_bits = NULL;
|
2093 |
BOOL first_byte_caseless = FALSE;
|
2094 |
BOOL req_byte_caseless = FALSE;
|
2095 |
int first_byte = -1;
|
2096 |
int req_byte = -1;
|
2097 |
int req_byte2 = -1;
|
2098 |
int newline;
|
2099 |
|
2100 |
/* Plausibility checks */
|
2101 |
|
2102 |
if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
|
2103 |
if (re == NULL || subject == NULL || workspace == NULL ||
|
2104 |
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
|
2105 |
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
|
2106 |
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
|
2107 |
|
2108 |
/* We need to find the pointer to any study data before we test for byte
|
2109 |
flipping, so we scan the extra_data block first. This may set two fields in the
|
2110 |
match block, so we must initialize them beforehand. However, the other fields
|
2111 |
in the match block must not be set until after the byte flipping. */
|
2112 |
|
2113 |
md->tables = re->tables;
|
2114 |
md->callout_data = NULL;
|
2115 |
|
2116 |
if (extra_data != NULL)
|
2117 |
{
|
2118 |
unsigned int flags = extra_data->flags;
|
2119 |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
|
2120 |
study = (const pcre_study_data *)extra_data->study_data;
|
2121 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
|
2122 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
|
2123 |
return PCRE_ERROR_DFA_UMLIMIT;
|
2124 |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
|
2125 |
md->callout_data = extra_data->callout_data;
|
2126 |
if ((flags & PCRE_EXTRA_TABLES) != 0)
|
2127 |
md->tables = extra_data->tables;
|
2128 |
}
|
2129 |
|
2130 |
/* Check that the first field in the block is the magic number. If it is not,
|
2131 |
test for a regex that was compiled on a host of opposite endianness. If this is
|
2132 |
the case, flipped values are put in internal_re and internal_study if there was
|
2133 |
study data too. */
|
2134 |
|
2135 |
if (re->magic_number != MAGIC_NUMBER)
|
2136 |
{
|
2137 |
re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
|
2138 |
if (re == NULL) return PCRE_ERROR_BADMAGIC;
|
2139 |
if (study != NULL) study = &internal_study;
|
2140 |
}
|
2141 |
|
2142 |
/* Set some local values */
|
2143 |
|
2144 |
current_subject = (const unsigned char *)subject + start_offset;
|
2145 |
end_subject = (const unsigned char *)subject + length;
|
2146 |
req_byte_ptr = current_subject - 1;
|
2147 |
|
2148 |
#ifdef SUPPORT_UTF8
|
2149 |
utf8 = (re->options & PCRE_UTF8) != 0;
|
2150 |
#else
|
2151 |
utf8 = FALSE;
|
2152 |
#endif
|
2153 |
|
2154 |
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
|
2155 |
(re->options & PCRE_ANCHORED) != 0;
|
2156 |
|
2157 |
/* The remaining fixed data for passing around. */
|
2158 |
|
2159 |
md->start_code = (const uschar *)argument_re +
|
2160 |
re->name_table_offset + re->name_count * re->name_entry_size;
|
2161 |
md->start_subject = (const unsigned char *)subject;
|
2162 |
md->end_subject = end_subject;
|
2163 |
md->moptions = options;
|
2164 |
md->poptions = re->options;
|
2165 |
|
2166 |
/* Handle different types of newline. The three bits give eight cases. If
|
2167 |
nothing is set at run time, whatever was used at compile time applies. */
|
2168 |
|
2169 |
switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
|
2170 |
PCRE_NEWLINE_BITS)
|
2171 |
{
|
2172 |
case 0: newline = NEWLINE; break; /* Compile-time default */
|
2173 |
case PCRE_NEWLINE_CR: newline = '\r'; break;
|
2174 |
case PCRE_NEWLINE_LF: newline = '\n'; break;
|
2175 |
case PCRE_NEWLINE_CR+
|
2176 |
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
|
2177 |
case PCRE_NEWLINE_ANY: newline = -1; break;
|
2178 |
default: return PCRE_ERROR_BADNEWLINE;
|
2179 |
}
|
2180 |
|
2181 |
if (newline < 0)
|
2182 |
{
|
2183 |
md->nltype = NLTYPE_ANY;
|
2184 |
}
|
2185 |
else
|
2186 |
{
|
2187 |
md->nltype = NLTYPE_FIXED;
|
2188 |
if (newline > 255)
|
2189 |
{
|
2190 |
md->nllen = 2;
|
2191 |
md->nl[0] = (newline >> 8) & 255;
|
2192 |
md->nl[1] = newline & 255;
|
2193 |
}
|
2194 |
else
|
2195 |
{
|
2196 |
md->nllen = 1;
|
2197 |
md->nl[0] = newline;
|
2198 |
}
|
2199 |
}
|
2200 |
|
2201 |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing
|
2202 |
back the character offset. */
|
2203 |
|
2204 |
#ifdef SUPPORT_UTF8
|
2205 |
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
|
2206 |
{
|
2207 |
if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
|
2208 |
return PCRE_ERROR_BADUTF8;
|
2209 |
if (start_offset > 0 && start_offset < length)
|
2210 |
{
|
2211 |
int tb = ((uschar *)subject)[start_offset];
|
2212 |
if (tb > 127)
|
2213 |
{
|
2214 |
tb &= 0xc0;
|
2215 |
if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
|
2216 |
}
|
2217 |
}
|
2218 |
}
|
2219 |
#endif
|
2220 |
|
2221 |
/* If the exec call supplied NULL for tables, use the inbuilt ones. This
|
2222 |
is a feature that makes it possible to save compiled regex and re-use them
|
2223 |
in other programs later. */
|
2224 |
|
2225 |
if (md->tables == NULL) md->tables = _pcre_default_tables;
|
2226 |
|
2227 |
/* The lower casing table and the "must be at the start of a line" flag are
|
2228 |
used in a loop when finding where to start. */
|
2229 |
|
2230 |
lcc = md->tables + lcc_offset;
|
2231 |
startline = (re->options & PCRE_STARTLINE) != 0;
|
2232 |
firstline = (re->options & PCRE_FIRSTLINE) != 0;
|
2233 |
|
2234 |
/* Set up the first character to match, if available. The first_byte value is
|
2235 |
never set for an anchored regular expression, but the anchoring may be forced
|
2236 |
at run time, so we have to test for anchoring. The first char may be unset for
|
2237 |
an unanchored pattern, of course. If there's no first char and the pattern was
|
2238 |
studied, there may be a bitmap of possible first characters. */
|
2239 |
|
2240 |
if (!anchored)
|
2241 |
{
|
2242 |
if ((re->options & PCRE_FIRSTSET) != 0)
|
2243 |
{
|
2244 |
first_byte = re->first_byte & 255;
|
2245 |
if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
|
2246 |
first_byte = lcc[first_byte];
|
2247 |
}
|
2248 |
else
|
2249 |
{
|
2250 |
if (startline && study != NULL &&
|
2251 |
(study->options & PCRE_STUDY_MAPPED) != 0)
|
2252 |
start_bits = study->start_bits;
|
2253 |
}
|
2254 |
}
|
2255 |
|
2256 |
/* For anchored or unanchored matches, there may be a "last known required
|
2257 |
character" set. */
|
2258 |
|
2259 |
if ((re->options & PCRE_REQCHSET) != 0)
|
2260 |
{
|
2261 |
req_byte = re->req_byte & 255;
|
2262 |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
|
2263 |
req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
|
2264 |
}
|
2265 |
|
2266 |
/* Call the main matching function, looping for a non-anchored regex after a
|
2267 |
failed match. Unless restarting, optimize by moving to the first match
|
2268 |
character if possible, when not anchored. Then unless wanting a partial match,
|
2269 |
check for a required later character. */
|
2270 |
|
2271 |
for (;;)
|
2272 |
{
|
2273 |
int rc;
|
2274 |
|
2275 |
if ((options & PCRE_DFA_RESTART) == 0)
|
2276 |
{
|
2277 |
const uschar *save_end_subject = end_subject;
|
2278 |
|
2279 |
/* Advance to a unique first char if possible. If firstline is TRUE, the
|
2280 |
start of the match is constrained to the first line of a multiline string.
|
2281 |
Implement this by temporarily adjusting end_subject so that we stop
|
2282 |
scanning at a newline. If the match fails at the newline, later code breaks
|
2283 |
this loop. */
|
2284 |
|
2285 |
if (firstline)
|
2286 |
{
|
2287 |
const uschar *t = current_subject;
|
2288 |
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
|
2289 |
end_subject = t;
|
2290 |
}
|
2291 |
|
2292 |
if (first_byte >= 0)
|
2293 |
{
|
2294 |
if (first_byte_caseless)
|
2295 |
while (current_subject < end_subject &&
|
2296 |
lcc[*current_subject] != first_byte)
|
2297 |
current_subject++;
|
2298 |
else
|
2299 |
while (current_subject < end_subject && *current_subject != first_byte)
|
2300 |
current_subject++;
|
2301 |
}
|
2302 |
|
2303 |
/* Or to just after a linebreak for a multiline match if possible */
|
2304 |
|
2305 |
else if (startline)
|
2306 |
{
|
2307 |
if (current_subject > md->start_subject + start_offset)
|
2308 |
{
|
2309 |
while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
|
2310 |
current_subject++;
|
2311 |
|
2312 |
/* If we have just passed a CR and the newline option is ANY, and we
|
2313 |
are now at a LF, advance the match position by one more character. */
|
2314 |
|
2315 |
if (current_subject[-1] == '\r' &&
|
2316 |
md->nltype == NLTYPE_ANY &&
|
2317 |
current_subject < end_subject &&
|
2318 |
*current_subject == '\n')
|
2319 |
current_subject++;
|
2320 |
}
|
2321 |
}
|
2322 |
|
2323 |
/* Or to a non-unique first char after study */
|
2324 |
|
2325 |
else if (start_bits != NULL)
|
2326 |
{
|
2327 |
while (current_subject < end_subject)
|
2328 |
{
|
2329 |
register unsigned int c = *current_subject;
|
2330 |
if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
|
2331 |
else break;
|
2332 |
}
|
2333 |
}
|
2334 |
|
2335 |
/* Restore fudged end_subject */
|
2336 |
|
2337 |
end_subject = save_end_subject;
|
2338 |
}
|
2339 |
|
2340 |
/* If req_byte is set, we know that that character must appear in the subject
|
2341 |
for the match to succeed. If the first character is set, req_byte must be
|
2342 |
later in the subject; otherwise the test starts at the match point. This
|
2343 |
optimization can save a huge amount of work in patterns with nested unlimited
|
2344 |
repeats that aren't going to match. Writing separate code for cased/caseless
|
2345 |
versions makes it go faster, as does using an autoincrement and backing off
|
2346 |
on a match.
|
2347 |
|
2348 |
HOWEVER: when the subject string is very, very long, searching to its end can
|
2349 |
take a long time, and give bad performance on quite ordinary patterns. This
|
2350 |
showed up when somebody was matching /^C/ on a 32-megabyte string... so we
|
2351 |
don't do this when the string is sufficiently long.
|
2352 |
|
2353 |
ALSO: this processing is disabled when partial matching is requested.
|
2354 |
*/
|
2355 |
|
2356 |
if (req_byte >= 0 &&
|
2357 |
end_subject - current_subject < REQ_BYTE_MAX &&
|
2358 |
(options & PCRE_PARTIAL) == 0)
|
2359 |
{
|
2360 |
register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
|
2361 |
|
2362 |
/* We don't need to repeat the search if we haven't yet reached the
|
2363 |
place we found it at last time. */
|
2364 |
|
2365 |
if (p > req_byte_ptr)
|
2366 |
{
|
2367 |
if (req_byte_caseless)
|
2368 |
{
|
2369 |
while (p < end_subject)
|
2370 |
{
|
2371 |
register int pp = *p++;
|
2372 |
if (pp == req_byte || pp == req_byte2) { p--; break; }
|
2373 |
}
|
2374 |
}
|
2375 |
else
|
2376 |
{
|
2377 |
while (p < end_subject)
|
2378 |
{
|
2379 |
if (*p++ == req_byte) { p--; break; }
|
2380 |
}
|
2381 |
}
|
2382 |
|
2383 |
/* If we can't find the required character, break the matching loop,
|
2384 |
which will cause a return or PCRE_ERROR_NOMATCH. */
|
2385 |
|
2386 |
if (p >= end_subject) break;
|
2387 |
|
2388 |
/* If we have found the required character, save the point where we
|
2389 |
found it, so that we don't search again next time round the loop if
|
2390 |
the start hasn't passed this character yet. */
|
2391 |
|
2392 |
req_byte_ptr = p;
|
2393 |
}
|
2394 |
}
|
2395 |
|
2396 |
/* OK, now we can do the business */
|
2397 |
|
2398 |
rc = internal_dfa_exec(
|
2399 |
md, /* fixed match data */
|
2400 |
md->start_code, /* this subexpression's code */
|
2401 |
current_subject, /* where we currently are */
|
2402 |
start_offset, /* start offset in subject */
|
2403 |
offsets, /* offset vector */
|
2404 |
offsetcount, /* size of same */
|
2405 |
workspace, /* workspace vector */
|
2406 |
wscount, /* size of same */
|
2407 |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
|
2408 |
0, /* function recurse level */
|
2409 |
0); /* regex recurse level */
|
2410 |
|
2411 |
/* Anything other than "no match" means we are done, always; otherwise, carry
|
2412 |
on only if not anchored. */
|
2413 |
|
2414 |
if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
|
2415 |
|
2416 |
/* Advance to the next subject character unless we are at the end of a line
|
2417 |
and firstline is set. */
|
2418 |
|
2419 |
if (firstline && IS_NEWLINE(current_subject)) break;
|
2420 |
current_subject++;
|
2421 |
if (utf8)
|
2422 |
{
|
2423 |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
|
2424 |
current_subject++;
|
2425 |
}
|
2426 |
if (current_subject > end_subject) break;
|
2427 |
|
2428 |
/* If we have just passed a CR and the newline option is CRLF or ANY, and we
|
2429 |
are now at a LF, advance the match position by one more character. */
|
2430 |
|
2431 |
if (current_subject[-1] == '\r' &&
|
2432 |
(md->nltype == NLTYPE_ANY || md->nllen == 2) &&
|
2433 |
current_subject < end_subject &&
|
2434 |
*current_subject == '\n')
|
2435 |
current_subject++;
|
2436 |
|
2437 |
} /* "Bumpalong" loop */
|
2438 |
|
2439 |
return PCRE_ERROR_NOMATCH;
|
2440 |
}
|
2441 |
|
2442 |
/* End of pcre_dfa_exec.c */
|