1 |
/*************************************************
|
2 |
* Perl-Compatible Regular Expressions *
|
3 |
*************************************************/
|
4 |
|
5 |
/* PCRE is a library of functions to support regular expressions whose syntax
|
6 |
and semantics are as close as possible to those of the Perl 5 language (but see
|
7 |
below for why this module is different).
|
8 |
|
9 |
Written by Philip Hazel
|
10 |
Copyright (c) 1997-2009 University of Cambridge
|
11 |
|
12 |
-----------------------------------------------------------------------------
|
13 |
Redistribution and use in source and binary forms, with or without
|
14 |
modification, are permitted provided that the following conditions are met:
|
15 |
|
16 |
* Redistributions of source code must retain the above copyright notice,
|
17 |
this list of conditions and the following disclaimer.
|
18 |
|
19 |
* Redistributions in binary form must reproduce the above copyright
|
20 |
notice, this list of conditions and the following disclaimer in the
|
21 |
documentation and/or other materials provided with the distribution.
|
22 |
|
23 |
* Neither the name of the University of Cambridge nor the names of its
|
24 |
contributors may be used to endorse or promote products derived from
|
25 |
this software without specific prior written permission.
|
26 |
|
27 |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
28 |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
29 |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
30 |
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
31 |
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
32 |
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
33 |
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
34 |
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
35 |
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
36 |
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
37 |
POSSIBILITY OF SUCH DAMAGE.
|
38 |
-----------------------------------------------------------------------------
|
39 |
*/
|
40 |
|
41 |
|
42 |
/* This module contains the external function pcre_dfa_exec(), which is an
|
43 |
alternative matching function that uses a sort of DFA algorithm (not a true
|
44 |
FSM). This is NOT Perl-compatible, but it has advantages in certain
|
45 |
applications. */
|
46 |
|
47 |
|
48 |
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
|
49 |
the performance of his patterns greatly. I could not use it as it stood, as it
|
50 |
was not thread safe, and made assumptions about pattern sizes. Also, it caused
|
51 |
test 7 to loop, and test 9 to crash with a segfault.
|
52 |
|
53 |
The issue is the check for duplicate states, which is done by a simple linear
|
54 |
search up the state list. (Grep for "duplicate" below to find the code.) For
|
55 |
many patterns, there will never be many states active at one time, so a simple
|
56 |
linear search is fine. In patterns that have many active states, it might be a
|
57 |
bottleneck. The suggested code used an indexing scheme to remember which states
|
58 |
had previously been used for each character, and avoided the linear search when
|
59 |
it knew there was no chance of a duplicate. This was implemented when adding
|
60 |
states to the state lists.
|
61 |
|
62 |
I wrote some thread-safe, not-limited code to try something similar at the time
|
63 |
of checking for duplicates (instead of when adding states), using index vectors
|
64 |
on the stack. It did give a 13% improvement with one specially constructed
|
65 |
pattern for certain subject strings, but on other strings and on many of the
|
66 |
simpler patterns in the test suite it did worse. The major problem, I think,
|
67 |
was the extra time to initialize the index. This had to be done for each call
|
68 |
of internal_dfa_exec(). (The supplied patch used a static vector, initialized
|
69 |
only once - I suspect this was the cause of the problems with the tests.)
|
70 |
|
71 |
Overall, I concluded that the gains in some cases did not outweigh the losses
|
72 |
in others, so I abandoned this code. */
|
73 |
|
74 |
|
75 |
|
76 |
#ifdef HAVE_CONFIG_H
|
77 |
#include "config.h"
|
78 |
#endif
|
79 |
|
80 |
#define NLBLOCK md /* Block containing newline information */
|
81 |
#define PSSTART start_subject /* Field containing processed string start */
|
82 |
#define PSEND end_subject /* Field containing processed string end */
|
83 |
|
84 |
#include "pcre_internal.h"
|
85 |
|
86 |
|
87 |
/* For use to indent debugging output */
|
88 |
|
89 |
/* SP is passed as the string argument of the "%.*s" conversions in the debug
printouts; the precision given there (rlevel*2-2) limits how much of it is
emitted. */
#define SP " "
|
90 |
|
91 |
|
92 |
/*************************************************
|
93 |
* Code parameters and static tables *
|
94 |
*************************************************/
|
95 |
|
96 |
/* Offsets that are added to the OP_TYPESTAR family of opcodes to turn them
into private pseudo-opcodes when their argument needs special handling. Each
block of pseudo-opcodes starts 20 above the previous one, which should leave
enough room for every member of the family. The resulting values are never
stored in compiled code, so they need not fit in a byte; they are deliberately
placed well above the normal opcode range. */

enum {
  OP_PROP_EXTRA   = 300,
  OP_EXTUNI_EXTRA = 320,
  OP_ANYNL_EXTRA  = 340,
  OP_HSPACE_EXTRA = 360,
  OP_VSPACE_EXTRA = 380
};
|
106 |
|
107 |
|
108 |
/* This table identifies those opcodes that are followed immediately by a
|
109 |
character that is to be tested in some way. This makes it possible to
|
110 |
centralize the loading of these characters. In the case of Type * etc, the
|
111 |
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
|
112 |
small value. ***NOTE*** If the start of this table is modified, the two tables
|
113 |
that follow must also be modified. */
|
114 |
|
115 |
/* Indexed by opcode. A zero entry means the opcode has no inline "character"
operand. A non-zero entry is the index, counted from the opcode byte, at which
the matcher loads that operand (it reads code[coptable[op]]). The entries must
stay in opcode order.
NOTE(review): the value 3 used by the upto/minupto/exact forms presumably
skips a two-byte repeat count - confirm against the opcode layout. */

static const uschar coptable[] = {
  /* Opcodes with no inline character */
  0,                             /* End                                    */
  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */

  /* Single characters */
  1, 1, 1,                       /* Char, Charnc, not                      */

  /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  3, 3, 3,                       /* upto, minupto, exact                   */
  1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */

  /* Negative single-char repeats - only for chars < 256 */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  3, 3, 3,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */

  /* Positive type repeats */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  3, 3, 3,                       /* Type upto, minupto, exact              */
  1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */

  /* Character class & ref repeats */
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  0, 0,                          /* CRRANGE, CRMINRANGE                    */

  /* Everything else: no inline character */
  0, 0, 0,                       /* CLASS, NCLASS, XCLASS (variable length) */
  0, 0, 0,                       /* REF, RECURSE, CALLOUT                  */
  0, 0, 0, 0,                    /* Alt, Ket, KetRmax, KetRmin             */
  0, 0, 0, 0,                    /* Assert, Assert not, Assert behind,     */
                                 /*   Assert behind not                    */
  0,                             /* Reverse                                */
  0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
  0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
  0, 0, 0,                       /* CREF, RREF, DEF                        */
  0, 0,                          /* BRAZERO, BRAMINZERO                    */
  0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
  0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
};
|
165 |
|
166 |
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
|
167 |
and \w */
|
168 |
|
169 |
static const uschar toptable1[] = {
|
170 |
0, 0, 0, 0, 0, 0,
|
171 |
ctype_digit, ctype_digit,
|
172 |
ctype_space, ctype_space,
|
173 |
ctype_word, ctype_word,
|
174 |
0, 0 /* OP_ANY, OP_ALLANY */
|
175 |
};
|
176 |
|
177 |
static const uschar toptable2[] = {
|
178 |
0, 0, 0, 0, 0, 0,
|
179 |
ctype_digit, 0,
|
180 |
ctype_space, 0,
|
181 |
ctype_word, 0,
|
182 |
1, 1 /* OP_ANY, OP_ALLANY */
|
183 |
};
|
184 |
|
185 |
|
186 |
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode, relative to the start */
                                  /*   of the compiled code; a negative      */
                                  /*   value marks a state that is held off  */
                                  /*   until "data" characters are skipped   */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits in force for this state */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints taken up by one stateblock. The result is cast to int on
purpose: sizeof yields an unsigned type, and without the cast any int that is
combined with this macro (such as the workspace size) would be converted to
unsigned, so that a negative value would silently become a huge positive one. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
|
199 |
|
200 |
|
201 |
#ifdef DEBUG
|
202 |
/*************************************************
|
203 |
* Print character string *
|
204 |
*************************************************/
|
205 |
|
206 |
/* Debugging aid: write a byte string to a stream in readable form. A byte for
which isprint() is true is written as itself; any other byte is written as a
backslash followed by 'x' and two lower-case hexadecimal digits.

Arguments:
  p           points to the first byte of the string
  length      number of bytes to write; nothing is written if it is not
                positive
  f           stream that receives the output

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
int i;
for (i = 0; i < length; i++)
  {
  int ch = p[i];
  if (isprint(ch))
    fprintf(f, "%c", ch);
  else
    fprintf(f, "\\x%02x", ch);
  }
}
|
228 |
#endif
|
229 |
|
230 |
|
231 |
|
232 |
/*************************************************
|
233 |
* Execute a Regular Expression - DFA engine *
|
234 |
*************************************************/
|
235 |
|
236 |
/* This internal function applies a compiled pattern to a subject string,
|
237 |
starting at a given point, using a DFA engine. This function is called from the
|
238 |
external one, possibly multiple times if the pattern is not anchored. The
|
239 |
function calls itself recursively for some kinds of subpattern.
|
240 |
|
241 |
Arguments:
|
242 |
md the match_data block with fixed information
|
243 |
this_start_code the opening bracket of this subexpression's code
|
244 |
current_subject where we currently are in the subject string
|
245 |
start_offset start offset in the subject string
|
246 |
offsets vector to contain the matching string offsets
|
247 |
offsetcount size of same
|
248 |
workspace vector of workspace
|
249 |
wscount size of same
|
250 |
ims the current ims flags
|
251 |
rlevel function call recursion level
|
252 |
recursing regex recursive call level
|
253 |
|
254 |
Returns: > 0 => number of match offset pairs placed in offsets
|
255 |
= 0 => offsets overflowed; longest matches are present
|
256 |
-1 => failed to match
|
257 |
< -1 => some kind of unexpected problem
|
258 |
|
259 |
The following macros are used for adding states to the two state vectors (one
|
260 |
for the current character, one for the following character). */
|
261 |
|
262 |
/* Append a state with opcode offset x and repeat count y to the active list
(the states being processed for the current character), stamping it with the
current ims value. The data field of the slot is left untouched. If the list
already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE; active_count is incremented in either case. The macro
uses the variables active_count, wscount, next_active_state, ims and rlevel of
the function in which it is expanded. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
|
272 |
|
273 |
/* Append a state with opcode offset x, repeat count y and extra data z to the
active list (the states being processed for the current character), stamping
it with the current ims value. If the list already holds wscount entries, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE; active_count is incremented
in either case. The macro uses the variables active_count, wscount,
next_active_state, ims and rlevel of the function in which it is expanded. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
|
284 |
|
285 |
/* Append a state with opcode offset x and repeat count y to the new list (the
states to be considered once the subject pointer has moved on), stamping it
with the current ims value. The data field of the slot is left untouched. If
the list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE; new_count is incremented in either case. The macro uses
the variables new_count, wscount, next_new_state, ims and rlevel of the
function in which it is expanded. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
|
295 |
|
296 |
/* Append a state with opcode offset x, repeat count y and extra data z to the
new list (the states to be considered once the subject pointer has moved on),
stamping it with the current ims value. If the list already holds wscount
entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE; new_count is
incremented in either case. The macro uses the variables new_count, wscount,
next_new_state, ims and rlevel of the function in which it is expanded. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
|
307 |
|
308 |
/* And now, here is the code */
|
309 |
|
310 |
static int
|
311 |
internal_dfa_exec(
|
312 |
dfa_match_data *md,
|
313 |
const uschar *this_start_code,
|
314 |
const uschar *current_subject,
|
315 |
int start_offset,
|
316 |
int *offsets,
|
317 |
int offsetcount,
|
318 |
int *workspace,
|
319 |
int wscount,
|
320 |
int ims,
|
321 |
int rlevel,
|
322 |
int recursing)
|
323 |
{
|
324 |
stateblock *active_states, *new_states, *temp_states;
|
325 |
stateblock *next_active_state, *next_new_state;
|
326 |
|
327 |
const uschar *ctypes, *lcc, *fcc;
|
328 |
const uschar *ptr;
|
329 |
const uschar *end_code, *first_op;
|
330 |
|
331 |
int active_count, new_count, match_count;
|
332 |
|
333 |
/* Some fields in the md block are frequently referenced, so we load them into
|
334 |
independent variables in the hope that this will perform better. */
|
335 |
|
336 |
const uschar *start_subject = md->start_subject;
|
337 |
const uschar *end_subject = md->end_subject;
|
338 |
const uschar *start_code = md->start_code;
|
339 |
|
340 |
#ifdef SUPPORT_UTF8
|
341 |
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
|
342 |
#else
|
343 |
BOOL utf8 = FALSE;
|
344 |
#endif
|
345 |
|
346 |
rlevel++;
|
347 |
offsetcount &= (-2);
|
348 |
|
349 |
wscount -= 2;
|
350 |
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
|
351 |
(2 * INTS_PER_STATEBLOCK);
|
352 |
|
353 |
DPRINTF(("\n%.*s---------------------\n"
|
354 |
"%.*sCall to internal_dfa_exec f=%d r=%d\n",
|
355 |
rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
|
356 |
|
357 |
ctypes = md->tables + ctypes_offset;
|
358 |
lcc = md->tables + lcc_offset;
|
359 |
fcc = md->tables + fcc_offset;
|
360 |
|
361 |
match_count = PCRE_ERROR_NOMATCH; /* A negative number */
|
362 |
|
363 |
active_states = (stateblock *)(workspace + 2);
|
364 |
next_new_state = new_states = active_states + wscount;
|
365 |
new_count = 0;
|
366 |
|
367 |
first_op = this_start_code + 1 + LINK_SIZE +
|
368 |
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
|
369 |
|
370 |
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
|
371 |
the alternative states onto the list, and find out where the end is. This
|
372 |
makes is possible to use this function recursively, when we want to stop at a
|
373 |
matching internal ket rather than at the end.
|
374 |
|
375 |
If the first opcode in the first alternative is OP_REVERSE, we are dealing with
|
376 |
a backward assertion. In that case, we have to find out the maximum amount to
|
377 |
move back, and set up each alternative appropriately. */
|
378 |
|
379 |
if (*first_op == OP_REVERSE)
|
380 |
{
|
381 |
int max_back = 0;
|
382 |
int gone_back;
|
383 |
|
384 |
end_code = this_start_code;
|
385 |
do
|
386 |
{
|
387 |
int back = GET(end_code, 2+LINK_SIZE);
|
388 |
if (back > max_back) max_back = back;
|
389 |
end_code += GET(end_code, 1);
|
390 |
}
|
391 |
while (*end_code == OP_ALT);
|
392 |
|
393 |
/* If we can't go back the amount required for the longest lookbehind
|
394 |
pattern, go back as far as we can; some alternatives may still be viable. */
|
395 |
|
396 |
#ifdef SUPPORT_UTF8
|
397 |
/* In character mode we have to step back character by character */
|
398 |
|
399 |
if (utf8)
|
400 |
{
|
401 |
for (gone_back = 0; gone_back < max_back; gone_back++)
|
402 |
{
|
403 |
if (current_subject <= start_subject) break;
|
404 |
current_subject--;
|
405 |
while (current_subject > start_subject &&
|
406 |
(*current_subject & 0xc0) == 0x80)
|
407 |
current_subject--;
|
408 |
}
|
409 |
}
|
410 |
else
|
411 |
#endif
|
412 |
|
413 |
/* In byte-mode we can do this quickly. */
|
414 |
|
415 |
{
|
416 |
gone_back = (current_subject - max_back < start_subject)?
|
417 |
current_subject - start_subject : max_back;
|
418 |
current_subject -= gone_back;
|
419 |
}
|
420 |
|
421 |
/* Save the earliest consulted character */
|
422 |
|
423 |
if (current_subject < md->start_used_ptr)
|
424 |
md->start_used_ptr = current_subject;
|
425 |
|
426 |
/* Now we can process the individual branches. */
|
427 |
|
428 |
end_code = this_start_code;
|
429 |
do
|
430 |
{
|
431 |
int back = GET(end_code, 2+LINK_SIZE);
|
432 |
if (back <= gone_back)
|
433 |
{
|
434 |
int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
|
435 |
ADD_NEW_DATA(-bstate, 0, gone_back - back);
|
436 |
}
|
437 |
end_code += GET(end_code, 1);
|
438 |
}
|
439 |
while (*end_code == OP_ALT);
|
440 |
}
|
441 |
|
442 |
/* This is the code for a "normal" subpattern (not a backward assertion). The
|
443 |
start of a whole pattern is always one of these. If we are at the top level,
|
444 |
we may be asked to restart matching from the same point that we reached for a
|
445 |
previous partial match. We still have to scan through the top-level branches to
|
446 |
find the end state. */
|
447 |
|
448 |
else
|
449 |
{
|
450 |
end_code = this_start_code;
|
451 |
|
452 |
/* Restarting */
|
453 |
|
454 |
if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
|
455 |
{
|
456 |
do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
|
457 |
new_count = workspace[1];
|
458 |
if (!workspace[0])
|
459 |
memcpy(new_states, active_states, new_count * sizeof(stateblock));
|
460 |
}
|
461 |
|
462 |
/* Not restarting */
|
463 |
|
464 |
else
|
465 |
{
|
466 |
int length = 1 + LINK_SIZE +
|
467 |
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
|
468 |
do
|
469 |
{
|
470 |
ADD_NEW(end_code - start_code + length, 0);
|
471 |
end_code += GET(end_code, 1);
|
472 |
length = 1 + LINK_SIZE;
|
473 |
}
|
474 |
while (*end_code == OP_ALT);
|
475 |
}
|
476 |
}
|
477 |
|
478 |
workspace[0] = 0; /* Bit indicating which vector is current */
|
479 |
|
480 |
DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
|
481 |
|
482 |
/* Loop for scanning the subject */
|
483 |
|
484 |
ptr = current_subject;
|
485 |
for (;;)
|
486 |
{
|
487 |
int i, j;
|
488 |
int clen, dlen;
|
489 |
unsigned int c, d;
|
490 |
int forced_fail = 0;
|
491 |
int reached_end = 0;
|
492 |
|
493 |
/* Make the new state list into the active state list and empty the
|
494 |
new state list. */
|
495 |
|
496 |
temp_states = active_states;
|
497 |
active_states = new_states;
|
498 |
new_states = temp_states;
|
499 |
active_count = new_count;
|
500 |
new_count = 0;
|
501 |
|
502 |
workspace[0] ^= 1; /* Remember for the restarting feature */
|
503 |
workspace[1] = active_count;
|
504 |
|
505 |
#ifdef DEBUG
|
506 |
printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
|
507 |
pchars((uschar *)ptr, strlen((char *)ptr), stdout);
|
508 |
printf("\"\n");
|
509 |
|
510 |
printf("%.*sActive states: ", rlevel*2-2, SP);
|
511 |
for (i = 0; i < active_count; i++)
|
512 |
printf("%d/%d ", active_states[i].offset, active_states[i].count);
|
513 |
printf("\n");
|
514 |
#endif
|
515 |
|
516 |
/* Set the pointers for adding new states */
|
517 |
|
518 |
next_active_state = active_states + active_count;
|
519 |
next_new_state = new_states;
|
520 |
|
521 |
/* Load the current character from the subject outside the loop, as many
|
522 |
different states may want to look at it, and we assume that at least one
|
523 |
will. */
|
524 |
|
525 |
if (ptr < end_subject)
|
526 |
{
|
527 |
clen = 1; /* Number of bytes in the character */
|
528 |
#ifdef SUPPORT_UTF8
|
529 |
if (utf8) { GETCHARLEN(c, ptr, clen); } else
|
530 |
#endif /* SUPPORT_UTF8 */
|
531 |
c = *ptr;
|
532 |
}
|
533 |
else
|
534 |
{
|
535 |
clen = 0; /* This indicates the end of the subject */
|
536 |
c = NOTACHAR; /* This value should never actually be used */
|
537 |
}
|
538 |
|
539 |
/* Scan up the active states and act on each one. The result of an action
|
540 |
may be to add more states to the currently active list (e.g. on hitting a
|
541 |
parenthesis) or it may be to put states on the new list, for considering
|
542 |
when we move the character pointer on. */
|
543 |
|
544 |
for (i = 0; i < active_count; i++)
|
545 |
{
|
546 |
stateblock *current_state = active_states + i;
|
547 |
const uschar *code;
|
548 |
int state_offset = current_state->offset;
|
549 |
int count, codevalue, rrc;
|
550 |
|
551 |
#ifdef DEBUG
|
552 |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
|
553 |
if (clen == 0) printf("EOL\n");
|
554 |
else if (c > 32 && c < 127) printf("'%c'\n", c);
|
555 |
else printf("0x%02x\n", c);
|
556 |
#endif
|
557 |
|
558 |
/* This variable is referred to implicity in the ADD_xxx macros. */
|
559 |
|
560 |
ims = current_state->ims;
|
561 |
|
562 |
/* A negative offset is a special case meaning "hold off going to this
|
563 |
(negated) state until the number of characters in the data field have
|
564 |
been skipped". */
|
565 |
|
566 |
if (state_offset < 0)
|
567 |
{
|
568 |
if (current_state->data > 0)
|
569 |
{
|
570 |
DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
|
571 |
ADD_NEW_DATA(state_offset, current_state->count,
|
572 |
current_state->data - 1);
|
573 |
continue;
|
574 |
}
|
575 |
else
|
576 |
{
|
577 |
current_state->offset = state_offset = -state_offset;
|
578 |
}
|
579 |
}
|
580 |
|
581 |
/* Check for a duplicate state with the same count, and skip if found.
|
582 |
See the note at the head of this module about the possibility of improving
|
583 |
performance here. */
|
584 |
|
585 |
for (j = 0; j < i; j++)
|
586 |
{
|
587 |
if (active_states[j].offset == state_offset &&
|
588 |
active_states[j].count == current_state->count)
|
589 |
{
|
590 |
DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
|
591 |
goto NEXT_ACTIVE_STATE;
|
592 |
}
|
593 |
}
|
594 |
|
595 |
/* The state offset is the offset to the opcode */
|
596 |
|
597 |
code = start_code + state_offset;
|
598 |
codevalue = *code;
|
599 |
|
600 |
/* If this opcode is followed by an inline character, load it. It is
|
601 |
tempting to test for the presence of a subject character here, but that
|
602 |
is wrong, because sometimes zero repetitions of the subject are
|
603 |
permitted.
|
604 |
|
605 |
We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
|
606 |
argument that is not a data character - but is always one byte long. We
|
607 |
have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
|
608 |
this case. To keep the other cases fast, convert these ones to new opcodes.
|
609 |
*/
|
610 |
|
611 |
if (coptable[codevalue] > 0)
|
612 |
{
|
613 |
dlen = 1;
|
614 |
#ifdef SUPPORT_UTF8
|
615 |
if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
|
616 |
#endif /* SUPPORT_UTF8 */
|
617 |
d = code[coptable[codevalue]];
|
618 |
if (codevalue >= OP_TYPESTAR)
|
619 |
{
|
620 |
switch(d)
|
621 |
{
|
622 |
case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
|
623 |
case OP_NOTPROP:
|
624 |
case OP_PROP: codevalue += OP_PROP_EXTRA; break;
|
625 |
case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
|
626 |
case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
|
627 |
case OP_NOT_HSPACE:
|
628 |
case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
|
629 |
case OP_NOT_VSPACE:
|
630 |
case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
|
631 |
default: break;
|
632 |
}
|
633 |
}
|
634 |
}
|
635 |
else
|
636 |
{
|
637 |
dlen = 0; /* Not strictly necessary, but compilers moan */
|
638 |
d = NOTACHAR; /* if these variables are not set. */
|
639 |
}
|
640 |
|
641 |
|
642 |
/* Now process the individual opcodes */
|
643 |
|
644 |
switch (codevalue)
|
645 |
{
|
646 |
|
647 |
/* ========================================================================== */
|
648 |
/* Reached a closing bracket. If not at the end of the pattern, carry
|
649 |
on with the next opcode. Otherwise, unless we have an empty string and
|
650 |
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
|
651 |
start of the subject, save the match data, shifting up all previous
|
652 |
matches so we always have the longest first. */
|
653 |
|
654 |
case OP_KET:
|
655 |
case OP_KETRMIN:
|
656 |
case OP_KETRMAX:
|
657 |
if (code != end_code)
|
658 |
{
|
659 |
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
|
660 |
if (codevalue != OP_KET)
|
661 |
{
|
662 |
ADD_ACTIVE(state_offset - GET(code, 1), 0);
|
663 |
}
|
664 |
}
|
665 |
else
|
666 |
{
|
667 |
reached_end++; /* Count branches that reach the end */
|
668 |
if (ptr > current_subject ||
|
669 |
((md->moptions & PCRE_NOTEMPTY) == 0 &&
|
670 |
((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
|
671 |
current_subject > start_subject + md->start_offset)))
|
672 |
{
|
673 |
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
|
674 |
else if (match_count > 0 && ++match_count * 2 >= offsetcount)
|
675 |
match_count = 0;
|
676 |
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
|
677 |
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
|
678 |
if (offsetcount >= 2)
|
679 |
{
|
680 |
offsets[0] = current_subject - start_subject;
|
681 |
offsets[1] = ptr - start_subject;
|
682 |
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
|
683 |
offsets[1] - offsets[0], current_subject));
|
684 |
}
|
685 |
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
|
686 |
{
|
687 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
|
688 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
|
689 |
match_count, rlevel*2-2, SP));
|
690 |
return match_count;
|
691 |
}
|
692 |
}
|
693 |
}
|
694 |
break;
|
695 |
|
696 |
/* ========================================================================== */
|
697 |
/* These opcodes add to the current list of states without looking
|
698 |
at the current character. */
|
699 |
|
700 |
/*-----------------------------------------------------------------*/
|
701 |
case OP_ALT:
|
702 |
do { code += GET(code, 1); } while (*code == OP_ALT);
|
703 |
ADD_ACTIVE(code - start_code, 0);
|
704 |
break;
|
705 |
|
706 |
/*-----------------------------------------------------------------*/
|
707 |
case OP_BRA:
|
708 |
case OP_SBRA:
|
709 |
do
|
710 |
{
|
711 |
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
712 |
code += GET(code, 1);
|
713 |
}
|
714 |
while (*code == OP_ALT);
|
715 |
break;
|
716 |
|
717 |
/*-----------------------------------------------------------------*/
|
718 |
case OP_CBRA:
|
719 |
case OP_SCBRA:
|
720 |
ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
|
721 |
code += GET(code, 1);
|
722 |
while (*code == OP_ALT)
|
723 |
{
|
724 |
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
725 |
code += GET(code, 1);
|
726 |
}
|
727 |
break;
|
728 |
|
729 |
/*-----------------------------------------------------------------*/
|
730 |
case OP_BRAZERO:
|
731 |
case OP_BRAMINZERO:
|
732 |
ADD_ACTIVE(state_offset + 1, 0);
|
733 |
code += 1 + GET(code, 2);
|
734 |
while (*code == OP_ALT) code += GET(code, 1);
|
735 |
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
736 |
break;
|
737 |
|
738 |
/*-----------------------------------------------------------------*/
|
739 |
case OP_SKIPZERO:
|
740 |
code += 1 + GET(code, 2);
|
741 |
while (*code == OP_ALT) code += GET(code, 1);
|
742 |
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
743 |
break;
|
744 |
|
745 |
/*-----------------------------------------------------------------*/
|
746 |
case OP_CIRC:
|
747 |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
|
748 |
((ims & PCRE_MULTILINE) != 0 &&
|
749 |
ptr != end_subject &&
|
750 |
WAS_NEWLINE(ptr)))
|
751 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
752 |
break;
|
753 |
|
754 |
/*-----------------------------------------------------------------*/
|
755 |
case OP_EOD:
|
756 |
if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
|
757 |
break;
|
758 |
|
759 |
/*-----------------------------------------------------------------*/
|
760 |
case OP_OPT:
|
761 |
ims = code[1];
|
762 |
ADD_ACTIVE(state_offset + 2, 0);
|
763 |
break;
|
764 |
|
765 |
/*-----------------------------------------------------------------*/
|
766 |
case OP_SOD:
|
767 |
if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
|
768 |
break;
|
769 |
|
770 |
/*-----------------------------------------------------------------*/
|
771 |
case OP_SOM:
|
772 |
if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
|
773 |
break;
|
774 |
|
775 |
|
776 |
/* ========================================================================== */
|
777 |
/* These opcodes inspect the next subject character, and sometimes
|
778 |
the previous one as well, but do not have an argument. The variable
|
779 |
clen contains the length of the current character and is zero if we are
|
780 |
at the end of the subject. */
|
781 |
|
782 |
/*-----------------------------------------------------------------*/
|
783 |
case OP_ANY:
|
784 |
if (clen > 0 && !IS_NEWLINE(ptr))
|
785 |
{ ADD_NEW(state_offset + 1, 0); }
|
786 |
break;
|
787 |
|
788 |
/*-----------------------------------------------------------------*/
|
789 |
case OP_ALLANY:
|
790 |
if (clen > 0)
|
791 |
{ ADD_NEW(state_offset + 1, 0); }
|
792 |
break;
|
793 |
|
794 |
/*-----------------------------------------------------------------*/
|
795 |
case OP_EODN:
|
796 |
if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
|
797 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
798 |
break;
|
799 |
|
800 |
/*-----------------------------------------------------------------*/
|
801 |
case OP_DOLL:
|
802 |
if ((md->moptions & PCRE_NOTEOL) == 0)
|
803 |
{
|
804 |
if (clen == 0 ||
|
805 |
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
|
806 |
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
|
807 |
))
|
808 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
809 |
}
|
810 |
else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
|
811 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
812 |
break;
|
813 |
|
814 |
/*-----------------------------------------------------------------*/
|
815 |
|
816 |
case OP_DIGIT:
|
817 |
case OP_WHITESPACE:
|
818 |
case OP_WORDCHAR:
|
819 |
if (clen > 0 && c < 256 &&
|
820 |
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
|
821 |
{ ADD_NEW(state_offset + 1, 0); }
|
822 |
break;
|
823 |
|
824 |
/*-----------------------------------------------------------------*/
|
825 |
case OP_NOT_DIGIT:
|
826 |
case OP_NOT_WHITESPACE:
|
827 |
case OP_NOT_WORDCHAR:
|
828 |
if (clen > 0 && (c >= 256 ||
|
829 |
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
|
830 |
{ ADD_NEW(state_offset + 1, 0); }
|
831 |
break;
|
832 |
|
833 |
/*-----------------------------------------------------------------*/
|
834 |
case OP_WORD_BOUNDARY:
|
835 |
case OP_NOT_WORD_BOUNDARY:
|
836 |
{
|
837 |
int left_word, right_word;
|
838 |
|
839 |
if (ptr > start_subject)
|
840 |
{
|
841 |
const uschar *temp = ptr - 1;
|
842 |
if (temp < md->start_used_ptr) md->start_used_ptr = temp;
|
843 |
#ifdef SUPPORT_UTF8
|
844 |
if (utf8) BACKCHAR(temp);
|
845 |
#endif
|
846 |
GETCHARTEST(d, temp);
|
847 |
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
|
848 |
}
|
849 |
else left_word = 0;
|
850 |
|
851 |
if (clen > 0)
|
852 |
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
|
853 |
else /* This is a fudge to ensure that if this is the */
|
854 |
{ /* last item in the pattern, we don't count it as */
|
855 |
reached_end--; /* reached, thus disabling a partial match. */
|
856 |
right_word = 0;
|
857 |
}
|
858 |
|
859 |
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
|
860 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
861 |
}
|
862 |
break;
|
863 |
|
864 |
|
865 |
/*-----------------------------------------------------------------*/
|
866 |
/* Check the next character by Unicode property. We will get here only
|
867 |
if the support is in the binary; otherwise a compile-time error occurs.
|
868 |
*/
|
869 |
|
870 |
#ifdef SUPPORT_UCP
|
871 |
case OP_PROP:
|
872 |
case OP_NOTPROP:
|
873 |
if (clen > 0)
|
874 |
{
|
875 |
BOOL OK;
|
876 |
const ucd_record * prop = GET_UCD(c);
|
877 |
switch(code[1])
|
878 |
{
|
879 |
case PT_ANY:
|
880 |
OK = TRUE;
|
881 |
break;
|
882 |
|
883 |
case PT_LAMP:
|
884 |
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
|
885 |
break;
|
886 |
|
887 |
case PT_GC:
|
888 |
OK = _pcre_ucp_gentype[prop->chartype] == code[2];
|
889 |
break;
|
890 |
|
891 |
case PT_PC:
|
892 |
OK = prop->chartype == code[2];
|
893 |
break;
|
894 |
|
895 |
case PT_SC:
|
896 |
OK = prop->script == code[2];
|
897 |
break;
|
898 |
|
899 |
/* Should never occur, but keep compilers from grumbling. */
|
900 |
|
901 |
default:
|
902 |
OK = codevalue != OP_PROP;
|
903 |
break;
|
904 |
}
|
905 |
|
906 |
if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
|
907 |
}
|
908 |
break;
|
909 |
#endif
|
910 |
|
911 |
|
912 |
|
913 |
/* ========================================================================== */
|
914 |
/* These opcodes likewise inspect the subject character, but have an
|
915 |
argument that is not a data character. It is one of these opcodes:
|
916 |
OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
|
917 |
OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
|
918 |
|
919 |
case OP_TYPEPLUS:
|
920 |
case OP_TYPEMINPLUS:
|
921 |
case OP_TYPEPOSPLUS:
|
922 |
count = current_state->count; /* Already matched */
|
923 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
924 |
if (clen > 0)
|
925 |
{
|
926 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
927 |
(c < 256 &&
|
928 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
929 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
930 |
{
|
931 |
if (count > 0 && codevalue == OP_TYPEPOSPLUS)
|
932 |
{
|
933 |
active_count--; /* Remove non-match possibility */
|
934 |
next_active_state--;
|
935 |
}
|
936 |
count++;
|
937 |
ADD_NEW(state_offset, count);
|
938 |
}
|
939 |
}
|
940 |
break;
|
941 |
|
942 |
/*-----------------------------------------------------------------*/
|
943 |
case OP_TYPEQUERY:
|
944 |
case OP_TYPEMINQUERY:
|
945 |
case OP_TYPEPOSQUERY:
|
946 |
ADD_ACTIVE(state_offset + 2, 0);
|
947 |
if (clen > 0)
|
948 |
{
|
949 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
950 |
(c < 256 &&
|
951 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
952 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
953 |
{
|
954 |
if (codevalue == OP_TYPEPOSQUERY)
|
955 |
{
|
956 |
active_count--; /* Remove non-match possibility */
|
957 |
next_active_state--;
|
958 |
}
|
959 |
ADD_NEW(state_offset + 2, 0);
|
960 |
}
|
961 |
}
|
962 |
break;
|
963 |
|
964 |
/*-----------------------------------------------------------------*/
|
965 |
case OP_TYPESTAR:
|
966 |
case OP_TYPEMINSTAR:
|
967 |
case OP_TYPEPOSSTAR:
|
968 |
ADD_ACTIVE(state_offset + 2, 0);
|
969 |
if (clen > 0)
|
970 |
{
|
971 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
972 |
(c < 256 &&
|
973 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
974 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
975 |
{
|
976 |
if (codevalue == OP_TYPEPOSSTAR)
|
977 |
{
|
978 |
active_count--; /* Remove non-match possibility */
|
979 |
next_active_state--;
|
980 |
}
|
981 |
ADD_NEW(state_offset, 0);
|
982 |
}
|
983 |
}
|
984 |
break;
|
985 |
|
986 |
/*-----------------------------------------------------------------*/
|
987 |
case OP_TYPEEXACT:
|
988 |
count = current_state->count; /* Number already matched */
|
989 |
if (clen > 0)
|
990 |
{
|
991 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
992 |
(c < 256 &&
|
993 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
994 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
995 |
{
|
996 |
if (++count >= GET2(code, 1))
|
997 |
{ ADD_NEW(state_offset + 4, 0); }
|
998 |
else
|
999 |
{ ADD_NEW(state_offset, count); }
|
1000 |
}
|
1001 |
}
|
1002 |
break;
|
1003 |
|
1004 |
/*-----------------------------------------------------------------*/
|
1005 |
case OP_TYPEUPTO:
|
1006 |
case OP_TYPEMINUPTO:
|
1007 |
case OP_TYPEPOSUPTO:
|
1008 |
ADD_ACTIVE(state_offset + 4, 0);
|
1009 |
count = current_state->count; /* Number already matched */
|
1010 |
if (clen > 0)
|
1011 |
{
|
1012 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
1013 |
(c < 256 &&
|
1014 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
1015 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
1016 |
{
|
1017 |
if (codevalue == OP_TYPEPOSUPTO)
|
1018 |
{
|
1019 |
active_count--; /* Remove non-match possibility */
|
1020 |
next_active_state--;
|
1021 |
}
|
1022 |
if (++count >= GET2(code, 1))
|
1023 |
{ ADD_NEW(state_offset + 4, 0); }
|
1024 |
else
|
1025 |
{ ADD_NEW(state_offset, count); }
|
1026 |
}
|
1027 |
}
|
1028 |
break;
|
1029 |
|
1030 |
/* ========================================================================== */
|
1031 |
/* These are virtual opcodes that are used when something like
|
1032 |
OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a VSPACE/HSPACE type as its
|
1033 |
argument. It keeps the code above fast for the other cases. The argument
|
1034 |
is in the d variable. */
|
1035 |
|
1036 |
#ifdef SUPPORT_UCP
|
1037 |
case OP_PROP_EXTRA + OP_TYPEPLUS:
|
1038 |
case OP_PROP_EXTRA + OP_TYPEMINPLUS:
|
1039 |
case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
|
1040 |
count = current_state->count; /* Already matched */
|
1041 |
if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
|
1042 |
if (clen > 0)
|
1043 |
{
|
1044 |
BOOL OK;
|
1045 |
const ucd_record * prop = GET_UCD(c);
|
1046 |
switch(code[2])
|
1047 |
{
|
1048 |
case PT_ANY:
|
1049 |
OK = TRUE;
|
1050 |
break;
|
1051 |
|
1052 |
case PT_LAMP:
|
1053 |
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
|
1054 |
break;
|
1055 |
|
1056 |
case PT_GC:
|
1057 |
OK = _pcre_ucp_gentype[prop->chartype] == code[3];
|
1058 |
break;
|
1059 |
|
1060 |
case PT_PC:
|
1061 |
OK = prop->chartype == code[3];
|
1062 |
break;
|
1063 |
|
1064 |
case PT_SC:
|
1065 |
OK = prop->script == code[3];
|
1066 |
break;
|
1067 |
|
1068 |
/* Should never occur, but keep compilers from grumbling. */
|
1069 |
|
1070 |
default:
|
1071 |
OK = codevalue != OP_PROP;
|
1072 |
break;
|
1073 |
}
|
1074 |
|
1075 |
if (OK == (d == OP_PROP))
|
1076 |
{
|
1077 |
if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
|
1078 |
{
|
1079 |
active_count--; /* Remove non-match possibility */
|
1080 |
next_active_state--;
|
1081 |
}
|
1082 |
count++;
|
1083 |
ADD_NEW(state_offset, count);
|
1084 |
}
|
1085 |
}
|
1086 |
break;
|
1087 |
|
1088 |
/*-----------------------------------------------------------------*/
|
1089 |
case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
|
1090 |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
|
1091 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
|
1092 |
count = current_state->count; /* Already matched */
|
1093 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1094 |
if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
|
1095 |
{
|
1096 |
const uschar *nptr = ptr + clen;
|
1097 |
int ncount = 0;
|
1098 |
if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
|
1099 |
{
|
1100 |
active_count--; /* Remove non-match possibility */
|
1101 |
next_active_state--;
|
1102 |
}
|
1103 |
while (nptr < end_subject)
|
1104 |
{
|
1105 |
int nd;
|
1106 |
int ndlen = 1;
|
1107 |
GETCHARLEN(nd, nptr, ndlen);
|
1108 |
if (UCD_CATEGORY(nd) != ucp_M) break;
|
1109 |
ncount++;
|
1110 |
nptr += ndlen;
|
1111 |
}
|
1112 |
count++;
|
1113 |
ADD_NEW_DATA(-state_offset, count, ncount);
|
1114 |
}
|
1115 |
break;
|
1116 |
#endif
|
1117 |
|
1118 |
/*-----------------------------------------------------------------*/
|
1119 |
case OP_ANYNL_EXTRA + OP_TYPEPLUS:
|
1120 |
case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
|
1121 |
case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
|
1122 |
count = current_state->count; /* Already matched */
|
1123 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1124 |
if (clen > 0)
|
1125 |
{
|
1126 |
int ncount = 0;
|
1127 |
switch (c)
|
1128 |
{
|
1129 |
case 0x000b:
|
1130 |
case 0x000c:
|
1131 |
case 0x0085:
|
1132 |
case 0x2028:
|
1133 |
case 0x2029:
|
1134 |
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
|
1135 |
goto ANYNL01;
|
1136 |
|
1137 |
case 0x000d:
|
1138 |
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
|
1139 |
/* Fall through */
|
1140 |
|
1141 |
ANYNL01:
|
1142 |
case 0x000a:
|
1143 |
if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
|
1144 |
{
|
1145 |
active_count--; /* Remove non-match possibility */
|
1146 |
next_active_state--;
|
1147 |
}
|
1148 |
count++;
|
1149 |
ADD_NEW_DATA(-state_offset, count, ncount);
|
1150 |
break;
|
1151 |
|
1152 |
default:
|
1153 |
break;
|
1154 |
}
|
1155 |
}
|
1156 |
break;
|
1157 |
|
1158 |
/*-----------------------------------------------------------------*/
|
1159 |
case OP_VSPACE_EXTRA + OP_TYPEPLUS:
|
1160 |
case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
|
1161 |
case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
|
1162 |
count = current_state->count; /* Already matched */
|
1163 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1164 |
if (clen > 0)
|
1165 |
{
|
1166 |
BOOL OK;
|
1167 |
switch (c)
|
1168 |
{
|
1169 |
case 0x000a:
|
1170 |
case 0x000b:
|
1171 |
case 0x000c:
|
1172 |
case 0x000d:
|
1173 |
case 0x0085:
|
1174 |
case 0x2028:
|
1175 |
case 0x2029:
|
1176 |
OK = TRUE;
|
1177 |
break;
|
1178 |
|
1179 |
default:
|
1180 |
OK = FALSE;
|
1181 |
break;
|
1182 |
}
|
1183 |
|
1184 |
if (OK == (d == OP_VSPACE))
|
1185 |
{
|
1186 |
if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
|
1187 |
{
|
1188 |
active_count--; /* Remove non-match possibility */
|
1189 |
next_active_state--;
|
1190 |
}
|
1191 |
count++;
|
1192 |
ADD_NEW_DATA(-state_offset, count, 0);
|
1193 |
}
|
1194 |
}
|
1195 |
break;
|
1196 |
|
1197 |
/*-----------------------------------------------------------------*/
|
1198 |
case OP_HSPACE_EXTRA + OP_TYPEPLUS:
|
1199 |
case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
|
1200 |
case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
|
1201 |
count = current_state->count; /* Already matched */
|
1202 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1203 |
if (clen > 0)
|
1204 |
{
|
1205 |
BOOL OK;
|
1206 |
switch (c)
|
1207 |
{
|
1208 |
case 0x09: /* HT */
|
1209 |
case 0x20: /* SPACE */
|
1210 |
case 0xa0: /* NBSP */
|
1211 |
case 0x1680: /* OGHAM SPACE MARK */
|
1212 |
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
|
1213 |
case 0x2000: /* EN QUAD */
|
1214 |
case 0x2001: /* EM QUAD */
|
1215 |
case 0x2002: /* EN SPACE */
|
1216 |
case 0x2003: /* EM SPACE */
|
1217 |
case 0x2004: /* THREE-PER-EM SPACE */
|
1218 |
case 0x2005: /* FOUR-PER-EM SPACE */
|
1219 |
case 0x2006: /* SIX-PER-EM SPACE */
|
1220 |
case 0x2007: /* FIGURE SPACE */
|
1221 |
case 0x2008: /* PUNCTUATION SPACE */
|
1222 |
case 0x2009: /* THIN SPACE */
|
1223 |
case 0x200A: /* HAIR SPACE */
|
1224 |
case 0x202f: /* NARROW NO-BREAK SPACE */
|
1225 |
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
|
1226 |
case 0x3000: /* IDEOGRAPHIC SPACE */
|
1227 |
OK = TRUE;
|
1228 |
break;
|
1229 |
|
1230 |
default:
|
1231 |
OK = FALSE;
|
1232 |
break;
|
1233 |
}
|
1234 |
|
1235 |
if (OK == (d == OP_HSPACE))
|
1236 |
{
|
1237 |
if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
|
1238 |
{
|
1239 |
active_count--; /* Remove non-match possibility */
|
1240 |
next_active_state--;
|
1241 |
}
|
1242 |
count++;
|
1243 |
ADD_NEW_DATA(-state_offset, count, 0);
|
1244 |
}
|
1245 |
}
|
1246 |
break;
|
1247 |
|
1248 |
/*-----------------------------------------------------------------*/
|
1249 |
#ifdef SUPPORT_UCP
|
1250 |
case OP_PROP_EXTRA + OP_TYPEQUERY:
|
1251 |
case OP_PROP_EXTRA + OP_TYPEMINQUERY:
|
1252 |
case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
|
1253 |
count = 4;
|
1254 |
goto QS1;
|
1255 |
|
1256 |
case OP_PROP_EXTRA + OP_TYPESTAR:
|
1257 |
case OP_PROP_EXTRA + OP_TYPEMINSTAR:
|
1258 |
case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
|
1259 |
count = 0;
|
1260 |
|
1261 |
QS1:
|
1262 |
|
1263 |
ADD_ACTIVE(state_offset + 4, 0);
|
1264 |
if (clen > 0)
|
1265 |
{
|
1266 |
BOOL OK;
|
1267 |
const ucd_record * prop = GET_UCD(c);
|
1268 |
switch(code[2])
|
1269 |
{
|
1270 |
case PT_ANY:
|
1271 |
OK = TRUE;
|
1272 |
break;
|
1273 |
|
1274 |
case PT_LAMP:
|
1275 |
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
|
1276 |
break;
|
1277 |
|
1278 |
case PT_GC:
|
1279 |
OK = _pcre_ucp_gentype[prop->chartype] == code[3];
|
1280 |
break;
|
1281 |
|
1282 |
case PT_PC:
|
1283 |
OK = prop->chartype == code[3];
|
1284 |
break;
|
1285 |
|
1286 |
case PT_SC:
|
1287 |
OK = prop->script == code[3];
|
1288 |
break;
|
1289 |
|
1290 |
/* Should never occur, but keep compilers from grumbling. */
|
1291 |
|
1292 |
default:
|
1293 |
OK = codevalue != OP_PROP;
|
1294 |
break;
|
1295 |
}
|
1296 |
|
1297 |
if (OK == (d == OP_PROP))
|
1298 |
{
|
1299 |
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
|
1300 |
codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
|
1301 |
{
|
1302 |
active_count--; /* Remove non-match possibility */
|
1303 |
next_active_state--;
|
1304 |
}
|
1305 |
ADD_NEW(state_offset + count, 0);
|
1306 |
}
|
1307 |
}
|
1308 |
break;
|
1309 |
|
1310 |
/*-----------------------------------------------------------------*/
|
1311 |
case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
|
1312 |
case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
|
1313 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
|
1314 |
count = 2;
|
1315 |
goto QS2;
|
1316 |
|
1317 |
case OP_EXTUNI_EXTRA + OP_TYPESTAR:
|
1318 |
case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
|
1319 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
|
1320 |
count = 0;
|
1321 |
|
1322 |
QS2:
|
1323 |
|
1324 |
ADD_ACTIVE(state_offset + 2, 0);
|
1325 |
if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
|
1326 |
{
|
1327 |
const uschar *nptr = ptr + clen;
|
1328 |
int ncount = 0;
|
1329 |
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
|
1330 |
codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
|
1331 |
{
|
1332 |
active_count--; /* Remove non-match possibility */
|
1333 |
next_active_state--;
|
1334 |
}
|
1335 |
while (nptr < end_subject)
|
1336 |
{
|
1337 |
int nd;
|
1338 |
int ndlen = 1;
|
1339 |
GETCHARLEN(nd, nptr, ndlen);
|
1340 |
if (UCD_CATEGORY(nd) != ucp_M) break;
|
1341 |
ncount++;
|
1342 |
nptr += ndlen;
|
1343 |
}
|
1344 |
ADD_NEW_DATA(-(state_offset + count), 0, ncount);
|
1345 |
}
|
1346 |
break;
|
1347 |
#endif
|
1348 |
|
1349 |
/*-----------------------------------------------------------------*/
|
1350 |
case OP_ANYNL_EXTRA + OP_TYPEQUERY:
|
1351 |
case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
|
1352 |
case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
|
1353 |
count = 2;
|
1354 |
goto QS3;
|
1355 |
|
1356 |
case OP_ANYNL_EXTRA + OP_TYPESTAR:
|
1357 |
case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
|
1358 |
case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
|
1359 |
count = 0;
|
1360 |
|
1361 |
QS3:
|
1362 |
ADD_ACTIVE(state_offset + 2, 0);
|
1363 |
if (clen > 0)
|
1364 |
{
|
1365 |
int ncount = 0;
|
1366 |
switch (c)
|
1367 |
{
|
1368 |
case 0x000b:
|
1369 |
case 0x000c:
|
1370 |
case 0x0085:
|
1371 |
case 0x2028:
|
1372 |
case 0x2029:
|
1373 |
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
|
1374 |
goto ANYNL02;
|
1375 |
|
1376 |
case 0x000d:
|
1377 |
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
|
1378 |
/* Fall through */
|
1379 |
|
1380 |
ANYNL02:
|
1381 |
case 0x000a:
|
1382 |
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
|
1383 |
codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
|
1384 |
{
|
1385 |
active_count--; /* Remove non-match possibility */
|
1386 |
next_active_state--;
|
1387 |
}
|
1388 |
ADD_NEW_DATA(-(state_offset + count), 0, ncount);
|
1389 |
break;
|
1390 |
|
1391 |
default:
|
1392 |
break;
|
1393 |
}
|
1394 |
}
|
1395 |
break;
|
1396 |
|
1397 |
/*-----------------------------------------------------------------*/
|
1398 |
case OP_VSPACE_EXTRA + OP_TYPEQUERY:
|
1399 |
case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
|
1400 |
case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
|
1401 |
count = 2;
|
1402 |
goto QS4;
|
1403 |
|
1404 |
case OP_VSPACE_EXTRA + OP_TYPESTAR:
|
1405 |
case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
|
1406 |
case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
|
1407 |
count = 0;
|
1408 |
|
1409 |
QS4:
|
1410 |
ADD_ACTIVE(state_offset + 2, 0);
|
1411 |
if (clen > 0)
|
1412 |
{
|
1413 |
BOOL OK;
|
1414 |
switch (c)
|
1415 |
{
|
1416 |
case 0x000a:
|
1417 |
case 0x000b:
|
1418 |
case 0x000c:
|
1419 |
case 0x000d:
|
1420 |
case 0x0085:
|
1421 |
case 0x2028:
|
1422 |
case 0x2029:
|
1423 |
OK = TRUE;
|
1424 |
break;
|
1425 |
|
1426 |
default:
|
1427 |
OK = FALSE;
|
1428 |
break;
|
1429 |
}
|
1430 |
if (OK == (d == OP_VSPACE))
|
1431 |
{
|
1432 |
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
|
1433 |
codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
|
1434 |
{
|
1435 |
active_count--; /* Remove non-match possibility */
|
1436 |
next_active_state--;
|
1437 |
}
|
1438 |
ADD_NEW_DATA(-(state_offset + count), 0, 0);
|
1439 |
}
|
1440 |
}
|
1441 |
break;
|
1442 |
|
1443 |
/*-----------------------------------------------------------------*/
|
1444 |
case OP_HSPACE_EXTRA + OP_TYPEQUERY:
|
1445 |
case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
|
1446 |
case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
|
1447 |
count = 2;
|
1448 |
goto QS5;
|
1449 |
|
1450 |
case OP_HSPACE_EXTRA + OP_TYPESTAR:
|
1451 |
case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
|
1452 |
case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
|
1453 |
count = 0;
|
1454 |
|
1455 |
QS5:
|
1456 |
ADD_ACTIVE(state_offset + 2, 0);
|
1457 |
if (clen > 0)
|
1458 |
{
|
1459 |
BOOL OK;
|
1460 |
switch (c)
|
1461 |
{
|
1462 |
case 0x09: /* HT */
|
1463 |
case 0x20: /* SPACE */
|
1464 |
case 0xa0: /* NBSP */
|
1465 |
case 0x1680: /* OGHAM SPACE MARK */
|
1466 |
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
|
1467 |
case 0x2000: /* EN QUAD */
|
1468 |
case 0x2001: /* EM QUAD */
|
1469 |
case 0x2002: /* EN SPACE */
|
1470 |
case 0x2003: /* EM SPACE */
|
1471 |
case 0x2004: /* THREE-PER-EM SPACE */
|
1472 |
case 0x2005: /* FOUR-PER-EM SPACE */
|
1473 |
case 0x2006: /* SIX-PER-EM SPACE */
|
1474 |
case 0x2007: /* FIGURE SPACE */
|
1475 |
case 0x2008: /* PUNCTUATION SPACE */
|
1476 |
case 0x2009: /* THIN SPACE */
|
1477 |
case 0x200A: /* HAIR SPACE */
|
1478 |
case 0x202f: /* NARROW NO-BREAK SPACE */
|
1479 |
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
|
1480 |
case 0x3000: /* IDEOGRAPHIC SPACE */
|
1481 |
OK = TRUE;
|
1482 |
break;
|
1483 |
|
1484 |
default:
|
1485 |
OK = FALSE;
|
1486 |
break;
|
1487 |
}
|
1488 |
|
1489 |
if (OK == (d == OP_HSPACE))
|
1490 |
{
|
1491 |
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
|
1492 |
codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
|
1493 |
{
|
1494 |
active_count--; /* Remove non-match possibility */
|
1495 |
next_active_state--;
|
1496 |
}
|
1497 |
ADD_NEW_DATA(-(state_offset + count), 0, 0);
|
1498 |
}
|
1499 |
}
|
1500 |
break;
|
1501 |
|
1502 |
/*-----------------------------------------------------------------*/
|
1503 |
#ifdef SUPPORT_UCP
|
1504 |
case OP_PROP_EXTRA + OP_TYPEEXACT:
|
1505 |
case OP_PROP_EXTRA + OP_TYPEUPTO:
|
1506 |
case OP_PROP_EXTRA + OP_TYPEMINUPTO:
|
1507 |
case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
|
1508 |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
|
1509 |
{ ADD_ACTIVE(state_offset + 6, 0); }
|
1510 |
count = current_state->count; /* Number already matched */
|
1511 |
if (clen > 0)
|
1512 |
{
|
1513 |
BOOL OK;
|
1514 |
const ucd_record * prop = GET_UCD(c);
|
1515 |
switch(code[4])
|
1516 |
{
|
1517 |
case PT_ANY:
|
1518 |
OK = TRUE;
|
1519 |
break;
|
1520 |
|
1521 |
case PT_LAMP:
|
1522 |
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
|
1523 |
break;
|
1524 |
|
1525 |
case PT_GC:
|
1526 |
OK = _pcre_ucp_gentype[prop->chartype] == code[5];
|
1527 |
break;
|
1528 |
|
1529 |
case PT_PC:
|
1530 |
OK = prop->chartype == code[5];
|
1531 |
break;
|
1532 |
|
1533 |
case PT_SC:
|
1534 |
OK = prop->script == code[5];
|
1535 |
break;
|
1536 |
|
1537 |
/* Should never occur, but keep compilers from grumbling. */
|
1538 |
|
1539 |
default:
|
1540 |
OK = codevalue != OP_PROP;
|
1541 |
break;
|
1542 |
}
|
1543 |
|
1544 |
if (OK == (d == OP_PROP))
|
1545 |
{
|
1546 |
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
|
1547 |
{
|
1548 |
active_count--; /* Remove non-match possibility */
|
1549 |
next_active_state--;
|
1550 |
}
|
1551 |
if (++count >= GET2(code, 1))
|
1552 |
{ ADD_NEW(state_offset + 6, 0); }
|
1553 |
else
|
1554 |
{ ADD_NEW(state_offset, count); }
|
1555 |
}
|
1556 |
}
|
1557 |
break;
|
1558 |
|
1559 |
/*-----------------------------------------------------------------*/
|
1560 |
case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
|
1561 |
case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
|
1562 |
case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
|
1563 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
|
1564 |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
|
1565 |
{ ADD_ACTIVE(state_offset + 4, 0); }
|
1566 |
count = current_state->count; /* Number already matched */
|
1567 |
if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
|
1568 |
{
|
1569 |
const uschar *nptr = ptr + clen;
|
1570 |
int ncount = 0;
|
1571 |
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
|
1572 |
{
|
1573 |
active_count--; /* Remove non-match possibility */
|
1574 |
next_active_state--;
|
1575 |
}
|
1576 |
while (nptr < end_subject)
|
1577 |
{
|
1578 |
int nd;
|
1579 |
int ndlen = 1;
|
1580 |
GETCHARLEN(nd, nptr, ndlen);
|
1581 |
if (UCD_CATEGORY(nd) != ucp_M) break;
|
1582 |
ncount++;
|
1583 |
nptr += ndlen;
|
1584 |
}
|
1585 |
if (++count >= GET2(code, 1))
|
1586 |
{ ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
|
1587 |
else
|
1588 |
{ ADD_NEW_DATA(-state_offset, count, ncount); }
|
1589 |
}
|
1590 |
break;
|
1591 |
#endif
|
1592 |
|
1593 |
/*-----------------------------------------------------------------*/
|
1594 |
case OP_ANYNL_EXTRA + OP_TYPEEXACT:
|
1595 |
case OP_ANYNL_EXTRA + OP_TYPEUPTO:
|
1596 |
case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
|
1597 |
case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
|
1598 |
if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
|
1599 |
{ ADD_ACTIVE(state_offset + 4, 0); }
|
1600 |
count = current_state->count; /* Number already matched */
|
1601 |
if (clen > 0)
|
1602 |
{
|
1603 |
int ncount = 0;
|
1604 |
switch (c)
|
1605 |
{
|
1606 |
case 0x000b:
|
1607 |
case 0x000c:
|
1608 |
case 0x0085:
|
1609 |
case 0x2028:
|
1610 |
case 0x2029:
|
1611 |
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
|
1612 |
goto ANYNL03;
|
1613 |
|
1614 |
case 0x000d:
|
1615 |
if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
|
1616 |
/* Fall through */
|
1617 |
|
1618 |
ANYNL03:
|
1619 |
case 0x000a:
|
1620 |
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
|
1621 |
{
|
1622 |
active_count--; /* Remove non-match possibility */
|
1623 |
next_active_state--;
|
1624 |
}
|
1625 |
if (++count >= GET2(code, 1))
|
1626 |
{ ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
|
1627 |
else
|
1628 |
{ ADD_NEW_DATA(-state_offset, count, ncount); }
|
1629 |
break;
|
1630 |
|
1631 |
default:
|
1632 |
break;
|
1633 |
}
|
1634 |
}
|
1635 |
break;
|
1636 |
|
1637 |
/*-----------------------------------------------------------------*/
|
1638 |
case OP_VSPACE_EXTRA + OP_TYPEEXACT:
|
1639 |
case OP_VSPACE_EXTRA + OP_TYPEUPTO:
|
1640 |
case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
|
1641 |
case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
|
1642 |
if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
|
1643 |
{ ADD_ACTIVE(state_offset + 4, 0); }
|
1644 |
count = current_state->count; /* Number already matched */
|
1645 |
if (clen > 0)
|
1646 |
{
|
1647 |
BOOL OK;
|
1648 |
switch (c)
|
1649 |
{
|
1650 |
case 0x000a:
|
1651 |
case 0x000b:
|
1652 |
case 0x000c:
|
1653 |
case 0x000d:
|
1654 |
case 0x0085:
|
1655 |
case 0x2028:
|
1656 |
case 0x2029:
|
1657 |
OK = TRUE;
|
1658 |
break;
|
1659 |
|
1660 |
default:
|
1661 |
OK = FALSE;
|
1662 |
}
|
1663 |
|
1664 |
if (OK == (d == OP_VSPACE))
|
1665 |
{
|
1666 |
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
|
1667 |
{
|
1668 |
active_count--; /* Remove non-match possibility */
|
1669 |
next_active_state--;
|
1670 |
}
|
1671 |
if (++count >= GET2(code, 1))
|
1672 |
{ ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
|
1673 |
else
|
1674 |
{ ADD_NEW_DATA(-state_offset, count, 0); }
|
1675 |
}
|
1676 |
}
|
1677 |
break;
|
1678 |
|
1679 |
/*-----------------------------------------------------------------*/
|
1680 |
case OP_HSPACE_EXTRA + OP_TYPEEXACT:
|
1681 |
case OP_HSPACE_EXTRA + OP_TYPEUPTO:
|
1682 |
case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
|
1683 |
case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
|
1684 |
if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
|
1685 |
{ ADD_ACTIVE(state_offset + 4, 0); }
|
1686 |
count = current_state->count; /* Number already matched */
|
1687 |
if (clen > 0)
|
1688 |
{
|
1689 |
BOOL OK;
|
1690 |
switch (c)
|
1691 |
{
|
1692 |
case 0x09: /* HT */
|
1693 |
case 0x20: /* SPACE */
|
1694 |
case 0xa0: /* NBSP */
|
1695 |
case 0x1680: /* OGHAM SPACE MARK */
|
1696 |
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
|
1697 |
case 0x2000: /* EN QUAD */
|
1698 |
case 0x2001: /* EM QUAD */
|
1699 |
case 0x2002: /* EN SPACE */
|
1700 |
case 0x2003: /* EM SPACE */
|
1701 |
case 0x2004: /* THREE-PER-EM SPACE */
|
1702 |
case 0x2005: /* FOUR-PER-EM SPACE */
|
1703 |
case 0x2006: /* SIX-PER-EM SPACE */
|
1704 |
case 0x2007: /* FIGURE SPACE */
|
1705 |
case 0x2008: /* PUNCTUATION SPACE */
|
1706 |
case 0x2009: /* THIN SPACE */
|
1707 |
case 0x200A: /* HAIR SPACE */
|
1708 |
case 0x202f: /* NARROW NO-BREAK SPACE */
|
1709 |
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
|
1710 |
case 0x3000: /* IDEOGRAPHIC SPACE */
|
1711 |
OK = TRUE;
|
1712 |
break;
|
1713 |
|
1714 |
default:
|
1715 |
OK = FALSE;
|
1716 |
break;
|
1717 |
}
|
1718 |
|
1719 |
if (OK == (d == OP_HSPACE))
|
1720 |
{
|
1721 |
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
|
1722 |
{
|
1723 |
active_count--; /* Remove non-match possibility */
|
1724 |
next_active_state--;
|
1725 |
}
|
1726 |
if (++count >= GET2(code, 1))
|
1727 |
{ ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
|
1728 |
else
|
1729 |
{ ADD_NEW_DATA(-state_offset, count, 0); }
|
1730 |
}
|
1731 |
}
|
1732 |
break;
|
1733 |
|
1734 |
/* ========================================================================== */
|
1735 |
/* These opcodes are followed by a character that is usually compared
|
1736 |
to the current subject character; it is loaded into d. We still get
|
1737 |
here even if there is no subject character, because in some cases zero
|
1738 |
repetitions are permitted. */
|
1739 |
|
1740 |
/*-----------------------------------------------------------------*/
|
1741 |
case OP_CHAR:
|
1742 |
if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
|
1743 |
break;
|
1744 |
|
1745 |
/*-----------------------------------------------------------------*/
|
1746 |
case OP_CHARNC:
|
1747 |
if (clen == 0) break;
|
1748 |
|
1749 |
#ifdef SUPPORT_UTF8
|
1750 |
if (utf8)
|
1751 |
{
|
1752 |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
|
1753 |
{
|
1754 |
unsigned int othercase;
|
1755 |
if (c < 128) othercase = fcc[c]; else
|
1756 |
|
1757 |
/* If we have Unicode property support, we can use it to test the
|
1758 |
other case of the character. */
|
1759 |
|
1760 |
#ifdef SUPPORT_UCP
|
1761 |
othercase = UCD_OTHERCASE(c);
|
1762 |
#else
|
1763 |
othercase = NOTACHAR;
|
1764 |
#endif
|
1765 |
|
1766 |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
|
1767 |
}
|
1768 |
}
|
1769 |
else
|
1770 |
#endif /* SUPPORT_UTF8 */
|
1771 |
|
1772 |
/* Non-UTF-8 mode */
|
1773 |
{
|
1774 |
if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
|
1775 |
}
|
1776 |
break;
|
1777 |
|
1778 |
|
1779 |
#ifdef SUPPORT_UCP
|
1780 |
/*-----------------------------------------------------------------*/
|
1781 |
/* This is a tricky one because it can match more than one character.
|
1782 |
Find out how many characters to skip, and then set up a negative state
|
1783 |
to wait for them to pass before continuing. */
|
1784 |
|
1785 |
case OP_EXTUNI:
|
1786 |
if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
|
1787 |
{
|
1788 |
const uschar *nptr = ptr + clen;
|
1789 |
int ncount = 0;
|
1790 |
while (nptr < end_subject)
|
1791 |
{
|
1792 |
int nclen = 1;
|
1793 |
GETCHARLEN(c, nptr, nclen);
|
1794 |
if (UCD_CATEGORY(c) != ucp_M) break;
|
1795 |
ncount++;
|
1796 |
nptr += nclen;
|
1797 |
}
|
1798 |
ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
|
1799 |
}
|
1800 |
break;
|
1801 |
#endif
|
1802 |
|
1803 |
/*-----------------------------------------------------------------*/
|
1804 |
/* This is a tricky one like EXTUNI because it too can match more than one
|
1805 |
character (when CR is followed by LF). In this case, set up a negative
|
1806 |
state to wait for one character to pass before continuing. */
|
1807 |
|
1808 |
case OP_ANYNL:
|
1809 |
if (clen > 0) switch(c)
|
1810 |
{
|
1811 |
case 0x000b:
|
1812 |
case 0x000c:
|
1813 |
case 0x0085:
|
1814 |
case 0x2028:
|
1815 |
case 0x2029:
|
1816 |
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
|
1817 |
|
1818 |
case 0x000a:
|
1819 |
ADD_NEW(state_offset + 1, 0);
|
1820 |
break;
|
1821 |
|
1822 |
case 0x000d:
|
1823 |
if (ptr + 1 < end_subject && ptr[1] == 0x0a)
|
1824 |
{
|
1825 |
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
|
1826 |
}
|
1827 |
else
|
1828 |
{
|
1829 |
ADD_NEW(state_offset + 1, 0);
|
1830 |
}
|
1831 |
break;
|
1832 |
}
|
1833 |
break;
|
1834 |
|
1835 |
/*-----------------------------------------------------------------*/
|
1836 |
case OP_NOT_VSPACE:
|
1837 |
if (clen > 0) switch(c)
|
1838 |
{
|
1839 |
case 0x000a:
|
1840 |
case 0x000b:
|
1841 |
case 0x000c:
|
1842 |
case 0x000d:
|
1843 |
case 0x0085:
|
1844 |
case 0x2028:
|
1845 |
case 0x2029:
|
1846 |
break;
|
1847 |
|
1848 |
default:
|
1849 |
ADD_NEW(state_offset + 1, 0);
|
1850 |
break;
|
1851 |
}
|
1852 |
break;
|
1853 |
|
1854 |
/*-----------------------------------------------------------------*/
|
1855 |
case OP_VSPACE:
|
1856 |
if (clen > 0) switch(c)
|
1857 |
{
|
1858 |
case 0x000a:
|
1859 |
case 0x000b:
|
1860 |
case 0x000c:
|
1861 |
case 0x000d:
|
1862 |
case 0x0085:
|
1863 |
case 0x2028:
|
1864 |
case 0x2029:
|
1865 |
ADD_NEW(state_offset + 1, 0);
|
1866 |
break;
|
1867 |
|
1868 |
default: break;
|
1869 |
}
|
1870 |
break;
|
1871 |
|
1872 |
/*-----------------------------------------------------------------*/
|
1873 |
case OP_NOT_HSPACE:
|
1874 |
if (clen > 0) switch(c)
|
1875 |
{
|
1876 |
case 0x09: /* HT */
|
1877 |
case 0x20: /* SPACE */
|
1878 |
case 0xa0: /* NBSP */
|
1879 |
case 0x1680: /* OGHAM SPACE MARK */
|
1880 |
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
|
1881 |
case 0x2000: /* EN QUAD */
|
1882 |
case 0x2001: /* EM QUAD */
|
1883 |
case 0x2002: /* EN SPACE */
|
1884 |
case 0x2003: /* EM SPACE */
|
1885 |
case 0x2004: /* THREE-PER-EM SPACE */
|
1886 |
case 0x2005: /* FOUR-PER-EM SPACE */
|
1887 |
case 0x2006: /* SIX-PER-EM SPACE */
|
1888 |
case 0x2007: /* FIGURE SPACE */
|
1889 |
case 0x2008: /* PUNCTUATION SPACE */
|
1890 |
case 0x2009: /* THIN SPACE */
|
1891 |
case 0x200A: /* HAIR SPACE */
|
1892 |
case 0x202f: /* NARROW NO-BREAK SPACE */
|
1893 |
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
|
1894 |
case 0x3000: /* IDEOGRAPHIC SPACE */
|
1895 |
break;
|
1896 |
|
1897 |
default:
|
1898 |
ADD_NEW(state_offset + 1, 0);
|
1899 |
break;
|
1900 |
}
|
1901 |
break;
|
1902 |
|
1903 |
/*-----------------------------------------------------------------*/
|
1904 |
case OP_HSPACE:
|
1905 |
if (clen > 0) switch(c)
|
1906 |
{
|
1907 |
case 0x09: /* HT */
|
1908 |
case 0x20: /* SPACE */
|
1909 |
case 0xa0: /* NBSP */
|
1910 |
case 0x1680: /* OGHAM SPACE MARK */
|
1911 |
case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
|
1912 |
case 0x2000: /* EN QUAD */
|
1913 |
case 0x2001: /* EM QUAD */
|
1914 |
case 0x2002: /* EN SPACE */
|
1915 |
case 0x2003: /* EM SPACE */
|
1916 |
case 0x2004: /* THREE-PER-EM SPACE */
|
1917 |
case 0x2005: /* FOUR-PER-EM SPACE */
|
1918 |
case 0x2006: /* SIX-PER-EM SPACE */
|
1919 |
case 0x2007: /* FIGURE SPACE */
|
1920 |
case 0x2008: /* PUNCTUATION SPACE */
|
1921 |
case 0x2009: /* THIN SPACE */
|
1922 |
case 0x200A: /* HAIR SPACE */
|
1923 |
case 0x202f: /* NARROW NO-BREAK SPACE */
|
1924 |
case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
|
1925 |
case 0x3000: /* IDEOGRAPHIC SPACE */
|
1926 |
ADD_NEW(state_offset + 1, 0);
|
1927 |
break;
|
1928 |
}
|
1929 |
break;
|
1930 |
|
1931 |
/*-----------------------------------------------------------------*/
|
1932 |
/* Match a negated single character. This is only used for one-byte
|
1933 |
characters, that is, we know that d < 256. The character we are
|
1934 |
checking (c) can be multibyte. */
|
1935 |
|
1936 |
case OP_NOT:
|
1937 |
if (clen > 0)
|
1938 |
{
|
1939 |
unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
|
1940 |
if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
|
1941 |
}
|
1942 |
break;
|
1943 |
|
1944 |
/*-----------------------------------------------------------------*/
|
1945 |
case OP_PLUS:
|
1946 |
case OP_MINPLUS:
|
1947 |
case OP_POSPLUS:
|
1948 |
case OP_NOTPLUS:
|
1949 |
case OP_NOTMINPLUS:
|
1950 |
case OP_NOTPOSPLUS:
|
1951 |
count = current_state->count; /* Already matched */
|
1952 |
if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
|
1953 |
if (clen > 0)
|
1954 |
{
|
1955 |
unsigned int otherd = NOTACHAR;
|
1956 |
if ((ims & PCRE_CASELESS) != 0)
|
1957 |
{
|
1958 |
#ifdef SUPPORT_UTF8
|
1959 |
if (utf8 && d >= 128)
|
1960 |
{
|
1961 |
#ifdef SUPPORT_UCP
|
1962 |
otherd = UCD_OTHERCASE(d);
|
1963 |
#endif /* SUPPORT_UCP */
|
1964 |
}
|
1965 |
else
|
1966 |
#endif /* SUPPORT_UTF8 */
|
1967 |
otherd = fcc[d];
|
1968 |
}
|
1969 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
1970 |
{
|
1971 |
if (count > 0 &&
|
1972 |
(codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
|
1973 |
{
|
1974 |
active_count--; /* Remove non-match possibility */
|
1975 |
next_active_state--;
|
1976 |
}
|
1977 |
count++;
|
1978 |
ADD_NEW(state_offset, count);
|
1979 |
}
|
1980 |
}
|
1981 |
break;
|
1982 |
|
1983 |
/*-----------------------------------------------------------------*/
|
1984 |
case OP_QUERY:
|
1985 |
case OP_MINQUERY:
|
1986 |
case OP_POSQUERY:
|
1987 |
case OP_NOTQUERY:
|
1988 |
case OP_NOTMINQUERY:
|
1989 |
case OP_NOTPOSQUERY:
|
1990 |
ADD_ACTIVE(state_offset + dlen + 1, 0);
|
1991 |
if (clen > 0)
|
1992 |
{
|
1993 |
unsigned int otherd = NOTACHAR;
|
1994 |
if ((ims & PCRE_CASELESS) != 0)
|
1995 |
{
|
1996 |
#ifdef SUPPORT_UTF8
|
1997 |
if (utf8 && d >= 128)
|
1998 |
{
|
1999 |
#ifdef SUPPORT_UCP
|
2000 |
otherd = UCD_OTHERCASE(d);
|
2001 |
#endif /* SUPPORT_UCP */
|
2002 |
}
|
2003 |
else
|
2004 |
#endif /* SUPPORT_UTF8 */
|
2005 |
otherd = fcc[d];
|
2006 |
}
|
2007 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2008 |
{
|
2009 |
if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
|
2010 |
{
|
2011 |
active_count--; /* Remove non-match possibility */
|
2012 |
next_active_state--;
|
2013 |
}
|
2014 |
ADD_NEW(state_offset + dlen + 1, 0);
|
2015 |
}
|
2016 |
}
|
2017 |
break;
|
2018 |
|
2019 |
/*-----------------------------------------------------------------*/
|
2020 |
case OP_STAR:
|
2021 |
case OP_MINSTAR:
|
2022 |
case OP_POSSTAR:
|
2023 |
case OP_NOTSTAR:
|
2024 |
case OP_NOTMINSTAR:
|
2025 |
case OP_NOTPOSSTAR:
|
2026 |
ADD_ACTIVE(state_offset + dlen + 1, 0);
|
2027 |
if (clen > 0)
|
2028 |
{
|
2029 |
unsigned int otherd = NOTACHAR;
|
2030 |
if ((ims & PCRE_CASELESS) != 0)
|
2031 |
{
|
2032 |
#ifdef SUPPORT_UTF8
|
2033 |
if (utf8 && d >= 128)
|
2034 |
{
|
2035 |
#ifdef SUPPORT_UCP
|
2036 |
otherd = UCD_OTHERCASE(d);
|
2037 |
#endif /* SUPPORT_UCP */
|
2038 |
}
|
2039 |
else
|
2040 |
#endif /* SUPPORT_UTF8 */
|
2041 |
otherd = fcc[d];
|
2042 |
}
|
2043 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2044 |
{
|
2045 |
if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
|
2046 |
{
|
2047 |
active_count--; /* Remove non-match possibility */
|
2048 |
next_active_state--;
|
2049 |
}
|
2050 |
ADD_NEW(state_offset, 0);
|
2051 |
}
|
2052 |
}
|
2053 |
break;
|
2054 |
|
2055 |
/*-----------------------------------------------------------------*/
|
2056 |
case OP_EXACT:
|
2057 |
case OP_NOTEXACT:
|
2058 |
count = current_state->count; /* Number already matched */
|
2059 |
if (clen > 0)
|
2060 |
{
|
2061 |
unsigned int otherd = NOTACHAR;
|
2062 |
if ((ims & PCRE_CASELESS) != 0)
|
2063 |
{
|
2064 |
#ifdef SUPPORT_UTF8
|
2065 |
if (utf8 && d >= 128)
|
2066 |
{
|
2067 |
#ifdef SUPPORT_UCP
|
2068 |
otherd = UCD_OTHERCASE(d);
|
2069 |
#endif /* SUPPORT_UCP */
|
2070 |
}
|
2071 |
else
|
2072 |
#endif /* SUPPORT_UTF8 */
|
2073 |
otherd = fcc[d];
|
2074 |
}
|
2075 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2076 |
{
|
2077 |
if (++count >= GET2(code, 1))
|
2078 |
{ ADD_NEW(state_offset + dlen + 3, 0); }
|
2079 |
else
|
2080 |
{ ADD_NEW(state_offset, count); }
|
2081 |
}
|
2082 |
}
|
2083 |
break;
|
2084 |
|
2085 |
/*-----------------------------------------------------------------*/
|
2086 |
case OP_UPTO:
|
2087 |
case OP_MINUPTO:
|
2088 |
case OP_POSUPTO:
|
2089 |
case OP_NOTUPTO:
|
2090 |
case OP_NOTMINUPTO:
|
2091 |
case OP_NOTPOSUPTO:
|
2092 |
ADD_ACTIVE(state_offset + dlen + 3, 0);
|
2093 |
count = current_state->count; /* Number already matched */
|
2094 |
if (clen > 0)
|
2095 |
{
|
2096 |
unsigned int otherd = NOTACHAR;
|
2097 |
if ((ims & PCRE_CASELESS) != 0)
|
2098 |
{
|
2099 |
#ifdef SUPPORT_UTF8
|
2100 |
if (utf8 && d >= 128)
|
2101 |
{
|
2102 |
#ifdef SUPPORT_UCP
|
2103 |
otherd = UCD_OTHERCASE(d);
|
2104 |
#endif /* SUPPORT_UCP */
|
2105 |
}
|
2106 |
else
|
2107 |
#endif /* SUPPORT_UTF8 */
|
2108 |
otherd = fcc[d];
|
2109 |
}
|
2110 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2111 |
{
|
2112 |
if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
|
2113 |
{
|
2114 |
active_count--; /* Remove non-match possibility */
|
2115 |
next_active_state--;
|
2116 |
}
|
2117 |
if (++count >= GET2(code, 1))
|
2118 |
{ ADD_NEW(state_offset + dlen + 3, 0); }
|
2119 |
else
|
2120 |
{ ADD_NEW(state_offset, count); }
|
2121 |
}
|
2122 |
}
|
2123 |
break;
|
2124 |
|
2125 |
|
2126 |
/* ========================================================================== */
|
2127 |
/* These are the class-handling opcodes */
|
2128 |
|
2129 |
case OP_CLASS:
|
2130 |
case OP_NCLASS:
|
2131 |
case OP_XCLASS:
|
2132 |
{
|
2133 |
BOOL isinclass = FALSE;
|
2134 |
int next_state_offset;
|
2135 |
const uschar *ecode;
|
2136 |
|
2137 |
/* For a simple class, there is always just a 32-byte table, and we
|
2138 |
can set isinclass from it. */
|
2139 |
|
2140 |
if (codevalue != OP_XCLASS)
|
2141 |
{
|
2142 |
ecode = code + 33;
|
2143 |
if (clen > 0)
|
2144 |
{
|
2145 |
isinclass = (c > 255)? (codevalue == OP_NCLASS) :
|
2146 |
((code[1 + c/8] & (1 << (c&7))) != 0);
|
2147 |
}
|
2148 |
}
|
2149 |
|
2150 |
/* An extended class may have a table or a list of single characters,
|
2151 |
ranges, or both, and it may be positive or negative. There's a
|
2152 |
function that sorts all this out. */
|
2153 |
|
2154 |
else
|
2155 |
{
|
2156 |
ecode = code + GET(code, 1);
|
2157 |
if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
|
2158 |
}
|
2159 |
|
2160 |
/* At this point, isinclass is set for all kinds of class, and ecode
|
2161 |
points to the byte after the end of the class. If there is a
|
2162 |
quantifier, this is where it will be. */
|
2163 |
|
2164 |
next_state_offset = ecode - start_code;
|
2165 |
|
2166 |
switch (*ecode)
|
2167 |
{
|
2168 |
case OP_CRSTAR:
|
2169 |
case OP_CRMINSTAR:
|
2170 |
ADD_ACTIVE(next_state_offset + 1, 0);
|
2171 |
if (isinclass) { ADD_NEW(state_offset, 0); }
|
2172 |
break;
|
2173 |
|
2174 |
case OP_CRPLUS:
|
2175 |
case OP_CRMINPLUS:
|
2176 |
count = current_state->count; /* Already matched */
|
2177 |
if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
|
2178 |
if (isinclass) { count++; ADD_NEW(state_offset, count); }
|
2179 |
break;
|
2180 |
|
2181 |
case OP_CRQUERY:
|
2182 |
case OP_CRMINQUERY:
|
2183 |
ADD_ACTIVE(next_state_offset + 1, 0);
|
2184 |
if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
|
2185 |
break;
|
2186 |
|
2187 |
case OP_CRRANGE:
|
2188 |
case OP_CRMINRANGE:
|
2189 |
count = current_state->count; /* Already matched */
|
2190 |
if (count >= GET2(ecode, 1))
|
2191 |
{ ADD_ACTIVE(next_state_offset + 5, 0); }
|
2192 |
if (isinclass)
|
2193 |
{
|
2194 |
int max = GET2(ecode, 3);
|
2195 |
if (++count >= max && max != 0) /* Max 0 => no limit */
|
2196 |
{ ADD_NEW(next_state_offset + 5, 0); }
|
2197 |
else
|
2198 |
{ ADD_NEW(state_offset, count); }
|
2199 |
}
|
2200 |
break;
|
2201 |
|
2202 |
default:
|
2203 |
if (isinclass) { ADD_NEW(next_state_offset, 0); }
|
2204 |
break;
|
2205 |
}
|
2206 |
}
|
2207 |
break;
|
2208 |
|
2209 |
/* ========================================================================== */
|
2210 |
/* These are the opcodes for fancy brackets of various kinds. We have
|
2211 |
to use recursion in order to handle them. The "always failing" assertion
|
2212 |
(?!) is optimised to OP_FAIL when compiling, so we have to support that,
|
2213 |
though the other "backtracking verbs" are not supported. */
|
2214 |
|
2215 |
case OP_FAIL:
|
2216 |
forced_fail++; /* Count FAILs for multiple states */
|
2217 |
break;
|
2218 |
|
2219 |
case OP_ASSERT:
|
2220 |
case OP_ASSERT_NOT:
|
2221 |
case OP_ASSERTBACK:
|
2222 |
case OP_ASSERTBACK_NOT:
|
2223 |
{
|
2224 |
int rc;
|
2225 |
int local_offsets[2];
|
2226 |
int local_workspace[1000];
|
2227 |
const uschar *endasscode = code + GET(code, 1);
|
2228 |
|
2229 |
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
|
2230 |
|
2231 |
rc = internal_dfa_exec(
|
2232 |
md, /* static match data */
|
2233 |
code, /* this subexpression's code */
|
2234 |
ptr, /* where we currently are */
|
2235 |
ptr - start_subject, /* start offset */
|
2236 |
local_offsets, /* offset vector */
|
2237 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2238 |
local_workspace, /* workspace vector */
|
2239 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2240 |
ims, /* the current ims flags */
|
2241 |
rlevel, /* function recursion level */
|
2242 |
recursing); /* pass on regex recursion */
|
2243 |
|
2244 |
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
|
2245 |
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
|
2246 |
}
|
2247 |
break;
|
2248 |
|
2249 |
/*-----------------------------------------------------------------*/
|
2250 |
case OP_COND:
|
2251 |
case OP_SCOND:
|
2252 |
{
|
2253 |
int local_offsets[1000];
|
2254 |
int local_workspace[1000];
|
2255 |
int codelink = GET(code, 1);
|
2256 |
int condcode;
|
2257 |
|
2258 |
/* Because of the way auto-callout works during compile, a callout item
|
2259 |
is inserted between OP_COND and an assertion condition. This does not
|
2260 |
happen for the other conditions. */
|
2261 |
|
2262 |
if (code[LINK_SIZE+1] == OP_CALLOUT)
|
2263 |
{
|
2264 |
rrc = 0;
|
2265 |
if (pcre_callout != NULL)
|
2266 |
{
|
2267 |
pcre_callout_block cb;
|
2268 |
cb.version = 1; /* Version 1 of the callout block */
|
2269 |
cb.callout_number = code[LINK_SIZE+2];
|
2270 |
cb.offset_vector = offsets;
|
2271 |
cb.subject = (PCRE_SPTR)start_subject;
|
2272 |
cb.subject_length = end_subject - start_subject;
|
2273 |
cb.start_match = current_subject - start_subject;
|
2274 |
cb.current_position = ptr - start_subject;
|
2275 |
cb.pattern_position = GET(code, LINK_SIZE + 3);
|
2276 |
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
|
2277 |
cb.capture_top = 1;
|
2278 |
cb.capture_last = -1;
|
2279 |
cb.callout_data = md->callout_data;
|
2280 |
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
|
2281 |
}
|
2282 |
if (rrc > 0) break; /* Fail this thread */
|
2283 |
code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
|
2284 |
}
|
2285 |
|
2286 |
condcode = code[LINK_SIZE+1];
|
2287 |
|
2288 |
/* Back reference conditions are not supported */
|
2289 |
|
2290 |
if (condcode == OP_CREF || condcode == OP_NCREF)
|
2291 |
return PCRE_ERROR_DFA_UCOND;
|
2292 |
|
2293 |
/* The DEFINE condition is always false */
|
2294 |
|
2295 |
if (condcode == OP_DEF)
|
2296 |
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
2297 |
|
2298 |
/* The only supported version of OP_RREF is for the value RREF_ANY,
|
2299 |
which means "test if in any recursion". We can't test for specifically
|
2300 |
recursed groups. */
|
2301 |
|
2302 |
else if (condcode == OP_RREF || condcode == OP_NRREF)
|
2303 |
{
|
2304 |
int value = GET2(code, LINK_SIZE+2);
|
2305 |
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
|
2306 |
if (recursing > 0)
|
2307 |
{ ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
|
2308 |
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
2309 |
}
|
2310 |
|
2311 |
/* Otherwise, the condition is an assertion */
|
2312 |
|
2313 |
else
|
2314 |
{
|
2315 |
int rc;
|
2316 |
const uschar *asscode = code + LINK_SIZE + 1;
|
2317 |
const uschar *endasscode = asscode + GET(asscode, 1);
|
2318 |
|
2319 |
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
|
2320 |
|
2321 |
rc = internal_dfa_exec(
|
2322 |
md, /* fixed match data */
|
2323 |
asscode, /* this subexpression's code */
|
2324 |
ptr, /* where we currently are */
|
2325 |
ptr - start_subject, /* start offset */
|
2326 |
local_offsets, /* offset vector */
|
2327 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2328 |
local_workspace, /* workspace vector */
|
2329 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2330 |
ims, /* the current ims flags */
|
2331 |
rlevel, /* function recursion level */
|
2332 |
recursing); /* pass on regex recursion */
|
2333 |
|
2334 |
if ((rc >= 0) ==
|
2335 |
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
|
2336 |
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
|
2337 |
else
|
2338 |
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
2339 |
}
|
2340 |
}
|
2341 |
break;
|
2342 |
|
2343 |
/*-----------------------------------------------------------------*/
|
2344 |
case OP_RECURSE:
|
2345 |
{
|
2346 |
int local_offsets[1000];
|
2347 |
int local_workspace[1000];
|
2348 |
int rc;
|
2349 |
|
2350 |
DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
|
2351 |
recursing + 1));
|
2352 |
|
2353 |
rc = internal_dfa_exec(
|
2354 |
md, /* fixed match data */
|
2355 |
start_code + GET(code, 1), /* this subexpression's code */
|
2356 |
ptr, /* where we currently are */
|
2357 |
ptr - start_subject, /* start offset */
|
2358 |
local_offsets, /* offset vector */
|
2359 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2360 |
local_workspace, /* workspace vector */
|
2361 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2362 |
ims, /* the current ims flags */
|
2363 |
rlevel, /* function recursion level */
|
2364 |
recursing + 1); /* regex recurse level */
|
2365 |
|
2366 |
DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
|
2367 |
recursing + 1, rc));
|
2368 |
|
2369 |
/* Ran out of internal offsets */
|
2370 |
|
2371 |
if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
|
2372 |
|
2373 |
/* For each successful matched substring, set up the next state with a
|
2374 |
count of characters to skip before trying it. Note that the count is in
|
2375 |
characters, not bytes. */
|
2376 |
|
2377 |
if (rc > 0)
|
2378 |
{
|
2379 |
for (rc = rc*2 - 2; rc >= 0; rc -= 2)
|
2380 |
{
|
2381 |
const uschar *p = start_subject + local_offsets[rc];
|
2382 |
const uschar *pp = start_subject + local_offsets[rc+1];
|
2383 |
int charcount = local_offsets[rc+1] - local_offsets[rc];
|
2384 |
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
|
2385 |
if (charcount > 0)
|
2386 |
{
|
2387 |
ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
|
2388 |
}
|
2389 |
else
|
2390 |
{
|
2391 |
ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
|
2392 |
}
|
2393 |
}
|
2394 |
}
|
2395 |
else if (rc != PCRE_ERROR_NOMATCH) return rc;
|
2396 |
}
|
2397 |
break;
|
2398 |
|
2399 |
/*-----------------------------------------------------------------*/
|
2400 |
case OP_ONCE:
|
2401 |
{
|
2402 |
int local_offsets[2];
|
2403 |
int local_workspace[1000];
|
2404 |
|
2405 |
int rc = internal_dfa_exec(
|
2406 |
md, /* fixed match data */
|
2407 |
code, /* this subexpression's code */
|
2408 |
ptr, /* where we currently are */
|
2409 |
ptr - start_subject, /* start offset */
|
2410 |
local_offsets, /* offset vector */
|
2411 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2412 |
local_workspace, /* workspace vector */
|
2413 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2414 |
ims, /* the current ims flags */
|
2415 |
rlevel, /* function recursion level */
|
2416 |
recursing); /* pass on regex recursion */
|
2417 |
|
2418 |
if (rc >= 0)
|
2419 |
{
|
2420 |
const uschar *end_subpattern = code;
|
2421 |
int charcount = local_offsets[1] - local_offsets[0];
|
2422 |
int next_state_offset, repeat_state_offset;
|
2423 |
|
2424 |
do { end_subpattern += GET(end_subpattern, 1); }
|
2425 |
while (*end_subpattern == OP_ALT);
|
2426 |
next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
|
2427 |
|
2428 |
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
|
2429 |
arrange for the repeat state also to be added to the relevant list.
|
2430 |
Calculate the offset, or set -1 for no repeat. */
|
2431 |
|
2432 |
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
|
2433 |
*end_subpattern == OP_KETRMIN)?
|
2434 |
end_subpattern - start_code - GET(end_subpattern, 1) : -1;
|
2435 |
|
2436 |
/* If we have matched an empty string, add the next state at the
|
2437 |
current character pointer. This is important so that the duplicate
|
2438 |
checking kicks in, which is what breaks infinite loops that match an
|
2439 |
empty string. */
|
2440 |
|
2441 |
if (charcount == 0)
|
2442 |
{
|
2443 |
ADD_ACTIVE(next_state_offset, 0);
|
2444 |
}
|
2445 |
|
2446 |
/* Optimization: if there are no more active states, and there
|
2447 |
are no new states yet set up, then skip over the subject string
|
2448 |
right here, to save looping. Otherwise, set up the new state to swing
|
2449 |
into action when the end of the substring is reached. */
|
2450 |
|
2451 |
else if (i + 1 >= active_count && new_count == 0)
|
2452 |
{
|
2453 |
ptr += charcount;
|
2454 |
clen = 0;
|
2455 |
ADD_NEW(next_state_offset, 0);
|
2456 |
|
2457 |
/* If we are adding a repeat state at the new character position,
|
2458 |
we must fudge things so that it is the only current state.
|
2459 |
Otherwise, it might be a duplicate of one we processed before, and
|
2460 |
that would cause it to be skipped. */
|
2461 |
|
2462 |
if (repeat_state_offset >= 0)
|
2463 |
{
|
2464 |
next_active_state = active_states;
|
2465 |
active_count = 0;
|
2466 |
i = -1;
|
2467 |
ADD_ACTIVE(repeat_state_offset, 0);
|
2468 |
}
|
2469 |
}
|
2470 |
else
|
2471 |
{
|
2472 |
const uschar *p = start_subject + local_offsets[0];
|
2473 |
const uschar *pp = start_subject + local_offsets[1];
|
2474 |
while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
|
2475 |
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
|
2476 |
if (repeat_state_offset >= 0)
|
2477 |
{ ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
|
2478 |
}
|
2479 |
|
2480 |
}
|
2481 |
else if (rc != PCRE_ERROR_NOMATCH) return rc;
|
2482 |
}
|
2483 |
break;
|
2484 |
|
2485 |
|
2486 |
/* ========================================================================== */
|
2487 |
/* Handle callouts */
|
2488 |
|
2489 |
case OP_CALLOUT:
|
2490 |
rrc = 0;
|
2491 |
if (pcre_callout != NULL)
|
2492 |
{
|
2493 |
pcre_callout_block cb;
|
2494 |
cb.version = 1; /* Version 1 of the callout block */
|
2495 |
cb.callout_number = code[1];
|
2496 |
cb.offset_vector = offsets;
|
2497 |
cb.subject = (PCRE_SPTR)start_subject;
|
2498 |
cb.subject_length = end_subject - start_subject;
|
2499 |
cb.start_match = current_subject - start_subject;
|
2500 |
cb.current_position = ptr - start_subject;
|
2501 |
cb.pattern_position = GET(code, 2);
|
2502 |
cb.next_item_length = GET(code, 2 + LINK_SIZE);
|
2503 |
cb.capture_top = 1;
|
2504 |
cb.capture_last = -1;
|
2505 |
cb.callout_data = md->callout_data;
|
2506 |
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
|
2507 |
}
|
2508 |
if (rrc == 0)
|
2509 |
{ ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
|
2510 |
break;
|
2511 |
|
2512 |
|
2513 |
/* ========================================================================== */
|
2514 |
default: /* Unsupported opcode */
|
2515 |
return PCRE_ERROR_DFA_UITEM;
|
2516 |
}
|
2517 |
|
2518 |
NEXT_ACTIVE_STATE: continue;
|
2519 |
|
2520 |
} /* End of loop scanning active states */
|
2521 |
|
2522 |
/* We have finished the processing at the current subject character. If no
|
2523 |
new states have been set for the next character, we have found all the
|
2524 |
matches that we are going to find. If we are at the top level and partial
|
2525 |
matching has been requested, check for appropriate conditions. The "forced_
|
2526 |
fail" variable counts the number of (*F) encountered for the character. If it
|
2527 |
is equal to the original active_count (saved in workspace[1]) it means that
|
2528 |
(*F) was found on every active state. In this case we don't want to give a
|
2529 |
partial match. */
|
2530 |
|
2531 |
if (new_count <= 0)
|
2532 |
{
|
2533 |
if (rlevel == 1 && /* Top level, and */
|
2534 |
reached_end != workspace[1] && /* Not all reached end */
|
2535 |
forced_fail != workspace[1] && /* Not all forced fail & */
|
2536 |
( /* either... */
|
2537 |
(md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
|
2538 |
|| /* or... */
|
2539 |
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
|
2540 |
match_count < 0) /* no matches */
|
2541 |
) && /* And... */
|
2542 |
ptr >= end_subject && /* Reached end of subject */
|
2543 |
ptr > current_subject) /* Matched non-empty string */
|
2544 |
{
|
2545 |
if (offsetcount >= 2)
|
2546 |
{
|
2547 |
offsets[0] = md->start_used_ptr - start_subject;
|
2548 |
offsets[1] = end_subject - start_subject;
|
2549 |
}
|
2550 |
match_count = PCRE_ERROR_PARTIAL;
|
2551 |
}
|
2552 |
|
2553 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
|
2554 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
|
2555 |
rlevel*2-2, SP));
|
2556 |
break; /* In effect, "return", but see the comment below */
|
2557 |
}
|
2558 |
|
2559 |
/* One or more states are active for the next character. */
|
2560 |
|
2561 |
ptr += clen; /* Advance to next subject character */
|
2562 |
} /* Loop to move along the subject string */
|
2563 |
|
2564 |
/* Control gets here from "break" a few lines above. We do it this way because
|
2565 |
if we use "return" above, we have compiler trouble. Some compilers warn if
|
2566 |
there's nothing here because they think the function doesn't return a value. On
|
2567 |
the other hand, if we put a dummy statement here, some more clever compilers
|
2568 |
complain that it can't be reached. Sigh. */
|
2569 |
|
2570 |
return match_count;
|
2571 |
}
|
2572 |
|
2573 |
|
2574 |
|
2575 |
|
2576 |
/*************************************************
*  Execute a Regular Expression - DFA engine     *
*************************************************/
|
2579 |
|
2580 |
/* This external function applies a compiled re to a subject string using a DFA
engine. This function calls the internal function multiple times if the pattern
is not anchored.

Arguments:
  argument_re     points to the compiled expression
  extra_data      points to extra data or is NULL
  subject         points to the subject string
  length          length of subject string (may contain binary zeros)
  start_offset    where to start in the subject string
  options         option bits
  offsets         vector of match offsets (may be NULL only if offsetcount is 0)
  offsetcount     size of same (number of ints; must not be negative)
  workspace       workspace vector (must not be NULL)
  wscount         size of same (number of ints; must be at least 20)

Returns:          > 0 => number of match offset pairs placed in offsets
                  = 0 => offsets overflowed; longest matches are present
                   -1 => failed to match
                 < -1 => some kind of unexpected problem
*/
|
2601 |
|
2602 |
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
2603 |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
|
2604 |
const char *subject, int length, int start_offset, int options, int *offsets,
|
2605 |
int offsetcount, int *workspace, int wscount)
|
2606 |
{
|
2607 |
real_pcre *re = (real_pcre *)argument_re;
|
2608 |
dfa_match_data match_block;
|
2609 |
dfa_match_data *md = &match_block;
|
2610 |
BOOL utf8, anchored, startline, firstline;
|
2611 |
const uschar *current_subject, *end_subject, *lcc;
|
2612 |
|
2613 |
pcre_study_data internal_study;
|
2614 |
const pcre_study_data *study = NULL;
|
2615 |
real_pcre internal_re;
|
2616 |
|
2617 |
const uschar *req_byte_ptr;
|
2618 |
const uschar *start_bits = NULL;
|
2619 |
BOOL first_byte_caseless = FALSE;
|
2620 |
BOOL req_byte_caseless = FALSE;
|
2621 |
int first_byte = -1;
|
2622 |
int req_byte = -1;
|
2623 |
int req_byte2 = -1;
|
2624 |
int newline;
|
2625 |
|
2626 |
/* Plausibility checks */
|
2627 |
|
2628 |
if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
|
2629 |
if (re == NULL || subject == NULL || workspace == NULL ||
|
2630 |
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
|
2631 |
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
|
2632 |
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
|
2633 |
|
2634 |
/* We need to find the pointer to any study data before we test for byte
|
2635 |
flipping, so we scan the extra_data block first. This may set two fields in the
|
2636 |
match block, so we must initialize them beforehand. However, the other fields
|
2637 |
in the match block must not be set until after the byte flipping. */
|
2638 |
|
2639 |
md->tables = re->tables;
|
2640 |
md->callout_data = NULL;
|
2641 |
|
2642 |
if (extra_data != NULL)
|
2643 |
{
|
2644 |
unsigned int flags = extra_data->flags;
|
2645 |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
|
2646 |
study = (const pcre_study_data *)extra_data->study_data;
|
2647 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
|
2648 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
|
2649 |
return PCRE_ERROR_DFA_UMLIMIT;
|
2650 |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
|
2651 |
md->callout_data = extra_data->callout_data;
|
2652 |
if ((flags & PCRE_EXTRA_TABLES) != 0)
|
2653 |
md->tables = extra_data->tables;
|
2654 |
}
|
2655 |
|
2656 |
/* Check that the first field in the block is the magic number. If it is not,
|
2657 |
test for a regex that was compiled on a host of opposite endianness. If this is
|
2658 |
the case, flipped values are put in internal_re and internal_study if there was
|
2659 |
study data too. */
|
2660 |
|
2661 |
if (re->magic_number != MAGIC_NUMBER)
|
2662 |
{
|
2663 |
re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
|
2664 |
if (re == NULL) return PCRE_ERROR_BADMAGIC;
|
2665 |
if (study != NULL) study = &internal_study;
|
2666 |
}
|
2667 |
|
2668 |
/* Set some local values */
|
2669 |
|
2670 |
current_subject = (const unsigned char *)subject + start_offset;
|
2671 |
end_subject = (const unsigned char *)subject + length;
|
2672 |
req_byte_ptr = current_subject - 1;
|
2673 |
|
2674 |
#ifdef SUPPORT_UTF8
|
2675 |
utf8 = (re->options & PCRE_UTF8) != 0;
|
2676 |
#else
|
2677 |
utf8 = FALSE;
|
2678 |
#endif
|
2679 |
|
2680 |
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
|
2681 |
(re->options & PCRE_ANCHORED) != 0;
|
2682 |
|
2683 |
/* The remaining fixed data for passing around. */
|
2684 |
|
2685 |
md->start_code = (const uschar *)argument_re +
|
2686 |
re->name_table_offset + re->name_count * re->name_entry_size;
|
2687 |
md->start_subject = (const unsigned char *)subject;
|
2688 |
md->end_subject = end_subject;
|
2689 |
md->start_offset = start_offset;
|
2690 |
md->moptions = options;
|
2691 |
md->poptions = re->options;
|
2692 |
|
2693 |
/* If the BSR option is not set at match time, copy what was set
|
2694 |
at compile time. */
|
2695 |
|
2696 |
if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
|
2697 |
{
|
2698 |
if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
|
2699 |
md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
|
2700 |
#ifdef BSR_ANYCRLF
|
2701 |
else md->moptions |= PCRE_BSR_ANYCRLF;
|
2702 |
#endif
|
2703 |
}
|
2704 |
|
2705 |
/* Handle different types of newline. The three bits give eight cases. If
|
2706 |
nothing is set at run time, whatever was used at compile time applies. */
|
2707 |
|
2708 |
switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
|
2709 |
PCRE_NEWLINE_BITS)
|
2710 |
{
|
2711 |
case 0: newline = NEWLINE; break; /* Compile-time default */
|
2712 |
case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
|
2713 |
case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
|
2714 |
case PCRE_NEWLINE_CR+
|
2715 |
PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
|
2716 |
case PCRE_NEWLINE_ANY: newline = -1; break;
|
2717 |
case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
|
2718 |
default: return PCRE_ERROR_BADNEWLINE;
|
2719 |
}
|
2720 |
|
2721 |
if (newline == -2)
|
2722 |
{
|
2723 |
md->nltype = NLTYPE_ANYCRLF;
|
2724 |
}
|
2725 |
else if (newline < 0)
|
2726 |
{
|
2727 |
md->nltype = NLTYPE_ANY;
|
2728 |
}
|
2729 |
else
|
2730 |
{
|
2731 |
md->nltype = NLTYPE_FIXED;
|
2732 |
if (newline > 255)
|
2733 |
{
|
2734 |
md->nllen = 2;
|
2735 |
md->nl[0] = (newline >> 8) & 255;
|
2736 |
md->nl[1] = newline & 255;
|
2737 |
}
|
2738 |
else
|
2739 |
{
|
2740 |
md->nllen = 1;
|
2741 |
md->nl[0] = newline;
|
2742 |
}
|
2743 |
}
|
2744 |
|
2745 |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing
|
2746 |
back the character offset. */
|
2747 |
|
2748 |
#ifdef SUPPORT_UTF8
|
2749 |
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
|
2750 |
{
|
2751 |
if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
|
2752 |
return PCRE_ERROR_BADUTF8;
|
2753 |
if (start_offset > 0 && start_offset < length)
|
2754 |
{
|
2755 |
int tb = ((uschar *)subject)[start_offset];
|
2756 |
if (tb > 127)
|
2757 |
{
|
2758 |
tb &= 0xc0;
|
2759 |
if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
|
2760 |
}
|
2761 |
}
|
2762 |
}
|
2763 |
#endif
|
2764 |
|
2765 |
/* If the exec call supplied NULL for tables, use the inbuilt ones. This
|
2766 |
is a feature that makes it possible to save compiled regex and re-use them
|
2767 |
in other programs later. */
|
2768 |
|
2769 |
if (md->tables == NULL) md->tables = _pcre_default_tables;
|
2770 |
|
2771 |
/* The lower casing table and the "must be at the start of a line" flag are
|
2772 |
used in a loop when finding where to start. */
|
2773 |
|
2774 |
lcc = md->tables + lcc_offset;
|
2775 |
startline = (re->flags & PCRE_STARTLINE) != 0;
|
2776 |
firstline = (re->options & PCRE_FIRSTLINE) != 0;
|
2777 |
|
2778 |
/* Set up the first character to match, if available. The first_byte value is
|
2779 |
never set for an anchored regular expression, but the anchoring may be forced
|
2780 |
at run time, so we have to test for anchoring. The first char may be unset for
|
2781 |
an unanchored pattern, of course. If there's no first char and the pattern was
|
2782 |
studied, there may be a bitmap of possible first characters. */
|
2783 |
|
2784 |
if (!anchored)
|
2785 |
{
|
2786 |
if ((re->flags & PCRE_FIRSTSET) != 0)
|
2787 |
{
|
2788 |
first_byte = re->first_byte & 255;
|
2789 |
if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
|
2790 |
first_byte = lcc[first_byte];
|
2791 |
}
|
2792 |
else
|
2793 |
{
|
2794 |
if (!startline && study != NULL &&
|
2795 |
(study->flags & PCRE_STUDY_MAPPED) != 0)
|
2796 |
start_bits = study->start_bits;
|
2797 |
}
|
2798 |
}
|
2799 |
|
2800 |
/* For anchored or unanchored matches, there may be a "last known required
|
2801 |
character" set. */
|
2802 |
|
2803 |
if ((re->flags & PCRE_REQCHSET) != 0)
|
2804 |
{
|
2805 |
req_byte = re->req_byte & 255;
|
2806 |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
|
2807 |
req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
|
2808 |
}
|
2809 |
|
2810 |
/* Call the main matching function, looping for a non-anchored regex after a
|
2811 |
failed match. If not restarting, perform certain optimizations at the start of
|
2812 |
a match. */
|
2813 |
|
2814 |
for (;;)
|
2815 |
{
|
2816 |
int rc;
|
2817 |
|
2818 |
if ((options & PCRE_DFA_RESTART) == 0)
|
2819 |
{
|
2820 |
const uschar *save_end_subject = end_subject;
|
2821 |
|
2822 |
/* If firstline is TRUE, the start of the match is constrained to the first
|
2823 |
line of a multiline string. Implement this by temporarily adjusting
|
2824 |
end_subject so that we stop scanning at a newline. If the match fails at
|
2825 |
the newline, later code breaks this loop. */
|
2826 |
|
2827 |
if (firstline)
|
2828 |
{
|
2829 |
USPTR t = current_subject;
|
2830 |
#ifdef SUPPORT_UTF8
|
2831 |
if (utf8)
|
2832 |
{
|
2833 |
while (t < md->end_subject && !IS_NEWLINE(t))
|
2834 |
{
|
2835 |
t++;
|
2836 |
while (t < end_subject && (*t & 0xc0) == 0x80) t++;
|
2837 |
}
|
2838 |
}
|
2839 |
else
|
2840 |
#endif
|
2841 |
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
|
2842 |
end_subject = t;
|
2843 |
}
|
2844 |
|
2845 |
/* There are some optimizations that avoid running the match if a known
|
2846 |
starting point is not found. However, there is an option that disables
|
2847 |
these, for testing and for ensuring that all callouts do actually occur. */
|
2848 |
|
2849 |
if ((options & PCRE_NO_START_OPTIMIZE) == 0)
|
2850 |
{
|
2851 |
/* Advance to a known first byte. */
|
2852 |
|
2853 |
if (first_byte >= 0)
|
2854 |
{
|
2855 |
if (first_byte_caseless)
|
2856 |
while (current_subject < end_subject &&
|
2857 |
lcc[*current_subject] != first_byte)
|
2858 |
current_subject++;
|
2859 |
else
|
2860 |
while (current_subject < end_subject &&
|
2861 |
*current_subject != first_byte)
|
2862 |
current_subject++;
|
2863 |
}
|
2864 |
|
2865 |
/* Or to just after a linebreak for a multiline match if possible */
|
2866 |
|
2867 |
else if (startline)
|
2868 |
{
|
2869 |
if (current_subject > md->start_subject + start_offset)
|
2870 |
{
|
2871 |
#ifdef SUPPORT_UTF8
|
2872 |
if (utf8)
|
2873 |
{
|
2874 |
while (current_subject < end_subject &&
|
2875 |
!WAS_NEWLINE(current_subject))
|
2876 |
{
|
2877 |
current_subject++;
|
2878 |
while(current_subject < end_subject &&
|
2879 |
(*current_subject & 0xc0) == 0x80)
|
2880 |
current_subject++;
|
2881 |
}
|
2882 |
}
|
2883 |
else
|
2884 |
#endif
|
2885 |
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
|
2886 |
current_subject++;
|
2887 |
|
2888 |
/* If we have just passed a CR and the newline option is ANY or
|
2889 |
ANYCRLF, and we are now at a LF, advance the match position by one
|
2890 |
more character. */
|
2891 |
|
2892 |
if (current_subject[-1] == CHAR_CR &&
|
2893 |
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
|
2894 |
current_subject < end_subject &&
|
2895 |
*current_subject == CHAR_NL)
|
2896 |
current_subject++;
|
2897 |
}
|
2898 |
}
|
2899 |
|
2900 |
/* Or to a non-unique first char after study */
|
2901 |
|
2902 |
else if (start_bits != NULL)
|
2903 |
{
|
2904 |
while (current_subject < end_subject)
|
2905 |
{
|
2906 |
register unsigned int c = *current_subject;
|
2907 |
if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
|
2908 |
else break;
|
2909 |
}
|
2910 |
}
|
2911 |
}
|
2912 |
|
2913 |
/* Restore fudged end_subject */
|
2914 |
|
2915 |
end_subject = save_end_subject;
|
2916 |
|
2917 |
/* The following two optimizations are disabled for partial matching or if
|
2918 |
disabling is explicitly requested (and of course, by the test above, this
|
2919 |
code is not obeyed when restarting after a partial match). */
|
2920 |
|
2921 |
if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
|
2922 |
(options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
|
2923 |
{
|
2924 |
/* If the pattern was studied, a minimum subject length may be set. This
|
2925 |
is a lower bound; no actual string of that length may actually match the
|
2926 |
pattern. Although the value is, strictly, in characters, we treat it as
|
2927 |
bytes to avoid spending too much time in this optimization. */
|
2928 |
|
2929 |
if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
|
2930 |
end_subject - current_subject < study->minlength)
|
2931 |
return PCRE_ERROR_NOMATCH;
|
2932 |
|
2933 |
/* If req_byte is set, we know that that character must appear in the
|
2934 |
subject for the match to succeed. If the first character is set, req_byte
|
2935 |
must be later in the subject; otherwise the test starts at the match
|
2936 |
point. This optimization can save a huge amount of work in patterns with
|
2937 |
nested unlimited repeats that aren't going to match. Writing separate
|
2938 |
code for cased/caseless versions makes it go faster, as does using an
|
2939 |
autoincrement and backing off on a match.
|
2940 |
|
2941 |
HOWEVER: when the subject string is very, very long, searching to its end
|
2942 |
can take a long time, and give bad performance on quite ordinary
|
2943 |
patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
|
2944 |
string... so we don't do this when the string is sufficiently long. */
|
2945 |
|
2946 |
if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
|
2947 |
{
|
2948 |
register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
|
2949 |
|
2950 |
/* We don't need to repeat the search if we haven't yet reached the
|
2951 |
place we found it at last time. */
|
2952 |
|
2953 |
if (p > req_byte_ptr)
|
2954 |
{
|
2955 |
if (req_byte_caseless)
|
2956 |
{
|
2957 |
while (p < end_subject)
|
2958 |
{
|
2959 |
register int pp = *p++;
|
2960 |
if (pp == req_byte || pp == req_byte2) { p--; break; }
|
2961 |
}
|
2962 |
}
|
2963 |
else
|
2964 |
{
|
2965 |
while (p < end_subject)
|
2966 |
{
|
2967 |
if (*p++ == req_byte) { p--; break; }
|
2968 |
}
|
2969 |
}
|
2970 |
|
2971 |
/* If we can't find the required character, break the matching loop,
|
2972 |
which will cause a return of PCRE_ERROR_NOMATCH. */
|
2973 |
|
2974 |
if (p >= end_subject) break;
|
2975 |
|
2976 |
/* If we have found the required character, save the point where we
|
2977 |
found it, so that we don't search again next time round the loop if
|
2978 |
the start hasn't passed this character yet. */
|
2979 |
|
2980 |
req_byte_ptr = p;
|
2981 |
}
|
2982 |
}
|
2983 |
}
|
2984 |
} /* End of optimizations that are done when not restarting */
|
2985 |
|
2986 |
/* OK, now we can do the business */
|
2987 |
|
2988 |
md->start_used_ptr = current_subject;
|
2989 |
|
2990 |
rc = internal_dfa_exec(
|
2991 |
md, /* fixed match data */
|
2992 |
md->start_code, /* this subexpression's code */
|
2993 |
current_subject, /* where we currently are */
|
2994 |
start_offset, /* start offset in subject */
|
2995 |
offsets, /* offset vector */
|
2996 |
offsetcount, /* size of same */
|
2997 |
workspace, /* workspace vector */
|
2998 |
wscount, /* size of same */
|
2999 |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
|
3000 |
0, /* function recurse level */
|
3001 |
0); /* regex recurse level */
|
3002 |
|
3003 |
/* Anything other than "no match" means we are done, always; otherwise, carry
|
3004 |
on only if not anchored. */
|
3005 |
|
3006 |
if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
|
3007 |
|
3008 |
/* Advance to the next subject character unless we are at the end of a line
|
3009 |
and firstline is set. */
|
3010 |
|
3011 |
if (firstline && IS_NEWLINE(current_subject)) break;
|
3012 |
current_subject++;
|
3013 |
if (utf8)
|
3014 |
{
|
3015 |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
|
3016 |
current_subject++;
|
3017 |
}
|
3018 |
if (current_subject > end_subject) break;
|
3019 |
|
3020 |
/* If we have just passed a CR and we are now at a LF, and the pattern does
|
3021 |
not contain any explicit matches for \r or \n, and the newline option is CRLF
|
3022 |
or ANY or ANYCRLF, advance the match position by one more character. */
|
3023 |
|
3024 |
if (current_subject[-1] == CHAR_CR &&
|
3025 |
current_subject < end_subject &&
|
3026 |
*current_subject == CHAR_NL &&
|
3027 |
(re->flags & PCRE_HASCRORLF) == 0 &&
|
3028 |
(md->nltype == NLTYPE_ANY ||
|
3029 |
md->nltype == NLTYPE_ANYCRLF ||
|
3030 |
md->nllen == 2))
|
3031 |
current_subject++;
|
3032 |
|
3033 |
} /* "Bumpalong" loop */
|
3034 |
|
3035 |
return PCRE_ERROR_NOMATCH;
|
3036 |
}
|
3037 |
|
3038 |
/* End of pcre_dfa_exec.c */
|