1 |
/*************************************************
|
2 |
* Perl-Compatible Regular Expressions *
|
3 |
*************************************************/
|
4 |
|
5 |
/* PCRE is a library of functions to support regular expressions whose syntax
|
6 |
and semantics are as close as possible to those of the Perl 5 language (but see
|
7 |
below for why this module is different).
|
8 |
|
9 |
Written by Philip Hazel
|
10 |
Copyright (c) 1997-2013 University of Cambridge
|
11 |
|
12 |
-----------------------------------------------------------------------------
|
13 |
Redistribution and use in source and binary forms, with or without
|
14 |
modification, are permitted provided that the following conditions are met:
|
15 |
|
16 |
* Redistributions of source code must retain the above copyright notice,
|
17 |
this list of conditions and the following disclaimer.
|
18 |
|
19 |
* Redistributions in binary form must reproduce the above copyright
|
20 |
notice, this list of conditions and the following disclaimer in the
|
21 |
documentation and/or other materials provided with the distribution.
|
22 |
|
23 |
* Neither the name of the University of Cambridge nor the names of its
|
24 |
contributors may be used to endorse or promote products derived from
|
25 |
this software without specific prior written permission.
|
26 |
|
27 |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
28 |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
29 |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
30 |
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
31 |
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
32 |
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
33 |
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
34 |
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
35 |
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
36 |
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
37 |
POSSIBILITY OF SUCH DAMAGE.
|
38 |
-----------------------------------------------------------------------------
|
39 |
*/
|
40 |
|
41 |
/* This module contains the external function pcre_dfa_exec(), which is an
|
42 |
alternative matching function that uses a sort of DFA algorithm (not a true
|
43 |
FSM). This is NOT Perl-compatible, but it has advantages in certain
|
44 |
applications. */
|
45 |
|
46 |
|
47 |
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
|
48 |
the performance of his patterns greatly. I could not use it as it stood, as it
|
49 |
was not thread safe, and made assumptions about pattern sizes. Also, it caused
|
50 |
test 7 to loop, and test 9 to crash with a segfault.
|
51 |
|
52 |
The issue is the check for duplicate states, which is done by a simple linear
|
53 |
search up the state list. (Grep for "duplicate" below to find the code.) For
|
54 |
many patterns, there will never be many states active at one time, so a simple
|
55 |
linear search is fine. In patterns that have many active states, it might be a
|
56 |
bottleneck. The suggested code used an indexing scheme to remember which states
|
57 |
had previously been used for each character, and avoided the linear search when
|
58 |
it knew there was no chance of a duplicate. This was implemented when adding
|
59 |
states to the state lists.
|
60 |
|
61 |
I wrote some thread-safe, not-limited code to try something similar at the time
|
62 |
of checking for duplicates (instead of when adding states), using index vectors
|
63 |
on the stack. It did give a 13% improvement with one specially constructed
|
64 |
pattern for certain subject strings, but on other strings and on many of the
|
65 |
simpler patterns in the test suite it did worse. The major problem, I think,
|
66 |
was the extra time to initialize the index. This had to be done for each call
|
67 |
of internal_dfa_exec(). (The supplied patch used a static vector, initialized
|
68 |
only once - I suspect this was the cause of the problems with the tests.)
|
69 |
|
70 |
Overall, I concluded that the gains in some cases did not outweigh the losses
|
71 |
in others, so I abandoned this code. */
|
72 |
|
73 |
|
74 |
|
75 |
#ifdef HAVE_CONFIG_H
|
76 |
#include "config.h"
|
77 |
#endif
|
78 |
|
79 |
#define NLBLOCK md /* Block containing newline information */
|
80 |
#define PSSTART start_subject /* Field containing processed string start */
|
81 |
#define PSEND end_subject /* Field containing processed string end */
|
82 |
|
83 |
#include "pcre_internal.h"
|
84 |
|
85 |
|
86 |
/* Padding used to indent debugging output. It is always printed with "%.*s"
and a precision of rlevel*2-2, so the string has to be at least as long as the
deepest indentation wanted: a one-character string would flatten every nested
level to a single column. Indentation deeper than the string is clamped to the
full string. */

#define SP "                                "
|
89 |
|
90 |
|
91 |
/*************************************************
|
92 |
* Code parameters and static tables *
|
93 |
*************************************************/
|
94 |
|
95 |
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
|
96 |
into others, under special conditions. A gap of 20 between the blocks should be
|
97 |
enough. The resulting opcodes don't have to be less than 256 because they are
|
98 |
never stored, so we push them well clear of the normal opcodes. */
|
99 |
|
100 |
/* Each block starts 20 above the previous one; only the base is a literal. */

#define OP_PROP_EXTRA     300                     /* OP_PROP / OP_NOTPROP argument */
#define OP_EXTUNI_EXTRA   (OP_PROP_EXTRA + 20)    /* OP_EXTUNI argument */
#define OP_ANYNL_EXTRA    (OP_PROP_EXTRA + 40)    /* OP_ANYNL argument */
#define OP_HSPACE_EXTRA   (OP_PROP_EXTRA + 60)    /* OP_HSPACE / OP_NOT_HSPACE argument */
#define OP_VSPACE_EXTRA   (OP_PROP_EXTRA + 80)    /* OP_VSPACE / OP_NOT_VSPACE argument */
|
105 |
|
106 |
|
107 |
/* This table identifies those opcodes that are followed immediately by a
|
108 |
character that is to be tested in some way. This makes it possible to
|
109 |
centralize the loading of these characters. In the case of Type * etc, the
|
110 |
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
|
111 |
small value. Non-zero values in the table are the offsets from the opcode where
|
112 |
the character is to be found. ***NOTE*** If the start of this table is
|
113 |
modified, the three tables that follow must also be modified. */
|
114 |
|
115 |
static const pcre_uint8 coptable[] = {
|
116 |
0, /* End */
|
117 |
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
|
118 |
0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
|
119 |
0, 0, 0, /* Any, AllAny, Anybyte */
|
120 |
0, 0, /* \P, \p */
|
121 |
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
|
122 |
0, /* \X */
|
123 |
0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
|
124 |
1, /* Char */
|
125 |
1, /* Chari */
|
126 |
1, /* not */
|
127 |
1, /* noti */
|
128 |
/* Positive single-char repeats */
|
129 |
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
|
130 |
1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
|
131 |
1+IMM2_SIZE, /* exact */
|
132 |
1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
|
133 |
1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
|
134 |
1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
|
135 |
1+IMM2_SIZE, /* exact I */
|
136 |
1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
|
137 |
/* Negative single-char repeats - only for chars < 256 */
|
138 |
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
|
139 |
1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
|
140 |
1+IMM2_SIZE, /* NOT exact */
|
141 |
1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
|
142 |
1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
|
143 |
1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
|
144 |
1+IMM2_SIZE, /* NOT exact I */
|
145 |
1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
|
146 |
/* Positive type repeats */
|
147 |
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
|
148 |
1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
|
149 |
1+IMM2_SIZE, /* Type exact */
|
150 |
1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
|
151 |
/* Character class & ref repeats */
|
152 |
0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
|
153 |
0, 0, /* CRRANGE, CRMINRANGE */
|
154 |
0, /* CLASS */
|
155 |
0, /* NCLASS */
|
156 |
0, /* XCLASS - variable length */
|
157 |
0, /* REF */
|
158 |
0, /* REFI */
|
159 |
0, /* RECURSE */
|
160 |
0, /* CALLOUT */
|
161 |
0, /* Alt */
|
162 |
0, /* Ket */
|
163 |
0, /* KetRmax */
|
164 |
0, /* KetRmin */
|
165 |
0, /* KetRpos */
|
166 |
0, /* Reverse */
|
167 |
0, /* Assert */
|
168 |
0, /* Assert not */
|
169 |
0, /* Assert behind */
|
170 |
0, /* Assert behind not */
|
171 |
0, 0, /* ONCE, ONCE_NC */
|
172 |
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
|
173 |
0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
|
174 |
0, 0, /* CREF, NCREF */
|
175 |
0, 0, /* RREF, NRREF */
|
176 |
0, /* DEF */
|
177 |
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
|
178 |
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
|
179 |
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
|
180 |
0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
|
181 |
0, 0 /* CLOSE, SKIPZERO */
|
182 |
};
|
183 |
|
184 |
/* This table identifies those opcodes that inspect a character. It is used to
|
185 |
remember the fact that a character could have been inspected when the end of
|
186 |
the subject is reached. ***NOTE*** If the start of this table is modified, the
|
187 |
two tables that follow must also be modified. */
|
188 |
|
189 |
static const pcre_uint8 poptable[] = {
  /* Anchors and simple assertions: only \B and \b look at a character */
  0,                                      /* End */
  0, 0, 0, 1, 1,                          /* \A, \G, \K, \B, \b */

  /* Character types */
  1, 1, 1, 1, 1, 1,                       /* \D, \d, \S, \s, \W, \w */
  1, 1, 1,                                /* Any, AllAny, Anybyte */
  1, 1,                                   /* \P, \p */
  1, 1, 1, 1, 1,                          /* \R, \H, \h, \V, \v */
  1,                                      /* \X */
  0, 0, 0, 0, 0, 0,                       /* \Z, \z, ^, ^M, $, $M */

  /* Single characters */
  1, 1,                                   /* Char, Chari */
  1, 1,                                   /* not, noti */

  /* Positive single-char repeats */
  1, 1, 1, 1, 1, 1,                       /* *, *?, +, +?, ?, ?? */
  1, 1, 1,                                /* upto, minupto, exact */
  1, 1, 1, 1,                             /* *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,                       /* *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                                /* upto I, minupto I, exact I */
  1, 1, 1, 1,                             /* *+I, ++I, ?+I, upto+I */

  /* Negative single-char repeats - only for chars < 256 */
  1, 1, 1, 1, 1, 1,                       /* NOT *, *?, +, +?, ?, ?? */
  1, 1, 1,                                /* NOT upto, minupto, exact */
  1, 1, 1, 1,                             /* NOT *+, ++, ?+, upto+ */
  1, 1, 1, 1, 1, 1,                       /* NOT *I, *?I, +I, +?I, ?I, ??I */
  1, 1, 1,                                /* NOT upto I, minupto I, exact I */
  1, 1, 1, 1,                             /* NOT *+I, ++I, ?+I, upto+I */

  /* Positive type repeats */
  1, 1, 1, 1, 1, 1,                       /* Type *, *?, +, +?, ?, ?? */
  1, 1, 1,                                /* Type upto, minupto, exact */
  1, 1, 1, 1,                             /* Type *+, ++, ?+, upto+ */

  /* Character class & ref repeats */
  1, 1, 1, 1, 1, 1,                       /* *, *?, +, +?, ?, ?? */
  1, 1,                                   /* CRRANGE, CRMINRANGE */
  1, 1,                                   /* CLASS, NCLASS */
  1,                                      /* XCLASS - variable length */
  0, 0,                                   /* REF, REFI */
  0,                                      /* RECURSE */
  0,                                      /* CALLOUT */

  /* Brackets, assertions and other structural opcodes */
  0,                                      /* Alt */
  0, 0, 0, 0,                             /* Ket, KetRmax, KetRmin, KetRpos */
  0,                                      /* Reverse */
  0, 0,                                   /* Assert, Assert not */
  0, 0,                                   /* Assert behind, Assert behind not */
  0, 0,                                   /* ONCE, ONCE_NC */
  0, 0, 0, 0, 0,                          /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
  0, 0, 0, 0, 0,                          /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
  0, 0,                                   /* CREF, NCREF */
  0, 0,                                   /* RREF, NRREF */
  0,                                      /* DEF */
  0, 0, 0,                                /* BRAZERO, BRAMINZERO, BRAPOSZERO */

  /* Verbs */
  0, 0, 0,                                /* MARK, PRUNE, PRUNE_ARG */
  0, 0, 0, 0,                             /* SKIP, SKIP_ARG, THEN, THEN_ARG */
  0, 0, 0, 0,                             /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
  0, 0                                    /* CLOSE, SKIPZERO */
};
|
252 |
|
253 |
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
|
254 |
and \w */
|
255 |
|
256 |
static const pcre_uint8 toptable1[] = {
|
257 |
0, 0, 0, 0, 0, 0,
|
258 |
ctype_digit, ctype_digit,
|
259 |
ctype_space, ctype_space,
|
260 |
ctype_word, ctype_word,
|
261 |
0, 0 /* OP_ANY, OP_ALLANY */
|
262 |
};
|
263 |
|
264 |
static const pcre_uint8 toptable2[] = {
|
265 |
0, 0, 0, 0, 0, 0,
|
266 |
ctype_digit, 0,
|
267 |
ctype_space, 0,
|
268 |
ctype_word, 0,
|
269 |
1, 1 /* OP_ANY, OP_ALLANY */
|
270 |
};
|
271 |
|
272 |
|
273 |
/* Structure for holding data about a particular state, which is in effect the
|
274 |
current data for an active path through the match tree. It must consist
|
275 |
entirely of ints because the working vector we are passed, and which we put
|
276 |
these structures in, is a vector of ints. */
|
277 |
|
278 |
struct stateblock {
  int offset;     /* Offset of the state's opcode from the start of the code;
                     stored negated while the state is being held off */
  int count;      /* Repeat count for the state */
  int data;       /* Extra data used by some states, e.g. the number of
                     characters still to skip for a held-off state */
};

typedef struct stateblock stateblock;
|
283 |
|
284 |
/* Number of ints taken up by one stateblock in the int workspace vector. The
whole expansion is parenthesized so that it acts as a single operand wherever
it is used. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
|
285 |
|
286 |
|
287 |
#ifdef PCRE_DEBUG
|
288 |
/*************************************************
|
289 |
* Print character string *
|
290 |
*************************************************/
|
291 |
|
292 |
/* Character string printing function for debugging.
|
293 |
|
294 |
Arguments:
|
295 |
p points to string
|
296 |
length number of code units (pcre_uchar items) to print
|
297 |
f where to print
|
298 |
|
299 |
Returns: nothing
|
300 |
*/
|
301 |
|
302 |
static void
|
303 |
pchars(const pcre_uchar *p, int length, FILE *f)
|
304 |
{
|
305 |
pcre_uint32 c;
|
306 |
while (length-- > 0)
|
307 |
{
|
308 |
if (isprint(c = *(p++)))
|
309 |
fprintf(f, "%c", c);
|
310 |
else
|
311 |
fprintf(f, "\\x{%02x}", c);
|
312 |
}
|
313 |
}
|
314 |
#endif
|
315 |
|
316 |
|
317 |
|
318 |
/*************************************************
|
319 |
* Execute a Regular Expression - DFA engine *
|
320 |
*************************************************/
|
321 |
|
322 |
/* This internal function applies a compiled pattern to a subject string,
|
323 |
starting at a given point, using a DFA engine. This function is called from the
|
324 |
external one, possibly multiple times if the pattern is not anchored. The
|
325 |
function calls itself recursively for some kinds of subpattern.
|
326 |
|
327 |
Arguments:
|
328 |
md the match_data block with fixed information
|
329 |
this_start_code the opening bracket of this subexpression's code
|
330 |
current_subject where we currently are in the subject string
|
331 |
start_offset start offset in the subject string
|
332 |
offsets vector to contain the matching string offsets
|
333 |
offsetcount size of same
|
334 |
workspace vector of workspace
|
335 |
wscount size of same
|
336 |
rlevel function call recursion level
|
337 |
|
338 |
Returns: > 0 => number of match offset pairs placed in offsets
|
339 |
= 0 => offsets overflowed; longest matches are present
|
340 |
-1 => failed to match
|
341 |
< -1 => some kind of unexpected problem
|
342 |
|
343 |
The following macros are used for adding states to the two state vectors (one
|
344 |
for the current character, one for the following character). */
|
345 |
|
346 |
/* Append a state with opcode offset x and count y to the active list. When
the list is already full, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The data field of the new entry is not written. */

#define ADD_ACTIVE(x,y) \
  do \
  { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
  } \
  while (0)
|
355 |
|
356 |
/* Append a state with opcode offset x, count y and extra data z to the active
list. When the list is already full, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
  { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
  } \
  while (0)
|
366 |
|
367 |
/* Append a state with opcode offset x and count y to the list for the
following character. When the list is already full, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The data field of the new entry is not
written. */

#define ADD_NEW(x,y) \
  do \
  { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
  } \
  while (0)
|
376 |
|
377 |
/* Append a state with opcode offset x, count y and extra data z to the list
for the following character. When the list is already full, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. The debug trace also records the
source line of the call. */

#define ADD_NEW_DATA(x,y,z) \
  do \
  { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
  } \
  while (0)
|
388 |
|
389 |
/* And now, here is the code */
|
390 |
|
391 |
static int
|
392 |
internal_dfa_exec(
|
393 |
dfa_match_data *md,
|
394 |
const pcre_uchar *this_start_code,
|
395 |
const pcre_uchar *current_subject,
|
396 |
int start_offset,
|
397 |
int *offsets,
|
398 |
int offsetcount,
|
399 |
int *workspace,
|
400 |
int wscount,
|
401 |
int rlevel)
|
402 |
{
|
403 |
stateblock *active_states, *new_states, *temp_states;
|
404 |
stateblock *next_active_state, *next_new_state;
|
405 |
|
406 |
const pcre_uint8 *ctypes, *lcc, *fcc;
|
407 |
const pcre_uchar *ptr;
|
408 |
const pcre_uchar *end_code, *first_op;
|
409 |
|
410 |
dfa_recursion_info new_recursive;
|
411 |
|
412 |
int active_count, new_count, match_count;
|
413 |
|
414 |
/* Some fields in the md block are frequently referenced, so we load them into
|
415 |
independent variables in the hope that this will perform better. */
|
416 |
|
417 |
const pcre_uchar *start_subject = md->start_subject;
|
418 |
const pcre_uchar *end_subject = md->end_subject;
|
419 |
const pcre_uchar *start_code = md->start_code;
|
420 |
|
421 |
#ifdef SUPPORT_UTF
|
422 |
BOOL utf = (md->poptions & PCRE_UTF8) != 0;
|
423 |
#else
|
424 |
BOOL utf = FALSE;
|
425 |
#endif
|
426 |
|
427 |
BOOL reset_could_continue = FALSE;
|
428 |
|
429 |
rlevel++;
|
430 |
offsetcount &= (-2);
|
431 |
|
432 |
wscount -= 2;
|
433 |
wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
|
434 |
(2 * INTS_PER_STATEBLOCK);
|
435 |
|
436 |
DPRINTF(("\n%.*s---------------------\n"
|
437 |
"%.*sCall to internal_dfa_exec f=%d\n",
|
438 |
rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
|
439 |
|
440 |
ctypes = md->tables + ctypes_offset;
|
441 |
lcc = md->tables + lcc_offset;
|
442 |
fcc = md->tables + fcc_offset;
|
443 |
|
444 |
match_count = PCRE_ERROR_NOMATCH; /* A negative number */
|
445 |
|
446 |
active_states = (stateblock *)(workspace + 2);
|
447 |
next_new_state = new_states = active_states + wscount;
|
448 |
new_count = 0;
|
449 |
|
450 |
first_op = this_start_code + 1 + LINK_SIZE +
|
451 |
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
|
452 |
*this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
|
453 |
? IMM2_SIZE:0);
|
454 |
|
455 |
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
|
456 |
the alternative states onto the list, and find out where the end is. This
|
457 |
makes it possible to use this function recursively, when we want to stop at a
|
458 |
matching internal ket rather than at the end.
|
459 |
|
460 |
If the first opcode in the first alternative is OP_REVERSE, we are dealing with
|
461 |
a backward assertion. In that case, we have to find out the maximum amount to
|
462 |
move back, and set up each alternative appropriately. */
|
463 |
|
464 |
if (*first_op == OP_REVERSE)
|
465 |
{
|
466 |
int max_back = 0;
|
467 |
int gone_back;
|
468 |
|
469 |
end_code = this_start_code;
|
470 |
do
|
471 |
{
|
472 |
int back = GET(end_code, 2+LINK_SIZE);
|
473 |
if (back > max_back) max_back = back;
|
474 |
end_code += GET(end_code, 1);
|
475 |
}
|
476 |
while (*end_code == OP_ALT);
|
477 |
|
478 |
/* If we can't go back the amount required for the longest lookbehind
|
479 |
pattern, go back as far as we can; some alternatives may still be viable. */
|
480 |
|
481 |
#ifdef SUPPORT_UTF
|
482 |
/* In character mode we have to step back character by character */
|
483 |
|
484 |
if (utf)
|
485 |
{
|
486 |
for (gone_back = 0; gone_back < max_back; gone_back++)
|
487 |
{
|
488 |
if (current_subject <= start_subject) break;
|
489 |
current_subject--;
|
490 |
ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
|
491 |
}
|
492 |
}
|
493 |
else
|
494 |
#endif
|
495 |
|
496 |
/* In byte-mode we can do this quickly. */
|
497 |
|
498 |
{
|
499 |
gone_back = (current_subject - max_back < start_subject)?
|
500 |
(int)(current_subject - start_subject) : max_back;
|
501 |
current_subject -= gone_back;
|
502 |
}
|
503 |
|
504 |
/* Save the earliest consulted character */
|
505 |
|
506 |
if (current_subject < md->start_used_ptr)
|
507 |
md->start_used_ptr = current_subject;
|
508 |
|
509 |
/* Now we can process the individual branches. */
|
510 |
|
511 |
end_code = this_start_code;
|
512 |
do
|
513 |
{
|
514 |
int back = GET(end_code, 2+LINK_SIZE);
|
515 |
if (back <= gone_back)
|
516 |
{
|
517 |
int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
|
518 |
ADD_NEW_DATA(-bstate, 0, gone_back - back);
|
519 |
}
|
520 |
end_code += GET(end_code, 1);
|
521 |
}
|
522 |
while (*end_code == OP_ALT);
|
523 |
}
|
524 |
|
525 |
/* This is the code for a "normal" subpattern (not a backward assertion). The
|
526 |
start of a whole pattern is always one of these. If we are at the top level,
|
527 |
we may be asked to restart matching from the same point that we reached for a
|
528 |
previous partial match. We still have to scan through the top-level branches to
|
529 |
find the end state. */
|
530 |
|
531 |
else
|
532 |
{
|
533 |
end_code = this_start_code;
|
534 |
|
535 |
/* Restarting */
|
536 |
|
537 |
if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
|
538 |
{
|
539 |
do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
|
540 |
new_count = workspace[1];
|
541 |
if (!workspace[0])
|
542 |
memcpy(new_states, active_states, new_count * sizeof(stateblock));
|
543 |
}
|
544 |
|
545 |
/* Not restarting */
|
546 |
|
547 |
else
|
548 |
{
|
549 |
int length = 1 + LINK_SIZE +
|
550 |
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
|
551 |
*this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
|
552 |
? IMM2_SIZE:0);
|
553 |
do
|
554 |
{
|
555 |
ADD_NEW((int)(end_code - start_code + length), 0);
|
556 |
end_code += GET(end_code, 1);
|
557 |
length = 1 + LINK_SIZE;
|
558 |
}
|
559 |
while (*end_code == OP_ALT);
|
560 |
}
|
561 |
}
|
562 |
|
563 |
workspace[0] = 0; /* Bit indicating which vector is current */
|
564 |
|
565 |
DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
|
566 |
|
567 |
/* Loop for scanning the subject */
|
568 |
|
569 |
ptr = current_subject;
|
570 |
for (;;)
|
571 |
{
|
572 |
int i, j;
|
573 |
int clen, dlen;
|
574 |
pcre_uint32 c, d;
|
575 |
int forced_fail = 0;
|
576 |
BOOL partial_newline = FALSE;
|
577 |
BOOL could_continue = reset_could_continue;
|
578 |
reset_could_continue = FALSE;
|
579 |
|
580 |
/* Make the new state list into the active state list and empty the
|
581 |
new state list. */
|
582 |
|
583 |
temp_states = active_states;
|
584 |
active_states = new_states;
|
585 |
new_states = temp_states;
|
586 |
active_count = new_count;
|
587 |
new_count = 0;
|
588 |
|
589 |
workspace[0] ^= 1; /* Remember for the restarting feature */
|
590 |
workspace[1] = active_count;
|
591 |
|
592 |
#ifdef PCRE_DEBUG
|
593 |
printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
|
594 |
pchars(ptr, STRLEN_UC(ptr), stdout);
|
595 |
printf("\"\n");
|
596 |
|
597 |
printf("%.*sActive states: ", rlevel*2-2, SP);
|
598 |
for (i = 0; i < active_count; i++)
|
599 |
printf("%d/%d ", active_states[i].offset, active_states[i].count);
|
600 |
printf("\n");
|
601 |
#endif
|
602 |
|
603 |
/* Set the pointers for adding new states */
|
604 |
|
605 |
next_active_state = active_states + active_count;
|
606 |
next_new_state = new_states;
|
607 |
|
608 |
/* Load the current character from the subject outside the loop, as many
|
609 |
different states may want to look at it, and we assume that at least one
|
610 |
will. */
|
611 |
|
612 |
if (ptr < end_subject)
|
613 |
{
|
614 |
clen = 1; /* Number of data items in the character */
|
615 |
#ifdef SUPPORT_UTF
|
616 |
GETCHARLENTEST(c, ptr, clen);
|
617 |
#else
|
618 |
c = *ptr;
|
619 |
#endif /* SUPPORT_UTF */
|
620 |
}
|
621 |
else
|
622 |
{
|
623 |
clen = 0; /* This indicates the end of the subject */
|
624 |
c = NOTACHAR; /* This value should never actually be used */
|
625 |
}
|
626 |
|
627 |
/* Scan up the active states and act on each one. The result of an action
|
628 |
may be to add more states to the currently active list (e.g. on hitting a
|
629 |
parenthesis) or it may be to put states on the new list, for considering
|
630 |
when we move the character pointer on. */
|
631 |
|
632 |
for (i = 0; i < active_count; i++)
|
633 |
{
|
634 |
stateblock *current_state = active_states + i;
|
635 |
BOOL caseless = FALSE;
|
636 |
const pcre_uchar *code;
|
637 |
int state_offset = current_state->offset;
|
638 |
int codevalue, rrc;
|
639 |
unsigned int count;
|
640 |
|
641 |
#ifdef PCRE_DEBUG
|
642 |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
|
643 |
if (clen == 0) printf("EOL\n");
|
644 |
else if (c > 32 && c < 127) printf("'%c'\n", c);
|
645 |
else printf("0x%02x\n", c);
|
646 |
#endif
|
647 |
|
648 |
/* A negative offset is a special case meaning "hold off going to this
|
649 |
(negated) state until the number of characters in the data field have
|
650 |
been skipped". If the could_continue flag was passed over from a previous
|
651 |
state, arrange for it to be passed on. */
|
652 |
|
653 |
if (state_offset < 0)
|
654 |
{
|
655 |
if (current_state->data > 0)
|
656 |
{
|
657 |
DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
|
658 |
ADD_NEW_DATA(state_offset, current_state->count,
|
659 |
current_state->data - 1);
|
660 |
if (could_continue) reset_could_continue = TRUE;
|
661 |
continue;
|
662 |
}
|
663 |
else
|
664 |
{
|
665 |
current_state->offset = state_offset = -state_offset;
|
666 |
}
|
667 |
}
|
668 |
|
669 |
/* Check for a duplicate state with the same count, and skip if found.
|
670 |
See the note at the head of this module about the possibility of improving
|
671 |
performance here. */
|
672 |
|
673 |
for (j = 0; j < i; j++)
|
674 |
{
|
675 |
if (active_states[j].offset == state_offset &&
|
676 |
active_states[j].count == current_state->count)
|
677 |
{
|
678 |
DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
|
679 |
goto NEXT_ACTIVE_STATE;
|
680 |
}
|
681 |
}
|
682 |
|
683 |
/* The state offset is the offset to the opcode */
|
684 |
|
685 |
code = start_code + state_offset;
|
686 |
codevalue = *code;
|
687 |
|
688 |
/* If this opcode inspects a character, but we are at the end of the
|
689 |
subject, remember the fact for use when testing for a partial match. */
|
690 |
|
691 |
if (clen == 0 && poptable[codevalue] != 0)
|
692 |
could_continue = TRUE;
|
693 |
|
694 |
/* If this opcode is followed by an inline character, load it. It is
|
695 |
tempting to test for the presence of a subject character here, but that
|
696 |
is wrong, because sometimes zero repetitions of the subject are
|
697 |
permitted.
|
698 |
|
699 |
We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
|
700 |
argument that is not a data character - but is always one byte long because
|
701 |
the values are small. We have to take special action to deal with \P, \p,
|
702 |
\H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
|
703 |
these ones to new opcodes. */
|
704 |
|
705 |
if (coptable[codevalue] > 0)
|
706 |
{
|
707 |
dlen = 1;
|
708 |
#ifdef SUPPORT_UTF
|
709 |
if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
|
710 |
#endif /* SUPPORT_UTF */
|
711 |
d = code[coptable[codevalue]];
|
712 |
if (codevalue >= OP_TYPESTAR)
|
713 |
{
|
714 |
switch(d)
|
715 |
{
|
716 |
case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
|
717 |
case OP_NOTPROP:
|
718 |
case OP_PROP: codevalue += OP_PROP_EXTRA; break;
|
719 |
case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
|
720 |
case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
|
721 |
case OP_NOT_HSPACE:
|
722 |
case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
|
723 |
case OP_NOT_VSPACE:
|
724 |
case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
|
725 |
default: break;
|
726 |
}
|
727 |
}
|
728 |
}
|
729 |
else
|
730 |
{
|
731 |
dlen = 0; /* Not strictly necessary, but compilers moan */
|
732 |
d = NOTACHAR; /* if these variables are not set. */
|
733 |
}
|
734 |
|
735 |
|
736 |
/* Now process the individual opcodes */
|
737 |
|
738 |
switch (codevalue)
|
739 |
{
|
740 |
/* ========================================================================== */
|
741 |
/* These cases are never obeyed. This is a fudge that causes a compile-
|
742 |
time error if the vectors coptable or poptable, which are indexed by
|
743 |
opcode, are not the correct length. It seems to be the only way to do
|
744 |
such a check at compile time, as the sizeof() operator does not work
|
745 |
in the C preprocessor. */
|
746 |
|
747 |
case OP_TABLE_LENGTH:
|
748 |
case OP_TABLE_LENGTH +
|
749 |
((sizeof(coptable) == OP_TABLE_LENGTH) &&
|
750 |
(sizeof(poptable) == OP_TABLE_LENGTH)):
|
751 |
break;
|
752 |
|
753 |
/* ========================================================================== */
|
754 |
/* Reached a closing bracket. If not at the end of the pattern, carry
|
755 |
on with the next opcode. For repeating opcodes, also add the repeat
|
756 |
state. Note that KETRPOS will always be encountered at the end of the
|
757 |
subpattern, because the possessive subpattern repeats are always handled
|
758 |
using recursive calls. Thus, it never adds any new states.
|
759 |
|
760 |
At the end of the (sub)pattern, unless we have an empty string and
|
761 |
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
|
762 |
start of the subject, save the match data, shifting up all previous
|
763 |
matches so we always have the longest first. */
|
764 |
|
765 |
case OP_KET:
|
766 |
case OP_KETRMIN:
|
767 |
case OP_KETRMAX:
|
768 |
case OP_KETRPOS:
|
769 |
if (code != end_code)
|
770 |
{
|
771 |
ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
|
772 |
if (codevalue != OP_KET)
|
773 |
{
|
774 |
ADD_ACTIVE(state_offset - GET(code, 1), 0);
|
775 |
}
|
776 |
}
|
777 |
else
|
778 |
{
|
779 |
if (ptr > current_subject ||
|
780 |
((md->moptions & PCRE_NOTEMPTY) == 0 &&
|
781 |
((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
|
782 |
current_subject > start_subject + md->start_offset)))
|
783 |
{
|
784 |
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
|
785 |
else if (match_count > 0 && ++match_count * 2 > offsetcount)
|
786 |
match_count = 0;
|
787 |
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
|
788 |
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
|
789 |
if (offsetcount >= 2)
|
790 |
{
|
791 |
offsets[0] = (int)(current_subject - start_subject);
|
792 |
offsets[1] = (int)(ptr - start_subject);
|
793 |
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
|
794 |
offsets[1] - offsets[0], (char *)current_subject));
|
795 |
}
|
796 |
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
|
797 |
{
|
798 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
|
799 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
|
800 |
match_count, rlevel*2-2, SP));
|
801 |
return match_count;
|
802 |
}
|
803 |
}
|
804 |
}
|
805 |
break;
|
806 |
|
807 |
/* ========================================================================== */
|
808 |
/* These opcodes add to the current list of states without looking
|
809 |
at the current character. */
|
810 |
|
811 |
/*-----------------------------------------------------------------*/
|
812 |
case OP_ALT:
|
813 |
do { code += GET(code, 1); } while (*code == OP_ALT);
|
814 |
ADD_ACTIVE((int)(code - start_code), 0);
|
815 |
break;
|
816 |
|
817 |
/*-----------------------------------------------------------------*/
|
818 |
case OP_BRA:
|
819 |
case OP_SBRA:
|
820 |
do
|
821 |
{
|
822 |
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
|
823 |
code += GET(code, 1);
|
824 |
}
|
825 |
while (*code == OP_ALT);
|
826 |
break;
|
827 |
|
828 |
/*-----------------------------------------------------------------*/
|
829 |
case OP_CBRA:
|
830 |
case OP_SCBRA:
|
831 |
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
|
832 |
code += GET(code, 1);
|
833 |
while (*code == OP_ALT)
|
834 |
{
|
835 |
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
|
836 |
code += GET(code, 1);
|
837 |
}
|
838 |
break;
|
839 |
|
840 |
/*-----------------------------------------------------------------*/
|
841 |
case OP_BRAZERO:
|
842 |
case OP_BRAMINZERO:
|
843 |
ADD_ACTIVE(state_offset + 1, 0);
|
844 |
code += 1 + GET(code, 2);
|
845 |
while (*code == OP_ALT) code += GET(code, 1);
|
846 |
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
|
847 |
break;
|
848 |
|
849 |
/*-----------------------------------------------------------------*/
|
850 |
case OP_SKIPZERO:
|
851 |
code += 1 + GET(code, 2);
|
852 |
while (*code == OP_ALT) code += GET(code, 1);
|
853 |
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
|
854 |
break;
|
855 |
|
856 |
/*-----------------------------------------------------------------*/
|
857 |
case OP_CIRC:
|
858 |
if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
|
859 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
860 |
break;
|
861 |
|
862 |
/*-----------------------------------------------------------------*/
|
863 |
case OP_CIRCM:
|
864 |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
|
865 |
(ptr != end_subject && WAS_NEWLINE(ptr)))
|
866 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
867 |
break;
|
868 |
|
869 |
/*-----------------------------------------------------------------*/
|
870 |
case OP_EOD:
|
871 |
if (ptr >= end_subject)
|
872 |
{
|
873 |
if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
|
874 |
could_continue = TRUE;
|
875 |
else { ADD_ACTIVE(state_offset + 1, 0); }
|
876 |
}
|
877 |
break;
|
878 |
|
879 |
/*-----------------------------------------------------------------*/
|
880 |
case OP_SOD:
|
881 |
if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
|
882 |
break;
|
883 |
|
884 |
/*-----------------------------------------------------------------*/
|
885 |
case OP_SOM:
|
886 |
if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
|
887 |
break;
|
888 |
|
889 |
|
890 |
/* ========================================================================== */
|
891 |
/* These opcodes inspect the next subject character, and sometimes
|
892 |
the previous one as well, but do not have an argument. The variable
|
893 |
clen contains the length of the current character and is zero if we are
|
894 |
at the end of the subject. */
|
895 |
|
896 |
/*-----------------------------------------------------------------*/
|
897 |
case OP_ANY:
|
898 |
if (clen > 0 && !IS_NEWLINE(ptr))
|
899 |
{
|
900 |
if (ptr + 1 >= md->end_subject &&
|
901 |
(md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
|
902 |
NLBLOCK->nltype == NLTYPE_FIXED &&
|
903 |
NLBLOCK->nllen == 2 &&
|
904 |
c == NLBLOCK->nl[0])
|
905 |
{
|
906 |
could_continue = partial_newline = TRUE;
|
907 |
}
|
908 |
else
|
909 |
{
|
910 |
ADD_NEW(state_offset + 1, 0);
|
911 |
}
|
912 |
}
|
913 |
break;
|
914 |
|
915 |
/*-----------------------------------------------------------------*/
|
916 |
case OP_ALLANY:
|
917 |
if (clen > 0)
|
918 |
{ ADD_NEW(state_offset + 1, 0); }
|
919 |
break;
|
920 |
|
921 |
/*-----------------------------------------------------------------*/
|
922 |
case OP_EODN:
|
923 |
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
|
924 |
could_continue = TRUE;
|
925 |
else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
|
926 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
927 |
break;
|
928 |
|
929 |
/*-----------------------------------------------------------------*/
|
930 |
case OP_DOLL:
|
931 |
if ((md->moptions & PCRE_NOTEOL) == 0)
|
932 |
{
|
933 |
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
|
934 |
could_continue = TRUE;
|
935 |
else if (clen == 0 ||
|
936 |
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
|
937 |
(ptr == end_subject - md->nllen)
|
938 |
))
|
939 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
940 |
else if (ptr + 1 >= md->end_subject &&
|
941 |
(md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
|
942 |
NLBLOCK->nltype == NLTYPE_FIXED &&
|
943 |
NLBLOCK->nllen == 2 &&
|
944 |
c == NLBLOCK->nl[0])
|
945 |
{
|
946 |
if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
|
947 |
{
|
948 |
reset_could_continue = TRUE;
|
949 |
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
|
950 |
}
|
951 |
else could_continue = partial_newline = TRUE;
|
952 |
}
|
953 |
}
|
954 |
break;
|
955 |
|
956 |
/*-----------------------------------------------------------------*/
|
957 |
case OP_DOLLM:
|
958 |
if ((md->moptions & PCRE_NOTEOL) == 0)
|
959 |
{
|
960 |
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
|
961 |
could_continue = TRUE;
|
962 |
else if (clen == 0 ||
|
963 |
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
|
964 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
965 |
else if (ptr + 1 >= md->end_subject &&
|
966 |
(md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
|
967 |
NLBLOCK->nltype == NLTYPE_FIXED &&
|
968 |
NLBLOCK->nllen == 2 &&
|
969 |
c == NLBLOCK->nl[0])
|
970 |
{
|
971 |
if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
|
972 |
{
|
973 |
reset_could_continue = TRUE;
|
974 |
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
|
975 |
}
|
976 |
else could_continue = partial_newline = TRUE;
|
977 |
}
|
978 |
}
|
979 |
else if (IS_NEWLINE(ptr))
|
980 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
981 |
break;
|
982 |
|
983 |
/*-----------------------------------------------------------------*/
|
984 |
|
985 |
case OP_DIGIT:
|
986 |
case OP_WHITESPACE:
|
987 |
case OP_WORDCHAR:
|
988 |
if (clen > 0 && c < 256 &&
|
989 |
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
|
990 |
{ ADD_NEW(state_offset + 1, 0); }
|
991 |
break;
|
992 |
|
993 |
/*-----------------------------------------------------------------*/
|
994 |
case OP_NOT_DIGIT:
|
995 |
case OP_NOT_WHITESPACE:
|
996 |
case OP_NOT_WORDCHAR:
|
997 |
if (clen > 0 && (c >= 256 ||
|
998 |
((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
|
999 |
{ ADD_NEW(state_offset + 1, 0); }
|
1000 |
break;
|
1001 |
|
1002 |
/*-----------------------------------------------------------------*/
|
1003 |
case OP_WORD_BOUNDARY:
|
1004 |
case OP_NOT_WORD_BOUNDARY:
|
1005 |
{
|
1006 |
int left_word, right_word;
|
1007 |
|
1008 |
if (ptr > start_subject)
|
1009 |
{
|
1010 |
const pcre_uchar *temp = ptr - 1;
|
1011 |
if (temp < md->start_used_ptr) md->start_used_ptr = temp;
|
1012 |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
|
1013 |
if (utf) { BACKCHAR(temp); }
|
1014 |
#endif
|
1015 |
GETCHARTEST(d, temp);
|
1016 |
#ifdef SUPPORT_UCP
|
1017 |
if ((md->poptions & PCRE_UCP) != 0)
|
1018 |
{
|
1019 |
if (d == '_') left_word = TRUE; else
|
1020 |
{
|
1021 |
int cat = UCD_CATEGORY(d);
|
1022 |
left_word = (cat == ucp_L || cat == ucp_N);
|
1023 |
}
|
1024 |
}
|
1025 |
else
|
1026 |
#endif
|
1027 |
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
|
1028 |
}
|
1029 |
else left_word = FALSE;
|
1030 |
|
1031 |
if (clen > 0)
|
1032 |
{
|
1033 |
#ifdef SUPPORT_UCP
|
1034 |
if ((md->poptions & PCRE_UCP) != 0)
|
1035 |
{
|
1036 |
if (c == '_') right_word = TRUE; else
|
1037 |
{
|
1038 |
int cat = UCD_CATEGORY(c);
|
1039 |
right_word = (cat == ucp_L || cat == ucp_N);
|
1040 |
}
|
1041 |
}
|
1042 |
else
|
1043 |
#endif
|
1044 |
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
|
1045 |
}
|
1046 |
else right_word = FALSE;
|
1047 |
|
1048 |
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
|
1049 |
{ ADD_ACTIVE(state_offset + 1, 0); }
|
1050 |
}
|
1051 |
break;
|
1052 |
|
1053 |
|
1054 |
/*-----------------------------------------------------------------*/
|
1055 |
/* Check the next character by Unicode property. We will get here only
|
1056 |
if the support is in the binary; otherwise a compile-time error occurs.
|
1057 |
*/
|
1058 |
|
1059 |
#ifdef SUPPORT_UCP
|
1060 |
case OP_PROP:
|
1061 |
case OP_NOTPROP:
|
1062 |
if (clen > 0)
|
1063 |
{
|
1064 |
BOOL OK;
|
1065 |
const pcre_uint32 *cp;
|
1066 |
const ucd_record * prop = GET_UCD(c);
|
1067 |
switch(code[1])
|
1068 |
{
|
1069 |
case PT_ANY:
|
1070 |
OK = TRUE;
|
1071 |
break;
|
1072 |
|
1073 |
case PT_LAMP:
|
1074 |
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
1075 |
prop->chartype == ucp_Lt;
|
1076 |
break;
|
1077 |
|
1078 |
case PT_GC:
|
1079 |
OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
|
1080 |
break;
|
1081 |
|
1082 |
case PT_PC:
|
1083 |
OK = prop->chartype == code[2];
|
1084 |
break;
|
1085 |
|
1086 |
case PT_SC:
|
1087 |
OK = prop->script == code[2];
|
1088 |
break;
|
1089 |
|
1090 |
/* These are specials for combination cases. */
|
1091 |
|
1092 |
case PT_ALNUM:
|
1093 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
1094 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
1095 |
break;
|
1096 |
|
1097 |
case PT_SPACE: /* Perl space */
|
1098 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
1099 |
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
1100 |
break;
|
1101 |
|
1102 |
case PT_PXSPACE: /* POSIX space */
|
1103 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
1104 |
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
1105 |
c == CHAR_FF || c == CHAR_CR;
|
1106 |
break;
|
1107 |
|
1108 |
case PT_WORD:
|
1109 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
1110 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
1111 |
c == CHAR_UNDERSCORE;
|
1112 |
break;
|
1113 |
|
1114 |
case PT_CLIST:
|
1115 |
cp = PRIV(ucd_caseless_sets) + code[2];
|
1116 |
for (;;)
|
1117 |
{
|
1118 |
if (c < *cp) { OK = FALSE; break; }
|
1119 |
if (c == *cp++) { OK = TRUE; break; }
|
1120 |
}
|
1121 |
break;
|
1122 |
|
1123 |
case PT_UCNC:
|
1124 |
OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
|
1125 |
c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
|
1126 |
c >= 0xe000;
|
1127 |
break;
|
1128 |
|
1129 |
/* Should never occur, but keep compilers from grumbling. */
|
1130 |
|
1131 |
default:
|
1132 |
OK = codevalue != OP_PROP;
|
1133 |
break;
|
1134 |
}
|
1135 |
|
1136 |
if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
|
1137 |
}
|
1138 |
break;
|
1139 |
#endif
|
1140 |
|
1141 |
|
1142 |
|
1143 |
/* ========================================================================== */
|
1144 |
/* These opcodes likewise inspect the subject character, but have an
|
1145 |
argument that is not a data character. It is one of these opcodes:
|
1146 |
OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
|
1147 |
OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
|
1148 |
|
1149 |
case OP_TYPEPLUS:
|
1150 |
case OP_TYPEMINPLUS:
|
1151 |
case OP_TYPEPOSPLUS:
|
1152 |
count = current_state->count; /* Already matched */
|
1153 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1154 |
if (clen > 0)
|
1155 |
{
|
1156 |
if (d == OP_ANY && ptr + 1 >= md->end_subject &&
|
1157 |
(md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
|
1158 |
NLBLOCK->nltype == NLTYPE_FIXED &&
|
1159 |
NLBLOCK->nllen == 2 &&
|
1160 |
c == NLBLOCK->nl[0])
|
1161 |
{
|
1162 |
could_continue = partial_newline = TRUE;
|
1163 |
}
|
1164 |
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
1165 |
(c < 256 &&
|
1166 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
1167 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
1168 |
{
|
1169 |
if (count > 0 && codevalue == OP_TYPEPOSPLUS)
|
1170 |
{
|
1171 |
active_count--; /* Remove non-match possibility */
|
1172 |
next_active_state--;
|
1173 |
}
|
1174 |
count++;
|
1175 |
ADD_NEW(state_offset, count);
|
1176 |
}
|
1177 |
}
|
1178 |
break;
|
1179 |
|
1180 |
/*-----------------------------------------------------------------*/
|
1181 |
case OP_TYPEQUERY:
|
1182 |
case OP_TYPEMINQUERY:
|
1183 |
case OP_TYPEPOSQUERY:
|
1184 |
ADD_ACTIVE(state_offset + 2, 0);
|
1185 |
if (clen > 0)
|
1186 |
{
|
1187 |
if (d == OP_ANY && ptr + 1 >= md->end_subject &&
|
1188 |
(md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
|
1189 |
NLBLOCK->nltype == NLTYPE_FIXED &&
|
1190 |
NLBLOCK->nllen == 2 &&
|
1191 |
c == NLBLOCK->nl[0])
|
1192 |
{
|
1193 |
could_continue = partial_newline = TRUE;
|
1194 |
}
|
1195 |
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
1196 |
(c < 256 &&
|
1197 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
1198 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
1199 |
{
|
1200 |
if (codevalue == OP_TYPEPOSQUERY)
|
1201 |
{
|
1202 |
active_count--; /* Remove non-match possibility */
|
1203 |
next_active_state--;
|
1204 |
}
|
1205 |
ADD_NEW(state_offset + 2, 0);
|
1206 |
}
|
1207 |
}
|
1208 |
break;
|
1209 |
|
1210 |
/*-----------------------------------------------------------------*/
|
1211 |
case OP_TYPESTAR:
|
1212 |
case OP_TYPEMINSTAR:
|
1213 |
case OP_TYPEPOSSTAR:
|
1214 |
ADD_ACTIVE(state_offset + 2, 0);
|
1215 |
if (clen > 0)
|
1216 |
{
|
1217 |
if (d == OP_ANY && ptr + 1 >= md->end_subject &&
|
1218 |
(md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
|
1219 |
NLBLOCK->nltype == NLTYPE_FIXED &&
|
1220 |
NLBLOCK->nllen == 2 &&
|
1221 |
c == NLBLOCK->nl[0])
|
1222 |
{
|
1223 |
could_continue = partial_newline = TRUE;
|
1224 |
}
|
1225 |
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
1226 |
(c < 256 &&
|
1227 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
1228 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
1229 |
{
|
1230 |
if (codevalue == OP_TYPEPOSSTAR)
|
1231 |
{
|
1232 |
active_count--; /* Remove non-match possibility */
|
1233 |
next_active_state--;
|
1234 |
}
|
1235 |
ADD_NEW(state_offset, 0);
|
1236 |
}
|
1237 |
}
|
1238 |
break;
|
1239 |
|
1240 |
/*-----------------------------------------------------------------*/
|
1241 |
case OP_TYPEEXACT:
|
1242 |
count = current_state->count; /* Number already matched */
|
1243 |
if (clen > 0)
|
1244 |
{
|
1245 |
if (d == OP_ANY && ptr + 1 >= md->end_subject &&
|
1246 |
(md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
|
1247 |
NLBLOCK->nltype == NLTYPE_FIXED &&
|
1248 |
NLBLOCK->nllen == 2 &&
|
1249 |
c == NLBLOCK->nl[0])
|
1250 |
{
|
1251 |
could_continue = partial_newline = TRUE;
|
1252 |
}
|
1253 |
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
1254 |
(c < 256 &&
|
1255 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
1256 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
1257 |
{
|
1258 |
if (++count >= GET2(code, 1))
|
1259 |
{ ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
|
1260 |
else
|
1261 |
{ ADD_NEW(state_offset, count); }
|
1262 |
}
|
1263 |
}
|
1264 |
break;
|
1265 |
|
1266 |
/*-----------------------------------------------------------------*/
|
1267 |
case OP_TYPEUPTO:
|
1268 |
case OP_TYPEMINUPTO:
|
1269 |
case OP_TYPEPOSUPTO:
|
1270 |
ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
|
1271 |
count = current_state->count; /* Number already matched */
|
1272 |
if (clen > 0)
|
1273 |
{
|
1274 |
if (d == OP_ANY && ptr + 1 >= md->end_subject &&
|
1275 |
(md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
|
1276 |
NLBLOCK->nltype == NLTYPE_FIXED &&
|
1277 |
NLBLOCK->nllen == 2 &&
|
1278 |
c == NLBLOCK->nl[0])
|
1279 |
{
|
1280 |
could_continue = partial_newline = TRUE;
|
1281 |
}
|
1282 |
else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
|
1283 |
(c < 256 &&
|
1284 |
(d != OP_ANY || !IS_NEWLINE(ptr)) &&
|
1285 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
|
1286 |
{
|
1287 |
if (codevalue == OP_TYPEPOSUPTO)
|
1288 |
{
|
1289 |
active_count--; /* Remove non-match possibility */
|
1290 |
next_active_state--;
|
1291 |
}
|
1292 |
if (++count >= GET2(code, 1))
|
1293 |
{ ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
|
1294 |
else
|
1295 |
{ ADD_NEW(state_offset, count); }
|
1296 |
}
|
1297 |
}
|
1298 |
break;
|
1299 |
|
1300 |
/* ========================================================================== */
|
1301 |
/* These are virtual opcodes that are used when something like
|
1302 |
OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or an H/V-space type as its
|
1303 |
argument. It keeps the code above fast for the other cases. The argument
|
1304 |
is in the d variable. */
|
1305 |
|
1306 |
#ifdef SUPPORT_UCP
|
1307 |
case OP_PROP_EXTRA + OP_TYPEPLUS:
|
1308 |
case OP_PROP_EXTRA + OP_TYPEMINPLUS:
|
1309 |
case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
|
1310 |
count = current_state->count; /* Already matched */
|
1311 |
if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
|
1312 |
if (clen > 0)
|
1313 |
{
|
1314 |
BOOL OK;
|
1315 |
const pcre_uint32 *cp;
|
1316 |
const ucd_record * prop = GET_UCD(c);
|
1317 |
switch(code[2])
|
1318 |
{
|
1319 |
case PT_ANY:
|
1320 |
OK = TRUE;
|
1321 |
break;
|
1322 |
|
1323 |
case PT_LAMP:
|
1324 |
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
1325 |
prop->chartype == ucp_Lt;
|
1326 |
break;
|
1327 |
|
1328 |
case PT_GC:
|
1329 |
OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
|
1330 |
break;
|
1331 |
|
1332 |
case PT_PC:
|
1333 |
OK = prop->chartype == code[3];
|
1334 |
break;
|
1335 |
|
1336 |
case PT_SC:
|
1337 |
OK = prop->script == code[3];
|
1338 |
break;
|
1339 |
|
1340 |
/* These are specials for combination cases. */
|
1341 |
|
1342 |
case PT_ALNUM:
|
1343 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
1344 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
1345 |
break;
|
1346 |
|
1347 |
case PT_SPACE: /* Perl space */
|
1348 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
1349 |
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
1350 |
break;
|
1351 |
|
1352 |
case PT_PXSPACE: /* POSIX space */
|
1353 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
1354 |
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
1355 |
c == CHAR_FF || c == CHAR_CR;
|
1356 |
break;
|
1357 |
|
1358 |
case PT_WORD:
|
1359 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
1360 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
1361 |
c == CHAR_UNDERSCORE;
|
1362 |
break;
|
1363 |
|
1364 |
case PT_CLIST:
|
1365 |
cp = PRIV(ucd_caseless_sets) + code[3];
|
1366 |
for (;;)
|
1367 |
{
|
1368 |
if (c < *cp) { OK = FALSE; break; }
|
1369 |
if (c == *cp++) { OK = TRUE; break; }
|
1370 |
}
|
1371 |
break;
|
1372 |
|
1373 |
case PT_UCNC:
|
1374 |
OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
|
1375 |
c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
|
1376 |
c >= 0xe000;
|
1377 |
break;
|
1378 |
|
1379 |
/* Should never occur, but keep compilers from grumbling. */
|
1380 |
|
1381 |
default:
|
1382 |
OK = codevalue != OP_PROP;
|
1383 |
break;
|
1384 |
}
|
1385 |
|
1386 |
if (OK == (d == OP_PROP))
|
1387 |
{
|
1388 |
if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
|
1389 |
{
|
1390 |
active_count--; /* Remove non-match possibility */
|
1391 |
next_active_state--;
|
1392 |
}
|
1393 |
count++;
|
1394 |
ADD_NEW(state_offset, count);
|
1395 |
}
|
1396 |
}
|
1397 |
break;
|
1398 |
|
1399 |
/*-----------------------------------------------------------------*/
|
1400 |
case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
|
1401 |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
|
1402 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
|
1403 |
count = current_state->count; /* Already matched */
|
1404 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1405 |
if (clen > 0)
|
1406 |
{
|
1407 |
int lgb, rgb;
|
1408 |
const pcre_uchar *nptr = ptr + clen;
|
1409 |
int ncount = 0;
|
1410 |
if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
|
1411 |
{
|
1412 |
active_count--; /* Remove non-match possibility */
|
1413 |
next_active_state--;
|
1414 |
}
|
1415 |
lgb = UCD_GRAPHBREAK(c);
|
1416 |
while (nptr < end_subject)
|
1417 |
{
|
1418 |
dlen = 1;
|
1419 |
if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
|
1420 |
rgb = UCD_GRAPHBREAK(d);
|
1421 |
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
|
1422 |
ncount++;
|
1423 |
lgb = rgb;
|
1424 |
nptr += dlen;
|
1425 |
}
|
1426 |
count++;
|
1427 |
ADD_NEW_DATA(-state_offset, count, ncount);
|
1428 |
}
|
1429 |
break;
|
1430 |
#endif
|
1431 |
|
1432 |
/*-----------------------------------------------------------------*/
|
1433 |
case OP_ANYNL_EXTRA + OP_TYPEPLUS:
|
1434 |
case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
|
1435 |
case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
|
1436 |
count = current_state->count; /* Already matched */
|
1437 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1438 |
if (clen > 0)
|
1439 |
{
|
1440 |
int ncount = 0;
|
1441 |
switch (c)
|
1442 |
{
|
1443 |
case CHAR_VT:
|
1444 |
case CHAR_FF:
|
1445 |
case CHAR_NEL:
|
1446 |
#ifndef EBCDIC
|
1447 |
case 0x2028:
|
1448 |
case 0x2029:
|
1449 |
#endif /* Not EBCDIC */
|
1450 |
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
|
1451 |
goto ANYNL01;
|
1452 |
|
1453 |
case CHAR_CR:
|
1454 |
if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
|
1455 |
/* Fall through */
|
1456 |
|
1457 |
ANYNL01:
|
1458 |
case CHAR_LF:
|
1459 |
if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
|
1460 |
{
|
1461 |
active_count--; /* Remove non-match possibility */
|
1462 |
next_active_state--;
|
1463 |
}
|
1464 |
count++;
|
1465 |
ADD_NEW_DATA(-state_offset, count, ncount);
|
1466 |
break;
|
1467 |
|
1468 |
default:
|
1469 |
break;
|
1470 |
}
|
1471 |
}
|
1472 |
break;
|
1473 |
|
1474 |
/*-----------------------------------------------------------------*/
|
1475 |
case OP_VSPACE_EXTRA + OP_TYPEPLUS:
|
1476 |
case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
|
1477 |
case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
|
1478 |
count = current_state->count; /* Already matched */
|
1479 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1480 |
if (clen > 0)
|
1481 |
{
|
1482 |
BOOL OK;
|
1483 |
switch (c)
|
1484 |
{
|
1485 |
VSPACE_CASES:
|
1486 |
OK = TRUE;
|
1487 |
break;
|
1488 |
|
1489 |
default:
|
1490 |
OK = FALSE;
|
1491 |
break;
|
1492 |
}
|
1493 |
|
1494 |
if (OK == (d == OP_VSPACE))
|
1495 |
{
|
1496 |
if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
|
1497 |
{
|
1498 |
active_count--; /* Remove non-match possibility */
|
1499 |
next_active_state--;
|
1500 |
}
|
1501 |
count++;
|
1502 |
ADD_NEW_DATA(-state_offset, count, 0);
|
1503 |
}
|
1504 |
}
|
1505 |
break;
|
1506 |
|
1507 |
/*-----------------------------------------------------------------*/
|
1508 |
case OP_HSPACE_EXTRA + OP_TYPEPLUS:
|
1509 |
case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
|
1510 |
case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
|
1511 |
count = current_state->count; /* Already matched */
|
1512 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
|
1513 |
if (clen > 0)
|
1514 |
{
|
1515 |
BOOL OK;
|
1516 |
switch (c)
|
1517 |
{
|
1518 |
HSPACE_CASES:
|
1519 |
OK = TRUE;
|
1520 |
break;
|
1521 |
|
1522 |
default:
|
1523 |
OK = FALSE;
|
1524 |
break;
|
1525 |
}
|
1526 |
|
1527 |
if (OK == (d == OP_HSPACE))
|
1528 |
{
|
1529 |
if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
|
1530 |
{
|
1531 |
active_count--; /* Remove non-match possibility */
|
1532 |
next_active_state--;
|
1533 |
}
|
1534 |
count++;
|
1535 |
ADD_NEW_DATA(-state_offset, count, 0);
|
1536 |
}
|
1537 |
}
|
1538 |
break;
|
1539 |
|
1540 |
/*-----------------------------------------------------------------*/
|
1541 |
#ifdef SUPPORT_UCP
|
1542 |
case OP_PROP_EXTRA + OP_TYPEQUERY:
|
1543 |
case OP_PROP_EXTRA + OP_TYPEMINQUERY:
|
1544 |
case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
|
1545 |
count = 4;
|
1546 |
goto QS1;
|
1547 |
|
1548 |
case OP_PROP_EXTRA + OP_TYPESTAR:
|
1549 |
case OP_PROP_EXTRA + OP_TYPEMINSTAR:
|
1550 |
case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
|
1551 |
count = 0;
|
1552 |
|
1553 |
QS1:
|
1554 |
|
1555 |
ADD_ACTIVE(state_offset + 4, 0);
|
1556 |
if (clen > 0)
|
1557 |
{
|
1558 |
BOOL OK;
|
1559 |
const pcre_uint32 *cp;
|
1560 |
const ucd_record * prop = GET_UCD(c);
|
1561 |
switch(code[2])
|
1562 |
{
|
1563 |
case PT_ANY:
|
1564 |
OK = TRUE;
|
1565 |
break;
|
1566 |
|
1567 |
case PT_LAMP:
|
1568 |
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
1569 |
prop->chartype == ucp_Lt;
|
1570 |
break;
|
1571 |
|
1572 |
case PT_GC:
|
1573 |
OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
|
1574 |
break;
|
1575 |
|
1576 |
case PT_PC:
|
1577 |
OK = prop->chartype == code[3];
|
1578 |
break;
|
1579 |
|
1580 |
case PT_SC:
|
1581 |
OK = prop->script == code[3];
|
1582 |
break;
|
1583 |
|
1584 |
/* These are specials for combination cases. */
|
1585 |
|
1586 |
case PT_ALNUM:
|
1587 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
1588 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
1589 |
break;
|
1590 |
|
1591 |
case PT_SPACE: /* Perl space */
|
1592 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
1593 |
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
1594 |
break;
|
1595 |
|
1596 |
case PT_PXSPACE: /* POSIX space */
|
1597 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
1598 |
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
1599 |
c == CHAR_FF || c == CHAR_CR;
|
1600 |
break;
|
1601 |
|
1602 |
case PT_WORD:
|
1603 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
1604 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
1605 |
c == CHAR_UNDERSCORE;
|
1606 |
break;
|
1607 |
|
1608 |
case PT_CLIST:
|
1609 |
cp = PRIV(ucd_caseless_sets) + code[3];
|
1610 |
for (;;)
|
1611 |
{
|
1612 |
if (c < *cp) { OK = FALSE; break; }
|
1613 |
if (c == *cp++) { OK = TRUE; break; }
|
1614 |
}
|
1615 |
break;
|
1616 |
|
1617 |
case PT_UCNC:
|
1618 |
OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
|
1619 |
c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
|
1620 |
c >= 0xe000;
|
1621 |
break;
|
1622 |
|
1623 |
/* Should never occur, but keep compilers from grumbling. */
|
1624 |
|
1625 |
default:
|
1626 |
OK = codevalue != OP_PROP;
|
1627 |
break;
|
1628 |
}
|
1629 |
|
1630 |
if (OK == (d == OP_PROP))
|
1631 |
{
|
1632 |
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
|
1633 |
codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
|
1634 |
{
|
1635 |
active_count--; /* Remove non-match possibility */
|
1636 |
next_active_state--;
|
1637 |
}
|
1638 |
ADD_NEW(state_offset + count, 0);
|
1639 |
}
|
1640 |
}
|
1641 |
break;
|
1642 |
|
1643 |
/*-----------------------------------------------------------------*/
|
1644 |
case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
|
1645 |
case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
|
1646 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
|
1647 |
count = 2;
|
1648 |
goto QS2;
|
1649 |
|
1650 |
case OP_EXTUNI_EXTRA + OP_TYPESTAR:
|
1651 |
case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
|
1652 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
|
1653 |
count = 0;
|
1654 |
|
1655 |
QS2:
|
1656 |
|
1657 |
ADD_ACTIVE(state_offset + 2, 0);
|
1658 |
if (clen > 0)
|
1659 |
{
|
1660 |
int lgb, rgb;
|
1661 |
const pcre_uchar *nptr = ptr + clen;
|
1662 |
int ncount = 0;
|
1663 |
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
|
1664 |
codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
|
1665 |
{
|
1666 |
active_count--; /* Remove non-match possibility */
|
1667 |
next_active_state--;
|
1668 |
}
|
1669 |
lgb = UCD_GRAPHBREAK(c);
|
1670 |
while (nptr < end_subject)
|
1671 |
{
|
1672 |
dlen = 1;
|
1673 |
if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
|
1674 |
rgb = UCD_GRAPHBREAK(d);
|
1675 |
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
|
1676 |
ncount++;
|
1677 |
lgb = rgb;
|
1678 |
nptr += dlen;
|
1679 |
}
|
1680 |
ADD_NEW_DATA(-(state_offset + count), 0, ncount);
|
1681 |
}
|
1682 |
break;
|
1683 |
#endif
|
1684 |
|
1685 |
/*-----------------------------------------------------------------*/
|
1686 |
case OP_ANYNL_EXTRA + OP_TYPEQUERY:
|
1687 |
case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
|
1688 |
case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
|
1689 |
count = 2;
|
1690 |
goto QS3;
|
1691 |
|
1692 |
case OP_ANYNL_EXTRA + OP_TYPESTAR:
|
1693 |
case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
|
1694 |
case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
|
1695 |
count = 0;
|
1696 |
|
1697 |
QS3:
|
1698 |
ADD_ACTIVE(state_offset + 2, 0);
|
1699 |
if (clen > 0)
|
1700 |
{
|
1701 |
int ncount = 0;
|
1702 |
switch (c)
|
1703 |
{
|
1704 |
case CHAR_VT:
|
1705 |
case CHAR_FF:
|
1706 |
case CHAR_NEL:
|
1707 |
#ifndef EBCDIC
|
1708 |
case 0x2028:
|
1709 |
case 0x2029:
|
1710 |
#endif /* Not EBCDIC */
|
1711 |
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
|
1712 |
goto ANYNL02;
|
1713 |
|
1714 |
case CHAR_CR:
|
1715 |
if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
|
1716 |
/* Fall through */
|
1717 |
|
1718 |
ANYNL02:
|
1719 |
case CHAR_LF:
|
1720 |
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
|
1721 |
codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
|
1722 |
{
|
1723 |
active_count--; /* Remove non-match possibility */
|
1724 |
next_active_state--;
|
1725 |
}
|
1726 |
ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
|
1727 |
break;
|
1728 |
|
1729 |
default:
|
1730 |
break;
|
1731 |
}
|
1732 |
}
|
1733 |
break;
|
1734 |
|
1735 |
/*-----------------------------------------------------------------*/
|
1736 |
case OP_VSPACE_EXTRA + OP_TYPEQUERY:
|
1737 |
case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
|
1738 |
case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
|
1739 |
count = 2;
|
1740 |
goto QS4;
|
1741 |
|
1742 |
case OP_VSPACE_EXTRA + OP_TYPESTAR:
|
1743 |
case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
|
1744 |
case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
|
1745 |
count = 0;
|
1746 |
|
1747 |
QS4:
|
1748 |
ADD_ACTIVE(state_offset + 2, 0);
|
1749 |
if (clen > 0)
|
1750 |
{
|
1751 |
BOOL OK;
|
1752 |
switch (c)
|
1753 |
{
|
1754 |
VSPACE_CASES:
|
1755 |
OK = TRUE;
|
1756 |
break;
|
1757 |
|
1758 |
default:
|
1759 |
OK = FALSE;
|
1760 |
break;
|
1761 |
}
|
1762 |
if (OK == (d == OP_VSPACE))
|
1763 |
{
|
1764 |
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
|
1765 |
codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
|
1766 |
{
|
1767 |
active_count--; /* Remove non-match possibility */
|
1768 |
next_active_state--;
|
1769 |
}
|
1770 |
ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
|
1771 |
}
|
1772 |
}
|
1773 |
break;
|
1774 |
|
1775 |
/*-----------------------------------------------------------------*/
|
1776 |
case OP_HSPACE_EXTRA + OP_TYPEQUERY:
|
1777 |
case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
|
1778 |
case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
|
1779 |
count = 2;
|
1780 |
goto QS5;
|
1781 |
|
1782 |
case OP_HSPACE_EXTRA + OP_TYPESTAR:
|
1783 |
case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
|
1784 |
case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
|
1785 |
count = 0;
|
1786 |
|
1787 |
QS5:
|
1788 |
ADD_ACTIVE(state_offset + 2, 0);
|
1789 |
if (clen > 0)
|
1790 |
{
|
1791 |
BOOL OK;
|
1792 |
switch (c)
|
1793 |
{
|
1794 |
HSPACE_CASES:
|
1795 |
OK = TRUE;
|
1796 |
break;
|
1797 |
|
1798 |
default:
|
1799 |
OK = FALSE;
|
1800 |
break;
|
1801 |
}
|
1802 |
|
1803 |
if (OK == (d == OP_HSPACE))
|
1804 |
{
|
1805 |
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
|
1806 |
codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
|
1807 |
{
|
1808 |
active_count--; /* Remove non-match possibility */
|
1809 |
next_active_state--;
|
1810 |
}
|
1811 |
ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
|
1812 |
}
|
1813 |
}
|
1814 |
break;
|
1815 |
|
1816 |
/*-----------------------------------------------------------------*/
|
1817 |
#ifdef SUPPORT_UCP
|
1818 |
case OP_PROP_EXTRA + OP_TYPEEXACT:
|
1819 |
case OP_PROP_EXTRA + OP_TYPEUPTO:
|
1820 |
case OP_PROP_EXTRA + OP_TYPEMINUPTO:
|
1821 |
case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
|
1822 |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
|
1823 |
{ ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
|
1824 |
count = current_state->count; /* Number already matched */
|
1825 |
if (clen > 0)
|
1826 |
{
|
1827 |
BOOL OK;
|
1828 |
const pcre_uint32 *cp;
|
1829 |
const ucd_record * prop = GET_UCD(c);
|
1830 |
switch(code[1 + IMM2_SIZE + 1])
|
1831 |
{
|
1832 |
case PT_ANY:
|
1833 |
OK = TRUE;
|
1834 |
break;
|
1835 |
|
1836 |
case PT_LAMP:
|
1837 |
OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
|
1838 |
prop->chartype == ucp_Lt;
|
1839 |
break;
|
1840 |
|
1841 |
case PT_GC:
|
1842 |
OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
|
1843 |
break;
|
1844 |
|
1845 |
case PT_PC:
|
1846 |
OK = prop->chartype == code[1 + IMM2_SIZE + 2];
|
1847 |
break;
|
1848 |
|
1849 |
case PT_SC:
|
1850 |
OK = prop->script == code[1 + IMM2_SIZE + 2];
|
1851 |
break;
|
1852 |
|
1853 |
/* These are specials for combination cases. */
|
1854 |
|
1855 |
case PT_ALNUM:
|
1856 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
1857 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N;
|
1858 |
break;
|
1859 |
|
1860 |
case PT_SPACE: /* Perl space */
|
1861 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
1862 |
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
1863 |
break;
|
1864 |
|
1865 |
case PT_PXSPACE: /* POSIX space */
|
1866 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
|
1867 |
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
1868 |
c == CHAR_FF || c == CHAR_CR;
|
1869 |
break;
|
1870 |
|
1871 |
case PT_WORD:
|
1872 |
OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
|
1873 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
|
1874 |
c == CHAR_UNDERSCORE;
|
1875 |
break;
|
1876 |
|
1877 |
case PT_CLIST:
|
1878 |
cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
|
1879 |
for (;;)
|
1880 |
{
|
1881 |
if (c < *cp) { OK = FALSE; break; }
|
1882 |
if (c == *cp++) { OK = TRUE; break; }
|
1883 |
}
|
1884 |
break;
|
1885 |
|
1886 |
case PT_UCNC:
|
1887 |
OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
|
1888 |
c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
|
1889 |
c >= 0xe000;
|
1890 |
break;
|
1891 |
|
1892 |
/* Should never occur, but keep compilers from grumbling. */
|
1893 |
|
1894 |
default:
|
1895 |
OK = codevalue != OP_PROP;
|
1896 |
break;
|
1897 |
}
|
1898 |
|
1899 |
if (OK == (d == OP_PROP))
|
1900 |
{
|
1901 |
if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
|
1902 |
{
|
1903 |
active_count--; /* Remove non-match possibility */
|
1904 |
next_active_state--;
|
1905 |
}
|
1906 |
if (++count >= GET2(code, 1))
|
1907 |
{ ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
|
1908 |
else
|
1909 |
{ ADD_NEW(state_offset, count); }
|
1910 |
}
|
1911 |
}
|
1912 |
break;
|
1913 |
|
1914 |
/*-----------------------------------------------------------------*/
|
1915 |
case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
|
1916 |
case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
|
1917 |
case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
|
1918 |
case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
|
1919 |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
|
1920 |
{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
|
1921 |
count = current_state->count; /* Number already matched */
|
1922 |
if (clen > 0)
|
1923 |
{
|
1924 |
int lgb, rgb;
|
1925 |
const pcre_uchar *nptr = ptr + clen;
|
1926 |
int ncount = 0;
|
1927 |
if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
|
1928 |
{
|
1929 |
active_count--; /* Remove non-match possibility */
|
1930 |
next_active_state--;
|
1931 |
}
|
1932 |
lgb = UCD_GRAPHBREAK(c);
|
1933 |
while (nptr < end_subject)
|
1934 |
{
|
1935 |
dlen = 1;
|
1936 |
if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
|
1937 |
rgb = UCD_GRAPHBREAK(d);
|
1938 |
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
|
1939 |
ncount++;
|
1940 |
lgb = rgb;
|
1941 |
nptr += dlen;
|
1942 |
}
|
1943 |
if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
|
1944 |
reset_could_continue = TRUE;
|
1945 |
if (++count >= GET2(code, 1))
|
1946 |
{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
|
1947 |
else
|
1948 |
{ ADD_NEW_DATA(-state_offset, count, ncount); }
|
1949 |
}
|
1950 |
break;
|
1951 |
#endif
|
1952 |
|
1953 |
/*-----------------------------------------------------------------*/
|
1954 |
case OP_ANYNL_EXTRA + OP_TYPEEXACT:
|
1955 |
case OP_ANYNL_EXTRA + OP_TYPEUPTO:
|
1956 |
case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
|
1957 |
case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
|
1958 |
if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
|
1959 |
{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
|
1960 |
count = current_state->count; /* Number already matched */
|
1961 |
if (clen > 0)
|
1962 |
{
|
1963 |
int ncount = 0;
|
1964 |
switch (c)
|
1965 |
{
|
1966 |
case CHAR_VT:
|
1967 |
case CHAR_FF:
|
1968 |
case CHAR_NEL:
|
1969 |
#ifndef EBCDIC
|
1970 |
case 0x2028:
|
1971 |
case 0x2029:
|
1972 |
#endif /* Not EBCDIC */
|
1973 |
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
|
1974 |
goto ANYNL03;
|
1975 |
|
1976 |
case CHAR_CR:
|
1977 |
if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
|
1978 |
/* Fall through */
|
1979 |
|
1980 |
ANYNL03:
|
1981 |
case CHAR_LF:
|
1982 |
if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
|
1983 |
{
|
1984 |
active_count--; /* Remove non-match possibility */
|
1985 |
next_active_state--;
|
1986 |
}
|
1987 |
if (++count >= GET2(code, 1))
|
1988 |
{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
|
1989 |
else
|
1990 |
{ ADD_NEW_DATA(-state_offset, count, ncount); }
|
1991 |
break;
|
1992 |
|
1993 |
default:
|
1994 |
break;
|
1995 |
}
|
1996 |
}
|
1997 |
break;
|
1998 |
|
1999 |
/*-----------------------------------------------------------------*/
|
2000 |
case OP_VSPACE_EXTRA + OP_TYPEEXACT:
|
2001 |
case OP_VSPACE_EXTRA + OP_TYPEUPTO:
|
2002 |
case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
|
2003 |
case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
|
2004 |
if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
|
2005 |
{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
|
2006 |
count = current_state->count; /* Number already matched */
|
2007 |
if (clen > 0)
|
2008 |
{
|
2009 |
BOOL OK;
|
2010 |
switch (c)
|
2011 |
{
|
2012 |
VSPACE_CASES:
|
2013 |
OK = TRUE;
|
2014 |
break;
|
2015 |
|
2016 |
default:
|
2017 |
OK = FALSE;
|
2018 |
}
|
2019 |
|
2020 |
if (OK == (d == OP_VSPACE))
|
2021 |
{
|
2022 |
if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
|
2023 |
{
|
2024 |
active_count--; /* Remove non-match possibility */
|
2025 |
next_active_state--;
|
2026 |
}
|
2027 |
if (++count >= GET2(code, 1))
|
2028 |
{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
|
2029 |
else
|
2030 |
{ ADD_NEW_DATA(-state_offset, count, 0); }
|
2031 |
}
|
2032 |
}
|
2033 |
break;
|
2034 |
|
2035 |
/*-----------------------------------------------------------------*/
|
2036 |
case OP_HSPACE_EXTRA + OP_TYPEEXACT:
|
2037 |
case OP_HSPACE_EXTRA + OP_TYPEUPTO:
|
2038 |
case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
|
2039 |
case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
|
2040 |
if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
|
2041 |
{ ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
|
2042 |
count = current_state->count; /* Number already matched */
|
2043 |
if (clen > 0)
|
2044 |
{
|
2045 |
BOOL OK;
|
2046 |
switch (c)
|
2047 |
{
|
2048 |
HSPACE_CASES:
|
2049 |
OK = TRUE;
|
2050 |
break;
|
2051 |
|
2052 |
default:
|
2053 |
OK = FALSE;
|
2054 |
break;
|
2055 |
}
|
2056 |
|
2057 |
if (OK == (d == OP_HSPACE))
|
2058 |
{
|
2059 |
if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
|
2060 |
{
|
2061 |
active_count--; /* Remove non-match possibility */
|
2062 |
next_active_state--;
|
2063 |
}
|
2064 |
if (++count >= GET2(code, 1))
|
2065 |
{ ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
|
2066 |
else
|
2067 |
{ ADD_NEW_DATA(-state_offset, count, 0); }
|
2068 |
}
|
2069 |
}
|
2070 |
break;
|
2071 |
|
2072 |
/* ========================================================================== */
|
2073 |
/* These opcodes are followed by a character that is usually compared
|
2074 |
to the current subject character; it is loaded into d. We still get
|
2075 |
here even if there is no subject character, because in some cases zero
|
2076 |
repetitions are permitted. */
|
2077 |
|
2078 |
/*-----------------------------------------------------------------*/
|
2079 |
case OP_CHAR:
|
2080 |
if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
|
2081 |
break;
|
2082 |
|
2083 |
/*-----------------------------------------------------------------*/
|
2084 |
case OP_CHARI:
|
2085 |
if (clen == 0) break;
|
2086 |
|
2087 |
#ifdef SUPPORT_UTF
|
2088 |
if (utf)
|
2089 |
{
|
2090 |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
|
2091 |
{
|
2092 |
unsigned int othercase;
|
2093 |
if (c < 128)
|
2094 |
othercase = fcc[c];
|
2095 |
else
|
2096 |
/* If we have Unicode property support, we can use it to test the
|
2097 |
other case of the character. */
|
2098 |
#ifdef SUPPORT_UCP
|
2099 |
othercase = UCD_OTHERCASE(c);
|
2100 |
#else
|
2101 |
othercase = NOTACHAR;
|
2102 |
#endif
|
2103 |
|
2104 |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
|
2105 |
}
|
2106 |
}
|
2107 |
else
|
2108 |
#endif /* SUPPORT_UTF */
|
2109 |
/* Not UTF mode */
|
2110 |
{
|
2111 |
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
|
2112 |
{ ADD_NEW(state_offset + 2, 0); }
|
2113 |
}
|
2114 |
break;
|
2115 |
|
2116 |
|
2117 |
#ifdef SUPPORT_UCP
|
2118 |
/*-----------------------------------------------------------------*/
|
2119 |
/* This is a tricky one because it can match more than one character.
|
2120 |
Find out how many characters to skip, and then set up a negative state
|
2121 |
to wait for them to pass before continuing. */
|
2122 |
|
2123 |
case OP_EXTUNI:
|
2124 |
if (clen > 0)
|
2125 |
{
|
2126 |
int lgb, rgb;
|
2127 |
const pcre_uchar *nptr = ptr + clen;
|
2128 |
int ncount = 0;
|
2129 |
lgb = UCD_GRAPHBREAK(c);
|
2130 |
while (nptr < end_subject)
|
2131 |
{
|
2132 |
dlen = 1;
|
2133 |
if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
|
2134 |
rgb = UCD_GRAPHBREAK(d);
|
2135 |
if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
|
2136 |
ncount++;
|
2137 |
lgb = rgb;
|
2138 |
nptr += dlen;
|
2139 |
}
|
2140 |
if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
|
2141 |
reset_could_continue = TRUE;
|
2142 |
ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
|
2143 |
}
|
2144 |
break;
|
2145 |
#endif
|
2146 |
|
2147 |
/*-----------------------------------------------------------------*/
|
2148 |
/* This is a tricky one like EXTUNI because it too can match more than one
|
2149 |
character (when CR is followed by LF). In this case, set up a negative
|
2150 |
state to wait for one character to pass before continuing. */
|
2151 |
|
2152 |
case OP_ANYNL:
|
2153 |
if (clen > 0) switch(c)
|
2154 |
{
|
2155 |
case CHAR_VT:
|
2156 |
case CHAR_FF:
|
2157 |
case CHAR_NEL:
|
2158 |
#ifndef EBCDIC
|
2159 |
case 0x2028:
|
2160 |
case 0x2029:
|
2161 |
#endif /* Not EBCDIC */
|
2162 |
if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
|
2163 |
|
2164 |
case CHAR_LF:
|
2165 |
ADD_NEW(state_offset + 1, 0);
|
2166 |
break;
|
2167 |
|
2168 |
case CHAR_CR:
|
2169 |
if (ptr + 1 >= end_subject)
|
2170 |
{
|
2171 |
ADD_NEW(state_offset + 1, 0);
|
2172 |
if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
|
2173 |
reset_could_continue = TRUE;
|
2174 |
}
|
2175 |
else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
|
2176 |
{
|
2177 |
ADD_NEW_DATA(-(state_offset + 1), 0, 1);
|
2178 |
}
|
2179 |
else
|
2180 |
{
|
2181 |
ADD_NEW(state_offset + 1, 0);
|
2182 |
}
|
2183 |
break;
|
2184 |
}
|
2185 |
break;
|
2186 |
|
2187 |
/*-----------------------------------------------------------------*/
|
2188 |
case OP_NOT_VSPACE:
|
2189 |
if (clen > 0) switch(c)
|
2190 |
{
|
2191 |
VSPACE_CASES:
|
2192 |
break;
|
2193 |
|
2194 |
default:
|
2195 |
ADD_NEW(state_offset + 1, 0);
|
2196 |
break;
|
2197 |
}
|
2198 |
break;
|
2199 |
|
2200 |
/*-----------------------------------------------------------------*/
|
2201 |
case OP_VSPACE:
|
2202 |
if (clen > 0) switch(c)
|
2203 |
{
|
2204 |
VSPACE_CASES:
|
2205 |
ADD_NEW(state_offset + 1, 0);
|
2206 |
break;
|
2207 |
|
2208 |
default:
|
2209 |
break;
|
2210 |
}
|
2211 |
break;
|
2212 |
|
2213 |
/*-----------------------------------------------------------------*/
|
2214 |
case OP_NOT_HSPACE:
|
2215 |
if (clen > 0) switch(c)
|
2216 |
{
|
2217 |
HSPACE_CASES:
|
2218 |
break;
|
2219 |
|
2220 |
default:
|
2221 |
ADD_NEW(state_offset + 1, 0);
|
2222 |
break;
|
2223 |
}
|
2224 |
break;
|
2225 |
|
2226 |
/*-----------------------------------------------------------------*/
|
2227 |
case OP_HSPACE:
|
2228 |
if (clen > 0) switch(c)
|
2229 |
{
|
2230 |
HSPACE_CASES:
|
2231 |
ADD_NEW(state_offset + 1, 0);
|
2232 |
break;
|
2233 |
|
2234 |
default:
|
2235 |
break;
|
2236 |
}
|
2237 |
break;
|
2238 |
|
2239 |
/*-----------------------------------------------------------------*/
|
2240 |
/* Match a negated single character casefully. */
|
2241 |
|
2242 |
case OP_NOT:
|
2243 |
if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
|
2244 |
break;
|
2245 |
|
2246 |
/*-----------------------------------------------------------------*/
|
2247 |
/* Match a negated single character caselessly. */
|
2248 |
|
2249 |
case OP_NOTI:
|
2250 |
if (clen > 0)
|
2251 |
{
|
2252 |
unsigned int otherd;
|
2253 |
#ifdef SUPPORT_UTF
|
2254 |
if (utf && d >= 128)
|
2255 |
{
|
2256 |
#ifdef SUPPORT_UCP
|
2257 |
otherd = UCD_OTHERCASE(d);
|
2258 |
#endif /* SUPPORT_UCP */
|
2259 |
}
|
2260 |
else
|
2261 |
#endif /* SUPPORT_UTF */
|
2262 |
otherd = TABLE_GET(d, fcc, d);
|
2263 |
if (c != d && c != otherd)
|
2264 |
{ ADD_NEW(state_offset + dlen + 1, 0); }
|
2265 |
}
|
2266 |
break;
|
2267 |
|
2268 |
/*-----------------------------------------------------------------*/
|
2269 |
case OP_PLUSI:
|
2270 |
case OP_MINPLUSI:
|
2271 |
case OP_POSPLUSI:
|
2272 |
case OP_NOTPLUSI:
|
2273 |
case OP_NOTMINPLUSI:
|
2274 |
case OP_NOTPOSPLUSI:
|
2275 |
caseless = TRUE;
|
2276 |
codevalue -= OP_STARI - OP_STAR;
|
2277 |
|
2278 |
/* Fall through */
|
2279 |
case OP_PLUS:
|
2280 |
case OP_MINPLUS:
|
2281 |
case OP_POSPLUS:
|
2282 |
case OP_NOTPLUS:
|
2283 |
case OP_NOTMINPLUS:
|
2284 |
case OP_NOTPOSPLUS:
|
2285 |
count = current_state->count; /* Already matched */
|
2286 |
if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
|
2287 |
if (clen > 0)
|
2288 |
{
|
2289 |
pcre_uint32 otherd = NOTACHAR;
|
2290 |
if (caseless)
|
2291 |
{
|
2292 |
#ifdef SUPPORT_UTF
|
2293 |
if (utf && d >= 128)
|
2294 |
{
|
2295 |
#ifdef SUPPORT_UCP
|
2296 |
otherd = UCD_OTHERCASE(d);
|
2297 |
#endif /* SUPPORT_UCP */
|
2298 |
}
|
2299 |
else
|
2300 |
#endif /* SUPPORT_UTF */
|
2301 |
otherd = TABLE_GET(d, fcc, d);
|
2302 |
}
|
2303 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2304 |
{
|
2305 |
if (count > 0 &&
|
2306 |
(codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
|
2307 |
{
|
2308 |
active_count--; /* Remove non-match possibility */
|
2309 |
next_active_state--;
|
2310 |
}
|
2311 |
count++;
|
2312 |
ADD_NEW(state_offset, count);
|
2313 |
}
|
2314 |
}
|
2315 |
break;
|
2316 |
|
2317 |
/*-----------------------------------------------------------------*/
|
2318 |
case OP_QUERYI:
|
2319 |
case OP_MINQUERYI:
|
2320 |
case OP_POSQUERYI:
|
2321 |
case OP_NOTQUERYI:
|
2322 |
case OP_NOTMINQUERYI:
|
2323 |
case OP_NOTPOSQUERYI:
|
2324 |
caseless = TRUE;
|
2325 |
codevalue -= OP_STARI - OP_STAR;
|
2326 |
/* Fall through */
|
2327 |
case OP_QUERY:
|
2328 |
case OP_MINQUERY:
|
2329 |
case OP_POSQUERY:
|
2330 |
case OP_NOTQUERY:
|
2331 |
case OP_NOTMINQUERY:
|
2332 |
case OP_NOTPOSQUERY:
|
2333 |
ADD_ACTIVE(state_offset + dlen + 1, 0);
|
2334 |
if (clen > 0)
|
2335 |
{
|
2336 |
pcre_uint32 otherd = NOTACHAR;
|
2337 |
if (caseless)
|
2338 |
{
|
2339 |
#ifdef SUPPORT_UTF
|
2340 |
if (utf && d >= 128)
|
2341 |
{
|
2342 |
#ifdef SUPPORT_UCP
|
2343 |
otherd = UCD_OTHERCASE(d);
|
2344 |
#endif /* SUPPORT_UCP */
|
2345 |
}
|
2346 |
else
|
2347 |
#endif /* SUPPORT_UTF */
|
2348 |
otherd = TABLE_GET(d, fcc, d);
|
2349 |
}
|
2350 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2351 |
{
|
2352 |
if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
|
2353 |
{
|
2354 |
active_count--; /* Remove non-match possibility */
|
2355 |
next_active_state--;
|
2356 |
}
|
2357 |
ADD_NEW(state_offset + dlen + 1, 0);
|
2358 |
}
|
2359 |
}
|
2360 |
break;
|
2361 |
|
2362 |
/*-----------------------------------------------------------------*/
|
2363 |
case OP_STARI:
|
2364 |
case OP_MINSTARI:
|
2365 |
case OP_POSSTARI:
|
2366 |
case OP_NOTSTARI:
|
2367 |
case OP_NOTMINSTARI:
|
2368 |
case OP_NOTPOSSTARI:
|
2369 |
caseless = TRUE;
|
2370 |
codevalue -= OP_STARI - OP_STAR;
|
2371 |
/* Fall through */
|
2372 |
case OP_STAR:
|
2373 |
case OP_MINSTAR:
|
2374 |
case OP_POSSTAR:
|
2375 |
case OP_NOTSTAR:
|
2376 |
case OP_NOTMINSTAR:
|
2377 |
case OP_NOTPOSSTAR:
|
2378 |
ADD_ACTIVE(state_offset + dlen + 1, 0);
|
2379 |
if (clen > 0)
|
2380 |
{
|
2381 |
pcre_uint32 otherd = NOTACHAR;
|
2382 |
if (caseless)
|
2383 |
{
|
2384 |
#ifdef SUPPORT_UTF
|
2385 |
if (utf && d >= 128)
|
2386 |
{
|
2387 |
#ifdef SUPPORT_UCP
|
2388 |
otherd = UCD_OTHERCASE(d);
|
2389 |
#endif /* SUPPORT_UCP */
|
2390 |
}
|
2391 |
else
|
2392 |
#endif /* SUPPORT_UTF */
|
2393 |
otherd = TABLE_GET(d, fcc, d);
|
2394 |
}
|
2395 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2396 |
{
|
2397 |
if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
|
2398 |
{
|
2399 |
active_count--; /* Remove non-match possibility */
|
2400 |
next_active_state--;
|
2401 |
}
|
2402 |
ADD_NEW(state_offset, 0);
|
2403 |
}
|
2404 |
}
|
2405 |
break;
|
2406 |
|
2407 |
/*-----------------------------------------------------------------*/
|
2408 |
case OP_EXACTI:
|
2409 |
case OP_NOTEXACTI:
|
2410 |
caseless = TRUE;
|
2411 |
codevalue -= OP_STARI - OP_STAR;
|
2412 |
/* Fall through */
|
2413 |
case OP_EXACT:
|
2414 |
case OP_NOTEXACT:
|
2415 |
count = current_state->count; /* Number already matched */
|
2416 |
if (clen > 0)
|
2417 |
{
|
2418 |
pcre_uint32 otherd = NOTACHAR;
|
2419 |
if (caseless)
|
2420 |
{
|
2421 |
#ifdef SUPPORT_UTF
|
2422 |
if (utf && d >= 128)
|
2423 |
{
|
2424 |
#ifdef SUPPORT_UCP
|
2425 |
otherd = UCD_OTHERCASE(d);
|
2426 |
#endif /* SUPPORT_UCP */
|
2427 |
}
|
2428 |
else
|
2429 |
#endif /* SUPPORT_UTF */
|
2430 |
otherd = TABLE_GET(d, fcc, d);
|
2431 |
}
|
2432 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2433 |
{
|
2434 |
if (++count >= GET2(code, 1))
|
2435 |
{ ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
|
2436 |
else
|
2437 |
{ ADD_NEW(state_offset, count); }
|
2438 |
}
|
2439 |
}
|
2440 |
break;
|
2441 |
|
2442 |
/*-----------------------------------------------------------------*/
|
2443 |
case OP_UPTOI:
|
2444 |
case OP_MINUPTOI:
|
2445 |
case OP_POSUPTOI:
|
2446 |
case OP_NOTUPTOI:
|
2447 |
case OP_NOTMINUPTOI:
|
2448 |
case OP_NOTPOSUPTOI:
|
2449 |
caseless = TRUE;
|
2450 |
codevalue -= OP_STARI - OP_STAR;
|
2451 |
/* Fall through */
|
2452 |
case OP_UPTO:
|
2453 |
case OP_MINUPTO:
|
2454 |
case OP_POSUPTO:
|
2455 |
case OP_NOTUPTO:
|
2456 |
case OP_NOTMINUPTO:
|
2457 |
case OP_NOTPOSUPTO:
|
2458 |
ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
|
2459 |
count = current_state->count; /* Number already matched */
|
2460 |
if (clen > 0)
|
2461 |
{
|
2462 |
pcre_uint32 otherd = NOTACHAR;
|
2463 |
if (caseless)
|
2464 |
{
|
2465 |
#ifdef SUPPORT_UTF
|
2466 |
if (utf && d >= 128)
|
2467 |
{
|
2468 |
#ifdef SUPPORT_UCP
|
2469 |
otherd = UCD_OTHERCASE(d);
|
2470 |
#endif /* SUPPORT_UCP */
|
2471 |
}
|
2472 |
else
|
2473 |
#endif /* SUPPORT_UTF */
|
2474 |
otherd = TABLE_GET(d, fcc, d);
|
2475 |
}
|
2476 |
if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
|
2477 |
{
|
2478 |
if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
|
2479 |
{
|
2480 |
active_count--; /* Remove non-match possibility */
|
2481 |
next_active_state--;
|
2482 |
}
|
2483 |
if (++count >= GET2(code, 1))
|
2484 |
{ ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
|
2485 |
else
|
2486 |
{ ADD_NEW(state_offset, count); }
|
2487 |
}
|
2488 |
}
|
2489 |
break;
|
2490 |
|
2491 |
|
2492 |
/* ========================================================================== */
|
2493 |
/* These are the class-handling opcodes */
|
2494 |
|
2495 |
case OP_CLASS:
|
2496 |
case OP_NCLASS:
|
2497 |
case OP_XCLASS:
|
2498 |
{
|
2499 |
BOOL isinclass = FALSE;
|
2500 |
int next_state_offset;
|
2501 |
const pcre_uchar *ecode;
|
2502 |
|
2503 |
/* For a simple class, there is always just a 32-byte table, and we
|
2504 |
can set isinclass from it. */
|
2505 |
|
2506 |
if (codevalue != OP_XCLASS)
|
2507 |
{
|
2508 |
ecode = code + 1 + (32 / sizeof(pcre_uchar));
|
2509 |
if (clen > 0)
|
2510 |
{
|
2511 |
isinclass = (c > 255)? (codevalue == OP_NCLASS) :
|
2512 |
((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
|
2513 |
}
|
2514 |
}
|
2515 |
|
2516 |
/* An extended class may have a table or a list of single characters,
|
2517 |
ranges, or both, and it may be positive or negative. There's a
|
2518 |
function that sorts all this out. */
|
2519 |
|
2520 |
else
|
2521 |
{
|
2522 |
ecode = code + GET(code, 1);
|
2523 |
if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
|
2524 |
}
|
2525 |
|
2526 |
/* At this point, isinclass is set for all kinds of class, and ecode
|
2527 |
points to the byte after the end of the class. If there is a
|
2528 |
quantifier, this is where it will be. */
|
2529 |
|
2530 |
next_state_offset = (int)(ecode - start_code);
|
2531 |
|
2532 |
switch (*ecode)
|
2533 |
{
|
2534 |
case OP_CRSTAR:
|
2535 |
case OP_CRMINSTAR:
|
2536 |
ADD_ACTIVE(next_state_offset + 1, 0);
|
2537 |
if (isinclass) { ADD_NEW(state_offset, 0); }
|
2538 |
break;
|
2539 |
|
2540 |
case OP_CRPLUS:
|
2541 |
case OP_CRMINPLUS:
|
2542 |
count = current_state->count; /* Already matched */
|
2543 |
if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
|
2544 |
if (isinclass) { count++; ADD_NEW(state_offset, count); }
|
2545 |
break;
|
2546 |
|
2547 |
case OP_CRQUERY:
|
2548 |
case OP_CRMINQUERY:
|
2549 |
ADD_ACTIVE(next_state_offset + 1, 0);
|
2550 |
if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
|
2551 |
break;
|
2552 |
|
2553 |
case OP_CRRANGE:
|
2554 |
case OP_CRMINRANGE:
|
2555 |
count = current_state->count; /* Already matched */
|
2556 |
if (count >= GET2(ecode, 1))
|
2557 |
{ ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
|
2558 |
if (isinclass)
|
2559 |
{
|
2560 |
unsigned int max = GET2(ecode, 1 + IMM2_SIZE);
|
2561 |
if (++count >= max && max != 0) /* Max 0 => no limit */
|
2562 |
{ ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
|
2563 |
else
|
2564 |
{ ADD_NEW(state_offset, count); }
|
2565 |
}
|
2566 |
break;
|
2567 |
|
2568 |
default:
|
2569 |
if (isinclass) { ADD_NEW(next_state_offset, 0); }
|
2570 |
break;
|
2571 |
}
|
2572 |
}
|
2573 |
break;
|
2574 |
|
2575 |
/* ========================================================================== */
|
2576 |
/* These are the opcodes for fancy brackets of various kinds. We have
|
2577 |
to use recursion in order to handle them. The "always failing" assertion
|
2578 |
(?!) is optimised to OP_FAIL when compiling, so we have to support that,
|
2579 |
though the other "backtracking verbs" are not supported. */
|
2580 |
|
2581 |
case OP_FAIL:
|
2582 |
forced_fail++; /* Count FAILs for multiple states */
|
2583 |
break;
|
2584 |
|
2585 |
case OP_ASSERT:
|
2586 |
case OP_ASSERT_NOT:
|
2587 |
case OP_ASSERTBACK:
|
2588 |
case OP_ASSERTBACK_NOT:
|
2589 |
{
|
2590 |
int rc;
|
2591 |
int local_offsets[2];
|
2592 |
int local_workspace[1000];
|
2593 |
const pcre_uchar *endasscode = code + GET(code, 1);
|
2594 |
|
2595 |
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
|
2596 |
|
2597 |
rc = internal_dfa_exec(
|
2598 |
md, /* static match data */
|
2599 |
code, /* this subexpression's code */
|
2600 |
ptr, /* where we currently are */
|
2601 |
(int)(ptr - start_subject), /* start offset */
|
2602 |
local_offsets, /* offset vector */
|
2603 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2604 |
local_workspace, /* workspace vector */
|
2605 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2606 |
rlevel); /* function recursion level */
|
2607 |
|
2608 |
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
|
2609 |
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
|
2610 |
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
|
2611 |
}
|
2612 |
break;
|
2613 |
|
2614 |
/*-----------------------------------------------------------------*/
|
2615 |
case OP_COND:
|
2616 |
case OP_SCOND:
|
2617 |
{
|
2618 |
int local_offsets[1000];
|
2619 |
int local_workspace[1000];
|
2620 |
int codelink = GET(code, 1);
|
2621 |
int condcode;
|
2622 |
|
2623 |
/* Because of the way auto-callout works during compile, a callout item
|
2624 |
is inserted between OP_COND and an assertion condition. This does not
|
2625 |
happen for the other conditions. */
|
2626 |
|
2627 |
if (code[LINK_SIZE+1] == OP_CALLOUT)
|
2628 |
{
|
2629 |
rrc = 0;
|
2630 |
if (PUBL(callout) != NULL)
|
2631 |
{
|
2632 |
PUBL(callout_block) cb;
|
2633 |
cb.version = 1; /* Version 1 of the callout block */
|
2634 |
cb.callout_number = code[LINK_SIZE+2];
|
2635 |
cb.offset_vector = offsets;
|
2636 |
#if defined COMPILE_PCRE8
|
2637 |
cb.subject = (PCRE_SPTR)start_subject;
|
2638 |
#elif defined COMPILE_PCRE16
|
2639 |
cb.subject = (PCRE_SPTR16)start_subject;
|
2640 |
#elif defined COMPILE_PCRE32
|
2641 |
cb.subject = (PCRE_SPTR32)start_subject;
|
2642 |
#endif
|
2643 |
cb.subject_length = (int)(end_subject - start_subject);
|
2644 |
cb.start_match = (int)(current_subject - start_subject);
|
2645 |
cb.current_position = (int)(ptr - start_subject);
|
2646 |
cb.pattern_position = GET(code, LINK_SIZE + 3);
|
2647 |
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
|
2648 |
cb.capture_top = 1;
|
2649 |
cb.capture_last = -1;
|
2650 |
cb.callout_data = md->callout_data;
|
2651 |
cb.mark = NULL; /* No (*MARK) support */
|
2652 |
if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
|
2653 |
}
|
2654 |
if (rrc > 0) break; /* Fail this thread */
|
2655 |
code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
|
2656 |
}
|
2657 |
|
2658 |
condcode = code[LINK_SIZE+1];
|
2659 |
|
2660 |
/* Back reference conditions are not supported */
|
2661 |
|
2662 |
if (condcode == OP_CREF || condcode == OP_NCREF)
|
2663 |
return PCRE_ERROR_DFA_UCOND;
|
2664 |
|
2665 |
/* The DEFINE condition is always false */
|
2666 |
|
2667 |
if (condcode == OP_DEF)
|
2668 |
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
2669 |
|
2670 |
/* The only supported version of OP_RREF is for the value RREF_ANY,
|
2671 |
which means "test if in any recursion". We can't test for specifically
|
2672 |
recursed groups. */
|
2673 |
|
2674 |
else if (condcode == OP_RREF || condcode == OP_NRREF)
|
2675 |
{
|
2676 |
int value = GET2(code, LINK_SIZE + 2);
|
2677 |
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
|
2678 |
if (md->recursive != NULL)
|
2679 |
{ ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
|
2680 |
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
2681 |
}
|
2682 |
|
2683 |
/* Otherwise, the condition is an assertion */
|
2684 |
|
2685 |
else
|
2686 |
{
|
2687 |
int rc;
|
2688 |
const pcre_uchar *asscode = code + LINK_SIZE + 1;
|
2689 |
const pcre_uchar *endasscode = asscode + GET(asscode, 1);
|
2690 |
|
2691 |
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
|
2692 |
|
2693 |
rc = internal_dfa_exec(
|
2694 |
md, /* fixed match data */
|
2695 |
asscode, /* this subexpression's code */
|
2696 |
ptr, /* where we currently are */
|
2697 |
(int)(ptr - start_subject), /* start offset */
|
2698 |
local_offsets, /* offset vector */
|
2699 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2700 |
local_workspace, /* workspace vector */
|
2701 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2702 |
rlevel); /* function recursion level */
|
2703 |
|
2704 |
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
|
2705 |
if ((rc >= 0) ==
|
2706 |
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
|
2707 |
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
|
2708 |
else
|
2709 |
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
2710 |
}
|
2711 |
}
|
2712 |
break;
|
2713 |
|
2714 |
/*-----------------------------------------------------------------*/
|
2715 |
case OP_RECURSE:
|
2716 |
{
|
2717 |
dfa_recursion_info *ri;
|
2718 |
int local_offsets[1000];
|
2719 |
int local_workspace[1000];
|
2720 |
const pcre_uchar *callpat = start_code + GET(code, 1);
|
2721 |
int recno = (callpat == md->start_code)? 0 :
|
2722 |
GET2(callpat, 1 + LINK_SIZE);
|
2723 |
int rc;
|
2724 |
|
2725 |
DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
|
2726 |
|
2727 |
/* Check for repeating a recursion without advancing the subject
|
2728 |
pointer. This should catch convoluted mutual recursions. (Some simple
|
2729 |
cases are caught at compile time.) */
|
2730 |
|
2731 |
for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
|
2732 |
if (recno == ri->group_num && ptr == ri->subject_position)
|
2733 |
return PCRE_ERROR_RECURSELOOP;
|
2734 |
|
2735 |
/* Remember this recursion and where we started it so as to
|
2736 |
catch infinite loops. */
|
2737 |
|
2738 |
new_recursive.group_num = recno;
|
2739 |
new_recursive.subject_position = ptr;
|
2740 |
new_recursive.prevrec = md->recursive;
|
2741 |
md->recursive = &new_recursive;
|
2742 |
|
2743 |
rc = internal_dfa_exec(
|
2744 |
md, /* fixed match data */
|
2745 |
callpat, /* this subexpression's code */
|
2746 |
ptr, /* where we currently are */
|
2747 |
(int)(ptr - start_subject), /* start offset */
|
2748 |
local_offsets, /* offset vector */
|
2749 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2750 |
local_workspace, /* workspace vector */
|
2751 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2752 |
rlevel); /* function recursion level */
|
2753 |
|
2754 |
md->recursive = new_recursive.prevrec; /* Done this recursion */
|
2755 |
|
2756 |
DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
|
2757 |
rc));
|
2758 |
|
2759 |
/* Ran out of internal offsets */
|
2760 |
|
2761 |
if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
|
2762 |
|
2763 |
/* For each successful matched substring, set up the next state with a
|
2764 |
count of characters to skip before trying it. Note that the count is in
|
2765 |
characters, not bytes. */
|
2766 |
|
2767 |
if (rc > 0)
|
2768 |
{
|
2769 |
for (rc = rc*2 - 2; rc >= 0; rc -= 2)
|
2770 |
{
|
2771 |
int charcount = local_offsets[rc+1] - local_offsets[rc];
|
2772 |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
|
2773 |
if (utf)
|
2774 |
{
|
2775 |
const pcre_uchar *p = start_subject + local_offsets[rc];
|
2776 |
const pcre_uchar *pp = start_subject + local_offsets[rc+1];
|
2777 |
while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
|
2778 |
}
|
2779 |
#endif
|
2780 |
if (charcount > 0)
|
2781 |
{
|
2782 |
ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
|
2783 |
}
|
2784 |
else
|
2785 |
{
|
2786 |
ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
|
2787 |
}
|
2788 |
}
|
2789 |
}
|
2790 |
else if (rc != PCRE_ERROR_NOMATCH) return rc;
|
2791 |
}
|
2792 |
break;
|
2793 |
|
2794 |
/*-----------------------------------------------------------------*/
|
2795 |
case OP_BRAPOS:
|
2796 |
case OP_SBRAPOS:
|
2797 |
case OP_CBRAPOS:
|
2798 |
case OP_SCBRAPOS:
|
2799 |
case OP_BRAPOSZERO:
|
2800 |
{
|
2801 |
int charcount, matched_count;
|
2802 |
const pcre_uchar *local_ptr = ptr;
|
2803 |
BOOL allow_zero;
|
2804 |
|
2805 |
if (codevalue == OP_BRAPOSZERO)
|
2806 |
{
|
2807 |
allow_zero = TRUE;
|
2808 |
codevalue = *(++code); /* Codevalue will be one of above BRAs */
|
2809 |
}
|
2810 |
else allow_zero = FALSE;
|
2811 |
|
2812 |
/* Loop to match the subpattern as many times as possible as if it were
|
2813 |
a complete pattern. */
|
2814 |
|
2815 |
for (matched_count = 0;; matched_count++)
|
2816 |
{
|
2817 |
int local_offsets[2];
|
2818 |
int local_workspace[1000];
|
2819 |
|
2820 |
int rc = internal_dfa_exec(
|
2821 |
md, /* fixed match data */
|
2822 |
code, /* this subexpression's code */
|
2823 |
local_ptr, /* where we currently are */
|
2824 |
(int)(ptr - start_subject), /* start offset */
|
2825 |
local_offsets, /* offset vector */
|
2826 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2827 |
local_workspace, /* workspace vector */
|
2828 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2829 |
rlevel); /* function recursion level */
|
2830 |
|
2831 |
/* Failed to match */
|
2832 |
|
2833 |
if (rc < 0)
|
2834 |
{
|
2835 |
if (rc != PCRE_ERROR_NOMATCH) return rc;
|
2836 |
break;
|
2837 |
}
|
2838 |
|
2839 |
/* Matched: break the loop if zero characters matched. */
|
2840 |
|
2841 |
charcount = local_offsets[1] - local_offsets[0];
|
2842 |
if (charcount == 0) break;
|
2843 |
local_ptr += charcount; /* Advance temporary position ptr */
|
2844 |
}
|
2845 |
|
2846 |
/* At this point we have matched the subpattern matched_count
|
2847 |
times, and local_ptr is pointing to the character after the end of the
|
2848 |
last match. */
|
2849 |
|
2850 |
if (matched_count > 0 || allow_zero)
|
2851 |
{
|
2852 |
const pcre_uchar *end_subpattern = code;
|
2853 |
int next_state_offset;
|
2854 |
|
2855 |
do { end_subpattern += GET(end_subpattern, 1); }
|
2856 |
while (*end_subpattern == OP_ALT);
|
2857 |
next_state_offset =
|
2858 |
(int)(end_subpattern - start_code + LINK_SIZE + 1);
|
2859 |
|
2860 |
/* Optimization: if there are no more active states, and there
|
2861 |
are no new states yet set up, then skip over the subject string
|
2862 |
right here, to save looping. Otherwise, set up the new state to swing
|
2863 |
into action when the end of the matched substring is reached. */
|
2864 |
|
2865 |
if (i + 1 >= active_count && new_count == 0)
|
2866 |
{
|
2867 |
ptr = local_ptr;
|
2868 |
clen = 0;
|
2869 |
ADD_NEW(next_state_offset, 0);
|
2870 |
}
|
2871 |
else
|
2872 |
{
|
2873 |
const pcre_uchar *p = ptr;
|
2874 |
const pcre_uchar *pp = local_ptr;
|
2875 |
charcount = (int)(pp - p);
|
2876 |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
|
2877 |
if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
|
2878 |
#endif
|
2879 |
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
|
2880 |
}
|
2881 |
}
|
2882 |
}
|
2883 |
break;
|
2884 |
|
2885 |
/*-----------------------------------------------------------------*/
|
2886 |
case OP_ONCE:
|
2887 |
case OP_ONCE_NC:
|
2888 |
{
|
2889 |
int local_offsets[2];
|
2890 |
int local_workspace[1000];
|
2891 |
|
2892 |
int rc = internal_dfa_exec(
|
2893 |
md, /* fixed match data */
|
2894 |
code, /* this subexpression's code */
|
2895 |
ptr, /* where we currently are */
|
2896 |
(int)(ptr - start_subject), /* start offset */
|
2897 |
local_offsets, /* offset vector */
|
2898 |
sizeof(local_offsets)/sizeof(int), /* size of same */
|
2899 |
local_workspace, /* workspace vector */
|
2900 |
sizeof(local_workspace)/sizeof(int), /* size of same */
|
2901 |
rlevel); /* function recursion level */
|
2902 |
|
2903 |
if (rc >= 0)
|
2904 |
{
|
2905 |
const pcre_uchar *end_subpattern = code;
|
2906 |
int charcount = local_offsets[1] - local_offsets[0];
|
2907 |
int next_state_offset, repeat_state_offset;
|
2908 |
|
2909 |
do { end_subpattern += GET(end_subpattern, 1); }
|
2910 |
while (*end_subpattern == OP_ALT);
|
2911 |
next_state_offset =
|
2912 |
(int)(end_subpattern - start_code + LINK_SIZE + 1);
|
2913 |
|
2914 |
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
|
2915 |
arrange for the repeat state also to be added to the relevant list.
|
2916 |
Calculate the offset, or set -1 for no repeat. */
|
2917 |
|
2918 |
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
|
2919 |
*end_subpattern == OP_KETRMIN)?
|
2920 |
(int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
|
2921 |
|
2922 |
/* If we have matched an empty string, add the next state at the
|
2923 |
current character pointer. This is important so that the duplicate
|
2924 |
checking kicks in, which is what breaks infinite loops that match an
|
2925 |
empty string. */
|
2926 |
|
2927 |
if (charcount == 0)
|
2928 |
{
|
2929 |
ADD_ACTIVE(next_state_offset, 0);
|
2930 |
}
|
2931 |
|
2932 |
/* Optimization: if there are no more active states, and there
|
2933 |
are no new states yet set up, then skip over the subject string
|
2934 |
right here, to save looping. Otherwise, set up the new state to swing
|
2935 |
into action when the end of the matched substring is reached. */
|
2936 |
|
2937 |
else if (i + 1 >= active_count && new_count == 0)
|
2938 |
{
|
2939 |
ptr += charcount;
|
2940 |
clen = 0;
|
2941 |
ADD_NEW(next_state_offset, 0);
|
2942 |
|
2943 |
/* If we are adding a repeat state at the new character position,
|
2944 |
we must fudge things so that it is the only current state.
|
2945 |
Otherwise, it might be a duplicate of one we processed before, and
|
2946 |
that would cause it to be skipped. */
|
2947 |
|
2948 |
if (repeat_state_offset >= 0)
|
2949 |
{
|
2950 |
next_active_state = active_states;
|
2951 |
active_count = 0;
|
2952 |
i = -1;
|
2953 |
ADD_ACTIVE(repeat_state_offset, 0);
|
2954 |
}
|
2955 |
}
|
2956 |
else
|
2957 |
{
|
2958 |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
|
2959 |
if (utf)
|
2960 |
{
|
2961 |
const pcre_uchar *p = start_subject + local_offsets[0];
|
2962 |
const pcre_uchar *pp = start_subject + local_offsets[1];
|
2963 |
while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
|
2964 |
}
|
2965 |
#endif
|
2966 |
ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
|
2967 |
if (repeat_state_offset >= 0)
|
2968 |
{ ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
|
2969 |
}
|
2970 |
}
|
2971 |
else if (rc != PCRE_ERROR_NOMATCH) return rc;
|
2972 |
}
|
2973 |
break;
|
2974 |
|
2975 |
|
2976 |
/* ========================================================================== */
|
2977 |
/* Handle callouts */
|
2978 |
|
2979 |
case OP_CALLOUT:
|
2980 |
rrc = 0;
|
2981 |
if (PUBL(callout) != NULL)
|
2982 |
{
|
2983 |
PUBL(callout_block) cb;
|
2984 |
cb.version = 1; /* Version 1 of the callout block */
|
2985 |
cb.callout_number = code[1];
|
2986 |
cb.offset_vector = offsets;
|
2987 |
#if defined COMPILE_PCRE8
|
2988 |
cb.subject = (PCRE_SPTR)start_subject;
|
2989 |
#elif defined COMPILE_PCRE16
|
2990 |
cb.subject = (PCRE_SPTR16)start_subject;
|
2991 |
#elif defined COMPILE_PCRE32
|
2992 |
cb.subject = (PCRE_SPTR32)start_subject;
|
2993 |
#endif
|
2994 |
cb.subject_length = (int)(end_subject - start_subject);
|
2995 |
cb.start_match = (int)(current_subject - start_subject);
|
2996 |
cb.current_position = (int)(ptr - start_subject);
|
2997 |
cb.pattern_position = GET(code, 2);
|
2998 |
cb.next_item_length = GET(code, 2 + LINK_SIZE);
|
2999 |
cb.capture_top = 1;
|
3000 |
cb.capture_last = -1;
|
3001 |
cb.callout_data = md->callout_data;
|
3002 |
cb.mark = NULL; /* No (*MARK) support */
|
3003 |
if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
|
3004 |
}
|
3005 |
if (rrc == 0)
|
3006 |
{ ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
|
3007 |
break;
|
3008 |
|
3009 |
|
3010 |
/* ========================================================================== */
|
3011 |
default: /* Unsupported opcode */
|
3012 |
return PCRE_ERROR_DFA_UITEM;
|
3013 |
}
|
3014 |
|
3015 |
NEXT_ACTIVE_STATE: continue;
|
3016 |
|
3017 |
} /* End of loop scanning active states */
|
3018 |
|
3019 |
/* We have finished the processing at the current subject character. If no
|
3020 |
new states have been set for the next character, we have found all the
|
3021 |
matches that we are going to find. If we are at the top level and partial
|
3022 |
matching has been requested, check for appropriate conditions.
|
3023 |
|
3024 |
The "forced_fail" variable counts the number of (*F) encountered for the
|
3025 |
character. If it is equal to the original active_count (saved in
|
3026 |
workspace[1]) it means that (*F) was found on every active state. In this
|
3027 |
case we don't want to give a partial match.
|
3028 |
|
3029 |
The "could_continue" variable is true if a state could have continued but
|
3030 |
for the fact that the end of the subject was reached. */
|
3031 |
|
3032 |
if (new_count <= 0)
|
3033 |
{
|
3034 |
if (rlevel == 1 && /* Top level, and */
|
3035 |
could_continue && /* Some could go on, and */
|
3036 |
forced_fail != workspace[1] && /* Not all forced fail & */
|
3037 |
( /* either... */
|
3038 |
(md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
|
3039 |
|| /* or... */
|
3040 |
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
|
3041 |
match_count < 0) /* no matches */
|
3042 |
) && /* And... */
|
3043 |
(
|
3044 |
partial_newline || /* Either partial NL */
|
3045 |
( /* or ... */
|
3046 |
ptr >= end_subject && /* End of subject and */
|
3047 |
ptr > md->start_used_ptr) /* Inspected non-empty string */
|
3048 |
)
|
3049 |
)
|
3050 |
match_count = PCRE_ERROR_PARTIAL;
|
3051 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
|
3052 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
|
3053 |
rlevel*2-2, SP));
|
3054 |
break; /* In effect, "return", but see the comment below */
|
3055 |
}
|
3056 |
|
3057 |
/* One or more states are active for the next character. */
|
3058 |
|
3059 |
ptr += clen; /* Advance to next subject character */
|
3060 |
} /* Loop to move along the subject string */
|
3061 |
|
3062 |
/* Control gets here from "break" a few lines above. We do it this way because
|
3063 |
if we use "return" above, we have compiler trouble. Some compilers warn if
|
3064 |
there's nothing here because they think the function doesn't return a value. On
|
3065 |
the other hand, if we put a dummy statement here, some more clever compilers
|
3066 |
complain that it can't be reached. Sigh. */
|
3067 |
|
3068 |
return match_count;
|
3069 |
}
|
3070 |
|
3071 |
|
3072 |
|
3073 |
|
3074 |
/*************************************************
|
3075 |
* Execute a Regular Expression - DFA engine *
|
3076 |
*************************************************/
|
3077 |
|
3078 |
/* This external function applies a compiled re to a subject string using a DFA
|
3079 |
engine. This function calls the internal function multiple times if the pattern
|
3080 |
is not anchored.
|
3081 |
|
3082 |
Arguments:
|
3083 |
argument_re points to the compiled expression
|
3084 |
extra_data points to extra data or is NULL
|
3085 |
subject points to the subject string
|
3086 |
length length of subject string (may contain binary zeros)
|
3087 |
start_offset where to start in the subject string
|
3088 |
options option bits
|
3089 |
offsets vector of match offsets
|
3090 |
offsetcount size of same
|
3091 |
workspace workspace vector
|
3092 |
wscount size of same
|
3093 |
|
3094 |
Returns: > 0 => number of match offset pairs placed in offsets
|
3095 |
= 0 => offsets overflowed; longest matches are present
|
3096 |
-1 => failed to match
|
3097 |
< -1 => some kind of unexpected problem
|
3098 |
*/
|
3099 |
|
3100 |
#if defined COMPILE_PCRE8
|
3101 |
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
3102 |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
|
3103 |
const char *subject, int length, int start_offset, int options, int *offsets,
|
3104 |
int offsetcount, int *workspace, int wscount)
|
3105 |
#elif defined COMPILE_PCRE16
|
3106 |
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
3107 |
pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
|
3108 |
PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
|
3109 |
int offsetcount, int *workspace, int wscount)
|
3110 |
#elif defined COMPILE_PCRE32
|
3111 |
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
|
3112 |
pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
|
3113 |
PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
|
3114 |
int offsetcount, int *workspace, int wscount)
|
3115 |
#endif
|
3116 |
{
|
3117 |
REAL_PCRE *re = (REAL_PCRE *)argument_re;
|
3118 |
dfa_match_data match_block;
|
3119 |
dfa_match_data *md = &match_block;
|
3120 |
BOOL utf, anchored, startline, firstline;
|
3121 |
const pcre_uchar *current_subject, *end_subject;
|
3122 |
const pcre_study_data *study = NULL;
|
3123 |
|
3124 |
const pcre_uchar *req_char_ptr;
|
3125 |
const pcre_uint8 *start_bits = NULL;
|
3126 |
BOOL has_first_char = FALSE;
|
3127 |
BOOL has_req_char = FALSE;
|
3128 |
pcre_uchar first_char = 0;
|
3129 |
pcre_uchar first_char2 = 0;
|
3130 |
pcre_uchar req_char = 0;
|
3131 |
pcre_uchar req_char2 = 0;
|
3132 |
int newline;
|
3133 |
|
3134 |
/* Plausibility checks */
|
3135 |
|
3136 |
if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
|
3137 |
if (re == NULL || subject == NULL || workspace == NULL ||
|
3138 |
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
|
3139 |
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
|
3140 |
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
|
3141 |
if (length < 0) return PCRE_ERROR_BADLENGTH;
|
3142 |
if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
|
3143 |
|
3144 |
/* Check that the first field in the block is the magic number. If it is not,
|
3145 |
return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
|
3146 |
REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
|
3147 |
means that the pattern is likely compiled with different endianness. */
|
3148 |
|
3149 |
if (re->magic_number != MAGIC_NUMBER)
|
3150 |
return re->magic_number == REVERSED_MAGIC_NUMBER?
|
3151 |
PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
|
3152 |
if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
|
3153 |
|
3154 |
/* If restarting after a partial match, do some sanity checks on the contents
|
3155 |
of the workspace. */
|
3156 |
|
3157 |
if ((options & PCRE_DFA_RESTART) != 0)
|
3158 |
{
|
3159 |
if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
|
3160 |
workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
|
3161 |
return PCRE_ERROR_DFA_BADRESTART;
|
3162 |
}
|
3163 |
|
3164 |
/* Set up study, callout, and table data */
|
3165 |
|
3166 |
md->tables = re->tables;
|
3167 |
md->callout_data = NULL;
|
3168 |
|
3169 |
if (extra_data != NULL)
|
3170 |
{
|
3171 |
unsigned int flags = extra_data->flags;
|
3172 |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
|
3173 |
study = (const pcre_study_data *)extra_data->study_data;
|
3174 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
|
3175 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
|
3176 |
return PCRE_ERROR_DFA_UMLIMIT;
|
3177 |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
|
3178 |
md->callout_data = extra_data->callout_data;
|
3179 |
if ((flags & PCRE_EXTRA_TABLES) != 0)
|
3180 |
md->tables = extra_data->tables;
|
3181 |
}
|
3182 |
|
3183 |
/* Set some local values */
|
3184 |
|
3185 |
current_subject = (const pcre_uchar *)subject + start_offset;
|
3186 |
end_subject = (const pcre_uchar *)subject + length;
|
3187 |
req_char_ptr = current_subject - 1;
|
3188 |
|
3189 |
#ifdef SUPPORT_UTF
|
3190 |
/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
|
3191 |
utf = (re->options & PCRE_UTF8) != 0;
|
3192 |
#else
|
3193 |
utf = FALSE;
|
3194 |
#endif
|
3195 |
|
3196 |
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
|
3197 |
(re->options & PCRE_ANCHORED) != 0;
|
3198 |
|
3199 |
/* The remaining fixed data for passing around. */
|
3200 |
|
3201 |
md->start_code = (const pcre_uchar *)argument_re +
|
3202 |
re->name_table_offset + re->name_count * re->name_entry_size;
|
3203 |
md->start_subject = (const pcre_uchar *)subject;
|
3204 |
md->end_subject = end_subject;
|
3205 |
md->start_offset = start_offset;
|
3206 |
md->moptions = options;
|
3207 |
md->poptions = re->options;
|
3208 |
|
3209 |
/* If the BSR option is not set at match time, copy what was set
|
3210 |
at compile time. */
|
3211 |
|
3212 |
if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
|
3213 |
{
|
3214 |
if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
|
3215 |
md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
|
3216 |
#ifdef BSR_ANYCRLF
|
3217 |
else md->moptions |= PCRE_BSR_ANYCRLF;
|
3218 |
#endif
|
3219 |
}
|
3220 |
|
3221 |
/* Handle different types of newline. The three bits give eight cases. If
|
3222 |
nothing is set at run time, whatever was used at compile time applies. */
|
3223 |
|
3224 |
switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
|
3225 |
PCRE_NEWLINE_BITS)
|
3226 |
{
|
3227 |
case 0: newline = NEWLINE; break; /* Compile-time default */
|
3228 |
case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
|
3229 |
case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
|
3230 |
case PCRE_NEWLINE_CR+
|
3231 |
PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
|
3232 |
case PCRE_NEWLINE_ANY: newline = -1; break;
|
3233 |
case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
|
3234 |
default: return PCRE_ERROR_BADNEWLINE;
|
3235 |
}
|
3236 |
|
3237 |
if (newline == -2)
|
3238 |
{
|
3239 |
md->nltype = NLTYPE_ANYCRLF;
|
3240 |
}
|
3241 |
else if (newline < 0)
|
3242 |
{
|
3243 |
md->nltype = NLTYPE_ANY;
|
3244 |
}
|
3245 |
else
|
3246 |
{
|
3247 |
md->nltype = NLTYPE_FIXED;
|
3248 |
if (newline > 255)
|
3249 |
{
|
3250 |
md->nllen = 2;
|
3251 |
md->nl[0] = (newline >> 8) & 255;
|
3252 |
md->nl[1] = newline & 255;
|
3253 |
}
|
3254 |
else
|
3255 |
{
|
3256 |
md->nllen = 1;
|
3257 |
md->nl[0] = newline;
|
3258 |
}
|
3259 |
}
|
3260 |
|
3261 |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing
|
3262 |
back the character offset. */
|
3263 |
|
3264 |
#ifdef SUPPORT_UTF
|
3265 |
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
|
3266 |
{
|
3267 |
int erroroffset;
|
3268 |
int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
|
3269 |
if (errorcode != 0)
|
3270 |
{
|
3271 |
if (offsetcount >= 2)
|
3272 |
{
|
3273 |
offsets[0] = erroroffset;
|
3274 |
offsets[1] = errorcode;
|
3275 |
}
|
3276 |
#if defined COMPILE_PCRE8
|
3277 |
return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
|
3278 |
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
|
3279 |
#elif defined COMPILE_PCRE16
|
3280 |
return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
|
3281 |
PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
|
3282 |
#elif defined COMPILE_PCRE32
|
3283 |
return PCRE_ERROR_BADUTF32;
|
3284 |
#endif
|
3285 |
}
|
3286 |
#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
|
3287 |
if (start_offset > 0 && start_offset < length &&
|
3288 |
NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
|
3289 |
return PCRE_ERROR_BADUTF8_OFFSET;
|
3290 |
#endif
|
3291 |
}
|
3292 |
#endif
|
3293 |
|
3294 |
/* If the exec call supplied NULL for tables, use the inbuilt ones. This
|
3295 |
is a feature that makes it possible to save compiled regex and re-use them
|
3296 |
in other programs later. */
|
3297 |
|
3298 |
if (md->tables == NULL) md->tables = PRIV(default_tables);
|
3299 |
|
3300 |
/* The "must be at the start of a line" flags are used in a loop when finding
|
3301 |
where to start. */
|
3302 |
|
3303 |
startline = (re->flags & PCRE_STARTLINE) != 0;
|
3304 |
firstline = (re->options & PCRE_FIRSTLINE) != 0;
|
3305 |
|
3306 |
/* Set up the first character to match, if available. The first_byte value is
|
3307 |
never set for an anchored regular expression, but the anchoring may be forced
|
3308 |
at run time, so we have to test for anchoring. The first char may be unset for
|
3309 |
an unanchored pattern, of course. If there's no first char and the pattern was
|
3310 |
studied, there may be a bitmap of possible first characters. */
|
3311 |
|
3312 |
if (!anchored)
|
3313 |
{
|
3314 |
if ((re->flags & PCRE_FIRSTSET) != 0)
|
3315 |
{
|
3316 |
has_first_char = TRUE;
|
3317 |
first_char = first_char2 = (pcre_uchar)(re->first_char);
|
3318 |
if ((re->flags & PCRE_FCH_CASELESS) != 0)
|
3319 |
{
|
3320 |
first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
|
3321 |
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
|
3322 |
if (utf && first_char > 127)
|
3323 |
first_char2 = UCD_OTHERCASE(first_char);
|
3324 |
#endif
|
3325 |
}
|
3326 |
}
|
3327 |
else
|
3328 |
{
|
3329 |
if (!startline && study != NULL &&
|
3330 |
(study->flags & PCRE_STUDY_MAPPED) != 0)
|
3331 |
start_bits = study->start_bits;
|
3332 |
}
|
3333 |
}
|
3334 |
|
3335 |
/* For anchored or unanchored matches, there may be a "last known required
|
3336 |
character" set. */
|
3337 |
|
3338 |
if ((re->flags & PCRE_REQCHSET) != 0)
|
3339 |
{
|
3340 |
has_req_char = TRUE;
|
3341 |
req_char = req_char2 = (pcre_uchar)(re->req_char);
|
3342 |
if ((re->flags & PCRE_RCH_CASELESS) != 0)
|
3343 |
{
|
3344 |
req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
|
3345 |
#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
|
3346 |
if (utf && req_char > 127)
|
3347 |
req_char2 = UCD_OTHERCASE(req_char);
|
3348 |
#endif
|
3349 |
}
|
3350 |
}
|
3351 |
|
3352 |
/* Call the main matching function, looping for a non-anchored regex after a
|
3353 |
failed match. If not restarting, perform certain optimizations at the start of
|
3354 |
a match. */
|
3355 |
|
3356 |
for (;;)
|
3357 |
{
|
3358 |
int rc;
|
3359 |
|
3360 |
if ((options & PCRE_DFA_RESTART) == 0)
|
3361 |
{
|
3362 |
const pcre_uchar *save_end_subject = end_subject;
|
3363 |
|
3364 |
/* If firstline is TRUE, the start of the match is constrained to the first
|
3365 |
line of a multiline string. Implement this by temporarily adjusting
|
3366 |
end_subject so that we stop scanning at a newline. If the match fails at
|
3367 |
the newline, later code breaks this loop. */
|
3368 |
|
3369 |
if (firstline)
|
3370 |
{
|
3371 |
PCRE_PUCHAR t = current_subject;
|
3372 |
#ifdef SUPPORT_UTF
|
3373 |
if (utf)
|
3374 |
{
|
3375 |
while (t < md->end_subject && !IS_NEWLINE(t))
|
3376 |
{
|
3377 |
t++;
|
3378 |
ACROSSCHAR(t < end_subject, *t, t++);
|
3379 |
}
|
3380 |
}
|
3381 |
else
|
3382 |
#endif
|
3383 |
while (t < md->end_subject && !IS_NEWLINE(t)) t++;
|
3384 |
end_subject = t;
|
3385 |
}
|
3386 |
|
3387 |
/* There are some optimizations that avoid running the match if a known
|
3388 |
starting point is not found. However, there is an option that disables
|
3389 |
these, for testing and for ensuring that all callouts do actually occur.
|
3390 |
The option can be set in the regex by (*NO_START_OPT) or passed in
|
3391 |
match-time options. */
|
3392 |
|
3393 |
if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
|
3394 |
{
|
3395 |
/* Advance to a known first char. */
|
3396 |
|
3397 |
if (has_first_char)
|
3398 |
{
|
3399 |
if (first_char != first_char2)
|
3400 |
{
|
3401 |
pcre_uchar csc;
|
3402 |
while (current_subject < end_subject &&
|
3403 |
(csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
|
3404 |
current_subject++;
|
3405 |
}
|
3406 |
else
|
3407 |
while (current_subject < end_subject &&
|
3408 |
RAWUCHARTEST(current_subject) != first_char)
|
3409 |
current_subject++;
|
3410 |
}
|
3411 |
|
3412 |
/* Or to just after a linebreak for a multiline match if possible */
|
3413 |
|
3414 |
else if (startline)
|
3415 |
{
|
3416 |
if (current_subject > md->start_subject + start_offset)
|
3417 |
{
|
3418 |
#ifdef SUPPORT_UTF
|
3419 |
if (utf)
|
3420 |
{
|
3421 |
while (current_subject < end_subject &&
|
3422 |
!WAS_NEWLINE(current_subject))
|
3423 |
{
|
3424 |
current_subject++;
|
3425 |
ACROSSCHAR(current_subject < end_subject, *current_subject,
|
3426 |
current_subject++);
|
3427 |
}
|
3428 |
}
|
3429 |
else
|
3430 |
#endif
|
3431 |
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
|
3432 |
current_subject++;
|
3433 |
|
3434 |
/* If we have just passed a CR and the newline option is ANY or
|
3435 |
ANYCRLF, and we are now at a LF, advance the match position by one
|
3436 |
more character. */
|
3437 |
|
3438 |
if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
|
3439 |
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
|
3440 |
current_subject < end_subject &&
|
3441 |
RAWUCHARTEST(current_subject) == CHAR_NL)
|
3442 |
current_subject++;
|
3443 |
}
|
3444 |
}
|
3445 |
|
3446 |
/* Or to a non-unique first char after study */
|
3447 |
|
3448 |
else if (start_bits != NULL)
|
3449 |
{
|
3450 |
while (current_subject < end_subject)
|
3451 |
{
|
3452 |
register pcre_uint32 c = RAWUCHARTEST(current_subject);
|
3453 |
#ifndef COMPILE_PCRE8
|
3454 |
if (c > 255) c = 255;
|
3455 |
#endif
|
3456 |
if ((start_bits[c/8] & (1 << (c&7))) == 0)
|
3457 |
{
|
3458 |
current_subject++;
|
3459 |
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
|
3460 |
/* In non 8-bit mode, the iteration will stop for
|
3461 |
characters > 255 at the beginning or not stop at all. */
|
3462 |
if (utf)
|
3463 |
ACROSSCHAR(current_subject < end_subject, *current_subject,
|
3464 |
current_subject++);
|
3465 |
#endif
|
3466 |
}
|
3467 |
else break;
|
3468 |
}
|
3469 |
}
|
3470 |
}
|
3471 |
|
3472 |
/* Restore fudged end_subject */
|
3473 |
|
3474 |
end_subject = save_end_subject;
|
3475 |
|
3476 |
/* The following two optimizations are disabled for partial matching or if
|
3477 |
disabling is explicitly requested (and of course, by the test above, this
|
3478 |
code is not obeyed when restarting after a partial match). */
|
3479 |
|
3480 |
if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
|
3481 |
(options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
|
3482 |
{
|
3483 |
/* If the pattern was studied, a minimum subject length may be set. This
|
3484 |
is a lower bound; no actual string of that length may actually match the
|
3485 |
pattern. Although the value is, strictly, in characters, we treat it as
|
3486 |
bytes to avoid spending too much time in this optimization. */
|
3487 |
|
3488 |
if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
|
3489 |
(pcre_uint32)(end_subject - current_subject) < study->minlength)
|
3490 |
return PCRE_ERROR_NOMATCH;
|
3491 |
|
3492 |
/* If req_char is set, we know that that character must appear in the
|
3493 |
subject for the match to succeed. If the first character is set, req_char
|
3494 |
must be later in the subject; otherwise the test starts at the match
|
3495 |
point. This optimization can save a huge amount of work in patterns with
|
3496 |
nested unlimited repeats that aren't going to match. Writing separate
|
3497 |
code for cased/caseless versions makes it go faster, as does using an
|
3498 |
autoincrement and backing off on a match.
|
3499 |
|
3500 |
HOWEVER: when the subject string is very, very long, searching to its end
|
3501 |
can take a long time, and give bad performance on quite ordinary
|
3502 |
patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
|
3503 |
string... so we don't do this when the string is sufficiently long. */
|
3504 |
|
3505 |
if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
|
3506 |
{
|
3507 |
register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
|
3508 |
|
3509 |
/* We don't need to repeat the search if we haven't yet reached the
|
3510 |
place we found it at last time. */
|
3511 |
|
3512 |
if (p > req_char_ptr)
|
3513 |
{
|
3514 |
if (req_char != req_char2)
|
3515 |
{
|
3516 |
while (p < end_subject)
|
3517 |
{
|
3518 |
register pcre_uint32 pp = RAWUCHARINCTEST(p);
|
3519 |
if (pp == req_char || pp == req_char2) { p--; break; }
|
3520 |
}
|
3521 |
}
|
3522 |
else
|
3523 |
{
|
3524 |
while (p < end_subject)
|
3525 |
{
|
3526 |
if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
|
3527 |
}
|
3528 |
}
|
3529 |
|
3530 |
/* If we can't find the required character, break the matching loop,
|
3531 |
which will cause a return or PCRE_ERROR_NOMATCH. */
|
3532 |
|
3533 |
if (p >= end_subject) break;
|
3534 |
|
3535 |
/* If we have found the required character, save the point where we
|
3536 |
found it, so that we don't search again next time round the loop if
|
3537 |
the start hasn't passed this character yet. */
|
3538 |
|
3539 |
req_char_ptr = p;
|
3540 |
}
|
3541 |
}
|
3542 |
}
|
3543 |
} /* End of optimizations that are done when not restarting */
|
3544 |
|
3545 |
/* OK, now we can do the business */
|
3546 |
|
3547 |
md->start_used_ptr = current_subject;
|
3548 |
md->recursive = NULL;
|
3549 |
|
3550 |
rc = internal_dfa_exec(
|
3551 |
md, /* fixed match data */
|
3552 |
md->start_code, /* this subexpression's code */
|
3553 |
current_subject, /* where we currently are */
|
3554 |
start_offset, /* start offset in subject */
|
3555 |
offsets, /* offset vector */
|
3556 |
offsetcount, /* size of same */
|
3557 |
workspace, /* workspace vector */
|
3558 |
wscount, /* size of same */
|
3559 |
0); /* function recurse level */
|
3560 |
|
3561 |
/* Anything other than "no match" means we are done, always; otherwise, carry
|
3562 |
on only if not anchored. */
|
3563 |
|
3564 |
if (rc != PCRE_ERROR_NOMATCH || anchored)
|
3565 |
{
|
3566 |
if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
|
3567 |
{
|
3568 |
offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
|
3569 |
offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
|
3570 |
if (offsetcount > 2)
|
3571 |
offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
|
3572 |
}
|
3573 |
return rc;
|
3574 |
}
|
3575 |
|
3576 |
/* Advance to the next subject character unless we are at the end of a line
|
3577 |
and firstline is set. */
|
3578 |
|
3579 |
if (firstline && IS_NEWLINE(current_subject)) break;
|
3580 |
current_subject++;
|
3581 |
#ifdef SUPPORT_UTF
|
3582 |
if (utf)
|
3583 |
{
|
3584 |
ACROSSCHAR(current_subject < end_subject, *current_subject,
|
3585 |
current_subject++);
|
3586 |
}
|
3587 |
#endif
|
3588 |
if (current_subject > end_subject) break;
|
3589 |
|
3590 |
/* If we have just passed a CR and we are now at a LF, and the pattern does
|
3591 |
not contain any explicit matches for \r or \n, and the newline option is CRLF
|
3592 |
or ANY or ANYCRLF, advance the match position by one more character. */
|
3593 |
|
3594 |
if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
|
3595 |
current_subject < end_subject &&
|
3596 |
RAWUCHARTEST(current_subject) == CHAR_NL &&
|
3597 |
(re->flags & PCRE_HASCRORLF) == 0 &&
|
3598 |
(md->nltype == NLTYPE_ANY ||
|
3599 |
md->nltype == NLTYPE_ANYCRLF ||
|
3600 |
md->nllen == 2))
|
3601 |
current_subject++;
|
3602 |
|
3603 |
} /* "Bumpalong" loop */
|
3604 |
|
3605 |
return PCRE_ERROR_NOMATCH;
|
3606 |
}
|
3607 |
|
3608 |
/* End of pcre_dfa_exec.c */
|