6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
7 |
|
|
8 |
Written by Philip Hazel |
Written by Philip Hazel |
9 |
Copyright (c) 1997-2008 University of Cambridge |
Copyright (c) 1997-2009 University of Cambridge |
10 |
|
|
11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
54 |
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; |
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; |
55 |
|
|
56 |
|
|
57 |
|
|
58 |
|
/************************************************* |
59 |
|
* Find the minimum subject length for a group * |
60 |
|
*************************************************/ |
61 |
|
|
62 |
|
/* Scan a parenthesized group and compute the minimum length of subject that |
63 |
|
is needed to match it. This is a lower bound; it does not mean there is a |
64 |
|
string of that length that matches. In UTF8 mode, the result is in characters |
65 |
|
rather than bytes. |
66 |
|
|
67 |
|
Arguments: |
68 |
|
code pointer to start of group (the bracket) |
69 |
|
startcode pointer to start of the whole pattern |
70 |
|
options the compiling options |
71 |
|
|
72 |
|
Returns: the minimum length |
73 |
|
-1 if \C was encountered |
74 |
|
-2 internal error (missing capturing bracket) |
75 |
|
*/ |
76 |
|
|
77 |
|
static int |
78 |
|
find_minlength(const uschar *code, const uschar *startcode, int options) |
79 |
|
{ |
80 |
|
int length = -1; |
81 |
|
BOOL utf8 = (options & PCRE_UTF8) != 0; |
82 |
|
BOOL had_recurse = FALSE; |
83 |
|
register int branchlength = 0; |
84 |
|
register uschar *cc = (uschar *)code + 1 + LINK_SIZE; |
85 |
|
|
86 |
|
if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2; |
87 |
|
|
88 |
|
/* Scan along the opcodes for this branch. If we get to the end of the |
89 |
|
branch, check the length against that of the other branches. */ |
90 |
|
|
91 |
|
for (;;) |
92 |
|
{ |
93 |
|
int d, min; |
94 |
|
uschar *cs, *ce; |
95 |
|
register int op = *cc; |
96 |
|
|
97 |
|
switch (op) |
98 |
|
{ |
99 |
|
case OP_CBRA: |
100 |
|
case OP_SCBRA: |
101 |
|
case OP_BRA: |
102 |
|
case OP_SBRA: |
103 |
|
case OP_ONCE: |
104 |
|
case OP_COND: |
105 |
|
case OP_SCOND: |
106 |
|
d = find_minlength(cc, startcode, options); |
107 |
|
if (d < 0) return d; |
108 |
|
branchlength += d; |
109 |
|
do cc += GET(cc, 1); while (*cc == OP_ALT); |
110 |
|
cc += 1 + LINK_SIZE; |
111 |
|
break; |
112 |
|
|
113 |
|
/* Reached end of a branch; if it's a ket it is the end of a nested |
114 |
|
call. If it's ALT it is an alternation in a nested call. If it is |
115 |
|
END it's the end of the outer call. All can be handled by the same code. */ |
116 |
|
|
117 |
|
case OP_ALT: |
118 |
|
case OP_KET: |
119 |
|
case OP_KETRMAX: |
120 |
|
case OP_KETRMIN: |
121 |
|
case OP_END: |
122 |
|
if (length < 0 || (!had_recurse && branchlength < length)) |
123 |
|
length = branchlength; |
124 |
|
if (*cc != OP_ALT) return length; |
125 |
|
cc += 1 + LINK_SIZE; |
126 |
|
branchlength = 0; |
127 |
|
had_recurse = FALSE; |
128 |
|
break; |
129 |
|
|
130 |
|
/* Skip over assertive subpatterns */ |
131 |
|
|
132 |
|
case OP_ASSERT: |
133 |
|
case OP_ASSERT_NOT: |
134 |
|
case OP_ASSERTBACK: |
135 |
|
case OP_ASSERTBACK_NOT: |
136 |
|
do cc += GET(cc, 1); while (*cc == OP_ALT); |
137 |
|
/* Fall through */ |
138 |
|
|
139 |
|
/* Skip over things that don't match chars */ |
140 |
|
|
141 |
|
case OP_REVERSE: |
142 |
|
case OP_CREF: |
143 |
|
case OP_RREF: |
144 |
|
case OP_DEF: |
145 |
|
case OP_OPT: |
146 |
|
case OP_CALLOUT: |
147 |
|
case OP_SOD: |
148 |
|
case OP_SOM: |
149 |
|
case OP_EOD: |
150 |
|
case OP_EODN: |
151 |
|
case OP_CIRC: |
152 |
|
case OP_DOLL: |
153 |
|
case OP_NOT_WORD_BOUNDARY: |
154 |
|
case OP_WORD_BOUNDARY: |
155 |
|
cc += _pcre_OP_lengths[*cc]; |
156 |
|
break; |
157 |
|
|
158 |
|
/* Skip over a subpattern that has a {0} or {0,x} quantifier */ |
159 |
|
|
160 |
|
case OP_BRAZERO: |
161 |
|
case OP_BRAMINZERO: |
162 |
|
case OP_SKIPZERO: |
163 |
|
cc += _pcre_OP_lengths[*cc]; |
164 |
|
do cc += GET(cc, 1); while (*cc == OP_ALT); |
165 |
|
cc += 1 + LINK_SIZE; |
166 |
|
break; |
167 |
|
|
168 |
|
/* Handle literal characters and + repetitions */ |
169 |
|
|
170 |
|
case OP_CHAR: |
171 |
|
case OP_CHARNC: |
172 |
|
case OP_NOT: |
173 |
|
case OP_PLUS: |
174 |
|
case OP_MINPLUS: |
175 |
|
case OP_POSPLUS: |
176 |
|
case OP_NOTPLUS: |
177 |
|
case OP_NOTMINPLUS: |
178 |
|
case OP_NOTPOSPLUS: |
179 |
|
branchlength++; |
180 |
|
cc += 2; |
181 |
|
#ifdef SUPPORT_UTF8 |
182 |
|
if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; |
183 |
|
#endif |
184 |
|
break; |
185 |
|
|
186 |
|
case OP_TYPEPLUS: |
187 |
|
case OP_TYPEMINPLUS: |
188 |
|
case OP_TYPEPOSPLUS: |
189 |
|
branchlength++; |
190 |
|
cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2; |
191 |
|
break; |
192 |
|
|
193 |
|
/* Handle exact repetitions. The count is already in characters, but we |
194 |
|
need to skip over a multibyte character in UTF8 mode. */ |
195 |
|
|
196 |
|
case OP_EXACT: |
197 |
|
case OP_NOTEXACT: |
198 |
|
branchlength += GET2(cc,1); |
199 |
|
cc += 4; |
200 |
|
#ifdef SUPPORT_UTF8 |
201 |
|
if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; |
202 |
|
#endif |
203 |
|
break; |
204 |
|
|
205 |
|
case OP_TYPEEXACT: |
206 |
|
branchlength += GET2(cc,1); |
207 |
|
cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4; |
208 |
|
break; |
209 |
|
|
210 |
|
/* Handle single-char non-literal matchers */ |
211 |
|
|
212 |
|
case OP_PROP: |
213 |
|
case OP_NOTPROP: |
214 |
|
cc += 2; |
215 |
|
/* Fall through */ |
216 |
|
|
217 |
|
case OP_NOT_DIGIT: |
218 |
|
case OP_DIGIT: |
219 |
|
case OP_NOT_WHITESPACE: |
220 |
|
case OP_WHITESPACE: |
221 |
|
case OP_NOT_WORDCHAR: |
222 |
|
case OP_WORDCHAR: |
223 |
|
case OP_ANY: |
224 |
|
case OP_ALLANY: |
225 |
|
case OP_EXTUNI: |
226 |
|
case OP_HSPACE: |
227 |
|
case OP_NOT_HSPACE: |
228 |
|
case OP_VSPACE: |
229 |
|
case OP_NOT_VSPACE: |
230 |
|
branchlength++; |
231 |
|
cc++; |
232 |
|
break; |
233 |
|
|
234 |
|
/* "Any newline" might match two characters */ |
235 |
|
|
236 |
|
case OP_ANYNL: |
237 |
|
branchlength += 2; |
238 |
|
cc++; |
239 |
|
break; |
240 |
|
|
241 |
|
/* The single-byte matcher means we can't proceed in UTF-8 mode */ |
242 |
|
|
243 |
|
case OP_ANYBYTE: |
244 |
|
#ifdef SUPPORT_UTF8 |
245 |
|
if (utf8) return -1; |
246 |
|
#endif |
247 |
|
branchlength++; |
248 |
|
cc++; |
249 |
|
break; |
250 |
|
|
251 |
|
/* For repeated character types, we have to test for \p and \P, which have |
252 |
|
an extra two bytes of parameters. */ |
253 |
|
|
254 |
|
case OP_TYPESTAR: |
255 |
|
case OP_TYPEMINSTAR: |
256 |
|
case OP_TYPEQUERY: |
257 |
|
case OP_TYPEMINQUERY: |
258 |
|
case OP_TYPEPOSSTAR: |
259 |
|
case OP_TYPEPOSQUERY: |
260 |
|
if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2; |
261 |
|
cc += _pcre_OP_lengths[op]; |
262 |
|
break; |
263 |
|
|
264 |
|
case OP_TYPEUPTO: |
265 |
|
case OP_TYPEMINUPTO: |
266 |
|
case OP_TYPEPOSUPTO: |
267 |
|
if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; |
268 |
|
cc += _pcre_OP_lengths[op]; |
269 |
|
break; |
270 |
|
|
271 |
|
/* Check a class for variable quantification */ |
272 |
|
|
273 |
|
#ifdef SUPPORT_UTF8 |
274 |
|
case OP_XCLASS: |
275 |
|
cc += GET(cc, 1) - 33; |
276 |
|
/* Fall through */ |
277 |
|
#endif |
278 |
|
|
279 |
|
case OP_CLASS: |
280 |
|
case OP_NCLASS: |
281 |
|
cc += 33; |
282 |
|
|
283 |
|
switch (*cc) |
284 |
|
{ |
285 |
|
case OP_CRPLUS: |
286 |
|
case OP_CRMINPLUS: |
287 |
|
branchlength++; |
288 |
|
/* Fall through */ |
289 |
|
|
290 |
|
case OP_CRSTAR: |
291 |
|
case OP_CRMINSTAR: |
292 |
|
case OP_CRQUERY: |
293 |
|
case OP_CRMINQUERY: |
294 |
|
cc++; |
295 |
|
break; |
296 |
|
|
297 |
|
case OP_CRRANGE: |
298 |
|
case OP_CRMINRANGE: |
299 |
|
branchlength += GET2(cc,1); |
300 |
|
cc += 5; |
301 |
|
break; |
302 |
|
|
303 |
|
default: |
304 |
|
branchlength++; |
305 |
|
break; |
306 |
|
} |
307 |
|
break; |
308 |
|
|
309 |
|
/* Backreferences and subroutine calls are treated in the same way: we find |
310 |
|
the minimum length for the subpattern. A recursion, however, causes an |
311 |
|
a flag to be set that causes the length of this branch to be ignored. The |
312 |
|
logic is that a recursion can only make sense if there is another |
313 |
|
alternation that stops the recursing. That will provide the minimum length |
314 |
|
(when no recursion happens). A backreference within the group that it is |
315 |
|
referencing behaves in the same way. */ |
316 |
|
|
317 |
|
case OP_REF: |
318 |
|
ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1)); |
319 |
|
if (cs == NULL) return -2; |
320 |
|
do ce += GET(ce, 1); while (*ce == OP_ALT); |
321 |
|
if (cc > cs && cc < ce) |
322 |
|
{ |
323 |
|
d = 0; |
324 |
|
had_recurse = TRUE; |
325 |
|
} |
326 |
|
else d = find_minlength(cs, startcode, options); |
327 |
|
cc += 3; |
328 |
|
|
329 |
|
/* Handle repeated back references */ |
330 |
|
|
331 |
|
switch (*cc) |
332 |
|
{ |
333 |
|
case OP_CRSTAR: |
334 |
|
case OP_CRMINSTAR: |
335 |
|
case OP_CRQUERY: |
336 |
|
case OP_CRMINQUERY: |
337 |
|
min = 0; |
338 |
|
cc++; |
339 |
|
break; |
340 |
|
|
341 |
|
case OP_CRRANGE: |
342 |
|
case OP_CRMINRANGE: |
343 |
|
min = GET2(cc, 1); |
344 |
|
cc += 5; |
345 |
|
break; |
346 |
|
|
347 |
|
default: |
348 |
|
min = 1; |
349 |
|
break; |
350 |
|
} |
351 |
|
|
352 |
|
branchlength += min * d; |
353 |
|
break; |
354 |
|
|
355 |
|
case OP_RECURSE: |
356 |
|
cs = ce = (uschar *)startcode + GET(cc, 1); |
357 |
|
if (cs == NULL) return -2; |
358 |
|
do ce += GET(ce, 1); while (*ce == OP_ALT); |
359 |
|
if (cc > cs && cc < ce) |
360 |
|
had_recurse = TRUE; |
361 |
|
else |
362 |
|
branchlength += find_minlength(cs, startcode, options); |
363 |
|
cc += 1 + LINK_SIZE; |
364 |
|
break; |
365 |
|
|
366 |
|
/* Anything else does not or need not match a character. We can get the |
367 |
|
item's length from the table, but for those that can match zero occurrences |
368 |
|
of a character, we must take special action for UTF-8 characters. */ |
369 |
|
|
370 |
|
case OP_UPTO: |
371 |
|
case OP_NOTUPTO: |
372 |
|
case OP_MINUPTO: |
373 |
|
case OP_NOTMINUPTO: |
374 |
|
case OP_POSUPTO: |
375 |
|
case OP_STAR: |
376 |
|
case OP_MINSTAR: |
377 |
|
case OP_NOTMINSTAR: |
378 |
|
case OP_POSSTAR: |
379 |
|
case OP_NOTPOSSTAR: |
380 |
|
case OP_QUERY: |
381 |
|
case OP_MINQUERY: |
382 |
|
case OP_NOTMINQUERY: |
383 |
|
case OP_POSQUERY: |
384 |
|
case OP_NOTPOSQUERY: |
385 |
|
cc += _pcre_OP_lengths[op]; |
386 |
|
#ifdef SUPPORT_UTF8 |
387 |
|
if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f]; |
388 |
|
#endif |
389 |
|
break; |
390 |
|
|
391 |
|
/* For the record, these are the opcodes that are matched by "default": |
392 |
|
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP, |
393 |
|
OP_THEN. */ |
394 |
|
|
395 |
|
default: |
396 |
|
cc += _pcre_OP_lengths[op]; |
397 |
|
break; |
398 |
|
} |
399 |
|
} |
400 |
|
/* Control never gets here */ |
401 |
|
} |
402 |
|
|
403 |
|
|
404 |
|
|
405 |
/************************************************* |
/************************************************* |
406 |
* Set a bit and maybe its alternate case * |
* Set a bit and maybe its alternate case * |
407 |
*************************************************/ |
*************************************************/ |
848 |
set NULL unless error |
set NULL unless error |
849 |
|
|
850 |
Returns: pointer to a pcre_extra block, with study_data filled in and the |
Returns: pointer to a pcre_extra block, with study_data filled in and the |
851 |
appropriate flag set; |
appropriate flags set; |
852 |
NULL on error or if no optimization possible |
NULL on error or if no optimization possible |
853 |
*/ |
*/ |
854 |
|
|
855 |
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION |
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION |
856 |
pcre_study(const pcre *external_re, int options, const char **errorptr) |
pcre_study(const pcre *external_re, int options, const char **errorptr) |
857 |
{ |
{ |
858 |
|
int min; |
859 |
|
BOOL bits_set = FALSE; |
860 |
uschar start_bits[32]; |
uschar start_bits[32]; |
861 |
pcre_extra *extra; |
pcre_extra *extra; |
862 |
pcre_study_data *study; |
pcre_study_data *study; |
883 |
(re->name_count * re->name_entry_size); |
(re->name_count * re->name_entry_size); |
884 |
|
|
885 |
/* For an anchored pattern, or an unanchored pattern that has a first char, or |
/* For an anchored pattern, or an unanchored pattern that has a first char, or |
886 |
a multiline pattern that matches only at "line starts", no further processing |
a multiline pattern that matches only at "line starts", there is no point in |
887 |
at present. */ |
seeking a list of starting bytes. */ |
888 |
|
|
889 |
if ((re->options & PCRE_ANCHORED) != 0 || |
if ((re->options & PCRE_ANCHORED) == 0 && |
890 |
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) |
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0) |
891 |
return NULL; |
{ |
892 |
|
/* Set the character tables in the block that is passed around */ |
893 |
|
|
894 |
|
tables = re->tables; |
895 |
|
if (tables == NULL) |
896 |
|
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, |
897 |
|
(void *)(&tables)); |
898 |
|
|
899 |
|
compile_block.lcc = tables + lcc_offset; |
900 |
|
compile_block.fcc = tables + fcc_offset; |
901 |
|
compile_block.cbits = tables + cbits_offset; |
902 |
|
compile_block.ctypes = tables + ctypes_offset; |
903 |
|
|
904 |
|
/* See if we can find a fixed set of initial characters for the pattern. */ |
905 |
|
|
906 |
|
memset(start_bits, 0, 32 * sizeof(uschar)); |
907 |
|
bits_set = set_start_bits(code, start_bits, |
908 |
|
(re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0, |
909 |
|
&compile_block) == SSB_DONE; |
910 |
|
} |
911 |
|
|
912 |
|
/* Find the minimum length of subject string. */ |
913 |
|
|
914 |
|
min = find_minlength(code, code, re->options); |
915 |
|
|
916 |
/* Set the character tables in the block that is passed around */ |
/* Return NULL if no optimization is possible. */ |
917 |
|
|
918 |
tables = re->tables; |
if (!bits_set && min < 0) return NULL; |
|
if (tables == NULL) |
|
|
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, |
|
|
(void *)(&tables)); |
|
|
|
|
|
compile_block.lcc = tables + lcc_offset; |
|
|
compile_block.fcc = tables + fcc_offset; |
|
|
compile_block.cbits = tables + cbits_offset; |
|
|
compile_block.ctypes = tables + ctypes_offset; |
|
|
|
|
|
/* See if we can find a fixed set of initial characters for the pattern. */ |
|
|
|
|
|
memset(start_bits, 0, 32 * sizeof(uschar)); |
|
|
if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, |
|
|
(re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; |
|
919 |
|
|
920 |
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in |
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in |
921 |
the latter, which is pointed to by the former, which may also get additional |
the latter, which is pointed to by the former, which may also get additional |
938 |
extra->study_data = study; |
extra->study_data = study; |
939 |
|
|
940 |
study->size = sizeof(pcre_study_data); |
study->size = sizeof(pcre_study_data); |
941 |
study->options = PCRE_STUDY_MAPPED; |
study->flags = 0; |
942 |
memcpy(study->start_bits, start_bits, sizeof(start_bits)); |
|
943 |
|
if (bits_set) |
944 |
|
{ |
945 |
|
study->flags |= PCRE_STUDY_MAPPED; |
946 |
|
memcpy(study->start_bits, start_bits, sizeof(start_bits)); |
947 |
|
} |
948 |
|
|
949 |
|
if (min >= 0) |
950 |
|
{ |
951 |
|
study->flags |= PCRE_STUDY_MINLEN; |
952 |
|
study->minlength = min; |
953 |
|
} |
954 |
|
|
955 |
return extra; |
return extra; |
956 |
} |
} |