9 |
|
|
10 |
Written by: Philip Hazel <ph10@cam.ac.uk> |
Written by: Philip Hazel <ph10@cam.ac.uk> |
11 |
|
|
12 |
Copyright (c) 1997-1999 University of Cambridge |
Copyright (c) 1997-2000 University of Cambridge |
13 |
|
|
14 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
15 |
Permission is granted to anyone to use this software for any purpose on any |
Permission is granted to anyone to use this software for any purpose on any |
66 |
#define BRASTACK_SIZE 200 |
#define BRASTACK_SIZE 200 |
67 |
|
|
68 |
|
|
69 |
|
/* The number of bytes in a literal character string above which we can't add |
70 |
|
any more is different when UTF-8 characters may be encountered. */ |
71 |
|
|
72 |
|
#ifdef SUPPORT_UTF8 |
73 |
|
#define MAXLIT 250 |
74 |
|
#else |
75 |
|
#define MAXLIT 255 |
76 |
|
#endif |
77 |
|
|
78 |
|
|
79 |
/* Min and max values for the common repeats; for the maxima, 0 => infinity */ |
/* Min and max values for the common repeats; for the maxima, 0 => infinity */ |
80 |
|
|
81 |
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; |
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; |
92 |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
93 |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
"*", "*?", "+", "+?", "?", "??", "{", "{", "{", |
94 |
"*", "*?", "+", "+?", "?", "??", "{", "{", |
"*", "*?", "+", "+?", "?", "??", "{", "{", |
95 |
"class", "Ref", |
"class", "Ref", "Recurse", |
96 |
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", |
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", |
97 |
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref", |
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref", |
98 |
"Brazero", "Braminzero", "Bra" |
"Brazero", "Braminzero", "Bra" |
117 |
0, 0, -ESC_z /* x - z */ |
0, 0, -ESC_z /* x - z */ |
118 |
}; |
}; |
119 |
|
|
120 |
|
/* Tables of names of POSIX character classes and their lengths. The list is |
121 |
|
terminated by a zero length entry. The first three must be alpha, lower, upper, |
122 |
|
as this is assumed for handling case independence. */ |
123 |
|
|
124 |
|
static const char *posix_names[] = { |
125 |
|
"alpha", "lower", "upper", |
126 |
|
"alnum", "ascii", "cntrl", "digit", "graph", |
127 |
|
"print", "punct", "space", "word", "xdigit" }; |
128 |
|
|
129 |
|
static const uschar posix_name_lengths[] = { |
130 |
|
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; |
131 |
|
|
132 |
|
/* Table of class bit maps for each POSIX class; up to three may be combined |
133 |
|
to form the class. */ |
134 |
|
|
135 |
|
static const int posix_class_maps[] = { |
136 |
|
cbit_lower, cbit_upper, -1, /* alpha */ |
137 |
|
cbit_lower, -1, -1, /* lower */ |
138 |
|
cbit_upper, -1, -1, /* upper */ |
139 |
|
cbit_digit, cbit_lower, cbit_upper, /* alnum */ |
140 |
|
cbit_print, cbit_cntrl, -1, /* ascii */ |
141 |
|
cbit_cntrl, -1, -1, /* cntrl */ |
142 |
|
cbit_digit, -1, -1, /* digit */ |
143 |
|
cbit_graph, -1, -1, /* graph */ |
144 |
|
cbit_print, -1, -1, /* print */ |
145 |
|
cbit_punct, -1, -1, /* punct */ |
146 |
|
cbit_space, -1, -1, /* space */ |
147 |
|
cbit_word, -1, -1, /* word */ |
148 |
|
cbit_xdigit,-1, -1 /* xdigit */ |
149 |
|
}; |
150 |
|
|
151 |
|
|
152 |
/* Definition to allow mutual recursion */ |
/* Definition to allow mutual recursion */ |
153 |
|
|
154 |
static BOOL |
static BOOL |
155 |
compile_regex(int, int, int *, uschar **, const uschar **, const char **, |
compile_regex(int, int, int *, uschar **, const uschar **, const char **, |
156 |
BOOL, int, compile_data *); |
BOOL, int, int *, int *, compile_data *); |
157 |
|
|
158 |
|
/* Structure for building a chain of data that actually lives on the |
159 |
|
stack, for holding the values of the subject pointer at the start of each |
160 |
|
subpattern, so as to detect when an empty string has been matched by a |
161 |
|
subpattern - to break infinite loops. */ |
162 |
|
|
163 |
|
typedef struct eptrblock { |
164 |
|
struct eptrblock *prev; |
165 |
|
const uschar *saved_eptr; |
166 |
|
} eptrblock; |
167 |
|
|
168 |
|
/* Flag bits for the match() function */ |
169 |
|
|
170 |
|
#define match_condassert 0x01 /* Called to check a condition assertion */ |
171 |
|
#define match_isgroup 0x02 /* Set if start of bracketed group */ |
172 |
|
|
173 |
|
|
174 |
|
|
186 |
|
|
187 |
|
|
188 |
|
|
189 |
|
/************************************************* |
190 |
|
* Macros and tables for character handling * |
191 |
|
*************************************************/ |
192 |
|
|
193 |
|
/* When UTF-8 encoding is being used, a character is no longer just a single |
194 |
|
byte. The macros for character handling generate simple sequences when used in |
195 |
|
byte-mode, and more complicated ones for UTF-8 characters. */ |
196 |
|
|
197 |
|
#ifndef SUPPORT_UTF8 |
198 |
|
#define GETCHARINC(c, eptr) c = *eptr++; |
199 |
|
#define GETCHARLEN(c, eptr, len) c = *eptr; |
200 |
|
#define BACKCHAR(eptr) |
201 |
|
|
202 |
|
#else /* SUPPORT_UTF8 */ |
203 |
|
|
204 |
|
/* Get the next UTF-8 character, advancing the pointer */ |
205 |
|
|
206 |
|
/* GETCHARINC(c, eptr): load the character at eptr into c and step eptr past
it. The expansion refers to a variable named md and tests md->utf8; when that
is false, or the first byte does not have its top two bits set, the character
is just that single byte. Otherwise utf8_table4 gives the number of additional
bytes. In UTF-8 the lead byte carries the most significant data bits and each
following byte supplies the next six bits down, so the lead byte's bits are
shifted up by 6 per additional byte and the shift shrinks as bytes are read. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6 * a;                  /* Shift for the lead byte's data bits */ \
    c = (c & utf8_table3[a]) << s;  /* High-order bits from first byte */ \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }
219 |
|
|
220 |
|
/* Get the next UTF-8 character, not advancing the pointer, setting length */ |
221 |
|
|
222 |
|
/* GETCHARLEN(c, eptr, len): load the character at eptr into c without moving
eptr, and set len to the number of bytes it occupies. The expansion refers to
a variable named md and tests md->utf8; when that is false, or the first byte
does not have its top two bits set, c is that byte and len is 1. Otherwise
utf8_table4 gives the number of additional bytes. In UTF-8 the lead byte
carries the most significant data bits and each following byte supplies the
next six bits down, so the shift starts at 6 per additional byte and shrinks
as the bytes are combined. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6 * a;                  /* Shift for the lead byte's data bits */ \
    c = (c & utf8_table3[a]) << s;  /* High-order bits from first byte */ \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }
238 |
|
|
239 |
|
/* If the pointer is not at the start of a character, move it back until |
240 |
|
it is. */ |
241 |
|
|
242 |
|
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--; |
243 |
|
|
244 |
|
#endif |
245 |
|
|
246 |
|
|
247 |
|
|
248 |
/************************************************* |
/************************************************* |
249 |
* Default character tables * |
* Default character tables * |
259 |
|
|
260 |
|
|
261 |
|
|
262 |
|
#ifdef SUPPORT_UTF8 |
263 |
|
/************************************************* |
264 |
|
* Tables for UTF-8 support * |
265 |
|
*************************************************/ |
266 |
|
|
267 |
|
/* These are the breakpoints for different numbers of bytes in a UTF-8 |
268 |
|
character. */ |
269 |
|
|
270 |
|
static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; |
271 |
|
|
272 |
|
/* These are the indicator bits and the mask for the data bits to set in the |
273 |
|
first byte of a character, indexed by the number of additional bytes. */ |
274 |
|
|
275 |
|
static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; |
276 |
|
static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; |
277 |
|
|
278 |
|
/* Table of the number of extra bytes, indexed by the first byte |
279 |
|
masked with 0x3f. The highest number for a valid UTF-8 character is in fact |
280 |
|
0x3d. */ |
281 |
|
|
282 |
|
static uschar utf8_table4[] = { |
283 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
284 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
285 |
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
286 |
|
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; |
287 |
|
|
288 |
|
|
289 |
|
/************************************************* |
290 |
|
* Convert character value to UTF-8 * |
291 |
|
*************************************************/ |
292 |
|
|
293 |
|
/* This function takes an integer value in the range 0 - 0x7fffffff |
294 |
|
and encodes it as a UTF-8 character in 1 to 6 bytes. |
295 |
|
|
296 |
|
Arguments: |
297 |
|
cvalue the character value |
298 |
|
buffer pointer to buffer for result - at least 6 bytes long |
299 |
|
|
300 |
|
Returns: number of bytes placed in the buffer |
301 |
|
*/ |
302 |
|
|
303 |
|
static int |
304 |
|
ord2utf8(int cvalue, uschar *buffer) |
305 |
|
{ |
306 |
|
register int i, j; |
307 |
|
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) |
308 |
|
if (cvalue <= utf8_table1[i]) break; |
309 |
|
*buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]); |
310 |
|
cvalue >>= 6 - i; |
311 |
|
for (j = 0; j < i; j++) |
312 |
|
{ |
313 |
|
*buffer++ = 0x80 | (cvalue & 0x3f); |
314 |
|
cvalue >>= 6; |
315 |
|
} |
316 |
|
return i + 1; |
317 |
|
} |
318 |
|
#endif |
319 |
|
|
320 |
|
|
321 |
|
|
322 |
/************************************************* |
/************************************************* |
323 |
* Return version string * |
* Return version string * |
324 |
*************************************************/ |
*************************************************/ |
325 |
|
|
326 |
|
#define STRING(a) # a |
327 |
|
#define XSTRING(s) STRING(s) |
328 |
|
|
329 |
const char * |
const char * |
330 |
pcre_version(void) |
pcre_version(void) |
331 |
{ |
{ |
332 |
return PCRE_VERSION; |
return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE); |
333 |
} |
} |
334 |
|
|
335 |
|
|
336 |
|
|
337 |
|
|
338 |
/************************************************* |
/************************************************* |
339 |
* Return info about a compiled pattern * |
* (Obsolete) Return info about compiled pattern * |
340 |
*************************************************/ |
*************************************************/ |
341 |
|
|
342 |
/* This function picks potentially useful data out of the private |
/* This is the original "info" function. It picks potentially useful data out |
343 |
structure. |
of the private structure, but its interface was too rigid. It remains for |
344 |
|
backwards compatibility. The public options are passed back in an int - though |
345 |
|
the re->options field has been expanded to a long int, all the public options are |
346 |
|
at the low end of it, and so even on 16-bit systems this will still be OK. |
347 |
|
Therefore, I haven't changed the API for pcre_info(). |
348 |
|
|
349 |
Arguments: |
Arguments: |
350 |
external_re points to compiled code |
external_re points to compiled code |
353 |
or -1 if multiline and all branches start ^, |
or -1 if multiline and all branches start ^, |
354 |
or -2 otherwise |
or -2 otherwise |
355 |
|
|
356 |
Returns: number of identifying extraction brackets |
Returns: number of capturing subpatterns |
357 |
or negative values on error |
or negative values on error |
358 |
*/ |
*/ |
359 |
|
|
363 |
const real_pcre *re = (const real_pcre *)external_re; |
const real_pcre *re = (const real_pcre *)external_re; |
364 |
if (re == NULL) return PCRE_ERROR_NULL; |
if (re == NULL) return PCRE_ERROR_NULL; |
365 |
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; |
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; |
366 |
if (optptr != NULL) *optptr = (re->options & PUBLIC_OPTIONS); |
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS); |
367 |
if (first_char != NULL) |
if (first_char != NULL) |
368 |
*first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char : |
*first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char : |
369 |
((re->options & PCRE_STARTLINE) != 0)? -1 : -2; |
((re->options & PCRE_STARTLINE) != 0)? -1 : -2; |
372 |
|
|
373 |
|
|
374 |
|
|
375 |
|
/************************************************* |
376 |
|
* Return info about compiled pattern * |
377 |
|
*************************************************/ |
378 |
|
|
379 |
|
/* This is a newer "info" function which has an extensible interface so |
380 |
|
that additional items can be added compatibly. |
381 |
|
|
382 |
|
Arguments: |
383 |
|
external_re points to compiled code |
384 |
|
external_study points to study data, or NULL |
385 |
|
what what information is required |
386 |
|
where where to put the information |
387 |
|
|
388 |
|
Returns: 0 if data returned, negative on error |
389 |
|
*/ |
390 |
|
|
391 |
|
int |
392 |
|
pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what, |
393 |
|
void *where) |
394 |
|
{ |
395 |
|
const real_pcre *re = (const real_pcre *)external_re; |
396 |
|
const real_pcre_extra *study = (const real_pcre_extra *)study_data; |
397 |
|
|
398 |
|
if (re == NULL || where == NULL) return PCRE_ERROR_NULL; |
399 |
|
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; |
400 |
|
|
401 |
|
switch (what) |
402 |
|
{ |
403 |
|
case PCRE_INFO_OPTIONS: |
404 |
|
*((unsigned long int *)where) = re->options & PUBLIC_OPTIONS; |
405 |
|
break; |
406 |
|
|
407 |
|
case PCRE_INFO_SIZE: |
408 |
|
*((size_t *)where) = re->size; |
409 |
|
break; |
410 |
|
|
411 |
|
case PCRE_INFO_CAPTURECOUNT: |
412 |
|
*((int *)where) = re->top_bracket; |
413 |
|
break; |
414 |
|
|
415 |
|
case PCRE_INFO_BACKREFMAX: |
416 |
|
*((int *)where) = re->top_backref; |
417 |
|
break; |
418 |
|
|
419 |
|
case PCRE_INFO_FIRSTCHAR: |
420 |
|
*((int *)where) = |
421 |
|
((re->options & PCRE_FIRSTSET) != 0)? re->first_char : |
422 |
|
((re->options & PCRE_STARTLINE) != 0)? -1 : -2; |
423 |
|
break; |
424 |
|
|
425 |
|
case PCRE_INFO_FIRSTTABLE: |
426 |
|
*((const uschar **)where) = |
427 |
|
(study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)? |
428 |
|
study->start_bits : NULL; |
429 |
|
break; |
430 |
|
|
431 |
|
case PCRE_INFO_LASTLITERAL: |
432 |
|
*((int *)where) = |
433 |
|
((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1; |
434 |
|
break; |
435 |
|
|
436 |
|
default: return PCRE_ERROR_BADOPTION; |
437 |
|
} |
438 |
|
|
439 |
|
return 0; |
440 |
|
} |
441 |
|
|
442 |
|
|
443 |
|
|
444 |
#ifdef DEBUG |
#ifdef DEBUG |
445 |
/************************************************* |
/************************************************* |
477 |
|
|
478 |
/* This function is called when a \ has been encountered. It either returns a |
/* This function is called when a \ has been encountered. It either returns a |
479 |
positive value for a simple escape such as \n, or a negative value which |
positive value for a simple escape such as \n, or a negative value which |
480 |
encodes one of the more complicated things such as \d. On entry, ptr is |
encodes one of the more complicated things such as \d. When UTF-8 is enabled, |
481 |
pointing at the \. On exit, it is on the final character of the escape |
a positive value greater than 255 may be returned. On entry, ptr is pointing at |
482 |
sequence. |
the \. On exit, it is on the final character of the escape sequence. |
483 |
|
|
484 |
Arguments: |
Arguments: |
485 |
ptrptr points to the pattern position pointer |
ptrptr points to the pattern position pointer |
499 |
int options, BOOL isclass, compile_data *cd) |
int options, BOOL isclass, compile_data *cd) |
500 |
{ |
{ |
501 |
const uschar *ptr = *ptrptr; |
const uschar *ptr = *ptrptr; |
502 |
int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */ |
int c, i; |
503 |
int i; |
|
504 |
|
/* If backslash is at the end of the pattern, it's an error. */ |
505 |
|
|
506 |
|
c = *(++ptr); |
507 |
if (c == 0) *errorptr = ERR1; |
if (c == 0) *errorptr = ERR1; |
508 |
|
|
509 |
/* Digits or letters may have special meaning; all others are literals. */ |
/* Digits or letters may have special meaning; all others are literals. */ |
563 |
} |
} |
564 |
|
|
565 |
/* \0 always starts an octal number, but we may drop through to here with a |
/* \0 always starts an octal number, but we may drop through to here with a |
566 |
larger first octal digit */ |
larger first octal digit. */ |
567 |
|
|
568 |
case '0': |
case '0': |
569 |
c -= '0'; |
c -= '0'; |
570 |
while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 && |
while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 && |
571 |
ptr[1] != '8' && ptr[1] != '9') |
ptr[1] != '8' && ptr[1] != '9') |
572 |
c = c * 8 + *(++ptr) - '0'; |
c = c * 8 + *(++ptr) - '0'; |
573 |
|
c &= 255; /* Take least significant 8 bits */ |
574 |
break; |
break; |
575 |
|
|
576 |
/* Special escapes not starting with a digit are straightforward */ |
/* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number |
577 |
|
which can be greater than 0xff, but only if the ddd are hex digits. */ |
578 |
|
|
579 |
case 'x': |
case 'x': |
580 |
|
#ifdef SUPPORT_UTF8 |
581 |
|
if (ptr[1] == '{' && (options & PCRE_UTF8) != 0) |
582 |
|
{ |
583 |
|
const uschar *pt = ptr + 2; |
584 |
|
register int count = 0; |
585 |
|
c = 0; |
586 |
|
while ((cd->ctypes[*pt] & ctype_xdigit) != 0) |
587 |
|
{ |
588 |
|
count++; |
589 |
|
c = c * 16 + cd->lcc[*pt] - |
590 |
|
(((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W'); |
591 |
|
pt++; |
592 |
|
} |
593 |
|
if (*pt == '}') |
594 |
|
{ |
595 |
|
if (c < 0 || count > 8) *errorptr = ERR34; |
596 |
|
ptr = pt; |
597 |
|
break; |
598 |
|
} |
599 |
|
/* If the sequence of hex digits does not end with '}', then we don't |
600 |
|
recognize this construct; fall through to the normal \x handling. */ |
601 |
|
} |
602 |
|
#endif |
603 |
|
|
604 |
|
/* Read just a single hex char */ |
605 |
|
|
606 |
c = 0; |
c = 0; |
607 |
while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0) |
while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0) |
608 |
{ |
{ |
612 |
} |
} |
613 |
break; |
break; |
614 |
|
|
615 |
|
/* Other special escapes not starting with a digit are straightforward */ |
616 |
|
|
617 |
case 'c': |
case 'c': |
618 |
c = *(++ptr); |
c = *(++ptr); |
619 |
if (c == 0) |
if (c == 0) |
751 |
|
|
752 |
Arguments: |
Arguments: |
753 |
code points to the start of the pattern (the bracket) |
code points to the start of the pattern (the bracket) |
754 |
|
options the compiling options |
755 |
|
|
756 |
Returns: the fixed length, or -1 if there is no fixed length |
Returns: the fixed length, or -1 if there is no fixed length |
757 |
*/ |
*/ |
758 |
|
|
759 |
static int |
static int |
760 |
find_fixedlength(uschar *code) |
find_fixedlength(uschar *code, int options) |
761 |
{ |
{ |
762 |
int length = -1; |
int length = -1; |
763 |
|
|
778 |
case OP_BRA: |
case OP_BRA: |
779 |
case OP_ONCE: |
case OP_ONCE: |
780 |
case OP_COND: |
case OP_COND: |
781 |
d = find_fixedlength(cc); |
d = find_fixedlength(cc, options); |
782 |
if (d < 0) return -1; |
if (d < 0) return -1; |
783 |
branchlength += d; |
branchlength += d; |
784 |
do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT); |
do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT); |
815 |
|
|
816 |
case OP_REVERSE: |
case OP_REVERSE: |
817 |
cc++; |
cc++; |
818 |
|
/* Fall through */ |
819 |
|
|
820 |
case OP_CREF: |
case OP_CREF: |
821 |
case OP_OPT: |
case OP_OPT: |
832 |
cc++; |
cc++; |
833 |
break; |
break; |
834 |
|
|
835 |
/* Handle char strings */ |
/* Handle char strings. In UTF-8 mode we must count characters, not bytes. |
836 |
|
This requires a scan of the string, unfortunately. We assume valid UTF-8 |
837 |
|
strings, so all we do is reduce the length by one for each byte whose top bits are |
838 |
|
10xxxxxx. */ |
839 |
|
|
840 |
case OP_CHARS: |
case OP_CHARS: |
841 |
branchlength += *(++cc); |
branchlength += *(++cc); |
842 |
|
#ifdef SUPPORT_UTF8 |
843 |
|
for (d = 1; d <= *cc; d++) |
844 |
|
if ((cc[d] & 0xc0) == 0x80) branchlength--; |
845 |
|
#endif |
846 |
cc += *cc + 1; |
cc += *cc + 1; |
847 |
break; |
break; |
848 |
|
|
906 |
|
|
907 |
|
|
908 |
/************************************************* |
/************************************************* |
909 |
|
* Check for POSIX class syntax * |
910 |
|
*************************************************/ |
911 |
|
|
912 |
|
/* This function is called when the sequence "[:" or "[." or "[=" is |
913 |
|
encountered in a character class. It checks whether this is followed by an |
914 |
|
optional ^ and then a sequence of letters, terminated by a matching ":]" or |
915 |
|
".]" or "=]". |
916 |
|
|
917 |
|
Argument: |
918 |
|
ptr pointer to the initial [ |
919 |
|
endptr where to return the end pointer |
920 |
|
cd pointer to compile data |
921 |
|
|
922 |
|
Returns: TRUE or FALSE |
923 |
|
*/ |
924 |
|
|
925 |
|
static BOOL |
926 |
|
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd) |
927 |
|
{ |
928 |
|
int terminator; /* Don't combine these lines; the Solaris cc */ |
929 |
|
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ |
930 |
|
if (*(++ptr) == '^') ptr++; |
931 |
|
while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; |
932 |
|
if (*ptr == terminator && ptr[1] == ']') |
933 |
|
{ |
934 |
|
*endptr = ptr; |
935 |
|
return TRUE; |
936 |
|
} |
937 |
|
return FALSE; |
938 |
|
} |
939 |
|
|
940 |
|
|
941 |
|
|
942 |
|
|
943 |
|
/************************************************* |
944 |
|
* Check POSIX class name * |
945 |
|
*************************************************/ |
946 |
|
|
947 |
|
/* This function is called to check the name given in a POSIX-style class entry |
948 |
|
such as [:alnum:]. |
949 |
|
|
950 |
|
Arguments: |
951 |
|
ptr points to the first letter |
952 |
|
len the length of the name |
953 |
|
|
954 |
|
Returns: a value representing the name, or -1 if unknown |
955 |
|
*/ |
956 |
|
|
957 |
|
static int |
958 |
|
check_posix_name(const uschar *ptr, int len) |
959 |
|
{ |
960 |
|
register int yield = 0; |
961 |
|
while (posix_name_lengths[yield] != 0) |
962 |
|
{ |
963 |
|
if (len == posix_name_lengths[yield] && |
964 |
|
strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield; |
965 |
|
yield++; |
966 |
|
} |
967 |
|
return -1; |
968 |
|
} |
969 |
|
|
970 |
|
|
971 |
|
|
972 |
|
|
973 |
|
/************************************************* |
974 |
* Compile one branch * |
* Compile one branch * |
975 |
*************************************************/ |
*************************************************/ |
976 |
|
|
983 |
ptrptr points to the current pattern pointer |
ptrptr points to the current pattern pointer |
984 |
errorptr points to pointer to error message |
errorptr points to pointer to error message |
985 |
optchanged set to the value of the last OP_OPT item compiled |
optchanged set to the value of the last OP_OPT item compiled |
986 |
|
reqchar set to the last literal character required, else -1 |
987 |
|
countlits set to count of mandatory literal characters |
988 |
cd contains pointers to tables |
cd contains pointers to tables |
989 |
|
|
990 |
Returns: TRUE on success |
Returns: TRUE on success |
994 |
static BOOL |
static BOOL |
995 |
compile_branch(int options, int *brackets, uschar **codeptr, |
compile_branch(int options, int *brackets, uschar **codeptr, |
996 |
const uschar **ptrptr, const char **errorptr, int *optchanged, |
const uschar **ptrptr, const char **errorptr, int *optchanged, |
997 |
compile_data *cd) |
int *reqchar, int *countlits, compile_data *cd) |
998 |
{ |
{ |
999 |
int repeat_type, op_type; |
int repeat_type, op_type; |
1000 |
int repeat_min, repeat_max; |
int repeat_min, repeat_max; |
1001 |
int bravalue, length; |
int bravalue, length; |
1002 |
int greedy_default, greedy_non_default; |
int greedy_default, greedy_non_default; |
1003 |
|
int prevreqchar; |
1004 |
|
int condcount = 0; |
1005 |
|
int subcountlits = 0; |
1006 |
register int c; |
register int c; |
1007 |
register uschar *code = *codeptr; |
register uschar *code = *codeptr; |
1008 |
uschar *tempcode; |
uschar *tempcode; |
1016 |
greedy_default = ((options & PCRE_UNGREEDY) != 0); |
greedy_default = ((options & PCRE_UNGREEDY) != 0); |
1017 |
greedy_non_default = greedy_default ^ 1; |
greedy_non_default = greedy_default ^ 1; |
1018 |
|
|
1019 |
|
/* Initialize no required char, and count of literals */ |
1020 |
|
|
1021 |
|
*reqchar = prevreqchar = -1; |
1022 |
|
*countlits = 0; |
1023 |
|
|
1024 |
/* Switch on next character until the end of the branch */ |
/* Switch on next character until the end of the branch */ |
1025 |
|
|
1026 |
for (;; ptr++) |
for (;; ptr++) |
1030 |
int class_lastchar; |
int class_lastchar; |
1031 |
int newoptions; |
int newoptions; |
1032 |
int condref; |
int condref; |
1033 |
|
int subreqchar; |
1034 |
|
|
1035 |
c = *ptr; |
c = *ptr; |
1036 |
if ((options & PCRE_EXTENDED) != 0) |
if ((options & PCRE_EXTENDED) != 0) |
1038 |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
1039 |
if (c == '#') |
if (c == '#') |
1040 |
{ |
{ |
1041 |
while ((c = *(++ptr)) != 0 && c != '\n'); |
/* The space before the ; is to avoid a warning on a silly compiler |
1042 |
|
on the Macintosh. */ |
1043 |
|
while ((c = *(++ptr)) != 0 && c != '\n') ; |
1044 |
continue; |
continue; |
1045 |
} |
} |
1046 |
} |
} |
1115 |
goto FAILED; |
goto FAILED; |
1116 |
} |
} |
1117 |
|
|
1118 |
|
/* Handle POSIX class names. Perl allows a negation extension of the |
1119 |
|
form [:^name]. A square bracket that doesn't match the syntax is |
1120 |
|
treated as a literal. We also recognize the POSIX constructions |
1121 |
|
[.ch.] and [=ch=] ("collating elements") and fault them, as Perl |
1122 |
|
5.6 does. */ |
1123 |
|
|
1124 |
|
if (c == '[' && |
1125 |
|
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
1126 |
|
check_posix_syntax(ptr, &tempptr, cd)) |
1127 |
|
{ |
1128 |
|
BOOL local_negate = FALSE; |
1129 |
|
int posix_class, i; |
1130 |
|
register const uschar *cbits = cd->cbits; |
1131 |
|
|
1132 |
|
if (ptr[1] != ':') |
1133 |
|
{ |
1134 |
|
*errorptr = ERR31; |
1135 |
|
goto FAILED; |
1136 |
|
} |
1137 |
|
|
1138 |
|
ptr += 2; |
1139 |
|
if (*ptr == '^') |
1140 |
|
{ |
1141 |
|
local_negate = TRUE; |
1142 |
|
ptr++; |
1143 |
|
} |
1144 |
|
|
1145 |
|
posix_class = check_posix_name(ptr, tempptr - ptr); |
1146 |
|
if (posix_class < 0) |
1147 |
|
{ |
1148 |
|
*errorptr = ERR30; |
1149 |
|
goto FAILED; |
1150 |
|
} |
1151 |
|
|
1152 |
|
/* If matching is caseless, upper and lower are converted to |
1153 |
|
alpha. This relies on the fact that the class table starts with |
1154 |
|
alpha, lower, upper as the first 3 entries. */ |
1155 |
|
|
1156 |
|
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) |
1157 |
|
posix_class = 0; |
1158 |
|
|
1159 |
|
/* Or into the map we are building up to 3 of the static class |
1160 |
|
tables, or their negations. */ |
1161 |
|
|
1162 |
|
posix_class *= 3; |
1163 |
|
for (i = 0; i < 3; i++) |
1164 |
|
{ |
1165 |
|
int taboffset = posix_class_maps[posix_class + i]; |
1166 |
|
if (taboffset < 0) break; |
1167 |
|
if (local_negate) |
1168 |
|
for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset]; |
1169 |
|
else |
1170 |
|
for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset]; |
1171 |
|
} |
1172 |
|
|
1173 |
|
ptr = tempptr + 1; |
1174 |
|
class_charcount = 10; /* Set > 1; assumes more than 1 per class */ |
1175 |
|
continue; |
1176 |
|
} |
1177 |
|
|
1178 |
/* Backslash may introduce a single character, or it may introduce one |
/* Backslash may introduce a single character, or it may introduce one |
1179 |
of the specials, which just set a flag. Escaped items are checked for |
of the specials, which just set a flag. Escaped items are checked for |
1180 |
validity in the pre-compiling pass. The sequence \b is a special case. |
validity in the pre-compiling pass. The sequence \b is a special case. |
1202 |
continue; |
continue; |
1203 |
|
|
1204 |
case ESC_w: |
case ESC_w: |
1205 |
for (c = 0; c < 32; c++) |
for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word]; |
|
class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]); |
|
1206 |
continue; |
continue; |
1207 |
|
|
1208 |
case ESC_W: |
case ESC_W: |
1209 |
for (c = 0; c < 32; c++) |
for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word]; |
|
class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]); |
|
1210 |
continue; |
continue; |
1211 |
|
|
1212 |
case ESC_s: |
case ESC_s: |
1222 |
goto FAILED; |
goto FAILED; |
1223 |
} |
} |
1224 |
} |
} |
1225 |
/* Fall through if single character */ |
|
1226 |
|
/* Fall through if single character, but don't at present allow |
1227 |
|
chars > 255 in UTF-8 mode. */ |
1228 |
|
|
1229 |
|
#ifdef SUPPORT_UTF8 |
1230 |
|
if (c > 255) |
1231 |
|
{ |
1232 |
|
*errorptr = ERR33; |
1233 |
|
goto FAILED; |
1234 |
|
} |
1235 |
|
#endif |
1236 |
} |
} |
1237 |
|
|
1238 |
/* A single character may be followed by '-' to form a range. However, |
/* A single character may be followed by '-' to form a range. However, |
1252 |
} |
} |
1253 |
|
|
1254 |
/* The second part of a range can be a single-character escape, but |
/* The second part of a range can be a single-character escape, but |
1255 |
not any of the other escapes. */ |
not any of the other escapes. Perl 5.6 treats a hyphen as a literal |
1256 |
|
in such circumstances. */ |
1257 |
|
|
1258 |
if (d == '\\') |
if (d == '\\') |
1259 |
{ |
{ |
1260 |
|
const uschar *oldptr = ptr; |
1261 |
d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd); |
d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd); |
1262 |
|
|
1263 |
|
#ifdef SUPPORT_UTF8 |
1264 |
|
if (d > 255) |
1265 |
|
{ |
1266 |
|
*errorptr = ERR33; |
1267 |
|
goto FAILED; |
1268 |
|
} |
1269 |
|
#endif |
1270 |
|
/* \b is backslash; any other special means the '-' was literal */ |
1271 |
|
|
1272 |
if (d < 0) |
if (d < 0) |
1273 |
{ |
{ |
1274 |
if (d == -ESC_b) d = '\b'; else |
if (d == -ESC_b) d = '\b'; else |
1275 |
{ |
{ |
1276 |
*errorptr = ERR7; |
ptr = oldptr - 2; |
1277 |
goto FAILED; |
goto SINGLE_CHARACTER; /* A few lines below */ |
1278 |
} |
} |
1279 |
} |
} |
1280 |
} |
} |
1302 |
/* Handle a lone single character - we can get here for a normal |
/* Handle a lone single character - we can get here for a normal |
1303 |
non-escape char, or after \ that introduces a single character. */ |
non-escape char, or after \ that introduces a single character. */ |
1304 |
|
|
1305 |
|
SINGLE_CHARACTER: |
1306 |
|
|
1307 |
class [c/8] |= (1 << (c&7)); |
class [c/8] |= (1 << (c&7)); |
1308 |
if ((options & PCRE_CASELESS) != 0) |
if ((options & PCRE_CASELESS) != 0) |
1309 |
{ |
{ |
1388 |
{ repeat_type = greedy_non_default; ptr++; } |
{ repeat_type = greedy_non_default; ptr++; } |
1389 |
else repeat_type = greedy_default; |
else repeat_type = greedy_default; |
1390 |
|
|
|
/* If the maximum is zero then the minimum must also be zero; Perl allows |
|
|
this case, so we do too - by simply omitting the item altogether. */ |
|
|
|
|
|
if (repeat_max == 0) code = previous; |
|
|
|
|
1391 |
/* If previous was a string of characters, chop off the last one and use it |
/* If previous was a string of characters, chop off the last one and use it |
1392 |
as the subject of the repeat. If there was only one character, we can |
as the subject of the repeat. If there was only one character, we can |
1393 |
abolish the previous item altogether. */ |
abolish the previous item altogether. A repeat with a zero minimum wipes |
1394 |
|
out any reqchar setting, backing up to the previous value. We must also |
1395 |
|
adjust the countlits value. */ |
1396 |
|
|
1397 |
else if (*previous == OP_CHARS) |
if (*previous == OP_CHARS) |
1398 |
{ |
{ |
1399 |
int len = previous[1]; |
int len = previous[1]; |
1400 |
|
|
1401 |
|
if (repeat_min == 0) *reqchar = prevreqchar; |
1402 |
|
*countlits += repeat_min - 1; |
1403 |
|
|
1404 |
if (len == 1) |
if (len == 1) |
1405 |
{ |
{ |
1406 |
c = previous[2]; |
c = previous[2]; |
1439 |
code = previous; |
code = previous; |
1440 |
|
|
1441 |
OUTPUT_SINGLE_REPEAT: |
OUTPUT_SINGLE_REPEAT: |
1442 |
repeat_type += op_type; /* Combine both values for many cases */ |
|
1443 |
|
/* If the maximum is zero then the minimum must also be zero; Perl allows |
1444 |
|
this case, so we do too - by simply omitting the item altogether. */ |
1445 |
|
|
1446 |
|
if (repeat_max == 0) goto END_REPEAT; |
1447 |
|
|
1448 |
|
/* Combine the op_type with the repeat_type */ |
1449 |
|
|
1450 |
|
repeat_type += op_type; |
1451 |
|
|
1452 |
/* A minimum of zero is handled either as the special case * or ?, or as |
/* A minimum of zero is handled either as the special case * or ?, or as |
1453 |
an UPTO, with the maximum given. */ |
an UPTO, with the maximum given. */ |
1524 |
} |
} |
1525 |
|
|
1526 |
/* If previous was a character class or a back reference, we put the repeat |
/* If previous was a character class or a back reference, we put the repeat |
1527 |
stuff after it. */ |
stuff after it, but just skip the item if the repeat was {0,0}. */ |
1528 |
|
|
1529 |
else if (*previous == OP_CLASS || *previous == OP_REF) |
else if (*previous == OP_CLASS || *previous == OP_REF) |
1530 |
{ |
{ |
1531 |
|
if (repeat_max == 0) |
1532 |
|
{ |
1533 |
|
code = previous; |
1534 |
|
goto END_REPEAT; |
1535 |
|
} |
1536 |
if (repeat_min == 0 && repeat_max == -1) |
if (repeat_min == 0 && repeat_max == -1) |
1537 |
*code++ = OP_CRSTAR + repeat_type; |
*code++ = OP_CRSTAR + repeat_type; |
1538 |
else if (repeat_min == 1 && repeat_max == -1) |
else if (repeat_min == 1 && repeat_max == -1) |
1583 |
|
|
1584 |
if (repeat_min == 0) |
if (repeat_min == 0) |
1585 |
{ |
{ |
1586 |
|
/* If we set up a required char from the bracket, we must back off |
1587 |
|
to the previous value and reset the countlits value too. */ |
1588 |
|
|
1589 |
|
if (subcountlits > 0) |
1590 |
|
{ |
1591 |
|
*reqchar = prevreqchar; |
1592 |
|
*countlits -= subcountlits; |
1593 |
|
} |
1594 |
|
|
1595 |
/* If the maximum is also zero, we just omit the group from the output |
/* If the maximum is also zero, we just omit the group from the output |
1596 |
altogether. */ |
altogether. */ |
1597 |
|
|
1598 |
if (repeat_max == 0) |
if (repeat_max == 0) |
1599 |
{ |
{ |
1600 |
code = previous; |
code = previous; |
1601 |
previous = NULL; |
goto END_REPEAT; |
|
break; |
|
1602 |
} |
} |
1603 |
|
|
1604 |
/* If the maximum is 1 or unlimited, we just have to stick in the |
/* If the maximum is 1 or unlimited, we just have to stick in the |
1703 |
correct offset was computed above. */ |
correct offset was computed above. */ |
1704 |
|
|
1705 |
else code[-ketoffset] = OP_KETRMAX + repeat_type; |
else code[-ketoffset] = OP_KETRMAX + repeat_type; |
|
|
|
|
|
|
|
#ifdef NEVER |
|
|
/* If the minimum is greater than zero, and the maximum is unlimited or |
|
|
equal to the minimum, the first copy remains where it is, and is |
|
|
replicated up to the minimum number of times. This case includes the + |
|
|
repeat, but of course no replication is needed in that case. */ |
|
|
|
|
|
if (repeat_min > 0 && (repeat_max == -1 || repeat_max == repeat_min)) |
|
|
{ |
|
|
for (i = 1; i < repeat_min; i++) |
|
|
{ |
|
|
memcpy(code, previous, len); |
|
|
code += len; |
|
|
} |
|
|
} |
|
|
|
|
|
/* If the minimum is zero, stick BRAZERO in front of the first copy. |
|
|
Then, if there is a fixed upper limit, replicated up to that many times, |
|
|
sticking BRAZERO in front of all the optional ones. */ |
|
|
|
|
|
else |
|
|
{ |
|
|
if (repeat_min == 0) |
|
|
{ |
|
|
memmove(previous+1, previous, len); |
|
|
code++; |
|
|
*previous++ = OP_BRAZERO + repeat_type; |
|
|
} |
|
|
|
|
|
for (i = 1; i < repeat_min; i++) |
|
|
{ |
|
|
memcpy(code, previous, len); |
|
|
code += len; |
|
|
} |
|
|
|
|
|
for (i = (repeat_min > 0)? repeat_min : 1; i < repeat_max; i++) |
|
|
{ |
|
|
*code++ = OP_BRAZERO + repeat_type; |
|
|
memcpy(code, previous, len); |
|
|
code += len; |
|
|
} |
|
|
} |
|
|
|
|
|
/* If the maximum is unlimited, set a repeater in the final copy. We |
|
|
can't just offset backwards from the current code point, because we |
|
|
don't know if there's been an options resetting after the ket. The |
|
|
correct offset was computed above. */ |
|
|
|
|
|
if (repeat_max == -1) code[-ketoffset] = OP_KETRMAX + repeat_type; |
|
|
#endif |
|
|
|
|
|
|
|
1706 |
} |
} |
1707 |
|
|
1708 |
/* Else there's some kind of shambles */ |
/* Else there's some kind of shambles */ |
1715 |
|
|
1716 |
/* In all cases we no longer have a previous item. */ |
/* In all cases we no longer have a previous item. */ |
1717 |
|
|
1718 |
|
END_REPEAT: |
1719 |
previous = NULL; |
previous = NULL; |
1720 |
break; |
break; |
1721 |
|
|
1793 |
ptr++; |
ptr++; |
1794 |
break; |
break; |
1795 |
|
|
1796 |
|
case 'R': /* Pattern recursion */ |
1797 |
|
*code++ = OP_RECURSE; |
1798 |
|
ptr++; |
1799 |
|
continue; |
1800 |
|
|
1801 |
default: /* Option setting */ |
default: /* Option setting */ |
1802 |
set = unset = 0; |
set = unset = 0; |
1803 |
optset = &set; |
optset = &set; |
1889 |
(bravalue == OP_ASSERTBACK || |
(bravalue == OP_ASSERTBACK || |
1890 |
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ |
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ |
1891 |
condref, /* Condition reference number */ |
condref, /* Condition reference number */ |
1892 |
|
&subreqchar, /* For possible last char */ |
1893 |
|
&subcountlits, /* For literal count */ |
1894 |
cd)) /* Tables block */ |
cd)) /* Tables block */ |
1895 |
goto FAILED; |
goto FAILED; |
1896 |
|
|
1904 |
|
|
1905 |
if (bravalue == OP_COND) |
if (bravalue == OP_COND) |
1906 |
{ |
{ |
|
int branchcount = 0; |
|
1907 |
uschar *tc = code; |
uschar *tc = code; |
1908 |
|
condcount = 0; |
1909 |
|
|
1910 |
do { |
do { |
1911 |
branchcount++; |
condcount++; |
1912 |
tc += (tc[1] << 8) | tc[2]; |
tc += (tc[1] << 8) | tc[2]; |
1913 |
} |
} |
1914 |
while (*tc != OP_KET); |
while (*tc != OP_KET); |
1915 |
|
|
1916 |
if (branchcount > 2) |
if (condcount > 2) |
1917 |
{ |
{ |
1918 |
*errorptr = ERR27; |
*errorptr = ERR27; |
1919 |
goto FAILED; |
goto FAILED; |
1920 |
} |
} |
1921 |
} |
} |
1922 |
|
|
1923 |
|
/* Handle updating of the required character. If the subpattern didn't |
1924 |
|
set one, leave it as it was. Otherwise, update it for normal brackets of |
1925 |
|
all kinds, forward assertions, and conditions with two branches. Don't |
1926 |
|
update the literal count for forward assertions, however. If the bracket |
1927 |
|
is followed by a quantifier with zero repeat, we have to back off. Hence |
1928 |
|
the definition of prevreqchar and subcountlits outside the main loop so |
1929 |
|
that they can be accessed for the back off. */ |
1930 |
|
|
1931 |
|
if (subreqchar > 0 && |
1932 |
|
(bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT || |
1933 |
|
(bravalue == OP_COND && condcount == 2))) |
1934 |
|
{ |
1935 |
|
prevreqchar = *reqchar; |
1936 |
|
*reqchar = subreqchar; |
1937 |
|
if (bravalue != OP_ASSERT) *countlits += subcountlits; |
1938 |
|
} |
1939 |
|
|
1940 |
/* Now update the main code pointer to the end of the group. */ |
/* Now update the main code pointer to the end of the group. */ |
1941 |
|
|
1942 |
code = tempcode; |
code = tempcode; |
2004 |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
2005 |
if (c == '#') |
if (c == '#') |
2006 |
{ |
{ |
2007 |
while ((c = *(++ptr)) != 0 && c != '\n'); |
/* The space before the ; is to avoid a warning on a silly compiler |
2008 |
|
on the Macintosh. */ |
2009 |
|
while ((c = *(++ptr)) != 0 && c != '\n') ; |
2010 |
if (c == 0) break; |
if (c == 0) break; |
2011 |
continue; |
continue; |
2012 |
} |
} |
2021 |
tempptr = ptr; |
tempptr = ptr; |
2022 |
c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd); |
c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd); |
2023 |
if (c < 0) { ptr = tempptr; break; } |
if (c < 0) { ptr = tempptr; break; } |
2024 |
|
|
2025 |
|
/* If a character is > 127 in UTF-8 mode, we have to turn it into |
2026 |
|
two or more characters in the UTF-8 encoding. */ |
2027 |
|
|
2028 |
|
#ifdef SUPPORT_UTF8 |
2029 |
|
if (c > 127 && (options & PCRE_UTF8) != 0) |
2030 |
|
{ |
2031 |
|
uschar buffer[8]; |
2032 |
|
int len = ord2utf8(c, buffer); |
2033 |
|
for (c = 0; c < len; c++) *code++ = buffer[c]; |
2034 |
|
length += len; |
2035 |
|
continue; |
2036 |
|
} |
2037 |
|
#endif |
2038 |
} |
} |
2039 |
|
|
2040 |
/* Ordinary character or single-char escape */ |
/* Ordinary character or single-char escape */ |
2045 |
|
|
2046 |
/* This "while" is the end of the "do" above. */ |
/* This "while" is the end of the "do" above. */ |
2047 |
|
|
2048 |
while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); |
while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); |
2049 |
|
|
2050 |
|
/* Update the last character and the count of literals */ |
2051 |
|
|
2052 |
|
prevreqchar = (length > 1)? code[-2] : *reqchar; |
2053 |
|
*reqchar = code[-1]; |
2054 |
|
*countlits += length; |
2055 |
|
|
2056 |
/* Compute the length and set it in the data vector, and advance to |
/* Compute the length and set it in the data vector, and advance to |
2057 |
the next state. */ |
the next state. */ |
2058 |
|
|
2059 |
previous[1] = length; |
previous[1] = length; |
2060 |
if (length < 255) ptr--; |
if (length < MAXLIT) ptr--; |
2061 |
break; |
break; |
2062 |
} |
} |
2063 |
} /* end of big loop */ |
} /* end of big loop */ |
2096 |
errorptr -> pointer to error message |
errorptr -> pointer to error message |
2097 |
lookbehind TRUE if this is a lookbehind assertion |
lookbehind TRUE if this is a lookbehind assertion |
2098 |
condref > 0 for OP_CREF setting at start of conditional group |
condref > 0 for OP_CREF setting at start of conditional group |
2099 |
|
reqchar -> place to put the last required character, or a negative number |
2100 |
|
countlits -> place to put the shortest literal count of any branch |
2101 |
cd points to the data block with tables pointers |
cd points to the data block with tables pointers |
2102 |
|
|
2103 |
Returns: TRUE on success |
Returns: TRUE on success |
2106 |
static BOOL |
static BOOL |
2107 |
compile_regex(int options, int optchanged, int *brackets, uschar **codeptr, |
compile_regex(int options, int optchanged, int *brackets, uschar **codeptr, |
2108 |
const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref, |
const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref, |
2109 |
compile_data *cd) |
int *reqchar, int *countlits, compile_data *cd) |
2110 |
{ |
{ |
2111 |
const uschar *ptr = *ptrptr; |
const uschar *ptr = *ptrptr; |
2112 |
uschar *code = *codeptr; |
uschar *code = *codeptr; |
2114 |
uschar *start_bracket = code; |
uschar *start_bracket = code; |
2115 |
uschar *reverse_count = NULL; |
uschar *reverse_count = NULL; |
2116 |
int oldoptions = options & PCRE_IMS; |
int oldoptions = options & PCRE_IMS; |
2117 |
|
int branchreqchar, branchcountlits; |
2118 |
|
|
2119 |
|
*reqchar = -1; |
2120 |
|
*countlits = INT_MAX; |
2121 |
code += 3; |
code += 3; |
2122 |
|
|
2123 |
/* At the start of a reference-based conditional group, insert the reference |
/* At the start of a reference-based conditional group, insert the reference |
2156 |
|
|
2157 |
/* Now compile the branch */ |
/* Now compile the branch */ |
2158 |
|
|
2159 |
if (!compile_branch(options,brackets,&code,&ptr,errorptr,&optchanged,cd)) |
if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged, |
2160 |
|
&branchreqchar, &branchcountlits, cd)) |
2161 |
{ |
{ |
2162 |
*ptrptr = ptr; |
*ptrptr = ptr; |
2163 |
return FALSE; |
return FALSE; |
2169 |
last_branch[1] = length >> 8; |
last_branch[1] = length >> 8; |
2170 |
last_branch[2] = length & 255; |
last_branch[2] = length & 255; |
2171 |
|
|
2172 |
|
/* Save the last required character if all branches have the same; a current |
2173 |
|
value of -1 means unset, while -2 means "previous branch had no last required |
2174 |
|
char". */ |
2175 |
|
|
2176 |
|
if (*reqchar != -2) |
2177 |
|
{ |
2178 |
|
if (branchreqchar >= 0) |
2179 |
|
{ |
2180 |
|
if (*reqchar == -1) *reqchar = branchreqchar; |
2181 |
|
else if (*reqchar != branchreqchar) *reqchar = -2; |
2182 |
|
} |
2183 |
|
else *reqchar = -2; |
2184 |
|
} |
2185 |
|
|
2186 |
|
/* Keep the shortest literal count */ |
2187 |
|
|
2188 |
|
if (branchcountlits < *countlits) *countlits = branchcountlits; |
2189 |
|
DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits)); |
2190 |
|
|
2191 |
/* If lookbehind, check that this branch matches a fixed-length string, |
/* If lookbehind, check that this branch matches a fixed-length string, |
2192 |
and put the length into the OP_REVERSE item. Temporarily mark the end of |
and put the length into the OP_REVERSE item. Temporarily mark the end of |
2193 |
the branch with OP_END. */ |
the branch with OP_END. */ |
2195 |
if (lookbehind) |
if (lookbehind) |
2196 |
{ |
{ |
2197 |
*code = OP_END; |
*code = OP_END; |
2198 |
length = find_fixedlength(last_branch); |
length = find_fixedlength(last_branch, options); |
2199 |
DPRINTF(("fixed length = %d\n", length)); |
DPRINTF(("fixed length = %d\n", length)); |
2200 |
if (length < 0) |
if (length < 0) |
2201 |
{ |
{ |
2282 |
code += 2; |
code += 2; |
2283 |
break; |
break; |
2284 |
|
|
2285 |
|
case OP_WORD_BOUNDARY: |
2286 |
|
case OP_NOT_WORD_BOUNDARY: |
2287 |
|
code++; |
2288 |
|
break; |
2289 |
|
|
2290 |
case OP_ASSERT_NOT: |
case OP_ASSERT_NOT: |
2291 |
case OP_ASSERTBACK: |
case OP_ASSERTBACK: |
2292 |
case OP_ASSERTBACK_NOT: |
case OP_ASSERTBACK_NOT: |
2314 |
it's anchored. However, if this is a multiline pattern, then only OP_SOD |
it's anchored. However, if this is a multiline pattern, then only OP_SOD |
2315 |
counts, since OP_CIRC can match in the middle. |
counts, since OP_CIRC can match in the middle. |
2316 |
|
|
2317 |
A branch is also implicitly anchored if it starts with .* because that will try |
A branch is also implicitly anchored if it starts with .* and DOTALL is set, |
2318 |
the rest of the pattern at all possible matching points, so there is no point |
because that will try the rest of the pattern at all possible matching points, |
2319 |
trying them again. |
so there is no point trying them again. |
2320 |
|
|
2321 |
Arguments: |
Arguments: |
2322 |
code points to start of expression (the bracket) |
code points to start of expression (the bracket) |
2334 |
register int op = *scode; |
register int op = *scode; |
2335 |
if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) |
if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) |
2336 |
{ if (!is_anchored(scode, options)) return FALSE; } |
{ if (!is_anchored(scode, options)) return FALSE; } |
2337 |
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) |
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) && |
2338 |
|
(*options & PCRE_DOTALL) != 0) |
2339 |
{ if (scode[1] != OP_ANY) return FALSE; } |
{ if (scode[1] != OP_ANY) return FALSE; } |
2340 |
else if (op != OP_SOD && |
else if (op != OP_SOD && |
2341 |
((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) |
((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC)) |
2349 |
|
|
2350 |
|
|
2351 |
/************************************************* |
/************************************************* |
2352 |
* Check for start with \n line expression * |
* Check for starting with ^ or .* * |
2353 |
*************************************************/ |
*************************************************/ |
2354 |
|
|
2355 |
/* This is called for multiline expressions to try to find out if every branch |
/* This is called to find out if every branch starts with ^ or .* so that |
2356 |
starts with ^ so that "first char" processing can be done to speed things up. |
"first char" processing can be done to speed things up in multiline |
2357 |
|
matching and for non-DOTALL patterns that start with .* (which must start at |
2358 |
|
the beginning or after \n). |
2359 |
|
|
2360 |
Argument: points to start of expression (the bracket) |
Argument: points to start of expression (the bracket) |
2361 |
Returns: TRUE or FALSE |
Returns: TRUE or FALSE |
2369 |
register int op = *scode; |
register int op = *scode; |
2370 |
if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) |
if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) |
2371 |
{ if (!is_startline(scode)) return FALSE; } |
{ if (!is_startline(scode)) return FALSE; } |
2372 |
|
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR) |
2373 |
|
{ if (scode[1] != OP_ANY) return FALSE; } |
2374 |
else if (op != OP_CIRC) return FALSE; |
else if (op != OP_CIRC) return FALSE; |
2375 |
code += (code[1] << 8) + code[2]; |
code += (code[1] << 8) + code[2]; |
2376 |
} |
} |
2469 |
real_pcre *re; |
real_pcre *re; |
2470 |
int length = 3; /* For initial BRA plus length */ |
int length = 3; /* For initial BRA plus length */ |
2471 |
int runlength; |
int runlength; |
2472 |
int c, size; |
int c, reqchar, countlits; |
2473 |
int bracount = 0; |
int bracount = 0; |
2474 |
int top_backref = 0; |
int top_backref = 0; |
2475 |
int branch_extra = 0; |
int branch_extra = 0; |
2476 |
int branch_newextra; |
int branch_newextra; |
2477 |
unsigned int brastackptr = 0; |
unsigned int brastackptr = 0; |
2478 |
|
size_t size; |
2479 |
uschar *code; |
uschar *code; |
2480 |
const uschar *ptr; |
const uschar *ptr; |
2481 |
compile_data compile_block; |
compile_data compile_block; |
2486 |
uschar *code_base, *code_end; |
uschar *code_base, *code_end; |
2487 |
#endif |
#endif |
2488 |
|
|
2489 |
|
/* Can't support UTF8 unless PCRE has been compiled to include the code. */ |
2490 |
|
|
2491 |
|
#ifndef SUPPORT_UTF8 |
2492 |
|
if ((options & PCRE_UTF8) != 0) |
2493 |
|
{ |
2494 |
|
*errorptr = ERR32; |
2495 |
|
return NULL; |
2496 |
|
} |
2497 |
|
#endif |
2498 |
|
|
2499 |
/* We can't pass back an error message if errorptr is NULL; I guess the best we |
/* We can't pass back an error message if errorptr is NULL; I guess the best we |
2500 |
can do is just return NULL. */ |
can do is just return NULL. */ |
2501 |
|
|
2548 |
if ((compile_block.ctypes[c] & ctype_space) != 0) continue; |
if ((compile_block.ctypes[c] & ctype_space) != 0) continue; |
2549 |
if (c == '#') |
if (c == '#') |
2550 |
{ |
{ |
2551 |
while ((c = *(++ptr)) != 0 && c != '\n'); |
/* The space before the ; is to avoid a warning on a silly compiler |
2552 |
|
on the Macintosh. */ |
2553 |
|
while ((c = *(++ptr)) != 0 && c != '\n') ; |
2554 |
continue; |
continue; |
2555 |
} |
} |
2556 |
} |
} |
2715 |
ptr += 2; |
ptr += 2; |
2716 |
break; |
break; |
2717 |
|
|
2718 |
|
/* A recursive call to the regex is an extension, to provide the |
2719 |
|
facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */ |
2720 |
|
|
2721 |
|
case 'R': |
2722 |
|
if (ptr[3] != ')') |
2723 |
|
{ |
2724 |
|
*errorptr = ERR29; |
2725 |
|
goto PCRE_ERROR_RETURN; |
2726 |
|
} |
2727 |
|
ptr += 3; |
2728 |
|
length += 1; |
2729 |
|
break; |
2730 |
|
|
2731 |
/* Lookbehinds are in Perl from version 5.005 */ |
/* Lookbehinds are in Perl from version 5.005 */ |
2732 |
|
|
2733 |
case '<': |
case '<': |
2760 |
else /* An assertion must follow */ |
else /* An assertion must follow */ |
2761 |
{ |
{ |
2762 |
ptr++; /* Can treat like ':' as far as spacing is concerned */ |
ptr++; /* Can treat like ':' as far as spacing is concerned */ |
2763 |
|
if (ptr[2] != '?' || |
2764 |
if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL) |
(ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') ) |
2765 |
{ |
{ |
2766 |
ptr += 2; /* To get right offset in message */ |
ptr += 2; /* To get right offset in message */ |
2767 |
*errorptr = ERR28; |
*errorptr = ERR28; |
2835 |
will lead to an over-estimate on the length, but this shouldn't |
will lead to an over-estimate on the length, but this shouldn't |
2836 |
matter very much. We also have to allow for resetting options at |
matter very much. We also have to allow for resetting options at |
2837 |
the start of any alternations, which we do by setting |
the start of any alternations, which we do by setting |
2838 |
branch_newextra to 2. */ |
branch_newextra to 2. Finally, we record whether the case-dependent |
2839 |
|
flag ever changes within the regex. This is used by the "required |
2840 |
|
character" code. */ |
2841 |
|
|
2842 |
case ':': |
case ':': |
2843 |
if (((set|unset) & PCRE_IMS) != 0) |
if (((set|unset) & PCRE_IMS) != 0) |
2844 |
{ |
{ |
2845 |
length += 4; |
length += 4; |
2846 |
branch_newextra = 2; |
branch_newextra = 2; |
2847 |
|
if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED; |
2848 |
} |
} |
2849 |
goto END_OPTIONS; |
goto END_OPTIONS; |
2850 |
|
|
2974 |
if ((compile_block.ctypes[c] & ctype_space) != 0) continue; |
if ((compile_block.ctypes[c] & ctype_space) != 0) continue; |
2975 |
if (c == '#') |
if (c == '#') |
2976 |
{ |
{ |
2977 |
while ((c = *(++ptr)) != 0 && c != '\n'); |
/* The space before the ; is to avoid a warning on a silly compiler |
2978 |
|
on the Macintosh. */ |
2979 |
|
while ((c = *(++ptr)) != 0 && c != '\n') ; |
2980 |
continue; |
continue; |
2981 |
} |
} |
2982 |
} |
} |
2991 |
&compile_block); |
&compile_block); |
2992 |
if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
2993 |
if (c < 0) { ptr = saveptr; break; } |
if (c < 0) { ptr = saveptr; break; } |
2994 |
|
|
2995 |
|
#ifdef SUPPORT_UTF8 |
2996 |
|
if (c > 127 && (options & PCRE_UTF8) != 0) |
2997 |
|
{ |
2998 |
|
int i; |
2999 |
|
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) |
3000 |
|
if (c <= utf8_table1[i]) break; |
3001 |
|
runlength += i; |
3002 |
|
} |
3003 |
|
#endif |
3004 |
} |
} |
3005 |
|
|
3006 |
/* Ordinary character or single-char escape */ |
/* Ordinary character or single-char escape */ |
3010 |
|
|
3011 |
/* This "while" is the end of the "do" above. */ |
/* This "while" is the end of the "do" above. */ |
3012 |
|
|
3013 |
while (runlength < 255 && |
while (runlength < MAXLIT && |
3014 |
(compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0); |
(compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0); |
3015 |
|
|
3016 |
ptr--; |
ptr--; |
3042 |
return NULL; |
return NULL; |
3043 |
} |
} |
3044 |
|
|
3045 |
/* Put in the magic number and the options. */ |
/* Put in the magic number, and save the size, options, and table pointer */ |
3046 |
|
|
3047 |
re->magic_number = MAGIC_NUMBER; |
re->magic_number = MAGIC_NUMBER; |
3048 |
|
re->size = size; |
3049 |
re->options = options; |
re->options = options; |
3050 |
re->tables = tables; |
re->tables = tables; |
3051 |
|
|
3058 |
*code = OP_BRA; |
*code = OP_BRA; |
3059 |
bracount = 0; |
bracount = 0; |
3060 |
(void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1, |
(void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1, |
3061 |
&compile_block); |
&reqchar, &countlits, &compile_block); |
3062 |
re->top_bracket = bracount; |
re->top_bracket = bracount; |
3063 |
re->top_backref = top_backref; |
re->top_backref = top_backref; |
3064 |
|
|
3090 |
return NULL; |
return NULL; |
3091 |
} |
} |
3092 |
|
|
3093 |
/* If the anchored option was not passed, set flag if we can determine that it |
/* If the anchored option was not passed, set flag if we can determine that the |
3094 |
is anchored by virtue of ^ characters or \A or anything else. Otherwise, see if |
pattern is anchored by virtue of ^ characters or \A or anything else (such as |
3095 |
we can determine what the first character has to be, because that speeds up |
starting with .* when DOTALL is set). |
3096 |
unanchored matches no end. In the case of multiline matches, an alternative is |
|
3097 |
to set the PCRE_STARTLINE flag if all branches start with ^. */ |
Otherwise, see if we can determine what the first character has to be, because |
3098 |
|
that speeds up unanchored matches no end. If not, see if we can set the |
3099 |
|
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches |
3100 |
|
start with ^, and also when all branches start with .* for non-DOTALL matches. |
3101 |
|
*/ |
3102 |
|
|
3103 |
if ((options & PCRE_ANCHORED) == 0) |
if ((options & PCRE_ANCHORED) == 0) |
3104 |
{ |
{ |
3118 |
} |
} |
3119 |
} |
} |
3120 |
|
|
3121 |
|
/* Save the last required character if there are at least two literal |
3122 |
|
characters on all paths, or if there is no first character setting. */ |
3123 |
|
|
3124 |
|
if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0)) |
3125 |
|
{ |
3126 |
|
re->req_char = reqchar; |
3127 |
|
re->options |= PCRE_REQCHSET; |
3128 |
|
} |
3129 |
|
|
3130 |
/* Print out the compiled data for debugging */ |
/* Print out the compiled data for debugging */ |
3131 |
|
|
3132 |
#ifdef DEBUG |
#ifdef DEBUG |
3136 |
|
|
3137 |
if (re->options != 0) |
if (re->options != 0) |
3138 |
{ |
{ |
3139 |
printf("%s%s%s%s%s%s%s%s\n", |
printf("%s%s%s%s%s%s%s%s%s\n", |
3140 |
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", |
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", |
3141 |
((re->options & PCRE_CASELESS) != 0)? "caseless " : "", |
((re->options & PCRE_CASELESS) != 0)? "caseless " : "", |
3142 |
|
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "", |
3143 |
((re->options & PCRE_EXTENDED) != 0)? "extended " : "", |
((re->options & PCRE_EXTENDED) != 0)? "extended " : "", |
3144 |
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", |
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", |
3145 |
((re->options & PCRE_DOTALL) != 0)? "dotall " : "", |
((re->options & PCRE_DOTALL) != 0)? "dotall " : "", |
3154 |
else printf("First char = \\x%02x\n", re->first_char); |
else printf("First char = \\x%02x\n", re->first_char); |
3155 |
} |
} |
3156 |
|
|
3157 |
|
if ((re->options & PCRE_REQCHSET) != 0) |
3158 |
|
{ |
3159 |
|
if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char); |
3160 |
|
else printf("Req char = \\x%02x\n", re->req_char); |
3161 |
|
} |
3162 |
|
|
3163 |
code_end = code; |
code_end = code; |
3164 |
code_base = code = re->code; |
code_base = code = re->code; |
3165 |
|
|
3393 |
|
|
3394 |
static BOOL |
static BOOL |
3395 |
match_ref(int offset, register const uschar *eptr, int length, match_data *md, |
match_ref(int offset, register const uschar *eptr, int length, match_data *md, |
3396 |
int ims) |
unsigned long int ims) |
3397 |
{ |
{ |
3398 |
const uschar *p = md->start_subject + md->offset_vector[offset]; |
const uschar *p = md->start_subject + md->offset_vector[offset]; |
3399 |
|
|
3444 |
offset_top current top pointer |
offset_top current top pointer |
3445 |
md pointer to "static" info for the match |
md pointer to "static" info for the match |
3446 |
ims current /i, /m, and /s options |
ims current /i, /m, and /s options |
3447 |
condassert TRUE if called to check a condition assertion |
eptrb pointer to chain of blocks containing eptr at start of |
3448 |
eptrb eptr at start of last bracket |
brackets - for testing for empty matches |
3449 |
|
flags can contain |
3450 |
|
match_condassert - this is an assertion condition |
3451 |
|
match_isgroup - this is the start of a bracketed group |
3452 |
|
|
3453 |
Returns: TRUE if matched |
Returns: TRUE if matched |
3454 |
*/ |
*/ |
3455 |
|
|
3456 |
static BOOL |
static BOOL |
3457 |
match(register const uschar *eptr, register const uschar *ecode, |
match(register const uschar *eptr, register const uschar *ecode, |
3458 |
int offset_top, match_data *md, int ims, BOOL condassert, const uschar *eptrb) |
int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, |
3459 |
|
int flags) |
3460 |
{ |
{ |
3461 |
int original_ims = ims; /* Save for resetting on ')' */ |
unsigned long int original_ims = ims; /* Save for resetting on ')' */ |
3462 |
|
eptrblock newptrb; |
3463 |
|
|
3464 |
|
/* At the start of a bracketed group, add the current subject pointer to the |
3465 |
|
stack of such pointers, to be re-instated at the end of the group when we hit |
3466 |
|
the closing ket. When match() is called in other circumstances, we don't add to |
3467 |
|
the stack. */ |
3468 |
|
|
3469 |
|
if ((flags & match_isgroup) != 0) |
3470 |
|
{ |
3471 |
|
newptrb.prev = eptrb; |
3472 |
|
newptrb.saved_eptr = eptr; |
3473 |
|
eptrb = &newptrb; |
3474 |
|
} |
3475 |
|
|
3476 |
|
/* Now start processing the operations. */ |
3477 |
|
|
3478 |
for (;;) |
for (;;) |
3479 |
{ |
{ |
3519 |
|
|
3520 |
do |
do |
3521 |
{ |
{ |
3522 |
if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE; |
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup)) |
3523 |
|
return TRUE; |
3524 |
ecode += (ecode[1] << 8) + ecode[2]; |
ecode += (ecode[1] << 8) + ecode[2]; |
3525 |
} |
} |
3526 |
while (*ecode == OP_ALT); |
while (*ecode == OP_ALT); |
3546 |
DPRINTF(("start bracket 0\n")); |
DPRINTF(("start bracket 0\n")); |
3547 |
do |
do |
3548 |
{ |
{ |
3549 |
if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE; |
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup)) |
3550 |
|
return TRUE; |
3551 |
ecode += (ecode[1] << 8) + ecode[2]; |
ecode += (ecode[1] << 8) + ecode[2]; |
3552 |
} |
} |
3553 |
while (*ecode == OP_ALT); |
while (*ecode == OP_ALT); |
3566 |
return match(eptr, |
return match(eptr, |
3567 |
ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)? |
ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)? |
3568 |
5 : 3 + (ecode[1] << 8) + ecode[2]), |
5 : 3 + (ecode[1] << 8) + ecode[2]), |
3569 |
offset_top, md, ims, FALSE, eptr); |
offset_top, md, ims, eptrb, match_isgroup); |
3570 |
} |
} |
3571 |
|
|
3572 |
/* The condition is an assertion. Call match() to evaluate it - setting |
/* The condition is an assertion. Call match() to evaluate it - setting |
3574 |
|
|
3575 |
else |
else |
3576 |
{ |
{ |
3577 |
if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL)) |
if (match(eptr, ecode+3, offset_top, md, ims, NULL, |
3578 |
|
match_condassert | match_isgroup)) |
3579 |
{ |
{ |
3580 |
ecode += 3 + (ecode[4] << 8) + ecode[5]; |
ecode += 3 + (ecode[4] << 8) + ecode[5]; |
3581 |
while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2]; |
while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2]; |
3582 |
} |
} |
3583 |
else ecode += (ecode[1] << 8) + ecode[2]; |
else ecode += (ecode[1] << 8) + ecode[2]; |
3584 |
return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr); |
return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup); |
3585 |
} |
} |
3586 |
/* Control never reaches here */ |
/* Control never reaches here */ |
3587 |
|
|
3591 |
ecode += 2; |
ecode += 2; |
3592 |
break; |
break; |
3593 |
|
|
3594 |
/* End of the pattern */ |
/* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched |
3595 |
|
an empty string - recursion will then try other alternatives, if any. */ |
3596 |
|
|
3597 |
case OP_END: |
case OP_END: |
3598 |
|
if (md->notempty && eptr == md->start_match) return FALSE; |
3599 |
md->end_match_ptr = eptr; /* Record where we ended */ |
md->end_match_ptr = eptr; /* Record where we ended */ |
3600 |
md->end_offset_top = offset_top; /* and how many extracts were taken */ |
md->end_offset_top = offset_top; /* and how many extracts were taken */ |
3601 |
return TRUE; |
return TRUE; |
3605 |
case OP_OPT: |
case OP_OPT: |
3606 |
ims = ecode[1]; |
ims = ecode[1]; |
3607 |
ecode += 2; |
ecode += 2; |
3608 |
DPRINTF(("ims set to %02x\n", ims)); |
DPRINTF(("ims set to %02lx\n", ims)); |
3609 |
break; |
break; |
3610 |
|
|
3611 |
/* Assertion brackets. Check the alternative branches in turn - the |
/* Assertion brackets. Check the alternative branches in turn - the |
3618 |
case OP_ASSERTBACK: |
case OP_ASSERTBACK: |
3619 |
do |
do |
3620 |
{ |
{ |
3621 |
if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break; |
if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break; |
3622 |
ecode += (ecode[1] << 8) + ecode[2]; |
ecode += (ecode[1] << 8) + ecode[2]; |
3623 |
} |
} |
3624 |
while (*ecode == OP_ALT); |
while (*ecode == OP_ALT); |
3626 |
|
|
3627 |
/* If checking an assertion for a condition, return TRUE. */ |
/* If checking an assertion for a condition, return TRUE. */ |
3628 |
|
|
3629 |
if (condassert) return TRUE; |
if ((flags & match_condassert) != 0) return TRUE; |
3630 |
|
|
3631 |
/* Continue from after the assertion, updating the offsets high water |
/* Continue from after the assertion, updating the offsets high water |
3632 |
mark, since extracts may have been taken during the assertion. */ |
mark, since extracts may have been taken during the assertion. */ |
3642 |
case OP_ASSERTBACK_NOT: |
case OP_ASSERTBACK_NOT: |
3643 |
do |
do |
3644 |
{ |
{ |
3645 |
if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE; |
if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) |
3646 |
|
return FALSE; |
3647 |
ecode += (ecode[1] << 8) + ecode[2]; |
ecode += (ecode[1] << 8) + ecode[2]; |
3648 |
} |
} |
3649 |
while (*ecode == OP_ALT); |
while (*ecode == OP_ALT); |
3650 |
|
|
3651 |
if (condassert) return TRUE; |
if ((flags & match_condassert) != 0) return TRUE; |
3652 |
|
|
3653 |
ecode += 3; |
ecode += 3; |
3654 |
continue; |
continue; |
3655 |
|
|
3656 |
/* Move the subject pointer back. This occurs only at the start of |
/* Move the subject pointer back. This occurs only at the start of |
3657 |
each branch of a lookbehind assertion. If we are too close to the start to |
each branch of a lookbehind assertion. If we are too close to the start to |
3658 |
move back, this match function fails. */ |
move back, this match function fails. When working with UTF-8 we move |
3659 |
|
back a number of characters, not bytes. */ |
3660 |
|
|
3661 |
case OP_REVERSE: |
case OP_REVERSE: |
3662 |
|
#ifdef SUPPORT_UTF8 |
3663 |
|
c = (ecode[1] << 8) + ecode[2]; |
3664 |
|
for (i = 0; i < c; i++) |
3665 |
|
{ |
3666 |
|
eptr--; |
3667 |
|
BACKCHAR(eptr) |
3668 |
|
} |
3669 |
|
#else |
3670 |
eptr -= (ecode[1] << 8) + ecode[2]; |
eptr -= (ecode[1] << 8) + ecode[2]; |
3671 |
|
#endif |
3672 |
|
|
3673 |
if (eptr < md->start_subject) return FALSE; |
if (eptr < md->start_subject) return FALSE; |
3674 |
ecode += 3; |
ecode += 3; |
3675 |
break; |
break; |
3676 |
|
|
3677 |
|
/* Recursion matches the current regex, nested. If there are any capturing |
3678 |
|
brackets started but not finished, we have to save their starting points |
3679 |
|
and reinstate them after the recursion. However, we don't know how many |
3680 |
|
such there are (offset_top records the completed total) so we just have |
3681 |
|
to save all the potential data. There may be up to 99 such values, which |
3682 |
|
is a bit large to put on the stack, but using malloc for small numbers |
3683 |
|
seems expensive. As a compromise, the stack is used when there are fewer |
3684 |
|
than 16 values to store; otherwise malloc is used. A problem is what to do |
3685 |
|
if the malloc fails ... there is no way of returning to the top level with |
3686 |
|
an error. Save the top 15 values on the stack, and accept that the rest |
3687 |
|
may be wrong. */ |
3688 |
|
|
3689 |
|
case OP_RECURSE: |
3690 |
|
{ |
3691 |
|
BOOL rc; |
3692 |
|
int *save; |
3693 |
|
int stacksave[15]; |
3694 |
|
|
3695 |
|
c = md->offset_max; |
3696 |
|
|
3697 |
|
if (c < 16) save = stacksave; else |
3698 |
|
{ |
3699 |
|
save = (int *)(pcre_malloc)((c+1) * sizeof(int)); |
3700 |
|
if (save == NULL) |
3701 |
|
{ |
3702 |
|
save = stacksave; |
3703 |
|
c = 15; |
3704 |
|
} |
3705 |
|
} |
3706 |
|
|
3707 |
|
for (i = 1; i <= c; i++) |
3708 |
|
save[i] = md->offset_vector[md->offset_end - i]; |
3709 |
|
rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb, |
3710 |
|
match_isgroup); |
3711 |
|
for (i = 1; i <= c; i++) |
3712 |
|
md->offset_vector[md->offset_end - i] = save[i]; |
3713 |
|
if (save != stacksave) (pcre_free)(save); |
3714 |
|
if (!rc) return FALSE; |
3715 |
|
|
3716 |
|
/* In case the recursion has set more capturing values, save the final |
3717 |
|
number, then move along the subject till after the recursive match, |
3718 |
|
and advance one byte in the pattern code. */ |
3719 |
|
|
3720 |
|
offset_top = md->end_offset_top; |
3721 |
|
eptr = md->end_match_ptr; |
3722 |
|
ecode++; |
3723 |
|
} |
3724 |
|
break; |
3725 |
|
|
3726 |
/* "Once" brackets are like assertion brackets except that after a match, |
/* "Once" brackets are like assertion brackets except that after a match, |
3727 |
the point in the subject string is not moved back. Thus there can never be |
the point in the subject string is not moved back. Thus there can never be |
3733 |
case OP_ONCE: |
case OP_ONCE: |
3734 |
{ |
{ |
3735 |
const uschar *prev = ecode; |
const uschar *prev = ecode; |
3736 |
|
const uschar *saved_eptr = eptr; |
3737 |
|
|
3738 |
do |
do |
3739 |
{ |
{ |
3740 |
if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break; |
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup)) |
3741 |
|
break; |
3742 |
ecode += (ecode[1] << 8) + ecode[2]; |
ecode += (ecode[1] << 8) + ecode[2]; |
3743 |
} |
} |
3744 |
while (*ecode == OP_ALT); |
while (*ecode == OP_ALT); |
3761 |
5.005. If there is an options reset, it will get obeyed in the normal |
5.005. If there is an options reset, it will get obeyed in the normal |
3762 |
course of events. */ |
course of events. */ |
3763 |
|
|
3764 |
if (*ecode == OP_KET || eptr == eptrb) |
if (*ecode == OP_KET || eptr == saved_eptr) |
3765 |
{ |
{ |
3766 |
ecode += 3; |
ecode += 3; |
3767 |
break; |
break; |
3775 |
if (ecode[3] == OP_OPT) |
if (ecode[3] == OP_OPT) |
3776 |
{ |
{ |
3777 |
ims = (ims & ~PCRE_IMS) | ecode[4]; |
ims = (ims & ~PCRE_IMS) | ecode[4]; |
3778 |
DPRINTF(("ims set to %02x at group repeat\n", ims)); |
DPRINTF(("ims set to %02lx at group repeat\n", ims)); |
3779 |
} |
} |
3780 |
|
|
3781 |
if (*ecode == OP_KETRMIN) |
if (*ecode == OP_KETRMIN) |
3782 |
{ |
{ |
3783 |
if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) || |
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) || |
3784 |
match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE; |
match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup)) |
3785 |
|
return TRUE; |
3786 |
} |
} |
3787 |
else /* OP_KETRMAX */ |
else /* OP_KETRMAX */ |
3788 |
{ |
{ |
3789 |
if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) || |
if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) || |
3790 |
match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE; |
match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE; |
3791 |
} |
} |
3792 |
} |
} |
3793 |
return FALSE; |
return FALSE; |
3808 |
case OP_BRAZERO: |
case OP_BRAZERO: |
3809 |
{ |
{ |
3810 |
const uschar *next = ecode+1; |
const uschar *next = ecode+1; |
3811 |
if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE; |
if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup)) |
3812 |
|
return TRUE; |
3813 |
do next += (next[1] << 8) + next[2]; while (*next == OP_ALT); |
do next += (next[1] << 8) + next[2]; while (*next == OP_ALT); |
3814 |
ecode = next + 3; |
ecode = next + 3; |
3815 |
} |
} |
3819 |
{ |
{ |
3820 |
const uschar *next = ecode+1; |
const uschar *next = ecode+1; |
3821 |
do next += (next[1] << 8) + next[2]; while (*next == OP_ALT); |
do next += (next[1] << 8) + next[2]; while (*next == OP_ALT); |
3822 |
if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE; |
if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup)) |
3823 |
|
return TRUE; |
3824 |
ecode++; |
ecode++; |
3825 |
} |
} |
3826 |
break; |
break; |
3835 |
case OP_KETRMAX: |
case OP_KETRMAX: |
3836 |
{ |
{ |
3837 |
const uschar *prev = ecode - (ecode[1] << 8) - ecode[2]; |
const uschar *prev = ecode - (ecode[1] << 8) - ecode[2]; |
3838 |
|
const uschar *saved_eptr = eptrb->saved_eptr; |
3839 |
|
|
3840 |
|
eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */ |
3841 |
|
|
3842 |
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || |
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || |
3843 |
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || |
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || |
3857 |
int number = *prev - OP_BRA; |
int number = *prev - OP_BRA; |
3858 |
int offset = number << 1; |
int offset = number << 1; |
3859 |
|
|
3860 |
DPRINTF(("end bracket %d\n", number)); |
#ifdef DEBUG |
3861 |
|
printf("end bracket %d", number); |
3862 |
|
printf("\n"); |
3863 |
|
#endif |
3864 |
|
|
3865 |
if (number > 0) |
if (number > 0) |
3866 |
{ |
{ |
3878 |
the group. */ |
the group. */ |
3879 |
|
|
3880 |
ims = original_ims; |
ims = original_ims; |
3881 |
DPRINTF(("ims reset to %02x\n", ims)); |
DPRINTF(("ims reset to %02lx\n", ims)); |
3882 |
|
|
3883 |
/* For a non-repeating ket, just continue at this level. This also |
/* For a non-repeating ket, just continue at this level. This also |
3884 |
happens for a repeating ket if no characters were matched in the group. |
happens for a repeating ket if no characters were matched in the group. |
3886 |
5.005. If there is an options reset, it will get obeyed in the normal |
5.005. If there is an options reset, it will get obeyed in the normal |
3887 |
course of events. */ |
course of events. */ |
3888 |
|
|
3889 |
if (*ecode == OP_KET || eptr == eptrb) |
if (*ecode == OP_KET || eptr == saved_eptr) |
3890 |
{ |
{ |
3891 |
ecode += 3; |
ecode += 3; |
3892 |
break; |
break; |
3897 |
|
|
3898 |
if (*ecode == OP_KETRMIN) |
if (*ecode == OP_KETRMIN) |
3899 |
{ |
{ |
3900 |
if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) || |
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) || |
3901 |
match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE; |
match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup)) |
3902 |
|
return TRUE; |
3903 |
} |
} |
3904 |
else /* OP_KETRMAX */ |
else /* OP_KETRMAX */ |
3905 |
{ |
{ |
3906 |
if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) || |
if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) || |
3907 |
match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE; |
match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE; |
3908 |
} |
} |
3909 |
} |
} |
3910 |
return FALSE; |
return FALSE; |
3989 |
if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n') |
if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n') |
3990 |
return FALSE; |
return FALSE; |
3991 |
if (eptr++ >= md->end_subject) return FALSE; |
if (eptr++ >= md->end_subject) return FALSE; |
3992 |
|
#ifdef SUPPORT_UTF8 |
3993 |
|
if (md->utf8) |
3994 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
3995 |
|
#endif |
3996 |
ecode++; |
ecode++; |
3997 |
break; |
break; |
3998 |
|
|
4119 |
{ |
{ |
4120 |
for (i = min;; i++) |
for (i = min;; i++) |
4121 |
{ |
{ |
4122 |
if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
4123 |
return TRUE; |
return TRUE; |
4124 |
if (i >= max || !match_ref(offset, eptr, length, md, ims)) |
if (i >= max || !match_ref(offset, eptr, length, md, ims)) |
4125 |
return FALSE; |
return FALSE; |
4140 |
} |
} |
4141 |
while (eptr >= pp) |
while (eptr >= pp) |
4142 |
{ |
{ |
4143 |
if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
4144 |
return TRUE; |
return TRUE; |
4145 |
eptr -= length; |
eptr -= length; |
4146 |
} |
} |
4194 |
for (i = 1; i <= min; i++) |
for (i = 1; i <= min; i++) |
4195 |
{ |
{ |
4196 |
if (eptr >= md->end_subject) return FALSE; |
if (eptr >= md->end_subject) return FALSE; |
4197 |
c = *eptr++; |
GETCHARINC(c, eptr) /* Get character; increment eptr */ |
4198 |
|
|
4199 |
|
#ifdef SUPPORT_UTF8 |
4200 |
|
/* We do not yet support class members > 255 */ |
4201 |
|
if (c > 255) return FALSE; |
4202 |
|
#endif |
4203 |
|
|
4204 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
4205 |
return FALSE; |
return FALSE; |
4206 |
} |
} |
4217 |
{ |
{ |
4218 |
for (i = min;; i++) |
for (i = min;; i++) |
4219 |
{ |
{ |
4220 |
if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
4221 |
return TRUE; |
return TRUE; |
4222 |
if (i >= max || eptr >= md->end_subject) return FALSE; |
if (i >= max || eptr >= md->end_subject) return FALSE; |
4223 |
c = *eptr++; |
GETCHARINC(c, eptr) /* Get character; increment eptr */ |
4224 |
|
|
4225 |
|
#ifdef SUPPORT_UTF8 |
4226 |
|
/* We do not yet support class members > 255 */ |
4227 |
|
if (c > 255) return FALSE; |
4228 |
|
#endif |
4229 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
4230 |
return FALSE; |
return FALSE; |
4231 |
} |
} |
4237 |
else |
else |
4238 |
{ |
{ |
4239 |
const uschar *pp = eptr; |
const uschar *pp = eptr; |
4240 |
for (i = min; i < max; eptr++, i++) |
int len = 1; |
4241 |
|
for (i = min; i < max; i++) |
4242 |
{ |
{ |
4243 |
if (eptr >= md->end_subject) break; |
if (eptr >= md->end_subject) break; |
4244 |
c = *eptr; |
GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */ |
4245 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
|
4246 |
break; |
#ifdef SUPPORT_UTF8 |
4247 |
|
/* We do not yet support class members > 255 */ |
4248 |
|
if (c > 255) break; |
4249 |
|
#endif |
4250 |
|
if ((data[c/8] & (1 << (c&7))) == 0) break; |
4251 |
|
eptr += len; |
4252 |
} |
} |
4253 |
|
|
4254 |
while (eptr >= pp) |
while (eptr >= pp) |
4255 |
if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb)) |
{ |
4256 |
|
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
4257 |
return TRUE; |
return TRUE; |
4258 |
|
|
4259 |
|
#ifdef SUPPORT_UTF8 |
4260 |
|
BACKCHAR(eptr) |
4261 |
|
#endif |
4262 |
|
} |
4263 |
return FALSE; |
return FALSE; |
4264 |
} |
} |
4265 |
} |
} |
4355 |
{ |
{ |
4356 |
for (i = min;; i++) |
for (i = min;; i++) |
4357 |
{ |
{ |
4358 |
if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
4359 |
return TRUE; |
return TRUE; |
4360 |
if (i >= max || eptr >= md->end_subject || |
if (i >= max || eptr >= md->end_subject || |
4361 |
c != md->lcc[*eptr++]) |
c != md->lcc[*eptr++]) |
4372 |
eptr++; |
eptr++; |
4373 |
} |
} |
4374 |
while (eptr >= pp) |
while (eptr >= pp) |
4375 |
if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
4376 |
return TRUE; |
return TRUE; |
4377 |
return FALSE; |
return FALSE; |
4378 |
} |
} |
4389 |
{ |
{ |
4390 |
for (i = min;; i++) |
for (i = min;; i++) |
4391 |
{ |
{ |
4392 |
if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
4393 |
return TRUE; |
return TRUE; |
4394 |
if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE; |
if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE; |
4395 |
} |
} |
4404 |
eptr++; |
eptr++; |
4405 |
} |
} |
4406 |
while (eptr >= pp) |
while (eptr >= pp) |
4407 |
if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
4408 |
return TRUE; |
return TRUE; |
4409 |
return FALSE; |
return FALSE; |
4410 |
} |
} |
4486 |
{ |
{ |
4487 |
for (i = min;; i++) |
for (i = min;; i++) |
4488 |
{ |
{ |
4489 |
if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
4490 |
return TRUE; |
return TRUE; |
4491 |
if (i >= max || eptr >= md->end_subject || |
if (i >= max || eptr >= md->end_subject || |
4492 |
c == md->lcc[*eptr++]) |
c == md->lcc[*eptr++]) |
4503 |
eptr++; |
eptr++; |
4504 |
} |
} |
4505 |
while (eptr >= pp) |
while (eptr >= pp) |
4506 |
if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
4507 |
return TRUE; |
return TRUE; |
4508 |
return FALSE; |
return FALSE; |
4509 |
} |
} |
4520 |
{ |
{ |
4521 |
for (i = min;; i++) |
for (i = min;; i++) |
4522 |
{ |
{ |
4523 |
if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
4524 |
return TRUE; |
return TRUE; |
4525 |
if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE; |
if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE; |
4526 |
} |
} |
4535 |
eptr++; |
eptr++; |
4536 |
} |
} |
4537 |
while (eptr >= pp) |
while (eptr >= pp) |
4538 |
if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb)) |
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
4539 |
return TRUE; |
return TRUE; |
4540 |
return FALSE; |
return FALSE; |
4541 |
} |
} |
4579 |
|
|
4580 |
/* First, ensure the minimum number of matches are present. Use inline |
/* First, ensure the minimum number of matches are present. Use inline |
4581 |
code for maximizing the speed, and do the type test once at the start |
code for maximizing the speed, and do the type test once at the start |
4582 |
(i.e. keep it out of the loop). Also test that there are at least the |
(i.e. keep it out of the loop). Also we can test that there are at least |
4583 |
minimum number of characters before we start. */ |
the minimum number of bytes before we start, except when doing '.' in |
4584 |
|
UTF8 mode. Leave the test in in all cases; in the special case we have |
4585 |
|
to test after each character. */ |
4586 |
|
|
4587 |
if (min > md->end_subject - eptr) return FALSE; |
if (min > md->end_subject - eptr) return FALSE; |
4588 |
if (min > 0) switch(ctype) |
if (min > 0) switch(ctype) |
4589 |
{ |
{ |
4590 |
case OP_ANY: |
case OP_ANY: |
4591 |
|
#ifdef SUPPORT_UTF8 |
4592 |
|
if (md->utf8) |
4593 |
|
{ |
4594 |
|
for (i = 1; i <= min; i++) |
4595 |
|
{ |
4596 |
|
if (eptr >= md->end_subject || |
4597 |
|
(*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0)) |
4598 |
|
return FALSE; |
4599 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
4600 |
|
} |
4601 |
|
break; |
4602 |
|
} |
4603 |
|
#endif |
4604 |
|
/* Non-UTF8 can be faster */ |
4605 |
if ((ims & PCRE_DOTALL) == 0) |
if ((ims & PCRE_DOTALL) == 0) |
4606 |
{ for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; } |
{ for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; } |
4607 |
else eptr += min; |
else eptr += min; |
4651 |
{ |
{ |
4652 |
for (i = min;; i++) |
for (i = min;; i++) |
4653 |
{ |
{ |
4654 |
if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE; |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE; |
4655 |
if (i >= max || eptr >= md->end_subject) return FALSE; |
if (i >= max || eptr >= md->end_subject) return FALSE; |
4656 |
|
|
4657 |
c = *eptr++; |
c = *eptr++; |
4659 |
{ |
{ |
4660 |
case OP_ANY: |
case OP_ANY: |
4661 |
if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE; |
if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE; |
4662 |
|
#ifdef SUPPORT_UTF8 |
4663 |
|
if (md->utf8) |
4664 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
4665 |
|
#endif |
4666 |
break; |
break; |
4667 |
|
|
4668 |
case OP_NOT_DIGIT: |
case OP_NOT_DIGIT: |
4702 |
switch(ctype) |
switch(ctype) |
4703 |
{ |
{ |
4704 |
case OP_ANY: |
case OP_ANY: |
4705 |
|
|
4706 |
|
/* Special code is required for UTF8, but when the maximum is unlimited |
4707 |
|
we don't need it. */ |
4708 |
|
|
4709 |
|
#ifdef SUPPORT_UTF8 |
4710 |
|
if (md->utf8 && max < INT_MAX) |
4711 |
|
{ |
4712 |
|
if ((ims & PCRE_DOTALL) == 0) |
4713 |
|
{ |
4714 |
|
for (i = min; i < max; i++) |
4715 |
|
{ |
4716 |
|
if (eptr >= md->end_subject || *eptr++ == '\n') break; |
4717 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
4718 |
|
} |
4719 |
|
} |
4720 |
|
else |
4721 |
|
{ |
4722 |
|
for (i = min; i < max; i++) |
4723 |
|
{ |
4724 |
|
eptr++; |
4725 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
4726 |
|
} |
4727 |
|
} |
4728 |
|
break; |
4729 |
|
} |
4730 |
|
#endif |
4731 |
|
/* Non-UTF8 can be faster */ |
4732 |
if ((ims & PCRE_DOTALL) == 0) |
if ((ims & PCRE_DOTALL) == 0) |
4733 |
{ |
{ |
4734 |
for (i = min; i < max; i++) |
for (i = min; i < max; i++) |
4801 |
} |
} |
4802 |
|
|
4803 |
while (eptr >= pp) |
while (eptr >= pp) |
4804 |
if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb)) |
{ |
4805 |
|
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
4806 |
return TRUE; |
return TRUE; |
4807 |
|
#ifdef SUPPORT_UTF8 |
4808 |
|
if (md->utf8) |
4809 |
|
while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--; |
4810 |
|
#endif |
4811 |
|
} |
4812 |
return FALSE; |
return FALSE; |
4813 |
} |
} |
4814 |
/* Control never gets here */ |
/* Control never gets here */ |
4845 |
external_extra points to "hints" from pcre_study() or is NULL |
external_extra points to "hints" from pcre_study() or is NULL |
4846 |
subject points to the subject string |
subject points to the subject string |
4847 |
length length of subject string (may contain binary zeros) |
length length of subject string (may contain binary zeros) |
4848 |
|
start_offset where to start in the subject string |
4849 |
options option bits |
options option bits |
4850 |
offsets points to a vector of ints to be filled in with offsets |
offsets points to a vector of ints to be filled in with offsets |
4851 |
offsetcount the number of elements in the vector |
offsetcount the number of elements in the vector |
4858 |
|
|
4859 |
int |
int |
4860 |
pcre_exec(const pcre *external_re, const pcre_extra *external_extra, |
pcre_exec(const pcre *external_re, const pcre_extra *external_extra, |
4861 |
const char *subject, int length, int options, int *offsets, int offsetcount) |
const char *subject, int length, int start_offset, int options, int *offsets, |
4862 |
|
int offsetcount) |
4863 |
{ |
{ |
4864 |
int resetcount, ocount; |
int resetcount, ocount; |
4865 |
int first_char = -1; |
int first_char = -1; |
4866 |
int ims = 0; |
int req_char = -1; |
4867 |
|
int req_char2 = -1; |
4868 |
|
unsigned long int ims = 0; |
4869 |
match_data match_block; |
match_data match_block; |
4870 |
const uschar *start_bits = NULL; |
const uschar *start_bits = NULL; |
4871 |
const uschar *start_match = (const uschar *)subject; |
const uschar *start_match = (const uschar *)subject + start_offset; |
4872 |
const uschar *end_subject; |
const uschar *end_subject; |
4873 |
|
const uschar *req_char_ptr = start_match - 1; |
4874 |
const real_pcre *re = (const real_pcre *)external_re; |
const real_pcre *re = (const real_pcre *)external_re; |
4875 |
const real_pcre_extra *extra = (const real_pcre_extra *)external_extra; |
const real_pcre_extra *extra = (const real_pcre_extra *)external_extra; |
4876 |
BOOL using_temporary_offsets = FALSE; |
BOOL using_temporary_offsets = FALSE; |
4883 |
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; |
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; |
4884 |
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; |
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC; |
4885 |
|
|
4886 |
|
match_block.start_pattern = re->code; |
4887 |
match_block.start_subject = (const uschar *)subject; |
match_block.start_subject = (const uschar *)subject; |
4888 |
match_block.end_subject = match_block.start_subject + length; |
match_block.end_subject = match_block.start_subject + length; |
4889 |
end_subject = match_block.end_subject; |
end_subject = match_block.end_subject; |
4890 |
|
|
4891 |
match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; |
match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; |
4892 |
|
match_block.utf8 = (re->options & PCRE_UTF8) != 0; |
4893 |
|
|
4894 |
match_block.notbol = (options & PCRE_NOTBOL) != 0; |
match_block.notbol = (options & PCRE_NOTBOL) != 0; |
4895 |
match_block.noteol = (options & PCRE_NOTEOL) != 0; |
match_block.noteol = (options & PCRE_NOTEOL) != 0; |
4896 |
|
match_block.notempty = (options & PCRE_NOTEMPTY) != 0; |
4897 |
|
|
4898 |
match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */ |
match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */ |
4899 |
|
|
4964 |
start_bits = extra->start_bits; |
start_bits = extra->start_bits; |
4965 |
} |
} |
4966 |
|
|
4967 |
/* Loop for unanchored matches; for anchored regexps the loop runs just once. */ |
/* For anchored or unanchored matches, there may be a "last known required |
4968 |
|
character" set. If the PCRE_CASELESS is set, implying that the match starts |
4969 |
|
caselessly, or if there are any changes of this flag within the regex, set up |
4970 |
|
both cases of the character. Otherwise set the two values the same, which will |
4971 |
|
avoid duplicate testing (which takes significant time). This covers the vast |
4972 |
|
majority of cases. It will be suboptimal when the case flag changes in a regex |
4973 |
|
and the required character in fact is caseful. */ |
4974 |
|
|
4975 |
|
if ((re->options & PCRE_REQCHSET) != 0) |
4976 |
|
{ |
4977 |
|
req_char = re->req_char; |
4978 |
|
req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)? |
4979 |
|
(re->tables + fcc_offset)[req_char] : req_char; |
4980 |
|
} |
4981 |
|
|
4982 |
|
/* Loop for handling unanchored repeated matching attempts; for anchored regexs |
4983 |
|
the loop runs just once. */ |
4984 |
|
|
4985 |
do |
do |
4986 |
{ |
{ |
5009 |
|
|
5010 |
else if (startline) |
else if (startline) |
5011 |
{ |
{ |
5012 |
if (start_match > match_block.start_subject) |
if (start_match > match_block.start_subject + start_offset) |
5013 |
{ |
{ |
5014 |
while (start_match < end_subject && start_match[-1] != '\n') |
while (start_match < end_subject && start_match[-1] != '\n') |
5015 |
start_match++; |
start_match++; |
5016 |
} |
} |
5017 |
} |
} |
5018 |
|
|
5019 |
/* Or to a non-unique first char */ |
/* Or to a non-unique first char after study */ |
5020 |
|
|
5021 |
else if (start_bits != NULL) |
else if (start_bits != NULL) |
5022 |
{ |
{ |
5033 |
printf("\n"); |
printf("\n"); |
5034 |
#endif |
#endif |
5035 |
|
|
5036 |
|
/* If req_char is set, we know that that character must appear in the subject |
5037 |
|
for the match to succeed. If the first character is set, req_char must be |
5038 |
|
later in the subject; otherwise the test starts at the match point. This |
5039 |
|
optimization can save a huge amount of backtracking in patterns with nested |
5040 |
|
unlimited repeats that aren't going to match. We don't know what the state of |
5041 |
|
case matching may be when this character is hit, so test for it in both its |
5042 |
|
cases if necessary. However, the different cased versions will not be set up |
5043 |
|
unless PCRE_CASELESS was given or the casing state changes within the regex. |
5044 |
|
Writing separate code makes it go faster, as does using an autoincrement and |
5045 |
|
backing off on a match. */ |
5046 |
|
|
5047 |
|
if (req_char >= 0) |
5048 |
|
{ |
5049 |
|
register const uschar *p = start_match + ((first_char >= 0)? 1 : 0); |
5050 |
|
|
5051 |
|
/* We don't need to repeat the search if we haven't yet reached the |
5052 |
|
place we found it at last time. */ |
5053 |
|
|
5054 |
|
if (p > req_char_ptr) |
5055 |
|
{ |
5056 |
|
/* Do a single test if no case difference is set up */ |
5057 |
|
|
5058 |
|
if (req_char == req_char2) |
5059 |
|
{ |
5060 |
|
while (p < end_subject) |
5061 |
|
{ |
5062 |
|
if (*p++ == req_char) { p--; break; } |
5063 |
|
} |
5064 |
|
} |
5065 |
|
|
5066 |
|
/* Otherwise test for either case */ |
5067 |
|
|
5068 |
|
else |
5069 |
|
{ |
5070 |
|
while (p < end_subject) |
5071 |
|
{ |
5072 |
|
register int pp = *p++; |
5073 |
|
if (pp == req_char || pp == req_char2) { p--; break; } |
5074 |
|
} |
5075 |
|
} |
5076 |
|
|
5077 |
|
/* If we can't find the required character, break the matching loop */ |
5078 |
|
|
5079 |
|
if (p >= end_subject) break; |
5080 |
|
|
5081 |
|
/* If we have found the required character, save the point where we |
5082 |
|
found it, so that we don't search again next time round the loop if |
5083 |
|
the start hasn't passed this character yet. */ |
5084 |
|
|
5085 |
|
req_char_ptr = p; |
5086 |
|
} |
5087 |
|
} |
5088 |
|
|
5089 |
/* When a match occurs, substrings will be set for all internal extractions; |
/* When a match occurs, substrings will be set for all internal extractions; |
5090 |
we just need to set up the whole thing as substring 0 before returning. If |
we just need to set up the whole thing as substring 0 before returning. If |
5091 |
there were too many extractions, set the return code to zero. In the case |
there were too many extractions, set the return code to zero. In the case |
5093 |
those back references that we can. In this case there need not be overflow |
those back references that we can. In this case there need not be overflow |
5094 |
if certain parts of the pattern were not used. */ |
if certain parts of the pattern were not used. */ |
5095 |
|
|
5096 |
if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match)) |
match_block.start_match = start_match; |
5097 |
|
if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup)) |
5098 |
continue; |
continue; |
5099 |
|
|
5100 |
/* Copy the offset information from temporary store if necessary */ |
/* Copy the offset information from temporary store if necessary */ |