66 |
#define BRASTACK_SIZE 200 |
#define BRASTACK_SIZE 200 |
67 |
|
|
68 |
|
|
69 |
|
/* The number of bytes in a literal character string above which we can't add |
70 |
|
any more is different when UTF-8 characters may be encountered. */ |
71 |
|
|
72 |
|
#ifdef SUPPORT_UTF8 |
73 |
|
#define MAXLIT 250 |
74 |
|
#else |
75 |
|
#define MAXLIT 255 |
76 |
|
#endif |
77 |
|
|
78 |
|
|
79 |
/* Min and max values for the common repeats; for the maxima, 0 => infinity */ |
/* Min and max values for the common repeats; for the maxima, 0 => infinity */ |
80 |
|
|
81 |
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; |
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; |
186 |
|
|
187 |
|
|
188 |
|
|
189 |
|
/************************************************* |
190 |
|
* Macros and tables for character handling * |
191 |
|
*************************************************/ |
192 |
|
|
193 |
|
/* When UTF-8 encoding is being used, a character is no longer just a single |
194 |
|
byte. The macros for character handling generate simple sequences when used in |
195 |
|
byte-mode, and more complicated ones for UTF-8 characters. */ |
196 |
|
|
197 |
|
#ifndef SUPPORT_UTF8 |
198 |
|
#define GETCHARINC(c, eptr) c = *eptr++; |
199 |
|
#define GETCHARLEN(c, eptr, len) c = *eptr; |
200 |
|
#define BACKCHAR(eptr) |
201 |
|
|
202 |
|
#else /* SUPPORT_UTF8 */ |
203 |
|
|
204 |
|
/* Get the next UTF-8 character, advancing the pointer */ |
205 |
|
|
206 |
|
#define GETCHARINC(c, eptr) \ |
207 |
|
c = *eptr++; \ |
208 |
|
if (md->utf8 && (c & 0xc0) == 0xc0) \ |
209 |
|
{ \ |
210 |
|
int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ |
211 |
|
int s = 6 - a; /* Amount to shift next byte */ \ |
212 |
|
c &= utf8_table3[a]; /* Low order bits from first byte */ \ |
213 |
|
while (a-- > 0) \ |
214 |
|
{ \ |
215 |
|
c |= (*eptr++ & 0x3f) << s; \ |
216 |
|
s += 6; \ |
217 |
|
} \ |
218 |
|
} |
219 |
|
|
220 |
|
/* Get the next UTF-8 character, not advancing the pointer, setting length */ |
221 |
|
|
222 |
|
#define GETCHARLEN(c, eptr, len) \ |
223 |
|
c = *eptr; \ |
224 |
|
len = 1; \ |
225 |
|
if (md->utf8 && (c & 0xc0) == 0xc0) \ |
226 |
|
{ \ |
227 |
|
int i; \ |
228 |
|
int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \ |
229 |
|
int s = 6 - a; /* Amount to shift next byte */ \ |
230 |
|
c &= utf8_table3[a]; /* Low order bits from first byte */ \ |
231 |
|
for (i = 1; i <= a; i++) \ |
232 |
|
{ \ |
233 |
|
c |= (eptr[i] & 0x3f) << s; \ |
234 |
|
s += 6; \ |
235 |
|
} \ |
236 |
|
len += a; \ |
237 |
|
} |
238 |
|
|
239 |
|
/* If the pointer is not at the start of a character, move it back until |
240 |
|
it is. */ |
241 |
|
|
242 |
|
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--; |
243 |
|
|
244 |
|
#endif |
245 |
|
|
246 |
|
|
247 |
|
|
248 |
/************************************************* |
/************************************************* |
249 |
* Default character tables * |
* Default character tables * |
259 |
|
|
260 |
|
|
261 |
|
|
262 |
|
#ifdef SUPPORT_UTF8 |
263 |
|
/************************************************* |
264 |
|
* Tables for UTF-8 support * |
265 |
|
*************************************************/ |
266 |
|
|
267 |
|
/* These are the breakpoints for different numbers of bytes in a UTF-8 |
268 |
|
character. */ |
269 |
|
|
270 |
|
static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; |
271 |
|
|
272 |
|
/* These are the indicator bits and the mask for the data bits to set in the |
273 |
|
first byte of a character, indexed by the number of additional bytes. */ |
274 |
|
|
275 |
|
static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; |
276 |
|
static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; |
277 |
|
|
278 |
|
/* Table of the number of extra bytes, indexed by the first byte |
279 |
|
masked with 0x3f. The highest number for a valid UTF-8 character is in fact |
280 |
|
0x3d. */ |
281 |
|
|
282 |
|
static uschar utf8_table4[] = { |
283 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
284 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
285 |
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
286 |
|
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; |
287 |
|
|
288 |
|
|
289 |
|
/************************************************* |
290 |
|
* Convert character value to UTF-8 * |
291 |
|
*************************************************/ |
292 |
|
|
293 |
|
/* This function takes an integer value in the range 0 - 0x7fffffff |
294 |
|
and encodes it as a UTF-8 character in 1 to 6 bytes. |
295 |
|
|
296 |
|
Arguments: |
297 |
|
cvalue the character value |
298 |
|
buffer pointer to buffer for result - at least 6 bytes long |
299 |
|
|
300 |
|
Returns: number of bytes placed in the buffer |
301 |
|
*/ |
302 |
|
|
303 |
|
static int |
304 |
|
ord2utf8(int cvalue, uschar *buffer) |
305 |
|
{ |
306 |
|
register int i, j; |
307 |
|
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) |
308 |
|
if (cvalue <= utf8_table1[i]) break; |
309 |
|
*buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]); |
310 |
|
cvalue >>= 6 - i; |
311 |
|
for (j = 0; j < i; j++) |
312 |
|
{ |
313 |
|
*buffer++ = 0x80 | (cvalue & 0x3f); |
314 |
|
cvalue >>= 6; |
315 |
|
} |
316 |
|
return i + 1; |
317 |
|
} |
318 |
|
#endif |
319 |
|
|
320 |
|
|
321 |
|
|
322 |
/************************************************* |
/************************************************* |
323 |
* Return version string * |
* Return version string * |
324 |
*************************************************/ |
*************************************************/ |
477 |
|
|
478 |
/* This function is called when a \ has been encountered. It either returns a |
/* This function is called when a \ has been encountered. It either returns a |
479 |
positive value for a simple escape such as \n, or a negative value which |
positive value for a simple escape such as \n, or a negative value which |
480 |
encodes one of the more complicated things such as \d. On entry, ptr is |
encodes one of the more complicated things such as \d. When UTF-8 is enabled, |
481 |
pointing at the \. On exit, it is on the final character of the escape |
a positive value greater than 255 may be returned. On entry, ptr is pointing at |
482 |
sequence. |
the \. On exit, it is on the final character of the escape sequence. |
483 |
|
|
484 |
Arguments: |
Arguments: |
485 |
ptrptr points to the pattern position pointer |
ptrptr points to the pattern position pointer |
501 |
const uschar *ptr = *ptrptr; |
const uschar *ptr = *ptrptr; |
502 |
int c, i; |
int c, i; |
503 |
|
|
504 |
c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */ |
/* If backslash is at the end of the pattern, it's an error. */ |
505 |
|
|
506 |
|
c = *(++ptr); |
507 |
if (c == 0) *errorptr = ERR1; |
if (c == 0) *errorptr = ERR1; |
508 |
|
|
509 |
/* Digits or letters may have special meaning; all others are literals. */ |
/* Digits or letters may have special meaning; all others are literals. */ |
563 |
} |
} |
564 |
|
|
565 |
/* \0 always starts an octal number, but we may drop through to here with a |
/* \0 always starts an octal number, but we may drop through to here with a |
566 |
larger first octal digit */ |
larger first octal digit. */ |
567 |
|
|
568 |
case '0': |
case '0': |
569 |
c -= '0'; |
c -= '0'; |
570 |
while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 && |
while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 && |
571 |
ptr[1] != '8' && ptr[1] != '9') |
ptr[1] != '8' && ptr[1] != '9') |
572 |
c = c * 8 + *(++ptr) - '0'; |
c = c * 8 + *(++ptr) - '0'; |
573 |
|
c &= 255; /* Take least significant 8 bits */ |
574 |
break; |
break; |
575 |
|
|
576 |
/* Special escapes not starting with a digit are straightforward */ |
/* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number |
577 |
|
which can be greater than 0xff, but only if the ddd are hex digits. */ |
578 |
|
|
579 |
case 'x': |
case 'x': |
580 |
|
#ifdef SUPPORT_UTF8 |
581 |
|
if (ptr[1] == '{' && (options & PCRE_UTF8) != 0) |
582 |
|
{ |
583 |
|
const uschar *pt = ptr + 2; |
584 |
|
register int count = 0; |
585 |
|
c = 0; |
586 |
|
while ((cd->ctypes[*pt] & ctype_xdigit) != 0) |
587 |
|
{ |
588 |
|
count++; |
589 |
|
c = c * 16 + cd->lcc[*pt] - |
590 |
|
(((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W'); |
591 |
|
pt++; |
592 |
|
} |
593 |
|
if (*pt == '}') |
594 |
|
{ |
595 |
|
if (c < 0 || count > 8) *errorptr = ERR34; |
596 |
|
ptr = pt; |
597 |
|
break; |
598 |
|
} |
599 |
|
/* If the sequence of hex digits does not end with '}', then we don't |
600 |
|
recognize this construct; fall through to the normal \x handling. */ |
601 |
|
} |
602 |
|
#endif |
603 |
|
|
604 |
|
/* Read just a single hex char */ |
605 |
|
|
606 |
c = 0; |
c = 0; |
607 |
while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0) |
while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0) |
608 |
{ |
{ |
612 |
} |
} |
613 |
break; |
break; |
614 |
|
|
615 |
|
/* Other special escapes not starting with a digit are straightforward */ |
616 |
|
|
617 |
case 'c': |
case 'c': |
618 |
c = *(++ptr); |
c = *(++ptr); |
619 |
if (c == 0) |
if (c == 0) |
751 |
|
|
752 |
Arguments: |
Arguments: |
753 |
code points to the start of the pattern (the bracket) |
code points to the start of the pattern (the bracket) |
754 |
|
options the compiling options |
755 |
|
|
756 |
Returns: the fixed length, or -1 if there is no fixed length |
Returns: the fixed length, or -1 if there is no fixed length |
757 |
*/ |
*/ |
758 |
|
|
759 |
static int |
static int |
760 |
find_fixedlength(uschar *code) |
find_fixedlength(uschar *code, int options) |
761 |
{ |
{ |
762 |
int length = -1; |
int length = -1; |
763 |
|
|
778 |
case OP_BRA: |
case OP_BRA: |
779 |
case OP_ONCE: |
case OP_ONCE: |
780 |
case OP_COND: |
case OP_COND: |
781 |
d = find_fixedlength(cc); |
d = find_fixedlength(cc, options); |
782 |
if (d < 0) return -1; |
if (d < 0) return -1; |
783 |
branchlength += d; |
branchlength += d; |
784 |
do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT); |
do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT); |
832 |
cc++; |
cc++; |
833 |
break; |
break; |
834 |
|
|
835 |
/* Handle char strings */ |
/* Handle char strings. In UTF-8 mode we must count characters, not bytes. |
836 |
|
This requires a scan of the string, unfortunately. We assume valid UTF-8 |
837 |
|
strings, so all we do is reduce the length by one for each byte whose bits are |
838 |
|
10xxxxxx. */ |
839 |
|
|
840 |
case OP_CHARS: |
case OP_CHARS: |
841 |
branchlength += *(++cc); |
branchlength += *(++cc); |
842 |
|
#ifdef SUPPORT_UTF8 |
843 |
|
for (d = 1; d <= *cc; d++) |
844 |
|
if ((cc[d] & 0xc0) == 0x80) branchlength--; |
845 |
|
#endif |
846 |
cc += *cc + 1; |
cc += *cc + 1; |
847 |
break; |
break; |
848 |
|
|
1222 |
goto FAILED; |
goto FAILED; |
1223 |
} |
} |
1224 |
} |
} |
1225 |
/* Fall through if single character */ |
|
1226 |
|
/* Fall through if single character, but don't at present allow |
1227 |
|
chars > 255 in UTF-8 mode. */ |
1228 |
|
|
1229 |
|
#ifdef SUPPORT_UTF8 |
1230 |
|
if (c > 255) |
1231 |
|
{ |
1232 |
|
*errorptr = ERR33; |
1233 |
|
goto FAILED; |
1234 |
|
} |
1235 |
|
#endif |
1236 |
} |
} |
1237 |
|
|
1238 |
/* A single character may be followed by '-' to form a range. However, |
/* A single character may be followed by '-' to form a range. However, |
1252 |
} |
} |
1253 |
|
|
1254 |
/* The second part of a range can be a single-character escape, but |
/* The second part of a range can be a single-character escape, but |
1255 |
not any of the other escapes. */ |
not any of the other escapes. Perl 5.6 treats a hyphen as a literal |
1256 |
|
in such circumstances. */ |
1257 |
|
|
1258 |
if (d == '\\') |
if (d == '\\') |
1259 |
{ |
{ |
1260 |
|
const uschar *oldptr = ptr; |
1261 |
d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd); |
d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd); |
1262 |
|
|
1263 |
|
#ifdef SUPPORT_UTF8 |
1264 |
|
if (d > 255) |
1265 |
|
{ |
1266 |
|
*errorptr = ERR33; |
1267 |
|
goto FAILED; |
1268 |
|
} |
1269 |
|
#endif |
1270 |
|
/* \b is backspace; any other special means the '-' was literal */ |
1271 |
|
|
1272 |
if (d < 0) |
if (d < 0) |
1273 |
{ |
{ |
1274 |
if (d == -ESC_b) d = '\b'; else |
if (d == -ESC_b) d = '\b'; else |
1275 |
{ |
{ |
1276 |
*errorptr = ERR7; |
ptr = oldptr - 2; |
1277 |
goto FAILED; |
goto SINGLE_CHARACTER; /* A few lines below */ |
1278 |
} |
} |
1279 |
} |
} |
1280 |
} |
} |
1302 |
/* Handle a lone single character - we can get here for a normal |
/* Handle a lone single character - we can get here for a normal |
1303 |
non-escape char, or after \ that introduces a single character. */ |
non-escape char, or after \ that introduces a single character. */ |
1304 |
|
|
1305 |
|
SINGLE_CHARACTER: |
1306 |
|
|
1307 |
class [c/8] |= (1 << (c&7)); |
class [c/8] |= (1 << (c&7)); |
1308 |
if ((options & PCRE_CASELESS) != 0) |
if ((options & PCRE_CASELESS) != 0) |
1309 |
{ |
{ |
2021 |
tempptr = ptr; |
tempptr = ptr; |
2022 |
c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd); |
c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd); |
2023 |
if (c < 0) { ptr = tempptr; break; } |
if (c < 0) { ptr = tempptr; break; } |
2024 |
|
|
2025 |
|
/* If a character is > 127 in UTF-8 mode, we have to turn it into |
2026 |
|
two or more characters in the UTF-8 encoding. */ |
2027 |
|
|
2028 |
|
#ifdef SUPPORT_UTF8 |
2029 |
|
if (c > 127 && (options & PCRE_UTF8) != 0) |
2030 |
|
{ |
2031 |
|
uschar buffer[8]; |
2032 |
|
int len = ord2utf8(c, buffer); |
2033 |
|
for (c = 0; c < len; c++) *code++ = buffer[c]; |
2034 |
|
length += len; |
2035 |
|
continue; |
2036 |
|
} |
2037 |
|
#endif |
2038 |
} |
} |
2039 |
|
|
2040 |
/* Ordinary character or single-char escape */ |
/* Ordinary character or single-char escape */ |
2045 |
|
|
2046 |
/* This "while" is the end of the "do" above. */ |
/* This "while" is the end of the "do" above. */ |
2047 |
|
|
2048 |
while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); |
while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0); |
2049 |
|
|
2050 |
/* Update the last character and the count of literals */ |
/* Update the last character and the count of literals */ |
2051 |
|
|
2057 |
the next state. */ |
the next state. */ |
2058 |
|
|
2059 |
previous[1] = length; |
previous[1] = length; |
2060 |
if (length < 255) ptr--; |
if (length < MAXLIT) ptr--; |
2061 |
break; |
break; |
2062 |
} |
} |
2063 |
} /* end of big loop */ |
} /* end of big loop */ |
2195 |
if (lookbehind) |
if (lookbehind) |
2196 |
{ |
{ |
2197 |
*code = OP_END; |
*code = OP_END; |
2198 |
length = find_fixedlength(last_branch); |
length = find_fixedlength(last_branch, options); |
2199 |
DPRINTF(("fixed length = %d\n", length)); |
DPRINTF(("fixed length = %d\n", length)); |
2200 |
if (length < 0) |
if (length < 0) |
2201 |
{ |
{ |
2486 |
uschar *code_base, *code_end; |
uschar *code_base, *code_end; |
2487 |
#endif |
#endif |
2488 |
|
|
2489 |
|
/* Can't support UTF8 unless PCRE has been compiled to include the code. */ |
2490 |
|
|
2491 |
|
#ifndef SUPPORT_UTF8 |
2492 |
|
if ((options & PCRE_UTF8) != 0) |
2493 |
|
{ |
2494 |
|
*errorptr = ERR32; |
2495 |
|
return NULL; |
2496 |
|
} |
2497 |
|
#endif |
2498 |
|
|
2499 |
/* We can't pass back an error message if errorptr is NULL; I guess the best we |
/* We can't pass back an error message if errorptr is NULL; I guess the best we |
2500 |
can do is just return NULL. */ |
can do is just return NULL. */ |
2501 |
|
|
2991 |
&compile_block); |
&compile_block); |
2992 |
if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
if (*errorptr != NULL) goto PCRE_ERROR_RETURN; |
2993 |
if (c < 0) { ptr = saveptr; break; } |
if (c < 0) { ptr = saveptr; break; } |
2994 |
|
|
2995 |
|
#ifdef SUPPORT_UTF8 |
2996 |
|
if (c > 127 && (options & PCRE_UTF8) != 0) |
2997 |
|
{ |
2998 |
|
int i; |
2999 |
|
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++) |
3000 |
|
if (c <= utf8_table1[i]) break; |
3001 |
|
runlength += i; |
3002 |
|
} |
3003 |
|
#endif |
3004 |
} |
} |
3005 |
|
|
3006 |
/* Ordinary character or single-char escape */ |
/* Ordinary character or single-char escape */ |
3010 |
|
|
3011 |
/* This "while" is the end of the "do" above. */ |
/* This "while" is the end of the "do" above. */ |
3012 |
|
|
3013 |
while (runlength < 255 && |
while (runlength < MAXLIT && |
3014 |
(compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0); |
(compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0); |
3015 |
|
|
3016 |
ptr--; |
ptr--; |
3655 |
|
|
3656 |
/* Move the subject pointer back. This occurs only at the start of |
/* Move the subject pointer back. This occurs only at the start of |
3657 |
each branch of a lookbehind assertion. If we are too close to the start to |
each branch of a lookbehind assertion. If we are too close to the start to |
3658 |
move back, this match function fails. */ |
move back, this match function fails. When working with UTF-8 we move |
3659 |
|
back a number of characters, not bytes. */ |
3660 |
|
|
3661 |
case OP_REVERSE: |
case OP_REVERSE: |
3662 |
|
#ifdef SUPPORT_UTF8 |
3663 |
|
c = (ecode[1] << 8) + ecode[2]; |
3664 |
|
for (i = 0; i < c; i++) |
3665 |
|
{ |
3666 |
|
eptr--; |
3667 |
|
BACKCHAR(eptr) |
3668 |
|
} |
3669 |
|
#else |
3670 |
eptr -= (ecode[1] << 8) + ecode[2]; |
eptr -= (ecode[1] << 8) + ecode[2]; |
3671 |
|
#endif |
3672 |
|
|
3673 |
if (eptr < md->start_subject) return FALSE; |
if (eptr < md->start_subject) return FALSE; |
3674 |
ecode += 3; |
ecode += 3; |
3675 |
break; |
break; |
3989 |
if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n') |
if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n') |
3990 |
return FALSE; |
return FALSE; |
3991 |
if (eptr++ >= md->end_subject) return FALSE; |
if (eptr++ >= md->end_subject) return FALSE; |
3992 |
|
#ifdef SUPPORT_UTF8 |
3993 |
|
if (md->utf8) |
3994 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
3995 |
|
#endif |
3996 |
ecode++; |
ecode++; |
3997 |
break; |
break; |
3998 |
|
|
4194 |
for (i = 1; i <= min; i++) |
for (i = 1; i <= min; i++) |
4195 |
{ |
{ |
4196 |
if (eptr >= md->end_subject) return FALSE; |
if (eptr >= md->end_subject) return FALSE; |
4197 |
c = *eptr++; |
GETCHARINC(c, eptr) /* Get character; increment eptr */ |
4198 |
|
|
4199 |
|
#ifdef SUPPORT_UTF8 |
4200 |
|
/* We do not yet support class members > 255 */ |
4201 |
|
if (c > 255) return FALSE; |
4202 |
|
#endif |
4203 |
|
|
4204 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
4205 |
return FALSE; |
return FALSE; |
4206 |
} |
} |
4220 |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) |
4221 |
return TRUE; |
return TRUE; |
4222 |
if (i >= max || eptr >= md->end_subject) return FALSE; |
if (i >= max || eptr >= md->end_subject) return FALSE; |
4223 |
c = *eptr++; |
GETCHARINC(c, eptr) /* Get character; increment eptr */ |
4224 |
|
|
4225 |
|
#ifdef SUPPORT_UTF8 |
4226 |
|
/* We do not yet support class members > 255 */ |
4227 |
|
if (c > 255) return FALSE; |
4228 |
|
#endif |
4229 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
4230 |
return FALSE; |
return FALSE; |
4231 |
} |
} |
4237 |
else |
else |
4238 |
{ |
{ |
4239 |
const uschar *pp = eptr; |
const uschar *pp = eptr; |
4240 |
for (i = min; i < max; eptr++, i++) |
int len = 1; |
4241 |
|
for (i = min; i < max; i++) |
4242 |
{ |
{ |
4243 |
if (eptr >= md->end_subject) break; |
if (eptr >= md->end_subject) break; |
4244 |
c = *eptr; |
GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */ |
4245 |
if ((data[c/8] & (1 << (c&7))) != 0) continue; |
|
4246 |
break; |
#ifdef SUPPORT_UTF8 |
4247 |
|
/* We do not yet support class members > 255 */ |
4248 |
|
if (c > 255) break; |
4249 |
|
#endif |
4250 |
|
if ((data[c/8] & (1 << (c&7))) == 0) break; |
4251 |
|
eptr += len; |
4252 |
} |
} |
4253 |
|
|
4254 |
while (eptr >= pp) |
while (eptr >= pp) |
4255 |
|
{ |
4256 |
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
4257 |
return TRUE; |
return TRUE; |
4258 |
|
|
4259 |
|
#ifdef SUPPORT_UTF8 |
4260 |
|
BACKCHAR(eptr) |
4261 |
|
#endif |
4262 |
|
} |
4263 |
return FALSE; |
return FALSE; |
4264 |
} |
} |
4265 |
} |
} |
4579 |
|
|
4580 |
/* First, ensure the minimum number of matches are present. Use inline |
/* First, ensure the minimum number of matches are present. Use inline |
4581 |
code for maximizing the speed, and do the type test once at the start |
code for maximizing the speed, and do the type test once at the start |
4582 |
(i.e. keep it out of the loop). Also test that there are at least the |
(i.e. keep it out of the loop). Also we can test that there are at least |
4583 |
minimum number of characters before we start. */ |
the minimum number of bytes before we start, except when doing '.' in |
4584 |
|
UTF8 mode. Leave the test in in all cases; in the special case we have |
4585 |
|
to test after each character. */ |
4586 |
|
|
4587 |
if (min > md->end_subject - eptr) return FALSE; |
if (min > md->end_subject - eptr) return FALSE; |
4588 |
if (min > 0) switch(ctype) |
if (min > 0) switch(ctype) |
4589 |
{ |
{ |
4590 |
case OP_ANY: |
case OP_ANY: |
4591 |
|
#ifdef SUPPORT_UTF8 |
4592 |
|
if (md->utf8) |
4593 |
|
{ |
4594 |
|
for (i = 1; i <= min; i++) |
4595 |
|
{ |
4596 |
|
if (eptr >= md->end_subject || |
4597 |
|
(*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0)) |
4598 |
|
return FALSE; |
4599 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
4600 |
|
} |
4601 |
|
break; |
4602 |
|
} |
4603 |
|
#endif |
4604 |
|
/* Non-UTF8 can be faster */ |
4605 |
if ((ims & PCRE_DOTALL) == 0) |
if ((ims & PCRE_DOTALL) == 0) |
4606 |
{ for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; } |
{ for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; } |
4607 |
else eptr += min; |
else eptr += min; |
4659 |
{ |
{ |
4660 |
case OP_ANY: |
case OP_ANY: |
4661 |
if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE; |
if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE; |
4662 |
|
#ifdef SUPPORT_UTF8 |
4663 |
|
if (md->utf8) |
4664 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
4665 |
|
#endif |
4666 |
break; |
break; |
4667 |
|
|
4668 |
case OP_NOT_DIGIT: |
case OP_NOT_DIGIT: |
4702 |
switch(ctype) |
switch(ctype) |
4703 |
{ |
{ |
4704 |
case OP_ANY: |
case OP_ANY: |
4705 |
|
|
4706 |
|
/* Special code is required for UTF8, but when the maximum is unlimited |
4707 |
|
we don't need it. */ |
4708 |
|
|
4709 |
|
#ifdef SUPPORT_UTF8 |
4710 |
|
if (md->utf8 && max < INT_MAX) |
4711 |
|
{ |
4712 |
|
if ((ims & PCRE_DOTALL) == 0) |
4713 |
|
{ |
4714 |
|
for (i = min; i < max; i++) |
4715 |
|
{ |
4716 |
|
if (eptr >= md->end_subject || *eptr++ == '\n') break; |
4717 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
4718 |
|
} |
4719 |
|
} |
4720 |
|
else |
4721 |
|
{ |
4722 |
|
for (i = min; i < max; i++) |
4723 |
|
{ |
4724 |
|
eptr++; |
4725 |
|
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; |
4726 |
|
} |
4727 |
|
} |
4728 |
|
break; |
4729 |
|
} |
4730 |
|
#endif |
4731 |
|
/* Non-UTF8 can be faster */ |
4732 |
if ((ims & PCRE_DOTALL) == 0) |
if ((ims & PCRE_DOTALL) == 0) |
4733 |
{ |
{ |
4734 |
for (i = min; i < max; i++) |
for (i = min; i < max; i++) |
4801 |
} |
} |
4802 |
|
|
4803 |
while (eptr >= pp) |
while (eptr >= pp) |
4804 |
|
{ |
4805 |
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0)) |
4806 |
return TRUE; |
return TRUE; |
4807 |
|
#ifdef SUPPORT_UTF8 |
4808 |
|
if (md->utf8) |
4809 |
|
while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--; |
4810 |
|
#endif |
4811 |
|
} |
4812 |
return FALSE; |
return FALSE; |
4813 |
} |
} |
4814 |
/* Control never gets here */ |
/* Control never gets here */ |
4889 |
end_subject = match_block.end_subject; |
end_subject = match_block.end_subject; |
4890 |
|
|
4891 |
match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; |
match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; |
4892 |
|
match_block.utf8 = (re->options & PCRE_UTF8) != 0; |
4893 |
|
|
4894 |
match_block.notbol = (options & PCRE_NOTBOL) != 0; |
match_block.notbol = (options & PCRE_NOTBOL) != 0; |
4895 |
match_block.noteol = (options & PCRE_NOTEOL) != 0; |
match_block.noteol = (options & PCRE_NOTEOL) != 0; |