393 |
"internal error: previously-checked referenced subpattern not found\0" |
"internal error: previously-checked referenced subpattern not found\0" |
394 |
"DEFINE group contains more than one branch\0" |
"DEFINE group contains more than one branch\0" |
395 |
/* 55 */ |
/* 55 */ |
396 |
"repeating a DEFINE group is not allowed\0" |
"repeating a DEFINE group is not allowed\0" /** DEAD **/ |
397 |
"inconsistent NEWLINE options\0" |
"inconsistent NEWLINE options\0" |
398 |
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" |
"\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" |
399 |
"a numbered reference must not be zero\0" |
"a numbered reference must not be zero\0" |
409 |
"(*MARK) must have an argument\0" |
"(*MARK) must have an argument\0" |
410 |
"this version of PCRE is not compiled with PCRE_UCP support\0" |
"this version of PCRE is not compiled with PCRE_UCP support\0" |
411 |
"\\c must be followed by an ASCII character\0" |
"\\c must be followed by an ASCII character\0" |
412 |
|
"\\k is not followed by a braced, angle-bracketed, or quoted name\0" |
413 |
; |
; |
414 |
|
|
415 |
/* Table to identify digits and hex digits. This is used when compiling |
/* Table to identify digits and hex digits. This is used when compiling |
546 |
/* Definition to allow mutual recursion */ |
/* Definition to allow mutual recursion */ |
547 |
|
|
548 |
static BOOL |
static BOOL |
549 |
compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int *, |
compile_regex(int, uschar **, const uschar **, int *, BOOL, BOOL, int, int, |
550 |
int *, branch_chain *, compile_data *, int *); |
int *, int *, branch_chain *, compile_data *, int *); |
551 |
|
|
552 |
|
|
553 |
|
|
578 |
|
|
579 |
|
|
580 |
/************************************************* |
/************************************************* |
581 |
|
* Check for counted repeat * |
582 |
|
*************************************************/ |
583 |
|
|
584 |
|
/* This function is called when a '{' is encountered in a place where it might |
585 |
|
start a quantifier. It looks ahead to see if it really is a quantifier or not. |
586 |
|
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} |
587 |
|
where the ddds are digits. |
588 |
|
|
589 |
|
Arguments: |
590 |
|
p pointer to the first char after '{' |
591 |
|
|
592 |
|
Returns: TRUE or FALSE |
593 |
|
*/ |
594 |
|
|
595 |
|
static BOOL |
596 |

is_counted_repeat(const uschar *p) /* TRUE only if the text at p (just past '{') completes {ddd}, {ddd,} or {ddd,ddd} */ |
597 |

{ |
598 |

if ((digitab[*p++] & ctype_digit) == 0) return FALSE; /* first character after '{' must be a digit */ |
599 |

while ((digitab[*p] & ctype_digit) != 0) p++; /* skip the remaining digits of the first number */ |
600 |

if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; /* form {ddd} */ |
601 |


602 |

if (*p++ != CHAR_COMMA) return FALSE; /* anything other than ',' here is not a quantifier */ |
603 |

if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; /* form {ddd,} */ |
604 |


605 |

if ((digitab[*p++] & ctype_digit) == 0) return FALSE; /* second number must start with a digit */ |
606 |

while ((digitab[*p] & ctype_digit) != 0) p++; /* skip the remaining digits of the second number */ |
607 |


608 |

return (*p == CHAR_RIGHT_CURLY_BRACKET); /* form {ddd,ddd} only if '}' follows immediately */ |
609 |

} |
610 |
|
|
611 |
|
|
612 |
|
|
613 |
|
/************************************************* |
614 |
* Handle escapes * |
* Handle escapes * |
615 |
*************************************************/ |
*************************************************/ |
616 |
|
|
681 |
*errorcodeptr = ERR37; |
*errorcodeptr = ERR37; |
682 |
break; |
break; |
683 |
|
|
684 |
/* \g must be followed by one of a number of specific things: |
/* In a character class, \g is just a literal "g". Outside a character |
685 |
|
class, \g must be followed by one of a number of specific things: |
686 |
|
|
687 |
(1) A number, either plain or braced. If positive, it is an absolute |
(1) A number, either plain or braced. If positive, it is an absolute |
688 |
backreference. If negative, it is a relative backreference. This is a Perl |
backreference. If negative, it is a relative backreference. This is a Perl |
699 |
the -ESC_g code (cf \k). */ |
the -ESC_g code (cf \k). */ |
700 |
|
|
701 |
case CHAR_g: |
case CHAR_g: |
702 |
|
if (isclass) break; |
703 |
if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) |
if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) |
704 |
{ |
{ |
705 |
c = -ESC_g; |
c = -ESC_g; |
921 |
} |
} |
922 |
|
|
923 |
/* Perl supports \N{name} for character names, as well as plain \N for "not |
/* Perl supports \N{name} for character names, as well as plain \N for "not |
924 |
newline". PCRE does not support \N{name}. */ |
newline". PCRE does not support \N{name}. However, it does support |
925 |
|
quantification such as \N{2,3}. */ |
926 |
|
|
927 |
if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET && |
928 |
|
!is_counted_repeat(ptr+2)) |
929 |
*errorcodeptr = ERR37; |
*errorcodeptr = ERR37; |
930 |
|
|
931 |
/* If PCRE_UCP is set, we change the values for \d etc. */ |
/* If PCRE_UCP is set, we change the values for \d etc. */ |
1035 |
|
|
1036 |
|
|
1037 |
/************************************************* |
/************************************************* |
|
* Check for counted repeat * |
|
|
*************************************************/ |
|
|
|
|
|
/* This function is called when a '{' is encountered in a place where it might |
|
|
start a quantifier. It looks ahead to see if it really is a quantifier or not. |
|
|
It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd} |
|
|
where the ddds are digits. |
|
|
|
|
|
Arguments: |
|
|
p pointer to the first char after '{' |
|
|
|
|
|
Returns: TRUE or FALSE |
|
|
*/ |
|
|
|
|
|
static BOOL |


is_counted_repeat(const uschar *p) /* TRUE only if the text at p (just past '{') completes {ddd}, {ddd,} or {ddd,ddd} */ |


{ |


if ((digitab[*p++] & ctype_digit) == 0) return FALSE; /* first character after '{' must be a digit */ |


while ((digitab[*p] & ctype_digit) != 0) p++; /* skip the remaining digits of the first number */ |


if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; /* form {ddd} */ |





if (*p++ != CHAR_COMMA) return FALSE; /* anything other than ',' here is not a quantifier */ |


if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; /* form {ddd,} */ |





if ((digitab[*p++] & ctype_digit) == 0) return FALSE; /* second number must start with a digit */ |


while ((digitab[*p] & ctype_digit) != 0) p++; /* skip the remaining digits of the second number */ |





return (*p == CHAR_RIGHT_CURLY_BRACKET); /* form {ddd,ddd} only if '}' follows immediately */ |


} |
|
|
|
|
|
|
|
|
|
|
|
/************************************************* |
|
1038 |
* Read repeat counts * |
* Read repeat counts * |
1039 |
*************************************************/ |
*************************************************/ |
1040 |
|
|
1980 |
} |
} |
1981 |
|
|
1982 |
/* For a recursion/subroutine call, if its end has been reached, which |
/* For a recursion/subroutine call, if its end has been reached, which |
1983 |
implies a subroutine call, we can scan it. */ |
implies a backward reference subroutine call, we can scan it. If it's a |
1984 |
|
forward reference subroutine call, we can't. To detect a forward reference |
1985 |
|
we have to scan up the list that is kept in the workspace. This function is |
1986 |
|
called only when doing the real compile, not during the pre-compile that |
1987 |
|
measures the size of the compiled pattern. */ |
1988 |
|
|
1989 |
if (c == OP_RECURSE) |
if (c == OP_RECURSE) |
1990 |
{ |
{ |
1991 |
BOOL empty_branch = FALSE; |
const uschar *scode; |
1992 |
const uschar *scode = cd->start_code + GET(code, 1); |
BOOL empty_branch; |
1993 |
|
|
1994 |
|
/* Test for forward reference */ |
1995 |
|
|
1996 |
|
for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE) |
1997 |
|
if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE; |
1998 |
|
|
1999 |
|
/* Not a forward reference, test for completed backward reference */ |
2000 |
|
|
2001 |
|
empty_branch = FALSE; |
2002 |
|
scode = cd->start_code + GET(code, 1); |
2003 |
if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ |
if (GET(scode, 1) == 0) return TRUE; /* Unclosed */ |
2004 |
|
|
2005 |
|
/* Completed backwards reference */ |
2006 |
|
|
2007 |
do |
do |
2008 |
{ |
{ |
2009 |
if (could_be_empty_branch(scode, endcode, utf8, cd)) |
if (could_be_empty_branch(scode, endcode, utf8, cd)) |
2014 |
scode += GET(scode, 1); |
scode += GET(scode, 1); |
2015 |
} |
} |
2016 |
while (*scode == OP_ALT); |
while (*scode == OP_ALT); |
2017 |
|
|
2018 |
if (!empty_branch) return FALSE; /* All branches are non-empty */ |
if (!empty_branch) return FALSE; /* All branches are non-empty */ |
2019 |
continue; |
continue; |
2020 |
} |
} |
2240 |
the current branch of the current pattern to see if it could match the empty |
the current branch of the current pattern to see if it could match the empty |
2241 |
string. If it could, we must look outwards for branches at other levels, |
string. If it could, we must look outwards for branches at other levels, |
2242 |
stopping when we pass beyond the bracket which is the subject of the recursion. |
stopping when we pass beyond the bracket which is the subject of the recursion. |
2243 |
|
This function is called only during the real compile, not during the |
2244 |
|
pre-compile. |
2245 |
|
|
2246 |
Arguments: |
Arguments: |
2247 |
code points to start of the recursion |
code points to start of the recursion |
2292 |
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, |
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does, |
2293 |
I think. |
I think. |
2294 |
|
|
2295 |
|
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not. |
2296 |
|
It seems that the appearance of a nested POSIX class supersedes an apparent |
2297 |
|
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or |
2298 |
|
a digit. |
2299 |
|
|
2300 |
|
In Perl, unescaped square brackets may also appear as part of class names. For |
2301 |
|
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for |
2302 |
|
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not |
2303 |
|
seem right at all. PCRE does not allow closing square brackets in POSIX class |
2304 |
|
names. |
2305 |
|
|
2306 |
Arguments: |
Arguments: |
2307 |
ptr pointer to the initial [ |
ptr pointer to the initial [ |
2308 |
endptr where to return the end pointer |
endptr where to return the end pointer |
2317 |
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ |
terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ |
2318 |
for (++ptr; *ptr != 0; ptr++) |
for (++ptr; *ptr != 0; ptr++) |
2319 |
{ |
{ |
2320 |
if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else |
if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
2321 |
|
ptr++; |
2322 |
|
else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; |
2323 |
|
else |
2324 |
{ |
{ |
|
if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; |
|
2325 |
if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
2326 |
{ |
{ |
2327 |
*endptr = ptr; |
*endptr = ptr; |
2328 |
return TRUE; |
return TRUE; |
2329 |
} |
} |
2330 |
|
if (*ptr == CHAR_LEFT_SQUARE_BRACKET && |
2331 |
|
(ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || |
2332 |
|
ptr[1] == CHAR_EQUALS_SIGN) && |
2333 |
|
check_posix_syntax(ptr, endptr)) |
2334 |
|
return FALSE; |
2335 |
} |
} |
2336 |
} |
} |
2337 |
return FALSE; |
return FALSE; |
3041 |
firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) |
firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE) |
3042 |
reqbyteptr set to the last literal character required, else < 0 |
reqbyteptr set to the last literal character required, else < 0 |
3043 |
bcptr points to current branch chain |
bcptr points to current branch chain |
3044 |
|
cond_depth conditional nesting depth |
3045 |
cd contains pointers to tables etc. |
cd contains pointers to tables etc. |
3046 |
lengthptr NULL during the real compile phase |
lengthptr NULL during the real compile phase |
3047 |
points to length accumulator during pre-compile phase |
points to length accumulator during pre-compile phase |
3053 |
static BOOL |
static BOOL |
3054 |
compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, |
compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, |
3055 |
int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, |
int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, |
3056 |
compile_data *cd, int *lengthptr) |
int cond_depth, compile_data *cd, int *lengthptr) |
3057 |
{ |
{ |
3058 |
int repeat_type, op_type; |
int repeat_type, op_type; |
3059 |
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ |
int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ |
3062 |
int firstbyte, reqbyte; |
int firstbyte, reqbyte; |
3063 |
int zeroreqbyte, zerofirstbyte; |
int zeroreqbyte, zerofirstbyte; |
3064 |
int req_caseopt, reqvary, tempreqvary; |
int req_caseopt, reqvary, tempreqvary; |
3065 |
int options = *optionsptr; |
int options = *optionsptr; /* May change dynamically */ |
3066 |
int after_manual_callout = 0; |
int after_manual_callout = 0; |
3067 |
int length_prevgroup = 0; |
int length_prevgroup = 0; |
3068 |
register int c; |
register int c; |
3080 |
uschar *save_hwm = NULL; |
uschar *save_hwm = NULL; |
3081 |
uschar classbits[32]; |
uschar classbits[32]; |
3082 |
|
|
3083 |
|
/* We can fish out the UTF-8 setting once and for all into a BOOL, but we |
3084 |
|
must not do this for other options (e.g. PCRE_EXTENDED) because they may change |
3085 |
|
dynamically as we process the pattern. */ |
3086 |
|
|
3087 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
3088 |
BOOL class_utf8; |
BOOL class_utf8; |
3089 |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
3264 |
previous_callout = NULL; |
previous_callout = NULL; |
3265 |
} |
} |
3266 |
|
|
3267 |
/* In extended mode, skip white space and comments */ |
/* In extended mode, skip white space and comments. */ |
3268 |
|
|
3269 |
if ((options & PCRE_EXTENDED) != 0) |
if ((options & PCRE_EXTENDED) != 0) |
3270 |
{ |
{ |
4233 |
op_type = 0; /* Default single-char op codes */ |
op_type = 0; /* Default single-char op codes */ |
4234 |
possessive_quantifier = FALSE; /* Default not possessive quantifier */ |
possessive_quantifier = FALSE; /* Default not possessive quantifier */ |
4235 |
|
|
4236 |
/* Save start of previous item, in case we have to move it up to make space |
/* Save start of previous item, in case we have to move it up in order to |
4237 |
for an inserted OP_ONCE for the additional '+' extension. */ |
insert something before it. */ |
4238 |
|
|
4239 |
tempcode = previous; |
tempcode = previous; |
4240 |
|
|
4257 |
} |
} |
4258 |
else repeat_type = greedy_default; |
else repeat_type = greedy_default; |
4259 |
|
|
4260 |
|
/* If previous was a recursion call, wrap it in atomic brackets so that |
4261 |
|
previous becomes the atomic group. All recursions were so wrapped in the |
4262 |
|
past, but it no longer happens for non-repeated recursions. In fact, the |
4263 |
|
repeated ones could be re-implemented independently so as not to need this, |
4264 |
|
but for the moment we rely on the code for repeating groups. */ |
4265 |
|
|
4266 |
|
if (*previous == OP_RECURSE) |
4267 |
|
{ |
4268 |
|
memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE); |
4269 |
|
*previous = OP_ONCE; |
4270 |
|
PUT(previous, 1, 2 + 2*LINK_SIZE); |
4271 |
|
previous[2 + 2*LINK_SIZE] = OP_KET; |
4272 |
|
PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE); |
4273 |
|
code += 2 + 2 * LINK_SIZE; |
4274 |
|
length_prevgroup = 3 + 3*LINK_SIZE; |
4275 |
|
|
4276 |
|
/* When actually compiling, we need to check whether this was a forward |
4277 |
|
reference, and if so, adjust the offset. */ |
4278 |
|
|
4279 |
|
if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE) |
4280 |
|
{ |
4281 |
|
int offset = GET(cd->hwm, -LINK_SIZE); |
4282 |
|
if (offset == previous + 1 - cd->start_code) |
4283 |
|
PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE); |
4284 |
|
} |
4285 |
|
} |
4286 |
|
|
4287 |
|
/* Now handle repetition for the different types of item. */ |
4288 |
|
|
4289 |
/* If previous was a character match, abolish the item and generate a |
/* If previous was a character match, abolish the item and generate a |
4290 |
repeat item instead. If a char item has a minimum of more than one, ensure |
repeat item instead. If a char item has a minimum of more than one, ensure |
4291 |
that it is set in reqbyte - it might not be if a sequence such as x{3} is |
that it is set in reqbyte - it might not be if a sequence such as x{3} is |
4577 |
} |
} |
4578 |
|
|
4579 |
/* If previous was a bracket group, we may have to replicate it in certain |
/* If previous was a bracket group, we may have to replicate it in certain |
4580 |
cases. Note that at this point we can encounter only the "basic" BRA and |
cases. Note that at this point we can encounter only the "basic" bracket |
4581 |
KET opcodes, as this is the place where they get converted into the more |
opcodes such as BRA and CBRA, as this is the place where they get converted |
4582 |
special varieties. */ |
into the more special varieties such as BRAPOS and SBRA. A test for >= |
4583 |
|
OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK, |
4584 |
|
ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow |
4585 |
|
repetition of assertions, but now it does, for Perl compatibility. */ |
4586 |
|
|
4587 |
else if (*previous == OP_BRA || *previous == OP_CBRA || |
else if (*previous >= OP_ASSERT && *previous <= OP_COND) |
|
*previous == OP_ONCE || *previous == OP_COND) |
|
4588 |
{ |
{ |
4589 |
register int i; |
register int i; |
4590 |
int len = (int)(code - previous); |
int len = (int)(code - previous); |
4591 |
uschar *bralink = NULL; |
uschar *bralink = NULL; |
4592 |
uschar *brazeroptr = NULL; |
uschar *brazeroptr = NULL; |
4593 |
|
|
4594 |
/* Repeating a DEFINE group is pointless */ |
/* Repeating a DEFINE group is pointless, but Perl allows the syntax, so |
4595 |
|
we just ignore the repeat. */ |
4596 |
|
|
4597 |
if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) |
if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) |
4598 |
|
goto END_REPEAT; |
4599 |
|
|
4600 |
|
/* There is no sense in actually repeating assertions. The only potential |
4601 |
|
use of repetition is in cases when the assertion is optional. Therefore, |
4602 |
|
if the minimum is greater than zero, just ignore the repeat. If the |
4603 |
|
maximum is not zero or one, set it to 1. */ |
4604 |
|
|
4605 |
|
if (*previous < OP_ONCE) /* Assertion */ |
4606 |
{ |
{ |
4607 |
*errorcodeptr = ERR55; |
if (repeat_min > 0) goto END_REPEAT; |
4608 |
goto FAILED; |
if (repeat_max < 0 || repeat_max > 1) repeat_max = 1; |
4609 |
} |
} |
4610 |
|
|
4611 |
/* The case of a zero minimum is special because of the need to stick |
/* The case of a zero minimum is special because of the need to stick |
4626 |
** goto END_REPEAT; |
** goto END_REPEAT; |
4627 |
** } |
** } |
4628 |
|
|
4629 |
However, that fails when a group is referenced as a subroutine from |
However, that fails when a group or a subgroup within it is referenced |
4630 |
elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it |
as a subroutine from elsewhere in the pattern, so now we stick in |
4631 |
so that it is skipped on execution. As we don't have a list of which |
OP_SKIPZERO in front of it so that it is skipped on execution. As we |
4632 |
groups are referenced, we cannot do this selectively. |
don't have a list of which groups are referenced, we cannot do this |
4633 |
|
selectively. |
4634 |
|
|
4635 |
If the maximum is 1 or unlimited, we just have to stick in the BRAZERO |
If the maximum is 1 or unlimited, we just have to stick in the BRAZERO |
4636 |
and do no more at this point. However, we do need to adjust any |
and do no more at this point. However, we do need to adjust any |
4816 |
} |
} |
4817 |
|
|
4818 |
/* If the maximum is unlimited, set a repeater in the final copy. For |
/* If the maximum is unlimited, set a repeater in the final copy. For |
4819 |
ONCE brackets, that's all we need to do. |
ONCE brackets, that's all we need to do. However, possessively repeated |
4820 |
|
ONCE brackets can be converted into non-capturing brackets, as the |
4821 |
(To be done next, after recursion adjusted) |
behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to |
|
However, possessively repeated |
|
|
ONCE brackets can be converted into non-capturing brackets, as the |
|
|
behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to |
|
4822 |
deal with possessive ONCEs specially. |
deal with possessive ONCEs specially. |
|
(....) |
|
4823 |
|
|
4824 |
Otherwise, if the quantifier was possessive, we convert the BRA code to |
Otherwise, if the quantifier was possessive, we convert the BRA code to |
4825 |
the POS form, and the KET code to KETRPOS. (It turns out to be convenient |
the POS form, and the KET code to KETRPOS. (It turns out to be convenient |
4841 |
uschar *ketcode = code - 1 - LINK_SIZE; |
uschar *ketcode = code - 1 - LINK_SIZE; |
4842 |
uschar *bracode = ketcode - GET(ketcode, 1); |
uschar *bracode = ketcode - GET(ketcode, 1); |
4843 |
|
|
4844 |
/**** |
if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA; |
4845 |
if (*bracode == OP_ONCE && possessive_quantifier) |
if (*bracode == OP_ONCE) |
|
*bracode = OP_BRA; |
|
|
****/ |
|
|
|
|
|
if (*bracode == OP_ONCE) |
|
4846 |
*ketcode = OP_KETRMAX + repeat_type; |
*ketcode = OP_KETRMAX + repeat_type; |
4847 |
else |
else |
4848 |
{ |
{ |
4893 |
there are special alternative opcodes for this case. For anything else, we |
there are special alternative opcodes for this case. For anything else, we |
4894 |
wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+' |
wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+' |
4895 |
notation is just syntactic sugar, taken from Sun's Java package, but the |
notation is just syntactic sugar, taken from Sun's Java package, but the |
4896 |
special opcodes can optimize it. |
special opcodes can optimize it. |
4897 |
|
|
4898 |
Possessively repeated subpatterns have already been handled in the code |
Possessively repeated subpatterns have already been handled in the code |
4899 |
just above, so possessive_quantifier is always FALSE for them at this |
just above, so possessive_quantifier is always FALSE for them at this |
4900 |
stage. |
stage. |
4901 |
|
|
4902 |
Note that the repeated item starts at tempcode, not at previous, which |
Note that the repeated item starts at tempcode, not at previous, which |
4903 |
might be the first part of a string whose (former) last char we repeated. |
might be the first part of a string whose (former) last char we repeated. |
4904 |
|
|
5004 |
while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; |
while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; |
5005 |
namelen = (int)(ptr - name); |
namelen = (int)(ptr - name); |
5006 |
|
|
5007 |
|
/* It appears that Perl allows any characters whatsoever, other than |
5008 |
|
a closing parenthesis, to appear in arguments, so we no longer insist on |
5009 |
|
letters, digits, and underscores. */ |
5010 |
|
|
5011 |
if (*ptr == CHAR_COLON) |
if (*ptr == CHAR_COLON) |
5012 |
{ |
{ |
5013 |
arg = ++ptr; |
arg = ++ptr; |
5014 |
while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0 |
while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; |
|
|| *ptr == '_') ptr++; |
|
5015 |
arglen = (int)(ptr - arg); |
arglen = (int)(ptr - arg); |
5016 |
} |
} |
5017 |
|
|
5028 |
if (namelen == verbs[i].len && |
if (namelen == verbs[i].len && |
5029 |
strncmp((char *)name, vn, namelen) == 0) |
strncmp((char *)name, vn, namelen) == 0) |
5030 |
{ |
{ |
5031 |
/* Check for open captures before ACCEPT and convert it to |
/* Check for open captures before ACCEPT and convert it to |
5032 |
ASSERT_ACCEPT if in an assertion. */ |
ASSERT_ACCEPT if in an assertion. */ |
5033 |
|
|
5034 |
if (verbs[i].op == OP_ACCEPT) |
if (verbs[i].op == OP_ACCEPT) |
5038 |
{ |
{ |
5039 |
*errorcodeptr = ERR59; |
*errorcodeptr = ERR59; |
5040 |
goto FAILED; |
goto FAILED; |
5041 |
} |
} |
5042 |
cd->had_accept = TRUE; |
cd->had_accept = TRUE; |
5043 |
for (oc = cd->open_caps; oc != NULL; oc = oc->next) |
for (oc = cd->open_caps; oc != NULL; oc = oc->next) |
5044 |
{ |
{ |
5339 |
/* ------------------------------------------------------------ */ |
/* ------------------------------------------------------------ */ |
5340 |
case CHAR_EQUALS_SIGN: /* Positive lookahead */ |
case CHAR_EQUALS_SIGN: /* Positive lookahead */ |
5341 |
bravalue = OP_ASSERT; |
bravalue = OP_ASSERT; |
5342 |
cd->assert_depth += 1; |
cd->assert_depth += 1; |
5343 |
ptr++; |
ptr++; |
5344 |
break; |
break; |
5345 |
|
|
5354 |
continue; |
continue; |
5355 |
} |
} |
5356 |
bravalue = OP_ASSERT_NOT; |
bravalue = OP_ASSERT_NOT; |
5357 |
cd->assert_depth += 1; |
cd->assert_depth += 1; |
5358 |
break; |
break; |
5359 |
|
|
5360 |
|
|
5364 |
{ |
{ |
5365 |
case CHAR_EQUALS_SIGN: /* Positive lookbehind */ |
case CHAR_EQUALS_SIGN: /* Positive lookbehind */ |
5366 |
bravalue = OP_ASSERTBACK; |
bravalue = OP_ASSERTBACK; |
5367 |
cd->assert_depth += 1; |
cd->assert_depth += 1; |
5368 |
ptr += 2; |
ptr += 2; |
5369 |
break; |
break; |
5370 |
|
|
5371 |
case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ |
case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ |
5372 |
bravalue = OP_ASSERTBACK_NOT; |
bravalue = OP_ASSERTBACK_NOT; |
5373 |
cd->assert_depth += 1; |
cd->assert_depth += 1; |
5374 |
ptr += 2; |
ptr += 2; |
5375 |
break; |
break; |
5376 |
|
|
5761 |
|
|
5762 |
/* Fudge the value of "called" so that when it is inserted as an |
/* Fudge the value of "called" so that when it is inserted as an |
5763 |
offset below, what is actually inserted is the reference number |
offset below, what is actually inserted is the reference number |
5764 |
of the group. */ |
of the group. Then remember the forward reference. */ |
5765 |
|
|
5766 |
called = cd->start_code + recno; |
called = cd->start_code + recno; |
5767 |
PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code)); |
PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code)); |
5768 |
} |
} |
5769 |
|
|
5770 |
/* If not a forward reference, and the subpattern is still open, |
/* If not a forward reference, and the subpattern is still open, |
5771 |
this is a recursive call. We check to see if this is a left |
this is a recursive call. We check to see if this is a left |
5772 |
recursion that could loop for ever, and diagnose that case. */ |
recursion that could loop for ever, and diagnose that case. We |
5773 |
|
must not, however, do this check if we are in a conditional |
5774 |
|
subpattern because the condition might be testing for recursion in |
5775 |
|
a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid. |
5776 |
|
Forever loops are also detected at runtime, so those that occur in |
5777 |
|
conditional subpatterns will be picked up then. */ |
5778 |
|
|
5779 |
else if (GET(called, 1) == 0 && |
else if (GET(called, 1) == 0 && cond_depth <= 0 && |
5780 |
could_be_empty(called, code, bcptr, utf8, cd)) |
could_be_empty(called, code, bcptr, utf8, cd)) |
5781 |
{ |
{ |
5782 |
*errorcodeptr = ERR40; |
*errorcodeptr = ERR40; |
5784 |
} |
} |
5785 |
} |
} |
5786 |
|
|
5787 |
/* Insert the recursion/subroutine item, automatically wrapped inside |
/* Insert the recursion/subroutine item. */ |
|
"once" brackets. Set up a "previous group" length so that a |
|
|
subsequent quantifier will work. */ |
|
|
|
|
|
*code = OP_ONCE; |
|
|
PUT(code, 1, 2 + 2*LINK_SIZE); |
|
|
code += 1 + LINK_SIZE; |
|
5788 |
|
|
5789 |
*code = OP_RECURSE; |
*code = OP_RECURSE; |
5790 |
PUT(code, 1, (int)(called - cd->start_code)); |
PUT(code, 1, (int)(called - cd->start_code)); |
5791 |
code += 1 + LINK_SIZE; |
code += 1 + LINK_SIZE; |
|
|
|
|
*code = OP_KET; |
|
|
PUT(code, 1, 2 + 2*LINK_SIZE); |
|
|
code += 1 + LINK_SIZE; |
|
|
|
|
|
length_prevgroup = 3 + 3*LINK_SIZE; |
|
5792 |
} |
} |
5793 |
|
|
5794 |
/* Can't determine a first byte now */ |
/* Can't determine a first byte now */ |
5903 |
skipbytes = 2; |
skipbytes = 2; |
5904 |
} |
} |
5905 |
|
|
5906 |
/* Process nested bracketed regex. Assertions may not be repeated, but |
/* Process nested bracketed regex. Assertions used not to be repeatable, |
5907 |
other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a |
but this was changed for Perl compatibility, so all kinds can now be |
5908 |
non-register variable (tempcode) in order to be able to pass its address |
repeated. We copy code into a non-register variable (tempcode) in order to |
5909 |
because some compilers complain otherwise. */ |
be able to pass its address because some compilers complain otherwise. */ |
5910 |
|
|
5911 |
previous = (bravalue >= OP_ONCE)? code : NULL; |
previous = code; /* For handling repetition */ |
5912 |
*code = bravalue; |
*code = bravalue; |
5913 |
tempcode = code; |
tempcode = code; |
5914 |
tempreqvary = cd->req_varyopt; /* Save value before bracket */ |
tempreqvary = cd->req_varyopt; /* Save value before bracket */ |
5915 |
length_prevgroup = 0; /* Initialize for pre-compile phase */ |
length_prevgroup = 0; /* Initialize for pre-compile phase */ |
5916 |
|
|
5917 |
if (!compile_regex( |
if (!compile_regex( |
5918 |
newoptions, /* The complete new option state */ |
newoptions, /* The complete new option state */ |
5919 |
&tempcode, /* Where to put code (updated) */ |
&tempcode, /* Where to put code (updated) */ |
5920 |
&ptr, /* Input pointer (updated) */ |
&ptr, /* Input pointer (updated) */ |
5921 |
errorcodeptr, /* Where to put an error message */ |
errorcodeptr, /* Where to put an error message */ |
5922 |
(bravalue == OP_ASSERTBACK || |
(bravalue == OP_ASSERTBACK || |
5923 |
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ |
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */ |
5924 |
reset_bracount, /* True if (?| group */ |
reset_bracount, /* True if (?| group */ |
5925 |
skipbytes, /* Skip over bracket number */ |
skipbytes, /* Skip over bracket number */ |
5926 |
&subfirstbyte, /* For possible first char */ |
cond_depth + |
5927 |
&subreqbyte, /* For possible last char */ |
((bravalue == OP_COND)?1:0), /* Depth of condition subpatterns */ |
5928 |
bcptr, /* Current branch chain */ |
&subfirstbyte, /* For possible first char */ |
5929 |
cd, /* Tables block */ |
&subreqbyte, /* For possible last char */ |
5930 |
(lengthptr == NULL)? NULL : /* Actual compile phase */ |
bcptr, /* Current branch chain */ |
5931 |
&length_prevgroup /* Pre-compile phase */ |
cd, /* Tables block */ |
5932 |
|
(lengthptr == NULL)? NULL : /* Actual compile phase */ |
5933 |
|
&length_prevgroup /* Pre-compile phase */ |
5934 |
)) |
)) |
5935 |
goto FAILED; |
goto FAILED; |
5936 |
|
|
5937 |
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) |
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT) |
5938 |
cd->assert_depth -= 1; |
cd->assert_depth -= 1; |
5939 |
|
|
5940 |
/* At the end of compiling, code is still pointing to the start of the |
/* At the end of compiling, code is still pointing to the start of the |
5941 |
group, while tempcode has been updated to point past the end of the group |
group, while tempcode has been updated to point past the end of the group |
6175 |
} |
} |
6176 |
|
|
6177 |
/* \k<name> or \k'name' is a back reference by name (Perl syntax). |
/* \k<name> or \k'name' is a back reference by name (Perl syntax). |
6178 |
We also support \k{name} (.NET syntax) */ |
We also support \k{name} (.NET syntax). */ |
6179 |
|
|
6180 |
if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN || |
if (-c == ESC_k) |
|
ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET)) |
|
6181 |
{ |
{ |
6182 |
|
if ((ptr[1] != CHAR_LESS_THAN_SIGN && |
6183 |
|
ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET)) |
6184 |
|
{ |
6185 |
|
*errorcodeptr = ERR69; |
6186 |
|
break; |
6187 |
|
} |
6188 |
is_recurse = FALSE; |
is_recurse = FALSE; |
6189 |
terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? |
terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? |
6190 |
CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? |
CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? |
6388 |
lookbehind TRUE if this is a lookbehind assertion |
lookbehind TRUE if this is a lookbehind assertion |
6389 |
reset_bracount TRUE to reset the count for each branch |
reset_bracount TRUE to reset the count for each branch |
6390 |
skipbytes skip this many bytes at start (for brackets and OP_COND) |
skipbytes skip this many bytes at start (for brackets and OP_COND) |
6391 |
|
cond_depth depth of nesting for conditional subpatterns |
6392 |
firstbyteptr place to put the first required character, or a negative number |
firstbyteptr place to put the first required character, or a negative number |
6393 |
reqbyteptr place to put the last required character, or a negative number |
reqbyteptr place to put the last required character, or a negative number |
6394 |
bcptr pointer to the chain of currently open branches |
bcptr pointer to the chain of currently open branches |
6402 |
static BOOL |
static BOOL |
6403 |
compile_regex(int options, uschar **codeptr, const uschar **ptrptr, |
compile_regex(int options, uschar **codeptr, const uschar **ptrptr, |
6404 |
int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, |
int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, |
6405 |
int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, |
int cond_depth, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, |
6406 |
int *lengthptr) |
compile_data *cd, int *lengthptr) |
6407 |
{ |
{ |
6408 |
const uschar *ptr = *ptrptr; |
const uschar *ptr = *ptrptr; |
6409 |
uschar *code = *codeptr; |
uschar *code = *codeptr; |
6440 |
|
|
6441 |
/* If this is a capturing subpattern, add to the chain of open capturing items |
/* If this is a capturing subpattern, add to the chain of open capturing items |
6442 |
so that we can detect them if (*ACCEPT) is encountered. This is also used to |
so that we can detect them if (*ACCEPT) is encountered. This is also used to |
6443 |
detect groups that contain recursive back references to themselves. Note that |
detect groups that contain recursive back references to themselves. Note that |
6444 |
only OP_CBRA need be tested here; changing this opcode to one of its variants, |
only OP_CBRA need be tested here; changing this opcode to one of its variants, |
6445 |
e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */ |
e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */ |
6446 |
|
|
6447 |
if (*code == OP_CBRA) |
if (*code == OP_CBRA) |
6482 |
into the length. */ |
into the length. */ |
6483 |
|
|
6484 |
if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, |
if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, |
6485 |
&branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length)) |
&branchreqbyte, &bc, cond_depth, cd, |
6486 |
|
(lengthptr == NULL)? NULL : &length)) |
6487 |
{ |
{ |
6488 |
*ptrptr = ptr; |
*ptrptr = ptr; |
6489 |
return FALSE; |
return FALSE; |
6916 |
case OP_EXACT: |
case OP_EXACT: |
6917 |
scode += 2; |
scode += 2; |
6918 |
/* Fall through */ |
/* Fall through */ |
6919 |
|
|
6920 |
case OP_CHAR: |
case OP_CHAR: |
6921 |
case OP_PLUS: |
case OP_PLUS: |
6922 |
case OP_MINPLUS: |
case OP_MINPLUS: |
6929 |
case OP_EXACTI: |
case OP_EXACTI: |
6930 |
scode += 2; |
scode += 2; |
6931 |
/* Fall through */ |
/* Fall through */ |
6932 |
|
|
6933 |
case OP_CHARI: |
case OP_CHARI: |
6934 |
case OP_PLUSI: |
case OP_PLUSI: |
6935 |
case OP_MINPLUSI: |
case OP_MINPLUSI: |
7089 |
|
|
7090 |
/* Can't support UTF8 unless PCRE has been compiled to include the code. The |
/* Can't support UTF8 unless PCRE has been compiled to include the code. The |
7091 |
return of an error code from _pcre_valid_utf8() is a new feature, introduced in |
return of an error code from _pcre_valid_utf8() is a new feature, introduced in |
7092 |
release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is |
release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is |
7093 |
not used here. */ |
not used here. */ |
7094 |
|
|
7095 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
7119 |
|
|
7120 |
/* Check validity of \R options. */ |
/* Check validity of \R options. */ |
7121 |
|
|
7122 |
switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) |
if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == |
7123 |
|
(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) |
7124 |
{ |
{ |
7125 |
case 0: |
errorcode = ERR56; |
7126 |
case PCRE_BSR_ANYCRLF: |
goto PCRE_EARLY_ERROR_RETURN; |
|
case PCRE_BSR_UNICODE: |
|
|
break; |
|
|
default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; |
|
7127 |
} |
} |
7128 |
|
|
7129 |
/* Handle different types of newline. The three bits give seven cases. The |
/* Handle different types of newline. The three bits give seven cases. The |
7208 |
ptr += skipatstart; |
ptr += skipatstart; |
7209 |
code = cworkspace; |
code = cworkspace; |
7210 |
*code = OP_BRA; |
*code = OP_BRA; |
7211 |
(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE, |
(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE, |
7212 |
FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length); |
FALSE, 0, 0, &firstbyte, &reqbyte, NULL, cd, &length); |
7213 |
if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; |
if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; |
7214 |
|
|
7215 |
DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, |
DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, |
7282 |
ptr = (const uschar *)pattern + skipatstart; |
ptr = (const uschar *)pattern + skipatstart; |
7283 |
code = (uschar *)codestart; |
code = (uschar *)codestart; |
7284 |
*code = OP_BRA; |
*code = OP_BRA; |
7285 |
(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, |
(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0, |
7286 |
&firstbyte, &reqbyte, NULL, cd, NULL); |
&firstbyte, &reqbyte, NULL, cd, NULL); |
7287 |
re->top_bracket = cd->bracount; |
re->top_bracket = cd->bracount; |
7288 |
re->top_backref = cd->top_backref; |
re->top_backref = cd->top_backref; |