--- code/trunk/pcre_compile.c 2008/03/05 17:23:42 323 +++ code/trunk/pcre_compile.c 2008/04/19 16:41:04 341 @@ -158,7 +158,7 @@ "SKIP\0" "THEN"; -static verbitem verbs[] = { +static const verbitem verbs[] = { { 6, OP_ACCEPT }, { 6, OP_COMMIT }, { 1, OP_FAIL }, @@ -168,7 +168,7 @@ { 4, OP_THEN } }; -static int verbcount = sizeof(verbs)/sizeof(verbitem); +static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Tables of names of POSIX character classes and their lengths. The names are @@ -295,14 +295,15 @@ /* 55 */ "repeating a DEFINE group is not allowed\0" "inconsistent NEWLINE options\0" - "\\g is not followed by a braced name or an optionally braced non-zero number\0" - "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0" + "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" + "a numbered reference must not be zero\0" "(*VERB) with an argument is not supported\0" /* 60 */ "(*VERB) not recognized\0" "number is too big\0" "subpattern name expected\0" - "digit expected after (?+"; + "digit expected after (?+\0" + "] is an invalid data character in JavaScript compatibility mode"; /* Table to identify digits and hex digits. This is used when compiling @@ -531,14 +532,31 @@ *errorcodeptr = ERR37; break; - /* \g must be followed by a number, either plain or braced. If positive, it - is an absolute backreference. If negative, it is a relative backreference. - This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a - reference to a named group. This is part of Perl's movement towards a - unified syntax for back references. As this is synonymous with \k{name}, we - fudge it up by pretending it really was \k. */ + /* \g must be followed by one of a number of specific things: + + (1) A number, either plain or braced. If positive, it is an absolute + backreference. If negative, it is a relative backreference. This is a Perl + 5.10 feature. 
+ + (2) Perl 5.10 also supports \g{name} as a reference to a named group. This + is part of Perl's movement towards a unified syntax for back references. As + this is synonymous with \k{name}, we fudge it up by pretending it really + was \k. + + (3) For Oniguruma compatibility we also support \g followed by a name or a + number either in angle brackets or in single quotes. However, these are + (possibly recursive) subroutine calls, _not_ backreferences. Just return + the -ESC_g code (cf \k). */ case 'g': + if (ptr[1] == '<' || ptr[1] == '\'') + { + c = -ESC_g; + break; + } + + /* Handle the Perl-compatible cases */ + if (ptr[1] == '{') { const uschar *p; @@ -565,17 +583,23 @@ while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; - if (c < 0) + if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; break; } - - if (c == 0 || (braced && *(++ptr) != '}')) + + if (braced && *(++ptr) != '}') { *errorcodeptr = ERR57; break; } + + if (c == 0) + { + *errorcodeptr = ERR58; + break; + } if (negated) { @@ -611,7 +635,7 @@ c -= '0'; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; - if (c < 0) + if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; break; @@ -952,7 +976,7 @@ Arguments: ptr current position in the pattern - count current count of capturing parens so far encountered + cd compile background data name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode @@ -961,10 +985,11 @@ */ static int -find_parens(const uschar *ptr, int count, const uschar *name, int lorn, +find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, BOOL xmode) { const uschar *thisname; +int count = cd->bracount; for (; *ptr != 0; ptr++) { @@ -984,10 +1009,34 @@ continue; } - /* Skip over character classes */ + /* Skip over character classes; this logic must be similar to the way they + are handled for real. 
If the first character is '^', skip it. Also, if the + first few characters (either before or after ^) are \Q\E or \E we skip them + too. This makes for compatibility with Perl. */ if (*ptr == '[') { + BOOL negate_class = FALSE; + for (;;) + { + int c = *(++ptr); + if (c == '\\') + { + if (ptr[1] == 'E') ptr++; + else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; + else break; + } + else if (!negate_class && c == '^') + negate_class = TRUE; + else break; + } + + /* If the next character is ']', it is a data character that must be + skipped, except in JavaScript compatibility mode. */ + + if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) + ptr++; + while (*(++ptr) != ']') { if (*ptr == 0) return -1; @@ -1544,7 +1593,7 @@ /* Groups with zero repeats can of course be empty; skip them. */ - if (c == OP_BRAZERO || c == OP_BRAMINZERO) + if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO) { code += _pcre_OP_lengths[c]; do code += GET(code, 1); while (*code == OP_ALT); @@ -1824,11 +1873,12 @@ that is referenced. This means that groups can be replicated for fixed repetition simply by copying (because the recursion is allowed to refer to earlier groups that are outside the current group). However, when a group is -optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before -it, after it has been compiled. This means that any OP_RECURSE items within it -that refer to the group itself or any contained groups have to have their -offsets adjusted. That one of the jobs of this function. Before it is called, -the partially compiled regex must be temporarily terminated with OP_END. +optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is +inserted before it, after it has been compiled. This means that any OP_RECURSE +items within it that refer to the group itself or any contained groups have to +have their offsets adjusted. That one of the jobs of this function. 
Before it +is called, the partially compiled regex must be temporarily terminated with +OP_END. This function has been extended with the possibility of forward references for recursions and subroutine calls. It must also check the list of such references @@ -1859,7 +1909,7 @@ /* See if this recursion is on the forward reference list. If so, adjust the reference. */ - + for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) { offset = GET(hc, 0); @@ -2435,7 +2485,7 @@ /* Get next byte in the pattern */ c = *ptr; - + /* If we are in the pre-compile phase, accumulate the length used for the previous cycle of this loop. */ @@ -2630,7 +2680,17 @@ opcode is compiled. It may optionally have a bit map for characters < 256, but those above are are explicitly listed afterwards. A flag byte tells whether the bitmap is present, and whether this is a negated class or not. - */ + + In JavaScript compatibility mode, an isolated ']' causes an error. In + default (Perl) mode, it is treated as a data character. */ + + case ']': + if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) + { + *errorcodeptr = ERR64; + goto FAILED; + } + goto NORMAL_CHAR; case '[': previous = code; @@ -2663,6 +2723,19 @@ negate_class = TRUE; else break; } + + /* Empty classes are allowed in JavaScript compatibility mode. Otherwise, + an initial ']' is taken as a data character -- the code below handles + that. In JS mode, [] must always fail, so generate OP_FAIL, whereas + [^] must match any character, so generate OP_ALLANY. */ + + if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) + { + *code++ = negate_class? 
OP_ALLANY : OP_FAIL; + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + break; + } /* If a class contains a negative special such as \S, we need to flip the negation flag at the end, so that support for characters > 255 works @@ -3819,28 +3892,38 @@ if (repeat_min == 0) { - /* If the maximum is also zero, we just omit the group from the output - altogether. */ + /* If the maximum is also zero, we used to just omit the group from the + output altogether, like this: - if (repeat_max == 0) - { - code = previous; - goto END_REPEAT; - } + ** if (repeat_max == 0) + ** { + ** code = previous; + ** goto END_REPEAT; + ** } + + However, that fails when a group is referenced as a subroutine from + elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it + so that it is skipped on execution. As we don't have a list of which + groups are referenced, we cannot do this selectively. + + If the maximum is 1 or unlimited, we just have to stick in the BRAZERO + and do no more at this point. However, we do need to adjust any + OP_RECURSE calls inside the group that refer to the group itself or any + internal or forward referenced group, because the offset is from the + start of the whole regex. Temporarily terminate the pattern while doing + this. */ - /* If the maximum is 1 or unlimited, we just have to stick in the - BRAZERO and do no more at this point. However, we do need to adjust - any OP_RECURSE calls inside the group that refer to the group itself or - any internal or forward referenced group, because the offset is from - the start of the whole regex. Temporarily terminate the pattern while - doing this. 
*/ - - if (repeat_max <= 1) + if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; adjust_recurse(previous, 1, utf8, cd, save_hwm); memmove(previous+1, previous, len); code++; + if (repeat_max == 0) + { + *previous++ = OP_SKIPZERO; + goto END_REPEAT; + } *previous++ = OP_BRAZERO + repeat_type; } @@ -4034,6 +4117,13 @@ } } } + + /* If previous is OP_FAIL, it was generated by an empty class [] in + JavaScript mode. The other ways in which OP_FAIL can be generated, that is + by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" + error above. We can just ignore the repeat in JS case. */ + + else if (*previous == OP_FAIL) goto END_REPEAT; /* Else there's some kind of shambles */ @@ -4115,7 +4205,7 @@ bravalue = OP_CBRA; save_hwm = cd->hwm; reset_bracount = FALSE; - + /* First deal with various "verbs" that can be introduced by '*'. */ if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) @@ -4321,7 +4411,7 @@ /* Search the pattern for a forward reference */ - else if ((i = find_parens(ptr, cd->bracount, name, namelen, + else if ((i = find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) > 0) { PUT2(code, 2+LINK_SIZE, i); @@ -4567,7 +4657,7 @@ references (?P=name) and recursion (?P>name), as well as falling through from the Perl recursion syntax (?&name). We also come here from the Perl \k or \k'name' back reference syntax and the \k{name} - .NET syntax. */ + .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */ NAMED_REF_OR_RECURSE: name = ++ptr; @@ -4618,7 +4708,7 @@ recno = GET2(slot, 0); } else if ((recno = /* Forward back reference */ - find_parens(ptr, cd->bracount, name, namelen, + find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) <= 0) { *errorcodeptr = ERR15; @@ -4645,6 +4735,15 @@ case '5': case '6': case '7': case '8': case '9': /* subroutine */ { const uschar *called; + terminator = ')'; + + /* Come here from the \g<...> and \g'...' 
code (Oniguruma + compatibility). However, the syntax has been checked to ensure that + the ... are a (signed) number, so that neither ERR63 nor ERR29 will + be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY + ever be taken. */ + + HANDLE_NUMERICAL_RECURSION: if ((refsign = *ptr) == '+') { @@ -4666,7 +4765,7 @@ while((digitab[*ptr] & ctype_digit) != 0) recno = recno * 10 + *ptr++ - '0'; - if (*ptr != ')') + if (*ptr != terminator) { *errorcodeptr = ERR29; goto FAILED; @@ -4719,8 +4818,8 @@ if (called == NULL) { - if (find_parens(ptr, cd->bracount, NULL, recno, - (options & PCRE_EXTENDED) != 0) < 0) + if (find_parens(ptr, cd, NULL, recno, + (options & PCRE_EXTENDED) != 0) < 0) { *errorcodeptr = ERR15; goto FAILED; @@ -5062,7 +5161,7 @@ back references and those types that consume a character may be repeated. We can test for values between ESC_b and ESC_Z for the latter; this may have to change if any new ones are ever created. */ - + case '\\': tempptr = ptr; c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); @@ -5089,6 +5188,64 @@ zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; + + /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n' + is a subroutine call by number (Oniguruma syntax). In fact, the value + -ESC_g is returned only for these cases. So we don't need to check for < + or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is + -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as + that is a synonym for a named back reference). */ + + if (-c == ESC_g) + { + const uschar *p; + save_hwm = cd->hwm; /* Normally this is set when '(' is read */ + terminator = (*(++ptr) == '<')? '>' : '\''; + + /* These two statements stop the compiler for warning about possibly + unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In + fact, because we actually check for a number below, the paths that + would actually be in error are never taken. 
*/ + + skipbytes = 0; + reset_bracount = FALSE; + + /* Test for a name */ + + if (ptr[1] != '+' && ptr[1] != '-') + { + BOOL isnumber = TRUE; + for (p = ptr + 1; *p != 0 && *p != terminator; p++) + { + if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; + if ((cd->ctypes[*p] & ctype_word) == 0) break; + } + if (*p != terminator) + { + *errorcodeptr = ERR57; + break; + } + if (isnumber) + { + ptr++; + goto HANDLE_NUMERICAL_RECURSION; + } + is_recurse = TRUE; + goto NAMED_REF_OR_RECURSE; + } + + /* Test a signed number in angle brackets or quotes. */ + + p = ptr + 2; + while ((digitab[*p] & ctype_digit) != 0) p++; + if (*p != terminator) + { + *errorcodeptr = ERR57; + break; + } + ptr++; + goto HANDLE_NUMERICAL_RECURSION; + } /* \k<name> or \k'name' is a back reference by name (Perl syntax). We also support \k{name} (.NET syntax) */ @@ -6108,7 +6265,7 @@ if (groupptr == NULL) errorcode = ERR53; else PUT(((uschar *)codestart), offset, groupptr - codestart); } - + /* Give an error if there's back reference to a non-existent capturing subpattern. */