--- code/trunk/pcre_compile.c 2007/11/21 15:35:09 275 +++ code/trunk/pcre_compile.c 2009/03/23 12:05:43 406 @@ -6,7 +6,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2007 University of Cambridge + Copyright (c) 1997-2009 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -97,21 +97,56 @@ on. Zero means further processing is needed (for things like \x), or the escape is invalid. */ -#ifndef EBCDIC /* This is the "normal" table for ASCII systems */ +#ifndef EBCDIC + +/* This is the "normal" table for ASCII systems or for EBCDIC systems running +in UTF-8 mode. */ + static const short int escapes[] = { - 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */ - 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */ - '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */ --ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */ --ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */ --ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */ - '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */ --ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */ --ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */ - 0, 0, -ESC_z /* x - z */ + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + CHAR_COLON, CHAR_SEMICOLON, + CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, + CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, + CHAR_COMMERCIAL_AT, -ESC_A, + -ESC_B, -ESC_C, + -ESC_D, -ESC_E, + 0, -ESC_G, + -ESC_H, 0, + 0, -ESC_K, + 0, 0, + 0, 0, + -ESC_P, -ESC_Q, + -ESC_R, -ESC_S, + 0, 0, + -ESC_V, -ESC_W, + -ESC_X, 0, + -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, + CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, + CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, + CHAR_GRAVE_ACCENT, 7, + -ESC_b, 0, + -ESC_d, ESC_e, + ESC_f, 0, + -ESC_h, 0, + 0, -ESC_k, + 0, 0, + ESC_n, 0, + -ESC_p, 0, + ESC_r, -ESC_s, + ESC_tee, 0, + -ESC_v, -ESC_w, + 0, 0, + -ESC_z }; -#else /* This is the "abnormal" table for EBCDIC systems */ +#else + +/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */ + static const short int escapes[] = { /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|', /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0, @@ -142,7 +177,9 @@ /* Table of special "verbs" like (*PRUNE). This is a short table, so it is searched linearly. Put all the names into a single string, in order to reduce -the number of relocations when a shared library is dynamically linked. */ +the number of relocations when a shared library is dynamically linked. The +string is built from string macros so that it works in UTF-8 mode on EBCDIC +platforms. */ typedef struct verbitem { int len; @@ -150,15 +187,15 @@ } verbitem; static const char verbnames[] = - "ACCEPT\0" - "COMMIT\0" - "F\0" - "FAIL\0" - "PRUNE\0" - "SKIP\0" - "THEN"; + STRING_ACCEPT0 + STRING_COMMIT0 + STRING_F0 + STRING_FAIL0 + STRING_PRUNE0 + STRING_SKIP0 + STRING_THEN; -static verbitem verbs[] = { +static const verbitem verbs[] = { { 6, OP_ACCEPT }, { 6, OP_COMMIT }, { 1, OP_FAIL }, @@ -168,7 +205,7 @@ { 4, OP_THEN } }; -static int verbcount = sizeof(verbs)/sizeof(verbitem); +static const int verbcount = sizeof(verbs)/sizeof(verbitem); /* Tables of names of POSIX character classes and their lengths. The names are @@ -178,9 +215,10 @@ for handling case independence. */ static const char posix_names[] = - "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0" - "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0" - "word\0" "xdigit"; + STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0 + STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0 + STRING_graph0 STRING_print0 STRING_punct0 STRING_space0 + STRING_word0 STRING_xdigit; static const uschar posix_name_lengths[] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 }; @@ -295,14 +333,15 @@ /* 55 */ "repeating a DEFINE group is not allowed\0" "inconsistent NEWLINE options\0" - "\\g is not followed by a braced name or an optionally braced non-zero number\0" - "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0" + "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0" + "a numbered reference must not be zero\0" "(*VERB) with an argument is not supported\0" /* 60 */ "(*VERB) not recognized\0" "number is too big\0" "subpattern name expected\0" - "digit expected after (?+"; + "digit expected after (?+\0" + "] is an invalid data character in JavaScript compatibility mode"; /* Table to identify digits and hex digits. This is used when compiling @@ -321,7 +360,11 @@ Then we can use ctype_digit and ctype_xdigit in the code. */ -#ifndef EBCDIC /* This is the "normal" case, for ASCII systems */ +#ifndef EBCDIC + +/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in +UTF-8 mode. */ + static const unsigned char digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */ @@ -357,7 +400,10 @@ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */ -#else /* This is the "abnormal" case, for EBCDIC systems */ +#else + +/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ + static const unsigned char digitab[] = { 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ @@ -454,7 +500,7 @@ find_error_text(int n) { const char *s = error_texts; -for (; n > 0; n--) while (*s++ != 0); +for (; n > 0; n--) while (*s++ != 0) {}; return s; } @@ -502,9 +548,9 @@ in a table. A non-zero result is something that can be returned immediately. Otherwise further processing may be required. */ -#ifndef EBCDIC /* ASCII coding */ -else if (c < '0' || c > 'z') {} /* Not alphanumeric */ -else if ((i = escapes[c - '0']) != 0) c = i; +#ifndef EBCDIC /* ASCII/UTF-8 coding */ +else if (c < CHAR_0 || c > CHAR_z) {} /* Not alphanumeric */ +else if ((i = escapes[c - CHAR_0]) != 0) c = i; #else /* EBCDIC coding */ else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ @@ -523,28 +569,45 @@ /* A number of Perl escapes are not handled by PCRE. We give an explicit error. */ - case 'l': - case 'L': - case 'N': - case 'u': - case 'U': + case CHAR_l: + case CHAR_L: + case CHAR_N: + case CHAR_u: + case CHAR_U: *errorcodeptr = ERR37; break; - /* \g must be followed by a number, either plain or braced. If positive, it - is an absolute backreference. If negative, it is a relative backreference. - This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a - reference to a named group. This is part of Perl's movement towards a - unified syntax for back references. As this is synonymous with \k{name}, we - fudge it up by pretending it really was \k. */ + /* \g must be followed by one of a number of specific things: + + (1) A number, either plain or braced. If positive, it is an absolute + backreference. If negative, it is a relative backreference. This is a Perl + 5.10 feature. + + (2) Perl 5.10 also supports \g{name} as a reference to a named group. This + is part of Perl's movement towards a unified syntax for back references. As + this is synonymous with \k{name}, we fudge it up by pretending it really + was \k. + + (3) For Oniguruma compatibility we also support \g followed by a name or a + number either in angle brackets or in single quotes. However, these are + (possibly recursive) subroutine calls, _not_ backreferences. Just return + the -ESC_g code (cf \k). */ - case 'g': - if (ptr[1] == '{') + case CHAR_g: + if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE) + { + c = -ESC_g; + break; + } + + /* Handle the Perl-compatible cases */ + + if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { const uschar *p; - for (p = ptr+2; *p != 0 && *p != '}'; p++) - if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break; - if (*p != 0 && *p != '}') + for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++) + if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break; + if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET) { c = -ESC_k; break; @@ -554,7 +617,7 @@ } else braced = FALSE; - if (ptr[1] == '-') + if (ptr[1] == CHAR_MINUS) { negated = TRUE; ptr++; @@ -563,20 +626,26 @@ c = 0; while ((digitab[ptr[1]] & ctype_digit) != 0) - c = c * 10 + *(++ptr) - '0'; + c = c * 10 + *(++ptr) - CHAR_0; - if (c < 0) + if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; break; } - if (c == 0 || (braced && *(++ptr) != '}')) + if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET) { *errorcodeptr = ERR57; break; } + if (c == 0) + { + *errorcodeptr = ERR58; + break; + } + if (negated) { if (c > bracount) @@ -602,16 +671,16 @@ value is greater than 377, the least significant 8 bits are taken. Inside a character class, \ followed by a digit is always an octal number. */ - case '1': case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': + case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: + case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: if (!isclass) { oldptr = ptr; - c -= '0'; + c -= CHAR_0; while ((digitab[ptr[1]] & ctype_digit) != 0) - c = c * 10 + *(++ptr) - '0'; - if (c < 0) + c = c * 10 + *(++ptr) - CHAR_0; + if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; break; @@ -628,7 +697,7 @@ generates a binary zero byte and treats the digit as a following literal. Thus we have to pull back the pointer by one. */ - if ((c = *ptr) >= '8') + if ((c = *ptr) >= CHAR_8) { ptr--; c = 0; @@ -641,10 +710,10 @@ to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more than 3 octal digits. */ - case '0': - c -= '0'; - while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') - c = c * 8 + *(++ptr) - '0'; + case CHAR_0: + c -= CHAR_0; + while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) + c = c * 8 + *(++ptr) - CHAR_0; if (!utf8 && c > 255) *errorcodeptr = ERR51; break; @@ -652,8 +721,8 @@ than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is treated as a data character. */ - case 'x': - if (ptr[1] == '{') + case CHAR_x: + if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) { const uschar *pt = ptr + 2; int count = 0; @@ -662,19 +731,19 @@ while ((digitab[*pt] & ctype_xdigit) != 0) { register int cc = *pt++; - if (c == 0 && cc == '0') continue; /* Leading zeroes */ + if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ count++; -#ifndef EBCDIC /* ASCII coding */ - if (cc >= 'a') cc -= 32; /* Convert to upper case */ - c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ + c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); #else /* EBCDIC coding */ - if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ - c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); + if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ + c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif } - if (*pt == '}') + if (*pt == CHAR_RIGHT_CURLY_BRACKET) { if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; ptr = pt; @@ -690,14 +759,14 @@ c = 0; while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) { - int cc; /* Some compilers don't like ++ */ - cc = *(++ptr); /* in initializers */ -#ifndef EBCDIC /* ASCII coding */ - if (cc >= 'a') cc -= 32; /* Convert to upper case */ - c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); + int cc; /* Some compilers don't like */ + cc = *(++ptr); /* ++ in initializers */ +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ + c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10)); #else /* EBCDIC coding */ - if (cc <= 'z') cc += 64; /* Convert to upper case */ - c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); + if (cc <= CHAR_z) cc += 64; /* Convert to upper case */ + c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); #endif } break; @@ -706,7 +775,7 @@ This coding is ASCII-specific, but then the whole concept of \cx is ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ - case 'c': + case CHAR_c: c = *(++ptr); if (c == 0) { @@ -714,11 +783,11 @@ break; } -#ifndef EBCDIC /* ASCII coding */ - if (c >= 'a' && c <= 'z') c -= 32; +#ifndef EBCDIC /* ASCII/UTF-8 coding */ + if (c >= CHAR_a && c <= CHAR_z) c -= 32; c ^= 0x40; #else /* EBCDIC coding */ - if (c >= 'a' && c <= 'z') c += 64; + if (c >= CHAR_a && c <= CHAR_z) c += 64; c ^= 0xC0; #endif break; @@ -780,9 +849,9 @@ /* \P or \p can be followed by a name in {}, optionally preceded by ^ for negation. */ -if (c == '{') +if (c == CHAR_LEFT_CURLY_BRACKET) { - if (ptr[1] == '^') + if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT) { *negptr = TRUE; ptr++; @@ -791,10 +860,10 @@ { c = *(++ptr); if (c == 0) goto ERROR_RETURN; - if (c == '}') break; + if (c == CHAR_RIGHT_CURLY_BRACKET) break; name[i] = c; } - if (c !='}') goto ERROR_RETURN; + if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN; name[i] = 0; } @@ -859,15 +928,15 @@ { if ((digitab[*p++] & ctype_digit) == 0) return FALSE; while ((digitab[*p] & ctype_digit) != 0) p++; -if (*p == '}') return TRUE; +if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; -if (*p++ != ',') return FALSE; -if (*p == '}') return TRUE; +if (*p++ != CHAR_COMMA) return FALSE; +if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE; if ((digitab[*p++] & ctype_digit) == 0) return FALSE; while ((digitab[*p] & ctype_digit) != 0) p++; -return (*p == '}'); +return (*p == CHAR_RIGHT_CURLY_BRACKET); } @@ -900,7 +969,7 @@ /* Read the minimum value and do a paranoid check: a negative value indicates an integer overflow. */ -while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; +while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0; if (min < 0 || min > 65535) { *errorcodeptr = ERR5; @@ -910,12 +979,12 @@ /* Read the maximum value if there is one, and again do a paranoid on its size. Also, max must not be less than min. */ -if (*p == '}') max = min; else +if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else { - if (*(++p) != '}') + if (*(++p) != CHAR_RIGHT_CURLY_BRACKET) { max = 0; - while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; + while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0; if (max < 0 || max > 65535) { *errorcodeptr = ERR5; @@ -952,7 +1021,7 @@ Arguments: ptr current position in the pattern - count current count of capturing parens so far encountered + cd compile background data name name to seek, or NULL if seeking a numbered subpattern lorn name length, or subpattern number if name is NULL xmode TRUE if we are in /x mode @@ -961,10 +1030,11 @@ */ static int -find_parens(const uschar *ptr, int count, const uschar *name, int lorn, +find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, BOOL xmode) { const uschar *thisname; +int count = cd->bracount; for (; *ptr != 0; ptr++) { @@ -972,33 +1042,63 @@ /* Skip over backslashed characters and also entire \Q...\E */ - if (*ptr == '\\') + if (*ptr == CHAR_BACKSLASH) { if (*(++ptr) == 0) return -1; - if (*ptr == 'Q') for (;;) + if (*ptr == CHAR_Q) for (;;) { - while (*(++ptr) != 0 && *ptr != '\\'); + while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {}; if (*ptr == 0) return -1; - if (*(++ptr) == 'E') break; + if (*(++ptr) == CHAR_E) break; } continue; } - /* Skip over character classes */ + /* Skip over character classes; this logic must be similar to the way they + are handled for real. If the first character is '^', skip it. Also, if the + first few characters (either before or after ^) are \Q\E or \E we skip them + too. This makes for compatibility with Perl. Note the use of STR macros to + encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */ - if (*ptr == '[') + if (*ptr == CHAR_LEFT_SQUARE_BRACKET) { - while (*(++ptr) != ']') + BOOL negate_class = FALSE; + for (;;) + { + int c = *(++ptr); + if (c == CHAR_BACKSLASH) + { + if (ptr[1] == CHAR_E) + ptr++; + else if (strncmp((const char *)ptr+1, + STR_Q STR_BACKSLASH STR_E, 3) == 0) + ptr += 3; + else + break; + } + else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) + negate_class = TRUE; + else break; + } + + /* If the next character is ']', it is a data character that must be + skipped, except in JavaScript compatibility mode. */ + + if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET && + (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) + ptr++; + + while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET) { if (*ptr == 0) return -1; - if (*ptr == '\\') + if (*ptr == CHAR_BACKSLASH) { if (*(++ptr) == 0) return -1; - if (*ptr == 'Q') for (;;) + if (*ptr == CHAR_Q) for (;;) { - while (*(++ptr) != 0 && *ptr != '\\'); + while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {}; if (*ptr == 0) return -1; - if (*(++ptr) == 'E') break; + if (*(++ptr) == CHAR_E) break; } continue; } @@ -1008,17 +1108,17 @@ /* Skip comments in /x mode */ - if (xmode && *ptr == '#') + if (xmode && *ptr == CHAR_NUMBER_SIGN) { - while (*(++ptr) != 0 && *ptr != '\n'); + while (*(++ptr) != 0 && *ptr != CHAR_NL) {}; if (*ptr == 0) return -1; continue; } /* An opening parens must now be a real metacharacter */ - if (*ptr != '(') continue; - if (ptr[1] != '?' && ptr[1] != '*') + if (*ptr != CHAR_LEFT_PARENTHESIS) continue; + if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK) { count++; if (name == NULL && count == lorn) return count; @@ -1026,19 +1126,19 @@ } ptr += 2; - if (*ptr == 'P') ptr++; /* Allow optional P */ + if (*ptr == CHAR_P) ptr++; /* Allow optional P */ /* We have to disambiguate (? */ - if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') && - *ptr != '\'') + if ((*ptr != CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_EXCLAMATION_MARK || + ptr[1] == CHAR_EQUALS_SIGN) && *ptr != CHAR_APOSTROPHE) continue; count++; if (name == NULL && count == lorn) return count; term = *ptr++; - if (term == '<') term = '>'; + if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN; thisname = ptr; while (*ptr != term) ptr++; if (name != NULL && lorn == ptr - thisname && @@ -1252,6 +1352,7 @@ case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: + case OP_ALLANY: branchlength++; cc++; break; @@ -1400,6 +1501,8 @@ if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; } +#else + (void)(utf8); /* Keep compiler happy by referencing function argument */ #endif } } @@ -1493,6 +1596,8 @@ if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; } +#else + (void)(utf8); /* Keep compiler happy by referencing function argument */ #endif } } @@ -1508,8 +1613,9 @@ can match the empty string or not. It is called from could_be_empty() below and from compile_branch() when checking for an unlimited repeat of a group that can match nothing. Note that first_significant_code() skips over -assertions. If we hit an unclosed bracket, we return "empty" - this means we've -struck an inner bracket whose current branch will already have been scanned. +backward and negative forward assertions when its final argument is TRUE. If we +hit an unclosed bracket, we return "empty" - this means we've struck an inner +bracket whose current branch will already have been scanned. Arguments: code points to start of search @@ -1531,9 +1637,19 @@ c = *code; + /* Skip over forward assertions; the other assertions are skipped by + first_significant_code() with a TRUE final argument. */ + + if (c == OP_ASSERT) + { + do code += GET(code, 1); while (*code == OP_ALT); + c = *code; + continue; + } + /* Groups with zero repeats can of course be empty; skip them. */ - if (c == OP_BRAZERO || c == OP_BRAMINZERO) + if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO) { code += _pcre_OP_lengths[c]; do code += GET(code, 1); while (*code == OP_ALT); @@ -1548,17 +1664,25 @@ BOOL empty_branch; if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ - /* Scan a closed bracket */ + /* If a conditional group has only one branch, there is a second, implied, + empty branch, so just skip over the conditional, because it could be empty. + Otherwise, scan the individual branches of the group. */ - empty_branch = FALSE; - do - { - if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) - empty_branch = TRUE; + if (c == OP_COND && code[GET(code, 1)] != OP_ALT) code += GET(code, 1); + else + { + empty_branch = FALSE; + do + { + if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) + empty_branch = TRUE; + code += GET(code, 1); + } + while (*code == OP_ALT); + if (!empty_branch) return FALSE; /* All branches are non-empty */ } - while (*code == OP_ALT); - if (!empty_branch) return FALSE; /* All branches are non-empty */ + c = *code; continue; } @@ -1619,6 +1743,7 @@ case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: + case OP_ALLANY: case OP_ANYBYTE: case OP_CHAR: case OP_CHARNC: @@ -1726,29 +1851,48 @@ *************************************************/ /* This function is called when the sequence "[:" or "[." or "[=" is -encountered in a character class. It checks whether this is followed by an -optional ^ and then a sequence of letters, terminated by a matching ":]" or -".]" or "=]". +encountered in a character class. It checks whether this is followed by a +sequence of characters terminated by a matching ":]" or ".]" or "=]". If we +reach an unescaped ']' without the special preceding character, return FALSE. + +Originally, this function only recognized a sequence of letters between the +terminators, but it seems that Perl recognizes any sequence of characters, +though of course unknown POSIX names are subsequently rejected. Perl gives an +"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE +didn't consider this to be a POSIX class. Likewise for [:1234:]. + +The problem in trying to be exactly like Perl is in the handling of escapes. We +have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX +class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code +below handles the special case of \], but does not try to do any other escape +processing. This makes it different from Perl for cases such as [:l\ower:] +where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize +"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does, +I think. -Argument: +Arguments: ptr pointer to the initial [ endptr where to return the end pointer - cd pointer to compile data Returns: TRUE or FALSE */ static BOOL -check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd) +check_posix_syntax(const uschar *ptr, const uschar **endptr) { int terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ -if (*(++ptr) == '^') ptr++; -while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++; -if (*ptr == terminator && ptr[1] == ']') +for (++ptr; *ptr != 0; ptr++) { - *endptr = ptr; - return TRUE; + if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else + { + if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE; + if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) + { + *endptr = ptr; + return TRUE; + } + } } return FALSE; } @@ -1794,11 +1938,12 @@ that is referenced. This means that groups can be replicated for fixed repetition simply by copying (because the recursion is allowed to refer to earlier groups that are outside the current group). However, when a group is -optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before -it, after it has been compiled. This means that any OP_RECURSE items within it -that refer to the group itself or any contained groups have to have their -offsets adjusted. That one of the jobs of this function. Before it is called, -the partially compiled regex must be temporarily terminated with OP_END. +optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is +inserted before it, after it has been compiled. This means that any OP_RECURSE +items within it that refer to the group itself or any contained groups have to +have their offsets adjusted. That one of the jobs of this function. Before it +is called, the partially compiled regex must be temporarily terminated with +OP_END. This function has been extended with the possibility of forward references for recursions and subroutine calls. It must also check the list of such references @@ -1933,7 +2078,7 @@ unsigned int c, othercase, next; for (c = *cptr; c <= d; c++) - { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; } + { if ((othercase = UCD_OTHERCASE(c)) != c) break; } if (c > d) return FALSE; @@ -1942,7 +2087,7 @@ for (++c; c <= d; c++) { - if (_pcre_ucp_othercase(c) != next) break; + if (UCD_OTHERCASE(c) != next) break; next++; } @@ -1988,7 +2133,7 @@ for (;;) { while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; - if (*ptr == '#') + if (*ptr == CHAR_NUMBER_SIGN) { while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } @@ -2000,7 +2145,7 @@ /* If the next item is one that we can handle, get its value. A non-negative value is a character, a negative value is an escape value. */ -if (*ptr == '\\') +if (*ptr == CHAR_BACKSLASH) { int temperrorcode = 0; next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); @@ -2025,7 +2170,7 @@ for (;;) { while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; - if (*ptr == '#') + if (*ptr == CHAR_NUMBER_SIGN) { while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } @@ -2036,8 +2181,9 @@ /* If the next thing is itself optional, we have to give up. */ -if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) - return FALSE; +if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK || + strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) + return FALSE; /* Now compare the next item with the previous opcode. If the previous is a positive single character match, "item" either contains the character or, if @@ -2052,6 +2198,8 @@ case OP_CHAR: #ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } +#else + (void)(utf8_char); /* Keep compiler happy by referencing function argument */ #endif return item != next; @@ -2070,7 +2218,7 @@ unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else #ifdef SUPPORT_UCP - othercase = _pcre_ucp_othercase((unsigned int)next); + othercase = UCD_OTHERCASE((unsigned int)next); #else othercase = NOTACHAR; #endif @@ -2083,7 +2231,6 @@ /* For OP_NOT, "item" must be a single-byte character. */ case OP_NOT: - if (next < 0) return FALSE; /* Not a character */ if (item == next) return TRUE; if ((options & PCRE_CASELESS) == 0) return FALSE; #ifdef SUPPORT_UTF8 @@ -2092,7 +2239,7 @@ unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else #ifdef SUPPORT_UCP - othercase = _pcre_ucp_othercase(next); + othercase = UCD_OTHERCASE(next); #else othercase = NOTACHAR; #endif @@ -2346,6 +2493,7 @@ BOOL class_utf8; BOOL utf8 = (options & PCRE_UTF8) != 0; uschar *class_utf8data; +uschar *class_utf8data_base; uschar utf8_char[6]; #else BOOL utf8 = FALSE; @@ -2385,7 +2533,7 @@ for (;; ptr++) { BOOL negate_class; - BOOL should_flip_negation; + BOOL should_flip_negation; BOOL possessive_quantifier; BOOL is_quantifier; BOOL is_recurse; @@ -2473,7 +2621,7 @@ if (inescq && c != 0) { - if (c == '\\' && ptr[1] == 'E') + if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) { inescq = FALSE; ptr++; @@ -2499,8 +2647,9 @@ /* Fill in length of a previous callout, except when the next thing is a quantifier. */ - is_quantifier = c == '*' || c == '+' || c == '?' || - (c == '{' && is_counted_repeat(ptr+1)); + is_quantifier = + c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK || + (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1)); if (!is_quantifier && previous_callout != NULL && after_manual_callout-- <= 0) @@ -2515,7 +2664,7 @@ if ((options & PCRE_EXTENDED) != 0) { if ((cd->ctypes[c] & ctype_space) != 0) continue; - if (c == '#') + if (c == CHAR_NUMBER_SIGN) { while (*(++ptr) != 0) { @@ -2540,8 +2689,8 @@ { /* ===================================================================*/ case 0: /* The branch terminates at string end */ - case '|': /* or | or ) */ - case ')': + case CHAR_VERTICAL_LINE: /* or | or ) */ + case CHAR_RIGHT_PARENTHESIS: *firstbyteptr = firstbyte; *reqbyteptr = reqbyte; *codeptr = code; @@ -2563,7 +2712,7 @@ /* Handle single-character metacharacters. In multiline mode, ^ disables the setting of any following char as a first character. */ - case '^': + case CHAR_CIRCUMFLEX_ACCENT: if ((options & PCRE_MULTILINE) != 0) { if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; @@ -2572,7 +2721,7 @@ *code++ = OP_CIRC; break; - case '$': + case CHAR_DOLLAR_SIGN: previous = NULL; *code++ = OP_DOLL; break; @@ -2580,12 +2729,12 @@ /* There can never be a first char if '.' is first, whatever happens about repeats. The value of reqbyte doesn't change either. */ - case '.': + case CHAR_DOT: if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; previous = code; - *code++ = OP_ANY; + *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; break; @@ -2600,18 +2749,29 @@ opcode is compiled. It may optionally have a bit map for characters < 256, but those above are are explicitly listed afterwards. A flag byte tells whether the bitmap is present, and whether this is a negated class or not. - */ - case '[': + In JavaScript compatibility mode, an isolated ']' causes an error. In + default (Perl) mode, it is treated as a data character. */ + + case CHAR_RIGHT_SQUARE_BRACKET: + if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) + { + *errorcodeptr = ERR64; + goto FAILED; + } + goto NORMAL_CHAR; + + case CHAR_LEFT_SQUARE_BRACKET: previous = code; /* PCRE supports POSIX class stuff inside a class. Perl gives an error if they are encountered at the top level, so we'll do that too. */ - if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr, cd)) + if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || + ptr[1] == CHAR_EQUALS_SIGN) && + check_posix_syntax(ptr, &tempptr)) { - *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; + *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31; goto FAILED; } @@ -2623,19 +2783,37 @@ for (;;) { c = *(++ptr); - if (c == '\\') + if (c == CHAR_BACKSLASH) { - if (ptr[1] == 'E') ptr++; - else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; - else break; + if (ptr[1] == CHAR_E) + ptr++; + else if (strncmp((const char *)ptr+1, + STR_Q STR_BACKSLASH STR_E, 3) == 0) + ptr += 3; + else + break; } - else if (!negate_class && c == '^') + else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT) negate_class = TRUE; else break; } - /* If a class contains a negative special such as \S, we need to flip the - negation flag at the end, so that support for characters > 255 works + /* Empty classes are allowed in JavaScript compatibility mode. Otherwise, + an initial ']' is taken as a data character -- the code below handles + that. In JS mode, [] must always fail, so generate OP_FAIL, whereas + [^] must match any character, so generate OP_ALLANY. */ + + if (c == CHAR_RIGHT_SQUARE_BRACKET && + (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) + { + *code++ = negate_class? OP_ALLANY : OP_FAIL; + if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; + zerofirstbyte = firstbyte; + break; + } + + /* If a class contains a negative special such as \S, we need to flip the + negation flag at the end, so that support for characters > 255 works correctly (they are all included in the class). */ should_flip_negation = FALSE; @@ -2657,6 +2835,7 @@ #ifdef SUPPORT_UTF8 class_utf8 = FALSE; /* No chars >= 256 */ class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ + class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ #endif /* Process characters until ] is reached. By writing this as a "do" it @@ -2672,13 +2851,25 @@ { /* Braces are required because the */ GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ } + + /* In the pre-compile phase, accumulate the length of any UTF-8 extra + data and reset the pointer. This is so that very large classes that + contain a zillion UTF-8 characters no longer overwrite the work space + (which is on the stack). */ + + if (lengthptr != NULL) + { + *lengthptr += class_utf8data - class_utf8data_base; + class_utf8data = class_utf8data_base; + } + #endif /* Inside \Q...\E everything is literal except \E */ if (inescq) { - if (c == '\\' && ptr[1] == 'E') /* If we are at \E */ + if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E) /* If we are at \E */ { inescq = FALSE; /* Reset literal state */ ptr++; /* Skip the 'E' */ @@ -2693,26 +2884,26 @@ [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 5.6 and 5.8 do. */ - if (c == '[' && - (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && - check_posix_syntax(ptr, &tempptr, cd)) + if (c == CHAR_LEFT_SQUARE_BRACKET && + (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT || + ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr)) { BOOL local_negate = FALSE; int posix_class, taboffset, tabopt; register const uschar *cbits = cd->cbits; uschar pbits[32]; - if (ptr[1] != ':') + if (ptr[1] != CHAR_COLON) { *errorcodeptr = ERR31; goto FAILED; } ptr += 2; - if (*ptr == '^') + if (*ptr == CHAR_CIRCUMFLEX_ACCENT) { local_negate = TRUE; - should_flip_negation = TRUE; /* Note negative special */ + should_flip_negation = TRUE; /* Note negative special */ ptr++; } @@ -2782,17 +2973,17 @@ to 'or' into the one we are building. We assume they have more than one character in them, so set class_charcount bigger than one. */ - if (c == '\\') + if (c == CHAR_BACKSLASH) { c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; - if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */ - else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ - else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ + if (-c == ESC_b) c = CHAR_BS; /* \b is backspace in a class */ + else if (-c == ESC_X) c = CHAR_X; /* \X is literal X in a class */ + else if (-c == ESC_R) c = CHAR_R; /* \R is literal R in a class */ else if (-c == ESC_Q) /* Handle start of quoted string */ { - if (ptr[1] == '\\' && ptr[2] == 'E') + if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) { ptr += 2; /* avoid empty string */ } @@ -2815,7 +3006,7 @@ continue; case ESC_D: - should_flip_negation = TRUE; + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; continue; @@ -2824,7 +3015,7 @@ continue; case ESC_W: - should_flip_negation = TRUE; + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; continue; @@ -2834,14 +3025,11 @@ continue; case ESC_S: - should_flip_negation = TRUE; + should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ continue; - case ESC_E: /* Perl ignores an orphan \E */ - continue; - default: /* Not recognized; fall through */ break; /* Need "default" setting to stop compiler warning. */ } @@ -3021,7 +3209,7 @@ entirely. The code for handling \Q and \E is messy. */ CHECK_RANGE: - while (ptr[1] == '\\' && ptr[2] == 'E') + while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) { inescq = FALSE; ptr += 2; @@ -3031,28 +3219,29 @@ /* Remember \r or \n */ - if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF; + if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; /* Check for range */ - if (!inescq && ptr[1] == '-') + if (!inescq && ptr[1] == CHAR_MINUS) { int d; ptr += 2; - while (*ptr == '\\' && ptr[1] == 'E') ptr += 2; + while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2; /* If we hit \Q (not followed by \E) at this point, go into escaped mode. */ - while (*ptr == '\\' && ptr[1] == 'Q') + while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q) { ptr += 2; - if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } + if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) + { ptr += 2; continue; } inescq = TRUE; break; } - if (*ptr == 0 || (!inescq && *ptr == ']')) + if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) { ptr = oldptr; goto LONE_SINGLE_CHARACTER; @@ -3071,7 +3260,7 @@ not any of the other escapes. Perl 5.6 treats a hyphen as a literal in such circumstances. */ - if (!inescq && d == '\\') + if (!inescq && d == CHAR_BACKSLASH) { d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; @@ -3081,9 +3270,9 @@ if (d < 0) { - if (d == -ESC_b) d = '\b'; - else if (d == -ESC_X) d = 'X'; - else if (d == -ESC_R) d = 'R'; else + if (d == -ESC_b) d = CHAR_BS; + else if (d == -ESC_X) d = CHAR_X; + else if (d == -ESC_R) d = CHAR_R; else { ptr = oldptr; goto LONE_SINGLE_CHARACTER; /* A few lines below */ @@ -3104,7 +3293,7 @@ /* Remember \r or \n */ - if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF; + if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless matching, we have to use an XCLASS with extra data items. Caseless @@ -3224,7 +3413,7 @@ if ((options & PCRE_CASELESS) != 0) { unsigned int othercase; - if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) + if ((othercase = UCD_OTHERCASE(c)) != c) { *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); @@ -3251,7 +3440,7 @@ /* Loop until ']' reached. This "while" is the end of the "do" above. */ - while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); + while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq)); if (c == 0) /* Missing terminating ']' */ { @@ -3340,9 +3529,9 @@ zeroreqbyte = reqbyte; /* If there are characters with values > 255, we have to compile an - extended class, with its own opcode, unless there was a negated special - such as \S in the class, because in that case all characters > 255 are in - the class, so any that were explicitly given as well can be ignored. If + extended class, with its own opcode, unless there was a negated special + such as \S in the class, because in that case all characters > 255 are in + the class, so any that were explicitly given as well can be ignored. If (when there are explicit characters > 255 that must be listed) there are no characters < 256, we can omit the bitmap in the actual compiled code. */ @@ -3373,11 +3562,11 @@ } #endif - /* If there are no characters > 255, set the opcode to OP_CLASS or - OP_NCLASS, depending on whether the whole class was negated and whether - there were negative specials such as \S in the class. Then copy the 32-byte + /* If there are no characters > 255, set the opcode to OP_CLASS or + OP_NCLASS, depending on whether the whole class was negated and whether + there were negative specials such as \S in the class. Then copy the 32-byte map into the code vector, negating it if necessary. */ - + *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; if (negate_class) { @@ -3396,23 +3585,23 @@ /* Various kinds of repeat; '{' is not necessarily a quantifier, but this has been tested above. */ - case '{': + case CHAR_LEFT_CURLY_BRACKET: if (!is_quantifier) goto NORMAL_CHAR; ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); if (*errorcodeptr != 0) goto FAILED; goto REPEAT; - case '*': + case CHAR_ASTERISK: repeat_min = 0; repeat_max = -1; goto REPEAT; - case '+': + case CHAR_PLUS: repeat_min = 1; repeat_max = -1; goto REPEAT; - case '?': + case CHAR_QUESTION_MARK: repeat_min = 0; repeat_max = 1; @@ -3447,13 +3636,13 @@ but if PCRE_UNGREEDY is set, it works the other way round. We change the repeat type to the non-default. */ - if (ptr[1] == '+') + if (ptr[1] == CHAR_PLUS) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; ptr++; } - else if (ptr[1] == '?') + else if (ptr[1] == CHAR_QUESTION_MARK) { repeat_type = greedy_non_default; ptr++; @@ -3779,28 +3968,38 @@ if (repeat_min == 0) { - /* If the maximum is also zero, we just omit the group from the output - altogether. */ - - if (repeat_max == 0) - { - code = previous; - goto END_REPEAT; - } + /* If the maximum is also zero, we used to just omit the group from the + output altogether, like this: - /* If the maximum is 1 or unlimited, we just have to stick in the - BRAZERO and do no more at this point. However, we do need to adjust - any OP_RECURSE calls inside the group that refer to the group itself or - any internal or forward referenced group, because the offset is from - the start of the whole regex. Temporarily terminate the pattern while - doing this. */ + ** if (repeat_max == 0) + ** { + ** code = previous; + ** goto END_REPEAT; + ** } + + However, that fails when a group is referenced as a subroutine from + elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it + so that it is skipped on execution. As we don't have a list of which + groups are referenced, we cannot do this selectively. + + If the maximum is 1 or unlimited, we just have to stick in the BRAZERO + and do no more at this point. However, we do need to adjust any + OP_RECURSE calls inside the group that refer to the group itself or any + internal or forward referenced group, because the offset is from the + start of the whole regex. Temporarily terminate the pattern while doing + this. */ - if (repeat_max <= 1) + if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; adjust_recurse(previous, 1, utf8, cd, save_hwm); memmove(previous+1, previous, len); code++; + if (repeat_max == 0) + { + *previous++ = OP_SKIPZERO; + goto END_REPEAT; + } *previous++ = OP_BRAZERO + repeat_type; } @@ -3995,6 +4194,13 @@ } } + /* If previous is OP_FAIL, it was generated by an empty class [] in + JavaScript mode. The other ways in which OP_FAIL can be generated, that is + by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" + error above. We can just ignore the repeat in JS case. */ + + else if (*previous == OP_FAIL) goto END_REPEAT; + /* Else there's some kind of shambles */ else @@ -4021,7 +4227,9 @@ int len; if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || *tempcode == OP_NOTEXACT) - tempcode += _pcre_OP_lengths[*tempcode]; + tempcode += _pcre_OP_lengths[*tempcode] + + ((*tempcode == OP_TYPEEXACT && + (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0); len = code - tempcode; if (len > 0) switch (*tempcode) { @@ -4067,7 +4275,7 @@ lookbehind or option setting or condition or all the other extended parenthesis forms. */ - case '(': + case CHAR_LEFT_PARENTHESIS: newoptions = options; skipbytes = 0; bravalue = OP_CBRA; @@ -4076,19 +4284,19 @@ /* First deal with various "verbs" that can be introduced by '*'. */ - if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) + if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0) { int i, namelen; const char *vn = verbnames; const uschar *name = ++ptr; previous = NULL; - while ((cd->ctypes[*++ptr] & ctype_letter) != 0); - if (*ptr == ':') + while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; + if (*ptr == CHAR_COLON) { *errorcodeptr = ERR59; /* Not supported */ goto FAILED; } - if (*ptr != ')') + if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR60; goto FAILED; @@ -4113,7 +4321,7 @@ /* Deal with the extended parentheses; all are introduced by '?', and the appearance of any of them means that this is not a capturing group. */ - else if (*ptr == '?') + else if (*ptr == CHAR_QUESTION_MARK) { int i, set, unset, namelen; int *optset; @@ -4122,9 +4330,9 @@ switch (*(++ptr)) { - case '#': /* Comment; skip to ket */ + case CHAR_NUMBER_SIGN: /* Comment; skip to ket */ ptr++; - while (*ptr != 0 && *ptr != ')') ptr++; + while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; if (*ptr == 0) { *errorcodeptr = ERR18; @@ -4134,19 +4342,19 @@ /* ------------------------------------------------------------ */ - case '|': /* Reset capture count for each branch */ + case CHAR_VERTICAL_LINE: /* Reset capture count for each branch */ reset_bracount = TRUE; /* Fall through */ /* ------------------------------------------------------------ */ - case ':': /* Non-capturing bracket */ + case CHAR_COLON: /* Non-capturing bracket */ bravalue = OP_BRA; ptr++; break; /* ------------------------------------------------------------ */ - case '(': + case CHAR_LEFT_PARENTHESIS: bravalue = OP_COND; /* Conditional group */ /* A condition can be an assertion, a number (referring to a numbered @@ -4166,7 +4374,8 @@ the switch. This will take control down to where bracketed groups, including assertions, are processed. */ - if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<')) + if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN || + ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN)) break; /* Most other conditions use OP_CREF (a couple change to OP_RREF @@ -4178,7 +4387,7 @@ /* Check for a test for recursion in a named group. */ - if (ptr[1] == 'R' && ptr[2] == '&') + if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND) { terminator = -1; ptr += 2; @@ -4188,20 +4397,20 @@ /* Check for a test for a named group's having been set, using the Perl syntax (?() or (?('name') */ - else if (ptr[1] == '<') + else if (ptr[1] == CHAR_LESS_THAN_SIGN) { - terminator = '>'; + terminator = CHAR_GREATER_THAN_SIGN; ptr++; } - else if (ptr[1] == '\'') + else if (ptr[1] == CHAR_APOSTROPHE) { - terminator = '\''; + terminator = CHAR_APOSTROPHE; ptr++; } else { terminator = 0; - if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr); + if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr); } /* We now expect to read a name; any thing else is an error */ @@ -4221,12 +4430,13 @@ { if (recno >= 0) recno = ((digitab[*ptr] & ctype_digit) != 0)? - recno * 10 + *ptr - '0' : -1; + recno * 10 + *ptr - CHAR_0 : -1; ptr++; } namelen = ptr - name; - if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')') + if ((terminator > 0 && *ptr++ != terminator) || + *ptr++ != CHAR_RIGHT_PARENTHESIS) { ptr--; /* Error offset */ *errorcodeptr = ERR26; @@ -4248,7 +4458,7 @@ *errorcodeptr = ERR58; goto FAILED; } - recno = (refsign == '-')? + recno = (refsign == CHAR_MINUS)? cd->bracount - recno + 1 : recno +cd->bracount; if (recno <= 0 || recno > cd->final_bracount) { @@ -4279,7 +4489,7 @@ /* Search the pattern for a forward reference */ - else if ((i = find_parens(ptr, cd->bracount, name, namelen, + else if ((i = find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) > 0) { PUT2(code, 2+LINK_SIZE, i); @@ -4300,7 +4510,7 @@ /* Check for (?(R) for recursion. Allow digits after R to specify a specific group number. */ - else if (*name == 'R') + else if (*name == CHAR_R) { recno = 0; for (i = 1; i < namelen; i++) @@ -4310,7 +4520,7 @@ *errorcodeptr = ERR15; goto FAILED; } - recno = recno * 10 + name[i] - '0'; + recno = recno * 10 + name[i] - CHAR_0; } if (recno == 0) recno = RREF_ANY; code[1+LINK_SIZE] = OP_RREF; /* Change test type */ @@ -4320,14 +4530,14 @@ /* Similarly, check for the (?(DEFINE) "condition", which is always false. */ - else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0) + else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0) { code[1+LINK_SIZE] = OP_DEF; skipbytes = 1; } /* Check for the "name" actually being a subpattern number. We are - in the second pass here, so final_bracount is set. */ + in the second pass here, so final_bracount is set. */ else if (recno > 0 && recno <= cd->final_bracount) { @@ -4345,16 +4555,16 @@ /* ------------------------------------------------------------ */ - case '=': /* Positive lookahead */ + case CHAR_EQUALS_SIGN: /* Positive lookahead */ bravalue = OP_ASSERT; ptr++; break; /* ------------------------------------------------------------ */ - case '!': /* Negative lookahead */ + case CHAR_EXCLAMATION_MARK: /* Negative lookahead */ ptr++; - if (*ptr == ')') /* Optimize (?!) */ + if (*ptr == CHAR_RIGHT_PARENTHESIS) /* Optimize (?!) */ { *code++ = OP_FAIL; previous = NULL; @@ -4365,15 +4575,15 @@ /* ------------------------------------------------------------ */ - case '<': /* Lookbehind or named define */ + case CHAR_LESS_THAN_SIGN: /* Lookbehind or named define */ switch (ptr[1]) { - case '=': /* Positive lookbehind */ + case CHAR_EQUALS_SIGN: /* Positive lookbehind */ bravalue = OP_ASSERTBACK; ptr += 2; break; - case '!': /* Negative lookbehind */ + case CHAR_EXCLAMATION_MARK: /* Negative lookbehind */ bravalue = OP_ASSERTBACK_NOT; ptr += 2; break; @@ -4388,22 +4598,22 @@ /* ------------------------------------------------------------ */ - case '>': /* One-time brackets */ + case CHAR_GREATER_THAN_SIGN: /* One-time brackets */ bravalue = OP_ONCE; ptr++; break; /* ------------------------------------------------------------ */ - case 'C': /* Callout - may be followed by digits; */ + case CHAR_C: /* Callout - may be followed by digits; */ previous_callout = code; /* Save for later completion */ after_manual_callout = 1; /* Skip one item before completing */ *code++ = OP_CALLOUT; { int n = 0; while ((digitab[*(++ptr)] & ctype_digit) != 0) - n = n * 10 + *ptr - '0'; - if (*ptr != ')') + n = n * 10 + *ptr - CHAR_0; + if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR39; goto FAILED; @@ -4423,14 +4633,15 @@ /* ------------------------------------------------------------ */ - case 'P': /* Python-style named subpattern handling */ - if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */ + case CHAR_P: /* Python-style named subpattern handling */ + if (*(++ptr) == CHAR_EQUALS_SIGN || + *ptr == CHAR_GREATER_THAN_SIGN) /* Reference or recursion */ { - is_recurse = *ptr == '>'; - terminator = ')'; + is_recurse = *ptr == CHAR_GREATER_THAN_SIGN; + terminator = CHAR_RIGHT_PARENTHESIS; goto NAMED_REF_OR_RECURSE; } - else if (*ptr != '<') /* Test for Python-style definition */ + else if (*ptr != CHAR_LESS_THAN_SIGN) /* Test for Python-style defn */ { *errorcodeptr = ERR41; goto FAILED; @@ -4440,9 +4651,10 @@ /* ------------------------------------------------------------ */ DEFINE_NAME: /* Come here from (?< handling */ - case '\'': + case CHAR_APOSTROPHE: { - terminator = (*ptr == '<')? '>' : '\''; + terminator = (*ptr == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; name = ++ptr; while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; @@ -4516,8 +4728,8 @@ /* ------------------------------------------------------------ */ - case '&': /* Perl recursion/subroutine syntax */ - terminator = ')'; + case CHAR_AMPERSAND: /* Perl recursion/subroutine syntax */ + terminator = CHAR_RIGHT_PARENTHESIS; is_recurse = TRUE; /* Fall through */ @@ -4525,7 +4737,7 @@ references (?P=name) and recursion (?P>name), as well as falling through from the Perl recursion syntax (?&name). We also come here from the Perl \k or \k'name' back reference syntax and the \k{name} - .NET syntax. */ + .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */ NAMED_REF_OR_RECURSE: name = ++ptr; @@ -4541,7 +4753,7 @@ { *errorcodeptr = ERR62; goto FAILED; - } + } if (*ptr != terminator) { *errorcodeptr = ERR42; @@ -4555,8 +4767,8 @@ recno = 0; } - /* In the real compile, seek the name in the table. We check the name - first, and then check that we have reached the end of the name in the + /* In the real compile, seek the name in the table. We check the name + first, and then check that we have reached the end of the name in the table. That way, if the name that is longer than any in the table, the comparison will fail without reading beyond the table entry. */ @@ -4566,7 +4778,7 @@ for (i = 0; i < cd->names_found; i++) { if (strncmp((char *)name, (char *)slot+2, namelen) == 0 && - slot[2+namelen] == 0) + slot[2+namelen] == 0) break; slot += cd->name_entry_size; } @@ -4576,7 +4788,7 @@ recno = GET2(slot, 0); } else if ((recno = /* Forward back reference */ - find_parens(ptr, cd->bracount, name, namelen, + find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) <= 0) { *errorcodeptr = ERR15; @@ -4592,28 +4804,37 @@ /* ------------------------------------------------------------ */ - case 'R': /* Recursion */ + case CHAR_R: /* Recursion */ ptr++; /* Same as (?0) */ /* Fall through */ /* ------------------------------------------------------------ */ - case '-': case '+': - case '0': case '1': case '2': case '3': case '4': /* Recursion or */ - case '5': case '6': case '7': case '8': case '9': /* subroutine */ + case CHAR_MINUS: case CHAR_PLUS: /* Recursion or subroutine */ + case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: + case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: { const uschar *called; + terminator = CHAR_RIGHT_PARENTHESIS; - if ((refsign = *ptr) == '+') + /* Come here from the \g<...> and \g'...' code (Oniguruma + compatibility). However, the syntax has been checked to ensure that + the ... are a (signed) number, so that neither ERR63 nor ERR29 will + be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY + ever be taken. */ + + HANDLE_NUMERICAL_RECURSION: + + if ((refsign = *ptr) == CHAR_PLUS) { ptr++; if ((digitab[*ptr] & ctype_digit) == 0) { *errorcodeptr = ERR63; goto FAILED; - } - } - else if (refsign == '-') + } + } + else if (refsign == CHAR_MINUS) { if ((digitab[ptr[1]] & ctype_digit) == 0) goto OTHER_CHAR_AFTER_QUERY; @@ -4622,15 +4843,15 @@ recno = 0; while((digitab[*ptr] & ctype_digit) != 0) - recno = recno * 10 + *ptr++ - '0'; + recno = recno * 10 + *ptr++ - CHAR_0; - if (*ptr != ')') + if (*ptr != terminator) { *errorcodeptr = ERR29; goto FAILED; } - if (refsign == '-') + if (refsign == CHAR_MINUS) { if (recno == 0) { @@ -4644,7 +4865,7 @@ goto FAILED; } } - else if (refsign == '+') + else if (refsign == CHAR_PLUS) { if (recno == 0) { @@ -4677,8 +4898,8 @@ if (called == NULL) { - if (find_parens(ptr, cd->bracount, NULL, recno, - (options & PCRE_EXTENDED) != 0) < 0) + if (find_parens(ptr, cd, NULL, recno, + (options & PCRE_EXTENDED) != 0) < 0) { *errorcodeptr = ERR15; goto FAILED; @@ -4730,23 +4951,23 @@ set = unset = 0; optset = &set; - while (*ptr != ')' && *ptr != ':') + while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON) { switch (*ptr++) { - case '-': optset = &unset; break; + case CHAR_MINUS: optset = &unset; break; - case 'J': /* Record that it changed in the external options */ + case CHAR_J: /* Record that it changed in the external options */ *optset |= PCRE_DUPNAMES; cd->external_flags |= PCRE_JCHANGED; break; - case 'i': *optset |= PCRE_CASELESS; break; - case 'm': *optset |= PCRE_MULTILINE; break; - case 's': *optset |= PCRE_DOTALL; break; - case 'x': *optset |= PCRE_EXTENDED; break; - case 'U': *optset |= PCRE_UNGREEDY; break; - case 'X': *optset |= PCRE_EXTRA; break; + case CHAR_i: *optset |= PCRE_CASELESS; break; + case CHAR_m: *optset |= PCRE_MULTILINE; break; + case CHAR_s: *optset |= PCRE_DOTALL; break; + case CHAR_x: *optset |= PCRE_EXTENDED; break; + case CHAR_U: *optset |= PCRE_UNGREEDY; break; + case CHAR_X: *optset |= PCRE_EXTRA; break; default: *errorcodeptr = ERR12; ptr--; /* Correct the offset */ @@ -4777,18 +4998,15 @@ both phases. If we are not at the pattern start, compile code to change the ims - options if this setting actually changes any of them. We also pass the - new setting back so that it can be put at the start of any following - branches, and when this group ends (if we are in a group), a resetting - item can be compiled. */ + options if this setting actually changes any of them, and reset the + greedy defaults and the case value for firstbyte and reqbyte. */ - if (*ptr == ')') + if (*ptr == CHAR_RIGHT_PARENTHESIS) { if (code == cd->start_code + 1 + LINK_SIZE && (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE)) { cd->external_options = newoptions; - options = newoptions; } else { @@ -4797,17 +5015,17 @@ *code++ = OP_OPT; *code++ = newoptions & PCRE_IMS; } - - /* Change options at this level, and pass them back for use - in subsequent branches. Reset the greedy defaults and the case - value for firstbyte and reqbyte. */ - - *optionsptr = options = newoptions; greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); greedy_non_default = greedy_default ^ 1; - req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; + req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; } + /* Change options at this level, and pass them back for use + in subsequent branches. When not at the start of the pattern, this + information is also necessary so that a resetting item can be + compiled at the end of a group (if we are in a group). */ + + *optionsptr = options = newoptions; previous = NULL; /* This item can't be repeated */ continue; /* It is complete */ } @@ -4923,7 +5141,7 @@ /* Error if hit end of pattern */ - if (*ptr != ')') + if (*ptr != CHAR_RIGHT_PARENTHESIS) { *errorcodeptr = ERR14; goto FAILED; @@ -5021,7 +5239,7 @@ We can test for values between ESC_b and ESC_Z for the latter; this may have to change if any new ones are ever created. */ - case '\\': + case CHAR_BACKSLASH: tempptr = ptr; c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); if (*errorcodeptr != 0) goto FAILED; @@ -5030,8 +5248,9 @@ { if (-c == ESC_Q) /* Handle start of quoted string */ { - if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */ - else inescq = TRUE; + if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) + ptr += 2; /* avoid empty string */ + else inescq = TRUE; continue; } @@ -5048,13 +5267,75 @@ zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; + /* \g or \g'name' is a subroutine call by name and \g or \g'n' + is a subroutine call by number (Oniguruma syntax). In fact, the value + -ESC_g is returned only for these cases. So we don't need to check for < + or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is + -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as + that is a synonym for a named back reference). */ + + if (-c == ESC_g) + { + const uschar *p; + save_hwm = cd->hwm; /* Normally this is set when '(' is read */ + terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE; + + /* These two statements stop the compiler for warning about possibly + unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In + fact, because we actually check for a number below, the paths that + would actually be in error are never taken. */ + + skipbytes = 0; + reset_bracount = FALSE; + + /* Test for a name */ + + if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS) + { + BOOL isnumber = TRUE; + for (p = ptr + 1; *p != 0 && *p != terminator; p++) + { + if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE; + if ((cd->ctypes[*p] & ctype_word) == 0) break; + } + if (*p != terminator) + { + *errorcodeptr = ERR57; + break; + } + if (isnumber) + { + ptr++; + goto HANDLE_NUMERICAL_RECURSION; + } + is_recurse = TRUE; + goto NAMED_REF_OR_RECURSE; + } + + /* Test a signed number in angle brackets or quotes. */ + + p = ptr + 2; + while ((digitab[*p] & ctype_digit) != 0) p++; + if (*p != terminator) + { + *errorcodeptr = ERR57; + break; + } + ptr++; + goto HANDLE_NUMERICAL_RECURSION; + } + /* \k or \k'name' is a back reference by name (Perl syntax). We also support \k{name} (.NET syntax) */ - if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{')) + if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN || + ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET)) { is_recurse = FALSE; - terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}'; + terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)? + CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)? + CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET; goto NAMED_REF_OR_RECURSE; } @@ -5157,7 +5438,7 @@ /* Remember if \r or \n were seen */ - if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n') + if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; /* Set the first and required bytes appropriately. If no previous first @@ -5402,7 +5683,7 @@ compile a resetting op-code following, except at the very end of the pattern. Return leaving the pointer at the terminating char. */ - if (*ptr != '|') + if (*ptr != CHAR_VERTICAL_LINE) { if (lengthptr == NULL) { @@ -5425,7 +5706,7 @@ /* Resetting option if needed */ - if ((options & PCRE_IMS) != oldims && *ptr == ')') + if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS) { *code++ = OP_OPT; *code++ = oldims; @@ -5554,14 +5835,14 @@ if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; } - /* .* is not anchored unless DOTALL is set and it isn't in brackets that - are or may be referenced. */ + /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and + it isn't in brackets that are or may be referenced. */ else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || - op == OP_TYPEPOSSTAR) && - (*options & PCRE_DOTALL) != 0) + op == OP_TYPEPOSSTAR)) { - if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; + if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) + return FALSE; } /* Check for explicit anchoring */ @@ -5607,6 +5888,32 @@ NULL, 0, FALSE); register int op = *scode; + /* If we are at the start of a conditional assertion group, *both* the + conditional assertion *and* what follows the condition must satisfy the test + for start of line. Other kinds of condition fail. Note that there may be an + auto-callout at the start of a condition. */ + + if (op == OP_COND) + { + scode += 1 + LINK_SIZE; + if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT]; + switch (*scode) + { + case OP_CREF: + case OP_RREF: + case OP_DEF: + return FALSE; + + default: /* Assertion */ + if (!is_startline(scode, bracket_map, backref_map)) return FALSE; + do scode += GET(scode, 1); while (*scode == OP_ALT); + scode += 1 + LINK_SIZE; + break; + } + scode = first_significant_code(scode, NULL, 0, FALSE); + op = *scode; + } + /* Non-capturing brackets */ if (op == OP_BRA) @@ -5625,8 +5932,10 @@ /* Other brackets */ - else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND) - { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } + else if (op == OP_ASSERT || op == OP_ONCE) + { + if (!is_startline(scode, bracket_map, backref_map)) return FALSE; + } /* .* means "start at start or after \n" if it isn't in brackets that may be referenced. */ @@ -5743,7 +6052,7 @@ with errorptr and erroroffset set */ -PCRE_EXP_DEFN pcre * +PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION pcre_compile(const char *pattern, int options, const char **errorptr, int *erroroffset, const unsigned char *tables) { @@ -5751,7 +6060,7 @@ } -PCRE_EXP_DEFN pcre * +PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION pcre_compile2(const char *pattern, int options, int *errorcodeptr, const char **errorptr, int *erroroffset, const unsigned char *tables) { @@ -5778,7 +6087,6 @@ uschar cworkspace[COMPILE_WORK_SIZE]; - /* Set this early so that early errors get offset 0. */ ptr = (const uschar *)pattern; @@ -5824,7 +6132,7 @@ } #endif -if ((options & ~PUBLIC_OPTIONS) != 0) +if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0) { errorcode = ERR17; goto PCRE_EARLY_ERROR_RETURN; @@ -5841,25 +6149,26 @@ /* Check for global one-time settings at the start of the pattern, and remember the offset for later. */ -while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*') +while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS && + ptr[skipatstart+1] == CHAR_ASTERISK) { int newnl = 0; int newbsr = 0; - if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0) + if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } - else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5) == 0) { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } - else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0) { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } - else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0) { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } - else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0) { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } - else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0) + else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0) { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } if (newnl != 0) @@ -5887,10 +6196,10 @@ switch (options & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; /* Build-time default */ - case PCRE_NEWLINE_CR: newline = '\r'; break; - case PCRE_NEWLINE_LF: newline = '\n'; break; + case PCRE_NEWLINE_CR: newline = CHAR_CR; break; + case PCRE_NEWLINE_LF: newline = CHAR_NL; break; case PCRE_NEWLINE_CR+ - PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; + PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;