42 |
supporting internal functions that are not used by other modules. */ |
supporting internal functions that are not used by other modules. */ |
43 |
|
|
44 |
|
|
45 |
|
#define NLBLOCK cd /* The block containing newline information */ |
46 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
47 |
|
|
48 |
|
|
191 |
"unrecognized character after (?<", |
"unrecognized character after (?<", |
192 |
/* 25 */ |
/* 25 */ |
193 |
"lookbehind assertion is not fixed length", |
"lookbehind assertion is not fixed length", |
194 |
"malformed number after (?(", |
"malformed number or name after (?(", |
195 |
"conditional group contains more than two branches", |
"conditional group contains more than two branches", |
196 |
"assertion expected after (?(", |
"assertion expected after (?(", |
197 |
"(?R or (?digits must be followed by )", |
"(?R or (?digits must be followed by )", |
211 |
"recursive call could loop indefinitely", |
"recursive call could loop indefinitely", |
212 |
"unrecognized character after (?P", |
"unrecognized character after (?P", |
213 |
"syntax error after (?P", |
"syntax error after (?P", |
214 |
"two named groups have the same name", |
"two named subpatterns have the same name", |
215 |
"invalid UTF-8 string", |
"invalid UTF-8 string", |
216 |
/* 45 */ |
/* 45 */ |
217 |
"support for \\P, \\p, and \\X has not been compiled", |
"support for \\P, \\p, and \\X has not been compiled", |
218 |
"malformed \\P or \\p sequence", |
"malformed \\P or \\p sequence", |
219 |
"unknown property name after \\P or \\p" |
"unknown property name after \\P or \\p", |
220 |
|
"subpattern name is too long (maximum 32 characters)", |
221 |
|
"too many named subpatterns (maximum 10,000)", |
222 |
|
/* 50 */ |
223 |
|
"repeated subpattern is too long", |
224 |
|
"octal value is greater than \\377 (not in UTF-8 mode)" |
225 |
}; |
}; |
226 |
|
|
227 |
|
|
466 |
} |
} |
467 |
|
|
468 |
/* \0 always starts an octal number, but we may drop through to here with a |
/* \0 always starts an octal number, but we may drop through to here with a |
469 |
larger first octal digit. */ |
larger first octal digit. The original code used just to take the least |
470 |
|
significant 8 bits of octal numbers (I think this is what early Perls used |
471 |
|
to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more |
472 |
|
than 3 octal digits. */ |
473 |
|
|
474 |
case '0': |
case '0': |
475 |
c -= '0'; |
c -= '0'; |
476 |
while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') |
while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') |
477 |
c = c * 8 + *(++ptr) - '0'; |
c = c * 8 + *(++ptr) - '0'; |
478 |
c &= 255; /* Take least significant 8 bits */ |
if (!utf8 && c > 255) *errorcodeptr = ERR51; |
479 |
break; |
break; |
480 |
|
|
481 |
/* \x is complicated. \x{ddd} is a character number which can be greater |
/* \x is complicated. \x{ddd} is a character number which can be greater |
772 |
|
|
773 |
|
|
774 |
/************************************************* |
/************************************************* |
775 |
|
* Find forward referenced named subpattern * |
776 |
|
*************************************************/ |
777 |
|
|
778 |
|
/* This function scans along a pattern looking for capturing subpatterns, and |
779 |
|
counting them. If it finds a named pattern that matches the name it is given, |
780 |
|
it returns its number. This is used for forward references to named |
781 |
|
subpatterns. We know that if (?P< is encountered, the name will be terminated |
782 |
|
by '>' because that is checked in the first pass. |
783 |
|
|
784 |
|
Arguments: |
785 |
|
pointer current position in the pattern |
786 |
|
count current count of capturing parens |
787 |
|
name name to seek |
788 |
|
namelen name length |
789 |
|
|
790 |
|
Returns: the number of the named subpattern, or -1 if not found |
791 |
|
*/ |
792 |
|
|
793 |
|
static int |
794 |
|
find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen) |
795 |
|
{ |
796 |
|
const uschar *thisname; |
797 |
|
for (; *ptr != 0; ptr++) |
798 |
|
{ |
799 |
|
if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; } |
800 |
|
if (*ptr != '(') continue; |
801 |
|
if (ptr[1] != '?') { count++; continue; } |
802 |
|
if (ptr[2] == '(') { ptr += 2; continue; } |
803 |
|
if (ptr[2] != 'P' || ptr[3] != '<') continue; |
804 |
|
count++; |
805 |
|
ptr += 4; |
806 |
|
thisname = ptr; |
807 |
|
while (*ptr != '>') ptr++; |
808 |
|
if (namelen == ptr - thisname && strncmp(name, thisname, namelen) == 0) |
809 |
|
return count; |
810 |
|
} |
811 |
|
return -1; |
812 |
|
} |
813 |
|
|
814 |
|
|
815 |
|
|
816 |
|
/************************************************* |
817 |
* Find first significant op code * |
* Find first significant op code * |
818 |
*************************************************/ |
*************************************************/ |
819 |
|
|
968 |
|
|
969 |
case OP_CHAR: |
case OP_CHAR: |
970 |
case OP_CHARNC: |
case OP_CHARNC: |
971 |
|
case OP_NOT: |
972 |
branchlength++; |
branchlength++; |
973 |
cc += 2; |
cc += 2; |
974 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1083 |
static const uschar * |
static const uschar * |
1084 |
find_bracket(const uschar *code, BOOL utf8, int number) |
find_bracket(const uschar *code, BOOL utf8, int number) |
1085 |
{ |
{ |
|
#ifndef SUPPORT_UTF8 |
|
|
utf8 = utf8; /* Stop pedantic compilers complaining */ |
|
|
#endif |
|
|
|
|
1086 |
for (;;) |
for (;;) |
1087 |
{ |
{ |
1088 |
register int c = *code; |
register int c = *code; |
1089 |
if (c == OP_END) return NULL; |
if (c == OP_END) return NULL; |
1090 |
|
|
1091 |
|
/* XCLASS is used for classes that cannot be represented just by a bit |
1092 |
|
map. This includes negated single high-valued characters. The length in |
1093 |
|
the table is zero; the actual length is stored in the compiled code. */ |
1094 |
|
|
1095 |
|
if (c == OP_XCLASS) code += GET(code, 1); |
1096 |
|
|
1097 |
|
/* Handle bracketed group */ |
1098 |
|
|
1099 |
else if (c > OP_BRA) |
else if (c > OP_BRA) |
1100 |
{ |
{ |
1101 |
int n = c - OP_BRA; |
int n = c - OP_BRA; |
1103 |
if (n == number) return (uschar *)code; |
if (n == number) return (uschar *)code; |
1104 |
code += _pcre_OP_lengths[OP_BRA]; |
code += _pcre_OP_lengths[OP_BRA]; |
1105 |
} |
} |
1106 |
|
|
1107 |
|
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes |
1108 |
|
that are followed by a character may be followed by a multi-byte character. |
1109 |
|
The length in the table is a minimum, so we have to scan along to skip the |
1110 |
|
extra bytes. All opcodes are less than 128, so we can use relatively |
1111 |
|
efficient code. */ |
1112 |
|
|
1113 |
else |
else |
1114 |
{ |
{ |
1115 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
|
|
|
|
#ifdef SUPPORT_UTF8 |
|
|
|
|
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed |
|
|
by a multi-byte character. The length in the table is a minimum, so we have |
|
|
to scan along to skip the extra bytes. All opcodes are less than 128, so we |
|
|
can use relatively efficient code. */ |
|
|
|
|
1116 |
if (utf8) switch(c) |
if (utf8) switch(c) |
1117 |
{ |
{ |
1118 |
case OP_CHAR: |
case OP_CHAR: |
1128 |
case OP_MINQUERY: |
case OP_MINQUERY: |
1129 |
while ((*code & 0xc0) == 0x80) code++; |
while ((*code & 0xc0) == 0x80) code++; |
1130 |
break; |
break; |
|
|
|
|
/* XCLASS is used for classes that cannot be represented just by a bit |
|
|
map. This includes negated single high-valued characters. The length in |
|
|
the table is zero; the actual length is stored in the compiled code. */ |
|
|
|
|
|
case OP_XCLASS: |
|
|
code += GET(code, 1) + 1; |
|
|
break; |
|
1131 |
} |
} |
|
#endif |
|
1132 |
} |
} |
1133 |
} |
} |
1134 |
} |
} |
1152 |
static const uschar * |
static const uschar * |
1153 |
find_recurse(const uschar *code, BOOL utf8) |
find_recurse(const uschar *code, BOOL utf8) |
1154 |
{ |
{ |
|
#ifndef SUPPORT_UTF8 |
|
|
utf8 = utf8; /* Stop pedantic compilers complaining */ |
|
|
#endif |
|
|
|
|
1155 |
for (;;) |
for (;;) |
1156 |
{ |
{ |
1157 |
register int c = *code; |
register int c = *code; |
1158 |
if (c == OP_END) return NULL; |
if (c == OP_END) return NULL; |
1159 |
else if (c == OP_RECURSE) return code; |
if (c == OP_RECURSE) return code; |
1160 |
|
|
1161 |
|
/* XCLASS is used for classes that cannot be represented just by a bit |
1162 |
|
map. This includes negated single high-valued characters. The length in |
1163 |
|
the table is zero; the actual length is stored in the compiled code. */ |
1164 |
|
|
1165 |
|
if (c == OP_XCLASS) code += GET(code, 1); |
1166 |
|
|
1167 |
|
/* All bracketed groups have the same length. */ |
1168 |
|
|
1169 |
else if (c > OP_BRA) |
else if (c > OP_BRA) |
1170 |
{ |
{ |
1171 |
code += _pcre_OP_lengths[OP_BRA]; |
code += _pcre_OP_lengths[OP_BRA]; |
1172 |
} |
} |
1173 |
|
|
1174 |
|
/* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes |
1175 |
|
that are followed by a character may be followed by a multi-byte character. |
1176 |
|
The length in the table is a minimum, so we have to scan along to skip the |
1177 |
|
extra bytes. All opcodes are less than 128, so we can use relatively |
1178 |
|
efficient code. */ |
1179 |
|
|
1180 |
else |
else |
1181 |
{ |
{ |
1182 |
code += _pcre_OP_lengths[c]; |
code += _pcre_OP_lengths[c]; |
|
|
|
|
#ifdef SUPPORT_UTF8 |
|
|
|
|
|
/* In UTF-8 mode, opcodes that are followed by a character may be followed |
|
|
by a multi-byte character. The length in the table is a minimum, so we have |
|
|
to scan along to skip the extra bytes. All opcodes are less than 128, so we |
|
|
can use relatively efficient code. */ |
|
|
|
|
1183 |
if (utf8) switch(c) |
if (utf8) switch(c) |
1184 |
{ |
{ |
1185 |
case OP_CHAR: |
case OP_CHAR: |
1195 |
case OP_MINQUERY: |
case OP_MINQUERY: |
1196 |
while ((*code & 0xc0) == 0x80) code++; |
while ((*code & 0xc0) == 0x80) code++; |
1197 |
break; |
break; |
|
|
|
|
/* XCLASS is used for classes that cannot be represented just by a bit |
|
|
map. This includes negated single high-valued characters. The length in |
|
|
the table is zero; the actual length is stored in the compiled code. */ |
|
|
|
|
|
case OP_XCLASS: |
|
|
code += GET(code, 1) + 1; |
|
|
break; |
|
1198 |
} |
} |
|
#endif |
|
1199 |
} |
} |
1200 |
} |
} |
1201 |
} |
} |
1611 |
int firstbyte, reqbyte; |
int firstbyte, reqbyte; |
1612 |
int zeroreqbyte, zerofirstbyte; |
int zeroreqbyte, zerofirstbyte; |
1613 |
int req_caseopt, reqvary, tempreqvary; |
int req_caseopt, reqvary, tempreqvary; |
|
int condcount = 0; |
|
1614 |
int options = *optionsptr; |
int options = *optionsptr; |
1615 |
int after_manual_callout = 0; |
int after_manual_callout = 0; |
1616 |
register int c; |
register int c; |
1724 |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
1725 |
if (c == '#') |
if (c == '#') |
1726 |
{ |
{ |
1727 |
/* The space before the ; is to avoid a warning on a silly compiler |
while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; |
1728 |
on the Macintosh. */ |
if (*ptr != 0) |
1729 |
while ((c = *(++ptr)) != 0 && c != NEWLINE) ; |
{ |
1730 |
if (c != 0) continue; /* Else fall through to handle end of string */ |
ptr += cd->nllen - 1; |
1731 |
|
continue; |
1732 |
|
} |
1733 |
|
/* Else fall through to handle end of string */ |
1734 |
|
c = 0; |
1735 |
} |
} |
1736 |
} |
} |
1737 |
|
|
2896 |
case '(': |
case '(': |
2897 |
bravalue = OP_COND; /* Conditional group */ |
bravalue = OP_COND; /* Conditional group */ |
2898 |
|
|
2899 |
/* Condition to test for recursion */ |
/* A condition can be a number, referring to a numbered group, a name, |
2900 |
|
referring to a named group, 'R', referring to recursion, or an |
2901 |
|
assertion. There are two unfortunate ambiguities, caused by history. |
2902 |
|
(a) 'R' can be the recursive thing or the name 'R', and (b) a number |
2903 |
|
could be a name that consists of digits. In both cases, we look for a |
2904 |
|
name first; if not found, we try the other cases. If the first |
2905 |
|
character after (?( is a word character, we know the rest up to ) will |
2906 |
|
also be word characters because the syntax was checked in the first |
2907 |
|
pass. */ |
2908 |
|
|
2909 |
if (ptr[1] == 'R') |
if ((cd->ctypes[ptr[1]] & ctype_word) != 0) |
2910 |
{ |
{ |
2911 |
code[1+LINK_SIZE] = OP_CREF; |
int i, namelen; |
2912 |
PUT2(code, 2+LINK_SIZE, CREF_RECURSE); |
int condref = 0; |
2913 |
|
const uschar *name; |
2914 |
|
uschar *slot = cd->name_table; |
2915 |
|
|
2916 |
|
/* This is needed for all successful cases. */ |
2917 |
|
|
2918 |
skipbytes = 3; |
skipbytes = 3; |
|
ptr += 3; |
|
|
} |
|
2919 |
|
|
2920 |
/* Condition to test for a numbered subpattern match. We know that |
/* Read the name, but also get it as a number if it's all digits */ |
|
if a digit follows ( then there will just be digits until ) because |
|
|
the syntax was checked in the first pass. */ |
|
2921 |
|
|
2922 |
else if ((digitab[ptr[1]] && ctype_digit) != 0) |
name = ++ptr; |
2923 |
{ |
while (*ptr != ')') |
|
int condref; /* Don't amalgamate; some compilers */ |
|
|
condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */ |
|
|
while (*(++ptr) != ')') condref = condref*10 + *ptr - '0'; |
|
|
if (condref == 0) |
|
2924 |
{ |
{ |
2925 |
*errorcodeptr = ERR35; |
if (condref >= 0) |
2926 |
goto FAILED; |
condref = ((digitab[*ptr] & ctype_digit) != 0)? |
2927 |
|
condref * 10 + *ptr - '0' : -1; |
2928 |
|
ptr++; |
2929 |
} |
} |
2930 |
|
namelen = ptr - name; |
2931 |
ptr++; |
ptr++; |
2932 |
code[1+LINK_SIZE] = OP_CREF; |
|
2933 |
PUT2(code, 2+LINK_SIZE, condref); |
for (i = 0; i < cd->names_found; i++) |
2934 |
skipbytes = 3; |
{ |
2935 |
|
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; |
2936 |
|
slot += cd->name_entry_size; |
2937 |
|
} |
2938 |
|
|
2939 |
|
/* Found a previous named subpattern */ |
2940 |
|
|
2941 |
|
if (i < cd->names_found) |
2942 |
|
{ |
2943 |
|
condref = GET2(slot, 0); |
2944 |
|
code[1+LINK_SIZE] = OP_CREF; |
2945 |
|
PUT2(code, 2+LINK_SIZE, condref); |
2946 |
|
} |
2947 |
|
|
2948 |
|
/* Search the pattern for a forward reference */ |
2949 |
|
|
2950 |
|
else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0) |
2951 |
|
{ |
2952 |
|
code[1+LINK_SIZE] = OP_CREF; |
2953 |
|
PUT2(code, 2+LINK_SIZE, i); |
2954 |
|
} |
2955 |
|
|
2956 |
|
/* Check for 'R' for recursion */ |
2957 |
|
|
2958 |
|
else if (namelen == 1 && *name == 'R') |
2959 |
|
{ |
2960 |
|
code[1+LINK_SIZE] = OP_CREF; |
2961 |
|
PUT2(code, 2+LINK_SIZE, CREF_RECURSE); |
2962 |
|
} |
2963 |
|
|
2964 |
|
/* Check for a subpattern number */ |
2965 |
|
|
2966 |
|
else if (condref > 0) |
2967 |
|
{ |
2968 |
|
code[1+LINK_SIZE] = OP_CREF; |
2969 |
|
PUT2(code, 2+LINK_SIZE, condref); |
2970 |
|
} |
2971 |
|
|
2972 |
|
/* Either an unidentified subpattern, or a reference to (?(0) */ |
2973 |
|
|
2974 |
|
else |
2975 |
|
{ |
2976 |
|
*errorcodeptr = (condref == 0)? ERR35: ERR15; |
2977 |
|
goto FAILED; |
2978 |
|
} |
2979 |
} |
} |
2980 |
|
|
2981 |
/* For conditions that are assertions, we just fall through, having |
/* For conditions that are assertions, we just fall through, having |
2982 |
set bravalue above. */ |
set bravalue above. */ |
2983 |
|
|
2984 |
break; |
break; |
2985 |
|
|
2986 |
case '=': /* Positive lookahead */ |
case '=': /* Positive lookahead */ |
3052 |
{ |
{ |
3053 |
if (slot[2+namelen] == 0) |
if (slot[2+namelen] == 0) |
3054 |
{ |
{ |
3055 |
*errorcodeptr = ERR43; |
if ((options & PCRE_DUPNAMES) == 0) |
3056 |
goto FAILED; |
{ |
3057 |
|
*errorcodeptr = ERR43; |
3058 |
|
goto FAILED; |
3059 |
|
} |
3060 |
} |
} |
3061 |
crc = -1; /* Current name is substring */ |
else crc = -1; /* Current name is substring */ |
3062 |
} |
} |
3063 |
if (crc < 0) |
if (crc < 0) |
3064 |
{ |
{ |
3091 |
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; |
if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break; |
3092 |
slot += cd->name_entry_size; |
slot += cd->name_entry_size; |
3093 |
} |
} |
3094 |
if (i >= cd->names_found) |
|
3095 |
|
if (i < cd->names_found) /* Back reference */ |
3096 |
|
{ |
3097 |
|
recno = GET2(slot, 0); |
3098 |
|
} |
3099 |
|
else if ((recno = /* Forward back reference */ |
3100 |
|
find_named_parens(ptr, *brackets, name, namelen)) <= 0) |
3101 |
{ |
{ |
3102 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
3103 |
goto FAILED; |
goto FAILED; |
3104 |
} |
} |
3105 |
|
|
|
recno = GET2(slot, 0); |
|
|
|
|
3106 |
if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ |
if (type == '>') goto HANDLE_RECURSION; /* A few lines below */ |
3107 |
|
|
3108 |
/* Back reference */ |
/* Back reference */ |
3142 |
regex in case it doesn't exist. */ |
regex in case it doesn't exist. */ |
3143 |
|
|
3144 |
*code = OP_END; |
*code = OP_END; |
3145 |
called = (recno == 0)? |
called = (recno == 0)? cd->start_code : |
3146 |
cd->start_code : find_bracket(cd->start_code, utf8, recno); |
find_bracket(cd->start_code, utf8, recno); |
|
|
|
3147 |
if (called == NULL) |
if (called == NULL) |
3148 |
{ |
{ |
3149 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
3190 |
case '-': optset = &unset; break; |
case '-': optset = &unset; break; |
3191 |
|
|
3192 |
case 'i': *optset |= PCRE_CASELESS; break; |
case 'i': *optset |= PCRE_CASELESS; break; |
3193 |
|
case 'J': *optset |= PCRE_DUPNAMES; break; |
3194 |
case 'm': *optset |= PCRE_MULTILINE; break; |
case 'm': *optset |= PCRE_MULTILINE; break; |
3195 |
case 's': *optset |= PCRE_DOTALL; break; |
case 's': *optset |= PCRE_DOTALL; break; |
3196 |
case 'x': *optset |= PCRE_EXTENDED; break; |
case 'x': *optset |= PCRE_EXTENDED; break; |
3307 |
else if (bravalue == OP_COND) |
else if (bravalue == OP_COND) |
3308 |
{ |
{ |
3309 |
uschar *tc = code; |
uschar *tc = code; |
3310 |
condcount = 0; |
int condcount = 0; |
3311 |
|
|
3312 |
do { |
do { |
3313 |
condcount++; |
condcount++; |
4012 |
} |
} |
4013 |
|
|
4014 |
|
|
4015 |
|
|
4016 |
PCRE_DATA_SCOPE pcre * |
PCRE_DATA_SCOPE pcre * |
4017 |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
4018 |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
4019 |
{ |
{ |
4020 |
real_pcre *re; |
real_pcre *re; |
4021 |
int length = 1 + LINK_SIZE; /* For initial BRA plus length */ |
int length = 1 + LINK_SIZE; /* For initial BRA plus length */ |
4022 |
int c, firstbyte, reqbyte; |
int c, firstbyte, reqbyte, newline; |
4023 |
int bracount = 0; |
int bracount = 0; |
4024 |
int branch_extra = 0; |
int branch_extra = 0; |
4025 |
int branch_newextra; |
int branch_newextra; |
4040 |
const uschar *codestart; |
const uschar *codestart; |
4041 |
const uschar *ptr; |
const uschar *ptr; |
4042 |
compile_data compile_block; |
compile_data compile_block; |
4043 |
|
compile_data *cd = &compile_block; |
4044 |
int brastack[BRASTACK_SIZE]; |
int brastack[BRASTACK_SIZE]; |
4045 |
uschar bralenstack[BRASTACK_SIZE]; |
uschar bralenstack[BRASTACK_SIZE]; |
4046 |
|
|
4094 |
/* Set up pointers to the individual character tables */ |
/* Set up pointers to the individual character tables */ |
4095 |
|
|
4096 |
if (tables == NULL) tables = _pcre_default_tables; |
if (tables == NULL) tables = _pcre_default_tables; |
4097 |
compile_block.lcc = tables + lcc_offset; |
cd->lcc = tables + lcc_offset; |
4098 |
compile_block.fcc = tables + fcc_offset; |
cd->fcc = tables + fcc_offset; |
4099 |
compile_block.cbits = tables + cbits_offset; |
cd->cbits = tables + cbits_offset; |
4100 |
compile_block.ctypes = tables + ctypes_offset; |
cd->ctypes = tables + ctypes_offset; |
4101 |
|
|
4102 |
|
/* Handle different types of newline. The two bits give four cases. The current |
4103 |
|
code allows for one- or two-byte sequences. */ |
4104 |
|
|
4105 |
|
switch (options & PCRE_NEWLINE_CRLF) |
4106 |
|
{ |
4107 |
|
default: newline = NEWLINE; break; /* Compile-time default */ |
4108 |
|
case PCRE_NEWLINE_CR: newline = '\r'; break; |
4109 |
|
case PCRE_NEWLINE_LF: newline = '\n'; break; |
4110 |
|
case PCRE_NEWLINE_CR+ |
4111 |
|
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
4112 |
|
} |
4113 |
|
|
4114 |
|
if (newline > 255) |
4115 |
|
{ |
4116 |
|
cd->nllen = 2; |
4117 |
|
cd->nl[0] = (newline >> 8) & 255; |
4118 |
|
cd->nl[1] = newline & 255; |
4119 |
|
} |
4120 |
|
else |
4121 |
|
{ |
4122 |
|
cd->nllen = 1; |
4123 |
|
cd->nl[0] = newline; |
4124 |
|
} |
4125 |
|
|
4126 |
/* Maximum back reference and backref bitmap. This is updated for numeric |
/* Maximum back reference and backref bitmap. This is updated for numeric |
4127 |
references during the first pass, but for named references during the actual |
references during the first pass, but for named references during the actual |
4128 |
compile pass. The bitmap records up to 31 back references to help in deciding |
compile pass. The bitmap records up to 31 back references to help in deciding |
4129 |
whether (.*) can be treated as anchored or not. */ |
whether (.*) can be treated as anchored or not. */ |
4130 |
|
|
4131 |
compile_block.top_backref = 0; |
cd->top_backref = 0; |
4132 |
compile_block.backref_map = 0; |
cd->backref_map = 0; |
4133 |
|
|
4134 |
/* Reflect pattern for debugging output */ |
/* Reflect pattern for debugging output */ |
4135 |
|
|
4163 |
|
|
4164 |
if ((options & PCRE_EXTENDED) != 0) |
if ((options & PCRE_EXTENDED) != 0) |
4165 |
{ |
{ |
4166 |
if ((compile_block.ctypes[c] & ctype_space) != 0) continue; |
if ((cd->ctypes[c] & ctype_space) != 0) continue; |
4167 |
if (c == '#') |
if (c == '#') |
4168 |
{ |
{ |
4169 |
/* The space before the ; is to avoid a warning on a silly compiler |
while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break; |
4170 |
on the Macintosh. */ |
if (*ptr != 0) |
4171 |
while ((c = *(++ptr)) != 0 && c != NEWLINE) ; |
{ |
4172 |
if (c == 0) break; |
ptr += cd->nllen - 1; |
4173 |
continue; |
continue; |
4174 |
|
} |
4175 |
|
break; /* End loop at end of pattern */ |
4176 |
} |
} |
4177 |
} |
} |
4178 |
|
|
4262 |
if (c <= -ESC_REF) |
if (c <= -ESC_REF) |
4263 |
{ |
{ |
4264 |
int refnum = -c - ESC_REF; |
int refnum = -c - ESC_REF; |
4265 |
compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1; |
cd->backref_map |= (refnum < 32)? (1 << refnum) : 1; |
4266 |
if (refnum > compile_block.top_backref) |
if (refnum > cd->top_backref) |
4267 |
compile_block.top_backref = refnum; |
cd->top_backref = refnum; |
4268 |
length += 2; /* For single back reference */ |
length += 2; /* For single back reference */ |
4269 |
if (ptr[1] == '{' && is_counted_repeat(ptr+2)) |
if (ptr[1] == '{' && is_counted_repeat(ptr+2)) |
4270 |
{ |
{ |
4418 |
/* Check the syntax for POSIX stuff. The bits we actually handle are |
/* Check the syntax for POSIX stuff. The bits we actually handle are |
4419 |
checked during the real compile phase. */ |
checked during the real compile phase. */ |
4420 |
|
|
4421 |
else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block)) |
else if (*ptr == '[' && |
4422 |
|
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && |
4423 |
|
check_posix_syntax(ptr, &ptr, cd)) |
4424 |
{ |
{ |
4425 |
ptr++; |
ptr++; |
4426 |
class_optcount = 10; /* Make sure > 1 */ |
class_optcount = 10; /* Make sure > 1 */ |
4653 |
ptr += 2; |
ptr += 2; |
4654 |
break; |
break; |
4655 |
|
|
4656 |
|
/* Named subpatterns are an extension copied from Python */ |
4657 |
|
|
4658 |
|
case 'P': |
4659 |
|
ptr += 3; |
4660 |
|
|
4661 |
|
/* Handle the definition of a named subpattern */ |
4662 |
|
|
4663 |
|
if (*ptr == '<') |
4664 |
|
{ |
4665 |
|
const uschar *p; /* Don't amalgamate; some compilers */ |
4666 |
|
p = ++ptr; /* grumble at autoincrement in declaration */ |
4667 |
|
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; |
4668 |
|
if (*ptr != '>') |
4669 |
|
{ |
4670 |
|
errorcode = ERR42; |
4671 |
|
goto PCRE_ERROR_RETURN; |
4672 |
|
} |
4673 |
|
name_count++; |
4674 |
|
if (name_count > MAX_NAME_COUNT) |
4675 |
|
{ |
4676 |
|
errorcode = ERR49; |
4677 |
|
goto PCRE_ERROR_RETURN; |
4678 |
|
} |
4679 |
|
if (ptr - p > max_name_size) |
4680 |
|
{ |
4681 |
|
max_name_size = (ptr - p); |
4682 |
|
if (max_name_size > MAX_NAME_SIZE) |
4683 |
|
{ |
4684 |
|
errorcode = ERR48; |
4685 |
|
goto PCRE_ERROR_RETURN; |
4686 |
|
} |
4687 |
|
} |
4688 |
|
capturing = TRUE; /* Named parentheses are always capturing */ |
4689 |
|
break; /* Go handle capturing parentheses */ |
4690 |
|
} |
4691 |
|
|
4692 |
|
/* Handle back references and recursive calls to named subpatterns */ |
4693 |
|
|
4694 |
|
if (*ptr == '=' || *ptr == '>') |
4695 |
|
{ |
4696 |
|
length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */ |
4697 |
|
while ((cd->ctypes[*(++ptr)] & ctype_word) != 0); |
4698 |
|
if (*ptr != ')') |
4699 |
|
{ |
4700 |
|
errorcode = ERR42; |
4701 |
|
goto PCRE_ERROR_RETURN; |
4702 |
|
} |
4703 |
|
goto RECURSE_CHECK_QUANTIFIED; |
4704 |
|
} |
4705 |
|
|
4706 |
|
/* Unknown character after (?P */ |
4707 |
|
|
4708 |
|
errorcode = ERR41; |
4709 |
|
goto PCRE_ERROR_RETURN; |
4710 |
|
|
4711 |
/* (?R) specifies a recursive call to the regex, which is an extension |
/* (?R) specifies a recursive call to the regex, which is an extension |
4712 |
to provide the facility which can be obtained by (?p{perl-code}) in |
to provide the facility which can be obtained by (?p{perl-code}) in |
4713 |
Perl 5.6. In Perl 5.8 this has become (??{perl-code}). |
Perl 5.6. In Perl 5.8 this has become (??{perl-code}). |
4733 |
|
|
4734 |
/* If this item is quantified, it will get wrapped inside brackets so |
/* If this item is quantified, it will get wrapped inside brackets so |
4735 |
as to use the code for quantified brackets. We jump down and use the |
as to use the code for quantified brackets. We jump down and use the |
4736 |
code that handles this for real brackets. */ |
code that handles this for real brackets. Come here from code for |
4737 |
|
named recursions/subroutines. */ |
4738 |
|
|
4739 |
|
RECURSE_CHECK_QUANTIFIED: |
4740 |
if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') |
if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{') |
4741 |
{ |
{ |
4742 |
length += 2 + 2 * LINK_SIZE; /* to make bracketed */ |
length += 2 + 2 * LINK_SIZE; /* to make bracketed */ |
4760 |
length += 2 + 2*LINK_SIZE; |
length += 2 + 2*LINK_SIZE; |
4761 |
continue; |
continue; |
4762 |
|
|
|
/* Named subpatterns are an extension copied from Python */ |
|
|
|
|
|
case 'P': |
|
|
ptr += 3; |
|
|
|
|
|
/* Handle the definition of a named subpattern */ |
|
|
|
|
|
if (*ptr == '<') |
|
|
{ |
|
|
const uschar *p; /* Don't amalgamate; some compilers */ |
|
|
p = ++ptr; /* grumble at autoincrement in declaration */ |
|
|
while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++; |
|
|
if (*ptr != '>') |
|
|
{ |
|
|
errorcode = ERR42; |
|
|
goto PCRE_ERROR_RETURN; |
|
|
} |
|
|
name_count++; |
|
|
if (ptr - p > max_name_size) max_name_size = (ptr - p); |
|
|
capturing = TRUE; /* Named parentheses are always capturing */ |
|
|
break; |
|
|
} |
|
|
|
|
|
/* Handle back references and recursive calls to named subpatterns */ |
|
|
|
|
|
if (*ptr == '=' || *ptr == '>') |
|
|
{ |
|
|
length += 2 + 2*LINK_SIZE; /* Allow for the automatic "once" */ |
|
|
while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0); |
|
|
if (*ptr != ')') |
|
|
{ |
|
|
errorcode = ERR42; |
|
|
goto PCRE_ERROR_RETURN; |
|
|
} |
|
|
break; |
|
|
} |
|
|
|
|
|
/* Unknown character after (?P */ |
|
|
|
|
|
errorcode = ERR41; |
|
|
goto PCRE_ERROR_RETURN; |
|
|
|
|
4763 |
/* Lookbehinds are in Perl from version 5.005 */ |
/* Lookbehinds are in Perl from version 5.005 */ |
4764 |
|
|
4765 |
case '<': |
case '<': |
4775 |
|
|
4776 |
/* Conditionals are in Perl from version 5.005. The bracket must either |
/* Conditionals are in Perl from version 5.005. The bracket must either |
4777 |
be followed by a number (for bracket reference) or by an assertion |
be followed by a number (for bracket reference) or by an assertion |
4778 |
group, or (a PCRE extension) by 'R' for a recursion test. */ |
group. PCRE extends this by allowing a name to reference a named group; |
4779 |
|
unfortunately, previously 'R' was implemented for a recursion test. |
4780 |
|
When this is compiled, we look for the named group 'R' first. At this |
4781 |
|
point we just do a basic syntax check. */ |
4782 |
|
|
4783 |
case '(': |
case '(': |
4784 |
if (ptr[3] == 'R' && ptr[4] == ')') |
if ((cd->ctypes[ptr[3]] & ctype_word) != 0) |
|
{ |
|
|
ptr += 4; |
|
|
length += 3; |
|
|
} |
|
|
else if ((digitab[ptr[3]] & ctype_digit) != 0) |
|
4785 |
{ |
{ |
4786 |
ptr += 4; |
ptr += 4; |
4787 |
length += 3; |
length += 3; |
4788 |
while ((digitab[*ptr] & ctype_digit) != 0) ptr++; |
while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++; |
4789 |
if (*ptr != ')') |
if (*ptr != ')') |
4790 |
{ |
{ |
4791 |
errorcode = ERR26; |
errorcode = ERR26; |
4824 |
*optset |= PCRE_CASELESS; |
*optset |= PCRE_CASELESS; |
4825 |
continue; |
continue; |
4826 |
|
|
4827 |
|
case 'J': |
4828 |
|
*optset |= PCRE_DUPNAMES; |
4829 |
|
options |= PCRE_JCHANGED; /* Record that it changed */ |
4830 |
|
continue; |
4831 |
|
|
4832 |
case 'm': |
case 'm': |
4833 |
*optset |= PCRE_MULTILINE; |
*optset |= PCRE_MULTILINE; |
4834 |
continue; |
continue; |
4894 |
will lead to an over-estimate on the length, but this shouldn't |
will lead to an over-estimate on the length, but this shouldn't |
4895 |
matter very much. We also have to allow for resetting options at |
matter very much. We also have to allow for resetting options at |
4896 |
the start of any alternations, which we do by setting |
the start of any alternations, which we do by setting |
4897 |
branch_newextra to 2. Finally, we record whether the case-dependent |
branch_newextra to 2. */ |
|
flag ever changes within the regex. This is used by the "required |
|
|
character" code. */ |
|
4898 |
|
|
4899 |
case ':': |
case ':': |
4900 |
if (((set|unset) & PCRE_IMS) != 0) |
if (((set|unset) & PCRE_IMS) != 0) |
4901 |
{ |
{ |
4902 |
length += 4; |
length += 4; |
4903 |
branch_newextra = 2; |
branch_newextra = 2; |
|
if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED; |
|
4904 |
} |
} |
4905 |
goto END_OPTIONS; |
goto END_OPTIONS; |
4906 |
|
|
4980 |
{ |
{ |
4981 |
duplength = length - brastack[--brastackptr]; |
duplength = length - brastack[--brastackptr]; |
4982 |
branch_extra = bralenstack[brastackptr]; |
branch_extra = bralenstack[brastackptr]; |
4983 |
|
/* This is a paranoid check to stop integer overflow later on */ |
4984 |
|
if (duplength > MAX_DUPLENGTH) |
4985 |
|
{ |
4986 |
|
errorcode = ERR50; |
4987 |
|
goto PCRE_ERROR_RETURN; |
4988 |
|
} |
4989 |
} |
} |
4990 |
else duplength = 0; |
else duplength = 0; |
4991 |
|
|
5090 |
} |
} |
5091 |
|
|
5092 |
/* Compute the size of data block needed and get it, either from malloc or |
/* Compute the size of data block needed and get it, either from malloc or |
5093 |
externally provided function. */ |
externally provided function. Integer overflow should no longer be possible |
5094 |
|
because nowadays we limit the maximum value of name_count and max_name size. */ |
5095 |
|
|
5096 |
size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); |
size = length + sizeof(real_pcre) + name_count * (max_name_size + 3); |
5097 |
re = (real_pcre *)(pcre_malloc)(size); |
re = (real_pcre *)(pcre_malloc)(size); |
5121 |
/* The starting points of the name/number translation table and of the code are |
/* The starting points of the name/number translation table and of the code are |
5122 |
passed around in the compile data block. */ |
passed around in the compile data block. */ |
5123 |
|
|
5124 |
compile_block.names_found = 0; |
cd->names_found = 0; |
5125 |
compile_block.name_entry_size = max_name_size + 3; |
cd->name_entry_size = max_name_size + 3; |
5126 |
compile_block.name_table = (uschar *)re + re->name_table_offset; |
cd->name_table = (uschar *)re + re->name_table_offset; |
5127 |
codestart = compile_block.name_table + re->name_entry_size * re->name_count; |
codestart = cd->name_table + re->name_entry_size * re->name_count; |
5128 |
compile_block.start_code = codestart; |
cd->start_code = codestart; |
5129 |
compile_block.start_pattern = (const uschar *)pattern; |
cd->start_pattern = (const uschar *)pattern; |
5130 |
compile_block.req_varyopt = 0; |
cd->req_varyopt = 0; |
5131 |
compile_block.nopartial = FALSE; |
cd->nopartial = FALSE; |
5132 |
|
|
5133 |
/* Set up a starting, non-extracting bracket, then compile the expression. On |
/* Set up a starting, non-extracting bracket, then compile the expression. On |
5134 |
error, errorcode will be set non-zero, so we don't need to look at the result |
error, errorcode will be set non-zero, so we don't need to look at the result |
5139 |
*code = OP_BRA; |
*code = OP_BRA; |
5140 |
bracount = 0; |
bracount = 0; |
5141 |
(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, |
(void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr, |
5142 |
&errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block); |
&errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd); |
5143 |
re->top_bracket = bracount; |
re->top_bracket = bracount; |
5144 |
re->top_backref = compile_block.top_backref; |
re->top_backref = cd->top_backref; |
5145 |
|
|
5146 |
if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL; |
if (cd->nopartial) re->options |= PCRE_NOPARTIAL; |
5147 |
|
|
5148 |
/* If not reached end of pattern on success, there's an excess bracket. */ |
/* If not reached end of pattern on success, there's an excess bracket. */ |
5149 |
|
|
5189 |
if ((options & PCRE_ANCHORED) == 0) |
if ((options & PCRE_ANCHORED) == 0) |
5190 |
{ |
{ |
5191 |
int temp_options = options; |
int temp_options = options; |
5192 |
if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map)) |
if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) |
5193 |
re->options |= PCRE_ANCHORED; |
re->options |= PCRE_ANCHORED; |
5194 |
else |
else |
5195 |
{ |
{ |
5199 |
{ |
{ |
5200 |
int ch = firstbyte & 255; |
int ch = firstbyte & 255; |
5201 |
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && |
re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && |
5202 |
compile_block.fcc[ch] == ch)? ch : firstbyte; |
cd->fcc[ch] == ch)? ch : firstbyte; |
5203 |
re->options |= PCRE_FIRSTSET; |
re->options |= PCRE_FIRSTSET; |
5204 |
} |
} |
5205 |
else if (is_startline(codestart, 0, compile_block.backref_map)) |
else if (is_startline(codestart, 0, cd->backref_map)) |
5206 |
re->options |= PCRE_STARTLINE; |
re->options |= PCRE_STARTLINE; |
5207 |
} |
} |
5208 |
} |
} |
5216 |
{ |
{ |
5217 |
int ch = reqbyte & 255; |
int ch = reqbyte & 255; |
5218 |
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && |
re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && |
5219 |
compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; |
cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; |
5220 |
re->options |= PCRE_REQCHSET; |
re->options |= PCRE_REQCHSET; |
5221 |
} |
} |
5222 |
|
|
5230 |
|
|
5231 |
if (re->options != 0) |
if (re->options != 0) |
5232 |
{ |
{ |
5233 |
printf("%s%s%s%s%s%s%s%s%s%s\n", |
printf("%s%s%s%s%s%s%s%s%s\n", |
5234 |
((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "", |
((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "", |
5235 |
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", |
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "", |
5236 |
((re->options & PCRE_CASELESS) != 0)? "caseless " : "", |
((re->options & PCRE_CASELESS) != 0)? "caseless " : "", |
|
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "", |
|
5237 |
((re->options & PCRE_EXTENDED) != 0)? "extended " : "", |
((re->options & PCRE_EXTENDED) != 0)? "extended " : "", |
5238 |
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", |
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "", |
5239 |
((re->options & PCRE_DOTALL) != 0)? "dotall " : "", |
((re->options & PCRE_DOTALL) != 0)? "dotall " : "", |