53 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
54 |
|
|
55 |
|
|
56 |
/* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which |
/* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which |
57 |
is also used by pcretest. PCRE_DEBUG is not defined when building a production |
is also used by pcretest. PCRE_DEBUG is not defined when building a production |
58 |
library. We do not need to select pcre16_printint.c specially, because the |
library. We do not need to select pcre16_printint.c specially, because the |
59 |
COMPILE_PCREx macro will already be appropriately set. */ |
COMPILE_PCREx macro will already be appropriately set. */ |
68 |
|
|
69 |
/* Macro for setting individual bits in class bitmaps. */ |
/* Macro for setting individual bits in class bitmaps. */ |
70 |
|
|
71 |
#define SETBIT(a,b) a[b/8] |= (1 << (b%8)) |
#define SETBIT(a,b) a[(b)/8] |= (1 << ((b)&7)) |
72 |
|
|
73 |
/* Maximum length value to check against when making sure that the integer that |
/* Maximum length value to check against when making sure that the integer that |
74 |
holds the compiled pattern length does not overflow. We make it a bit less than |
holds the compiled pattern length does not overflow. We make it a bit less than |
77 |
|
|
78 |
#define OFLOW_MAX (INT_MAX - 20) |
#define OFLOW_MAX (INT_MAX - 20) |
79 |
|
|
80 |
|
/* Definitions to allow mutual recursion */ |
81 |
|
|
82 |
|
static int |
83 |
|
add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *, |
84 |
|
const pcre_uint32 *, unsigned int); |
85 |
|
|
86 |
|
static BOOL |
87 |
|
compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, |
88 |
|
int, int, int *, int *, branch_chain *, compile_data *, int *); |
89 |
|
|
90 |
|
|
91 |
|
|
92 |
/************************************************* |
/************************************************* |
93 |
* Code parameters and static tables * |
* Code parameters and static tables * |
123 |
|
|
124 |
#define REQ_CASELESS 0x10000000l /* Indicates caselessness */ |
#define REQ_CASELESS 0x10000000l /* Indicates caselessness */ |
125 |
#define REQ_VARY 0x20000000l /* Reqchar followed non-literal item */ |
#define REQ_VARY 0x20000000l /* Reqchar followed non-literal item */ |
126 |
|
#define REQ_MASK (REQ_CASELESS | REQ_VARY) |
127 |
|
|
128 |
/* Repeated character flags. */ |
/* Repeated character flags. */ |
129 |
|
|
501 |
"too many forward references\0" |
"too many forward references\0" |
502 |
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" |
"disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0" |
503 |
"invalid UTF-16 string\0" |
"invalid UTF-16 string\0" |
504 |
|
/* 75 */ |
505 |
|
"name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0" |
506 |
|
"character value in \\u.... sequence is too large\0" |
507 |
|
"invalid UTF-32 string\0" |
508 |
; |
; |
509 |
|
|
510 |
/* Table to identify digits and hex digits. This is used when compiling |
/* Table to identify digits and hex digits. This is used when compiling |
644 |
#endif |
#endif |
645 |
|
|
646 |
|
|
|
/* Definition to allow mutual recursion */ |
|
|
|
|
|
static BOOL |
|
|
compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int, |
|
|
int *, int *, branch_chain *, compile_data *, int *); |
|
|
|
|
647 |
|
|
648 |
|
|
649 |
/************************************************* |
/************************************************* |
796 |
|
|
797 |
#else /* EBCDIC coding */ |
#else /* EBCDIC coding */ |
798 |
/* Not alphanumeric */ |
/* Not alphanumeric */ |
799 |
else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {} |
else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {} |
800 |
else if ((i = escapes[c - 0x48]) != 0) c = i; |
else if ((i = escapes[c - 0x48]) != 0) c = i; |
801 |
#endif |
#endif |
802 |
|
|
839 |
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
840 |
#endif |
#endif |
841 |
} |
} |
842 |
|
|
843 |
|
#if defined COMPILE_PCRE8 |
844 |
|
if (c > (utf ? 0x10ffff : 0xff)) |
845 |
|
#elif defined COMPILE_PCRE16 |
846 |
|
if (c > (utf ? 0x10ffff : 0xffff)) |
847 |
|
#elif defined COMPILE_PCRE32 |
848 |
|
if (utf && c > 0x10ffff) |
849 |
|
#endif |
850 |
|
{ |
851 |
|
*errorcodeptr = ERR76; |
852 |
|
} |
853 |
|
else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73; |
854 |
} |
} |
855 |
} |
} |
856 |
else |
else |
1071 |
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
1072 |
#endif |
#endif |
1073 |
|
|
1074 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
1075 |
if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; } |
if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; } |
1076 |
#else |
#elif defined COMPILE_PCRE16 |
|
#ifdef COMPILE_PCRE16 |
|
1077 |
if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; } |
if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; } |
1078 |
#endif |
#elif defined COMPILE_PCRE32 |
1079 |
|
if (utf && c > 0x10ffff) { c = -1; break; } |
1080 |
#endif |
#endif |
1081 |
} |
} |
1082 |
|
|
1369 |
name name to seek, or NULL if seeking a numbered subpattern |
name name to seek, or NULL if seeking a numbered subpattern |
1370 |
lorn name length, or subpattern number if name is NULL |
lorn name length, or subpattern number if name is NULL |
1371 |
xmode TRUE if we are in /x mode |
xmode TRUE if we are in /x mode |
1372 |
utf TRUE if we are in UTF-8 / UTF-16 mode |
utf TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode |
1373 |
count pointer to the current capturing subpattern number (updated) |
count pointer to the current capturing subpattern number (updated) |
1374 |
|
|
1375 |
Returns: the number of the named subpattern, or -1 if not found |
Returns: the number of the named subpattern, or -1 if not found |
1603 |
name name to seek, or NULL if seeking a numbered subpattern |
name name to seek, or NULL if seeking a numbered subpattern |
1604 |
lorn name length, or subpattern number if name is NULL |
lorn name length, or subpattern number if name is NULL |
1605 |
xmode TRUE if we are in /x mode |
xmode TRUE if we are in /x mode |
1606 |
utf TRUE if we are in UTF-8 / UTF-16 mode |
utf TRUE if we are in UTF-8 / UTF-16 / UTF-32 mode |
1607 |
|
|
1608 |
Returns: the number of the found subpattern, or -1 if not found |
Returns: the number of the found subpattern, or -1 if not found |
1609 |
*/ |
*/ |
1706 |
|
|
1707 |
Arguments: |
Arguments: |
1708 |
code points to the start of the pattern (the bracket) |
code points to the start of the pattern (the bracket) |
1709 |
utf TRUE in UTF-8 / UTF-16 mode |
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode |
1710 |
atend TRUE if called when the pattern is complete |
atend TRUE if called when the pattern is complete |
1711 |
cd the "compile data" structure |
cd the "compile data" structure |
1712 |
|
|
1840 |
case OP_NOTI: |
case OP_NOTI: |
1841 |
branchlength++; |
branchlength++; |
1842 |
cc += 2; |
cc += 2; |
1843 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
1844 |
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
1845 |
#endif |
#endif |
1846 |
break; |
break; |
1854 |
case OP_NOTEXACTI: |
case OP_NOTEXACTI: |
1855 |
branchlength += GET2(cc,1); |
branchlength += GET2(cc,1); |
1856 |
cc += 2 + IMM2_SIZE; |
cc += 2 + IMM2_SIZE; |
1857 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
1858 |
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]); |
1859 |
#endif |
#endif |
1860 |
break; |
break; |
1861 |
|
|
1862 |
case OP_TYPEEXACT: |
case OP_TYPEEXACT: |
1863 |
branchlength += GET2(cc,1); |
branchlength += GET2(cc,1); |
1864 |
if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2; |
if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) |
1865 |
|
cc += 2; |
1866 |
cc += 1 + IMM2_SIZE + 1; |
cc += 1 + IMM2_SIZE + 1; |
1867 |
break; |
break; |
1868 |
|
|
1897 |
|
|
1898 |
/* Check a class for variable quantification */ |
/* Check a class for variable quantification */ |
1899 |
|
|
1900 |
#if defined SUPPORT_UTF || defined COMPILE_PCRE16 |
#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32 |
1901 |
case OP_XCLASS: |
case OP_XCLASS: |
1902 |
cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS]; |
cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS]; |
1903 |
/* Fall through */ |
/* Fall through */ |
2036 |
|
|
2037 |
Arguments: |
Arguments: |
2038 |
code points to start of expression |
code points to start of expression |
2039 |
utf TRUE in UTF-8 / UTF-16 mode |
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode |
2040 |
number the required bracket number or negative to find a lookbehind |
number the required bracket number or negative to find a lookbehind |
2041 |
|
|
2042 |
Returns: pointer to the opcode for the bracket, or NULL if not found |
Returns: pointer to the opcode for the bracket, or NULL if not found |
2100 |
case OP_TYPEMINUPTO: |
case OP_TYPEMINUPTO: |
2101 |
case OP_TYPEEXACT: |
case OP_TYPEEXACT: |
2102 |
case OP_TYPEPOSUPTO: |
case OP_TYPEPOSUPTO: |
2103 |
if (code[1 + IMM2_SIZE] == OP_PROP |
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
2104 |
|| code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; |
code += 2; |
2105 |
break; |
break; |
2106 |
|
|
2107 |
case OP_MARK: |
case OP_MARK: |
2123 |
a multi-byte character. The length in the table is a minimum, so we have to |
a multi-byte character. The length in the table is a minimum, so we have to |
2124 |
arrange to skip the extra bytes. */ |
arrange to skip the extra bytes. */ |
2125 |
|
|
2126 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
2127 |
if (utf) switch(c) |
if (utf) switch(c) |
2128 |
{ |
{ |
2129 |
case OP_CHAR: |
case OP_CHAR: |
2175 |
|
|
2176 |
Arguments: |
Arguments: |
2177 |
code points to start of expression |
code points to start of expression |
2178 |
utf TRUE in UTF-8 / UTF-16 mode |
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode |
2179 |
|
|
2180 |
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found |
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found |
2181 |
*/ |
*/ |
2220 |
case OP_TYPEUPTO: |
case OP_TYPEUPTO: |
2221 |
case OP_TYPEMINUPTO: |
case OP_TYPEMINUPTO: |
2222 |
case OP_TYPEEXACT: |
case OP_TYPEEXACT: |
2223 |
if (code[1 + IMM2_SIZE] == OP_PROP |
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
2224 |
|| code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; |
code += 2; |
2225 |
break; |
break; |
2226 |
|
|
2227 |
case OP_MARK: |
case OP_MARK: |
2243 |
by a multi-byte character. The length in the table is a minimum, so we have |
by a multi-byte character. The length in the table is a minimum, so we have |
2244 |
to arrange to skip the extra bytes. */ |
to arrange to skip the extra bytes. */ |
2245 |
|
|
2246 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
2247 |
if (utf) switch(c) |
if (utf) switch(c) |
2248 |
{ |
{ |
2249 |
case OP_CHAR: |
case OP_CHAR: |
2329 |
Arguments: |
Arguments: |
2330 |
code points to start of search |
code points to start of search |
2331 |
endcode points to where to stop |
endcode points to where to stop |
2332 |
utf TRUE if in UTF-8 / UTF-16 mode |
utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode |
2333 |
cd contains pointers to tables etc. |
cd contains pointers to tables etc. |
2334 |
|
|
2335 |
Returns: TRUE if what is matched could be empty |
Returns: TRUE if what is matched could be empty |
2546 |
case OP_TYPEUPTO: |
case OP_TYPEUPTO: |
2547 |
case OP_TYPEMINUPTO: |
case OP_TYPEMINUPTO: |
2548 |
case OP_TYPEPOSUPTO: |
case OP_TYPEPOSUPTO: |
2549 |
if (code[1 + IMM2_SIZE] == OP_PROP |
if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP) |
2550 |
|| code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2; |
code += 2; |
2551 |
break; |
break; |
2552 |
|
|
2553 |
/* End of branch */ |
/* End of branch */ |
2562 |
/* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, |
/* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, |
2563 |
MINUPTO, and POSUPTO may be followed by a multibyte character */ |
MINUPTO, and POSUPTO may be followed by a multibyte character */ |
2564 |
|
|
2565 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
2566 |
case OP_STAR: |
case OP_STAR: |
2567 |
case OP_STARI: |
case OP_STARI: |
2568 |
case OP_MINSTAR: |
case OP_MINSTAR: |
2628 |
code points to start of the recursion |
code points to start of the recursion |
2629 |
endcode points to where to stop (current RECURSE item) |
endcode points to where to stop (current RECURSE item) |
2630 |
bcptr points to the chain of current (unclosed) branch starts |
bcptr points to the chain of current (unclosed) branch starts |
2631 |
utf TRUE if in UTF-8 / UTF-16 mode |
utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode |
2632 |
cd pointers to tables etc |
cd pointers to tables etc |
2633 |
|
|
2634 |
Returns: TRUE if what is matched could be empty |
Returns: TRUE if what is matched could be empty |
2775 |
Arguments: |
Arguments: |
2776 |
group points to the start of the group |
group points to the start of the group |
2777 |
adjust the amount by which the group is to be moved |
adjust the amount by which the group is to be moved |
2778 |
utf TRUE in UTF-8 / UTF-16 mode |
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode |
2779 |
cd contains pointers to tables etc. |
cd contains pointers to tables etc. |
2780 |
save_hwm the hwm forward reference pointer at the start of the group |
save_hwm the hwm forward reference pointer at the start of the group |
2781 |
|
|
2879 |
*************************************************/ |
*************************************************/ |
2880 |
|
|
2881 |
/* This function is passed the start and end of a class range, in UTF-8 mode |
/* This function is passed the start and end of a class range, in UTF-8 mode |
2882 |
with UCP support. It searches up the characters, looking for internal ranges of |
with UCP support. It searches up the characters, looking for ranges of |
2883 |
characters in the "other" case. Each call returns the next one, updating the |
characters in the "other" case. Each call returns the next one, updating the |
2884 |
start address. |
start address. A character with multiple other cases is returned on its own |
2885 |
|
with a special return value. |
2886 |
|
|
2887 |
Arguments: |
Arguments: |
2888 |
cptr points to starting character value; updated |
cptr points to starting character value; updated |
2890 |
ocptr where to put start of othercase range |
ocptr where to put start of othercase range |
2891 |
odptr where to put end of othercase range |
odptr where to put end of othercase range |
2892 |
|
|
2893 |
Yield: TRUE when range returned; FALSE when no more |
Yield: -1 when no more |
2894 |
|
0 when a range is returned |
2895 |
|
>0 the CASESET offset for char with multiple other cases |
2896 |
|
in this case, ocptr contains the original |
2897 |
*/ |
*/ |
2898 |
|
|
2899 |
static BOOL |
static int |
2900 |
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, |
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, |
2901 |
unsigned int *odptr) |
unsigned int *odptr) |
2902 |
{ |
{ |
2903 |
unsigned int c, othercase, next; |
unsigned int c, othercase, next; |
2904 |
|
int co; |
2905 |
|
|
2906 |
|
/* Find the first character that has an other case. If it has multiple other |
2907 |
|
cases, return its case offset value. */ |
2908 |
|
|
2909 |
for (c = *cptr; c <= d; c++) |
for (c = *cptr; c <= d; c++) |
2910 |
{ if ((othercase = UCD_OTHERCASE(c)) != c) break; } |
{ |
2911 |
|
if ((co = UCD_CASESET(c)) != 0) |
2912 |
|
{ |
2913 |
|
*ocptr = c++; /* Character that has the set */ |
2914 |
|
*cptr = c; /* Rest of input range */ |
2915 |
|
return co; |
2916 |
|
} |
2917 |
|
if ((othercase = UCD_OTHERCASE(c)) != c) break; |
2918 |
|
} |
2919 |
|
|
2920 |
if (c > d) return FALSE; |
if (c > d) return -1; /* Reached end of range */ |
2921 |
|
|
2922 |
*ocptr = othercase; |
*ocptr = othercase; |
2923 |
next = othercase + 1; |
next = othercase + 1; |
2928 |
next++; |
next++; |
2929 |
} |
} |
2930 |
|
|
2931 |
*odptr = next - 1; |
*odptr = next - 1; /* End of othercase range */ |
2932 |
*cptr = c; |
*cptr = c; /* Rest of input range */ |
2933 |
|
return 0; |
|
return TRUE; |
|
2934 |
} |
} |
2935 |
|
|
2936 |
|
|
2954 |
static BOOL |
static BOOL |
2955 |
check_char_prop(int c, int ptype, int pdata, BOOL negated) |
check_char_prop(int c, int ptype, int pdata, BOOL negated) |
2956 |
{ |
{ |
2957 |
|
#ifdef SUPPORT_UCP |
2958 |
|
const pcre_uint32 *p; |
2959 |
|
#endif |
2960 |
|
|
2961 |
const ucd_record *prop = GET_UCD(c); |
const ucd_record *prop = GET_UCD(c); |
2962 |
|
|
2963 |
switch(ptype) |
switch(ptype) |
2964 |
{ |
{ |
2965 |
case PT_LAMP: |
case PT_LAMP: |
2997 |
return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
return (PRIV(ucp_gentype)[prop->chartype] == ucp_L || |
2998 |
PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
PRIV(ucp_gentype)[prop->chartype] == ucp_N || |
2999 |
c == CHAR_UNDERSCORE) == negated; |
c == CHAR_UNDERSCORE) == negated; |
3000 |
|
|
3001 |
|
#ifdef SUPPORT_UCP |
3002 |
|
case PT_CLIST: |
3003 |
|
p = PRIV(ucd_caseless_sets) + prop->caseset; |
3004 |
|
for (;;) |
3005 |
|
{ |
3006 |
|
if ((unsigned int)c < *p) return !negated; |
3007 |
|
if ((unsigned int)c == *p++) return negated; |
3008 |
|
} |
3009 |
|
break; /* Control never reaches here */ |
3010 |
|
#endif |
3011 |
} |
} |
3012 |
|
|
3013 |
return FALSE; |
return FALSE; |
3014 |
} |
} |
3015 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
3026 |
|
|
3027 |
Arguments: |
Arguments: |
3028 |
previous pointer to the repeated opcode |
previous pointer to the repeated opcode |
3029 |
utf TRUE in UTF-8 / UTF-16 mode |
utf TRUE in UTF-8 / UTF-16 / UTF-32 mode |
3030 |
ptr next character in pattern |
ptr next character in pattern |
3031 |
options options bits |
options options bits |
3032 |
cd contains pointers to tables etc. |
cd contains pointers to tables etc. |
3038 |
check_auto_possessive(const pcre_uchar *previous, BOOL utf, |
check_auto_possessive(const pcre_uchar *previous, BOOL utf, |
3039 |
const pcre_uchar *ptr, int options, compile_data *cd) |
const pcre_uchar *ptr, int options, compile_data *cd) |
3040 |
{ |
{ |
3041 |
pcre_int32 c, next; |
pcre_int32 c = NOTACHAR; |
3042 |
|
pcre_int32 next; |
3043 |
int op_code = *previous++; |
int op_code = *previous++; |
3044 |
|
|
3045 |
/* Skip whitespace and comments in extended mode */ |
/* Skip whitespace and comments in extended mode */ |
3113 |
STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) |
STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0) |
3114 |
return FALSE; |
return FALSE; |
3115 |
|
|
3116 |
/* Now compare the next item with the previous opcode. First, handle cases when |
/* If the previous item is a character, get its value. */ |
|
the next item is a character. */ |
|
3117 |
|
|
3118 |
if (next >= 0) switch(op_code) |
if (op_code == OP_CHAR || op_code == OP_CHARI || |
3119 |
|
op_code == OP_NOT || op_code == OP_NOTI) |
3120 |
{ |
{ |
|
case OP_CHAR: |
|
3121 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
3122 |
GETCHARTEST(c, previous); |
GETCHARTEST(c, previous); |
3123 |
#else |
#else |
3124 |
c = *previous; |
c = *previous; |
3125 |
#endif |
#endif |
3126 |
return c != next; |
} |
3127 |
|
|
3128 |
/* For CHARI (caseless character) we must check the other case. If we have |
/* Now compare the next item with the previous opcode. First, handle cases when |
3129 |
Unicode property support, we can use it to test the other case of |
the next item is a character. */ |
|
high-valued characters. */ |
|
3130 |
|
|
3131 |
case OP_CHARI: |
if (next >= 0) |
3132 |
#ifdef SUPPORT_UTF |
{ |
3133 |
GETCHARTEST(c, previous); |
/* For a caseless UTF match, the next character may have more than one other |
3134 |
#else |
case, which maps to the special PT_CLIST property. Check this first. */ |
3135 |
c = *previous; |
|
|
#endif |
|
|
if (c == next) return FALSE; |
|
|
#ifdef SUPPORT_UTF |
|
|
if (utf) |
|
|
{ |
|
|
unsigned int othercase; |
|
|
if (next < 128) othercase = cd->fcc[next]; else |
|
3136 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
3137 |
othercase = UCD_OTHERCASE((unsigned int)next); |
if (utf && (unsigned int)c != NOTACHAR && (options & PCRE_CASELESS) != 0) |
3138 |
#else |
{ |
3139 |
othercase = NOTACHAR; |
int ocs = UCD_CASESET(next); |
3140 |
#endif |
if (ocs > 0) return check_char_prop(c, PT_CLIST, ocs, op_code >= OP_NOT); |
|
return (unsigned int)c != othercase; |
|
3141 |
} |
} |
|
else |
|
|
#endif /* SUPPORT_UTF */ |
|
|
return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ |
|
|
|
|
|
case OP_NOT: |
|
|
#ifdef SUPPORT_UTF |
|
|
GETCHARTEST(c, previous); |
|
|
#else |
|
|
c = *previous; |
|
3142 |
#endif |
#endif |
|
return c == next; |
|
3143 |
|
|
3144 |
case OP_NOTI: |
switch(op_code) |
3145 |
|
{ |
3146 |
|
case OP_CHAR: |
3147 |
|
return c != next; |
3148 |
|
|
3149 |
|
/* For CHARI (caseless character) we must check the other case. If we have |
3150 |
|
Unicode property support, we can use it to test the other case of |
3151 |
|
high-valued characters. We know that next can have only one other case, |
3152 |
|
because multi-other-case characters are dealt with above. */ |
3153 |
|
|
3154 |
|
case OP_CHARI: |
3155 |
|
if (c == next) return FALSE; |
3156 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
3157 |
GETCHARTEST(c, previous); |
if (utf) |
3158 |
|
{ |
3159 |
|
unsigned int othercase; |
3160 |
|
if (next < 128) othercase = cd->fcc[next]; else |
3161 |
|
#ifdef SUPPORT_UCP |
3162 |
|
othercase = UCD_OTHERCASE((unsigned int)next); |
3163 |
#else |
#else |
3164 |
c = *previous; |
othercase = NOTACHAR; |
3165 |
#endif |
#endif |
3166 |
if (c == next) return TRUE; |
return (unsigned int)c != othercase; |
3167 |
|
} |
3168 |
|
else |
3169 |
|
#endif /* SUPPORT_UTF */ |
3170 |
|
return (c != TABLE_GET((unsigned int)next, cd->fcc, next)); /* Not UTF */ |
3171 |
|
|
3172 |
|
case OP_NOT: |
3173 |
|
return c == next; |
3174 |
|
|
3175 |
|
case OP_NOTI: |
3176 |
|
if (c == next) return TRUE; |
3177 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
3178 |
if (utf) |
if (utf) |
3179 |
{ |
{ |
3180 |
unsigned int othercase; |
unsigned int othercase; |
3181 |
if (next < 128) othercase = cd->fcc[next]; else |
if (next < 128) othercase = cd->fcc[next]; else |
3182 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
3183 |
othercase = UCD_OTHERCASE((unsigned int)next); |
othercase = UCD_OTHERCASE((unsigned int)next); |
3184 |
#else |
#else |
3185 |
othercase = NOTACHAR; |
othercase = NOTACHAR; |
3186 |
#endif |
#endif |
3187 |
return (unsigned int)c == othercase; |
return (unsigned int)c == othercase; |
3188 |
} |
} |
3189 |
else |
else |
3190 |
#endif /* SUPPORT_UTF */ |
#endif /* SUPPORT_UTF */ |
3191 |
return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Non-UTF-8 mode */ |
return (c == TABLE_GET((unsigned int)next, cd->fcc, next)); /* Not UTF */ |
3192 |
|
|
3193 |
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. |
/* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set. |
3194 |
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */ |
3195 |
|
|
3196 |
case OP_DIGIT: |
case OP_DIGIT: |
3197 |
return next > 255 || (cd->ctypes[next] & ctype_digit) == 0; |
return next > 255 || (cd->ctypes[next] & ctype_digit) == 0; |
3198 |
|
|
3199 |
case OP_NOT_DIGIT: |
case OP_NOT_DIGIT: |
3200 |
return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0; |
return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0; |
3201 |
|
|
3202 |
case OP_WHITESPACE: |
case OP_WHITESPACE: |
3203 |
return next > 255 || (cd->ctypes[next] & ctype_space) == 0; |
return next > 255 || (cd->ctypes[next] & ctype_space) == 0; |
3204 |
|
|
3205 |
case OP_NOT_WHITESPACE: |
case OP_NOT_WHITESPACE: |
3206 |
return next <= 255 && (cd->ctypes[next] & ctype_space) != 0; |
return next <= 255 && (cd->ctypes[next] & ctype_space) != 0; |
3207 |
|
|
3208 |
case OP_WORDCHAR: |
case OP_WORDCHAR: |
3209 |
return next > 255 || (cd->ctypes[next] & ctype_word) == 0; |
return next > 255 || (cd->ctypes[next] & ctype_word) == 0; |
3210 |
|
|
3211 |
case OP_NOT_WORDCHAR: |
case OP_NOT_WORDCHAR: |
3212 |
return next <= 255 && (cd->ctypes[next] & ctype_word) != 0; |
return next <= 255 && (cd->ctypes[next] & ctype_word) != 0; |
3213 |
|
|
3214 |
case OP_HSPACE: |
case OP_HSPACE: |
3215 |
case OP_NOT_HSPACE: |
case OP_NOT_HSPACE: |
3216 |
switch(next) |
switch(next) |
3217 |
{ |
{ |
3218 |
case 0x09: |
HSPACE_CASES: |
3219 |
case 0x20: |
return op_code == OP_NOT_HSPACE; |
|
case 0xa0: |
|
|
case 0x1680: |
|
|
case 0x180e: |
|
|
case 0x2000: |
|
|
case 0x2001: |
|
|
case 0x2002: |
|
|
case 0x2003: |
|
|
case 0x2004: |
|
|
case 0x2005: |
|
|
case 0x2006: |
|
|
case 0x2007: |
|
|
case 0x2008: |
|
|
case 0x2009: |
|
|
case 0x200A: |
|
|
case 0x202f: |
|
|
case 0x205f: |
|
|
case 0x3000: |
|
|
return op_code == OP_NOT_HSPACE; |
|
|
default: |
|
|
return op_code != OP_NOT_HSPACE; |
|
|
} |
|
3220 |
|
|
3221 |
case OP_ANYNL: |
default: |
3222 |
case OP_VSPACE: |
return op_code != OP_NOT_HSPACE; |
3223 |
case OP_NOT_VSPACE: |
} |
3224 |
switch(next) |
|
3225 |
{ |
case OP_ANYNL: |
3226 |
case 0x0a: |
case OP_VSPACE: |
3227 |
case 0x0b: |
case OP_NOT_VSPACE: |
3228 |
case 0x0c: |
switch(next) |
3229 |
case 0x0d: |
{ |
3230 |
case 0x85: |
VSPACE_CASES: |
3231 |
case 0x2028: |
return op_code == OP_NOT_VSPACE; |
3232 |
case 0x2029: |
|
3233 |
return op_code == OP_NOT_VSPACE; |
default: |
3234 |
default: |
return op_code != OP_NOT_VSPACE; |
3235 |
return op_code != OP_NOT_VSPACE; |
} |
|
} |
|
3236 |
|
|
3237 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
3238 |
case OP_PROP: |
case OP_PROP: |
3239 |
return check_char_prop(next, previous[0], previous[1], FALSE); |
return check_char_prop(next, previous[0], previous[1], FALSE); |
3240 |
|
|
3241 |
case OP_NOTPROP: |
case OP_NOTPROP: |
3242 |
return check_char_prop(next, previous[0], previous[1], TRUE); |
return check_char_prop(next, previous[0], previous[1], TRUE); |
3243 |
#endif |
#endif |
3244 |
|
|
3245 |
default: |
default: |
3246 |
return FALSE; |
return FALSE; |
3247 |
|
} |
3248 |
} |
} |
3249 |
|
|
|
|
|
3250 |
/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP |
/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP |
3251 |
is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are |
is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are |
3252 |
generated only when PCRE_UCP is *not* set, that is, when only ASCII |
generated only when PCRE_UCP is *not* set, that is, when only ASCII |
3257 |
{ |
{ |
3258 |
case OP_CHAR: |
case OP_CHAR: |
3259 |
case OP_CHARI: |
case OP_CHARI: |
|
#ifdef SUPPORT_UTF |
|
|
GETCHARTEST(c, previous); |
|
|
#else |
|
|
c = *previous; |
|
|
#endif |
|
3260 |
switch(-next) |
switch(-next) |
3261 |
{ |
{ |
3262 |
case ESC_d: |
case ESC_d: |
3281 |
case ESC_H: |
case ESC_H: |
3282 |
switch(c) |
switch(c) |
3283 |
{ |
{ |
3284 |
case 0x09: |
HSPACE_CASES: |
|
case 0x20: |
|
|
case 0xa0: |
|
|
case 0x1680: |
|
|
case 0x180e: |
|
|
case 0x2000: |
|
|
case 0x2001: |
|
|
case 0x2002: |
|
|
case 0x2003: |
|
|
case 0x2004: |
|
|
case 0x2005: |
|
|
case 0x2006: |
|
|
case 0x2007: |
|
|
case 0x2008: |
|
|
case 0x2009: |
|
|
case 0x200A: |
|
|
case 0x202f: |
|
|
case 0x205f: |
|
|
case 0x3000: |
|
3285 |
return -next != ESC_h; |
return -next != ESC_h; |
3286 |
|
|
3287 |
default: |
default: |
3288 |
return -next == ESC_h; |
return -next == ESC_h; |
3289 |
} |
} |
3292 |
case ESC_V: |
case ESC_V: |
3293 |
switch(c) |
switch(c) |
3294 |
{ |
{ |
3295 |
case 0x0a: |
VSPACE_CASES: |
|
case 0x0b: |
|
|
case 0x0c: |
|
|
case 0x0d: |
|
|
case 0x85: |
|
|
case 0x2028: |
|
|
case 0x2029: |
|
3296 |
return -next != ESC_v; |
return -next != ESC_v; |
3297 |
|
|
3298 |
default: |
default: |
3299 |
return -next == ESC_v; |
return -next == ESC_v; |
3300 |
} |
} |
3399 |
|
|
3400 |
|
|
3401 |
/************************************************* |
/************************************************* |
3402 |
|
* Add a character or range to a class * |
3403 |
|
*************************************************/ |
3404 |
|
|
3405 |
|
/* This function packages up the logic of adding a character or range of |
3406 |
|
characters to a class. The character values in the arguments will be within the |
3407 |
|
valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is |
3408 |
|
mutually recursive with the function immediately below. |
3409 |
|
|
3410 |
|
Arguments: |
3411 |
|
classbits the bit map for characters < 256 |
3412 |
|
uchardptr points to the pointer for extra data |
3413 |
|
options the options word |
3414 |
|
cd contains pointers to tables etc. |
3415 |
|
start start of range character |
3416 |
|
end end of range character |
3417 |
|
|
3418 |
|
Returns: the number of < 256 characters added |
3419 |
|
the pointer to extra data is updated |
3420 |
|
*/ |
3421 |
|
|
3422 |
|
static int |
3423 |
|
add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options, |
3424 |
|
compile_data *cd, unsigned int start, unsigned int end) |
3425 |
|
{ |
3426 |
|
unsigned int c; |
3427 |
|
int n8 = 0; |
3428 |
|
|
3429 |
|
/* If caseless matching is required, scan the range and process alternate |
3430 |
|
cases. In Unicode, there are 8-bit characters that have alternate cases that |
3431 |
|
are greater than 255 and vice-versa. Sometimes we can just extend the original |
3432 |
|
range. */ |
3433 |
|
|
3434 |
|
if ((options & PCRE_CASELESS) != 0) |
3435 |
|
{ |
3436 |
|
#ifdef SUPPORT_UCP |
3437 |
|
if ((options & PCRE_UTF8) != 0) |
3438 |
|
{ |
3439 |
|
int rc; |
3440 |
|
unsigned int oc, od; |
3441 |
|
|
3442 |
|
options &= ~PCRE_CASELESS; /* Remove for recursive calls */ |
3443 |
|
c = start; |
3444 |
|
|
3445 |
|
while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0) |
3446 |
|
{ |
3447 |
|
/* Handle a single character that has more than one other case. */ |
3448 |
|
|
3449 |
|
if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd, |
3450 |
|
PRIV(ucd_caseless_sets) + rc, oc); |
3451 |
|
|
3452 |
|
/* Do nothing if the other case range is within the original range. */ |
3453 |
|
|
3454 |
|
else if (oc >= start && od <= end) continue; |
3455 |
|
|
3456 |
|
/* Extend the original range if there is overlap, noting that if oc < c, we |
3457 |
|
can't have od > end because a subrange is always shorter than the basic |
3458 |
|
range. Otherwise, use a recursive call to add the additional range. */ |
3459 |
|
|
3460 |
|
else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */ |
3461 |
|
else if (od > end && oc <= end + 1) end = od; /* Extend upwards */ |
3462 |
|
else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od); |
3463 |
|
} |
3464 |
|
} |
3465 |
|
else |
3466 |
|
#endif /* SUPPORT_UCP */ |
3467 |
|
|
3468 |
|
/* Not UTF-mode, or no UCP */ |
3469 |
|
|
3470 |
|
for (c = start; c <= end && c < 256; c++) |
3471 |
|
{ |
3472 |
|
SETBIT(classbits, cd->fcc[c]); |
3473 |
|
n8++; |
3474 |
|
} |
3475 |
|
} |
3476 |
|
|
3477 |
|
/* Now handle the original range. Adjust the final value according to the bit |
3478 |
|
length - this means that the same lists of (e.g.) horizontal spaces can be used |
3479 |
|
in all cases. */ |
3480 |
|
|
3481 |
|
#if defined COMPILE_PCRE8 |
3482 |
|
#ifdef SUPPORT_UTF |
3483 |
|
if ((options & PCRE_UTF8) == 0) |
3484 |
|
#endif |
3485 |
|
if (end > 0xff) end = 0xff; |
3486 |
|
|
3487 |
|
#elif defined COMPILE_PCRE16 |
3488 |
|
#ifdef SUPPORT_UTF |
3489 |
|
if ((options & PCRE_UTF16) == 0) |
3490 |
|
#endif |
3491 |
|
if (end > 0xffff) end = 0xffff; |
3492 |
|
|
3493 |
|
#elif defined COMPILE_PCRE32 |
3494 |
|
#ifdef SUPPORT_UTF |
3495 |
|
if ((options & PCRE_UTF32) == 0) |
3496 |
|
if (end > 0xffffu) end = 0xffffu; // FIXMEchpe rebase fix this |
3497 |
|
#endif |
3498 |
|
#endif /* COMPILE_PCRE[8|16|32] */ |
3499 |
|
|
3500 |
|
/* If all characters are less than 256, use the bit map. Otherwise use extra |
3501 |
|
data. */ |
3502 |
|
|
3503 |
|
if (end < 0x100) |
3504 |
|
{ |
3505 |
|
for (c = start; c <= end; c++) |
3506 |
|
{ |
3507 |
|
n8++; |
3508 |
|
SETBIT(classbits, c); |
3509 |
|
} |
3510 |
|
} |
3511 |
|
|
3512 |
|
else |
3513 |
|
{ |
3514 |
|
pcre_uchar *uchardata = *uchardptr; |
3515 |
|
|
3516 |
|
#ifdef SUPPORT_UTF |
3517 |
|
if ((options & PCRE_UTF8) != 0) /* All UTFs use the same flag bit */ |
3518 |
|
{ |
3519 |
|
if (start < end) |
3520 |
|
{ |
3521 |
|
*uchardata++ = XCL_RANGE; |
3522 |
|
uchardata += PRIV(ord2utf)(start, uchardata); |
3523 |
|
uchardata += PRIV(ord2utf)(end, uchardata); |
3524 |
|
} |
3525 |
|
else if (start == end) |
3526 |
|
{ |
3527 |
|
*uchardata++ = XCL_SINGLE; |
3528 |
|
uchardata += PRIV(ord2utf)(start, uchardata); |
3529 |
|
} |
3530 |
|
} |
3531 |
|
else |
3532 |
|
#endif /* SUPPORT_UTF */ |
3533 |
|
|
3534 |
|
/* Without UTF support, character values are constrained by the bit length, |
3535 |
|
and can only be > 256 for 16-bit and 32-bit libraries. */ |
3536 |
|
|
3537 |
|
#ifdef COMPILE_PCRE8 |
3538 |
|
{} |
3539 |
|
#else |
3540 |
|
if (start < end) |
3541 |
|
{ |
3542 |
|
*uchardata++ = XCL_RANGE; |
3543 |
|
*uchardata++ = start; |
3544 |
|
*uchardata++ = end; |
3545 |
|
} |
3546 |
|
else if (start == end) |
3547 |
|
{ |
3548 |
|
*uchardata++ = XCL_SINGLE; |
3549 |
|
*uchardata++ = start; |
3550 |
|
} |
3551 |
|
#endif |
3552 |
|
|
3553 |
|
*uchardptr = uchardata; /* Updata extra data pointer */ |
3554 |
|
} |
3555 |
|
|
3556 |
|
return n8; /* Number of 8-bit characters */ |
3557 |
|
} |
3558 |
|
|
3559 |
|
|
3560 |
|
|
3561 |
|
|
3562 |
|
/************************************************* |
3563 |
|
* Add a list of characters to a class * |
3564 |
|
*************************************************/ |
3565 |
|
|
3566 |
|
/* This function is used for adding a list of case-equivalent characters to a |
3567 |
|
class, and also for adding a list of horizontal or vertical whitespace. If the |
3568 |
|
list is in order (which it should be), ranges of characters are detected and |
3569 |
|
handled appropriately. This function is mutually recursive with the function |
3570 |
|
above. |
3571 |
|
|
3572 |
|
Arguments: |
3573 |
|
classbits the bit map for characters < 256 |
3574 |
|
uchardptr points to the pointer for extra data |
3575 |
|
options the options word |
3576 |
|
cd contains pointers to tables etc. |
3577 |
|
p points to row of 32-bit values, terminated by NOTACHAR |
3578 |
|
except character to omit; this is used when adding lists of |
3579 |
|
case-equivalent characters to avoid including the one we |
3580 |
|
already know about |
3581 |
|
|
3582 |
|
Returns: the number of < 256 characters added |
3583 |
|
the pointer to extra data is updated |
3584 |
|
*/ |
3585 |
|
|
3586 |
|
static int |
3587 |
|
add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options, |
3588 |
|
compile_data *cd, const pcre_uint32 *p, unsigned int except) |
3589 |
|
{ |
3590 |
|
int n8 = 0; |
3591 |
|
while (p[0] < NOTACHAR) |
3592 |
|
{ |
3593 |
|
int n = 0; |
3594 |
|
if (p[0] != except) |
3595 |
|
{ |
3596 |
|
while(p[n+1] == p[0] + n + 1) n++; |
3597 |
|
n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]); |
3598 |
|
} |
3599 |
|
p += n + 1; |
3600 |
|
} |
3601 |
|
return n8; |
3602 |
|
} |
3603 |
|
|
3604 |
|
|
3605 |
|
|
3606 |
|
/************************************************* |
3607 |
|
* Add characters not in a list to a class * |
3608 |
|
*************************************************/ |
3609 |
|
|
3610 |
|
/* This function is used for adding the complement of a list of horizontal or |
3611 |
|
vertical whitespace to a class. The list must be in order. |
3612 |
|
|
3613 |
|
Arguments: |
3614 |
|
classbits the bit map for characters < 256 |
3615 |
|
uchardptr points to the pointer for extra data |
3616 |
|
options the options word |
3617 |
|
cd contains pointers to tables etc. |
3618 |
|
p points to row of 32-bit values, terminated by NOTACHAR |
3619 |
|
|
3620 |
|
Returns: the number of < 256 characters added |
3621 |
|
the pointer to extra data is updated |
3622 |
|
*/ |
3623 |
|
|
3624 |
|
static int |
3625 |
|
add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, |
3626 |
|
int options, compile_data *cd, const pcre_uint32 *p) |
3627 |
|
{ |
3628 |
|
int n8 = 0; |
3629 |
|
if (p[0] > 0) |
3630 |
|
n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1); |
3631 |
|
while (p[0] < NOTACHAR) |
3632 |
|
{ |
3633 |
|
while (p[1] == p[0] + 1) p++; |
3634 |
|
n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1, |
3635 |
|
(p[1] == NOTACHAR)? 0x10ffff : p[1] - 1); |
3636 |
|
p++; |
3637 |
|
} |
3638 |
|
return n8; |
3639 |
|
} |
3640 |
|
|
3641 |
|
|
3642 |
|
|
3643 |
|
/************************************************* |
3644 |
* Compile one branch * |
* Compile one branch * |
3645 |
*************************************************/ |
*************************************************/ |
3646 |
|
|
3703 |
dynamically as we process the pattern. */ |
dynamically as we process the pattern. */ |
3704 |
|
|
3705 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
3706 |
/* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */ |
3707 |
BOOL utf = (options & PCRE_UTF8) != 0; |
BOOL utf = (options & PCRE_UTF8) != 0; |
3708 |
pcre_uchar utf_chars[6]; |
pcre_uchar utf_chars[6]; |
3709 |
#else |
#else |
3710 |
BOOL utf = FALSE; |
BOOL utf = FALSE; |
3711 |
#endif |
#endif |
3712 |
|
|
3713 |
/* Helper variables for OP_XCLASS opcode (for characters > 255). */ |
/* Helper variables for OP_XCLASS opcode (for characters > 255). We define |
3714 |
|
class_uchardata always so that it can be passed to add_to_class() always, |
3715 |
|
though it will not be used in non-UTF 8-bit cases. This avoids having to supply |
3716 |
|
alternative calls for the different cases. */ |
3717 |
|
|
3718 |
|
pcre_uchar *class_uchardata; |
3719 |
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
3720 |
BOOL xclass; |
BOOL xclass; |
|
pcre_uchar *class_uchardata; |
|
3721 |
pcre_uchar *class_uchardata_base; |
pcre_uchar *class_uchardata_base; |
3722 |
#endif |
#endif |
3723 |
|
|
3761 |
BOOL is_recurse; |
BOOL is_recurse; |
3762 |
BOOL reset_bracount; |
BOOL reset_bracount; |
3763 |
int class_has_8bitchar; |
int class_has_8bitchar; |
3764 |
int class_single_char; |
int class_one_char; |
3765 |
int newoptions; |
int newoptions; |
3766 |
int recno; |
int recno; |
3767 |
int refsign; |
int refsign; |
4059 |
|
|
4060 |
should_flip_negation = FALSE; |
should_flip_negation = FALSE; |
4061 |
|
|
4062 |
/* For optimization purposes, we track some properties of the class. |
/* For optimization purposes, we track some properties of the class: |
4063 |
class_has_8bitchar will be non-zero, if the class contains at least one |
class_has_8bitchar will be non-zero if the class contains at least one < |
4064 |
< 256 character. class_single_char will be 1 if the class contains only |
256 character; class_one_char will be 1 if the class contains just one |
4065 |
a single character. */ |
character. */ |
4066 |
|
|
4067 |
class_has_8bitchar = 0; |
class_has_8bitchar = 0; |
4068 |
class_single_char = 0; |
class_one_char = 0; |
4069 |
|
|
4070 |
/* Initialize the 32-char bit map to all zeros. We build the map in a |
/* Initialize the 32-char bit map to all zeros. We build the map in a |
4071 |
temporary bit of memory, in case the class contains only 1 character (less |
temporary bit of memory, in case the class contains fewer than two |
4072 |
than 256), because in that case the compiled code doesn't use the bit map. |
8-bit characters because in that case the compiled code doesn't use the bit |
4073 |
*/ |
map. */ |
4074 |
|
|
4075 |
memset(classbits, 0, 32 * sizeof(pcre_uint8)); |
memset(classbits, 0, 32 * sizeof(pcre_uint8)); |
4076 |
|
|
4077 |
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
4078 |
xclass = FALSE; /* No chars >= 256 */ |
xclass = FALSE; |
4079 |
class_uchardata = code + LINK_SIZE + 2; /* For UTF-8 items */ |
class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ |
4080 |
class_uchardata_base = class_uchardata; /* For resetting in pass 1 */ |
class_uchardata_base = class_uchardata; /* Save the start */ |
4081 |
#endif |
#endif |
4082 |
|
|
4083 |
/* Process characters until ] is reached. By writing this as a "do" it |
/* Process characters until ] is reached. By writing this as a "do" it |
4088 |
{ |
{ |
4089 |
const pcre_uchar *oldptr; |
const pcre_uchar *oldptr; |
4090 |
|
|
4091 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
4092 |
if (utf && HAS_EXTRALEN(c)) |
if (utf && HAS_EXTRALEN(c)) |
4093 |
{ /* Braces are required because the */ |
{ /* Braces are required because the */ |
4094 |
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
4099 |
/* In the pre-compile phase, accumulate the length of any extra |
/* In the pre-compile phase, accumulate the length of any extra |
4100 |
data and reset the pointer. This is so that very large classes that |
data and reset the pointer. This is so that very large classes that |
4101 |
contain a zillion > 255 characters no longer overwrite the work space |
contain a zillion > 255 characters no longer overwrite the work space |
4102 |
(which is on the stack). */ |
(which is on the stack). We have to remember that there was XCLASS data, |
4103 |
|
however. */ |
4104 |
|
|
4105 |
if (lengthptr != NULL) |
if (lengthptr != NULL && class_uchardata > class_uchardata_base) |
4106 |
{ |
{ |
4107 |
|
xclass = TRUE; |
4108 |
*lengthptr += class_uchardata - class_uchardata_base; |
*lengthptr += class_uchardata - class_uchardata_base; |
4109 |
class_uchardata = class_uchardata_base; |
class_uchardata = class_uchardata_base; |
4110 |
} |
} |
4163 |
alpha. This relies on the fact that the class table starts with |
alpha. This relies on the fact that the class table starts with |
4164 |
alpha, lower, upper as the first 3 entries. */ |
alpha, lower, upper as the first 3 entries. */ |
4165 |
|
|
4166 |
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) |
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) |
4167 |
posix_class = 0; |
posix_class = 0; |
4168 |
|
|
4169 |
/* When PCRE_UCP is set, some of the POSIX classes are converted to |
/* When PCRE_UCP is set, some of the POSIX classes are converted to |
4206 |
for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; |
for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; |
4207 |
} |
} |
4208 |
|
|
4209 |
/* Not see if we need to remove any special characters. An option |
/* Now see if we need to remove any special characters. An option |
4210 |
value of 1 removes vertical space and 2 removes underscore. */ |
value of 1 removes vertical space and 2 removes underscore. */ |
4211 |
|
|
4212 |
if (tabopt < 0) tabopt = -tabopt; |
if (tabopt < 0) tabopt = -tabopt; |
4222 |
for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; |
for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; |
4223 |
|
|
4224 |
ptr = tempptr + 1; |
ptr = tempptr + 1; |
4225 |
/* Every class contains at least one < 256 characters. */ |
/* Every class contains at least one < 256 character. */ |
4226 |
class_has_8bitchar = 1; |
class_has_8bitchar = 1; |
4227 |
/* Every class contains at least two characters. */ |
/* Every class contains at least two characters. */ |
4228 |
class_single_char = 2; |
class_one_char = 2; |
4229 |
continue; /* End of POSIX syntax handling */ |
continue; /* End of POSIX syntax handling */ |
4230 |
} |
} |
4231 |
|
|
4233 |
of the specials, which just set a flag. The sequence \b is a special |
of the specials, which just set a flag. The sequence \b is a special |
4234 |
case. Inside a class (and only there) it is treated as backspace. We |
case. Inside a class (and only there) it is treated as backspace. We |
4235 |
assume that other escapes have more than one character in them, so |
assume that other escapes have more than one character in them, so |
4236 |
speculatively set both class_has_8bitchar and class_single_char bigger |
speculatively set both class_has_8bitchar and class_one_char bigger |
4237 |
than one. Unrecognized escapes fall through and are either treated |
than one. Unrecognized escapes fall through and are either treated |
4238 |
as literal characters (by default), or are faulted if |
as literal characters (by default), or are faulted if |
4239 |
PCRE_EXTRA is set. */ |
PCRE_EXTRA is set. */ |
4266 |
/* Every class contains at least two < 256 characters. */ |
/* Every class contains at least two < 256 characters. */ |
4267 |
class_has_8bitchar++; |
class_has_8bitchar++; |
4268 |
/* Every class contains at least two characters. */ |
/* Every class contains at least two characters. */ |
4269 |
class_single_char += 2; |
class_one_char += 2; |
4270 |
|
|
4271 |
switch (-c) |
switch (-c) |
4272 |
{ |
{ |
4302 |
|
|
4303 |
/* Perl 5.004 onwards omits VT from \s, but we must preserve it |
/* Perl 5.004 onwards omits VT from \s, but we must preserve it |
4304 |
if it was previously set by something earlier in the character |
if it was previously set by something earlier in the character |
4305 |
class. */ |
class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and |
4306 |
|
EBCDIC, so we lazily just adjust the appropriate bit. */ |
4307 |
|
|
4308 |
case ESC_s: |
case ESC_s: |
4309 |
classbits[0] |= cbits[cbit_space]; |
classbits[0] |= cbits[cbit_space]; |
4316 |
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; |
for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; |
4317 |
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ |
classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ |
4318 |
continue; |
continue; |
4319 |
|
|
4320 |
|
/* The rest apply in both UCP and non-UCP cases. */ |
4321 |
|
|
4322 |
case ESC_h: |
case ESC_h: |
4323 |
SETBIT(classbits, 0x09); /* VT */ |
(void)add_list_to_class(classbits, &class_uchardata, options, cd, |
4324 |
SETBIT(classbits, 0x20); /* SPACE */ |
PRIV(hspace_list), NOTACHAR); |
|
SETBIT(classbits, 0xa0); /* NSBP */ |
|
|
#ifndef COMPILE_PCRE8 |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
*class_uchardata++ = 0x1680; |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
*class_uchardata++ = 0x180e; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x2000; |
|
|
*class_uchardata++ = 0x200a; |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
*class_uchardata++ = 0x202f; |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
*class_uchardata++ = 0x205f; |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
*class_uchardata++ = 0x3000; |
|
|
#elif defined SUPPORT_UTF |
|
|
if (utf) |
|
|
{ |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata); |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata); |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata); |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata); |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata); |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata); |
|
|
} |
|
|
#endif |
|
4325 |
continue; |
continue; |
4326 |
|
|
4327 |
case ESC_H: |
case ESC_H: |
4328 |
for (c = 0; c < 32; c++) |
(void)add_not_list_to_class(classbits, &class_uchardata, options, |
4329 |
{ |
cd, PRIV(hspace_list)); |
|
int x = 0xff; |
|
|
switch (c) |
|
|
{ |
|
|
case 0x09/8: x ^= 1 << (0x09%8); break; |
|
|
case 0x20/8: x ^= 1 << (0x20%8); break; |
|
|
case 0xa0/8: x ^= 1 << (0xa0%8); break; |
|
|
default: break; |
|
|
} |
|
|
classbits[c] |= x; |
|
|
} |
|
|
#ifndef COMPILE_PCRE8 |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x0100; |
|
|
*class_uchardata++ = 0x167f; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x1681; |
|
|
*class_uchardata++ = 0x180d; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x180f; |
|
|
*class_uchardata++ = 0x1fff; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x200b; |
|
|
*class_uchardata++ = 0x202e; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x2030; |
|
|
*class_uchardata++ = 0x205e; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x2060; |
|
|
*class_uchardata++ = 0x2fff; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x3001; |
|
|
#ifdef SUPPORT_UTF |
|
|
if (utf) |
|
|
class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); |
|
|
else |
|
|
#endif |
|
|
*class_uchardata++ = 0xffff; |
|
|
#elif defined SUPPORT_UTF |
|
|
if (utf) |
|
|
{ |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata); |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata); |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata); |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata); |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata); |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata); |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); |
|
|
} |
|
|
#endif |
|
4330 |
continue; |
continue; |
4331 |
|
|
4332 |
case ESC_v: |
case ESC_v: |
4333 |
SETBIT(classbits, 0x0a); /* LF */ |
(void)add_list_to_class(classbits, &class_uchardata, options, cd, |
4334 |
SETBIT(classbits, 0x0b); /* VT */ |
PRIV(vspace_list), NOTACHAR); |
|
SETBIT(classbits, 0x0c); /* FF */ |
|
|
SETBIT(classbits, 0x0d); /* CR */ |
|
|
SETBIT(classbits, 0x85); /* NEL */ |
|
|
#ifndef COMPILE_PCRE8 |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x2028; |
|
|
*class_uchardata++ = 0x2029; |
|
|
#elif defined SUPPORT_UTF |
|
|
if (utf) |
|
|
{ |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata); |
|
|
} |
|
|
#endif |
|
4335 |
continue; |
continue; |
4336 |
|
|
4337 |
case ESC_V: |
case ESC_V: |
4338 |
for (c = 0; c < 32; c++) |
(void)add_not_list_to_class(classbits, &class_uchardata, options, |
4339 |
{ |
cd, PRIV(vspace_list)); |
|
int x = 0xff; |
|
|
switch (c) |
|
|
{ |
|
|
case 0x0a/8: x ^= 1 << (0x0a%8); |
|
|
x ^= 1 << (0x0b%8); |
|
|
x ^= 1 << (0x0c%8); |
|
|
x ^= 1 << (0x0d%8); |
|
|
break; |
|
|
case 0x85/8: x ^= 1 << (0x85%8); break; |
|
|
default: break; |
|
|
} |
|
|
classbits[c] |= x; |
|
|
} |
|
|
|
|
|
#ifndef COMPILE_PCRE8 |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x0100; |
|
|
*class_uchardata++ = 0x2027; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
*class_uchardata++ = 0x202a; |
|
|
#ifdef SUPPORT_UTF |
|
|
if (utf) |
|
|
class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); |
|
|
else |
|
|
#endif |
|
|
*class_uchardata++ = 0xffff; |
|
|
#elif defined SUPPORT_UTF |
|
|
if (utf) |
|
|
{ |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata); |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); |
|
|
} |
|
|
#endif |
|
4340 |
continue; |
continue; |
4341 |
|
|
4342 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
4347 |
int pdata; |
int pdata; |
4348 |
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); |
int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); |
4349 |
if (ptype < 0) goto FAILED; |
if (ptype < 0) goto FAILED; |
|
xclass = TRUE; |
|
4350 |
*class_uchardata++ = ((-c == ESC_p) != negated)? |
*class_uchardata++ = ((-c == ESC_p) != negated)? |
4351 |
XCL_PROP : XCL_NOTPROP; |
XCL_PROP : XCL_NOTPROP; |
4352 |
*class_uchardata++ = ptype; |
*class_uchardata++ = ptype; |
4366 |
goto FAILED; |
goto FAILED; |
4367 |
} |
} |
4368 |
class_has_8bitchar--; /* Undo the speculative increase. */ |
class_has_8bitchar--; /* Undo the speculative increase. */ |
4369 |
class_single_char -= 2; /* Undo the speculative increase. */ |
class_one_char -= 2; /* Undo the speculative increase. */ |
4370 |
c = *ptr; /* Get the final character and fall through */ |
c = *ptr; /* Get the final character and fall through */ |
4371 |
break; |
break; |
4372 |
} |
} |
4373 |
} |
} |
4374 |
|
|
4375 |
/* Fall through if we have a single character (c >= 0). This may be |
/* Fall through if the escape just defined a single character (c >= 0). |
4376 |
greater than 256. */ |
This may be greater than 256. */ |
4377 |
|
|
4378 |
} /* End of backslash handling */ |
} /* End of backslash handling */ |
4379 |
|
|
4380 |
/* A single character may be followed by '-' to form a range. However, |
/* A character may be followed by '-' to form a range. However, Perl does |
4381 |
Perl does not permit ']' to be the end of the range. A '-' character |
not permit ']' to be the end of the range. A '-' character at the end is |
4382 |
at the end is treated as a literal. Perl ignores orphaned \E sequences |
treated as a literal. Perl ignores orphaned \E sequences entirely. The |
4383 |
entirely. The code for handling \Q and \E is messy. */ |
code for handling \Q and \E is messy. */ |
4384 |
|
|
4385 |
CHECK_RANGE: |
CHECK_RANGE: |
4386 |
while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) |
while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E) |
4388 |
inescq = FALSE; |
inescq = FALSE; |
4389 |
ptr += 2; |
ptr += 2; |
4390 |
} |
} |
|
|
|
4391 |
oldptr = ptr; |
oldptr = ptr; |
4392 |
|
|
4393 |
/* Remember \r or \n */ |
/* Remember if \r or \n were explicitly used */ |
4394 |
|
|
4395 |
if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; |
if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; |
4396 |
|
|
4413 |
inescq = TRUE; |
inescq = TRUE; |
4414 |
break; |
break; |
4415 |
} |
} |
4416 |
|
|
4417 |
|
/* Minus (hyphen) at the end of a class is treated as a literal, so put |
4418 |
|
back the pointer and jump to handle the character that preceded it. */ |
4419 |
|
|
4420 |
if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) |
if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET)) |
4421 |
{ |
{ |
4422 |
ptr = oldptr; |
ptr = oldptr; |
4423 |
goto LONE_SINGLE_CHARACTER; |
goto CLASS_SINGLE_CHARACTER; |
4424 |
} |
} |
4425 |
|
|
4426 |
|
/* Otherwise, we have a potential range; pick up the next character */ |
4427 |
|
|
4428 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
4429 |
if (utf) |
if (utf) |
4443 |
d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); |
d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); |
4444 |
if (*errorcodeptr != 0) goto FAILED; |
if (*errorcodeptr != 0) goto FAILED; |
4445 |
|
|
4446 |
/* \b is backspace; any other special means the '-' was literal */ |
/* \b is backspace; any other special means the '-' was literal. */ |
4447 |
|
|
4448 |
if (d < 0) |
if (d < 0) |
4449 |
{ |
{ |
4450 |
if (d == -ESC_b) d = CHAR_BS; else |
if (d == -ESC_b) d = CHAR_BS; else |
4451 |
{ |
{ |
4452 |
ptr = oldptr; |
ptr = oldptr; |
4453 |
goto LONE_SINGLE_CHARACTER; /* A few lines below */ |
goto CLASS_SINGLE_CHARACTER; /* A few lines below */ |
4454 |
} |
} |
4455 |
} |
} |
4456 |
} |
} |
4457 |
|
|
4458 |
/* Check that the two values are in the correct order. Optimize |
/* Check that the two values are in the correct order. Optimize |
4459 |
one-character ranges */ |
one-character ranges. */ |
4460 |
|
|
4461 |
if (d < c) |
if (d < c) |
4462 |
{ |
{ |
4463 |
*errorcodeptr = ERR8; |
*errorcodeptr = ERR8; |
4464 |
goto FAILED; |
goto FAILED; |
4465 |
} |
} |
4466 |
|
if (d == c) goto CLASS_SINGLE_CHARACTER; /* A few lines below */ |
4467 |
|
|
4468 |
if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ |
/* We have found a character range, so single character optimizations |
4469 |
|
cannot be done anymore. Any value greater than 1 indicates that there |
4470 |
|
is more than one character. */ |
4471 |
|
|
4472 |
|
class_one_char = 2; |
4473 |
|
|
4474 |
/* Remember \r or \n */ |
/* Remember an explicit \r or \n, and add the range to the class. */ |
4475 |
|
|
4476 |
if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; |
if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF; |
4477 |
|
|
4478 |
/* Since we found a character range, single character optimizations |
class_has_8bitchar += |
4479 |
cannot be done anymore. */ |
add_to_class(classbits, &class_uchardata, options, cd, c, d); |
4480 |
class_single_char = 2; |
|
|
|
|
|
/* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless |
|
|
matching, we have to use an XCLASS with extra data items. Caseless |
|
|
matching for characters > 127 is available only if UCP support is |
|
|
available. */ |
|
|
|
|
|
#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8) |
|
|
if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127))) |
|
|
#elif defined SUPPORT_UTF |
|
|
if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) |
|
|
#elif !(defined COMPILE_PCRE8) |
|
|
if (d > 255) |
|
|
#endif |
|
|
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) |
|
|
{ |
|
|
xclass = TRUE; |
|
|
|
|
|
/* With UCP support, we can find the other case equivalents of |
|
|
the relevant characters. There may be several ranges. Optimize how |
|
|
they fit with the basic range. */ |
|
|
|
|
|
#ifdef SUPPORT_UCP |
|
|
#ifndef COMPILE_PCRE8 |
|
|
if (utf && (options & PCRE_CASELESS) != 0) |
|
|
#else |
|
|
if ((options & PCRE_CASELESS) != 0) |
|
|
#endif |
|
|
{ |
|
|
unsigned int occ, ocd; |
|
|
unsigned int cc = c; |
|
|
unsigned int origd = d; |
|
|
while (get_othercase_range(&cc, origd, &occ, &ocd)) |
|
|
{ |
|
|
if (occ >= (unsigned int)c && |
|
|
ocd <= (unsigned int)d) |
|
|
continue; /* Skip embedded ranges */ |
|
|
|
|
|
if (occ < (unsigned int)c && |
|
|
ocd >= (unsigned int)c - 1) /* Extend the basic range */ |
|
|
{ /* if there is overlap, */ |
|
|
c = occ; /* noting that if occ < c */ |
|
|
continue; /* we can't have ocd > d */ |
|
|
} /* because a subrange is */ |
|
|
if (ocd > (unsigned int)d && |
|
|
occ <= (unsigned int)d + 1) /* always shorter than */ |
|
|
{ /* the basic range. */ |
|
|
d = ocd; |
|
|
continue; |
|
|
} |
|
|
|
|
|
if (occ == ocd) |
|
|
{ |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
} |
|
|
else |
|
|
{ |
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
class_uchardata += PRIV(ord2utf)(occ, class_uchardata); |
|
|
} |
|
|
class_uchardata += PRIV(ord2utf)(ocd, class_uchardata); |
|
|
} |
|
|
} |
|
|
#endif /* SUPPORT_UCP */ |
|
|
|
|
|
/* Now record the original range, possibly modified for UCP caseless |
|
|
overlapping ranges. */ |
|
|
|
|
|
*class_uchardata++ = XCL_RANGE; |
|
|
#ifdef SUPPORT_UTF |
|
|
#ifndef COMPILE_PCRE8 |
|
|
if (utf) |
|
|
{ |
|
|
class_uchardata += PRIV(ord2utf)(c, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(d, class_uchardata); |
|
|
} |
|
|
else |
|
|
{ |
|
|
*class_uchardata++ = c; |
|
|
*class_uchardata++ = d; |
|
|
} |
|
|
#else |
|
|
class_uchardata += PRIV(ord2utf)(c, class_uchardata); |
|
|
class_uchardata += PRIV(ord2utf)(d, class_uchardata); |
|
|
#endif |
|
|
#else /* SUPPORT_UTF */ |
|
|
*class_uchardata++ = c; |
|
|
*class_uchardata++ = d; |
|
|
#endif /* SUPPORT_UTF */ |
|
|
|
|
|
/* With UCP support, we are done. Without UCP support, there is no |
|
|
caseless matching for UTF characters > 127; we can use the bit map |
|
|
for the smaller ones. As for 16 bit characters without UTF, we |
|
|
can still use */ |
|
|
|
|
|
#ifdef SUPPORT_UCP |
|
|
#ifndef COMPILE_PCRE8 |
|
|
if (utf) |
|
|
#endif |
|
|
continue; /* With next character in the class */ |
|
|
#endif /* SUPPORT_UCP */ |
|
|
|
|
|
#if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8) |
|
|
if (utf) |
|
|
{ |
|
|
if ((options & PCRE_CASELESS) == 0 || c > 127) continue; |
|
|
/* Adjust upper limit and fall through to set up the map */ |
|
|
d = 127; |
|
|
} |
|
|
else |
|
|
{ |
|
|
if (c > 255) continue; |
|
|
/* Adjust upper limit and fall through to set up the map */ |
|
|
d = 255; |
|
|
} |
|
|
#elif defined SUPPORT_UTF && !defined(SUPPORT_UCP) |
|
|
if ((options & PCRE_CASELESS) == 0 || c > 127) continue; |
|
|
/* Adjust upper limit and fall through to set up the map */ |
|
|
d = 127; |
|
|
#else |
|
|
if (c > 255) continue; |
|
|
/* Adjust upper limit and fall through to set up the map */ |
|
|
d = 255; |
|
|
#endif /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */ |
|
|
} |
|
|
#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */ |
|
|
|
|
|
/* We use the bit map for 8 bit mode, or when the characters fall |
|
|
partially or entirely to [0-255] ([0-127] for UCP) ranges. */ |
|
|
|
|
|
class_has_8bitchar = 1; |
|
|
|
|
|
/* We can save a bit of time by skipping this in the pre-compile. */ |
|
|
|
|
|
if (lengthptr == NULL) for (; c <= d; c++) |
|
|
{ |
|
|
classbits[c/8] |= (1 << (c&7)); |
|
|
if ((options & PCRE_CASELESS) != 0) |
|
|
{ |
|
|
int uc = cd->fcc[c]; /* flip case */ |
|
|
classbits[uc/8] |= (1 << (uc&7)); |
|
|
} |
|
|
} |
|
|
|
|
4481 |
continue; /* Go get the next char in the class */ |
continue; /* Go get the next char in the class */ |
4482 |
} |
} |
4483 |
|
|
4484 |
/* Handle a lone single character - we can get here for a normal |
/* Handle a single character - we can get here for a normal non-escape |
4485 |
non-escape char, or after \ that introduces a single character or for an |
char, or after \ that introduces a single character or for an apparent |
4486 |
apparent range that isn't. */ |
range that isn't. Only the value 1 matters for class_one_char, so don't |
4487 |
|
increase it if it is already 2 or more ... just in case there's a class |
4488 |
LONE_SINGLE_CHARACTER: |
with a zillion characters in it. */ |
4489 |
|
|
4490 |
/* Only the value of 1 matters for class_single_char. */ |
CLASS_SINGLE_CHARACTER: |
4491 |
|
if (class_one_char < 2) class_one_char++; |
4492 |
if (class_single_char < 2) class_single_char++; |
|
4493 |
|
/* If class_one_char is 1, we have the first single character in the |
4494 |
/* If class_charcount is 1, we saw precisely one character. As long as |
class, and there have been no prior ranges, or XCLASS items generated by |
4495 |
there was no use of \p or \P, in other words, no use of any XCLASS |
escapes. If this is the final character in the class, we can optimize by |
4496 |
features, we can optimize. |
turning the item into a 1-character OP_CHAR[I] if it's positive, or |
4497 |
|
OP_NOT[I] if it's negative. In the positive case, it can cause firstchar |
4498 |
The optimization throws away the bit map. We turn the item into a |
to be set. Otherwise, there can be no first char if this item is first, |
4499 |
1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative. |
whatever repeat count may follow. In the case of reqchar, save the |
4500 |
In the positive case, it can cause firstchar to be set. Otherwise, there |
previous value for reinstating. */ |
|
can be no first char if this item is first, whatever repeat count may |
|
|
follow. In the case of reqchar, save the previous value for reinstating. */ |
|
4501 |
|
|
4502 |
if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
if (class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) |
4503 |
{ |
{ |
4504 |
ptr++; |
ptr++; |
4505 |
zeroreqchar = reqchar; |
zeroreqchar = reqchar; |
4506 |
|
|
4507 |
if (negate_class) |
if (negate_class) |
4508 |
{ |
{ |
4509 |
|
#ifdef SUPPORT_UCP |
4510 |
|
// FIXMEchpe pcreuint32? |
4511 |
|
int d; |
4512 |
|
#endif |
4513 |
if (firstchar == REQ_UNSET) firstchar = REQ_NONE; |
if (firstchar == REQ_UNSET) firstchar = REQ_NONE; |
4514 |
zerofirstchar = firstchar; |
zerofirstchar = firstchar; |
4515 |
*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT; |
|
4516 |
#ifdef SUPPORT_UTF |
/* For caseless UTF-8 mode when UCP support is available, check |
4517 |
if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) |
whether this character has more than one other case. If so, generate |
4518 |
code += PRIV(ord2utf)(c, code); |
a special OP_NOTPROP item instead of OP_NOTI. */ |
4519 |
else |
|
4520 |
|
#ifdef SUPPORT_UCP |
4521 |
|
if (utf && (options & PCRE_CASELESS) != 0 && |
4522 |
|
(d = UCD_CASESET(c)) != 0) |
4523 |
|
{ |
4524 |
|
*code++ = OP_NOTPROP; |
4525 |
|
*code++ = PT_CLIST; |
4526 |
|
*code++ = d; |
4527 |
|
} |
4528 |
|
else |
4529 |
#endif |
#endif |
4530 |
*code++ = c; |
/* Char has only one other case, or UCP not available */ |
4531 |
goto NOT_CHAR; |
|
4532 |
|
{ |
4533 |
|
*code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT; |
4534 |
|
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
4535 |
|
if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) |
4536 |
|
code += PRIV(ord2utf)(c, code); |
4537 |
|
else |
4538 |
|
#endif |
4539 |
|
*code++ = c; |
4540 |
|
} |
4541 |
|
|
4542 |
|
/* We are finished with this character class */ |
4543 |
|
|
4544 |
|
goto END_CLASS; |
4545 |
} |
} |
4546 |
|
|
4547 |
/* For a single, positive character, get the value into mcbuffer, and |
/* For a single, positive character, get the value into mcbuffer, and |
4548 |
then we can handle this with the normal one-character code. */ |
then we can handle this with the normal one-character code. */ |
4549 |
|
|
4550 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
4551 |
if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) |
if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) |
4552 |
mclength = PRIV(ord2utf)(c, mcbuffer); |
mclength = PRIV(ord2utf)(c, mcbuffer); |
4553 |
else |
else |
4558 |
} |
} |
4559 |
goto ONE_CHAR; |
goto ONE_CHAR; |
4560 |
} /* End of 1-char optimization */ |
} /* End of 1-char optimization */ |
4561 |
|
|
4562 |
/* Handle a character that cannot go in the bit map. */ |
/* There is more than one character in the class, or an XCLASS item |
4563 |
|
has been generated. Add this character to the class. */ |
4564 |
#if defined SUPPORT_UTF && !(defined COMPILE_PCRE8) |
|
4565 |
if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127))) |
class_has_8bitchar += |
4566 |
#elif defined SUPPORT_UTF |
add_to_class(classbits, &class_uchardata, options, cd, c, c); |
|
if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) |
|
|
#elif !(defined COMPILE_PCRE8) |
|
|
if (c > 255) |
|
|
#endif |
|
|
|
|
|
#if defined SUPPORT_UTF || !(defined COMPILE_PCRE8) |
|
|
{ |
|
|
xclass = TRUE; |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
#ifdef SUPPORT_UTF |
|
|
#ifndef COMPILE_PCRE8 |
|
|
/* In non 8 bit mode, we can get here even if we are not in UTF mode. */ |
|
|
if (!utf) |
|
|
*class_uchardata++ = c; |
|
|
else |
|
|
#endif |
|
|
class_uchardata += PRIV(ord2utf)(c, class_uchardata); |
|
|
#else /* SUPPORT_UTF */ |
|
|
*class_uchardata++ = c; |
|
|
#endif /* SUPPORT_UTF */ |
|
|
|
|
|
#ifdef SUPPORT_UCP |
|
|
#ifdef COMPILE_PCRE8 |
|
|
if ((options & PCRE_CASELESS) != 0) |
|
|
#else |
|
|
/* In non 8 bit mode, we can get here even if we are not in UTF mode. */ |
|
|
if (utf && (options & PCRE_CASELESS) != 0) |
|
|
#endif |
|
|
{ |
|
|
unsigned int othercase; |
|
|
if ((int)(othercase = UCD_OTHERCASE(c)) != c) |
|
|
{ |
|
|
*class_uchardata++ = XCL_SINGLE; |
|
|
class_uchardata += PRIV(ord2utf)(othercase, class_uchardata); |
|
|
} |
|
|
} |
|
|
#endif /* SUPPORT_UCP */ |
|
|
|
|
|
} |
|
|
else |
|
|
#endif /* SUPPORT_UTF || COMPILE_PCRE16 */ |
|
|
|
|
|
/* Handle a single-byte character */ |
|
|
{ |
|
|
class_has_8bitchar = 1; |
|
|
classbits[c/8] |= (1 << (c&7)); |
|
|
if ((options & PCRE_CASELESS) != 0) |
|
|
{ |
|
|
c = cd->fcc[c]; /* flip case */ |
|
|
classbits[c/8] |= (1 << (c&7)); |
|
|
} |
|
|
} |
|
4567 |
} |
} |
4568 |
|
|
4569 |
/* Loop until ']' reached. This "while" is the end of the "do" far above. |
/* Loop until ']' reached. This "while" is the end of the "do" far above. |
4583 |
goto FAILED; |
goto FAILED; |
4584 |
} |
} |
4585 |
|
|
4586 |
|
/* We will need an XCLASS if data has been placed in class_uchardata. In |
4587 |
|
the second phase this is a sufficient test. However, in the pre-compile |
4588 |
|
phase, class_uchardata gets emptied to prevent workspace overflow, so |
4589 |
|
only if the very last character in the class needs XCLASS will it contain |
4590 |
|
anything at this point. For this reason, xclass gets set TRUE above when |
4591 |
|
class_uchardata is emptied, and that's why this code is the way it is here |
4592 |
|
instead of just doing a test on class_uchardata below. */ |
4593 |
|
|
4594 |
|
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8 |
4595 |
|
if (class_uchardata > class_uchardata_base) xclass = TRUE; |
4596 |
|
#endif |
4597 |
|
|
4598 |
/* If this is the first thing in the branch, there can be no first char |
/* If this is the first thing in the branch, there can be no first char |
4599 |
setting, whatever the repeat count. Any reqchar setting must remain |
setting, whatever the repeat count. Any reqchar setting must remain |
4600 |
unchanged after any kind of repeat. */ |
unchanged after any kind of repeat. */ |
4657 |
memcpy(code, classbits, 32); |
memcpy(code, classbits, 32); |
4658 |
} |
} |
4659 |
code += 32 / sizeof(pcre_uchar); |
code += 32 / sizeof(pcre_uchar); |
4660 |
NOT_CHAR: |
|
4661 |
|
END_CLASS: |
4662 |
break; |
break; |
4663 |
|
|
4664 |
|
|
4768 |
if (*previous == OP_CHAR || *previous == OP_CHARI |
if (*previous == OP_CHAR || *previous == OP_CHARI |
4769 |
|| *previous == OP_NOT || *previous == OP_NOTI) |
|| *previous == OP_NOT || *previous == OP_NOTI) |
4770 |
{ |
{ |
4771 |
switch (*previous) |
switch (*previous) |
4772 |
{ |
{ |
4773 |
default: /* Make compiler happy. */ |
default: /* Make compiler happy. */ |
4774 |
case OP_CHAR: op_type = OP_STAR - OP_STAR; break; |
case OP_CHAR: op_type = OP_STAR - OP_STAR; break; |
4782 |
hold the length of the character in bytes, plus UTF_LENGTH to flag that |
hold the length of the character in bytes, plus UTF_LENGTH to flag that |
4783 |
it's a length rather than a small character. */ |
it's a length rather than a small character. */ |
4784 |
|
|
4785 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
4786 |
if (utf && NOT_FIRSTCHAR(code[-1])) |
if (utf && NOT_FIRSTCHAR(code[-1])) |
4787 |
{ |
{ |
4788 |
pcre_uchar *lastchar = code - 1; |
pcre_uchar *lastchar = code - 1; |
4918 |
|
|
4919 |
if (repeat_max < 0) |
if (repeat_max < 0) |
4920 |
{ |
{ |
4921 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
4922 |
if (utf && (c & UTF_LENGTH) != 0) |
if (utf && (c & UTF_LENGTH) != 0) |
4923 |
{ |
{ |
4924 |
memcpy(code, utf_chars, IN_UCHARS(c & 7)); |
memcpy(code, utf_chars, IN_UCHARS(c & 7)); |
4943 |
|
|
4944 |
else if (repeat_max != repeat_min) |
else if (repeat_max != repeat_min) |
4945 |
{ |
{ |
4946 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
4947 |
if (utf && (c & UTF_LENGTH) != 0) |
if (utf && (c & UTF_LENGTH) != 0) |
4948 |
{ |
{ |
4949 |
memcpy(code, utf_chars, IN_UCHARS(c & 7)); |
memcpy(code, utf_chars, IN_UCHARS(c & 7)); |
4973 |
|
|
4974 |
/* The character or character type itself comes last in all cases. */ |
/* The character or character type itself comes last in all cases. */ |
4975 |
|
|
4976 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
4977 |
if (utf && (c & UTF_LENGTH) != 0) |
if (utf && (c & UTF_LENGTH) != 0) |
4978 |
{ |
{ |
4979 |
memcpy(code, utf_chars, IN_UCHARS(c & 7)); |
memcpy(code, utf_chars, IN_UCHARS(c & 7)); |
5460 |
else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT) |
else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT) |
5461 |
{ |
{ |
5462 |
tempcode += PRIV(OP_lengths)[*tempcode]; |
tempcode += PRIV(OP_lengths)[*tempcode]; |
5463 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
5464 |
if (utf && HAS_EXTRALEN(tempcode[-1])) |
if (utf && HAS_EXTRALEN(tempcode[-1])) |
5465 |
tempcode += GET_EXTRALEN(tempcode[-1]); |
tempcode += GET_EXTRALEN(tempcode[-1]); |
5466 |
#endif |
#endif |
5558 |
arg = ++ptr; |
arg = ++ptr; |
5559 |
while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; |
while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++; |
5560 |
arglen = (int)(ptr - arg); |
arglen = (int)(ptr - arg); |
5561 |
|
if ((unsigned int)arglen > MAX_MARK) |
5562 |
|
{ |
5563 |
|
*errorcodeptr = ERR75; |
5564 |
|
goto FAILED; |
5565 |
|
} |
5566 |
} |
} |
5567 |
|
|
5568 |
if (*ptr != CHAR_RIGHT_PARENTHESIS) |
if (*ptr != CHAR_RIGHT_PARENTHESIS) |
5578 |
if (namelen == verbs[i].len && |
if (namelen == verbs[i].len && |
5579 |
STRNCMP_UC_C8(name, vn, namelen) == 0) |
STRNCMP_UC_C8(name, vn, namelen) == 0) |
5580 |
{ |
{ |
5581 |
|
int setverb; |
5582 |
|
|
5583 |
/* Check for open captures before ACCEPT and convert it to |
/* Check for open captures before ACCEPT and convert it to |
5584 |
ASSERT_ACCEPT if in an assertion. */ |
ASSERT_ACCEPT if in an assertion. */ |
5585 |
|
|
5597 |
*code++ = OP_CLOSE; |
*code++ = OP_CLOSE; |
5598 |
PUT2INC(code, 0, oc->number); |
PUT2INC(code, 0, oc->number); |
5599 |
} |
} |
5600 |
*code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; |
setverb = *code++ = |
5601 |
|
(cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT; |
5602 |
|
|
5603 |
/* Do not set firstchar after *ACCEPT */ |
/* Do not set firstchar after *ACCEPT */ |
5604 |
if (firstchar == REQ_UNSET) firstchar = REQ_NONE; |
if (firstchar == REQ_UNSET) firstchar = REQ_NONE; |
5613 |
*errorcodeptr = ERR66; |
*errorcodeptr = ERR66; |
5614 |
goto FAILED; |
goto FAILED; |
5615 |
} |
} |
5616 |
*code = verbs[i].op; |
setverb = *code++ = verbs[i].op; |
|
if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN; |
|
5617 |
} |
} |
5618 |
|
|
5619 |
else |
else |
5623 |
*errorcodeptr = ERR59; |
*errorcodeptr = ERR59; |
5624 |
goto FAILED; |
goto FAILED; |
5625 |
} |
} |
5626 |
*code = verbs[i].op_arg; |
setverb = *code++ = verbs[i].op_arg; |
|
if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN; |
|
5627 |
*code++ = arglen; |
*code++ = arglen; |
5628 |
memcpy(code, arg, IN_UCHARS(arglen)); |
memcpy(code, arg, IN_UCHARS(arglen)); |
5629 |
code += arglen; |
code += arglen; |
5630 |
*code++ = 0; |
*code++ = 0; |
5631 |
} |
} |
5632 |
|
|
5633 |
|
switch (setverb) |
5634 |
|
{ |
5635 |
|
case OP_THEN: |
5636 |
|
case OP_THEN_ARG: |
5637 |
|
cd->external_flags |= PCRE_HASTHEN; |
5638 |
|
break; |
5639 |
|
|
5640 |
|
case OP_PRUNE: |
5641 |
|
case OP_PRUNE_ARG: |
5642 |
|
case OP_SKIP: |
5643 |
|
case OP_SKIP_ARG: |
5644 |
|
cd->had_pruneorskip = TRUE; |
5645 |
|
break; |
5646 |
|
} |
5647 |
|
|
5648 |
break; /* Found verb, exit loop */ |
break; /* Found verb, exit loop */ |
5649 |
} |
} |
5650 |
|
|
6830 |
/* For the rest (including \X when Unicode properties are supported), we |
/* For the rest (including \X when Unicode properties are supported), we |
6831 |
can obtain the OP value by negating the escape value in the default |
can obtain the OP value by negating the escape value in the default |
6832 |
situation when PCRE_UCP is not set. When it *is* set, we substitute |
situation when PCRE_UCP is not set. When it *is* set, we substitute |
6833 |
Unicode property tests. Note that \b and \B do a one-character |
Unicode property tests. Note that \b and \B do a one-character |
6834 |
lookbehind. */ |
lookbehind. */ |
6835 |
|
|
6836 |
else |
else |
6837 |
{ |
{ |
6838 |
if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0) |
if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0) |
6839 |
cd->max_lookbehind = 1; |
cd->max_lookbehind = 1; |
6840 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
6841 |
if (-c >= ESC_DU && -c <= ESC_wu) |
if (-c >= ESC_DU && -c <= ESC_wu) |
6842 |
{ |
{ |
6860 |
a value > 127. We set its representation in the length/buffer, and then |
a value > 127. We set its representation in the length/buffer, and then |
6861 |
handle it as a data character. */ |
handle it as a data character. */ |
6862 |
|
|
6863 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
6864 |
if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) |
if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR) |
6865 |
mclength = PRIV(ord2utf)(c, mcbuffer); |
mclength = PRIV(ord2utf)(c, mcbuffer); |
6866 |
else |
else |
6883 |
mclength = 1; |
mclength = 1; |
6884 |
mcbuffer[0] = c; |
mcbuffer[0] = c; |
6885 |
|
|
6886 |
#ifdef SUPPORT_UTF |
#if defined SUPPORT_UTF && !defined COMPILE_PCRE32 |
6887 |
if (utf && HAS_EXTRALEN(c)) |
if (utf && HAS_EXTRALEN(c)) |
6888 |
ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); |
ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr)); |
6889 |
#endif |
#endif |
6893 |
|
|
6894 |
ONE_CHAR: |
ONE_CHAR: |
6895 |
previous = code; |
previous = code; |
6896 |
|
|
6897 |
|
/* For caseless UTF-8 mode when UCP support is available, check whether |
6898 |
|
this character has more than one other case. If so, generate a special |
6899 |
|
OP_PROP item instead of OP_CHARI. */ |
6900 |
|
|
6901 |
|
#ifdef SUPPORT_UCP |
6902 |
|
if (utf && (options & PCRE_CASELESS) != 0) |
6903 |
|
{ |
6904 |
|
GETCHAR(c, mcbuffer); |
6905 |
|
if ((c = UCD_CASESET(c)) != 0) |
6906 |
|
{ |
6907 |
|
*code++ = OP_PROP; |
6908 |
|
*code++ = PT_CLIST; |
6909 |
|
*code++ = c; |
6910 |
|
if (firstchar == REQ_UNSET) firstchar = zerofirstchar = REQ_NONE; |
6911 |
|
break; |
6912 |
|
} |
6913 |
|
} |
6914 |
|
#endif |
6915 |
|
|
6916 |
|
/* Caseful matches, or not one of the multicase characters. */ |
6917 |
|
|
6918 |
*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR; |
*code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR; |
6919 |
for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; |
for (c = 0; c < mclength; c++) *code++ = mcbuffer[c]; |
6920 |
|
|
7166 |
*ptrptr = ptr; |
*ptrptr = ptr; |
7167 |
return FALSE; |
return FALSE; |
7168 |
} |
} |
7169 |
else |
else |
7170 |
{ |
{ |
7171 |
if (fixed_length > cd->max_lookbehind) |
if (fixed_length > cd->max_lookbehind) |
7172 |
cd->max_lookbehind = fixed_length; |
cd->max_lookbehind = fixed_length; |
7173 |
PUT(reverse_count, 0, fixed_length); |
PUT(reverse_count, 0, fixed_length); |
7174 |
} |
} |
7175 |
} |
} |
7176 |
} |
} |
7303 |
However, by keeping a bitmap of the first 31 back references, we can catch some |
However, by keeping a bitmap of the first 31 back references, we can catch some |
7304 |
of the more common cases more precisely. |
of the more common cases more precisely. |
7305 |
|
|
7306 |
|
... A second exception is when the .* appears inside an atomic group, because |
7307 |
|
this prevents the number of characters it matches from being adjusted. |
7308 |
|
|
7309 |
Arguments: |
Arguments: |
7310 |
code points to start of expression (the bracket) |
code points to start of expression (the bracket) |
7311 |
bracket_map a bitmap of which brackets we are inside while testing; this |
bracket_map a bitmap of which brackets we are inside while testing; this |
7312 |
handles up to substring 31; after that we just have to take |
handles up to substring 31; after that we just have to take |
7313 |
the less precise approach |
the less precise approach |
7314 |
backref_map the back reference bitmap |
cd points to the compile data block |
7315 |
|
atomcount atomic group level |
7316 |
|
|
7317 |
Returns: TRUE or FALSE |
Returns: TRUE or FALSE |
7318 |
*/ |
*/ |
7319 |
|
|
7320 |
static BOOL |
static BOOL |
7321 |
is_anchored(register const pcre_uchar *code, unsigned int bracket_map, |
is_anchored(register const pcre_uchar *code, unsigned int bracket_map, |
7322 |
unsigned int backref_map) |
compile_data *cd, int atomcount) |
7323 |
{ |
{ |
7324 |
do { |
do { |
7325 |
const pcre_uchar *scode = first_significant_code( |
const pcre_uchar *scode = first_significant_code( |
7331 |
if (op == OP_BRA || op == OP_BRAPOS || |
if (op == OP_BRA || op == OP_BRAPOS || |
7332 |
op == OP_SBRA || op == OP_SBRAPOS) |
op == OP_SBRA || op == OP_SBRAPOS) |
7333 |
{ |
{ |
7334 |
if (!is_anchored(scode, bracket_map, backref_map)) return FALSE; |
if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE; |
7335 |
} |
} |
7336 |
|
|
7337 |
/* Capturing brackets */ |
/* Capturing brackets */ |
7341 |
{ |
{ |
7342 |
int n = GET2(scode, 1+LINK_SIZE); |
int n = GET2(scode, 1+LINK_SIZE); |
7343 |
int new_map = bracket_map | ((n < 32)? (1 << n) : 1); |
int new_map = bracket_map | ((n < 32)? (1 << n) : 1); |
7344 |
if (!is_anchored(scode, new_map, backref_map)) return FALSE; |
if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE; |
7345 |
} |
} |
7346 |
|
|
7347 |
/* Other brackets */ |
/* Positive forward assertions and conditions */ |
7348 |
|
|
7349 |
else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC || |
else if (op == OP_ASSERT || op == OP_COND) |
|
op == OP_COND) |
|
7350 |
{ |
{ |
7351 |
if (!is_anchored(scode, bracket_map, backref_map)) return FALSE; |
if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE; |
7352 |
|
} |
7353 |
|
|
7354 |
|
/* Atomic groups */ |
7355 |
|
|
7356 |
|
else if (op == OP_ONCE || op == OP_ONCE_NC) |
7357 |
|
{ |
7358 |
|
if (!is_anchored(scode, bracket_map, cd, atomcount + 1)) |
7359 |
|
return FALSE; |
7360 |
} |
} |
7361 |
|
|
7362 |
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and |
/* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and |
7363 |
it isn't in brackets that are or may be referenced. */ |
it isn't in brackets that are or may be referenced or inside an atomic |
7364 |
|
group. */ |
7365 |
|
|
7366 |
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || |
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || |
7367 |
op == OP_TYPEPOSSTAR)) |
op == OP_TYPEPOSSTAR)) |
7368 |
{ |
{ |
7369 |
if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0) |
if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 || |
7370 |
|
atomcount > 0 || cd->had_pruneorskip) |
7371 |
return FALSE; |
return FALSE; |
7372 |
} |
} |
7373 |
|
|
7374 |
/* Check for explicit anchoring */ |
/* Check for explicit anchoring */ |
7375 |
|
|
7376 |
else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; |
else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE; |
7377 |
|
|
7378 |
code += GET(code, 1); |
code += GET(code, 1); |
7379 |
} |
} |
7380 |
while (*code == OP_ALT); /* Loop for each alternative */ |
while (*code == OP_ALT); /* Loop for each alternative */ |
7392 |
matching and for non-DOTALL patterns that start with .* (which must start at |
matching and for non-DOTALL patterns that start with .* (which must start at |
7393 |
the beginning or after \n). As in the case of is_anchored() (see above), we |
the beginning or after \n). As in the case of is_anchored() (see above), we |
7394 |
have to take account of back references to capturing brackets that contain .* |
have to take account of back references to capturing brackets that contain .* |
7395 |
because in that case we can't make the assumption. |
because in that case we can't make the assumption. Also, the appearance of .* |
7396 |
|
inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not |
7397 |
|
count, because once again the assumption no longer holds. |
7398 |
|
|
7399 |
Arguments: |
Arguments: |
7400 |
code points to start of expression (the bracket) |
code points to start of expression (the bracket) |
7401 |
bracket_map a bitmap of which brackets we are inside while testing; this |
bracket_map a bitmap of which brackets we are inside while testing; this |
7402 |
handles up to substring 31; after that we just have to take |
handles up to substring 31; after that we just have to take |
7403 |
the less precise approach |
the less precise approach |
7404 |
backref_map the back reference bitmap |
cd points to the compile data |
7405 |
|
atomcount atomic group level |
7406 |
|
|
7407 |
Returns: TRUE or FALSE |
Returns: TRUE or FALSE |
7408 |
*/ |
*/ |
7409 |
|
|
7410 |
static BOOL |
static BOOL |
7411 |
is_startline(const pcre_uchar *code, unsigned int bracket_map, |
is_startline(const pcre_uchar *code, unsigned int bracket_map, |
7412 |
unsigned int backref_map) |
compile_data *cd, int atomcount) |
7413 |
{ |
{ |
7414 |
do { |
do { |
7415 |
const pcre_uchar *scode = first_significant_code( |
const pcre_uchar *scode = first_significant_code( |
7435 |
return FALSE; |
return FALSE; |
7436 |
|
|
7437 |
default: /* Assertion */ |
default: /* Assertion */ |
7438 |
if (!is_startline(scode, bracket_map, backref_map)) return FALSE; |
if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; |
7439 |
do scode += GET(scode, 1); while (*scode == OP_ALT); |
do scode += GET(scode, 1); while (*scode == OP_ALT); |
7440 |
scode += 1 + LINK_SIZE; |
scode += 1 + LINK_SIZE; |
7441 |
break; |
break; |
7449 |
if (op == OP_BRA || op == OP_BRAPOS || |
if (op == OP_BRA || op == OP_BRAPOS || |
7450 |
op == OP_SBRA || op == OP_SBRAPOS) |
op == OP_SBRA || op == OP_SBRAPOS) |
7451 |
{ |
{ |
7452 |
if (!is_startline(scode, bracket_map, backref_map)) return FALSE; |
if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; |
7453 |
} |
} |
7454 |
|
|
7455 |
/* Capturing brackets */ |
/* Capturing brackets */ |
7459 |
{ |
{ |
7460 |
int n = GET2(scode, 1+LINK_SIZE); |
int n = GET2(scode, 1+LINK_SIZE); |
7461 |
int new_map = bracket_map | ((n < 32)? (1 << n) : 1); |
int new_map = bracket_map | ((n < 32)? (1 << n) : 1); |
7462 |
if (!is_startline(scode, new_map, backref_map)) return FALSE; |
if (!is_startline(scode, new_map, cd, atomcount)) return FALSE; |
7463 |
} |
} |
7464 |
|
|
7465 |
/* Other brackets */ |
/* Positive forward assertions */ |
7466 |
|
|
7467 |
else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC) |
else if (op == OP_ASSERT) |
7468 |
{ |
{ |
7469 |
if (!is_startline(scode, bracket_map, backref_map)) return FALSE; |
if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE; |
7470 |
} |
} |
7471 |
|
|
7472 |
|
/* Atomic brackets */ |
7473 |
|
|
7474 |
/* .* means "start at start or after \n" if it isn't in brackets that |
else if (op == OP_ONCE || op == OP_ONCE_NC) |
7475 |
may be referenced. */ |
{ |
7476 |
|
if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE; |
7477 |
|
} |
7478 |
|
|
7479 |
|
/* .* means "start at start or after \n" if it isn't in atomic brackets or |
7480 |
|
brackets that may be referenced, as long as the pattern does not contain |
7481 |
|
*PRUNE or *SKIP, because these break the feature. Consider, for example, |
7482 |
|
/.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the |
7483 |
|
start of a line. */ |
7484 |
|
|
7485 |
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) |
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) |
7486 |
{ |
{ |
7487 |
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; |
if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 || |
7488 |
|
atomcount > 0 || cd->had_pruneorskip) |
7489 |
|
return FALSE; |
7490 |
} |
} |
7491 |
|
|
7492 |
/* Check for explicit circumflex */ |
/* Check for explicit circumflex; anything else gives a FALSE result. Note |
7493 |
|
in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC |
7494 |
|
because the number of characters matched by .* cannot be adjusted inside |
7495 |
|
them. */ |
7496 |
|
|
7497 |
else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; |
else if (op != OP_CIRC && op != OP_CIRCM) return FALSE; |
7498 |
|
|
7614 |
with errorptr and erroroffset set |
with errorptr and erroroffset set |
7615 |
*/ |
*/ |
7616 |
|
|
7617 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
7618 |
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION |
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION |
7619 |
pcre_compile(const char *pattern, int options, const char **errorptr, |
pcre_compile(const char *pattern, int options, const char **errorptr, |
7620 |
int *erroroffset, const unsigned char *tables) |
int *erroroffset, const unsigned char *tables) |
7621 |
#else |
#elif defined COMPILE_PCRE16 |
7622 |
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION |
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION |
7623 |
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr, |
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr, |
7624 |
int *erroroffset, const unsigned char *tables) |
int *erroroffset, const unsigned char *tables) |
7625 |
|
#elif defined COMPILE_PCRE32 |
7626 |
|
PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION |
7627 |
|
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr, |
7628 |
|
int *erroroffset, const unsigned char *tables) |
7629 |
#endif |
#endif |
7630 |
{ |
{ |
7631 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
7632 |
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); |
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); |
7633 |
#else |
#elif defined COMPILE_PCRE16 |
7634 |
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables); |
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables); |
7635 |
|
#elif defined COMPILE_PCRE32 |
7636 |
|
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables); |
7637 |
#endif |
#endif |
7638 |
} |
} |
7639 |
|
|
7640 |
|
|
7641 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
7642 |
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION |
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION |
7643 |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
pcre_compile2(const char *pattern, int options, int *errorcodeptr, |
7644 |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
7645 |
#else |
#elif defined COMPILE_PCRE16 |
7646 |
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION |
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION |
7647 |
pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr, |
pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr, |
7648 |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
const char **errorptr, int *erroroffset, const unsigned char *tables) |
7649 |
|
#elif defined COMPILE_PCRE32 |
7650 |
|
PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION |
7651 |
|
pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr, |
7652 |
|
const char **errorptr, int *erroroffset, const unsigned char *tables) |
7653 |
#endif |
#endif |
7654 |
{ |
{ |
7655 |
REAL_PCRE *re; |
REAL_PCRE *re; |
7735 |
if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0) |
if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0) |
7736 |
{ skipatstart += 8; options |= PCRE_UTF16; continue; } |
{ skipatstart += 8; options |= PCRE_UTF16; continue; } |
7737 |
#endif |
#endif |
7738 |
|
#ifdef COMPILE_PCRE32 |
7739 |
|
if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0) |
7740 |
|
{ skipatstart += 8; options |= PCRE_UTF32; continue; } |
7741 |
|
#endif |
7742 |
else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0) |
else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0) |
7743 |
{ skipatstart += 6; options |= PCRE_UCP; continue; } |
{ skipatstart += 6; options |= PCRE_UCP; continue; } |
7744 |
else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0) |
else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0) |
7767 |
else break; |
else break; |
7768 |
} |
} |
7769 |
|
|
7770 |
/* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */ |
7771 |
utf = (options & PCRE_UTF8) != 0; |
utf = (options & PCRE_UTF8) != 0; |
7772 |
|
|
7773 |
/* Can't support UTF unless PCRE has been compiled to include the code. The |
/* Can't support UTF unless PCRE has been compiled to include the code. The |
7779 |
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 && |
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 && |
7780 |
(errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) |
(errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) |
7781 |
{ |
{ |
7782 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
7783 |
errorcode = ERR44; |
errorcode = ERR44; |
7784 |
#else |
#elif defined COMPILE_PCRE16 |
7785 |
errorcode = ERR74; |
errorcode = ERR74; |
7786 |
|
#elif defined COMPILE_PCRE32 |
7787 |
|
errorcode = ERR77; |
7788 |
#endif |
#endif |
7789 |
goto PCRE_EARLY_ERROR_RETURN2; |
goto PCRE_EARLY_ERROR_RETURN2; |
7790 |
} |
} |
7948 |
re->ref_count = 0; |
re->ref_count = 0; |
7949 |
re->tables = (tables == PRIV(default_tables))? NULL : tables; |
re->tables = (tables == PRIV(default_tables))? NULL : tables; |
7950 |
re->nullpad = NULL; |
re->nullpad = NULL; |
7951 |
|
#ifdef COMPILE_PCRE32 |
7952 |
|
re->dummy1 = re->dummy2 = 0; |
7953 |
|
#endif |
7954 |
|
|
7955 |
/* The starting points of the name/number translation table and of the code are |
/* The starting points of the name/number translation table and of the code are |
7956 |
passed around in the compile data block. The start/end pattern and initial |
passed around in the compile data block. The start/end pattern and initial |
7970 |
cd->hwm = (pcre_uchar *)(cd->start_workspace); |
cd->hwm = (pcre_uchar *)(cd->start_workspace); |
7971 |
cd->req_varyopt = 0; |
cd->req_varyopt = 0; |
7972 |
cd->had_accept = FALSE; |
cd->had_accept = FALSE; |
7973 |
|
cd->had_pruneorskip = FALSE; |
7974 |
cd->check_lookbehind = FALSE; |
cd->check_lookbehind = FALSE; |
7975 |
cd->open_caps = NULL; |
cd->open_caps = NULL; |
7976 |
|
|
8094 |
} |
} |
8095 |
|
|
8096 |
/* If the anchored option was not passed, set the flag if we can determine that |
/* If the anchored option was not passed, set the flag if we can determine that |
8097 |
the pattern is anchored by virtue of ^ characters or \A or anything else (such |
the pattern is anchored by virtue of ^ characters or \A or anything else, such |
8098 |
as starting with .* when DOTALL is set). |
as starting with non-atomic .* when DOTALL is set and there are no occurrences |
8099 |
|
of *PRUNE or *SKIP. |
8100 |
|
|
8101 |
Otherwise, if we know what the first byte has to be, save it, because that |
Otherwise, if we know what the first byte has to be, save it, because that |
8102 |
speeds up unanchored matches no end. If not, see if we can set the |
speeds up unanchored matches no end. If not, see if we can set the |
8103 |
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches |
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches |
8104 |
start with ^. and also when all branches start with .* for non-DOTALL matches. |
start with ^, and also when all branches start with non-atomic .* for |
8105 |
*/ |
non-DOTALL matches when *PRUNE and *SKIP are not present. */ |
8106 |
|
|
8107 |
if ((re->options & PCRE_ANCHORED) == 0) |
if ((re->options & PCRE_ANCHORED) == 0) |
8108 |
{ |
{ |
8109 |
if (is_anchored(codestart, 0, cd->backref_map)) |
if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED; |
|
re->options |= PCRE_ANCHORED; |
|
8110 |
else |
else |
8111 |
{ |
{ |
8112 |
if (firstchar < 0) |
if (firstchar < 0) |
8113 |
firstchar = find_firstassertedchar(codestart, FALSE); |
firstchar = find_firstassertedchar(codestart, FALSE); |
8114 |
if (firstchar >= 0) /* Remove caseless flag for non-caseable chars */ |
if (firstchar >= 0) /* Remove caseless flag for non-caseable chars */ |
8115 |
{ |
{ |
8116 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
8117 |
re->first_char = firstchar & 0xff; |
re->first_char = firstchar & 0xff; |
8118 |
#else |
#elif defined COMPILE_PCRE16 |
|
#ifdef COMPILE_PCRE16 |
|
8119 |
re->first_char = firstchar & 0xffff; |
re->first_char = firstchar & 0xffff; |
8120 |
#endif |
#elif defined COMPILE_PCRE32 |
8121 |
|
re->first_char = firstchar & ~REQ_MASK; |
8122 |
#endif |
#endif |
8123 |
if ((firstchar & REQ_CASELESS) != 0) |
if ((firstchar & REQ_CASELESS) != 0) |
8124 |
{ |
{ |
8143 |
|
|
8144 |
re->flags |= PCRE_FIRSTSET; |
re->flags |= PCRE_FIRSTSET; |
8145 |
} |
} |
8146 |
else if (is_startline(codestart, 0, cd->backref_map)) |
|
8147 |
re->flags |= PCRE_STARTLINE; |
else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE; |
8148 |
} |
} |
8149 |
} |
} |
8150 |
|
|
8155 |
if (reqchar >= 0 && |
if (reqchar >= 0 && |
8156 |
((re->options & PCRE_ANCHORED) == 0 || (reqchar & REQ_VARY) != 0)) |
((re->options & PCRE_ANCHORED) == 0 || (reqchar & REQ_VARY) != 0)) |
8157 |
{ |
{ |
8158 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
8159 |
re->req_char = reqchar & 0xff; |
re->req_char = reqchar & 0xff; |
8160 |
#else |
#elif defined COMPILE_PCRE16 |
|
#ifdef COMPILE_PCRE16 |
|
8161 |
re->req_char = reqchar & 0xffff; |
re->req_char = reqchar & 0xffff; |
8162 |
#endif |
#elif defined COMPILE_PCRE32 |
8163 |
|
re->req_char = reqchar & ~REQ_MASK; |
8164 |
#endif |
#endif |
8165 |
if ((reqchar & REQ_CASELESS) != 0) |
if ((reqchar & REQ_CASELESS) != 0) |
8166 |
{ |
{ |
8212 |
else printf("Req char = \\x%02x%s\n", ch, caseless); |
else printf("Req char = \\x%02x%s\n", ch, caseless); |
8213 |
} |
} |
8214 |
|
|
8215 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
8216 |
pcre_printint((pcre *)re, stdout, TRUE); |
pcre_printint((pcre *)re, stdout, TRUE); |
8217 |
#else |
#elif defined COMPILE_PCRE16 |
8218 |
pcre16_printint((pcre *)re, stdout, TRUE); |
pcre16_printint((pcre *)re, stdout, TRUE); |
8219 |
|
#elif defined COMPILE_PCRE32 |
8220 |
|
pcre32_printint((pcre *)re, stdout, TRUE); |
8221 |
#endif |
#endif |
8222 |
|
|
8223 |
/* This check is done here in the debugging case so that the code that |
/* This check is done here in the debugging case so that the code that |
8233 |
} |
} |
8234 |
#endif /* PCRE_DEBUG */ |
#endif /* PCRE_DEBUG */ |
8235 |
|
|
8236 |
#ifdef COMPILE_PCRE8 |
#if defined COMPILE_PCRE8 |
8237 |
return (pcre *)re; |
return (pcre *)re; |
8238 |
#else |
#elif defined COMPILE_PCRE16 |
8239 |
return (pcre16 *)re; |
return (pcre16 *)re; |
8240 |
|
#elif defined COMPILE_PCRE32 |
8241 |
|
return (pcre32 *)re; |
8242 |
#endif |
#endif |
8243 |
} |
} |
8244 |
|
|