470 |
"\\k is not followed by a braced, angle-bracketed, or quoted name\0" |
"\\k is not followed by a braced, angle-bracketed, or quoted name\0" |
471 |
/* 70 */ |
/* 70 */ |
472 |
"internal error: unknown opcode in find_fixedlength()\0" |
"internal error: unknown opcode in find_fixedlength()\0" |
473 |
|
"Not allowed UTF-8 / UTF-16 code point (>= 0xd800 && <= 0xdfff)\0" |
474 |
; |
; |
475 |
|
|
476 |
/* Table to identify digits and hex digits. This is used when compiling |
/* Table to identify digits and hex digits. This is used when compiling |
539 |
|
|
540 |
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ |
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */ |
541 |
|
|
542 |
static const pcre_unit8 digitab[] = |
static const pcre_uint8 digitab[] = |
543 |
{ |
{ |
544 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */ |
545 |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ |
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */ |
707 |
check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount, |
check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount, |
708 |
int options, BOOL isclass) |
int options, BOOL isclass) |
709 |
{ |
{ |
710 |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
/* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
711 |
|
BOOL utf = (options & PCRE_UTF8) != 0; |
712 |
const pcre_uchar *ptr = *ptrptr + 1; |
const pcre_uchar *ptr = *ptrptr + 1; |
713 |
int c, i; |
pcre_int32 c; |
714 |
|
int i; |
715 |
|
|
716 |
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ |
717 |
ptr--; /* Set pointer back to the last byte */ |
ptr--; /* Set pointer back to the last byte */ |
943 |
c -= CHAR_0; |
c -= CHAR_0; |
944 |
while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) |
while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) |
945 |
c = c * 8 + *(++ptr) - CHAR_0; |
c = c * 8 + *(++ptr) - CHAR_0; |
946 |
if (!utf8 && c > 0xff) *errorcodeptr = ERR51; |
if (!utf && c > 0xff) *errorcodeptr = ERR51; |
947 |
break; |
break; |
948 |
|
|
949 |
/* \x is complicated. \x{ddd} is a character number which can be greater |
/* \x is complicated. \x{ddd} is a character number which can be greater |
950 |
than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is |
than 0xff in utf or non-8bit mode, but only if the ddd are hex digits. |
951 |
treated as a data character. */ |
If not, { is treated as a data character. */ |
952 |
|
|
953 |
case CHAR_x: |
case CHAR_x: |
954 |
if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) |
if ((options & PCRE_JAVASCRIPT_COMPAT) != 0) |
977 |
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
if (ptr[1] == CHAR_LEFT_CURLY_BRACKET) |
978 |
{ |
{ |
979 |
const pcre_uchar *pt = ptr + 2; |
const pcre_uchar *pt = ptr + 2; |
|
int count = 0; |
|
980 |
|
|
981 |
c = 0; |
c = 0; |
982 |
while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) |
while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) |
983 |
{ |
{ |
984 |
register int cc = *pt++; |
register int cc = *pt++; |
985 |
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ |
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */ |
|
count++; |
|
986 |
|
|
987 |
#ifndef EBCDIC /* ASCII/UTF-8 coding */ |
#ifndef EBCDIC /* ASCII/UTF-8 coding */ |
988 |
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ |
if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */ |
991 |
if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ |
if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */ |
992 |
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10)); |
993 |
#endif |
#endif |
|
} |
|
994 |
|
|
|
if (*pt == CHAR_RIGHT_CURLY_BRACKET) |
|
|
{ |
|
995 |
#ifdef COMPILE_PCRE8 |
#ifdef COMPILE_PCRE8 |
996 |
if (c < 0 || count > (utf8? 8:2)) *errorcodeptr = ERR34; |
if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; } |
997 |
#else |
#else |
998 |
#ifdef COMPILE_PCRE16 |
#ifdef COMPILE_PCRE16 |
999 |
if (c < 0 || count > (utf8? 8:4)) *errorcodeptr = ERR34; |
if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; } |
1000 |
#endif |
#endif |
1001 |
#endif |
#endif |
1002 |
|
} |
1003 |
|
|
1004 |
|
if (c < 0) |
1005 |
|
{ |
1006 |
|
while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++; |
1007 |
|
*errorcodeptr = ERR34; |
1008 |
|
} |
1009 |
|
|
1010 |
|
if (*pt == CHAR_RIGHT_CURLY_BRACKET) |
1011 |
|
{ |
1012 |
|
if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR71; |
1013 |
ptr = pt; |
ptr = pt; |
1014 |
break; |
break; |
1015 |
} |
} |
1290 |
name name to seek, or NULL if seeking a numbered subpattern |
name name to seek, or NULL if seeking a numbered subpattern |
1291 |
lorn name length, or subpattern number if name is NULL |
lorn name length, or subpattern number if name is NULL |
1292 |
xmode TRUE if we are in /x mode |
xmode TRUE if we are in /x mode |
1293 |
utf8 TRUE if we are in UTF-8 mode |
utf TRUE if we are in UTF-8 / UTF-16 mode |
1294 |
count pointer to the current capturing subpattern number (updated) |
count pointer to the current capturing subpattern number (updated) |
1295 |
|
|
1296 |
Returns: the number of the named subpattern, or -1 if not found |
Returns: the number of the named subpattern, or -1 if not found |
1298 |
|
|
1299 |
static int |
static int |
1300 |
find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn, |
find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn, |
1301 |
BOOL xmode, BOOL utf8, int *count) |
BOOL xmode, BOOL utf, int *count) |
1302 |
{ |
{ |
1303 |
pcre_uchar *ptr = *ptrptr; |
pcre_uchar *ptr = *ptrptr; |
1304 |
int start_count = *count; |
int start_count = *count; |
1467 |
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } |
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } |
1468 |
ptr++; |
ptr++; |
1469 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1470 |
if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; |
if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; |
1471 |
#endif |
#endif |
1472 |
} |
} |
1473 |
if (*ptr == 0) goto FAIL_EXIT; |
if (*ptr == 0) goto FAIL_EXIT; |
1478 |
|
|
1479 |
if (*ptr == CHAR_LEFT_PARENTHESIS) |
if (*ptr == CHAR_LEFT_PARENTHESIS) |
1480 |
{ |
{ |
1481 |
int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count); |
int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count); |
1482 |
if (rc > 0) return rc; |
if (rc > 0) return rc; |
1483 |
if (*ptr == 0) goto FAIL_EXIT; |
if (*ptr == 0) goto FAIL_EXIT; |
1484 |
} |
} |
1524 |
name name to seek, or NULL if seeking a numbered subpattern |
name name to seek, or NULL if seeking a numbered subpattern |
1525 |
lorn name length, or subpattern number if name is NULL |
lorn name length, or subpattern number if name is NULL |
1526 |
xmode TRUE if we are in /x mode |
xmode TRUE if we are in /x mode |
1527 |
utf8 TRUE if we are in UTF-8 mode |
utf TRUE if we are in UTF-8 / UTF-16 mode |
1528 |
|
|
1529 |
Returns: the number of the found subpattern, or -1 if not found |
Returns: the number of the found subpattern, or -1 if not found |
1530 |
*/ |
*/ |
1531 |
|
|
1532 |
static int |
static int |
1533 |
find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode, |
find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode, |
1534 |
BOOL utf8) |
BOOL utf) |
1535 |
{ |
{ |
1536 |
pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern; |
pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern; |
1537 |
int count = 0; |
int count = 0; |
1544 |
|
|
1545 |
for (;;) |
for (;;) |
1546 |
{ |
{ |
1547 |
rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count); |
rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count); |
1548 |
if (rc > 0 || *ptr++ == 0) break; |
if (rc > 0 || *ptr++ == 0) break; |
1549 |
} |
} |
1550 |
|
|
1627 |
|
|
1628 |
Arguments: |
Arguments: |
1629 |
code points to the start of the pattern (the bracket) |
code points to the start of the pattern (the bracket) |
1630 |
utf8 TRUE in UTF-8 mode |
utf TRUE in UTF-8 / UTF-16 mode |
1631 |
atend TRUE if called when the pattern is complete |
atend TRUE if called when the pattern is complete |
1632 |
cd the "compile data" structure |
cd the "compile data" structure |
1633 |
|
|
1639 |
*/ |
*/ |
1640 |
|
|
1641 |
static int |
static int |
1642 |
find_fixedlength(pcre_uchar *code, BOOL utf8, BOOL atend, compile_data *cd) |
find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd) |
1643 |
{ |
{ |
1644 |
int length = -1; |
int length = -1; |
1645 |
|
|
1666 |
case OP_ONCE: |
case OP_ONCE: |
1667 |
case OP_ONCE_NC: |
case OP_ONCE_NC: |
1668 |
case OP_COND: |
case OP_COND: |
1669 |
d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf8, atend, cd); |
d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd); |
1670 |
if (d < 0) return d; |
if (d < 0) return d; |
1671 |
branchlength += d; |
branchlength += d; |
1672 |
do cc += GET(cc, 1); while (*cc == OP_ALT); |
do cc += GET(cc, 1); while (*cc == OP_ALT); |
1700 |
cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */ |
cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1); /* Start subpattern */ |
1701 |
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ |
do ce += GET(ce, 1); while (*ce == OP_ALT); /* End subpattern */ |
1702 |
if (cc > cs && cc < ce) return -1; /* Recursion */ |
if (cc > cs && cc < ce) return -1; /* Recursion */ |
1703 |
d = find_fixedlength(cs + 2, utf8, atend, cd); |
d = find_fixedlength(cs + 2, utf, atend, cd); |
1704 |
if (d < 0) return d; |
if (d < 0) return d; |
1705 |
branchlength += d; |
branchlength += d; |
1706 |
cc += 1 + LINK_SIZE; |
cc += 1 + LINK_SIZE; |
1760 |
branchlength++; |
branchlength++; |
1761 |
cc += 2; |
cc += 2; |
1762 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1763 |
if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
1764 |
#endif |
#endif |
1765 |
break; |
break; |
1766 |
|
|
1774 |
branchlength += GET2(cc,1); |
branchlength += GET2(cc,1); |
1775 |
cc += 2 + IMM2_SIZE; |
cc += 2 + IMM2_SIZE; |
1776 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1777 |
if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
1778 |
#endif |
#endif |
1779 |
break; |
break; |
1780 |
|
|
1954 |
|
|
1955 |
Arguments: |
Arguments: |
1956 |
code points to start of expression |
code points to start of expression |
1957 |
utf8 TRUE in UTF-8 mode |
utf TRUE in UTF-8 / UTF-16 mode |
1958 |
number the required bracket number or negative to find a lookbehind |
number the required bracket number or negative to find a lookbehind |
1959 |
|
|
1960 |
Returns: pointer to the opcode for the bracket, or NULL if not found |
Returns: pointer to the opcode for the bracket, or NULL if not found |
1961 |
*/ |
*/ |
1962 |
|
|
1963 |
const pcre_uchar * |
const pcre_uchar * |
1964 |
PRIV(find_bracket)(const pcre_uchar *code, BOOL utf8, int number) |
PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number) |
1965 |
{ |
{ |
1966 |
for (;;) |
for (;;) |
1967 |
{ |
{ |
2042 |
arrange to skip the extra bytes. */ |
arrange to skip the extra bytes. */ |
2043 |
|
|
2044 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
2045 |
if (utf8) switch(c) |
if (utf) switch(c) |
2046 |
{ |
{ |
2047 |
case OP_CHAR: |
case OP_CHAR: |
2048 |
case OP_CHARI: |
case OP_CHARI: |
2076 |
break; |
break; |
2077 |
} |
} |
2078 |
#else |
#else |
2079 |
(void)(utf8); /* Keep compiler happy by referencing function argument */ |
(void)(utf); /* Keep compiler happy by referencing function argument */ |
2080 |
#endif |
#endif |
2081 |
} |
} |
2082 |
} |
} |
2093 |
|
|
2094 |
Arguments: |
Arguments: |
2095 |
code points to start of expression |
code points to start of expression |
2096 |
utf8 TRUE in UTF-8 mode |
utf TRUE in UTF-8 / UTF-16 mode |
2097 |
|
|
2098 |
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found |
Returns: pointer to the opcode for OP_RECURSE, or NULL if not found |
2099 |
*/ |
*/ |
2100 |
|
|
2101 |
static const pcre_uchar * |
static const pcre_uchar * |
2102 |
find_recurse(const pcre_uchar *code, BOOL utf8) |
find_recurse(const pcre_uchar *code, BOOL utf) |
2103 |
{ |
{ |
2104 |
for (;;) |
for (;;) |
2105 |
{ |
{ |
2162 |
to arrange to skip the extra bytes. */ |
to arrange to skip the extra bytes. */ |
2163 |
|
|
2164 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
2165 |
if (utf8) switch(c) |
if (utf) switch(c) |
2166 |
{ |
{ |
2167 |
case OP_CHAR: |
case OP_CHAR: |
2168 |
case OP_CHARI: |
case OP_CHARI: |
2196 |
break; |
break; |
2197 |
} |
} |
2198 |
#else |
#else |
2199 |
(void)(utf8); /* Keep compiler happy by referencing function argument */ |
(void)(utf); /* Keep compiler happy by referencing function argument */ |
2200 |
#endif |
#endif |
2201 |
} |
} |
2202 |
} |
} |
2219 |
Arguments: |
Arguments: |
2220 |
code points to start of search |
code points to start of search |
2221 |
endcode points to where to stop |
endcode points to where to stop |
2222 |
utf8 TRUE if in UTF8 mode |
utf TRUE if in UTF-8 / UTF-16 mode |
2223 |
cd contains pointers to tables etc. |
cd contains pointers to tables etc. |
2224 |
|
|
2225 |
Returns: TRUE if what is matched could be empty |
Returns: TRUE if what is matched could be empty |
2227 |
|
|
2228 |
static BOOL |
static BOOL |
2229 |
could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, |
could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode, |
2230 |
BOOL utf8, compile_data *cd) |
BOOL utf, compile_data *cd) |
2231 |
{ |
{ |
2232 |
register int c; |
register int c; |
2233 |
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); |
for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE); |
2275 |
|
|
2276 |
do |
do |
2277 |
{ |
{ |
2278 |
if (could_be_empty_branch(scode, endcode, utf8, cd)) |
if (could_be_empty_branch(scode, endcode, utf, cd)) |
2279 |
{ |
{ |
2280 |
empty_branch = TRUE; |
empty_branch = TRUE; |
2281 |
break; |
break; |
2331 |
empty_branch = FALSE; |
empty_branch = FALSE; |
2332 |
do |
do |
2333 |
{ |
{ |
2334 |
if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd)) |
if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd)) |
2335 |
empty_branch = TRUE; |
empty_branch = TRUE; |
2336 |
code += GET(code, 1); |
code += GET(code, 1); |
2337 |
} |
} |
2465 |
case OP_MINQUERYI: |
case OP_MINQUERYI: |
2466 |
case OP_POSQUERY: |
case OP_POSQUERY: |
2467 |
case OP_POSQUERYI: |
case OP_POSQUERYI: |
2468 |
if (utf8 && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f]; |
if (utf && code[1] >= 0xc0) code += PRIV(utf8_table4)[code[1] & 0x3f]; |
2469 |
break; |
break; |
2470 |
|
|
2471 |
case OP_UPTO: |
case OP_UPTO: |
2474 |
case OP_MINUPTOI: |
case OP_MINUPTOI: |
2475 |
case OP_POSUPTO: |
case OP_POSUPTO: |
2476 |
case OP_POSUPTOI: |
case OP_POSUPTOI: |
2477 |
if (utf8 && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f]; |
if (utf && code[1 + IMM2_SIZE] >= 0xc0) code += PRIV(utf8_table4)[code[1 + IMM2_SIZE] & 0x3f]; |
2478 |
break; |
break; |
2479 |
#endif |
#endif |
2480 |
|
|
2518 |
code points to start of the recursion |
code points to start of the recursion |
2519 |
endcode points to where to stop (current RECURSE item) |
endcode points to where to stop (current RECURSE item) |
2520 |
bcptr points to the chain of current (unclosed) branch starts |
bcptr points to the chain of current (unclosed) branch starts |
2521 |
utf8 TRUE if in UTF-8 mode |
utf TRUE if in UTF-8 / UTF-16 mode |
2522 |
cd pointers to tables etc |
cd pointers to tables etc |
2523 |
|
|
2524 |
Returns: TRUE if what is matched could be empty |
Returns: TRUE if what is matched could be empty |
2526 |
|
|
2527 |
static BOOL |
static BOOL |
2528 |
could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode, |
could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode, |
2529 |
branch_chain *bcptr, BOOL utf8, compile_data *cd) |
branch_chain *bcptr, BOOL utf, compile_data *cd) |
2530 |
{ |
{ |
2531 |
while (bcptr != NULL && bcptr->current_branch >= code) |
while (bcptr != NULL && bcptr->current_branch >= code) |
2532 |
{ |
{ |
2533 |
if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd)) |
if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd)) |
2534 |
return FALSE; |
return FALSE; |
2535 |
bcptr = bcptr->outer; |
bcptr = bcptr->outer; |
2536 |
} |
} |
2665 |
Arguments: |
Arguments: |
2666 |
group points to the start of the group |
group points to the start of the group |
2667 |
adjust the amount by which the group is to be moved |
adjust the amount by which the group is to be moved |
2668 |
utf8 TRUE in UTF-8 mode |
utf TRUE in UTF-8 / UTF-16 mode |
2669 |
cd contains pointers to tables etc. |
cd contains pointers to tables etc. |
2670 |
save_hwm the hwm forward reference pointer at the start of the group |
save_hwm the hwm forward reference pointer at the start of the group |
2671 |
|
|
2673 |
*/ |
*/ |
2674 |
|
|
2675 |
static void |
static void |
2676 |
adjust_recurse(pcre_uchar *group, int adjust, BOOL utf8, compile_data *cd, |
adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd, |
2677 |
pcre_uchar *save_hwm) |
pcre_uchar *save_hwm) |
2678 |
{ |
{ |
2679 |
pcre_uchar *ptr = group; |
pcre_uchar *ptr = group; |
2680 |
|
|
2681 |
while ((ptr = (pcre_uchar *)find_recurse(ptr, utf8)) != NULL) |
while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL) |
2682 |
{ |
{ |
2683 |
int offset; |
int offset; |
2684 |
pcre_uchar *hc; |
pcre_uchar *hc; |
2884 |
|
|
2885 |
Arguments: |
Arguments: |
2886 |
previous pointer to the repeated opcode |
previous pointer to the repeated opcode |
2887 |
utf8 TRUE in UTF-8 mode |
utf TRUE in UTF-8 / UTF-16 mode |
2888 |
ptr next character in pattern |
ptr next character in pattern |
2889 |
options options bits |
options options bits |
2890 |
cd contains pointers to tables etc. |
cd contains pointers to tables etc. |
2893 |
*/ |
*/ |
2894 |
|
|
2895 |
static BOOL |
static BOOL |
2896 |
check_auto_possessive(const pcre_uchar *previous, BOOL utf8, |
check_auto_possessive(const pcre_uchar *previous, BOOL utf, |
2897 |
const pcre_uchar *ptr, int options, compile_data *cd) |
const pcre_uchar *ptr, int options, compile_data *cd) |
2898 |
{ |
{ |
2899 |
int c, next; |
int c, next; |
2914 |
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } |
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } |
2915 |
ptr++; |
ptr++; |
2916 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
2917 |
if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; |
if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; |
2918 |
#endif |
#endif |
2919 |
} |
} |
2920 |
} |
} |
2936 |
else if ((cd->ctypes[*ptr] & ctype_meta) == 0) |
else if ((cd->ctypes[*ptr] & ctype_meta) == 0) |
2937 |
{ |
{ |
2938 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
2939 |
if (utf8) { GETCHARINC(next, ptr); } else |
if (utf) { GETCHARINC(next, ptr); } else |
2940 |
#endif |
#endif |
2941 |
next = *ptr++; |
next = *ptr++; |
2942 |
} |
} |
2958 |
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } |
if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } |
2959 |
ptr++; |
ptr++; |
2960 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
2961 |
if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; |
if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; |
2962 |
#endif |
#endif |
2963 |
} |
} |
2964 |
} |
} |
2997 |
#endif |
#endif |
2998 |
if (c == next) return FALSE; |
if (c == next) return FALSE; |
2999 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
3000 |
if (utf8) |
if (utf) |
3001 |
{ |
{ |
3002 |
unsigned int othercase; |
unsigned int othercase; |
3003 |
if (next < 128) othercase = cd->fcc[next]; else |
if (next < 128) othercase = cd->fcc[next]; else |
3022 |
case OP_NOTI: |
case OP_NOTI: |
3023 |
if ((c = *previous) == next) return TRUE; |
if ((c = *previous) == next) return TRUE; |
3024 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
3025 |
if (utf8) |
if (utf) |
3026 |
{ |
{ |
3027 |
unsigned int othercase; |
unsigned int othercase; |
3028 |
if (next < 128) othercase = cd->fcc[next]; else |
if (next < 128) othercase = cd->fcc[next]; else |
3357 |
dynamically as we process the pattern. */ |
dynamically as we process the pattern. */ |
3358 |
|
|
3359 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
3360 |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
/* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
3361 |
pcre_uint8 utf8_char[6]; |
BOOL utf = (options & PCRE_UTF8) != 0; |
3362 |
|
pcre_uchar utf_chars[6]; |
3363 |
#else |
#else |
3364 |
BOOL utf8 = FALSE; |
BOOL utf = FALSE; |
3365 |
#endif |
#endif |
3366 |
|
|
3367 |
/* Helper variables for OP_XCLASS opcode (for characters > 255). */ |
/* Helper variables for OP_XCLASS opcode (for characters > 255). */ |
3469 |
} |
} |
3470 |
|
|
3471 |
*lengthptr += (int)(code - last_code); |
*lengthptr += (int)(code - last_code); |
3472 |
DPRINTF(("length=%d added %d c=%c\n", *lengthptr, (int)(code - last_code), |
DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr, |
3473 |
c)); |
(int)(code - last_code), c, c)); |
3474 |
|
|
3475 |
/* If "previous" is set and it is not at the start of the work space, move |
/* If "previous" is set and it is not at the start of the work space, move |
3476 |
it back to there, in order to avoid filling up the work space. Otherwise, |
it back to there, in order to avoid filling up the work space. Otherwise, |
3557 |
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } |
if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } |
3558 |
ptr++; |
ptr++; |
3559 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
3560 |
if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++; |
if (utf) while ((*ptr & 0xc0) == 0x80) ptr++; |
3561 |
#endif |
#endif |
3562 |
} |
} |
3563 |
if (*ptr != 0) continue; |
if (*ptr != 0) continue; |
3737 |
const pcre_uchar *oldptr; |
const pcre_uchar *oldptr; |
3738 |
|
|
3739 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
3740 |
if (utf8 && c > 127) |
if (utf && c > 127) |
3741 |
{ /* Braces are required because the */ |
{ /* Braces are required because the */ |
3742 |
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ |
3743 |
} |
} |
3955 |
SETBIT(classbits, 0x20); /* SPACE */ |
SETBIT(classbits, 0x20); /* SPACE */ |
3956 |
SETBIT(classbits, 0xa0); /* NSBP */ |
SETBIT(classbits, 0xa0); /* NSBP */ |
3957 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
3958 |
if (utf8) |
if (utf) |
3959 |
{ |
{ |
3960 |
xclass = TRUE; |
xclass = TRUE; |
3961 |
*class_uchardata++ = XCL_SINGLE; |
*class_uchardata++ = XCL_SINGLE; |
3962 |
class_uchardata += PRIV(ord2utf8)(0x1680, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata); |
3963 |
*class_uchardata++ = XCL_SINGLE; |
*class_uchardata++ = XCL_SINGLE; |
3964 |
class_uchardata += PRIV(ord2utf8)(0x180e, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata); |
3965 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
3966 |
class_uchardata += PRIV(ord2utf8)(0x2000, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata); |
3967 |
class_uchardata += PRIV(ord2utf8)(0x200A, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x200A, class_uchardata); |
3968 |
*class_uchardata++ = XCL_SINGLE; |
*class_uchardata++ = XCL_SINGLE; |
3969 |
class_uchardata += PRIV(ord2utf8)(0x202f, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata); |
3970 |
*class_uchardata++ = XCL_SINGLE; |
*class_uchardata++ = XCL_SINGLE; |
3971 |
class_uchardata += PRIV(ord2utf8)(0x205f, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata); |
3972 |
*class_uchardata++ = XCL_SINGLE; |
*class_uchardata++ = XCL_SINGLE; |
3973 |
class_uchardata += PRIV(ord2utf8)(0x3000, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata); |
3974 |
} |
} |
3975 |
#endif |
#endif |
3976 |
continue; |
continue; |
3990 |
} |
} |
3991 |
|
|
3992 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
3993 |
if (utf8) |
if (utf) |
3994 |
{ |
{ |
3995 |
xclass = TRUE; |
xclass = TRUE; |
3996 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
3997 |
class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); |
3998 |
class_uchardata += PRIV(ord2utf8)(0x167f, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata); |
3999 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4000 |
class_uchardata += PRIV(ord2utf8)(0x1681, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata); |
4001 |
class_uchardata += PRIV(ord2utf8)(0x180d, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata); |
4002 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4003 |
class_uchardata += PRIV(ord2utf8)(0x180f, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata); |
4004 |
class_uchardata += PRIV(ord2utf8)(0x1fff, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata); |
4005 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4006 |
class_uchardata += PRIV(ord2utf8)(0x200B, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x200B, class_uchardata); |
4007 |
class_uchardata += PRIV(ord2utf8)(0x202e, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata); |
4008 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4009 |
class_uchardata += PRIV(ord2utf8)(0x2030, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata); |
4010 |
class_uchardata += PRIV(ord2utf8)(0x205e, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata); |
4011 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4012 |
class_uchardata += PRIV(ord2utf8)(0x2060, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata); |
4013 |
class_uchardata += PRIV(ord2utf8)(0x2fff, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata); |
4014 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4015 |
class_uchardata += PRIV(ord2utf8)(0x3001, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata); |
4016 |
class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); |
4017 |
} |
} |
4018 |
#endif |
#endif |
4019 |
continue; |
continue; |
4025 |
SETBIT(classbits, 0x0d); /* CR */ |
SETBIT(classbits, 0x0d); /* CR */ |
4026 |
SETBIT(classbits, 0x85); /* NEL */ |
SETBIT(classbits, 0x85); /* NEL */ |
4027 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
4028 |
if (utf8) |
if (utf) |
4029 |
{ |
{ |
4030 |
xclass = TRUE; |
xclass = TRUE; |
4031 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4032 |
class_uchardata += PRIV(ord2utf8)(0x2028, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata); |
4033 |
class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata); |
4034 |
} |
} |
4035 |
#endif |
#endif |
4036 |
continue; |
continue; |
4053 |
} |
} |
4054 |
|
|
4055 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
4056 |
if (utf8) |
if (utf) |
4057 |
{ |
{ |
4058 |
xclass = TRUE; |
xclass = TRUE; |
4059 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4060 |
class_uchardata += PRIV(ord2utf8)(0x0100, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata); |
4061 |
class_uchardata += PRIV(ord2utf8)(0x2027, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata); |
4062 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4063 |
class_uchardata += PRIV(ord2utf8)(0x2029, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata); |
4064 |
class_uchardata += PRIV(ord2utf8)(0x7fffffff, class_uchardata); |
class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata); |
4065 |
} |
} |
4066 |
#endif |
#endif |
4067 |
continue; |
continue; |
4149 |
} |
} |
4150 |
|
|
4151 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
4152 |
if (utf8) |
if (utf) |
4153 |
{ /* Braces are required because the */ |
{ /* Braces are required because the */ |
4154 |
GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ |
GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ |
4155 |
} |
} |
4199 |
available. */ |
available. */ |
4200 |
|
|
4201 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
4202 |
if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) |
if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) |
4203 |
#endif |
#endif |
4204 |
#ifndef COMPILE_PCRE8 |
#ifndef COMPILE_PCRE8 |
4205 |
if (d > 255) |
if (d > 255) |
4244 |
else |
else |
4245 |
{ |
{ |
4246 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4247 |
class_uchardata += PRIV(ord2utf8)(occ, class_uchardata); |
class_uchardata += PRIV(ord2utf)(occ, class_uchardata); |
4248 |
} |
} |
4249 |
class_uchardata += PRIV(ord2utf8)(ocd, class_uchardata); |
class_uchardata += PRIV(ord2utf)(ocd, class_uchardata); |
4250 |
} |
} |
4251 |
} |
} |
4252 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
4256 |
|
|
4257 |
*class_uchardata++ = XCL_RANGE; |
*class_uchardata++ = XCL_RANGE; |
4258 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
4259 |
class_uchardata += PRIV(ord2utf8)(c, class_uchardata); |
class_uchardata += PRIV(ord2utf)(c, class_uchardata); |
4260 |
class_uchardata += PRIV(ord2utf8)(d, class_uchardata); |
class_uchardata += PRIV(ord2utf)(d, class_uchardata); |
4261 |
#else |
#else |
4262 |
*class_uchardata++ = c; |
*class_uchardata++ = c; |
4263 |
*class_uchardata++ = d; |
*class_uchardata++ = d; |
4314 |
/* Handle a character that cannot go in the bit map */ |
/* Handle a character that cannot go in the bit map */ |
4315 |
|
|
4316 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
4317 |
if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) |
if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) |
4318 |
#endif |
#endif |
4319 |
#ifndef COMPILE_PCRE8 |
#ifndef COMPILE_PCRE8 |
4320 |
if (c > 255) |
if (c > 255) |
4324 |
xclass = TRUE; |
xclass = TRUE; |
4325 |
*class_uchardata++ = XCL_SINGLE; |
*class_uchardata++ = XCL_SINGLE; |
4326 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
4327 |
class_uchardata += PRIV(ord2utf8)(c, class_uchardata); |
class_uchardata += PRIV(ord2utf)(c, class_uchardata); |
4328 |
#else |
#else |
4329 |
*class_uchardata++ = c; |
*class_uchardata++ = c; |
4330 |
#endif |
#endif |
4336 |
if ((othercase = UCD_OTHERCASE(c)) != c) |
if ((othercase = UCD_OTHERCASE(c)) != c) |
4337 |
{ |
{ |
4338 |
*class_uchardata++ = XCL_SINGLE; |
*class_uchardata++ = XCL_SINGLE; |
4339 |
class_uchardata += PRIV(ord2utf8)(othercase, class_uchardata); |
class_uchardata += PRIV(ord2utf)(othercase, class_uchardata); |
4340 |
} |
} |
4341 |
} |
} |
4342 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
4394 |
|
|
4395 |
#ifdef SUPPORT_UTF |
#ifdef SUPPORT_UTF |
4396 |
if (class_charcount == 1 && !xclass && |
if (class_charcount == 1 && !xclass && |
4397 |
(!utf8 || !negate_class || class_lastchar < 128)) |
(!utf || !negate_class || class_lastchar < 128)) |
|
#elif defined COMPILE_PCRE8 |
|
|
if (class_charcount == 1) |
|
4398 |
#else |
#else |
4399 |
if (class_charcount == 1 && !xclass) |
if (class_charcount == 1) |
4400 |
#endif |
#endif |
4401 |
{ |
{ |
4402 |
zeroreqchar = reqchar; |
zeroreqchar = reqchar; |
4416 |
then we can handle this with the normal one-character code. */ |
then we can handle this with the normal one-character code. */ |
4417 |
|
|
4418 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
4419 |
if (utf8 && class_lastchar > 127) |
if (utf && class_lastchar > 127) |
4420 |
mclength = PRIV(ord2utf8)(class_lastchar, mcbuffer); |
mclength = PRIV(ord2utf)(class_lastchar, mcbuffer); |
4421 |
else |
else |
4422 |
#endif |
#endif |
4423 |
{ |
{ |
4607 |
length rather than a small character. */ |
length rather than a small character. */ |
4608 |
|
|
4609 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
4610 |
if (utf8 && (code[-1] & 0x80) != 0) |
if (utf && (code[-1] & 0x80) != 0) |
4611 |
{ |
{ |
4612 |
pcre_uchar *lastchar = code - 1; |
pcre_uchar *lastchar = code - 1; |
4613 |
while((*lastchar & 0xc0) == 0x80) lastchar--; |
while((*lastchar & 0xc0) == 0x80) lastchar--; |
4614 |
c = code - lastchar; /* Length of UTF-8 character */ |
c = code - lastchar; /* Length of UTF-8 character */ |
4615 |
memcpy(utf8_char, lastchar, c); /* Save the char */ |
memcpy(utf_chars, lastchar, c); /* Save the char */ |
4616 |
c |= 0x80; /* Flag c as a length */ |
c |= 0x80; /* Flag c as a length */ |
4617 |
} |
} |
4618 |
else |
else |
4633 |
|
|
4634 |
if (!possessive_quantifier && |
if (!possessive_quantifier && |
4635 |
repeat_max < 0 && |
repeat_max < 0 && |
4636 |
check_auto_possessive(previous, utf8, ptr + 1, options, cd)) |
check_auto_possessive(previous, utf, ptr + 1, options, cd)) |
4637 |
{ |
{ |
4638 |
repeat_type = 0; /* Force greedy */ |
repeat_type = 0; /* Force greedy */ |
4639 |
possessive_quantifier = TRUE; |
possessive_quantifier = TRUE; |
4654 |
c = previous[1]; |
c = previous[1]; |
4655 |
if (!possessive_quantifier && |
if (!possessive_quantifier && |
4656 |
repeat_max < 0 && |
repeat_max < 0 && |
4657 |
check_auto_possessive(previous, utf8, ptr + 1, options, cd)) |
check_auto_possessive(previous, utf, ptr + 1, options, cd)) |
4658 |
{ |
{ |
4659 |
repeat_type = 0; /* Force greedy */ |
repeat_type = 0; /* Force greedy */ |
4660 |
possessive_quantifier = TRUE; |
possessive_quantifier = TRUE; |
4678 |
|
|
4679 |
if (!possessive_quantifier && |
if (!possessive_quantifier && |
4680 |
repeat_max < 0 && |
repeat_max < 0 && |
4681 |
check_auto_possessive(previous, utf8, ptr + 1, options, cd)) |
check_auto_possessive(previous, utf, ptr + 1, options, cd)) |
4682 |
{ |
{ |
4683 |
repeat_type = 0; /* Force greedy */ |
repeat_type = 0; /* Force greedy */ |
4684 |
possessive_quantifier = TRUE; |
possessive_quantifier = TRUE; |
4763 |
if (repeat_max < 0) |
if (repeat_max < 0) |
4764 |
{ |
{ |
4765 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
4766 |
if (utf8 && c >= 128) |
if (utf && c >= 128) |
4767 |
{ |
{ |
4768 |
memcpy(code, utf8_char, c & 7); |
memcpy(code, utf_chars, c & 7); |
4769 |
code += c & 7; |
code += c & 7; |
4770 |
} |
} |
4771 |
else |
else |
4788 |
else if (repeat_max != repeat_min) |
else if (repeat_max != repeat_min) |
4789 |
{ |
{ |
4790 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
4791 |
if (utf8 && c >= 128) |
if (utf && c >= 128) |
4792 |
{ |
{ |
4793 |
memcpy(code, utf8_char, c & 7); |
memcpy(code, utf_chars, c & 7); |
4794 |
code += c & 7; |
code += c & 7; |
4795 |
} |
} |
4796 |
else |
else |
4818 |
/* The character or character type itself comes last in all cases. */ |
/* The character or character type itself comes last in all cases. */ |
4819 |
|
|
4820 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
4821 |
if (utf8 && c >= 128) |
if (utf && c >= 128) |
4822 |
{ |
{ |
4823 |
memcpy(code, utf8_char, c & 7); |
memcpy(code, utf_chars, c & 7); |
4824 |
code += c & 7; |
code += c & 7; |
4825 |
} |
} |
4826 |
else |
else |
4947 |
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ |
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ |
4948 |
{ |
{ |
4949 |
*code = OP_END; |
*code = OP_END; |
4950 |
adjust_recurse(previous, 1, utf8, cd, save_hwm); |
adjust_recurse(previous, 1, utf, cd, save_hwm); |
4951 |
memmove(previous + 1, previous, IN_UCHARS(len)); |
memmove(previous + 1, previous, IN_UCHARS(len)); |
4952 |
code++; |
code++; |
4953 |
if (repeat_max == 0) |
if (repeat_max == 0) |
4971 |
{ |
{ |
4972 |
int offset; |
int offset; |
4973 |
*code = OP_END; |
*code = OP_END; |
4974 |
adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); |
adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm); |
4975 |
memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len)); |
memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len)); |
4976 |
code += 2 + LINK_SIZE; |
code += 2 + LINK_SIZE; |
4977 |
*previous++ = OP_BRAZERO + repeat_type; |
*previous++ = OP_BRAZERO + repeat_type; |
5173 |
pcre_uchar *scode = bracode; |
pcre_uchar *scode = bracode; |
5174 |
do |
do |
5175 |
{ |
{ |
5176 |
if (could_be_empty_branch(scode, ketcode, utf8, cd)) |
if (could_be_empty_branch(scode, ketcode, utf, cd)) |
5177 |
{ |
{ |
5178 |
*bracode += OP_SBRA - OP_BRA; |
*bracode += OP_SBRA - OP_BRA; |
5179 |
break; |
break; |
5196 |
{ |
{ |
5197 |
int nlen = (int)(code - bracode); |
int nlen = (int)(code - bracode); |
5198 |
*code = OP_END; |
*code = OP_END; |
5199 |
adjust_recurse(bracode, 1 + LINK_SIZE, utf8, cd, save_hwm); |
adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm); |
5200 |
memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen)); |
memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen)); |
5201 |
code += 1 + LINK_SIZE; |
code += 1 + LINK_SIZE; |
5202 |
nlen += 1 + LINK_SIZE; |
nlen += 1 + LINK_SIZE; |
5274 |
{ |
{ |
5275 |
tempcode += PRIV(OP_lengths)[*tempcode]; |
tempcode += PRIV(OP_lengths)[*tempcode]; |
5276 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
5277 |
if (utf8 && tempcode[-1] >= 0xc0) |
if (utf && tempcode[-1] >= 0xc0) |
5278 |
tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f]; |
tempcode += PRIV(utf8_table4)[tempcode[-1] & 0x3f]; |
5279 |
#endif |
#endif |
5280 |
} |
} |
5312 |
|
|
5313 |
default: |
default: |
5314 |
*code = OP_END; |
*code = OP_END; |
5315 |
adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm); |
adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm); |
5316 |
memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); |
memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len)); |
5317 |
code += 1 + LINK_SIZE; |
code += 1 + LINK_SIZE; |
5318 |
len += 1 + LINK_SIZE; |
len += 1 + LINK_SIZE; |
5621 |
/* Search the pattern for a forward reference */ |
/* Search the pattern for a forward reference */ |
5622 |
|
|
5623 |
else if ((i = find_parens(cd, name, namelen, |
else if ((i = find_parens(cd, name, namelen, |
5624 |
(options & PCRE_EXTENDED) != 0, utf8)) > 0) |
(options & PCRE_EXTENDED) != 0, utf)) > 0) |
5625 |
{ |
{ |
5626 |
PUT2(code, 2+LINK_SIZE, i); |
PUT2(code, 2+LINK_SIZE, i); |
5627 |
code[1+LINK_SIZE]++; |
code[1+LINK_SIZE]++; |
5966 |
temp = cd->end_pattern; |
temp = cd->end_pattern; |
5967 |
cd->end_pattern = ptr; |
cd->end_pattern = ptr; |
5968 |
recno = find_parens(cd, name, namelen, |
recno = find_parens(cd, name, namelen, |
5969 |
(options & PCRE_EXTENDED) != 0, utf8); |
(options & PCRE_EXTENDED) != 0, utf); |
5970 |
cd->end_pattern = temp; |
cd->end_pattern = temp; |
5971 |
if (recno < 0) recno = 0; /* Forward ref; set dummy number */ |
if (recno < 0) recno = 0; /* Forward ref; set dummy number */ |
5972 |
} |
} |
5993 |
} |
} |
5994 |
else if ((recno = /* Forward back reference */ |
else if ((recno = /* Forward back reference */ |
5995 |
find_parens(cd, name, namelen, |
find_parens(cd, name, namelen, |
5996 |
(options & PCRE_EXTENDED) != 0, utf8)) <= 0) |
(options & PCRE_EXTENDED) != 0, utf)) <= 0) |
5997 |
{ |
{ |
5998 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
5999 |
goto FAILED; |
goto FAILED; |
6097 |
{ |
{ |
6098 |
*code = OP_END; |
*code = OP_END; |
6099 |
if (recno != 0) |
if (recno != 0) |
6100 |
called = PRIV(find_bracket)(cd->start_code, utf8, recno); |
called = PRIV(find_bracket)(cd->start_code, utf, recno); |
6101 |
|
|
6102 |
/* Forward reference */ |
/* Forward reference */ |
6103 |
|
|
6104 |
if (called == NULL) |
if (called == NULL) |
6105 |
{ |
{ |
6106 |
if (find_parens(cd, NULL, recno, |
if (find_parens(cd, NULL, recno, |
6107 |
(options & PCRE_EXTENDED) != 0, utf8) < 0) |
(options & PCRE_EXTENDED) != 0, utf) < 0) |
6108 |
{ |
{ |
6109 |
*errorcodeptr = ERR15; |
*errorcodeptr = ERR15; |
6110 |
goto FAILED; |
goto FAILED; |
6128 |
conditional subpatterns will be picked up then. */ |
conditional subpatterns will be picked up then. */ |
6129 |
|
|
6130 |
else if (GET(called, 1) == 0 && cond_depth <= 0 && |
else if (GET(called, 1) == 0 && cond_depth <= 0 && |
6131 |
could_be_empty(called, code, bcptr, utf8, cd)) |
could_be_empty(called, code, bcptr, utf, cd)) |
6132 |
{ |
{ |
6133 |
*errorcodeptr = ERR40; |
*errorcodeptr = ERR40; |
6134 |
goto FAILED; |
goto FAILED; |
6626 |
|
|
6627 |
{ |
{ |
6628 |
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; |
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; |
6629 |
*code++ = (!utf8 && c == -ESC_C)? OP_ALLANY : -c; |
*code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c; |
6630 |
} |
} |
6631 |
} |
} |
6632 |
continue; |
continue; |
6637 |
handle it as a data character. */ |
handle it as a data character. */ |
6638 |
|
|
6639 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
6640 |
if (utf8 && c > 127) |
if (utf && c > 127) |
6641 |
mclength = PRIV(ord2utf8)(c, mcbuffer); |
mclength = PRIV(ord2utf)(c, mcbuffer); |
6642 |
else |
else |
6643 |
#endif |
#endif |
6644 |
|
|
6660 |
mcbuffer[0] = c; |
mcbuffer[0] = c; |
6661 |
|
|
6662 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
6663 |
if (utf8 && c >= 0xc0) |
if (utf && c >= 0xc0) |
6664 |
{ |
{ |
6665 |
while ((ptr[1] & 0xc0) == 0x80) |
while ((ptr[1] & 0xc0) == 0x80) |
6666 |
mcbuffer[mclength++] = *(++ptr); |
mcbuffer[mclength++] = *(++ptr); |
7368 |
int newline; |
int newline; |
7369 |
int errorcode = 0; |
int errorcode = 0; |
7370 |
int skipatstart = 0; |
int skipatstart = 0; |
7371 |
BOOL utf8; |
BOOL utf; |
7372 |
size_t size; |
size_t size; |
7373 |
pcre_uchar *code; |
pcre_uchar *code; |
7374 |
const pcre_uchar *codestart; |
const pcre_uchar *codestart; |
7466 |
else break; |
else break; |
7467 |
} |
} |
7468 |
|
|
7469 |
utf8 = (options & PCRE_UTF8) != 0; |
/* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
7470 |
|
utf = (options & PCRE_UTF8) != 0; |
7471 |
|
|
7472 |
/* Can't support UTF8 unless PCRE has been compiled to include the code. The |
/* Can't support UTF8 unless PCRE has been compiled to include the code. The |
7473 |
return of an error code from PRIV(valid_utf8)() is a new feature, introduced in |
return of an error code from PRIV(valid_utf)() is a new feature, introduced in |
7474 |
release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is |
release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is |
7475 |
not used here. */ |
not used here. */ |
7476 |
|
|
7477 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
7478 |
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && |
if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 && |
7479 |
(errorcode = PRIV(valid_utf8)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) |
(errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0) |
7480 |
{ |
{ |
7481 |
errorcode = ERR44; |
errorcode = ERR44; |
7482 |
goto PCRE_EARLY_ERROR_RETURN2; |
goto PCRE_EARLY_ERROR_RETURN2; |
7483 |
} |
} |
7484 |
#else |
#else |
7485 |
if (utf8) |
if (utf) |
7486 |
{ |
{ |
7487 |
errorcode = ERR32; |
errorcode = ERR32; |
7488 |
goto PCRE_EARLY_ERROR_RETURN; |
goto PCRE_EARLY_ERROR_RETURN; |
7697 |
cd->hwm -= LINK_SIZE; |
cd->hwm -= LINK_SIZE; |
7698 |
offset = GET(cd->hwm, 0); |
offset = GET(cd->hwm, 0); |
7699 |
recno = GET(codestart, offset); |
recno = GET(codestart, offset); |
7700 |
groupptr = PRIV(find_bracket)(codestart, utf8, recno); |
groupptr = PRIV(find_bracket)(codestart, utf, recno); |
7701 |
if (groupptr == NULL) errorcode = ERR53; |
if (groupptr == NULL) errorcode = ERR53; |
7702 |
else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart)); |
else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart)); |
7703 |
} |
} |
7724 |
of zero, but that is a pathological case, and it does no harm.) When we find |
of zero, but that is a pathological case, and it does no harm.) When we find |
7725 |
one, we temporarily terminate the branch it is in while we scan it. */ |
one, we temporarily terminate the branch it is in while we scan it. */ |
7726 |
|
|
7727 |
for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf8, -1); |
for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1); |
7728 |
cc != NULL; |
cc != NULL; |
7729 |
cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf8, -1)) |
cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1)) |
7730 |
{ |
{ |
7731 |
if (GET(cc, 1) == 0) |
if (GET(cc, 1) == 0) |
7732 |
{ |
{ |