82 |
int recurse_depth) |
int recurse_depth) |
83 |
{ |
{ |
84 |
int length = -1; |
int length = -1; |
85 |
BOOL utf8 = (options & PCRE_UTF8) != 0; |
/* PCRE_UTF16 has the same value as PCRE_UTF8. */ |
86 |
|
BOOL utf = (options & PCRE_UTF8) != 0; |
87 |
BOOL had_recurse = FALSE; |
BOOL had_recurse = FALSE; |
88 |
register int branchlength = 0; |
register int branchlength = 0; |
89 |
register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; |
register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE; |
225 |
branchlength++; |
branchlength++; |
226 |
cc += 2; |
cc += 2; |
227 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
228 |
if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
229 |
#endif |
#endif |
230 |
break; |
break; |
231 |
|
|
246 |
branchlength += GET2(cc,1); |
branchlength += GET2(cc,1); |
247 |
cc += 2 + IMM2_SIZE; |
cc += 2 + IMM2_SIZE; |
248 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
249 |
if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
250 |
#endif |
#endif |
251 |
break; |
break; |
252 |
|
|
294 |
|
|
295 |
case OP_ANYBYTE: |
case OP_ANYBYTE: |
296 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
297 |
if (utf8) return -1; |
if (utf) return -1; |
298 |
#endif |
#endif |
299 |
branchlength++; |
branchlength++; |
300 |
cc++; |
cc++; |
375 |
case OP_REFI: |
case OP_REFI: |
376 |
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) |
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) |
377 |
{ |
{ |
378 |
ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf8, GET2(cc, 1)); |
ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1)); |
379 |
if (cs == NULL) return -2; |
if (cs == NULL) return -2; |
380 |
do ce += GET(ce, 1); while (*ce == OP_ALT); |
do ce += GET(ce, 1); while (*ce == OP_ALT); |
381 |
if (cc > cs && cc < ce) |
if (cc > cs && cc < ce) |
487 |
|
|
488 |
cc += PRIV(OP_lengths)[op]; |
cc += PRIV(OP_lengths)[op]; |
489 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
490 |
if (utf8 && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
if (utf && cc[-1] >= 0xc0) cc += PRIV(utf8_table4)[cc[-1] & 0x3f]; |
491 |
#endif |
#endif |
492 |
break; |
break; |
493 |
|
|
538 |
p points to the character |
p points to the character |
539 |
caseless the caseless flag |
caseless the caseless flag |
540 |
cd the block with char table pointers |
cd the block with char table pointers |
541 |
utf8 TRUE for UTF-8 mode |
utf TRUE for UTF-8 / UTF-16 mode |
542 |
|
|
543 |
Returns: pointer after the character |
Returns: pointer after the character |
544 |
*/ |
*/ |
545 |
|
|
546 |
static const pcre_uchar * |
static const pcre_uchar * |
547 |
set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, |
set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless, |
548 |
compile_data *cd, BOOL utf8) |
compile_data *cd, BOOL utf) |
549 |
{ |
{ |
550 |
unsigned int c = *p; |
unsigned int c = *p; |
551 |
|
|
552 |
SET_BIT(c); |
SET_BIT(c); |
553 |
|
|
554 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
555 |
if (utf8 && c > 127) |
if (utf && c > 127) |
556 |
{ |
{ |
557 |
GETCHARINC(c, p); |
GETCHARINC(c, p); |
558 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
559 |
if (caseless) |
if (caseless) |
560 |
{ |
{ |
561 |
pcre_uint8 buff[8]; |
pcre_uchar buff[6]; |
562 |
c = UCD_OTHERCASE(c); |
c = UCD_OTHERCASE(c); |
563 |
(void)PRIV(ord2utf8)(c, buff); |
(void)PRIV(ord2utf)(c, buff); |
564 |
SET_BIT(buff[0]); |
SET_BIT(buff[0]); |
565 |
} |
} |
566 |
#endif |
#endif |
608 |
{ |
{ |
609 |
if ((cd->cbits[c/8] & (1 << (c&7))) != 0) |
if ((cd->cbits[c/8] & (1 << (c&7))) != 0) |
610 |
{ |
{ |
611 |
pcre_uint8 buff[8]; |
pcre_uchar buff[6]; |
612 |
(void)PRIV(ord2utf8)(c, buff); |
(void)PRIV(ord2utf)(c, buff); |
613 |
SET_BIT(buff[0]); |
SET_BIT(buff[0]); |
614 |
} |
} |
615 |
} |
} |
664 |
Arguments: |
Arguments: |
665 |
code points to an expression |
code points to an expression |
666 |
start_bits points to a 32-byte table, initialized to 0 |
start_bits points to a 32-byte table, initialized to 0 |
667 |
utf8 TRUE if in UTF-8 mode |
utf TRUE if in UTF-8 / UTF-16 mode |
668 |
cd the block with char table pointers |
cd the block with char table pointers |
669 |
|
|
670 |
Returns: SSB_FAIL => Failed to find any starting bytes |
Returns: SSB_FAIL => Failed to find any starting bytes |
674 |
*/ |
*/ |
675 |
|
|
676 |
static int |
static int |
677 |
set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf8, |
set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf, |
678 |
compile_data *cd) |
compile_data *cd) |
679 |
{ |
{ |
680 |
register int c; |
register int c; |
681 |
int yield = SSB_DONE; |
int yield = SSB_DONE; |
682 |
int table_limit = utf8? 16:32; |
int table_limit = utf? 16:32; |
683 |
|
|
684 |
#if 0 |
#if 0 |
685 |
/* ========================================================================= */ |
/* ========================================================================= */ |
818 |
case OP_ONCE: |
case OP_ONCE: |
819 |
case OP_ONCE_NC: |
case OP_ONCE_NC: |
820 |
case OP_ASSERT: |
case OP_ASSERT: |
821 |
rc = set_start_bits(tcode, start_bits, utf8, cd); |
rc = set_start_bits(tcode, start_bits, utf, cd); |
822 |
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; |
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; |
823 |
if (rc == SSB_DONE) try_next = FALSE; else |
if (rc == SSB_DONE) try_next = FALSE; else |
824 |
{ |
{ |
865 |
case OP_BRAZERO: |
case OP_BRAZERO: |
866 |
case OP_BRAMINZERO: |
case OP_BRAMINZERO: |
867 |
case OP_BRAPOSZERO: |
case OP_BRAPOSZERO: |
868 |
rc = set_start_bits(++tcode, start_bits, utf8, cd); |
rc = set_start_bits(++tcode, start_bits, utf, cd); |
869 |
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; |
if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc; |
870 |
/* ========================================================================= |
/* ========================================================================= |
871 |
See the comment at the head of this function concerning the next line, |
See the comment at the head of this function concerning the next line, |
892 |
case OP_QUERY: |
case OP_QUERY: |
893 |
case OP_MINQUERY: |
case OP_MINQUERY: |
894 |
case OP_POSQUERY: |
case OP_POSQUERY: |
895 |
tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8); |
tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); |
896 |
break; |
break; |
897 |
|
|
898 |
case OP_STARI: |
case OP_STARI: |
901 |
case OP_QUERYI: |
case OP_QUERYI: |
902 |
case OP_MINQUERYI: |
case OP_MINQUERYI: |
903 |
case OP_POSQUERYI: |
case OP_POSQUERYI: |
904 |
tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8); |
tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); |
905 |
break; |
break; |
906 |
|
|
907 |
/* Single-char upto sets the bit and tries the next */ |
/* Single-char upto sets the bit and tries the next */ |
909 |
case OP_UPTO: |
case OP_UPTO: |
910 |
case OP_MINUPTO: |
case OP_MINUPTO: |
911 |
case OP_POSUPTO: |
case OP_POSUPTO: |
912 |
tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf8); |
tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf); |
913 |
break; |
break; |
914 |
|
|
915 |
case OP_UPTOI: |
case OP_UPTOI: |
916 |
case OP_MINUPTOI: |
case OP_MINUPTOI: |
917 |
case OP_POSUPTOI: |
case OP_POSUPTOI: |
918 |
tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf8); |
tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf); |
919 |
break; |
break; |
920 |
|
|
921 |
/* At least one single char sets the bit and stops */ |
/* At least one single char sets the bit and stops */ |
927 |
case OP_PLUS: |
case OP_PLUS: |
928 |
case OP_MINPLUS: |
case OP_MINPLUS: |
929 |
case OP_POSPLUS: |
case OP_POSPLUS: |
930 |
(void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf8); |
(void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf); |
931 |
try_next = FALSE; |
try_next = FALSE; |
932 |
break; |
break; |
933 |
|
|
938 |
case OP_PLUSI: |
case OP_PLUSI: |
939 |
case OP_MINPLUSI: |
case OP_MINPLUSI: |
940 |
case OP_POSPLUSI: |
case OP_POSPLUSI: |
941 |
(void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf8); |
(void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf); |
942 |
try_next = FALSE; |
try_next = FALSE; |
943 |
break; |
break; |
944 |
|
|
951 |
case OP_HSPACE: |
case OP_HSPACE: |
952 |
SET_BIT(0x09); |
SET_BIT(0x09); |
953 |
SET_BIT(0x20); |
SET_BIT(0x20); |
954 |
if (utf8) |
if (utf) |
955 |
{ |
{ |
956 |
SET_BIT(0xC2); /* For U+00A0 */ |
SET_BIT(0xC2); /* For U+00A0 */ |
957 |
SET_BIT(0xE1); /* For U+1680, U+180E */ |
SET_BIT(0xE1); /* For U+1680, U+180E */ |
968 |
SET_BIT(0x0B); |
SET_BIT(0x0B); |
969 |
SET_BIT(0x0C); |
SET_BIT(0x0C); |
970 |
SET_BIT(0x0D); |
SET_BIT(0x0D); |
971 |
if (utf8) |
if (utf) |
972 |
{ |
{ |
973 |
SET_BIT(0xC2); /* For U+0085 */ |
SET_BIT(0xC2); /* For U+0085 */ |
974 |
SET_BIT(0xE2); /* For U+2028, U+2029 */ |
SET_BIT(0xE2); /* For U+2028, U+2029 */ |
1058 |
case OP_HSPACE: |
case OP_HSPACE: |
1059 |
SET_BIT(0x09); |
SET_BIT(0x09); |
1060 |
SET_BIT(0x20); |
SET_BIT(0x20); |
1061 |
if (utf8) |
if (utf) |
1062 |
{ |
{ |
1063 |
SET_BIT(0xC2); /* For U+00A0 */ |
SET_BIT(0xC2); /* For U+00A0 */ |
1064 |
SET_BIT(0xE1); /* For U+1680, U+180E */ |
SET_BIT(0xE1); /* For U+1680, U+180E */ |
1074 |
SET_BIT(0x0B); |
SET_BIT(0x0B); |
1075 |
SET_BIT(0x0C); |
SET_BIT(0x0C); |
1076 |
SET_BIT(0x0D); |
SET_BIT(0x0D); |
1077 |
if (utf8) |
if (utf) |
1078 |
{ |
{ |
1079 |
SET_BIT(0xC2); /* For U+0085 */ |
SET_BIT(0xC2); /* For U+0085 */ |
1080 |
SET_BIT(0xE2); /* For U+2028, U+2029 */ |
SET_BIT(0xE2); /* For U+2028, U+2029 */ |
1127 |
|
|
1128 |
case OP_NCLASS: |
case OP_NCLASS: |
1129 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1130 |
if (utf8) |
if (utf) |
1131 |
{ |
{ |
1132 |
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc7 */ |
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc7 */ |
1133 |
memset(start_bits+25, 0xff, 7); /* Bits for 0xc8 - 0xff */ |
memset(start_bits+25, 0xff, 7); /* Bits for 0xc8 - 0xff */ |
1148 |
characters in the range 128 - 255. */ |
characters in the range 128 - 255. */ |
1149 |
|
|
1150 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1151 |
if (utf8) |
if (utf) |
1152 |
{ |
{ |
1153 |
for (c = 0; c < 16; c++) start_bits[c] |= map[c]; |
for (c = 0; c < 16; c++) start_bits[c] |= map[c]; |
1154 |
for (c = 128; c < 256; c++) |
for (c = 128; c < 256; c++) |