6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
7 |
|
|
8 |
Written by Philip Hazel |
Written by Philip Hazel |
9 |
Copyright (c) 1997-2005 University of Cambridge |
Copyright (c) 1997-2006 University of Cambridge |
10 |
|
|
11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
288 |
const uschar *end_subject = md->end_subject; |
const uschar *end_subject = md->end_subject; |
289 |
const uschar *start_code = md->start_code; |
const uschar *start_code = md->start_code; |
290 |
|
|
291 |
|
#ifdef SUPPORT_UTF8 |
292 |
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; |
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; |
293 |
|
#endif |
294 |
|
|
295 |
rlevel++; |
rlevel++; |
296 |
offsetcount &= (-2); |
offsetcount &= (-2); |
482 |
const uschar *code; |
const uschar *code; |
483 |
int state_offset = current_state->offset; |
int state_offset = current_state->offset; |
484 |
int count, codevalue; |
int count, codevalue; |
485 |
int chartype, othercase; |
int chartype, script; |
486 |
|
|
487 |
#ifdef DEBUG |
#ifdef DEBUG |
488 |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
759 |
case OP_NOTPROP: |
case OP_NOTPROP: |
760 |
if (clen > 0) |
if (clen > 0) |
761 |
{ |
{ |
762 |
int rqdtype, category; |
BOOL OK; |
763 |
category = _pcre_ucp_findchar(c, &chartype, &othercase); |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
764 |
rqdtype = code[1]; |
switch(code[1]) |
|
if (rqdtype >= 128) |
|
|
{ |
|
|
if ((rqdtype - 128 == category) == (codevalue == OP_PROP)) |
|
|
{ ADD_NEW(state_offset + 2, 0); } |
|
|
} |
|
|
else |
|
765 |
{ |
{ |
766 |
if ((rqdtype == chartype) == (codevalue == OP_PROP)) |
case PT_ANY: |
767 |
{ ADD_NEW(state_offset + 2, 0); } |
OK = TRUE; |
768 |
|
break; |
769 |
|
|
770 |
|
case PT_LAMP: |
771 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
772 |
|
break; |
773 |
|
|
774 |
|
case PT_GC: |
775 |
|
OK = category == code[2]; |
776 |
|
break; |
777 |
|
|
778 |
|
case PT_PC: |
779 |
|
OK = chartype == code[2]; |
780 |
|
break; |
781 |
|
|
782 |
|
case PT_SC: |
783 |
|
OK = script == code[2]; |
784 |
|
break; |
785 |
|
|
786 |
|
/* Should never occur, but keep compilers from grumbling. */ |
787 |
|
|
788 |
|
default: |
789 |
|
OK = codevalue != OP_PROP; |
790 |
|
break; |
791 |
} |
} |
792 |
|
|
793 |
|
if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } |
794 |
} |
} |
795 |
break; |
break; |
796 |
#endif |
#endif |
883 |
case OP_PROP_EXTRA + OP_TYPEPLUS: |
case OP_PROP_EXTRA + OP_TYPEPLUS: |
884 |
case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
885 |
count = current_state->count; /* Already matched */ |
count = current_state->count; /* Already matched */ |
886 |
if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); } |
if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } |
887 |
if (clen > 0) |
if (clen > 0) |
888 |
{ |
{ |
889 |
int category = _pcre_ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
890 |
int rqdtype = code[2]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
891 |
if ((d == OP_PROP) == |
switch(code[2]) |
892 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
893 |
{ count++; ADD_NEW(state_offset, count); } |
case PT_ANY: |
894 |
|
OK = TRUE; |
895 |
|
break; |
896 |
|
|
897 |
|
case PT_LAMP: |
898 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
899 |
|
break; |
900 |
|
|
901 |
|
case PT_GC: |
902 |
|
OK = category == code[3]; |
903 |
|
break; |
904 |
|
|
905 |
|
case PT_PC: |
906 |
|
OK = chartype == code[3]; |
907 |
|
break; |
908 |
|
|
909 |
|
case PT_SC: |
910 |
|
OK = script == code[3]; |
911 |
|
break; |
912 |
|
|
913 |
|
/* Should never occur, but keep compilers from grumbling. */ |
914 |
|
|
915 |
|
default: |
916 |
|
OK = codevalue != OP_PROP; |
917 |
|
break; |
918 |
|
} |
919 |
|
|
920 |
|
if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); } |
921 |
} |
} |
922 |
break; |
break; |
923 |
|
|
926 |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: |
927 |
count = current_state->count; /* Already matched */ |
count = current_state->count; /* Already matched */ |
928 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
929 |
if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
930 |
{ |
{ |
931 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
932 |
int ncount = 0; |
int ncount = 0; |
935 |
int nd; |
int nd; |
936 |
int ndlen = 1; |
int ndlen = 1; |
937 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
938 |
if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
939 |
ncount++; |
ncount++; |
940 |
nptr += ndlen; |
nptr += ndlen; |
941 |
} |
} |
947 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
948 |
case OP_PROP_EXTRA + OP_TYPEQUERY: |
case OP_PROP_EXTRA + OP_TYPEQUERY: |
949 |
case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
950 |
count = 3; |
count = 4; |
951 |
goto QS1; |
goto QS1; |
952 |
|
|
953 |
case OP_PROP_EXTRA + OP_TYPESTAR: |
case OP_PROP_EXTRA + OP_TYPESTAR: |
956 |
|
|
957 |
QS1: |
QS1: |
958 |
|
|
959 |
ADD_ACTIVE(state_offset + 3, 0); |
ADD_ACTIVE(state_offset + 4, 0); |
960 |
if (clen > 0) |
if (clen > 0) |
961 |
{ |
{ |
962 |
int category = _pcre_ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
963 |
int rqdtype = code[2]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
964 |
if ((d == OP_PROP) == |
switch(code[2]) |
965 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
966 |
{ ADD_NEW(state_offset + count, 0); } |
case PT_ANY: |
967 |
|
OK = TRUE; |
968 |
|
break; |
969 |
|
|
970 |
|
case PT_LAMP: |
971 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
972 |
|
break; |
973 |
|
|
974 |
|
case PT_GC: |
975 |
|
OK = category == code[3]; |
976 |
|
break; |
977 |
|
|
978 |
|
case PT_PC: |
979 |
|
OK = chartype == code[3]; |
980 |
|
break; |
981 |
|
|
982 |
|
case PT_SC: |
983 |
|
OK = script == code[3]; |
984 |
|
break; |
985 |
|
|
986 |
|
/* Should never occur, but keep compilers from grumbling. */ |
987 |
|
|
988 |
|
default: |
989 |
|
OK = codevalue != OP_PROP; |
990 |
|
break; |
991 |
|
} |
992 |
|
|
993 |
|
if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); } |
994 |
} |
} |
995 |
break; |
break; |
996 |
|
|
1007 |
QS2: |
QS2: |
1008 |
|
|
1009 |
ADD_ACTIVE(state_offset + 2, 0); |
ADD_ACTIVE(state_offset + 2, 0); |
1010 |
if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
1011 |
{ |
{ |
1012 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
1013 |
int ncount = 0; |
int ncount = 0; |
1016 |
int nd; |
int nd; |
1017 |
int ndlen = 1; |
int ndlen = 1; |
1018 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
1019 |
if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
1020 |
ncount++; |
ncount++; |
1021 |
nptr += ndlen; |
nptr += ndlen; |
1022 |
} |
} |
1029 |
case OP_PROP_EXTRA + OP_TYPEUPTO: |
case OP_PROP_EXTRA + OP_TYPEUPTO: |
1030 |
case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
1031 |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) |
1032 |
{ ADD_ACTIVE(state_offset + 5, 0); } |
{ ADD_ACTIVE(state_offset + 6, 0); } |
1033 |
count = current_state->count; /* Number already matched */ |
count = current_state->count; /* Number already matched */ |
1034 |
if (clen > 0) |
if (clen > 0) |
1035 |
{ |
{ |
1036 |
int category = _pcre_ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
1037 |
int rqdtype = code[4]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
1038 |
if ((d == OP_PROP) == |
switch(code[4]) |
1039 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
1040 |
|
case PT_ANY: |
1041 |
|
OK = TRUE; |
1042 |
|
break; |
1043 |
|
|
1044 |
|
case PT_LAMP: |
1045 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
1046 |
|
break; |
1047 |
|
|
1048 |
|
case PT_GC: |
1049 |
|
OK = category == code[5]; |
1050 |
|
break; |
1051 |
|
|
1052 |
|
case PT_PC: |
1053 |
|
OK = chartype == code[5]; |
1054 |
|
break; |
1055 |
|
|
1056 |
|
case PT_SC: |
1057 |
|
OK = script == code[5]; |
1058 |
|
break; |
1059 |
|
|
1060 |
|
/* Should never occur, but keep compilers from grumbling. */ |
1061 |
|
|
1062 |
|
default: |
1063 |
|
OK = codevalue != OP_PROP; |
1064 |
|
break; |
1065 |
|
} |
1066 |
|
|
1067 |
|
if (OK == (d == OP_PROP)) |
1068 |
{ |
{ |
1069 |
if (++count >= GET2(code, 1)) |
if (++count >= GET2(code, 1)) |
1070 |
{ ADD_NEW(state_offset + 5, 0); } |
{ ADD_NEW(state_offset + 6, 0); } |
1071 |
else |
else |
1072 |
{ ADD_NEW(state_offset, count); } |
{ ADD_NEW(state_offset, count); } |
1073 |
} |
} |
1081 |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) |
1082 |
{ ADD_ACTIVE(state_offset + 4, 0); } |
{ ADD_ACTIVE(state_offset + 4, 0); } |
1083 |
count = current_state->count; /* Number already matched */ |
count = current_state->count; /* Number already matched */ |
1084 |
if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
1085 |
{ |
{ |
1086 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
1087 |
int ncount = 0; |
int ncount = 0; |
1090 |
int nd; |
int nd; |
1091 |
int ndlen = 1; |
int ndlen = 1; |
1092 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
1093 |
if (_pcre_ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
1094 |
ncount++; |
ncount++; |
1095 |
nptr += ndlen; |
nptr += ndlen; |
1096 |
} |
} |
1121 |
{ |
{ |
1122 |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else |
1123 |
{ |
{ |
1124 |
|
int othercase; |
1125 |
if (c < 128) othercase = fcc[c]; else |
if (c < 128) othercase = fcc[c]; else |
1126 |
|
|
1127 |
/* If we have Unicode property support, we can use it to test the |
/* If we have Unicode property support, we can use it to test the |
1128 |
other case of the character, if there is one. The result of |
other case of the character. */ |
|
_pcre_ucp_findchar() is < 0 if the char isn't found, and othercase is |
|
|
returned as zero if there isn't another case. */ |
|
1129 |
|
|
1130 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1131 |
if (_pcre_ucp_findchar(c, &chartype, &othercase) < 0) |
othercase = _pcre_ucp_othercase(c); |
1132 |
|
#else |
1133 |
|
othercase = -1; |
1134 |
#endif |
#endif |
|
othercase = -1; |
|
1135 |
|
|
1136 |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } |
1137 |
} |
} |
1153 |
to wait for them to pass before continuing. */ |
to wait for them to pass before continuing. */ |
1154 |
|
|
1155 |
case OP_EXTUNI: |
case OP_EXTUNI: |
1156 |
if (clen > 0 && _pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
1157 |
{ |
{ |
1158 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
1159 |
int ncount = 0; |
int ncount = 0; |
1161 |
{ |
{ |
1162 |
int nclen = 1; |
int nclen = 1; |
1163 |
GETCHARLEN(c, nptr, nclen); |
GETCHARLEN(c, nptr, nclen); |
1164 |
if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break; |
1165 |
ncount++; |
ncount++; |
1166 |
nptr += nclen; |
nptr += nclen; |
1167 |
} |
} |
1196 |
if ((ims & PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
1197 |
{ |
{ |
1198 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1199 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
1200 |
{ |
{ |
1201 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1202 |
if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
1203 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
1204 |
} |
} |
1205 |
else |
else |
1223 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims && PCRE_CASELESS) != 0) |
1224 |
{ |
{ |
1225 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1226 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
1227 |
{ |
{ |
1228 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1229 |
if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
1230 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
1231 |
} |
} |
1232 |
else |
else |
1250 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims && PCRE_CASELESS) != 0) |
1251 |
{ |
{ |
1252 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1253 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
1254 |
{ |
{ |
1255 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1256 |
if (_pcre_ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
1257 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
1258 |
} |
} |
1259 |
else |
else |
1281 |
if ((ims & PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
1282 |
{ |
{ |
1283 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1284 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
1285 |
{ |
{ |
1286 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1287 |
if (_pcre_ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
1288 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
1289 |
} |
} |
1290 |
else |
else |
1622 |
cb.version = 1; /* Version 1 of the callout block */ |
cb.version = 1; /* Version 1 of the callout block */ |
1623 |
cb.callout_number = code[1]; |
cb.callout_number = code[1]; |
1624 |
cb.offset_vector = offsets; |
cb.offset_vector = offsets; |
1625 |
cb.subject = (char *)start_subject; |
cb.subject = (PCRE_SPTR)start_subject; |
1626 |
cb.subject_length = end_subject - start_subject; |
cb.subject_length = end_subject - start_subject; |
1627 |
cb.start_match = current_subject - start_subject; |
cb.start_match = current_subject - start_subject; |
1628 |
cb.current_position = ptr - start_subject; |
cb.current_position = ptr - start_subject; |
1714 |
< -1 => some kind of unexpected problem |
< -1 => some kind of unexpected problem |
1715 |
*/ |
*/ |
1716 |
|
|
1717 |
PCRE_EXPORT int |
PCRE_DATA_SCOPE int |
1718 |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
1719 |
const char *subject, int length, int start_offset, int options, int *offsets, |
const char *subject, int length, int start_offset, int options, int *offsets, |
1720 |
int offsetcount, int *workspace, int wscount) |
int offsetcount, int *workspace, int wscount) |
1758 |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
1759 |
study = (const pcre_study_data *)extra_data->study_data; |
study = (const pcre_study_data *)extra_data->study_data; |
1760 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; |
1761 |
|
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
1762 |
|
return PCRE_ERROR_DFA_UMLIMIT; |
1763 |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
1764 |
match_block.callout_data = extra_data->callout_data; |
match_block.callout_data = extra_data->callout_data; |
1765 |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
1785 |
req_byte_ptr = current_subject - 1; |
req_byte_ptr = current_subject - 1; |
1786 |
|
|
1787 |
utf8 = (re->options & PCRE_UTF8) != 0; |
utf8 = (re->options & PCRE_UTF8) != 0; |
1788 |
anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0; |
|
1789 |
|
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
1790 |
|
(re->options & PCRE_ANCHORED) != 0; |
1791 |
|
|
1792 |
/* The remaining fixed data for passing around. */ |
/* The remaining fixed data for passing around. */ |
1793 |
|
|
1878 |
|
|
1879 |
/* Advance to a unique first char if possible. If firstline is TRUE, the |
/* Advance to a unique first char if possible. If firstline is TRUE, the |
1880 |
start of the match is constrained to the first line of a multiline string. |
start of the match is constrained to the first line of a multiline string. |
1881 |
Implement this by temporarily adjusting end_subject so that we stop scanning |
Implement this by temporarily adjusting end_subject so that we stop |
1882 |
at a newline. If the match fails at the newline, later code breaks this loop. |
scanning at a newline. If the match fails at the newline, later code breaks |
1883 |
*/ |
this loop. */ |
1884 |
|
|
1885 |
if (firstline) |
if (firstline) |
1886 |
{ |
{ |