6 |
and semantics are as close as possible to those of the Perl 5 language. |
and semantics are as close as possible to those of the Perl 5 language. |
7 |
|
|
8 |
Written by Philip Hazel |
Written by Philip Hazel |
9 |
Copyright (c) 1997-2005 University of Cambridge |
Copyright (c) 1997-2006 University of Cambridge |
10 |
|
|
11 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
12 |
Redistribution and use in source and binary forms, with or without |
Redistribution and use in source and binary forms, with or without |
43 |
compatible, but it has advantages in certain applications. */ |
compatible, but it has advantages in certain applications. */ |
44 |
|
|
45 |
|
|
46 |
|
#define NLBLOCK md /* The block containing newline information */ |
47 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
48 |
|
|
49 |
|
|
289 |
const uschar *end_subject = md->end_subject; |
const uschar *end_subject = md->end_subject; |
290 |
const uschar *start_code = md->start_code; |
const uschar *start_code = md->start_code; |
291 |
|
|
292 |
|
#ifdef SUPPORT_UTF8 |
293 |
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; |
BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; |
294 |
|
#endif |
295 |
|
|
296 |
rlevel++; |
rlevel++; |
297 |
offsetcount &= (-2); |
offsetcount &= (-2); |
424 |
for (;;) |
for (;;) |
425 |
{ |
{ |
426 |
int i, j; |
int i, j; |
427 |
int c, d, clen, dlen; |
int clen, dlen; |
428 |
|
unsigned int c, d; |
429 |
|
|
430 |
/* Make the new state list into the active state list and empty the |
/* Make the new state list into the active state list and empty the |
431 |
new state list. */ |
new state list. */ |
484 |
const uschar *code; |
const uschar *code; |
485 |
int state_offset = current_state->offset; |
int state_offset = current_state->offset; |
486 |
int count, codevalue; |
int count, codevalue; |
487 |
int chartype, othercase; |
int chartype, script; |
488 |
|
|
489 |
#ifdef DEBUG |
#ifdef DEBUG |
490 |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); |
649 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
650 |
case OP_CIRC: |
case OP_CIRC: |
651 |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
652 |
((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE)) |
((ims & PCRE_MULTILINE) != 0 && |
653 |
|
ptr >= start_subject + md->nllen && |
654 |
|
ptr != end_subject && |
655 |
|
IS_NEWLINE(ptr - md->nllen))) |
656 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
657 |
break; |
break; |
658 |
|
|
686 |
|
|
687 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
688 |
case OP_ANY: |
case OP_ANY: |
689 |
if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0)) |
if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || |
690 |
|
ptr > end_subject - md->nllen || |
691 |
|
!IS_NEWLINE(ptr))) |
692 |
{ ADD_NEW(state_offset + 1, 0); } |
{ ADD_NEW(state_offset + 1, 0); } |
693 |
break; |
break; |
694 |
|
|
695 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
696 |
case OP_EODN: |
case OP_EODN: |
697 |
if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject)) |
if (clen == 0 || |
698 |
|
(ptr == end_subject - md->nllen && IS_NEWLINE(ptr))) |
699 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
700 |
break; |
break; |
701 |
|
|
703 |
case OP_DOLL: |
case OP_DOLL: |
704 |
if ((md->moptions & PCRE_NOTEOL) == 0) |
if ((md->moptions & PCRE_NOTEOL) == 0) |
705 |
{ |
{ |
706 |
if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject || |
if (clen == 0 || |
707 |
(ims & PCRE_MULTILINE) != 0))) |
(ptr <= end_subject - md->nllen && IS_NEWLINE(ptr) && |
708 |
|
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) |
709 |
|
)) |
710 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
711 |
} |
} |
712 |
else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0) |
else if ((ims & PCRE_MULTILINE) != 0 && |
713 |
|
ptr <= end_subject - md->nllen && IS_NEWLINE(ptr)) |
714 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
715 |
break; |
break; |
716 |
|
|
770 |
case OP_NOTPROP: |
case OP_NOTPROP: |
771 |
if (clen > 0) |
if (clen > 0) |
772 |
{ |
{ |
773 |
int rqdtype, category; |
BOOL OK; |
774 |
category = ucp_findchar(c, &chartype, &othercase); |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
775 |
rqdtype = code[1]; |
switch(code[1]) |
|
if (rqdtype >= 128) |
|
|
{ |
|
|
if ((rqdtype - 128 == category) == (codevalue == OP_PROP)) |
|
|
{ ADD_NEW(state_offset + 2, 0); } |
|
|
} |
|
|
else |
|
776 |
{ |
{ |
777 |
if ((rqdtype == chartype) == (codevalue == OP_PROP)) |
case PT_ANY: |
778 |
{ ADD_NEW(state_offset + 2, 0); } |
OK = TRUE; |
779 |
|
break; |
780 |
|
|
781 |
|
case PT_LAMP: |
782 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
783 |
|
break; |
784 |
|
|
785 |
|
case PT_GC: |
786 |
|
OK = category == code[2]; |
787 |
|
break; |
788 |
|
|
789 |
|
case PT_PC: |
790 |
|
OK = chartype == code[2]; |
791 |
|
break; |
792 |
|
|
793 |
|
case PT_SC: |
794 |
|
OK = script == code[2]; |
795 |
|
break; |
796 |
|
|
797 |
|
/* Should never occur, but keep compilers from grumbling. */ |
798 |
|
|
799 |
|
default: |
800 |
|
OK = codevalue != OP_PROP; |
801 |
|
break; |
802 |
} |
} |
803 |
|
|
804 |
|
if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } |
805 |
} |
} |
806 |
break; |
break; |
807 |
#endif |
#endif |
822 |
{ |
{ |
823 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
824 |
(c < 256 && |
(c < 256 && |
825 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
826 |
|
(ims & PCRE_DOTALL) != 0 || |
827 |
|
ptr > end_subject - md->nllen || |
828 |
|
!IS_NEWLINE(ptr) |
829 |
|
) && |
830 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
831 |
{ |
{ |
832 |
count++; |
count++; |
843 |
{ |
{ |
844 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
845 |
(c < 256 && |
(c < 256 && |
846 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
847 |
|
(ims & PCRE_DOTALL) != 0 || |
848 |
|
ptr > end_subject - md->nllen || |
849 |
|
!IS_NEWLINE(ptr) |
850 |
|
) && |
851 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
852 |
{ |
{ |
853 |
ADD_NEW(state_offset + 2, 0); |
ADD_NEW(state_offset + 2, 0); |
863 |
{ |
{ |
864 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
865 |
(c < 256 && |
(c < 256 && |
866 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
867 |
|
(ims & PCRE_DOTALL) != 0 || |
868 |
|
ptr > end_subject - md->nllen || |
869 |
|
!IS_NEWLINE(ptr) |
870 |
|
) && |
871 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
872 |
{ |
{ |
873 |
ADD_NEW(state_offset, 0); |
ADD_NEW(state_offset, 0); |
886 |
{ |
{ |
887 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
888 |
(c < 256 && |
(c < 256 && |
889 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
890 |
|
(ims & PCRE_DOTALL) != 0 || |
891 |
|
ptr > end_subject - md->nllen || |
892 |
|
!IS_NEWLINE(ptr) |
893 |
|
) && |
894 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
895 |
{ |
{ |
896 |
if (++count >= GET2(code, 1)) |
if (++count >= GET2(code, 1)) |
910 |
case OP_PROP_EXTRA + OP_TYPEPLUS: |
case OP_PROP_EXTRA + OP_TYPEPLUS: |
911 |
case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
case OP_PROP_EXTRA + OP_TYPEMINPLUS: |
912 |
count = current_state->count; /* Already matched */ |
count = current_state->count; /* Already matched */ |
913 |
if (count > 0) { ADD_ACTIVE(state_offset + 3, 0); } |
if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } |
914 |
if (clen > 0) |
if (clen > 0) |
915 |
{ |
{ |
916 |
int category = ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
917 |
int rqdtype = code[2]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
918 |
if ((d == OP_PROP) == |
switch(code[2]) |
919 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
920 |
{ count++; ADD_NEW(state_offset, count); } |
case PT_ANY: |
921 |
|
OK = TRUE; |
922 |
|
break; |
923 |
|
|
924 |
|
case PT_LAMP: |
925 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
926 |
|
break; |
927 |
|
|
928 |
|
case PT_GC: |
929 |
|
OK = category == code[3]; |
930 |
|
break; |
931 |
|
|
932 |
|
case PT_PC: |
933 |
|
OK = chartype == code[3]; |
934 |
|
break; |
935 |
|
|
936 |
|
case PT_SC: |
937 |
|
OK = script == code[3]; |
938 |
|
break; |
939 |
|
|
940 |
|
/* Should never occur, but keep compilers from grumbling. */ |
941 |
|
|
942 |
|
default: |
943 |
|
OK = codevalue != OP_PROP; |
944 |
|
break; |
945 |
|
} |
946 |
|
|
947 |
|
if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); } |
948 |
} |
} |
949 |
break; |
break; |
950 |
|
|
953 |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: |
case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: |
954 |
count = current_state->count; /* Already matched */ |
count = current_state->count; /* Already matched */ |
955 |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } |
956 |
if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
957 |
{ |
{ |
958 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
959 |
int ncount = 0; |
int ncount = 0; |
962 |
int nd; |
int nd; |
963 |
int ndlen = 1; |
int ndlen = 1; |
964 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
965 |
if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
966 |
ncount++; |
ncount++; |
967 |
nptr += ndlen; |
nptr += ndlen; |
968 |
} |
} |
974 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
975 |
case OP_PROP_EXTRA + OP_TYPEQUERY: |
case OP_PROP_EXTRA + OP_TYPEQUERY: |
976 |
case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
case OP_PROP_EXTRA + OP_TYPEMINQUERY: |
977 |
count = 3; |
count = 4; |
978 |
goto QS1; |
goto QS1; |
979 |
|
|
980 |
case OP_PROP_EXTRA + OP_TYPESTAR: |
case OP_PROP_EXTRA + OP_TYPESTAR: |
983 |
|
|
984 |
QS1: |
QS1: |
985 |
|
|
986 |
ADD_ACTIVE(state_offset + 3, 0); |
ADD_ACTIVE(state_offset + 4, 0); |
987 |
if (clen > 0) |
if (clen > 0) |
988 |
{ |
{ |
989 |
int category = ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
990 |
int rqdtype = code[2]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
991 |
if ((d == OP_PROP) == |
switch(code[2]) |
992 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
993 |
{ ADD_NEW(state_offset + count, 0); } |
case PT_ANY: |
994 |
|
OK = TRUE; |
995 |
|
break; |
996 |
|
|
997 |
|
case PT_LAMP: |
998 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
999 |
|
break; |
1000 |
|
|
1001 |
|
case PT_GC: |
1002 |
|
OK = category == code[3]; |
1003 |
|
break; |
1004 |
|
|
1005 |
|
case PT_PC: |
1006 |
|
OK = chartype == code[3]; |
1007 |
|
break; |
1008 |
|
|
1009 |
|
case PT_SC: |
1010 |
|
OK = script == code[3]; |
1011 |
|
break; |
1012 |
|
|
1013 |
|
/* Should never occur, but keep compilers from grumbling. */ |
1014 |
|
|
1015 |
|
default: |
1016 |
|
OK = codevalue != OP_PROP; |
1017 |
|
break; |
1018 |
|
} |
1019 |
|
|
1020 |
|
if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); } |
1021 |
} |
} |
1022 |
break; |
break; |
1023 |
|
|
1034 |
QS2: |
QS2: |
1035 |
|
|
1036 |
ADD_ACTIVE(state_offset + 2, 0); |
ADD_ACTIVE(state_offset + 2, 0); |
1037 |
if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
1038 |
{ |
{ |
1039 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
1040 |
int ncount = 0; |
int ncount = 0; |
1043 |
int nd; |
int nd; |
1044 |
int ndlen = 1; |
int ndlen = 1; |
1045 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
1046 |
if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
1047 |
ncount++; |
ncount++; |
1048 |
nptr += ndlen; |
nptr += ndlen; |
1049 |
} |
} |
1056 |
case OP_PROP_EXTRA + OP_TYPEUPTO: |
case OP_PROP_EXTRA + OP_TYPEUPTO: |
1057 |
case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
case OP_PROP_EXTRA + OP_TYPEMINUPTO: |
1058 |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) |
if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) |
1059 |
{ ADD_ACTIVE(state_offset + 5, 0); } |
{ ADD_ACTIVE(state_offset + 6, 0); } |
1060 |
count = current_state->count; /* Number already matched */ |
count = current_state->count; /* Number already matched */ |
1061 |
if (clen > 0) |
if (clen > 0) |
1062 |
{ |
{ |
1063 |
int category = ucp_findchar(c, &chartype, &othercase); |
BOOL OK; |
1064 |
int rqdtype = code[4]; |
int category = _pcre_ucp_findprop(c, &chartype, &script); |
1065 |
if ((d == OP_PROP) == |
switch(code[4]) |
1066 |
(rqdtype == ((rqdtype >= 128)? (category + 128) : chartype))) |
{ |
1067 |
|
case PT_ANY: |
1068 |
|
OK = TRUE; |
1069 |
|
break; |
1070 |
|
|
1071 |
|
case PT_LAMP: |
1072 |
|
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt; |
1073 |
|
break; |
1074 |
|
|
1075 |
|
case PT_GC: |
1076 |
|
OK = category == code[5]; |
1077 |
|
break; |
1078 |
|
|
1079 |
|
case PT_PC: |
1080 |
|
OK = chartype == code[5]; |
1081 |
|
break; |
1082 |
|
|
1083 |
|
case PT_SC: |
1084 |
|
OK = script == code[5]; |
1085 |
|
break; |
1086 |
|
|
1087 |
|
/* Should never occur, but keep compilers from grumbling. */ |
1088 |
|
|
1089 |
|
default: |
1090 |
|
OK = codevalue != OP_PROP; |
1091 |
|
break; |
1092 |
|
} |
1093 |
|
|
1094 |
|
if (OK == (d == OP_PROP)) |
1095 |
{ |
{ |
1096 |
if (++count >= GET2(code, 1)) |
if (++count >= GET2(code, 1)) |
1097 |
{ ADD_NEW(state_offset + 5, 0); } |
{ ADD_NEW(state_offset + 6, 0); } |
1098 |
else |
else |
1099 |
{ ADD_NEW(state_offset, count); } |
{ ADD_NEW(state_offset, count); } |
1100 |
} |
} |
1108 |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) |
if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) |
1109 |
{ ADD_ACTIVE(state_offset + 4, 0); } |
{ ADD_ACTIVE(state_offset + 4, 0); } |
1110 |
count = current_state->count; /* Number already matched */ |
count = current_state->count; /* Number already matched */ |
1111 |
if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
1112 |
{ |
{ |
1113 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
1114 |
int ncount = 0; |
int ncount = 0; |
1117 |
int nd; |
int nd; |
1118 |
int ndlen = 1; |
int ndlen = 1; |
1119 |
GETCHARLEN(nd, nptr, ndlen); |
GETCHARLEN(nd, nptr, ndlen); |
1120 |
if (ucp_findchar(nd, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break; |
1121 |
ncount++; |
ncount++; |
1122 |
nptr += ndlen; |
nptr += ndlen; |
1123 |
} |
} |
1148 |
{ |
{ |
1149 |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else |
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else |
1150 |
{ |
{ |
1151 |
|
int othercase; |
1152 |
if (c < 128) othercase = fcc[c]; else |
if (c < 128) othercase = fcc[c]; else |
1153 |
|
|
1154 |
/* If we have Unicode property support, we can use it to test the |
/* If we have Unicode property support, we can use it to test the |
1155 |
other case of the character, if there is one. The result of |
other case of the character. */ |
|
ucp_findchar() is < 0 if the char isn't found, and othercase is |
|
|
returned as zero if there isn't another case. */ |
|
1156 |
|
|
1157 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1158 |
if (ucp_findchar(c, &chartype, &othercase) < 0) |
othercase = _pcre_ucp_othercase(c); |
1159 |
|
#else |
1160 |
|
othercase = -1; |
1161 |
#endif |
#endif |
|
othercase = -1; |
|
1162 |
|
|
1163 |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } |
if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } |
1164 |
} |
} |
1180 |
to wait for them to pass before continuing. */ |
to wait for them to pass before continuing. */ |
1181 |
|
|
1182 |
case OP_EXTUNI: |
case OP_EXTUNI: |
1183 |
if (clen > 0 && ucp_findchar(c, &chartype, &othercase) != ucp_M) |
if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M) |
1184 |
{ |
{ |
1185 |
const uschar *nptr = ptr + clen; |
const uschar *nptr = ptr + clen; |
1186 |
int ncount = 0; |
int ncount = 0; |
1188 |
{ |
{ |
1189 |
int nclen = 1; |
int nclen = 1; |
1190 |
GETCHARLEN(c, nptr, nclen); |
GETCHARLEN(c, nptr, nclen); |
1191 |
if (ucp_findchar(c, &chartype, &othercase) != ucp_M) break; |
if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break; |
1192 |
ncount++; |
ncount++; |
1193 |
nptr += nclen; |
nptr += nclen; |
1194 |
} |
} |
1223 |
if ((ims & PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
1224 |
{ |
{ |
1225 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1226 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
1227 |
{ |
{ |
1228 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1229 |
if (ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
1230 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
1231 |
} |
} |
1232 |
else |
else |
1247 |
if (clen > 0) |
if (clen > 0) |
1248 |
{ |
{ |
1249 |
int otherd = -1; |
int otherd = -1; |
1250 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
1251 |
{ |
{ |
1252 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1253 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
1254 |
{ |
{ |
1255 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1256 |
if (ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
1257 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
1258 |
} |
} |
1259 |
else |
else |
1274 |
if (clen > 0) |
if (clen > 0) |
1275 |
{ |
{ |
1276 |
int otherd = -1; |
int otherd = -1; |
1277 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
1278 |
{ |
{ |
1279 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1280 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
1281 |
{ |
{ |
1282 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1283 |
if (ucp_findchar(c, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
1284 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
1285 |
} |
} |
1286 |
else |
else |
1308 |
if ((ims & PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
1309 |
{ |
{ |
1310 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1311 |
if (utf8 && c >= 128) |
if (utf8 && d >= 128) |
1312 |
{ |
{ |
1313 |
#ifdef SUPPORT_UCP |
#ifdef SUPPORT_UCP |
1314 |
if (ucp_findchar(d, &chartype, &otherd) < 0) otherd = -1; |
otherd = _pcre_ucp_othercase(d); |
1315 |
#endif /* SUPPORT_UCP */ |
#endif /* SUPPORT_UCP */ |
1316 |
} |
} |
1317 |
else |
else |
1397 |
{ ADD_ACTIVE(next_state_offset + 5, 0); } |
{ ADD_ACTIVE(next_state_offset + 5, 0); } |
1398 |
if (isinclass) |
if (isinclass) |
1399 |
{ |
{ |
1400 |
if (++count >= GET2(ecode, 3)) |
int max = GET2(ecode, 3); |
1401 |
|
if (++count >= max && max != 0) /* Max 0 => no limit */ |
1402 |
{ ADD_NEW(next_state_offset + 5, 0); } |
{ ADD_NEW(next_state_offset + 5, 0); } |
1403 |
else |
else |
1404 |
{ ADD_NEW(state_offset, count); } |
{ ADD_NEW(state_offset, count); } |
1555 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
1556 |
case OP_ONCE: |
case OP_ONCE: |
1557 |
{ |
{ |
|
const uschar *endcode; |
|
1558 |
int local_offsets[2]; |
int local_offsets[2]; |
1559 |
int local_workspace[1000]; |
int local_workspace[1000]; |
1560 |
|
|
1576 |
const uschar *end_subpattern = code; |
const uschar *end_subpattern = code; |
1577 |
int charcount = local_offsets[1] - local_offsets[0]; |
int charcount = local_offsets[1] - local_offsets[0]; |
1578 |
int next_state_offset, repeat_state_offset; |
int next_state_offset, repeat_state_offset; |
|
BOOL is_repeated; |
|
1579 |
|
|
1580 |
do { end_subpattern += GET(end_subpattern, 1); } |
do { end_subpattern += GET(end_subpattern, 1); } |
1581 |
while (*end_subpattern == OP_ALT); |
while (*end_subpattern == OP_ALT); |
1650 |
cb.version = 1; /* Version 1 of the callout block */ |
cb.version = 1; /* Version 1 of the callout block */ |
1651 |
cb.callout_number = code[1]; |
cb.callout_number = code[1]; |
1652 |
cb.offset_vector = offsets; |
cb.offset_vector = offsets; |
1653 |
cb.subject = (char *)start_subject; |
cb.subject = (PCRE_SPTR)start_subject; |
1654 |
cb.subject_length = end_subject - start_subject; |
cb.subject_length = end_subject - start_subject; |
1655 |
cb.start_match = current_subject - start_subject; |
cb.start_match = current_subject - start_subject; |
1656 |
cb.current_position = ptr - start_subject; |
cb.current_position = ptr - start_subject; |
1698 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
1699 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
1700 |
rlevel*2-2, SP)); |
rlevel*2-2, SP)); |
1701 |
return match_count; |
break; /* In effect, "return", but see the comment below */ |
1702 |
} |
} |
1703 |
|
|
1704 |
/* One or more states are active for the next character. */ |
/* One or more states are active for the next character. */ |
1706 |
ptr += clen; /* Advance to next subject character */ |
ptr += clen; /* Advance to next subject character */ |
1707 |
} /* Loop to move along the subject string */ |
} /* Loop to move along the subject string */ |
1708 |
|
|
1709 |
/* Control never gets here, but we must keep the compiler happy. */ |
/* Control gets here from "break" a few lines above. We do it this way because |
1710 |
|
if we use "return" above, we have compiler trouble. Some compilers warn if |
1711 |
|
there's nothing here because they think the function doesn't return a value. On |
1712 |
|
the other hand, if we put a dummy statement here, some more clever compilers |
1713 |
|
complain that it can't be reached. Sigh. */ |
1714 |
|
|
1715 |
DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n" |
return match_count; |
|
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP)); |
|
|
return PCRE_ERROR_NOMATCH; |
|
1716 |
} |
} |
1717 |
|
|
1718 |
|
|
1744 |
< -1 => some kind of unexpected problem |
< -1 => some kind of unexpected problem |
1745 |
*/ |
*/ |
1746 |
|
|
1747 |
EXPORT int |
PCRE_DATA_SCOPE int |
1748 |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, |
1749 |
const char *subject, int length, int start_offset, int options, int *offsets, |
const char *subject, int length, int start_offset, int options, int *offsets, |
1750 |
int offsetcount, int *workspace, int wscount) |
int offsetcount, int *workspace, int wscount) |
1751 |
{ |
{ |
1752 |
real_pcre *re = (real_pcre *)argument_re; |
real_pcre *re = (real_pcre *)argument_re; |
1753 |
dfa_match_data match_block; |
dfa_match_data match_block; |
1754 |
|
dfa_match_data *md = &match_block; |
1755 |
BOOL utf8, anchored, startline, firstline; |
BOOL utf8, anchored, startline, firstline; |
1756 |
const uschar *current_subject, *end_subject, *lcc; |
const uschar *current_subject, *end_subject, *lcc; |
1757 |
|
|
1766 |
int first_byte = -1; |
int first_byte = -1; |
1767 |
int req_byte = -1; |
int req_byte = -1; |
1768 |
int req_byte2 = -1; |
int req_byte2 = -1; |
1769 |
|
int newline; |
1770 |
|
|
1771 |
/* Plausibility checks */ |
/* Plausibility checks */ |
1772 |
|
|
1781 |
match block, so we must initialize them beforehand. However, the other fields |
match block, so we must initialize them beforehand. However, the other fields |
1782 |
in the match block must not be set until after the byte flipping. */ |
in the match block must not be set until after the byte flipping. */ |
1783 |
|
|
1784 |
match_block.tables = re->tables; |
md->tables = re->tables; |
1785 |
match_block.callout_data = NULL; |
md->callout_data = NULL; |
1786 |
|
|
1787 |
if (extra_data != NULL) |
if (extra_data != NULL) |
1788 |
{ |
{ |
1790 |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) |
1791 |
study = (const pcre_study_data *)extra_data->study_data; |
study = (const pcre_study_data *)extra_data->study_data; |
1792 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; |
if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; |
1793 |
|
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
1794 |
|
return PCRE_ERROR_DFA_UMLIMIT; |
1795 |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
1796 |
match_block.callout_data = extra_data->callout_data; |
md->callout_data = extra_data->callout_data; |
1797 |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
1798 |
match_block.tables = extra_data->tables; |
md->tables = extra_data->tables; |
1799 |
} |
} |
1800 |
|
|
1801 |
/* Check that the first field in the block is the magic number. If it is not, |
/* Check that the first field in the block is the magic number. If it is not, |
1816 |
end_subject = (const unsigned char *)subject + length; |
end_subject = (const unsigned char *)subject + length; |
1817 |
req_byte_ptr = current_subject - 1; |
req_byte_ptr = current_subject - 1; |
1818 |
|
|
1819 |
|
#ifdef SUPPORT_UTF8 |
1820 |
utf8 = (re->options & PCRE_UTF8) != 0; |
utf8 = (re->options & PCRE_UTF8) != 0; |
1821 |
anchored = (options & PCRE_ANCHORED) != 0 || (re->options & PCRE_ANCHORED) != 0; |
#else |
1822 |
|
utf8 = FALSE; |
1823 |
|
#endif |
1824 |
|
|
1825 |
|
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
1826 |
|
(re->options & PCRE_ANCHORED) != 0; |
1827 |
|
|
1828 |
/* The remaining fixed data for passing around. */ |
/* The remaining fixed data for passing around. */ |
1829 |
|
|
1830 |
match_block.start_code = (const uschar *)argument_re + |
md->start_code = (const uschar *)argument_re + |
1831 |
re->name_table_offset + re->name_count * re->name_entry_size; |
re->name_table_offset + re->name_count * re->name_entry_size; |
1832 |
match_block.start_subject = (const unsigned char *)subject; |
md->start_subject = (const unsigned char *)subject; |
1833 |
match_block.end_subject = end_subject; |
md->end_subject = end_subject; |
1834 |
match_block.moptions = options; |
md->moptions = options; |
1835 |
match_block.poptions = re->options; |
md->poptions = re->options; |
1836 |
|
|
1837 |
|
/* Handle different types of newline. The two bits give four cases. If nothing |
1838 |
|
is set at run time, whatever was used at compile time applies. */ |
1839 |
|
|
1840 |
|
switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & |
1841 |
|
PCRE_NEWLINE_CRLF) |
1842 |
|
{ |
1843 |
|
default: newline = NEWLINE; break; /* Compile-time default */ |
1844 |
|
case PCRE_NEWLINE_CR: newline = '\r'; break; |
1845 |
|
case PCRE_NEWLINE_LF: newline = '\n'; break; |
1846 |
|
case PCRE_NEWLINE_CR+ |
1847 |
|
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
1848 |
|
} |
1849 |
|
|
1850 |
|
if (newline > 255) |
1851 |
|
{ |
1852 |
|
md->nllen = 2; |
1853 |
|
md->nl[0] = (newline >> 8) & 255; |
1854 |
|
md->nl[1] = newline & 255; |
1855 |
|
} |
1856 |
|
else |
1857 |
|
{ |
1858 |
|
md->nllen = 1; |
1859 |
|
md->nl[0] = newline; |
1860 |
|
} |
1861 |
|
|
1862 |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing |
1863 |
back the character offset. */ |
back the character offset. */ |
1883 |
is a feature that makes it possible to save compiled regex and re-use them |
is a feature that makes it possible to save compiled regex and re-use them |
1884 |
in other programs later. */ |
in other programs later. */ |
1885 |
|
|
1886 |
if (match_block.tables == NULL) match_block.tables = _pcre_default_tables; |
if (md->tables == NULL) md->tables = _pcre_default_tables; |
1887 |
|
|
1888 |
/* The lower casing table and the "must be at the start of a line" flag are |
/* The lower casing table and the "must be at the start of a line" flag are |
1889 |
used in a loop when finding where to start. */ |
used in a loop when finding where to start. */ |
1890 |
|
|
1891 |
lcc = match_block.tables + lcc_offset; |
lcc = md->tables + lcc_offset; |
1892 |
startline = (re->options & PCRE_STARTLINE) != 0; |
startline = (re->options & PCRE_STARTLINE) != 0; |
1893 |
firstline = (re->options & PCRE_FIRSTLINE) != 0; |
firstline = (re->options & PCRE_FIRSTLINE) != 0; |
1894 |
|
|
1921 |
{ |
{ |
1922 |
req_byte = re->req_byte & 255; |
req_byte = re->req_byte & 255; |
1923 |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
1924 |
req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */ |
req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ |
1925 |
} |
} |
1926 |
|
|
1927 |
/* Call the main matching function, looping for a non-anchored regex after a |
/* Call the main matching function, looping for a non-anchored regex after a |
1939 |
|
|
1940 |
/* Advance to a unique first char if possible. If firstline is TRUE, the |
/* Advance to a unique first char if possible. If firstline is TRUE, the |
1941 |
start of the match is constrained to the first line of a multiline string. |
start of the match is constrained to the first line of a multiline string. |
1942 |
Implement this by temporarily adjusting end_subject so that we stop scanning |
Implement this by temporarily adjusting end_subject so that we stop |
1943 |
at a newline. If the match fails at the newline, later code breaks this loop. |
scanning at a newline. If the match fails at the newline, later code breaks |
1944 |
*/ |
this loop. */ |
1945 |
|
|
1946 |
if (firstline) |
if (firstline) |
1947 |
{ |
{ |
1948 |
const uschar *t = current_subject; |
const uschar *t = current_subject; |
1949 |
while (t < save_end_subject && *t != '\n') t++; |
while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; |
1950 |
end_subject = t; |
end_subject = t; |
1951 |
} |
} |
1952 |
|
|
1961 |
current_subject++; |
current_subject++; |
1962 |
} |
} |
1963 |
|
|
1964 |
/* Or to just after \n for a multiline match if possible */ |
/* Or to just after a linebreak for a multiline match if possible */ |
1965 |
|
|
1966 |
else if (startline) |
else if (startline) |
1967 |
{ |
{ |
1968 |
if (current_subject > match_block.start_subject + start_offset) |
if (current_subject > md->start_subject + md->nllen + |
1969 |
|
start_offset) |
1970 |
{ |
{ |
1971 |
while (current_subject < end_subject && current_subject[-1] != NEWLINE) |
while (current_subject <= end_subject && |
1972 |
|
!IS_NEWLINE(current_subject - md->nllen)) |
1973 |
current_subject++; |
current_subject++; |
1974 |
} |
} |
1975 |
} |
} |
2050 |
/* OK, now we can do the business */ |
/* OK, now we can do the business */ |
2051 |
|
|
2052 |
rc = internal_dfa_exec( |
rc = internal_dfa_exec( |
2053 |
&match_block, /* fixed match data */ |
md, /* fixed match data */ |
2054 |
match_block.start_code, /* this subexpression's code */ |
md->start_code, /* this subexpression's code */ |
2055 |
current_subject, /* where we currently are */ |
current_subject, /* where we currently are */ |
2056 |
start_offset, /* start offset in subject */ |
start_offset, /* start offset in subject */ |
2057 |
offsets, /* offset vector */ |
offsets, /* offset vector */ |
2058 |
offsetcount, /* size of same */ |
offsetcount, /* size of same */ |
2059 |
workspace, /* workspace vector */ |
workspace, /* workspace vector */ |
2060 |
wscount, /* size of same */ |
wscount, /* size of same */ |
2061 |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
2062 |
0, /* function recurse level */ |
0, /* function recurse level */ |
2063 |
0); /* regex recurse level */ |
0); /* regex recurse level */ |
2064 |
|
|
2065 |
/* Anything other than "no match" means we are done, always; otherwise, carry |
/* Anything other than "no match" means we are done, always; otherwise, carry |
2066 |
on only if not anchored. */ |
on only if not anchored. */ |
2070 |
/* Advance to the next subject character unless we are at the end of a line |
/* Advance to the next subject character unless we are at the end of a line |
2071 |
and firstline is set. */ |
and firstline is set. */ |
2072 |
|
|
2073 |
if (firstline && *current_subject == NEWLINE) break; |
if (firstline && |
2074 |
|
current_subject <= end_subject - md->nllen && |
2075 |
|
IS_NEWLINE(current_subject)) break; |
2076 |
current_subject++; |
current_subject++; |
|
|
|
|
#ifdef SUPPORT_UTF8 |
|
2077 |
if (utf8) |
if (utf8) |
2078 |
{ |
{ |
2079 |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
2080 |
current_subject++; |
current_subject++; |
2081 |
} |
} |
|
#endif |
|
|
|
|
2082 |
if (current_subject > end_subject) break; |
if (current_subject > end_subject) break; |
2083 |
} |
} |
2084 |
|
|