43 |
compatible, but it has advantages in certain applications. */ |
compatible, but it has advantages in certain applications. */ |
44 |
|
|
45 |
|
|
46 |
|
#define NLBLOCK md /* The block containing newline information */ |
47 |
#include "pcre_internal.h" |
#include "pcre_internal.h" |
48 |
|
|
49 |
|
|
424 |
for (;;) |
for (;;) |
425 |
{ |
{ |
426 |
int i, j; |
int i, j; |
427 |
int c, d, clen, dlen; |
int clen, dlen; |
428 |
|
unsigned int c, d; |
429 |
|
|
430 |
/* Make the new state list into the active state list and empty the |
/* Make the new state list into the active state list and empty the |
431 |
new state list. */ |
new state list. */ |
649 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
650 |
case OP_CIRC: |
case OP_CIRC: |
651 |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || |
652 |
((ims & PCRE_MULTILINE) != 0 && ptr[-1] == NEWLINE)) |
((ims & PCRE_MULTILINE) != 0 && |
653 |
|
ptr >= start_subject + md->nllen && |
654 |
|
ptr != end_subject && |
655 |
|
IS_NEWLINE(ptr - md->nllen))) |
656 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
657 |
break; |
break; |
658 |
|
|
686 |
|
|
687 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
688 |
case OP_ANY: |
case OP_ANY: |
689 |
if (clen > 0 && (c != NEWLINE || (ims & PCRE_DOTALL) != 0)) |
if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || |
690 |
|
ptr > end_subject - md->nllen || |
691 |
|
!IS_NEWLINE(ptr))) |
692 |
{ ADD_NEW(state_offset + 1, 0); } |
{ ADD_NEW(state_offset + 1, 0); } |
693 |
break; |
break; |
694 |
|
|
695 |
/*-----------------------------------------------------------------*/ |
/*-----------------------------------------------------------------*/ |
696 |
case OP_EODN: |
case OP_EODN: |
697 |
if (clen == 0 || (c == NEWLINE && ptr + 1 == end_subject)) |
if (clen == 0 || |
698 |
|
(ptr == end_subject - md->nllen && IS_NEWLINE(ptr))) |
699 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
700 |
break; |
break; |
701 |
|
|
703 |
case OP_DOLL: |
case OP_DOLL: |
704 |
if ((md->moptions & PCRE_NOTEOL) == 0) |
if ((md->moptions & PCRE_NOTEOL) == 0) |
705 |
{ |
{ |
706 |
if (clen == 0 || (c == NEWLINE && (ptr + 1 == end_subject || |
if (clen == 0 || |
707 |
(ims & PCRE_MULTILINE) != 0))) |
(ptr <= end_subject - md->nllen && IS_NEWLINE(ptr) && |
708 |
|
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) |
709 |
|
)) |
710 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
711 |
} |
} |
712 |
else if (c == NEWLINE && (ims & PCRE_MULTILINE) != 0) |
else if ((ims & PCRE_MULTILINE) != 0 && |
713 |
|
ptr <= end_subject - md->nllen && IS_NEWLINE(ptr)) |
714 |
{ ADD_ACTIVE(state_offset + 1, 0); } |
{ ADD_ACTIVE(state_offset + 1, 0); } |
715 |
break; |
break; |
716 |
|
|
822 |
{ |
{ |
823 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
824 |
(c < 256 && |
(c < 256 && |
825 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
826 |
|
(ims & PCRE_DOTALL) != 0 || |
827 |
|
ptr > end_subject - md->nllen || |
828 |
|
!IS_NEWLINE(ptr) |
829 |
|
) && |
830 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
831 |
{ |
{ |
832 |
count++; |
count++; |
843 |
{ |
{ |
844 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
845 |
(c < 256 && |
(c < 256 && |
846 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
847 |
|
(ims & PCRE_DOTALL) != 0 || |
848 |
|
ptr > end_subject - md->nllen || |
849 |
|
!IS_NEWLINE(ptr) |
850 |
|
) && |
851 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
852 |
{ |
{ |
853 |
ADD_NEW(state_offset + 2, 0); |
ADD_NEW(state_offset + 2, 0); |
863 |
{ |
{ |
864 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
865 |
(c < 256 && |
(c < 256 && |
866 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
867 |
|
(ims & PCRE_DOTALL) != 0 || |
868 |
|
ptr > end_subject - md->nllen || |
869 |
|
!IS_NEWLINE(ptr) |
870 |
|
) && |
871 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
872 |
{ |
{ |
873 |
ADD_NEW(state_offset, 0); |
ADD_NEW(state_offset, 0); |
886 |
{ |
{ |
887 |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || |
888 |
(c < 256 && |
(c < 256 && |
889 |
(d != OP_ANY || c != '\n' || (ims & PCRE_DOTALL) != 0) && |
(d != OP_ANY || |
890 |
|
(ims & PCRE_DOTALL) != 0 || |
891 |
|
ptr > end_subject - md->nllen || |
892 |
|
!IS_NEWLINE(ptr) |
893 |
|
) && |
894 |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) |
895 |
{ |
{ |
896 |
if (++count >= GET2(code, 1)) |
if (++count >= GET2(code, 1)) |
1247 |
if (clen > 0) |
if (clen > 0) |
1248 |
{ |
{ |
1249 |
int otherd = -1; |
int otherd = -1; |
1250 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
1251 |
{ |
{ |
1252 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1253 |
if (utf8 && d >= 128) |
if (utf8 && d >= 128) |
1274 |
if (clen > 0) |
if (clen > 0) |
1275 |
{ |
{ |
1276 |
int otherd = -1; |
int otherd = -1; |
1277 |
if ((ims && PCRE_CASELESS) != 0) |
if ((ims & PCRE_CASELESS) != 0) |
1278 |
{ |
{ |
1279 |
#ifdef SUPPORT_UTF8 |
#ifdef SUPPORT_UTF8 |
1280 |
if (utf8 && d >= 128) |
if (utf8 && d >= 128) |
1397 |
{ ADD_ACTIVE(next_state_offset + 5, 0); } |
{ ADD_ACTIVE(next_state_offset + 5, 0); } |
1398 |
if (isinclass) |
if (isinclass) |
1399 |
{ |
{ |
1400 |
if (++count >= GET2(ecode, 3)) |
int max = GET2(ecode, 3); |
1401 |
|
if (++count >= max && max != 0) /* Max 0 => no limit */ |
1402 |
{ ADD_NEW(next_state_offset + 5, 0); } |
{ ADD_NEW(next_state_offset + 5, 0); } |
1403 |
else |
else |
1404 |
{ ADD_NEW(state_offset, count); } |
{ ADD_NEW(state_offset, count); } |
1698 |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" |
1699 |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, |
1700 |
rlevel*2-2, SP)); |
rlevel*2-2, SP)); |
1701 |
return match_count; |
break; /* In effect, "return", but see the comment below */ |
1702 |
} |
} |
1703 |
|
|
1704 |
/* One or more states are active for the next character. */ |
/* One or more states are active for the next character. */ |
1706 |
ptr += clen; /* Advance to next subject character */ |
ptr += clen; /* Advance to next subject character */ |
1707 |
} /* Loop to move along the subject string */ |
} /* Loop to move along the subject string */ |
1708 |
|
|
1709 |
/* Control never gets here, but we must keep the compiler happy. */ |
/* Control gets here from "break" a few lines above. We do it this way because |
1710 |
|
if we use "return" above, we have compiler trouble. Some compilers warn if |
1711 |
|
there's nothing here because they think the function doesn't return a value. On |
1712 |
|
the other hand, if we put a dummy statement here, some more clever compilers |
1713 |
|
complain that it can't be reached. Sigh. */ |
1714 |
|
|
1715 |
DPRINTF(("%.*s+++ Unexpected end of internal_dfa_exec %d +++\n" |
return match_count; |
|
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, rlevel*2-2, SP)); |
|
|
return PCRE_ERROR_NOMATCH; |
|
1716 |
} |
} |
1717 |
|
|
1718 |
|
|
1751 |
{ |
{ |
1752 |
real_pcre *re = (real_pcre *)argument_re; |
real_pcre *re = (real_pcre *)argument_re; |
1753 |
dfa_match_data match_block; |
dfa_match_data match_block; |
1754 |
|
dfa_match_data *md = &match_block; |
1755 |
BOOL utf8, anchored, startline, firstline; |
BOOL utf8, anchored, startline, firstline; |
1756 |
const uschar *current_subject, *end_subject, *lcc; |
const uschar *current_subject, *end_subject, *lcc; |
1757 |
|
|
1766 |
int first_byte = -1; |
int first_byte = -1; |
1767 |
int req_byte = -1; |
int req_byte = -1; |
1768 |
int req_byte2 = -1; |
int req_byte2 = -1; |
1769 |
|
int newline; |
1770 |
|
|
1771 |
/* Plausibility checks */ |
/* Plausibility checks */ |
1772 |
|
|
1781 |
match block, so we must initialize them beforehand. However, the other fields |
match block, so we must initialize them beforehand. However, the other fields |
1782 |
in the match block must not be set until after the byte flipping. */ |
in the match block must not be set until after the byte flipping. */ |
1783 |
|
|
1784 |
match_block.tables = re->tables; |
md->tables = re->tables; |
1785 |
match_block.callout_data = NULL; |
md->callout_data = NULL; |
1786 |
|
|
1787 |
if (extra_data != NULL) |
if (extra_data != NULL) |
1788 |
{ |
{ |
1793 |
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) |
1794 |
return PCRE_ERROR_DFA_UMLIMIT; |
return PCRE_ERROR_DFA_UMLIMIT; |
1795 |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) |
1796 |
match_block.callout_data = extra_data->callout_data; |
md->callout_data = extra_data->callout_data; |
1797 |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
if ((flags & PCRE_EXTRA_TABLES) != 0) |
1798 |
match_block.tables = extra_data->tables; |
md->tables = extra_data->tables; |
1799 |
} |
} |
1800 |
|
|
1801 |
/* Check that the first field in the block is the magic number. If it is not, |
/* Check that the first field in the block is the magic number. If it is not, |
1816 |
end_subject = (const unsigned char *)subject + length; |
end_subject = (const unsigned char *)subject + length; |
1817 |
req_byte_ptr = current_subject - 1; |
req_byte_ptr = current_subject - 1; |
1818 |
|
|
1819 |
|
#ifdef SUPPORT_UTF8 |
1820 |
utf8 = (re->options & PCRE_UTF8) != 0; |
utf8 = (re->options & PCRE_UTF8) != 0; |
1821 |
|
#else |
1822 |
|
utf8 = FALSE; |
1823 |
|
#endif |
1824 |
|
|
1825 |
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || |
1826 |
(re->options & PCRE_ANCHORED) != 0; |
(re->options & PCRE_ANCHORED) != 0; |
1827 |
|
|
1828 |
/* The remaining fixed data for passing around. */ |
/* The remaining fixed data for passing around. */ |
1829 |
|
|
1830 |
match_block.start_code = (const uschar *)argument_re + |
md->start_code = (const uschar *)argument_re + |
1831 |
re->name_table_offset + re->name_count * re->name_entry_size; |
re->name_table_offset + re->name_count * re->name_entry_size; |
1832 |
match_block.start_subject = (const unsigned char *)subject; |
md->start_subject = (const unsigned char *)subject; |
1833 |
match_block.end_subject = end_subject; |
md->end_subject = end_subject; |
1834 |
match_block.moptions = options; |
md->moptions = options; |
1835 |
match_block.poptions = re->options; |
md->poptions = re->options; |
1836 |
|
|
1837 |
|
/* Handle different types of newline. The two bits give four cases. If nothing |
1838 |
|
is set at run time, whatever was used at compile time applies. */ |
1839 |
|
|
1840 |
|
switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) & |
1841 |
|
PCRE_NEWLINE_CRLF) |
1842 |
|
{ |
1843 |
|
default: newline = NEWLINE; break; /* Compile-time default */ |
1844 |
|
case PCRE_NEWLINE_CR: newline = '\r'; break; |
1845 |
|
case PCRE_NEWLINE_LF: newline = '\n'; break; |
1846 |
|
case PCRE_NEWLINE_CR+ |
1847 |
|
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; |
1848 |
|
} |
1849 |
|
|
1850 |
|
if (newline > 255) |
1851 |
|
{ |
1852 |
|
md->nllen = 2; |
1853 |
|
md->nl[0] = (newline >> 8) & 255; |
1854 |
|
md->nl[1] = newline & 255; |
1855 |
|
} |
1856 |
|
else |
1857 |
|
{ |
1858 |
|
md->nllen = 1; |
1859 |
|
md->nl[0] = newline; |
1860 |
|
} |
1861 |
|
|
1862 |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing |
/* Check a UTF-8 string if required. Unfortunately there's no way of passing |
1863 |
back the character offset. */ |
back the character offset. */ |
1883 |
is a feature that makes it possible to save compiled regex and re-use them |
is a feature that makes it possible to save compiled regex and re-use them |
1884 |
in other programs later. */ |
in other programs later. */ |
1885 |
|
|
1886 |
if (match_block.tables == NULL) match_block.tables = _pcre_default_tables; |
if (md->tables == NULL) md->tables = _pcre_default_tables; |
1887 |
|
|
1888 |
/* The lower casing table and the "must be at the start of a line" flag are |
/* The lower casing table and the "must be at the start of a line" flag are |
1889 |
used in a loop when finding where to start. */ |
used in a loop when finding where to start. */ |
1890 |
|
|
1891 |
lcc = match_block.tables + lcc_offset; |
lcc = md->tables + lcc_offset; |
1892 |
startline = (re->options & PCRE_STARTLINE) != 0; |
startline = (re->options & PCRE_STARTLINE) != 0; |
1893 |
firstline = (re->options & PCRE_FIRSTLINE) != 0; |
firstline = (re->options & PCRE_FIRSTLINE) != 0; |
1894 |
|
|
1921 |
{ |
{ |
1922 |
req_byte = re->req_byte & 255; |
req_byte = re->req_byte & 255; |
1923 |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; |
1924 |
req_byte2 = (match_block.tables + fcc_offset)[req_byte]; /* case flipped */ |
req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ |
1925 |
} |
} |
1926 |
|
|
1927 |
/* Call the main matching function, looping for a non-anchored regex after a |
/* Call the main matching function, looping for a non-anchored regex after a |
1946 |
if (firstline) |
if (firstline) |
1947 |
{ |
{ |
1948 |
const uschar *t = current_subject; |
const uschar *t = current_subject; |
1949 |
while (t < save_end_subject && *t != '\n') t++; |
while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++; |
1950 |
end_subject = t; |
end_subject = t; |
1951 |
} |
} |
1952 |
|
|
1961 |
current_subject++; |
current_subject++; |
1962 |
} |
} |
1963 |
|
|
1964 |
/* Or to just after \n for a multiline match if possible */ |
/* Or to just after a linebreak for a multiline match if possible */ |
1965 |
|
|
1966 |
else if (startline) |
else if (startline) |
1967 |
{ |
{ |
1968 |
if (current_subject > match_block.start_subject + start_offset) |
if (current_subject > md->start_subject + md->nllen + |
1969 |
|
start_offset) |
1970 |
{ |
{ |
1971 |
while (current_subject < end_subject && current_subject[-1] != NEWLINE) |
while (current_subject <= end_subject && |
1972 |
|
!IS_NEWLINE(current_subject - md->nllen)) |
1973 |
current_subject++; |
current_subject++; |
1974 |
} |
} |
1975 |
} |
} |
2050 |
/* OK, now we can do the business */ |
/* OK, now we can do the business */ |
2051 |
|
|
2052 |
rc = internal_dfa_exec( |
rc = internal_dfa_exec( |
2053 |
&match_block, /* fixed match data */ |
md, /* fixed match data */ |
2054 |
match_block.start_code, /* this subexpression's code */ |
md->start_code, /* this subexpression's code */ |
2055 |
current_subject, /* where we currently are */ |
current_subject, /* where we currently are */ |
2056 |
start_offset, /* start offset in subject */ |
start_offset, /* start offset in subject */ |
2057 |
offsets, /* offset vector */ |
offsets, /* offset vector */ |
2058 |
offsetcount, /* size of same */ |
offsetcount, /* size of same */ |
2059 |
workspace, /* workspace vector */ |
workspace, /* workspace vector */ |
2060 |
wscount, /* size of same */ |
wscount, /* size of same */ |
2061 |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ |
2062 |
0, /* function recurse level */ |
0, /* function recurse level */ |
2063 |
0); /* regex recurse level */ |
0); /* regex recurse level */ |
2064 |
|
|
2065 |
/* Anything other than "no match" means we are done, always; otherwise, carry |
/* Anything other than "no match" means we are done, always; otherwise, carry |
2066 |
on only if not anchored. */ |
on only if not anchored. */ |
2070 |
/* Advance to the next subject character unless we are at the end of a line |
/* Advance to the next subject character unless we are at the end of a line |
2071 |
and firstline is set. */ |
and firstline is set. */ |
2072 |
|
|
2073 |
if (firstline && *current_subject == NEWLINE) break; |
if (firstline && |
2074 |
|
current_subject <= end_subject - md->nllen && |
2075 |
|
IS_NEWLINE(current_subject)) break; |
2076 |
current_subject++; |
current_subject++; |
|
|
|
|
#ifdef SUPPORT_UTF8 |
|
2077 |
if (utf8) |
if (utf8) |
2078 |
{ |
{ |
2079 |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) |
2080 |
current_subject++; |
current_subject++; |
2081 |
} |
} |
|
#endif |
|
|
|
|
2082 |
if (current_subject > end_subject) break; |
if (current_subject > end_subject) break; |
2083 |
} |
} |
2084 |
|
|