5121 |
} |
} |
5122 |
else |
else |
5123 |
if (!startline && study != NULL && |
if (!startline && study != NULL && |
5124 |
(study->options & PCRE_STUDY_MAPPED) != 0) |
(study->flags & PCRE_STUDY_MAPPED) != 0) |
5125 |
start_bits = study->start_bits; |
start_bits = study->start_bits; |
5126 |
} |
} |
5127 |
|
|
5247 |
/* Restore fudged end_subject */ |
/* Restore fudged end_subject */ |
5248 |
|
|
5249 |
end_subject = save_end_subject; |
end_subject = save_end_subject; |
5250 |
|
|
5251 |
#ifdef DEBUG /* Sigh. Some compilers never learn. */ |
/* The following two optimizations are disabled for partial matching or if |
|
printf(">>>> Match against: "); |
|
|
pchars(start_match, end_subject - start_match, TRUE, md); |
|
|
printf("\n"); |
|
|
#endif |
|
|
|
|
|
/* If req_byte is set, we know that that character must appear in the |
|
|
subject for the match to succeed. If the first character is set, req_byte |
|
|
must be later in the subject; otherwise the test starts at the match point. |
|
|
This optimization can save a huge amount of backtracking in patterns with |
|
|
nested unlimited repeats that aren't going to match. Writing separate code |
|
|
for cased/caseless versions makes it go faster, as does using an |
|
|
autoincrement and backing off on a match. |
|
|
|
|
|
HOWEVER: when the subject string is very, very long, searching to its end |
|
|
can take a long time, and give bad performance on quite ordinary patterns. |
|
|
This showed up when somebody was matching something like /^\d+C/ on a |
|
|
32-megabyte string... so we don't do this when the string is sufficiently |
|
|
long. |
|
|
|
|
|
ALSO: this processing is disabled when partial matching is requested, or if |
|
5252 |
disabling is explicitly requested. */ |
disabling is explicitly requested. */ |
5253 |
|
|
5254 |
if ((options & PCRE_NO_START_OPTIMIZE) == 0 && |
if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial) |
5255 |
req_byte >= 0 && |
{ |
5256 |
end_subject - start_match < REQ_BYTE_MAX && |
/* If the pattern was studied, a minimum subject length may be set. This is |
5257 |
!md->partial) |
a lower bound; no actual string of that length may actually match the |
5258 |
{ |
pattern. Although the value is, strictly, in characters, we treat it as |
5259 |
register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); |
bytes to avoid spending too much time in this optimization. */ |
5260 |
|
|
5261 |
/* We don't need to repeat the search if we haven't yet reached the |
if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 && |
5262 |
place we found it at last time. */ |
end_subject - start_match < study->minlength) |
|
|
|
|
if (p > req_byte_ptr) |
|
5263 |
{ |
{ |
5264 |
if (req_byte_caseless) |
rc = MATCH_NOMATCH; |
5265 |
|
break; |
5266 |
|
} |
5267 |
|
|
5268 |
|
/* If req_byte is set, we know that that character must appear in the |
5269 |
|
subject for the match to succeed. If the first character is set, req_byte |
5270 |
|
must be later in the subject; otherwise the test starts at the match point. |
5271 |
|
This optimization can save a huge amount of backtracking in patterns with |
5272 |
|
nested unlimited repeats that aren't going to match. Writing separate code |
5273 |
|
for cased/caseless versions makes it go faster, as does using an |
5274 |
|
autoincrement and backing off on a match. |
5275 |
|
|
5276 |
|
HOWEVER: when the subject string is very, very long, searching to its end |
5277 |
|
can take a long time, and give bad performance on quite ordinary patterns. |
5278 |
|
This showed up when somebody was matching something like /^\d+C/ on a |
5279 |
|
32-megabyte string... so we don't do this when the string is sufficiently |
5280 |
|
long. */ |
5281 |
|
|
5282 |
|
if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX) |
5283 |
|
{ |
5284 |
|
register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); |
5285 |
|
|
5286 |
|
/* We don't need to repeat the search if we haven't yet reached the |
5287 |
|
place we found it at last time. */ |
5288 |
|
|
5289 |
|
if (p > req_byte_ptr) |
5290 |
{ |
{ |
5291 |
while (p < end_subject) |
if (req_byte_caseless) |
5292 |
{ |
{ |
5293 |
register int pp = *p++; |
while (p < end_subject) |
5294 |
if (pp == req_byte || pp == req_byte2) { p--; break; } |
{ |
5295 |
|
register int pp = *p++; |
5296 |
|
if (pp == req_byte || pp == req_byte2) { p--; break; } |
5297 |
|
} |
5298 |
} |
} |
5299 |
} |
else |
|
else |
|
|
{ |
|
|
while (p < end_subject) |
|
5300 |
{ |
{ |
5301 |
if (*p++ == req_byte) { p--; break; } |
while (p < end_subject) |
5302 |
|
{ |
5303 |
|
if (*p++ == req_byte) { p--; break; } |
5304 |
|
} |
5305 |
} |
} |
5306 |
|
|
5307 |
|
/* If we can't find the required character, break the matching loop, |
5308 |
|
forcing a match failure. */ |
5309 |
|
|
5310 |
|
if (p >= end_subject) |
5311 |
|
{ |
5312 |
|
rc = MATCH_NOMATCH; |
5313 |
|
break; |
5314 |
|
} |
5315 |
|
|
5316 |
|
/* If we have found the required character, save the point where we |
5317 |
|
found it, so that we don't search again next time round the loop if |
5318 |
|
the start hasn't passed this character yet. */ |
5319 |
|
|
5320 |
|
req_byte_ptr = p; |
5321 |
} |
} |
|
|
|
|
/* If we can't find the required character, break the matching loop, |
|
|
forcing a match failure. */ |
|
|
|
|
|
if (p >= end_subject) |
|
|
{ |
|
|
rc = MATCH_NOMATCH; |
|
|
break; |
|
|
} |
|
|
|
|
|
/* If we have found the required character, save the point where we |
|
|
found it, so that we don't search again next time round the loop if |
|
|
the start hasn't passed this character yet. */ |
|
|
|
|
|
req_byte_ptr = p; |
|
5322 |
} |
} |
5323 |
} |
} |
5324 |
|
|
5325 |
|
#ifdef DEBUG /* Sigh. Some compilers never learn. */ |
5326 |
|
printf(">>>> Match against: "); |
5327 |
|
pchars(start_match, end_subject - start_match, TRUE, md); |
5328 |
|
printf("\n"); |
5329 |
|
#endif |
5330 |
|
|
5331 |
/* OK, we can now run the match. If "hitend" is set afterwards, remember the |
/* OK, we can now run the match. If "hitend" is set afterwards, remember the |
5332 |
first starting point for which a partial match was found. */ |
first starting point for which a partial match was found. */ |