13 |
.B const unsigned char *\fItableptr\fR); |
.B const unsigned char *\fItableptr\fR); |
14 |
.PP |
.PP |
15 |
.br |
.br |
|
.B const unsigned char *pcre_maketables(void); |
|
|
.PP |
|
|
.br |
|
16 |
.B pcre_extra *pcre_study(const pcre *\fIcode\fR, int \fIoptions\fR, |
.B pcre_extra *pcre_study(const pcre *\fIcode\fR, int \fIoptions\fR, |
17 |
.ti +5n |
.ti +5n |
18 |
.B const char **\fIerrptr\fR); |
.B const char **\fIerrptr\fR); |
20 |
.br |
.br |
21 |
.B int pcre_exec(const pcre *\fIcode\fR, "const pcre_extra *\fIextra\fR," |
.B int pcre_exec(const pcre *\fIcode\fR, "const pcre_extra *\fIextra\fR," |
22 |
.ti +5n |
.ti +5n |
23 |
.B "const char *\fIsubject\fR," int \fIlength\fR, int \fIoptions\fR, |
.B "const char *\fIsubject\fR," int \fIlength\fR, int \fIstartoffset\fR, |
24 |
|
.ti +5n |
25 |
|
.B int \fIoptions\fR, int *\fIovector\fR, int \fIovecsize\fR); |
26 |
|
.PP |
27 |
|
.br |
28 |
|
.B int pcre_copy_substring(const char *\fIsubject\fR, int *\fIovector\fR, |
29 |
|
.ti +5n |
30 |
|
.B int \fIstringcount\fR, int \fIstringnumber\fR, char *\fIbuffer\fR, |
31 |
|
.ti +5n |
32 |
|
.B int \fIbuffersize\fR); |
33 |
|
.PP |
34 |
|
.br |
35 |
|
.B int pcre_get_substring(const char *\fIsubject\fR, int *\fIovector\fR, |
36 |
|
.ti +5n |
37 |
|
.B int \fIstringcount\fR, int \fIstringnumber\fR, |
38 |
|
.ti +5n |
39 |
|
.B const char **\fIstringptr\fR); |
40 |
|
.PP |
41 |
|
.br |
42 |
|
.B int pcre_get_substring_list(const char *\fIsubject\fR, |
43 |
.ti +5n |
.ti +5n |
44 |
.B int *\fIovector\fR, int \fIovecsize\fR); |
.B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);" |
45 |
|
.PP |
46 |
|
.br |
47 |
|
.B const unsigned char *pcre_maketables(void); |
48 |
.PP |
.PP |
49 |
.br |
.br |
50 |
.B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int |
.B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int |
70 |
a set of wrapper functions that correspond to the POSIX API. See |
a set of wrapper functions that correspond to the POSIX API. See |
71 |
\fBpcreposix (3)\fR. |
\fBpcreposix (3)\fR. |
72 |
|
|
73 |
The three functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and |
The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR |
74 |
\fBpcre_exec()\fR are used for compiling and matching regular expressions. The |
are used for compiling and matching regular expressions, while |
75 |
function \fBpcre_maketables()\fR is used (optionally) to build a set of |
\fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and |
76 |
character tables in the current locale for passing to \fBpcre_compile()\fR. |
\fBpcre_get_substring_list()\fR are convenience functions for extracting |
77 |
|
captured substrings from a matched subject string. The function |
78 |
|
\fBpcre_maketables()\fR is used (optionally) to build a set of character tables |
79 |
|
in the current locale for passing to \fBpcre_compile()\fR. |
80 |
|
|
81 |
The function \fBpcre_info()\fR is used to find out information about a compiled |
The function \fBpcre_info()\fR is used to find out information about a compiled |
82 |
pattern, while the function \fBpcre_version()\fR returns a pointer to a string |
pattern, while the function \fBpcre_version()\fR returns a pointer to a string |
249 |
The tables are built in memory that is obtained via \fBpcre_malloc\fR. The |
The tables are built in memory that is obtained via \fBpcre_malloc\fR. The |
250 |
pointer that is passed to \fBpcre_compile\fR is saved with the compiled |
pointer that is passed to \fBpcre_compile\fR is saved with the compiled |
251 |
pattern, and the same tables are used via this pointer by \fBpcre_study()\fR |
pattern, and the same tables are used via this pointer by \fBpcre_study()\fR |
252 |
and \fBpcre_exec()\fR. Thus for any single pattern, compilation, studying and |
and \fBpcre_exec()\fR. Thus for any single pattern, compilation, studying and |
253 |
matching all happen in the same locale, but different patterns can be compiled |
matching all happen in the same locale, but different patterns can be compiled |
254 |
in different locales. It is the caller's responsibility to ensure that the |
in different locales. It is the caller's responsibility to ensure that the |
255 |
memory containing the tables remains available for as long as it is needed. |
memory containing the tables remains available for as long as it is needed. |
256 |
|
|
257 |
|
|
258 |
|
.SH INFORMATION ABOUT A PATTERN |
259 |
|
The \fBpcre_info()\fR function returns information about a compiled pattern. |
260 |
|
Its yield is the number of capturing subpatterns, or one of the following |
261 |
|
negative numbers: |
262 |
|
|
263 |
|
PCRE_ERROR_NULL the argument \fIcode\fR was NULL |
264 |
|
PCRE_ERROR_BADMAGIC the "magic number" was not found |
265 |
|
|
266 |
|
If the \fIoptptr\fR argument is not NULL, a copy of the options with which the |
267 |
|
pattern was compiled is placed in the integer it points to. These option bits |
268 |
|
are those specified in the call to \fBpcre_compile()\fR, modified by any |
269 |
|
top-level option settings within the pattern itself, and with the PCRE_ANCHORED |
270 |
|
bit set if the form of the pattern implies that it can match only at the start |
271 |
|
of a subject string. |
272 |
|
|
273 |
|
If the pattern is not anchored and the \fIfirstcharptr\fR argument is not NULL, |
274 |
|
it is used to pass back information about the first character of any matched |
275 |
|
string. If there is a fixed first character, e.g. from a pattern such as |
276 |
|
(cat|cow|coyote), then it is returned in the integer pointed to by |
277 |
|
\fIfirstcharptr\fR. Otherwise, if either |
278 |
|
|
279 |
|
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch |
280 |
|
starts with "^", or |
281 |
|
|
282 |
|
(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set |
283 |
|
(if it were set, the pattern would be anchored), |
284 |
|
|
285 |
|
then -1 is returned, indicating that the pattern matches only at the |
286 |
|
start of a subject string or after any "\\n" within the string. Otherwise -2 is |
287 |
|
returned. |
288 |
|
|
289 |
|
|
290 |
.SH MATCHING A PATTERN |
.SH MATCHING A PATTERN |
291 |
The function \fBpcre_exec()\fR is called to match a subject string against a |
The function \fBpcre_exec()\fR is called to match a subject string against a |
292 |
pre-compiled pattern, which is passed in the \fIcode\fR argument. If the |
pre-compiled pattern, which is passed in the \fIcode\fR argument. If the |
293 |
pattern has been studied, the result of the study should be passed in the |
pattern has been studied, the result of the study should be passed in the |
294 |
\fIextra\fR argument. Otherwise this must be NULL. |
\fIextra\fR argument. Otherwise this must be NULL. |
295 |
|
|
|
The subject string is passed as a pointer in \fIsubject\fR and a length in |
|
|
\fIlength\fR. Unlike the pattern string, it may contain binary zero characters. |
|
|
|
|
296 |
The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose |
The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose |
297 |
unused bits must be zero. However, if a pattern was compiled with |
unused bits must be zero. However, if a pattern was compiled with |
298 |
PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it |
PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it |
313 |
it. Setting this without PCRE_MULTILINE (at compile time) causes dollar never |
it. Setting this without PCRE_MULTILINE (at compile time) causes dollar never |
314 |
to match. |
to match. |
315 |
|
|
316 |
|
The subject string is passed as a pointer in \fIsubject\fR, a length in |
317 |
|
\fIlength\fR, and a starting offset in \fIstartoffset\fR. Unlike the pattern |
318 |
|
string, it may contain binary zero characters. When the starting offset is |
319 |
|
zero, the search for a match starts at the beginning of the subject, and this |
320 |
|
is by far the most common case. |
321 |
|
|
322 |
|
A non-zero starting offset is useful when searching for another match in the |
323 |
|
same subject by calling \fBpcre_exec()\fR again after a previous success. |
324 |
|
Setting \fIstartoffset\fR differs from just passing over a shortened string and |
325 |
|
setting PCRE_NOTBOL in the case of a pattern that begins with any kind of |
326 |
|
lookbehind. For example, consider the pattern |
327 |
|
|
328 |
|
\\Biss\\B |
329 |
|
|
330 |
|
which finds occurrences of "iss" in the middle of words. (\\B matches only if |
331 |
|
the current position in the subject is not a word boundary.) When applied to |
332 |
|
the string "Mississippi" the first call to \fBpcre_exec()\fR finds the first |
333 |
|
occurrence. If \fBpcre_exec()\fR is called again with just the remainder of the |
334 |
|
subject, namely "issippi", it does not match, because \\B is always false at the |
335 |
|
start of the subject, which is deemed to be a word boundary. However, if |
336 |
|
\fBpcre_exec()\fR is passed the entire string again, but with \fIstartoffset\fR |
337 |
|
set to 4, it finds the second occurrence of "iss" because it is able to look |
338 |
|
behind the starting point to discover that it is preceded by a letter. |
339 |
|
|
340 |
|
If a non-zero starting offset is passed when the pattern is anchored, one |
341 |
|
attempt to match at the given offset is tried. This can only succeed if the |
342 |
|
pattern does not require the match to be at the start of the subject. |
343 |
|
|
344 |
In general, a pattern matches a certain portion of the subject, and in |
In general, a pattern matches a certain portion of the subject, and in |
345 |
addition, further substrings from the subject may be picked out by parts of the |
addition, further substrings from the subject may be picked out by parts of the |
346 |
pattern. Following the usage in Jeffrey Friedl's book, this is called |
pattern. Following the usage in Jeffrey Friedl's book, this is called |
369 |
subpatterns, the return value from a successful match is 1, indicating that |
subpatterns, the return value from a successful match is 1, indicating that |
370 |
just the first pair of offsets has been set. |
just the first pair of offsets has been set. |
371 |
|
|
372 |
|
Some convenience functions are provided for extracting the captured substrings |
373 |
|
as separate strings. These are described in the following section. |
374 |
|
|
375 |
It is possible for a capturing subpattern number \fIn+1\fR to match some |
It is possible for a capturing subpattern number \fIn+1\fR to match some |
376 |
part of the subject when subpattern \fIn\fR has not been used at all. For |
part of the subject when subpattern \fIn\fR has not been used at all. For |
377 |
example, if the string "abc" is matched against the pattern (a|(z))(bc) |
example, if the string "abc" is matched against the pattern (a|(z))(bc) |
432 |
the end of matching. |
the end of matching. |
433 |
|
|
434 |
|
|
435 |
.SH INFORMATION ABOUT A PATTERN |
.SH EXTRACTING CAPTURED SUBSTRINGS |
436 |
The \fBpcre_info()\fR function returns information about a compiled pattern. |
Captured substrings can be accessed directly by using the offsets returned by |
437 |
Its yield is the number of capturing subpatterns, or one of the following |
\fBpcre_exec()\fR in \fIovector\fR. For convenience, the functions |
438 |
negative numbers: |
\fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and |
439 |
|
\fBpcre_get_substring_list()\fR are provided for extracting captured substrings |
440 |
|
as new, separate, zero-terminated strings. A substring that contains a binary |
441 |
|
zero is correctly extracted and has a further zero added on the end, but the |
442 |
|
result does not, of course, function as a C string. |
443 |
|
|
444 |
|
The first three arguments are the same for all three functions: \fIsubject\fR |
445 |
|
is the subject string which has just been successfully matched, \fIovector\fR |
446 |
|
is a pointer to the vector of integer offsets that was passed to |
447 |
|
\fBpcre_exec()\fR, and \fIstringcount\fR is the number of substrings that |
448 |
|
were captured by the match, including the substring that matched the entire |
449 |
|
regular expression. This is the value returned by \fBpcre_exec\fR if it |
450 |
|
is greater than zero. If \fBpcre_exec()\fR returned zero, indicating that it |
451 |
|
ran out of space in \fIovector\fR, then the value passed as |
452 |
|
\fIstringcount\fR should be the size of the vector divided by three. |
453 |
|
|
454 |
|
The functions \fBpcre_copy_substring()\fR and \fBpcre_get_substring()\fR |
455 |
|
extract a single substring, whose number is given as \fIstringnumber\fR. A |
456 |
|
value of zero extracts the substring that matched the entire pattern, while |
457 |
|
higher values extract the captured substrings. For \fBpcre_copy_substring()\fR, |
458 |
|
the string is placed in \fIbuffer\fR, whose length is given by |
459 |
|
\fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of store is |
460 |
|
obtained via \fBpcre_malloc\fR, and its address is returned via |
461 |
|
\fIstringptr\fR. The yield of the function is the length of the string, not |
462 |
|
including the terminating zero, or one of |
463 |
|
|
464 |
PCRE_ERROR_NULL the argument \fIcode\fR was NULL |
PCRE_ERROR_NOMEMORY (-6) |
|
PCRE_ERROR_BADMAGIC the "magic number" was not found |
|
465 |
|
|
466 |
If the \fIoptptr\fR argument is not NULL, a copy of the options with which the |
The buffer was too small for \fBpcre_copy_substring()\fR, or the attempt to get |
467 |
pattern was compiled is placed in the integer it points to. |
memory failed for \fBpcre_get_substring()\fR. |
468 |
|
|
469 |
|
PCRE_ERROR_NOSUBSTRING (-7) |
470 |
|
|
471 |
|
There is no substring whose number is \fIstringnumber\fR. |
472 |
|
|
473 |
|
The \fBpcre_get_substring_list()\fR function extracts all available substrings |
474 |
|
and builds a list of pointers to them. All this is done in a single block of |
475 |
|
memory which is obtained via \fBpcre_malloc\fR. The address of the memory block |
476 |
|
is returned via \fIlistptr\fR, which is also the start of the list of string |
477 |
|
pointers. The end of the list is marked by a NULL pointer. The yield of the |
478 |
|
function is zero if all went well, or |
479 |
|
|
480 |
|
PCRE_ERROR_NOMEMORY (-6) |
481 |
|
|
482 |
|
if the attempt to get the memory block failed. |
483 |
|
|
484 |
|
When any of these functions encounters a substring that is unset, which can |
485 |
|
happen when capturing subpattern number \fIn+1\fR matches some part of the |
486 |
|
subject, but subpattern \fIn\fR has not been used at all, they return an empty |
487 |
|
string. This can be distinguished from a genuine zero-length substring by |
488 |
|
inspecting the appropriate offset in \fIovector\fR, which is negative for unset |
489 |
|
substrings. |
490 |
|
|
|
If the \fIfirstcharptr\fR argument is not NULL, it is used to pass back |
|
|
information about the first character of any matched string. If there is a |
|
|
fixed first character, e.g. from a pattern such as (cat|cow|coyote), then it is |
|
|
returned in the integer pointed to by \fIfirstcharptr\fR. Otherwise, if the |
|
|
pattern was compiled with the PCRE_MULTILINE option, and every branch started |
|
|
with "^", then -1 is returned, indicating that the pattern will match at the |
|
|
start of a subject string or after any "\\n" within the string. Otherwise -2 is |
|
|
returned. |
|
491 |
|
|
492 |
|
|
493 |
.SH LIMITATIONS |
.SH LIMITATIONS |
755 |
The \\A, \\Z, and \\z assertions differ from the traditional circumflex and |
The \\A, \\Z, and \\z assertions differ from the traditional circumflex and |
756 |
dollar (described below) in that they only ever match at the very start and end |
dollar (described below) in that they only ever match at the very start and end |
757 |
of the subject string, whatever options are set. They are not affected by the |
of the subject string, whatever options are set. They are not affected by the |
758 |
PCRE_NOTBOL or PCRE_NOTEOL options. The difference between \\Z and \\z is that |
PCRE_NOTBOL or PCRE_NOTEOL options. If the \fIstartoffset\fR argument of |
759 |
\\Z matches before a newline that is the last character of the string as well |
\fBpcre_exec()\fR is non-zero, \\A can never match. The difference between \\Z |
760 |
as at the end of the string, whereas \\z matches only at the end. |
and \\z is that \\Z matches before a newline that is the last character of the |
761 |
|
string as well as at the end of the string, whereas \\z matches only at the |
762 |
|
end. |
763 |
|
|
764 |
|
|
765 |
.SH CIRCUMFLEX AND DOLLAR |
.SH CIRCUMFLEX AND DOLLAR |
766 |
Outside a character class, in the default matching mode, the circumflex |
Outside a character class, in the default matching mode, the circumflex |
767 |
character is an assertion which is true only if the current matching point is |
character is an assertion which is true only if the current matching point is |
768 |
at the start of the subject string. Inside a character class, circumflex has an |
at the start of the subject string. If the \fIstartoffset\fR argument of |
769 |
entirely different meaning (see below). |
\fBpcre_exec()\fR is non-zero, circumflex can never match. Inside a character |
770 |
|
class, circumflex has an entirely different meaning (see below). |
771 |
|
|
772 |
Circumflex need not be the first character of the pattern if a number of |
Circumflex need not be the first character of the pattern if a number of |
773 |
alternatives are involved, but it should be the first thing in each alternative |
alternatives are involved, but it should be the first thing in each alternative |
794 |
addition to matching at the start and end of the subject string. For example, |
addition to matching at the start and end of the subject string. For example, |
795 |
the pattern /^abc$/ matches the subject string "def\\nabc" in multiline mode, |
the pattern /^abc$/ matches the subject string "def\\nabc" in multiline mode, |
796 |
but not otherwise. Consequently, patterns that are anchored in single line mode |
but not otherwise. Consequently, patterns that are anchored in single line mode |
797 |
because all branches start with "^" are not anchored in multiline mode. The |
because all branches start with "^" are not anchored in multiline mode, and a |
798 |
PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is set. |
match for circumflex is possible when the \fIstartoffset\fR argument of |
799 |
|
\fBpcre_exec()\fR is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if |
800 |
|
PCRE_MULTILINE is set. |
801 |
|
|
802 |
Note that the sequences \\A, \\Z, and \\z can be used to match the start and |
Note that the sequences \\A, \\Z, and \\z can be used to match the start and |
803 |
end of the subject in both modes, and if all branches of a pattern start with |
end of the subject in both modes, and if all branches of a pattern start with |
847 |
character class. For example, [d-m] matches any letter between d and m, |
character class. For example, [d-m] matches any letter between d and m, |
848 |
inclusive. If a minus character is required in a class, it must be escaped with |
inclusive. If a minus character is required in a class, it must be escaped with |
849 |
a backslash or appear in a position where it cannot be interpreted as |
a backslash or appear in a position where it cannot be interpreted as |
850 |
indicating a range, typically as the first or last character in the class. It |
indicating a range, typically as the first or last character in the class. |
851 |
is not possible to have the character "]" as the end character of a range, |
|
852 |
since a sequence such as [w-] is interpreted as a class of two characters. The |
It is not possible to have the literal character "]" as the end character of a |
853 |
octal or hexadecimal representation of "]" can, however, be used to end a |
range. A pattern such as [W-]46] is interpreted as a class of two characters |
854 |
range. |
("W" and "-") followed by a literal string "46]", so it would match "W46]" or |
855 |
|
"-46]". However, if the "]" is escaped with a backslash it is interpreted as |
856 |
|
the end of range, so [W-\\]46] is interpreted as a single class containing a |
857 |
|
range followed by two separate characters. The octal or hexadecimal |
858 |
|
representation of "]" can also be used to end a range. |
859 |
|
|
860 |
Ranges operate in ASCII collating sequence. They can also be used for |
Ranges operate in ASCII collating sequence. They can also be used for |
861 |
characters specified numerically, for example [\\000-\\037]. If a range that |
characters specified numerically, for example [\\000-\\037]. If a range that |
1091 |
is greater than 1 or with a limited maximum, more store is required for the |
is greater than 1 or with a limited maximum, more store is required for the |
1092 |
compiled pattern, in proportion to the size of the minimum or maximum. |
compiled pattern, in proportion to the size of the minimum or maximum. |
1093 |
|
|
1094 |
If a pattern starts with .* then it is implicitly anchored, since whatever |
If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent |
1095 |
follows will be tried against every character position in the subject string. |
to Perl's /s) is set, thus allowing the . to match newlines, then the pattern |
1096 |
PCRE treats this as though it were preceded by \\A. |
is implicitly anchored, because whatever follows will be tried against every |
1097 |
|
character position in the subject string, so there is no point in retrying the |
1098 |
|
overall match at any position after the first. PCRE treats such a pattern as |
1099 |
|
though it were preceded by \\A. In cases where it is known that the subject |
1100 |
|
string contains no newlines, it is worth setting PCRE_DOTALL when the pattern |
1101 |
|
begins with .* in order to obtain this optimization, or alternatively using ^ |
1102 |
|
to indicate anchoring explicitly. |
1103 |
|
|
1104 |
When a capturing subpattern is repeated, the value captured is the substring |
When a capturing subpattern is repeated, the value captured is the substring |
1105 |
that matched the final iteration. For example, after |
that matched the final iteration. For example, after |
1249 |
preceded by "foo". |
preceded by "foo". |
1250 |
|
|
1251 |
Assertion subpatterns are not capturing subpatterns, and may not be repeated, |
Assertion subpatterns are not capturing subpatterns, and may not be repeated, |
1252 |
because it makes no sense to assert the same thing several times. If an |
because it makes no sense to assert the same thing several times. If any kind |
1253 |
assertion contains capturing subpatterns within it, these are always counted |
of assertion contains capturing subpatterns within it, these are counted for |
1254 |
for the purposes of numbering the capturing subpatterns in the whole pattern. |
the purposes of numbering the capturing subpatterns in the whole pattern. |
1255 |
Substring capturing is carried out for positive assertions, but it does not |
However, substring capturing is carried out only for positive assertions, |
1256 |
make sense for negative assertions. |
because it does not make sense for negative assertions. |
1257 |
|
|
1258 |
Assertions count towards the maximum of 200 parenthesized subpatterns. |
Assertions count towards the maximum of 200 parenthesized subpatterns. |
1259 |
|
|
1290 |
the current point in the subject string. |
the current point in the subject string. |
1291 |
|
|
1292 |
Once-only subpatterns are not capturing subpatterns. Simple cases such as the |
Once-only subpatterns are not capturing subpatterns. Simple cases such as the |
1293 |
above example can be thought of as a maximizing repeat that must swallow |
above example can be thought of as a maximizing repeat that must swallow |
1294 |
everything it can. So, while both \\d+ and \\d+? are prepared to adjust the |
everything it can. So, while both \\d+ and \\d+? are prepared to adjust the |
1295 |
number of digits they match in order to make the rest of the pattern match, |
number of digits they match in order to make the rest of the pattern match, |
1296 |
(?>\\d+) can only match an entire sequence of digits. |
(?>\\d+) can only match an entire sequence of digits. |
1309 |
then see if what follows matches the rest of the pattern. If the pattern is |
then see if what follows matches the rest of the pattern. If the pattern is |
1310 |
specified as |
specified as |
1311 |
|
|
1312 |
.*abcd$ |
^.*abcd$ |
1313 |
|
|
1314 |
then the initial .* matches the entire string at first, but when this fails, it |
then the initial .* matches the entire string at first, but when this fails, it |
1315 |
backtracks to match all but the last character, then all but the last two |
backtracks to match all but the last character, then all but the last two |
1317 |
from right to left, so we are no better off. However, if the pattern is written |
from right to left, so we are no better off. However, if the pattern is written |
1318 |
as |
as |
1319 |
|
|
1320 |
(?>.*)(?<=abcd) |
^(?>.*)(?<=abcd) |
1321 |
|
|
1322 |
then there can be no backtracking for the .* item; it can match only the entire |
then there can be no backtracking for the .* item; it can match only the entire |
1323 |
string. The subsequent lookbehind assertion does a single test on the last four |
string. The subsequent lookbehind assertion does a single test on the last four |
1391 |
contains a lot of discussion about optimizing regular expressions for efficient |
contains a lot of discussion about optimizing regular expressions for efficient |
1392 |
performance. |
performance. |
1393 |
|
|
1394 |
|
When a pattern begins with .* and the PCRE_DOTALL option is set, the pattern is |
1395 |
|
implicitly anchored by PCRE, since it can match only at the start of a subject |
1396 |
|
string. However, if PCRE_DOTALL is not set, PCRE cannot make this optimization, |
1397 |
|
because the . metacharacter does not then match a newline, and if the subject |
1398 |
|
string contains newlines, the pattern may match from the character immediately |
1399 |
|
following one of them instead of from the very start. For example, the pattern |
1400 |
|
|
1401 |
|
(.*) second |
1402 |
|
|
1403 |
|
matches the subject "first\\nand second" (where \\n stands for a newline |
1404 |
|
character) with the first captured substring being "and". In order to do this, |
1405 |
|
PCRE has to retry the match starting after every newline in the subject. |
1406 |
|
|
1407 |
|
If you are using such a pattern with subject strings that do not contain |
1408 |
|
newlines, the best performance is obtained by setting PCRE_DOTALL, or starting |
1409 |
|
the pattern with ^.* to indicate explicit anchoring. That saves PCRE from |
1410 |
|
having to scan along the subject looking for a newline to restart at. |
1411 |
|
|
1412 |
.SH AUTHOR |
.SH AUTHOR |
1413 |
Philip Hazel <ph10@cam.ac.uk> |
Philip Hazel <ph10@cam.ac.uk> |
1420 |
.br |
.br |
1421 |
Phone: +44 1223 334714 |
Phone: +44 1223 334714 |
1422 |
|
|
1423 |
|
Last updated: 10 June 1999 |
1424 |
|
.br |
1425 |
Copyright (c) 1997-1999 University of Cambridge. |
Copyright (c) 1997-1999 University of Cambridge. |