28 |
int pcre_get_substring_list(const char *subject, |
int pcre_get_substring_list(const char *subject, |
29 |
int *ovector, int stringcount, const char ***listptr); |
int *ovector, int stringcount, const char ***listptr); |
30 |
|
|
31 |
|
void pcre_free_substring(const char *stringptr); |
32 |
|
|
33 |
|
void pcre_free_substring_list(const char **stringptr); |
34 |
|
|
35 |
const unsigned char *pcre_maketables(void); |
const unsigned char *pcre_maketables(void); |
36 |
|
|
37 |
|
int pcre_fullinfo(const pcre *code, const pcre_extra *extra, |
38 |
|
int what, void *where); |
39 |
|
|
40 |
int pcre_info(const pcre *code, int *optptr, int *firstcharptr); |
int pcre_info(const pcre *code, int *optptr, int *firstcharptr); |
41 |
|
|
42 |
char *pcre_version(void); |
char *pcre_version(void); |
52 |
The PCRE library is a set of functions that implement regu- |
The PCRE library is a set of functions that implement regu- |
53 |
lar expression pattern matching using the same syntax and |
lar expression pattern matching using the same syntax and |
54 |
semantics as Perl 5, with just a few differences (see |
semantics as Perl 5, with just a few differences (see |
55 |
|
|
56 |
below). The current implementation corresponds to Perl |
below). The current implementation corresponds to Perl |
57 |
5.005. |
5.005, with some additional features from later versions. |
58 |
|
This includes some experimental, incomplete support for |
59 |
|
UTF-8 encoded strings. Details of exactly what is and what |
60 |
|
is not supported are given below. |
61 |
|
|
62 |
PCRE has its own native API, which is described in this |
PCRE has its own native API, which is described in this |
63 |
document. There is also a set of wrapper functions that |
document. There is also a set of wrapper functions that |
64 |
correspond to the POSIX API. These are described in the |
correspond to the POSIX regular expression API. These are |
65 |
pcreposix documentation. |
described in the pcreposix documentation. |
66 |
|
|
67 |
The native API function prototypes are defined in the header |
The native API function prototypes are defined in the header |
68 |
file pcre.h, and on Unix systems the library itself is |
file pcre.h, and on Unix systems the library itself is |
69 |
called libpcre.a, so can be accessed by adding -lpcre to the |
called libpcre.a, so can be accessed by adding -lpcre to the |
70 |
command for linking an application which calls it. |
command for linking an application which calls it. The |
71 |
|
header file defines the macros PCRE_MAJOR and PCRE_MINOR to |
72 |
|
contain the major and minor release numbers for the library. |
73 |
|
Applications can use these to include support for different |
74 |
|
releases. |
75 |
|
|
76 |
The functions pcre_compile(), pcre_study(), and pcre_exec() |
The functions pcre_compile(), pcre_study(), and pcre_exec() |
77 |
are used for compiling and matching regular expressions, |
are used for compiling and matching regular expressions. A |
78 |
while pcre_copy_substring(), pcre_get_substring(), and |
sample program that demonstrates the simplest way of using |
79 |
pcre_get_substring_list() are convenience functions for |
them is given in the file pcredemo.c. The last section of |
80 |
|
this man page describes how to run it. |
81 |
|
|
82 |
|
The functions pcre_copy_substring(), pcre_get_substring(), |
83 |
|
and pcre_get_substring_list() are convenience functions for |
84 |
extracting captured substrings from a matched subject |
extracting captured substrings from a matched subject |
85 |
string. The function pcre_maketables() is used (optionally) |
string; pcre_free_substring() and pcre_free_substring_list() |
86 |
to build a set of character tables in the current locale for |
are also provided, to free the memory used for extracted |
87 |
passing to pcre_compile(). |
strings. |
88 |
|
|
89 |
The function pcre_info() is used to find out information |
The function pcre_maketables() is used (optionally) to build |
90 |
about a compiled pattern, while the function pcre_version() |
a set of character tables in the current locale for passing |
91 |
returns a pointer to a string containing the version of PCRE |
to pcre_compile(). |
92 |
and its date of release. |
|
93 |
|
The function pcre_fullinfo() is used to find out information |
94 |
|
about a compiled pattern; pcre_info() is an obsolete version |
95 |
|
which returns only some of the available information, but is |
96 |
|
retained for backwards compatibility. The function |
97 |
|
pcre_version() returns a pointer to a string containing the |
98 |
|
version of PCRE and its date of release. |
99 |
|
|
100 |
The global variables pcre_malloc and pcre_free initially |
The global variables pcre_malloc and pcre_free initially |
101 |
contain the entry points of the standard malloc() and free() |
contain the entry points of the standard malloc() and free() |
124 |
by a binary zero, and is passed in the argument pattern. A |
by a binary zero, and is passed in the argument pattern. A |
125 |
pointer to a single block of memory that is obtained via |
pointer to a single block of memory that is obtained via |
126 |
pcre_malloc is returned. This contains the compiled code and |
pcre_malloc is returned. This contains the compiled code and |
127 |
related data. The pcre type is defined for this for conveni- |
related data. The pcre type is defined for the returned |
128 |
ence, but in fact pcre is just a typedef for void, since the |
block; this is a typedef for a structure whose contents are |
129 |
contents of the block are not externally defined. It is up |
not externally defined. It is up to the caller to free the |
130 |
to the caller to free the memory when it is no longer |
memory when it is no longer required. |
131 |
required. |
|
132 |
|
Although the compiled code of a PCRE regex is relocatable, |
133 |
|
that is, it does not depend on memory location, the complete |
134 |
|
pcre data block is not fully relocatable, because it con- |
135 |
|
tains a copy of the tableptr argument, which is an address |
136 |
|
(see below). |
137 |
|
|
138 |
The size of a compiled pattern is roughly proportional to |
The size of a compiled pattern is roughly proportional to |
139 |
the length of the pattern string, except that each character |
the length of the pattern string, except that each character |
168 |
must be the result of a call to pcre_maketables(). See the |
must be the result of a call to pcre_maketables(). See the |
169 |
section on locale support below. |
section on locale support below. |
170 |
|
|
171 |
|
This code fragment shows a typical straightforward call to |
172 |
|
pcre_compile(): |
173 |
|
|
174 |
|
pcre *re; |
175 |
|
const char *error; |
176 |
|
int erroffset; |
177 |
|
re = pcre_compile( |
178 |
|
"^A.*Z", /* the pattern */ |
179 |
|
0, /* default options */ |
180 |
|
&error, /* for error message */ |
181 |
|
&erroffset, /* for error offset */ |
182 |
|
NULL); /* use default character tables */ |
183 |
|
|
184 |
The following option bits are defined in the header file: |
The following option bits are defined in the header file: |
185 |
|
|
186 |
PCRE_ANCHORED |
PCRE_ANCHORED |
231 |
|
|
232 |
PCRE_EXTRA |
PCRE_EXTRA |
233 |
|
|
234 |
This option turns on additional functionality of PCRE that |
This option was invented in order to turn on additional |
235 |
is incompatible with Perl. Any backslash in a pattern that |
functionality of PCRE that is incompatible with Perl, but it |
236 |
is followed by a letter that has no special meaning causes |
is currently of very little use. When set, any backslash in |
237 |
an error, thus reserving these combinations for future |
a pattern that is followed by a letter that has no special |
238 |
expansion. By default, as in Perl, a backslash followed by a |
meaning causes an error, thus reserving these combinations |
239 |
letter with no special meaning is treated as a literal. |
for future expansion. By default, as in Perl, a backslash |
240 |
There are at present no other features controlled by this |
followed by a letter with no special meaning is treated as a |
241 |
option. |
literal. There are at present no other features controlled |
242 |
|
by this option. It can also be set by a (?X) option setting |
243 |
|
within a pattern. |
244 |
|
|
245 |
PCRE_MULTILINE |
PCRE_MULTILINE |
246 |
|
|
253 |
PCRE_DOLLAR_ENDONLY is set). This is the same as Perl. |
PCRE_DOLLAR_ENDONLY is set). This is the same as Perl. |
254 |
|
|
255 |
When PCRE_MULTILINE is set, the "start of line" and "end |
When PCRE_MULTILINE is set, the "start of line" and "end |
256 |
of line" constructs match immediately following or |
of line" constructs match immediately following or immedi- |
257 |
immediately before any newline in the subject string, |
ately before any newline in the subject string, respec- |
258 |
respectively, as well as at the very start and end. This is |
tively, as well as at the very start and end. This is |
259 |
equivalent to Perl's /m option. If there are no "\n" charac- |
equivalent to Perl's /m option. If there are no "\n" charac- |
260 |
ters in a subject string, or no occurrences of ^ or $ in a |
ters in a subject string, or no occurrences of ^ or $ in a |
261 |
pattern, setting PCRE_MULTILINE has no effect. |
pattern, setting PCRE_MULTILINE has no effect. |
267 |
followed by "?". It is not compatible with Perl. It can also |
followed by "?". It is not compatible with Perl. It can also |
268 |
be set by a (?U) option setting within the pattern. |
be set by a (?U) option setting within the pattern. |
269 |
|
|
270 |
|
PCRE_UTF8 |
271 |
|
|
272 |
|
This option causes PCRE to regard both the pattern and the |
273 |
|
subject as strings of UTF-8 characters instead of just byte |
274 |
|
strings. However, it is available only if PCRE has been |
275 |
|
built to include UTF-8 support. If not, the use of this |
276 |
|
option provokes an error. Support for UTF-8 is new, experi- |
277 |
|
mental, and incomplete. Details of exactly what it entails |
278 |
|
are given below. |
279 |
|
|
280 |
|
|
281 |
|
|
282 |
STUDYING A PATTERN |
STUDYING A PATTERN |
284 |
worth spending more time analyzing it in order to speed up |
worth spending more time analyzing it in order to speed up |
285 |
the time taken for matching. The function pcre_study() takes |
the time taken for matching. The function pcre_study() takes |
286 |
a pointer to a compiled pattern as its first argument, and |
a pointer to a compiled pattern as its first argument, and |
287 |
returns a pointer to a pcre_extra block (another void |
returns a pointer to a pcre_extra block (another typedef for |
288 |
typedef) containing additional information about the pat- |
a structure with hidden contents) containing additional |
289 |
tern; this can be passed to pcre_exec(). If no additional |
information about the pattern; this can be passed to |
290 |
information is available, NULL is returned. |
pcre_exec(). If no additional information is available, NULL |
291 |
|
is returned. |
292 |
|
|
293 |
The second argument contains option bits. At present, no |
The second argument contains option bits. At present, no |
294 |
options are defined for pcre_study(), and this argument |
options are defined for pcre_study(), and this argument |
299 |
the variable it points to is set to NULL. Otherwise it |
the variable it points to is set to NULL. Otherwise it |
300 |
points to a textual error message. |
points to a textual error message. |
301 |
|
|
302 |
|
This is a typical call to pcre_study(): |
303 |
|
|
304 |
|
pcre_extra *pe; |
305 |
|
pe = pcre_study( |
306 |
|
re, /* result of pcre_compile() */ |
307 |
|
0, /* no options exist */ |
308 |
|
&error); /* set to NULL or points to a message */ |
309 |
|
|
310 |
At present, studying a pattern is useful only for non- |
At present, studying a pattern is useful only for non- |
311 |
anchored patterns that do not have a single fixed starting |
anchored patterns that do not have a single fixed starting |
312 |
character. A bitmap of possible starting characters is |
character. A bitmap of possible starting characters is |
349 |
|
|
350 |
|
|
351 |
INFORMATION ABOUT A PATTERN |
INFORMATION ABOUT A PATTERN |
352 |
The pcre_info() function returns information about a com- |
The pcre_fullinfo() function returns information about a |
353 |
piled pattern. Its yield is the number of capturing subpat- |
compiled pattern. It replaces the obsolete pcre_info() func- |
354 |
terns, or one of the following negative numbers: |
tion, which is nevertheless retained for backwards compatibil- |
355 |
|
ity (and is documented below). |
356 |
|
|
357 |
|
The first argument for pcre_fullinfo() is a pointer to the |
358 |
|
compiled pattern. The second argument is the result of |
359 |
|
pcre_study(), or NULL if the pattern was not studied. The |
360 |
|
third argument specifies which piece of information is |
361 |
|
required, while the fourth argument is a pointer to a vari- |
362 |
|
able to receive the data. The yield of the function is zero |
363 |
|
for success, or one of the following negative numbers: |
364 |
|
|
365 |
PCRE_ERROR_NULL the argument code was NULL |
PCRE_ERROR_NULL the argument code was NULL |
366 |
|
the argument where was NULL |
367 |
PCRE_ERROR_BADMAGIC the "magic number" was not found |
PCRE_ERROR_BADMAGIC the "magic number" was not found |
368 |
|
PCRE_ERROR_BADOPTION the value of what was invalid |
369 |
|
|
370 |
If the optptr argument is not NULL, a copy of the options |
Here is a typical call of pcre_fullinfo(), to obtain the |
371 |
with which the pattern was compiled is placed in the integer |
length of the compiled pattern: |
372 |
it points to. These option bits are those specified in the |
|
373 |
|
int rc; |
374 |
|
size_t length; |
375 |
|
rc = pcre_fullinfo( |
376 |
|
re, /* result of pcre_compile() */ |
377 |
|
pe, /* result of pcre_study(), or NULL */ |
378 |
|
PCRE_INFO_SIZE, /* what is required */ |
379 |
|
&length); /* where to put the data */ |
380 |
|
|
381 |
|
The possible values for the third argument are defined in |
382 |
|
pcre.h, and are as follows: |
383 |
|
|
384 |
|
PCRE_INFO_OPTIONS |
385 |
|
|
386 |
|
Return a copy of the options with which the pattern was com- |
387 |
|
piled. The fourth argument should point to an unsigned long |
388 |
|
int variable. These option bits are those specified in the |
389 |
call to pcre_compile(), modified by any top-level option |
call to pcre_compile(), modified by any top-level option |
390 |
settings within the pattern itself, and with the |
settings within the pattern itself, and with the |
391 |
PCRE_ANCHORED bit set if the form of the pattern implies |
PCRE_ANCHORED bit forcibly set if the form of the pattern |
392 |
that it can match only at the start of a subject string. |
implies that it can match only at the start of a subject |
393 |
|
string. |
394 |
|
|
395 |
If the pattern is not anchored and the firstcharptr argument |
PCRE_INFO_SIZE |
396 |
is not NULL, it is used to pass back information about the |
|
397 |
first character of any matched string. If there is a fixed |
Return the size of the compiled pattern, that is, the value |
398 |
first character, e.g. from a pattern such as |
that was passed as the argument to pcre_malloc() when PCRE |
399 |
(cat|cow|coyote), then it is returned in the integer pointed |
was getting memory in which to place the compiled data. The |
400 |
to by firstcharptr. Otherwise, if either |
fourth argument should point to a size_t variable. |
401 |
|
|
402 |
|
PCRE_INFO_CAPTURECOUNT |
403 |
|
|
404 |
|
Return the number of capturing subpatterns in the pattern. |
405 |
|
The fourth argument should point to an int variable. |
406 |
|
|
407 |
|
PCRE_INFO_BACKREFMAX |
408 |
|
|
409 |
|
Return the number of the highest back reference in the pat- |
410 |
|
tern. The fourth argument should point to an int variable. |
411 |
|
Zero is returned if there are no back references. |
412 |
|
|
413 |
|
PCRE_INFO_FIRSTCHAR |
414 |
|
|
415 |
|
Return information about the first character of any matched |
416 |
|
string, for a non-anchored pattern. If there is a fixed |
417 |
|
first character, e.g. from a pattern such as |
418 |
|
(cat|cow|coyote), it is returned in the integer pointed to |
419 |
|
by where. Otherwise, if either |
420 |
|
|
421 |
(a) the pattern was compiled with the PCRE_MULTILINE option, |
(a) the pattern was compiled with the PCRE_MULTILINE option, |
422 |
and every branch starts with "^", or |
and every branch starts with "^", or |
424 |
(b) every branch of the pattern starts with ".*" and |
(b) every branch of the pattern starts with ".*" and |
425 |
PCRE_DOTALL is not set (if it were set, the pattern would be |
PCRE_DOTALL is not set (if it were set, the pattern would be |
426 |
anchored), |
anchored), |
427 |
then -1 is returned, indicating that the pattern matches |
|
428 |
only at the start of a subject string or after any "\n" |
-1 is returned, indicating that the pattern matches only at |
429 |
within the string. Otherwise -2 is returned. |
the start of a subject string or after any "\n" within the |
430 |
|
string. Otherwise -2 is returned. For anchored patterns, -2 |
431 |
|
is returned. |
432 |
|
|
433 |
|
PCRE_INFO_FIRSTTABLE |
434 |
|
|
435 |
|
If the pattern was studied, and this resulted in the con- |
436 |
|
struction of a 256-bit table indicating a fixed set of char- |
437 |
|
acters for the first character in any matching string, a |
438 |
|
pointer to the table is returned. Otherwise NULL is |
439 |
|
returned. The fourth argument should point to an unsigned |
440 |
|
char * variable. |
441 |
|
|
442 |
|
PCRE_INFO_LASTLITERAL |
443 |
|
|
444 |
|
For a non-anchored pattern, return the value of the right- |
445 |
|
most literal character which must exist in any matched |
446 |
|
string, other than at its start. The fourth argument should |
447 |
|
point to an int variable. If there is no such character, or |
448 |
|
if the pattern is anchored, -1 is returned. For example, for |
449 |
|
the pattern /a\d+z\d+/ the returned value is 'z'. |
450 |
|
|
451 |
|
The pcre_info() function is now obsolete because its inter- |
452 |
|
face is too restrictive to return all the available data |
453 |
|
about a compiled pattern. New programs should use |
454 |
|
pcre_fullinfo() instead. The yield of pcre_info() is the |
455 |
|
number of capturing subpatterns, or one of the following |
456 |
|
negative numbers: |
457 |
|
|
458 |
|
PCRE_ERROR_NULL the argument code was NULL |
459 |
|
PCRE_ERROR_BADMAGIC the "magic number" was not found |
460 |
|
|
461 |
|
If the optptr argument is not NULL, a copy of the options |
462 |
|
with which the pattern was compiled is placed in the integer |
463 |
|
it points to (see PCRE_INFO_OPTIONS above). |
464 |
|
|
465 |
|
If the pattern is not anchored and the firstcharptr argument |
466 |
|
is not NULL, it is used to pass back information about the |
467 |
|
first character of any matched string (see |
468 |
|
PCRE_INFO_FIRSTCHAR above). |
469 |
|
|
470 |
|
|
471 |
|
|
472 |
MATCHING A PATTERN |
MATCHING A PATTERN |
473 |
The function pcre_exec() is called to match a subject string |
The function pcre_exec() is called to match a subject string |
474 |
|
|
475 |
|
|
476 |
|
|
477 |
|
|
478 |
|
|
479 |
|
SunOS 5.8 Last change: 9 |
480 |
|
|
481 |
|
|
482 |
|
|
483 |
against a pre-compiled pattern, which is passed in the code |
against a pre-compiled pattern, which is passed in the code |
484 |
argument. If the pattern has been studied, the result of the |
argument. If the pattern has been studied, the result of the |
485 |
study should be passed in the extra argument. Otherwise this |
study should be passed in the extra argument. Otherwise this |
486 |
must be NULL. |
must be NULL. |
487 |
|
|
488 |
|
Here is an example of a simple call to pcre_exec(): |
489 |
|
|
490 |
|
int rc; |
491 |
|
int ovector[30]; |
492 |
|
rc = pcre_exec( |
493 |
|
re, /* result of pcre_compile() */ |
494 |
|
NULL, /* we didn't study the pattern */ |
495 |
|
"some string", /* the subject string */ |
496 |
|
11, /* the length of the subject string */ |
497 |
|
0, /* start at offset 0 in the subject */ |
498 |
|
0, /* default options */ |
499 |
|
ovector, /* vector for substring information */ |
500 |
|
30); /* number of elements in the vector */ |
501 |
|
|
502 |
The PCRE_ANCHORED option can be passed in the options argu- |
The PCRE_ANCHORED option can be passed in the options argu- |
503 |
ment, whose unused bits must be zero. However, if a pattern |
ment, whose unused bits must be zero. However, if a pattern |
504 |
was compiled with PCRE_ANCHORED, or turned out to be |
was compiled with PCRE_ANCHORED, or turned out to be |
549 |
|
|
550 |
The subject string is passed as a pointer in subject, a |
The subject string is passed as a pointer in subject, a |
551 |
length in length, and a starting offset in startoffset. |
length in length, and a starting offset in startoffset. |
552 |
Unlike the pattern string, it may contain binary zero char- |
Unlike the pattern string, the subject may contain binary |
553 |
acters. When the starting offset is zero, the search for a |
zero characters. When the starting offset is zero, the |
554 |
match starts at the beginning of the subject, and this is by |
search for a match starts at the beginning of the subject, |
555 |
far the most common case. |
and this is by far the most common case. |
556 |
|
|
557 |
A non-zero starting offset is useful when searching for |
A non-zero starting offset is useful when searching for |
558 |
another match in the same subject by calling pcre_exec() |
another match in the same subject by calling pcre_exec() |
688 |
|
|
689 |
|
|
690 |
|
|
691 |
|
|
692 |
EXTRACTING CAPTURED SUBSTRINGS |
EXTRACTING CAPTURED SUBSTRINGS |
693 |
Captured substrings can be accessed directly by using the |
Captured substrings can be accessed directly by using the |
694 |
offsets returned by pcre_exec() in ovector. For convenience, |
offsets returned by pcre_exec() in ovector. For convenience, |
708 |
entire regular expression. This is the value returned by |
entire regular expression. This is the value returned by |
709 |
pcre_exec() if it is greater than zero. If pcre_exec() |
pcre_exec() if it is greater than zero. If pcre_exec() |
710 |
returned zero, indicating that it ran out of space in ovec- |
returned zero, indicating that it ran out of space in ovec- |
711 |
tor, then the value passed as stringcount should be the size |
tor, the value passed as stringcount should be the size of |
712 |
of the vector divided by three. |
the vector divided by three. |
713 |
|
|
714 |
The functions pcre_copy_substring() and pcre_get_substring() |
The functions pcre_copy_substring() and pcre_get_substring() |
715 |
extract a single substring, whose number is given as string- |
extract a single substring, whose number is given as string- |
717 |
the entire pattern, while higher values extract the captured |
the entire pattern, while higher values extract the captured |
718 |
substrings. For pcre_copy_substring(), the string is placed |
substrings. For pcre_copy_substring(), the string is placed |
719 |
in buffer, whose length is given by buffersize, while for |
in buffer, whose length is given by buffersize, while for |
720 |
pcre_get_substring() a new block of store is obtained via |
pcre_get_substring() a new block of memory is obtained via |
721 |
pcre_malloc, and its address is returned via stringptr. The |
pcre_malloc, and its address is returned via stringptr. The |
722 |
yield of the function is the length of the string, not |
yield of the function is the length of the string, not |
723 |
including the terminating zero, or one of |
including the terminating zero, or one of |
751 |
inspecting the appropriate offset in ovector, which is nega- |
inspecting the appropriate offset in ovector, which is nega- |
752 |
tive for unset substrings. |
tive for unset substrings. |
753 |
|
|
754 |
|
The two convenience functions pcre_free_substring() and |
755 |
|
pcre_free_substring_list() can be used to free the memory |
756 |
|
returned by a previous call of pcre_get_substring() or |
757 |
|
pcre_get_substring_list(), respectively. They do nothing |
758 |
|
more than call the function pointed to by pcre_free, which |
759 |
|
of course could be called directly from a C program. How- |
760 |
|
ever, PCRE is used in some situations where it is linked via |
761 |
|
a special interface to another programming language which |
762 |
|
cannot use pcre_free directly; it is for these cases that |
763 |
|
the functions are provided. |
764 |
|
|
765 |
|
|
766 |
|
|
768 |
There are some size limitations in PCRE but it is hoped that |
There are some size limitations in PCRE but it is hoped that |
769 |
they will never in practice be relevant. The maximum length |
they will never in practice be relevant. The maximum length |
770 |
of a compiled pattern is 65539 (sic) bytes. All values in |
of a compiled pattern is 65539 (sic) bytes. All values in |
771 |
repeating quantifiers must be less than 65536. The maximum |
repeating quantifiers must be less than 65536. The max- |
772 |
number of capturing subpatterns is 99. The maximum number |
imum number of capturing subpatterns is 65535. There is no |
773 |
of all parenthesized subpatterns, including capturing sub- |
limit to the number of non-capturing subpatterns, but the |
774 |
patterns, assertions, and other types of subpattern, is 200. |
maximum depth of nesting of all kinds of parenthesized sub- |
775 |
|
pattern, including capturing subpatterns, assertions, and |
776 |
|
other types of subpattern, is 200. |
777 |
|
|
778 |
The maximum length of a subject string is the largest posi- |
The maximum length of a subject string is the largest posi- |
779 |
tive number that an integer variable can hold. However, PCRE |
tive number that an integer variable can hold. However, PCRE |
827 |
6. The Perl \G assertion is not supported as it is not |
6. The Perl \G assertion is not supported as it is not |
828 |
relevant to single pattern matches. |
relevant to single pattern matches. |
829 |
|
|
830 |
7. Fairly obviously, PCRE does not support the (?{code}) |
7. Fairly obviously, PCRE does not support the (?{code}) and |
831 |
construction. |
(?p{code}) constructions. However, there is some experimen- |
832 |
|
tal support for recursive patterns using the non-Perl item |
833 |
|
(?R). |
834 |
|
|
835 |
8. There are at the time of writing some oddities in Perl |
8. There are at the time of writing some oddities in Perl |
836 |
5.005_02 concerned with the settings of captured strings |
5.005_02 concerned with the settings of captured strings |
838 |
"aba" against the pattern /^(a(b)?)+$/ sets $2 to the value |
"aba" against the pattern /^(a(b)?)+$/ sets $2 to the value |
839 |
"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 |
"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 |
840 |
unset. However, if the pattern is changed to |
unset. However, if the pattern is changed to |
841 |
/^(aa(b(b))?)+$/ then $2 (and $3) get set. |
/^(aa(b(b))?)+$/ then $2 (and $3) are set. |
842 |
|
|
843 |
In Perl 5.004 $2 is set in both cases, and that is also true |
In Perl 5.004 $2 is set in both cases, and that is also true |
844 |
of PCRE. If in the future Perl changes to a consistent state |
of PCRE. If in the future Perl changes to a consistent state |
864 |
(c) If PCRE_EXTRA is set, a backslash followed by a letter |
(c) If PCRE_EXTRA is set, a backslash followed by a letter |
865 |
with no special meaning is faulted. |
with no special meaning is faulted. |
866 |
|
|
867 |
(d) If PCRE_UNGREEDY is set, the greediness of the |
(d) If PCRE_UNGREEDY is set, the greediness of the repeti- |
868 |
repetition quantifiers is inverted, that is, by default they |
tion quantifiers is inverted, that is, by default they are |
869 |
are not greedy, but if followed by a question mark they are. |
not greedy, but if followed by a question mark they are. |
870 |
|
|
871 |
(e) PCRE_ANCHORED can be used to force a pattern to be tried |
(e) PCRE_ANCHORED can be used to force a pattern to be tried |
872 |
only at the start of the subject. |
only at the start of the subject. |
874 |
(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options |
(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options |
875 |
for pcre_exec() have no Perl equivalents. |
for pcre_exec() have no Perl equivalents. |
876 |
|
|
877 |
|
(g) The (?R) construct allows for recursive pattern matching |
878 |
|
(Perl 5.6 can do this using the (?p{code}) construct, which |
879 |
|
PCRE cannot of course support.) |
880 |
|
|
881 |
|
|
882 |
|
|
883 |
REGULAR EXPRESSION DETAILS |
REGULAR EXPRESSION DETAILS |
886 |
also described in the Perl documentation and in a number of |
also described in the Perl documentation and in a number of |
887 |
other books, some of which have copious examples. Jeffrey |
other books, some of which have copious examples. Jeffrey |
888 |
Friedl's "Mastering Regular Expressions", published by |
Friedl's "Mastering Regular Expressions", published by |
889 |
O'Reilly (ISBN 1-56592-257-3), covers them in great detail. |
O'Reilly (ISBN 1-56592-257-3), covers them in great detail. |
890 |
|
|
891 |
The description here is intended as reference documentation. |
The description here is intended as reference documentation. |
892 |
|
The basic operation of PCRE is on strings of bytes. However, |
893 |
|
there are the beginnings of some support for UTF-8 character |
894 |
|
strings. To use this support you must configure PCRE to |
895 |
|
include it, and then call pcre_compile() with the PCRE_UTF8 |
896 |
|
option. How this affects the pattern matching is described |
897 |
|
in the final section of this document. |
898 |
|
|
899 |
A regular expression is a pattern that is matched against a |
A regular expression is a pattern that is matched against a |
900 |
subject string from left to right. Most characters stand for |
subject string from left to right. Most characters stand for |
949 |
The backslash character has several uses. Firstly, if it is |
The backslash character has several uses. Firstly, if it is |
950 |
followed by a non-alphameric character, it takes away any |
followed by a non-alphameric character, it takes away any |
951 |
special meaning that character may have. This use of |
special meaning that character may have. This use of |
952 |
|
|
953 |
backslash as an escape character applies both inside and |
backslash as an escape character applies both inside and |
954 |
outside character classes. |
outside character classes. |
955 |
|
|
981 |
\f formfeed (hex 0C) |
\f formfeed (hex 0C) |
982 |
\n newline (hex 0A) |
\n newline (hex 0A) |
983 |
\r carriage return (hex 0D) |
\r carriage return (hex 0D) |
984 |
|
\t tab (hex 09) |
|
\t tab (hex 09) |
|
985 |
\xhh character with hex code hh |
\xhh character with hex code hh |
986 |
\ddd character with octal code ddd, or backreference |
\ddd character with octal code ddd, or backreference |
987 |
|
|
1033 |
Note that octal values of 100 or greater must not be intro- |
Note that octal values of 100 or greater must not be intro- |
1034 |
duced by a leading zero, because no more than three octal |
duced by a leading zero, because no more than three octal |
1035 |
digits are ever read. |
digits are ever read. |
1036 |
|
|
1037 |
All the sequences that define a single byte value can be |
All the sequences that define a single byte value can be |
1038 |
used both inside and outside character classes. In addition, |
used both inside and outside character classes. In addition, |
1039 |
inside a character class, the sequence "\b" is interpreted |
inside a character class, the sequence "\b" is interpreted |
1086 |
These assertions may not appear in character classes (but |
These assertions may not appear in character classes (but |
1087 |
note that "\b" has a different meaning, namely the backspace |
note that "\b" has a different meaning, namely the backspace |
1088 |
character, inside a character class). |
character, inside a character class). |
1089 |
|
|
1090 |
A word boundary is a position in the subject string where |
A word boundary is a position in the subject string where |
1091 |
the current character and the previous character do not both |
the current character and the previous character do not both |
1092 |
match \w or \W (i.e. one matches \w and the other matches |
match \w or \W (i.e. one matches \w and the other matches |
1153 |
|
|
1154 |
Note that the sequences \A, \Z, and \z can be used to match |
Note that the sequences \A, \Z, and \z can be used to match |
1155 |
the start and end of the subject in both modes, and if all |
the start and end of the subject in both modes, and if all |
1156 |
branches of a pattern start with \A is it always anchored, |
branches of a pattern start with \A it is always anchored, |
1157 |
whether PCRE_MULTILINE is set or not. |
whether PCRE_MULTILINE is set or not. |
1158 |
|
|
1159 |
|
|
1162 |
Outside a character class, a dot in the pattern matches any |
Outside a character class, a dot in the pattern matches any |
1163 |
one character in the subject, including a non-printing char- |
one character in the subject, including a non-printing char- |
1164 |
acter, but not (by default) newline. If the PCRE_DOTALL |
acter, but not (by default) newline. If the PCRE_DOTALL |
1165 |
option is set, then dots match newlines as well. The han- |
option is set, dots match newlines as well. The handling of |
1166 |
dling of dot is entirely independent of the handling of cir- |
dot is entirely independent of the handling of circumflex |
1167 |
cumflex and dollar, the only relationship being that they |
and dollar, the only relationship being that they both |
1168 |
both involve newline characters. Dot has no special meaning |
involve newline characters. Dot has no special meaning in a |
1169 |
in a character class. |
character class. |
1170 |
|
|
1171 |
|
|
1172 |
|
|
1248 |
|
|
1249 |
|
|
1250 |
|
|
1251 |
|
POSIX CHARACTER CLASSES |
1252 |
|
Perl 5.6 (not yet released at the time of writing) is going |
1253 |
|
to support the POSIX notation for character classes, which |
1254 |
|
uses names enclosed by [: and :] within the enclosing |
1255 |
|
square brackets. PCRE supports this notation. For example, |
1256 |
|
|
1257 |
|
[01[:alpha:]%] |
1258 |
|
|
1259 |
|
matches "0", "1", any alphabetic character, or "%". The sup- |
1260 |
|
ported class names are |
1261 |
|
|
1262 |
|
alnum letters and digits |
1263 |
|
alpha letters |
1264 |
|
ascii character codes 0 - 127 |
1265 |
|
cntrl control characters |
1266 |
|
digit decimal digits (same as \d) |
1267 |
|
graph printing characters, excluding space |
1268 |
|
lower lower case letters |
1269 |
|
print printing characters, including space |
1270 |
|
punct printing characters, excluding letters and digits |
1271 |
|
space white space (same as \s) |
1272 |
|
upper upper case letters |
1273 |
|
word "word" characters (same as \w) |
1274 |
|
xdigit hexadecimal digits |
1275 |
|
|
1276 |
|
The names "ascii" and "word" are Perl extensions. Another |
1277 |
|
Perl extension is negation, which is indicated by a ^ char- |
1278 |
|
acter after the colon. For example, |
1279 |
|
|
1280 |
|
[12[:^digit:]] |
1281 |
|
|
1282 |
|
matches "1", "2", or any non-digit. PCRE (and Perl) also |
1283 |
|
recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a |
1284 |
|
"collating element", but these are not supported, and an |
1285 |
|
error is given if they are encountered. |
1286 |
|
|
1287 |
|
|
1288 |
|
|
1289 |
VERTICAL BAR |
VERTICAL BAR |
1290 |
Vertical bar characters are used to separate alternative |
Vertical bar characters are used to separate alternative |
1291 |
patterns. For example, the pattern |
patterns. For example, the pattern |
1399 |
the ((red|white) (king|queen)) |
the ((red|white) (king|queen)) |
1400 |
|
|
1401 |
the captured substrings are "red king", "red", and "king", |
the captured substrings are "red king", "red", and "king", |
1402 |
and are numbered 1, 2, and 3. |
and are numbered 1, 2, and 3, respectively. |
1403 |
|
|
1404 |
The fact that plain parentheses fulfil two functions is not |
The fact that plain parentheses fulfil two functions is not |
1405 |
always helpful. There are often times when a grouping sub- |
always helpful. There are often times when a grouping sub- |
1437 |
Repetition is specified by quantifiers, which can follow any |
Repetition is specified by quantifiers, which can follow any |
1438 |
of the following items: |
of the following items: |
1439 |
|
|
|
|
|
1440 |
a single character, possibly escaped |
a single character, possibly escaped |
1441 |
the . metacharacter |
the . metacharacter |
1442 |
a character class |
a character class |
1470 |
one that does not match the syntax of a quantifier, is taken |
one that does not match the syntax of a quantifier, is taken |
1471 |
as a literal character. For example, {,6} is not a quantif- |
as a literal character. For example, {,6} is not a quantif- |
1472 |
ier, but a literal string of four characters. |
ier, but a literal string of four characters. |
|
|
|
1473 |
The quantifier {0} is permitted, causing the expression to |
The quantifier {0} is permitted, causing the expression to |
1474 |
behave as if the previous item and the quantifier were not |
behave as if the previous item and the quantifier were not |
1475 |
present. |
present. |
1508 |
|
|
1509 |
/* first comment */ not comment /* second comment */ |
/* first comment */ not comment /* second comment */ |
1510 |
|
|
1511 |
fails, because it matches the entire string due to the |
fails, because it matches the entire string owing to the |
1512 |
greediness of the .* item. |
greediness of the .* item. |
1513 |
|
|
1514 |
However, if a quantifier is followed by a question mark, |
However, if a quantifier is followed by a question mark, it |
1515 |
then it ceases to be greedy, and instead matches the minimum |
ceases to be greedy, and instead matches the minimum number |
1516 |
number of times possible, so the pattern |
of times possible, so the pattern |
1517 |
|
|
1518 |
/\*.*?\*/ |
/\*.*?\*/ |
1519 |
|
|
1530 |
that is the only way the rest of the pattern matches. |
that is the only way the rest of the pattern matches. |
1531 |
|
|
1532 |
If the PCRE_UNGREEDY option is set (an option which is not |
If the PCRE_UNGREEDY option is set (an option which is not |
1533 |
available in Perl) then the quantifiers are not greedy by |
available in Perl), the quantifiers are not greedy by |
1534 |
default, but individual ones can be made greedy by following |
default, but individual ones can be made greedy by following |
1535 |
them with a question mark. In other words, it inverts the |
them with a question mark. In other words, it inverts the |
1536 |
default behaviour. |
default behaviour. |
1542 |
|
|
1543 |
If a pattern starts with .* or .{0,} and the PCRE_DOTALL |
If a pattern starts with .* or .{0,} and the PCRE_DOTALL |
1544 |
option (equivalent to Perl's /s) is set, thus allowing the . |
option (equivalent to Perl's /s) is set, thus allowing the . |
1545 |
to match newlines, then the pattern is implicitly anchored, |
to match newlines, the pattern is implicitly anchored, |
1546 |
because whatever follows will be tried against every charac- |
because whatever follows will be tried against every charac- |
1547 |
ter position in the subject string, so there is no point in |
ter position in the subject string, so there is no point in |
1548 |
retrying the overall match at any position after the first. |
retrying the overall match at any position after the first. |
1574 |
BACK REFERENCES |
BACK REFERENCES |
1575 |
Outside a character class, a backslash followed by a digit |
Outside a character class, a backslash followed by a digit |
1576 |
greater than 0 (and possibly further digits) is a back |
greater than 0 (and possibly further digits) is a back |
1577 |
|
|
1578 |
|
|
1579 |
|
|
1580 |
|
|
1581 |
|
SunOS 5.8 Last change: 30 |
1582 |
|
|
1583 |
|
|
1584 |
|
|
1585 |
reference to a capturing subpattern earlier (i.e. to its |
reference to a capturing subpattern earlier (i.e. to its |
1586 |
left) in the pattern, provided there have been that many |
left) in the pattern, provided there have been that many |
1587 |
previous capturing left parentheses. |
previous capturing left parentheses. |
1603 |
|
|
1604 |
matches "sense and sensibility" and "response and responsi- |
matches "sense and sensibility" and "response and responsi- |
1605 |
bility", but not "sense and responsibility". If caseful |
bility", but not "sense and responsibility". If caseful |
1606 |
matching is in force at the time of the back reference, then |
matching is in force at the time of the back reference, the |
1607 |
the case of letters is relevant. For example, |
case of letters is relevant. For example, |
1608 |
|
|
1609 |
((?i)rah)\s+\1 |
((?i)rah)\s+\1 |
1610 |
|
|
1614 |
|
|
1615 |
There may be more than one back reference to the same sub- |
There may be more than one back reference to the same sub- |
1616 |
pattern. If a subpattern has not actually been used in a |
pattern. If a subpattern has not actually been used in a |
1617 |
particular match, then any back references to it always |
particular match, any back references to it always fail. For |
1618 |
fail. For example, the pattern |
example, the pattern |
1619 |
|
|
1620 |
(a|(bc))\2 |
(a|(bc))\2 |
1621 |
|
|
1623 |
Because there may be up to 99 back references, all digits |
Because there may be up to 99 back references, all digits |
1624 |
following the backslash are taken as part of a potential |
following the backslash are taken as part of a potential |
1625 |
back reference number. If the pattern continues with a digit |
back reference number. If the pattern continues with a digit |
1626 |
character, then some delimiter must be used to terminate the |
character, some delimiter must be used to terminate the back |
1627 |
back reference. If the PCRE_EXTENDED option is set, this can |
reference. If the PCRE_EXTENDED option is set, this can be |
1628 |
be whitespace. Otherwise an empty comment can be used. |
whitespace. Otherwise an empty comment can be used. |
1629 |
|
|
1630 |
A back reference that occurs inside the parentheses to which |
A back reference that occurs inside the parentheses to which |
1631 |
it refers fails when the subpattern is first used, so, for |
it refers fails when the subpattern is first used, so, for |
1635 |
|
|
1636 |
(a|b\1)+ |
(a|b\1)+ |
1637 |
|
|
1638 |
matches any number of "a"s and also "aba", "ababaa" etc. At |
matches any number of "a"s and also "aba", "ababbaa" etc. At |
1639 |
each iteration of the subpattern, the back reference matches |
each iteration of the subpattern, the back reference matches |
1640 |
the character string corresponding to the previous itera- |
the character string corresponding to the previous itera- |
1641 |
tion. In order for this to work, the pattern must be such |
tion. In order for this to work, the pattern must be such |
1653 |
cated assertions are coded as subpatterns. There are two |
cated assertions are coded as subpatterns. There are two |
1654 |
kinds: those that look ahead of the current position in the |
kinds: those that look ahead of the current position in the |
1655 |
subject string, and those that look behind it. |
subject string, and those that look behind it. |
1656 |
|
|
1657 |
An assertion subpattern is matched in the normal way, except |
An assertion subpattern is matched in the normal way, except |
1658 |
that it does not cause the current matching position to be |
that it does not cause the current matching position to be |
1659 |
changed. Lookahead assertions start with (?= for positive |
changed. Lookahead assertions start with (?= for positive |
1725 |
matches "foo" preceded by three digits that are not "999". |
matches "foo" preceded by three digits that are not "999". |
1726 |
Notice that each of the assertions is applied independently |
Notice that each of the assertions is applied independently |
1727 |
at the same point in the subject string. First there is a |
at the same point in the subject string. First there is a |
1728 |
check that the previous three characters are all digits, |
check that the previous three characters are all digits, and |
1729 |
then there is a check that the same three characters are not |
then there is a check that the same three characters are not |
1730 |
"999". This pattern does not match "foo" preceded by six |
"999". This pattern does not match "foo" preceded by six |
1731 |
characters, the first of which are digits and the last three |
characters, the first of which are digits and the last three |
1819 |
|
|
1820 |
abcd$ |
abcd$ |
1821 |
|
|
1822 |
when applied to a long string which does not match it. |
when applied to a long string which does not match. Because |
1823 |
Because matching proceeds from left to right, PCRE will look |
matching proceeds from left to right, PCRE will look for |
1824 |
for each "a" in the subject and then see if what follows |
each "a" in the subject and then see if what follows matches |
1825 |
matches the rest of the pattern. If the pattern is specified |
the rest of the pattern. If the pattern is specified as |
|
as |
|
1826 |
|
|
1827 |
^.*abcd$ |
^.*abcd$ |
1828 |
|
|
1829 |
then the initial .* matches the entire string at first, but |
the initial .* matches the entire string at first, but when |
1830 |
when this fails, it backtracks to match all but the last |
this fails (because there is no following "a"), it back- |
1831 |
character, then all but the last two characters, and so on. |
tracks to match all but the last character, then all but the |
1832 |
Once again the search for "a" covers the entire string, from |
last two characters, and so on. Once again the search for |
1833 |
right to left, so we are no better off. However, if the pat- |
"a" covers the entire string, from right to left, so we are |
1834 |
tern is written as |
no better off. However, if the pattern is written as |
1835 |
|
|
1836 |
^(?>.*)(?<=abcd) |
^(?>.*)(?<=abcd) |
1837 |
|
|
1838 |
then there can be no backtracking for the .* item; it can |
there can be no backtracking for the .* item; it can match |
1839 |
match only the entire string. The subsequent lookbehind |
only the entire string. The subsequent lookbehind assertion |
1840 |
assertion does a single test on the last four characters. If |
does a single test on the last four characters. If it fails, |
1841 |
it fails, the match fails immediately. For long strings, |
the match fails immediately. For long strings, this approach |
1842 |
this approach makes a significant difference to the process- |
makes a significant difference to the processing time. |
1843 |
ing time. |
|
1844 |
|
When a pattern contains an unlimited repeat inside a subpat- |
1845 |
|
tern that can itself be repeated an unlimited number of |
1846 |
|
times, the use of a once-only subpattern is the only way to |
1847 |
|
avoid some failing matches taking a very long time indeed. |
1848 |
|
The pattern |
1849 |
|
|
1850 |
|
(\D+|<\d+>)*[!?] |
1851 |
|
|
1852 |
|
matches an unlimited number of substrings that either con- |
1853 |
|
sist of non-digits, or digits enclosed in <>, followed by |
1854 |
|
either ! or ?. When it matches, it runs quickly. However, if |
1855 |
|
it is applied to |
1856 |
|
|
1857 |
|
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa |
1858 |
|
|
1859 |
|
it takes a long time before reporting failure. This is |
1860 |
|
because the string can be divided between the two repeats in |
1861 |
|
a large number of ways, and all have to be tried. (The exam- |
1862 |
|
ple used [!?] rather than a single character at the end, |
1863 |
|
because both PCRE and Perl have an optimization that allows |
1864 |
|
for fast failure when a single character is used. They |
1865 |
|
remember the last single character that is required for a |
1866 |
|
match, and fail early if it is not present in the string.) |
1867 |
|
If the pattern is changed to |
1868 |
|
|
1869 |
|
((?>\D+)|<\d+>)*[!?] |
1870 |
|
|
1871 |
|
sequences of non-digits cannot be broken, and failure hap- |
1872 |
|
pens quickly. |
1873 |
|
|
1874 |
|
|
1875 |
|
|
1889 |
error occurs. |
error occurs. |
1890 |
|
|
1891 |
There are two kinds of condition. If the text between the |
There are two kinds of condition. If the text between the |
1892 |
parentheses consists of a sequence of digits, then the |
parentheses consists of a sequence of digits, the condition |
1893 |
condition is satisfied if the capturing subpattern of that |
is satisfied if the capturing subpattern of that number has |
1894 |
number has previously matched. Consider the following pat- |
previously matched. The number must be greater than zero. |
1895 |
tern, which contains non-significant white space to make it |
Consider the following pattern, which contains non- |
1896 |
more readable (assume the PCRE_EXTENDED option) and to |
significant white space to make it more readable (assume the |
1897 |
divide it into three parts for ease of discussion: |
PCRE_EXTENDED option) and to divide it into three parts for |
1898 |
|
ease of discussion: |
1899 |
|
|
1900 |
( \( )? [^()]+ (?(1) \) ) |
( \( )? [^()]+ (?(1) \) ) |
1901 |
|
|
1944 |
|
|
1945 |
|
|
1946 |
|
|
1947 |
|
RECURSIVE PATTERNS |
1948 |
|
Consider the problem of matching a string in parentheses, |
1949 |
|
allowing for unlimited nested parentheses. Without the use |
1950 |
|
of recursion, the best that can be done is to use a pattern |
1951 |
|
that matches up to some fixed depth of nesting. It is not |
1952 |
|
possible to handle an arbitrary nesting depth. Perl 5.6 has |
1953 |
|
provided an experimental facility that allows regular |
1954 |
|
expressions to recurse (amongst other things). It does this |
1955 |
|
by interpolating Perl code in the expression at run time, |
1956 |
|
and the code can refer to the expression itself. A Perl pat- |
1957 |
|
tern to solve the parentheses problem can be created like |
1958 |
|
this: |
1959 |
|
|
1960 |
|
$re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x; |
1961 |
|
|
1962 |
|
The (?p{...}) item interpolates Perl code at run time, and |
1963 |
|
in this case refers recursively to the pattern in which it |
1964 |
|
appears. Obviously, PCRE cannot support the interpolation of |
1965 |
|
Perl code. Instead, the special item (?R) is provided for |
1966 |
|
the specific case of recursion. This PCRE pattern solves the |
1967 |
|
parentheses problem (assume the PCRE_EXTENDED option is set |
1968 |
|
so that white space is ignored): |
1969 |
|
|
1970 |
|
\( ( (?>[^()]+) | (?R) )* \) |
1971 |
|
|
1972 |
|
First it matches an opening parenthesis. Then it matches any |
1973 |
|
number of substrings which can either be a sequence of non- |
1974 |
|
parentheses, or a recursive match of the pattern itself |
1975 |
|
(i.e. a correctly parenthesized substring). Finally there is |
1976 |
|
a closing parenthesis. |
1977 |
|
|
1978 |
|
This particular example pattern contains nested unlimited |
1979 |
|
repeats, and so the use of a once-only subpattern for match- |
1980 |
|
ing strings of non-parentheses is important when applying |
1981 |
|
the pattern to strings that do not match. For example, when |
1982 |
|
it is applied to |
1983 |
|
|
1984 |
|
(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa() |
1985 |
|
|
1986 |
|
it yields "no match" quickly. However, if a once-only sub- |
1987 |
|
pattern is not used, the match runs for a very long time |
1988 |
|
indeed because there are so many different ways the + and * |
1989 |
|
repeats can carve up the subject, and all have to be tested |
1990 |
|
before failure can be reported. |
1991 |
|
|
1992 |
|
The values set for any capturing subpatterns are those from |
1993 |
|
the outermost level of the recursion at which the subpattern |
1994 |
|
value is set. If the pattern above is matched against |
1995 |
|
|
1996 |
|
(ab(cd)ef) |
1997 |
|
|
1998 |
|
the value for the capturing parentheses is "ef", which is |
1999 |
|
the last value taken on at the top level. If additional |
2000 |
|
parentheses are added, giving |
2001 |
|
|
2002 |
|
\( ( ( (?>[^()]+) | (?R) )* ) \) |
2003 |
|
^ ^ |
2004 |
|
the string they capture is |
2005 |
|
"ab(cd)ef", the contents of the top level parentheses. If |
2006 |
|
there are more than 15 capturing parentheses in a pattern, |
2007 |
|
PCRE has to obtain extra memory to store data during a |
2008 |
|
recursion, which it does by using pcre_malloc, freeing it |
2009 |
|
via pcre_free afterwards. If no memory can be obtained, it |
2010 |
|
saves data for the first 15 capturing parentheses only, as |
2011 |
|
there is no way to give an out-of-memory error from within a |
2012 |
|
recursion. |
2013 |
|
|
2014 |
|
|
2015 |
|
|
2016 |
PERFORMANCE |
PERFORMANCE |
2017 |
Certain items that may appear in patterns are more efficient |
Certain items that may appear in patterns are more efficient |
2018 |
than others. It is more efficient to use a character class |
than others. It is more efficient to use a character class |
2080 |
|
|
2081 |
|
|
2082 |
|
|
2083 |
|
UTF-8 SUPPORT |
2084 |
|
Starting at release 3.3, PCRE has some support for character |
2085 |
|
strings encoded in the UTF-8 format. This is incomplete, and |
2086 |
|
is regarded as experimental. In order to use it, you must |
2087 |
|
configure PCRE to include UTF-8 support in the code, and, in |
2088 |
|
addition, you must call pcre_compile() with the PCRE_UTF8 |
2089 |
|
option flag. When you do this, both the pattern and any sub- |
2090 |
|
ject strings that are matched against it are treated as |
2091 |
|
UTF-8 strings instead of just strings of bytes, but only in |
2092 |
|
the cases that are mentioned below. |
2093 |
|
|
2094 |
|
If you compile PCRE with UTF-8 support, but do not use it at |
2095 |
|
run time, the library will be a bit bigger, but the addi- |
2096 |
|
tional run time overhead is limited to testing the PCRE_UTF8 |
2097 |
|
flag in several places, so should not be very large. |
2098 |
|
|
2099 |
|
PCRE assumes that the strings it is given contain valid |
2100 |
|
UTF-8 codes. It does not diagnose invalid UTF-8 strings. If |
2101 |
|
you pass invalid UTF-8 strings to PCRE, the results are |
2102 |
|
undefined. |
2103 |
|
|
2104 |
|
Running with PCRE_UTF8 set causes these changes in the way |
2105 |
|
PCRE works: |
2106 |
|
|
2107 |
|
1. In a pattern, the escape sequence \x{...}, where the |
2108 |
|
contents of the braces is a string of hexadecimal digits, is |
2109 |
|
interpreted as a UTF-8 character whose code number is the |
2110 |
|
given hexadecimal number, for example: \x{1234}. This |
2111 |
|
inserts from one to six literal bytes into the pattern, |
2112 |
|
using the UTF-8 encoding. If a non-hexadecimal digit appears |
2113 |
|
between the braces, the item is not recognized. |
2114 |
|
|
2115 |
|
2. The original hexadecimal escape sequence, \xhh, generates |
2116 |
|
a two-byte UTF-8 character if its value is greater than 127. |
2117 |
|
|
2118 |
|
3. Repeat quantifiers are NOT correctly handled if they fol- |
2119 |
|
low a multibyte character. For example, \x{100}* and \xc3+ |
2120 |
|
do not work. If you want to repeat such characters, you must |
2121 |
|
enclose them in non-capturing parentheses, for example |
2122 |
|
(?:\x{100}), at present. |
2123 |
|
|
2124 |
|
4. The dot metacharacter matches one UTF-8 character instead |
2125 |
|
of a single byte. |
2126 |
|
|
2127 |
|
5. Unlike literal UTF-8 characters, the dot metacharacter |
2128 |
|
followed by a repeat quantifier does operate correctly on |
2129 |
|
UTF-8 characters instead of single bytes. |
2130 |
|
|
2131 |
|
6. Although the \x{...} escape is permitted in a character |
2132 |
|
class, characters whose values are greater than 255 cannot |
2133 |
|
be included in a class. |
2134 |
|
|
2135 |
|
7. A class is matched against a UTF-8 character instead of |
2136 |
|
just a single byte, but it can match only characters whose |
2137 |
|
values are less than 256. Characters with greater values |
2138 |
|
always fail to match a class. |
2139 |
|
|
2140 |
|
8. Repeated classes work correctly on multiple characters. |
2141 |
|
|
2142 |
|
9. Classes containing just a single character whose value is |
2143 |
|
greater than 127 (but less than 256), for example, [\x80] or |
2144 |
|
[^\x{93}], do not work because these are optimized into sin- |
2145 |
|
gle byte matches. In the first case, of course, the class |
2146 |
|
brackets are just redundant. |
2147 |
|
|
2148 |
|
10. Lookbehind assertions move backwards in the subject by a |
2149 |
|
fixed number of characters instead of a fixed number of |
2150 |
|
bytes. Simple cases have been tested to work correctly, but |
2151 |
|
there may be hidden gotchas herein. |
2152 |
|
|
2153 |
|
11. The character types such as \d and \w do not work |
2154 |
|
correctly with UTF-8 characters. They continue to test a |
2155 |
|
single byte. |
2156 |
|
|
2157 |
|
12. Anything not explicitly mentioned here continues to work |
2158 |
|
in bytes rather than in characters. |
2159 |
|
|
2160 |
|
The following UTF-8 features of Perl 5.6 are not imple- |
2161 |
|
mented: |
2162 |
|
|
2163 |
|
1. The escape sequence \C to match a single byte. |
2164 |
|
|
2165 |
|
2. The use of Unicode tables and properties and escapes \p, |
2166 |
|
\P, and \X. |
2167 |
|
|
2168 |
|
|
2169 |
|
|
2170 |
|
SAMPLE PROGRAM |
2171 |
|
The code below is a simple, complete demonstration program, |
2172 |
|
to get you started with using PCRE. This code is also sup- |
2173 |
|
plied in the file pcredemo.c in the PCRE distribution. |
2174 |
|
|
2175 |
|
The program compiles the regular expression that is its |
2176 |
|
first argument, and matches it against the subject string in |
2177 |
|
its second argument. No options are set, and default charac- |
2178 |
|
ter tables are used. If matching succeeds, the program out- |
2179 |
|
puts the portion of the subject that matched, together with |
2180 |
|
the contents of any captured substrings. |
2181 |
|
|
2182 |
|
On a Unix system that has PCRE installed in /usr/local, you |
2183 |
|
can compile the demonstration program using a command like |
2184 |
|
this: |
2185 |
|
|
2186 |
|
gcc -o pcredemo pcredemo.c -I/usr/local/include |
2187 |
|
-L/usr/local/lib -lpcre |
2188 |
|
|
2189 |
|
Then you can run simple tests like this: |
2190 |
|
|
2191 |
|
./pcredemo 'cat|dog' 'the cat sat on the mat' |
2192 |
|
|
2193 |
|
Note that there is a much more comprehensive test program, |
2194 |
|
called pcretest, which supports many more facilities for |
2195 |
|
testing regular expressions. The pcredemo program is pro- |
2196 |
|
vided as a simple coding example. |
2197 |
|
|
2198 |
|
On some operating systems (e.g. Solaris) you may get an |
2199 |
|
error like this when you try to run pcredemo: |
2200 |
|
|
2201 |
|
ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such |
2202 |
|
file or directory |
2203 |
|
|
2204 |
|
This is caused by the way shared library support works on |
2205 |
|
those systems. You need to add |
2206 |
|
|
2207 |
|
-R/usr/local/lib |
2208 |
|
|
2209 |
|
to the compile command to get round this problem. Here's the |
2210 |
|
code: |
2211 |
|
|
2212 |
|
#include <stdio.h> |
2213 |
|
#include <string.h> |
2214 |
|
#include <pcre.h> |
2215 |
|
|
2216 |
|
#define OVECCOUNT 30 /* should be a multiple of 3 */ |
2217 |
|
|
2218 |
|
int main(int argc, char **argv) |
2219 |
|
{ |
2220 |
|
pcre *re; |
2221 |
|
const char *error; |
2222 |
|
int erroffset; |
2223 |
|
int ovector[OVECCOUNT]; |
2224 |
|
int rc, i; |
2225 |
|
|
2226 |
|
if (argc != 3) |
2227 |
|
{ |
2228 |
|
printf("Two arguments required: a regex and a " |
2229 |
|
"subject string\n"); |
2230 |
|
return 1; |
2231 |
|
} |
2232 |
|
|
2233 |
|
/* Compile the regular expression in the first argument */ |
2234 |
|
|
2235 |
|
re = pcre_compile( |
2236 |
|
argv[1], /* the pattern */ |
2237 |
|
0, /* default options */ |
2238 |
|
&error, /* for error message */ |
2239 |
|
&erroffset, /* for error offset */ |
2240 |
|
NULL); /* use default character tables */ |
2241 |
|
|
2242 |
|
/* Compilation failed: print the error message and exit */ |
2243 |
|
|
2244 |
|
if (re == NULL) |
2245 |
|
{ |
2246 |
|
printf("PCRE compilation failed at offset %d: %s\n", |
2247 |
|
erroffset, error); |
2248 |
|
return 1; |
2249 |
|
} |
2250 |
|
|
2251 |
|
/* Compilation succeeded: match the subject in the second |
2252 |
|
argument */ |
2253 |
|
|
2254 |
|
rc = pcre_exec( |
2255 |
|
re, /* the compiled pattern */ |
2256 |
|
NULL, /* we didn't study the pattern */ |
2257 |
|
argv[2], /* the subject string */ |
2258 |
|
(int)strlen(argv[2]), /* the length of the subject */ |
2259 |
|
0, /* start at offset 0 in the subject */ |
2260 |
|
0, /* default options */ |
2261 |
|
ovector, /* vector for substring information */ |
2262 |
|
OVECCOUNT); /* number of elements in the vector */ |
2263 |
|
|
2264 |
|
/* Matching failed: handle error cases */ |
2265 |
|
|
2266 |
|
if (rc < 0) |
2267 |
|
{ |
2268 |
|
switch(rc) |
2269 |
|
{ |
2270 |
|
case PCRE_ERROR_NOMATCH: printf("No match\n"); break; |
2271 |
|
/* |
2272 |
|
Handle other special cases if you like |
2273 |
|
*/ |
2274 |
|
default: printf("Matching error %d\n", rc); break; |
2275 |
|
} |
2276 |
|
return 1; |
2277 |
|
} |
2278 |
|
|
2279 |
|
/* Match succeeded */ |
2280 |
|
|
2281 |
|
printf("Match succeeded\n"); |
2282 |
|
|
2283 |
|
/* The output vector wasn't big enough */ |
2284 |
|
|
2285 |
|
if (rc == 0) |
2286 |
|
{ |
2287 |
|
rc = OVECCOUNT/3; |
2288 |
|
printf("ovector only has room for %d captured " |
2289 |
|
"substrings\n", rc - 1); |
2290 |
|
} |
2291 |
|
|
2292 |
|
/* Show substrings stored in the output vector */ |
2293 |
|
|
2294 |
|
for (i = 0; i < rc; i++) |
2295 |
|
{ |
2296 |
|
char *substring_start = argv[2] + ovector[2*i]; |
2297 |
|
int substring_length = ovector[2*i+1] - ovector[2*i]; |
2298 |
|
printf("%2d: %.*s\n", i, substring_length, |
2299 |
|
substring_start); |
2300 |
|
} |
2301 |
|
|
2302 |
|
return 0; |
2303 |
|
} |
2304 |
|
|
2305 |
|
|
2306 |
|
|
2307 |
AUTHOR |
AUTHOR |
2308 |
Philip Hazel <ph10@cam.ac.uk> |
Philip Hazel <ph10@cam.ac.uk> |
2309 |
University Computing Service, |
University Computing Service, |
2311 |
Cambridge CB2 3QG, England. |
Cambridge CB2 3QG, England. |
2312 |
Phone: +44 1223 334714 |
Phone: +44 1223 334714 |
2313 |
|
|
2314 |
Last updated: 29 July 1999 |
Last updated: 15 August 2001 |
2315 |
Copyright (c) 1997-1999 University of Cambridge. |
Copyright (c) 1997-2001 University of Cambridge. |