13 |
.B const unsigned char *\fItableptr\fR); |
.B const unsigned char *\fItableptr\fR); |
14 |
.PP |
.PP |
15 |
.br |
.br |
|
.B const unsigned char *pcre_maketables(void); |
|
|
.PP |
|
|
.br |
|
16 |
.B pcre_extra *pcre_study(const pcre *\fIcode\fR, int \fIoptions\fR, |
.B pcre_extra *pcre_study(const pcre *\fIcode\fR, int \fIoptions\fR, |
17 |
.ti +5n |
.ti +5n |
18 |
.B const char **\fIerrptr\fR); |
.B const char **\fIerrptr\fR); |
25 |
.B int *\fIovector\fR, int \fIovecsize\fR); |
.B int *\fIovector\fR, int \fIovecsize\fR); |
26 |
.PP |
.PP |
27 |
.br |
.br |
28 |
|
.B int pcre_copy_substring(const char *\fIsubject\fR, int *\fIovector\fR, |
29 |
|
.ti +5n |
30 |
|
.B int \fIstringcount\fR, int \fIstringnumber\fR, char *\fIbuffer\fR, |
31 |
|
.ti +5n |
32 |
|
.B int \fIbuffersize\fR); |
33 |
|
.PP |
34 |
|
.br |
35 |
|
.B int pcre_get_substring(const char *\fIsubject\fR, int *\fIovector\fR, |
36 |
|
.ti +5n |
37 |
|
.B int \fIstringcount\fR, int \fIstringnumber\fR, |
38 |
|
.ti +5n |
39 |
|
.B const char **\fIstringptr\fR); |
40 |
|
.PP |
41 |
|
.br |
42 |
|
.B int pcre_get_substring_list(const char *\fIsubject\fR, |
43 |
|
.ti +5n |
44 |
|
.B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);" |
45 |
|
.PP |
46 |
|
.br |
47 |
|
.B const unsigned char *pcre_maketables(void); |
48 |
|
.PP |
49 |
|
.br |
50 |
.B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int |
.B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int |
51 |
.B *\fIfirstcharptr\fR); |
.B *\fIfirstcharptr\fR); |
52 |
.PP |
.PP |
70 |
a set of wrapper functions that correspond to the POSIX API. See |
a set of wrapper functions that correspond to the POSIX API. See |
71 |
\fBpcreposix (3)\fR. |
\fBpcreposix (3)\fR. |
72 |
|
|
73 |
The three functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and |
The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR |
74 |
\fBpcre_exec()\fR are used for compiling and matching regular expressions. The |
are used for compiling and matching regular expressions, while |
75 |
function \fBpcre_maketables()\fR is used (optionally) to build a set of |
\fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and |
76 |
character tables in the current locale for passing to \fBpcre_compile()\fR. |
\fBpcre_get_substring_list()\fR are convenience functions for extracting |
77 |
|
captured substrings from a matched subject string. The function |
78 |
|
\fBpcre_maketables()\fR is used (optionally) to build a set of character tables |
79 |
|
in the current locale for passing to \fBpcre_compile()\fR. |
80 |
|
|
81 |
The function \fBpcre_info()\fR is used to find out information about a compiled |
The function \fBpcre_info()\fR is used to find out information about a compiled |
82 |
pattern, while the function \fBpcre_version()\fR returns a pointer to a string |
pattern, while the function \fBpcre_version()\fR returns a pointer to a string |
255 |
memory containing the tables remains available for as long as it is needed. |
memory containing the tables remains available for as long as it is needed. |
256 |
|
|
257 |
|
|
258 |
|
.SH INFORMATION ABOUT A PATTERN |
259 |
|
The \fBpcre_info()\fR function returns information about a compiled pattern. |
260 |
|
Its yield is the number of capturing subpatterns, or one of the following |
261 |
|
negative numbers: |
262 |
|
|
263 |
|
PCRE_ERROR_NULL the argument \fIcode\fR was NULL |
264 |
|
PCRE_ERROR_BADMAGIC the "magic number" was not found |
265 |
|
|
266 |
|
If the \fIoptptr\fR argument is not NULL, a copy of the options with which the |
267 |
|
pattern was compiled is placed in the integer it points to. |
268 |
|
|
269 |
|
If the \fIfirstcharptr\fR argument is not NULL, it is used to pass back |
270 |
|
information about the first character of any matched string. If there is a |
271 |
|
fixed first character, e.g. from a pattern such as (cat|cow|coyote), then it is |
272 |
|
returned in the integer pointed to by \fIfirstcharptr\fR. Otherwise, if the |
273 |
|
pattern was compiled with the PCRE_MULTILINE option, and every branch started |
274 |
|
with "^", then -1 is returned, indicating that the pattern will match at the |
275 |
|
start of a subject string or after any "\\n" within the string. Otherwise -2 is |
276 |
|
returned. |
277 |
|
|
278 |
|
|
279 |
.SH MATCHING A PATTERN |
.SH MATCHING A PATTERN |
280 |
The function \fBpcre_exec()\fR is called to match a subject string against a |
The function \fBpcre_exec()\fR is called to match a subject string against a |
281 |
pre-compiled pattern, which is passed in the \fIcode\fR argument. If the |
pre-compiled pattern, which is passed in the \fIcode\fR argument. If the |
333 |
subpatterns, the return value from a successful match is 1, indicating that |
subpatterns, the return value from a successful match is 1, indicating that |
334 |
just the first pair of offsets has been set. |
just the first pair of offsets has been set. |
335 |
|
|
336 |
|
Some convenience functions are provided for extracting the captured substrings |
337 |
|
as separate strings. These are described in the following section. |
338 |
|
|
339 |
It is possible for a capturing subpattern number \fIn+1\fR to match some |
It is possible for a capturing subpattern number \fIn+1\fR to match some |
340 |
part of the subject when subpattern \fIn\fR has not been used at all. For |
part of the subject when subpattern \fIn\fR has not been used at all. For |
341 |
example, if the string "abc" is matched against the pattern (a|(z))(bc) |
example, if the string "abc" is matched against the pattern (a|(z))(bc) |
396 |
the end of matching. |
the end of matching. |
397 |
|
|
398 |
|
|
399 |
.SH INFORMATION ABOUT A PATTERN |
.SH EXTRACTING CAPTURED SUBSTRINGS |
400 |
The \fBpcre_info()\fR function returns information about a compiled pattern. |
Captured substrings can be accessed directly by using the offsets returned by |
401 |
Its yield is the number of capturing subpatterns, or one of the following |
\fBpcre_exec()\fR in \fIovector\fR. For convenience, the functions |
402 |
negative numbers: |
\fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and |
403 |
|
\fBpcre_get_substring_list()\fR are provided for extracting captured substrings |
404 |
|
as new, separate, zero-terminated strings. A substring that contains a binary |
405 |
|
zero is correctly extracted and has a further zero added on the end, but the |
406 |
|
result does not, of course, function as a C string. |
407 |
|
|
408 |
|
The first three arguments are the same for all three functions: \fIsubject\fR |
409 |
|
is the subject string which has just been successfully matched, \fIovector\fR |
410 |
|
is a pointer to the vector of integer offsets that was passed to |
411 |
|
\fBpcre_exec()\fR, and \fIstringcount\fR is the number of substrings that |
412 |
|
were captured by the match, including the substring that matched the entire |
413 |
|
regular expression. This is the value returned by \fBpcre_exec()\fR if it |
414 |
|
is greater than zero. If \fBpcre_exec()\fR returned zero, indicating that it |
415 |
|
ran out of space in \fIovector\fR, then the value passed as |
416 |
|
\fIstringcount\fR should be the size of the vector divided by three. |
417 |
|
|
418 |
|
The functions \fBpcre_copy_substring()\fR and \fBpcre_get_substring()\fR |
419 |
|
extract a single substring, whose number is given as \fIstringnumber\fR. A |
420 |
|
value of zero extracts the substring that matched the entire pattern, while |
421 |
|
higher values extract the captured substrings. For \fBpcre_copy_substring()\fR, |
422 |
|
the string is placed in \fIbuffer\fR, whose length is given by |
423 |
|
\fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of store is |
424 |
|
obtained via \fBpcre_malloc\fR, and its address is returned via |
425 |
|
\fIstringptr\fR. The yield of the function is the length of the string, not |
426 |
|
including the terminating zero, or one of |
427 |
|
|
428 |
PCRE_ERROR_NULL the argument \fIcode\fR was NULL |
PCRE_ERROR_NOMEMORY (-6) |
|
PCRE_ERROR_BADMAGIC the "magic number" was not found |
|
429 |
|
|
430 |
If the \fIoptptr\fR argument is not NULL, a copy of the options with which the |
The buffer was too small for \fBpcre_copy_substring()\fR, or the attempt to get |
431 |
pattern was compiled is placed in the integer it points to. |
memory failed for \fBpcre_get_substring()\fR. |
432 |
|
|
433 |
|
PCRE_ERROR_NOSUBSTRING (-7) |
434 |
|
|
435 |
|
There is no substring whose number is \fIstringnumber\fR. |
436 |
|
|
437 |
|
The \fBpcre_get_substring_list()\fR function extracts all available substrings |
438 |
|
and builds a list of pointers to them. All this is done in a single block of |
439 |
|
memory which is obtained via \fBpcre_malloc\fR. The address of the memory block |
440 |
|
is returned via \fIlistptr\fR, which is also the start of the list of string |
441 |
|
pointers. The end of the list is marked by a NULL pointer. The yield of the |
442 |
|
function is zero if all went well, or |
443 |
|
|
444 |
|
PCRE_ERROR_NOMEMORY (-6) |
445 |
|
|
446 |
|
if the attempt to get the memory block failed. |
447 |
|
|
448 |
|
When any of these functions encounter a substring that is unset, which can |
449 |
|
happen when capturing subpattern number \fIn+1\fR matches some part of the |
450 |
|
subject, but subpattern \fIn\fR has not been used at all, they return an empty |
451 |
|
string. This can be distinguished from a genuine zero-length substring by |
452 |
|
inspecting the appropriate offset in \fIovector\fR, which is negative for unset |
453 |
|
substrings. |
454 |
|
|
|
If the \fIfirstcharptr\fR argument is not NULL, it is used to pass back |
|
|
information about the first character of any matched string. If there is a |
|
|
fixed first character, e.g. from a pattern such as (cat|cow|coyote), then it is |
|
|
returned in the integer pointed to by \fIfirstcharptr\fR. Otherwise, if the |
|
|
pattern was compiled with the PCRE_MULTILINE option, and every branch started |
|
|
with "^", then -1 is returned, indicating that the pattern will match at the |
|
|
start of a subject string or after any "\\n" within the string. Otherwise -2 is |
|
|
returned. |
|
455 |
|
|
456 |
|
|
457 |
.SH LIMITATIONS |
.SH LIMITATIONS |
806 |
character class. For example, [d-m] matches any letter between d and m, |
character class. For example, [d-m] matches any letter between d and m, |
807 |
inclusive. If a minus character is required in a class, it must be escaped with |
inclusive. If a minus character is required in a class, it must be escaped with |
808 |
a backslash or appear in a position where it cannot be interpreted as |
a backslash or appear in a position where it cannot be interpreted as |
809 |
indicating a range, typically as the first or last character in the class. It |
indicating a range, typically as the first or last character in the class. |
810 |
is not possible to have the character "]" as the end character of a range, |
|
811 |
since a sequence such as [w-] is interpreted as a class of two characters. The |
It is not possible to have the literal character "]" as the end character of a |
812 |
octal or hexadecimal representation of "]" can, however, be used to end a |
range. A pattern such as [W-]46] is interpreted as a class of two characters |
813 |
range. |
("W" and "-") followed by a literal string "46]", so it would match "W46]" or |
814 |
|
"-46]". However, if the "]" is escaped with a backslash it is interpreted as |
815 |
|
the end of range, so [W-\\]46] is interpreted as a single class containing a |
816 |
|
range followed by two separate characters. The octal or hexadecimal |
817 |
|
representation of "]" can also be used to end a range. |
818 |
|
|
819 |
Ranges operate in ASCII collating sequence. They can also be used for |
Ranges operate in ASCII collating sequence. They can also be used for |
820 |
characters specified numerically, for example [\\000-\\037]. If a range that |
characters specified numerically, for example [\\000-\\037]. If a range that |
1243 |
the current point in the subject string. |
the current point in the subject string. |
1244 |
|
|
1245 |
Once-only subpatterns are not capturing subpatterns. Simple cases such as the |
Once-only subpatterns are not capturing subpatterns. Simple cases such as the |
1246 |
above example can be though of as a maximizing repeat that must swallow |
above example can be thought of as a maximizing repeat that must swallow |
1247 |
everything it can. So, while both \\d+ and \\d+? are prepared to adjust the |
everything it can. So, while both \\d+ and \\d+? are prepared to adjust the |
1248 |
number of digits they match in order to make the rest of the pattern match, |
number of digits they match in order to make the rest of the pattern match, |
1249 |
(?>\\d+) can only match an entire sequence of digits. |
(?>\\d+) can only match an entire sequence of digits. |