/[pcre]/code/trunk/doc/pcre.txt
ViewVC logotype

Contents of /code/trunk/doc/pcre.txt

Parent Directory Parent Directory | Revision Log Revision Log


Revision 53 - (show annotations)
Sat Feb 24 21:39:42 2007 UTC (12 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 99591 byte(s)
Load pcre-3.5 into code/trunk.
1 NAME
2 pcre - Perl-compatible regular expressions.
3
4
5
6 SYNOPSIS
7 #include <pcre.h>
8
9 pcre *pcre_compile(const char *pattern, int options,
10 const char **errptr, int *erroffset,
11 const unsigned char *tableptr);
12
13 pcre_extra *pcre_study(const pcre *code, int options,
14 const char **errptr);
15
16 int pcre_exec(const pcre *code, const pcre_extra *extra,
17 const char *subject, int length, int startoffset,
18 int options, int *ovector, int ovecsize);
19
20 int pcre_copy_substring(const char *subject, int *ovector,
21 int stringcount, int stringnumber, char *buffer,
22 int buffersize);
23
24 int pcre_get_substring(const char *subject, int *ovector,
25 int stringcount, int stringnumber,
26 const char **stringptr);
27
28 int pcre_get_substring_list(const char *subject,
29 int *ovector, int stringcount, const char ***listptr);
30
31 void pcre_free_substring(const char *stringptr);
32
33 void pcre_free_substring_list(const char **stringptr);
34
35 const unsigned char *pcre_maketables(void);
36
37 int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
38 int what, void *where);
39
40 int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
41
42 char *pcre_version(void);
43
44 void *(*pcre_malloc)(size_t);
45
46 void (*pcre_free)(void *);
47
48
49
50
51 DESCRIPTION
52 The PCRE library is a set of functions that implement regu-
53 lar expression pattern matching using the same syntax and
54 semantics as Perl 5, with just a few differences (see
55
56 below). The current implementation corresponds to Perl
57 5.005, with some additional features from later versions.
58 This includes some experimental, incomplete support for
59 UTF-8 encoded strings. Details of exactly what is and what
60 is not supported are given below.
61
62 PCRE has its own native API, which is described in this
63 document. There is also a set of wrapper functions that
64 correspond to the POSIX regular expression API. These are
65 described in the pcreposix documentation.
66
67 The native API function prototypes are defined in the header
68 file pcre.h, and on Unix systems the library itself is
69 called libpcre.a, so can be accessed by adding -lpcre to the
70 command for linking an application which calls it. The
71 header file defines the macros PCRE_MAJOR and PCRE_MINOR to
72 contain the major and minor release numbers for the library.
73 Applications can use these to include support for different
74 releases.
75
76 The functions pcre_compile(), pcre_study(), and pcre_exec()
77 are used for compiling and matching regular expressions. A
78 sample program that demonstrates the simplest way of using
79 them is given in the file pcredemo.c. The last section of
80 this man page describes how to run it.
81
82 The functions pcre_copy_substring(), pcre_get_substring(),
83 and pcre_get_substring_list() are convenience functions for
84 extracting captured substrings from a matched subject
85 string; pcre_free_substring() and pcre_free_substring_list()
86 are also provided, to free the memory used for extracted
87 strings.
88
89 The function pcre_maketables() is used (optionally) to build
90 a set of character tables in the current locale for passing
91 to pcre_compile().
92
93 The function pcre_fullinfo() is used to find out information
94 about a compiled pattern; pcre_info() is an obsolete version
95 which returns only some of the available information, but is
96 retained for backwards compatibility. The function
97 pcre_version() returns a pointer to a string containing the
98 version of PCRE and its date of release.
99
100 The global variables pcre_malloc and pcre_free initially
101 contain the entry points of the standard malloc() and free()
102 functions respectively. PCRE calls the memory management
103 functions via these variables, so a calling program can
104 replace them if it wishes to intercept the calls. This
105 should be done before calling any PCRE functions.
106
107
108
109 MULTI-THREADING
110 The PCRE functions can be used in multi-threading applica-
111 tions, with the proviso that the memory management functions
112 pointed to by pcre_malloc and pcre_free are shared by all
113 threads.
114
115 The compiled form of a regular expression is not altered
116 during matching, so the same compiled pattern can safely be
117 used by several threads at once.
118
119
120
121 COMPILING A PATTERN
122 The function pcre_compile() is called to compile a pattern
123 into an internal form. The pattern is a C string terminated
124 by a binary zero, and is passed in the argument pattern. A
125 pointer to a single block of memory that is obtained via
126 pcre_malloc is returned. This contains the compiled code and
127 related data. The pcre type is defined for the returned
128 block; this is a typedef for a structure whose contents are
129 not externally defined. It is up to the caller to free the
130 memory when it is no longer required.
131
132 Although the compiled code of a PCRE regex is relocatable,
133 that is, it does not depend on memory location, the complete
134 pcre data block is not fully relocatable, because it con-
135 tains a copy of the tableptr argument, which is an address
136 (see below).
137
138 The size of a compiled pattern is roughly proportional to
139 the length of the pattern string, except that each character
140 class (other than those containing just a single character,
141 negated or not) requires 33 bytes, and repeat quantifiers
142 with a minimum greater than one or a bounded maximum cause
143 the relevant portions of the compiled pattern to be repli-
144 cated.
145
146 The options argument contains independent bits that affect
147 the compilation. It should be zero if no options are
148 required. Some of the options, in particular, those that are
149 compatible with Perl, can also be set and unset from within
150 the pattern (see the detailed description of regular expres-
151 sions below). For these options, the contents of the options
152 argument specifies their initial settings at the start of
153 compilation and execution. The PCRE_ANCHORED option can be
154 set at the time of matching as well as at compile time.
155
156 If errptr is NULL, pcre_compile() returns NULL immediately.
157 Otherwise, if compilation of a pattern fails, pcre_compile()
158 returns NULL, and sets the variable pointed to by errptr to
159 point to a textual error message. The offset from the start
160 of the pattern to the character where the error was
161 discovered is placed in the variable pointed to by
162 erroffset, which must not be NULL. If it is, an immediate
163 error is given.
164
165 If the final argument, tableptr, is NULL, PCRE uses a
166 default set of character tables which are built when it is
167 compiled, using the default C locale. Otherwise, tableptr
168 must be the result of a call to pcre_maketables(). See the
169 section on locale support below.
170
171 This code fragment shows a typical straightforward call to
172 pcre_compile():
173
174 pcre *re;
175 const char *error;
176 int erroffset;
177 re = pcre_compile(
178 "^A.*Z", /* the pattern */
179 0, /* default options */
180 &error, /* for error message */
181 &erroffset, /* for error offset */
182 NULL); /* use default character tables */
183
184 The following option bits are defined in the header file:
185
186 PCRE_ANCHORED
187
188 If this bit is set, the pattern is forced to be "anchored",
189 that is, it is constrained to match only at the start of the
190 string which is being searched (the "subject string"). This
191 effect can also be achieved by appropriate constructs in the
192 pattern itself, which is the only way to do it in Perl.
193
194 PCRE_CASELESS
195
196 If this bit is set, letters in the pattern match both upper
197 and lower case letters. It is equivalent to Perl's /i
198 option.
199
200 PCRE_DOLLAR_ENDONLY
201
202 If this bit is set, a dollar metacharacter in the pattern
203 matches only at the end of the subject string. Without this
204 option, a dollar also matches immediately before the final
205 character if it is a newline (but not before any other new-
206 lines). The PCRE_DOLLAR_ENDONLY option is ignored if
207 PCRE_MULTILINE is set. There is no equivalent to this option
208 in Perl.
209
210 PCRE_DOTALL
211
212 If this bit is set, a dot metacharacter in the pattern
213 matches all characters, including newlines. Without it, new-
214 lines are excluded. This option is equivalent to Perl's /s
215 option. A negative class such as [^a] always matches a new-
216 line character, independent of the setting of this option.
217
218 PCRE_EXTENDED
219
220 If this bit is set, whitespace data characters in the pat-
221 tern are totally ignored except when escaped or inside a
222 character class, and characters between an unescaped # out-
223 side a character class and the next newline character,
224 inclusive, are also ignored. This is equivalent to Perl's /x
225 option, and makes it possible to include comments inside
226 complicated patterns. Note, however, that this applies only
227 to data characters. Whitespace characters may never appear
228 within special character sequences in a pattern, for example
229 within the sequence (?( which introduces a conditional sub-
230 pattern.
231
232 PCRE_EXTRA
233
234 This option was invented in order to turn on additional
235 functionality of PCRE that is incompatible with Perl, but it
236 is currently of very little use. When set, any backslash in
237 a pattern that is followed by a letter that has no special
238 meaning causes an error, thus reserving these combinations
239 for future expansion. By default, as in Perl, a backslash
240 followed by a letter with no special meaning is treated as a
241 literal. There are at present no other features controlled
242 by this option. It can also be set by a (?X) option setting
243 within a pattern.
244
245 PCRE_MULTILINE
246
247 By default, PCRE treats the subject string as consisting of
248 a single "line" of characters (even if it actually contains
249 several newlines). The "start of line" metacharacter (^)
250 matches only at the start of the string, while the "end of
251 line" metacharacter ($) matches only at the end of the
252 string, or before a terminating newline (unless
253 PCRE_DOLLAR_ENDONLY is set). This is the same as Perl.
254
255 When PCRE_MULTILINE is set, the "start of line" and "end
256 of line" constructs match immediately following or immedi-
257 ately before any newline in the subject string, respec-
258 tively, as well as at the very start and end. This is
259 equivalent to Perl's /m option. If there are no "\n" charac-
260 ters in a subject string, or no occurrences of ^ or $ in a
261 pattern, setting PCRE_MULTILINE has no effect.
262
263 PCRE_UNGREEDY
264
265 This option inverts the "greediness" of the quantifiers so
266 that they are not greedy by default, but become greedy if
267 followed by "?". It is not compatible with Perl. It can also
268 be set by a (?U) option setting within the pattern.
269
270 PCRE_UTF8
271
272 This option causes PCRE to regard both the pattern and the
273 subject as strings of UTF-8 characters instead of just byte
274 strings. However, it is available only if PCRE has been
275 built to include UTF-8 support. If not, the use of this
276 option provokes an error. Support for UTF-8 is new, experi-
277 mental, and incomplete. Details of exactly what it entails
278 are given below.
279
280
281
282 STUDYING A PATTERN
283 When a pattern is going to be used several times, it is
284 worth spending more time analyzing it in order to speed up
285 the time taken for matching. The function pcre_study() takes
286 a pointer to a compiled pattern as its first argument, and
287 returns a pointer to a pcre_extra block (another typedef for
288 a structure with hidden contents) containing additional
289 information about the pattern; this can be passed to
290 pcre_exec(). If no additional information is available, NULL
291 is returned.
292
293 The second argument contains option bits. At present, no
294 options are defined for pcre_study(), and this argument
295 should always be zero.
296
297 The third argument for pcre_study() is a pointer to an error
298 message. If studying succeeds (even if no data is returned),
299 the variable it points to is set to NULL. Otherwise it
300 points to a textual error message.
301
302 This is a typical call to pcre_study():
303
304 pcre_extra *pe;
305 pe = pcre_study(
306 re, /* result of pcre_compile() */
307 0, /* no options exist */
308 &error); /* set to NULL or points to a message */
309
310 At present, studying a pattern is useful only for non-
311 anchored patterns that do not have a single fixed starting
312 character. A bitmap of possible starting characters is
313 created.
314
315
316
317 LOCALE SUPPORT
318 PCRE handles caseless matching, and determines whether char-
319 acters are letters, digits, or whatever, by reference to a
320 set of tables. The library contains a default set of tables
321 which is created in the default C locale when PCRE is com-
322 piled. This is used when the final argument of
323 pcre_compile() is NULL, and is sufficient for many applica-
324 tions.
325
326 An alternative set of tables can, however, be supplied. Such
327 tables are built by calling the pcre_maketables() function,
328 which has no arguments, in the relevant locale. The result
329 can then be passed to pcre_compile() as often as necessary.
330 For example, to build and use tables that are appropriate
331 for the French locale (where accented characters with codes
332 greater than 128 are treated as letters), the following code
333 could be used:
334
335 setlocale(LC_CTYPE, "fr");
336 tables = pcre_maketables();
337 re = pcre_compile(..., tables);
338
339 The tables are built in memory that is obtained via
340 pcre_malloc. The pointer that is passed to pcre_compile is
341 saved with the compiled pattern, and the same tables are
342 used via this pointer by pcre_study() and pcre_exec(). Thus
343 for any single pattern, compilation, studying and matching
344 all happen in the same locale, but different patterns can be
345 compiled in different locales. It is the caller's responsi-
346 bility to ensure that the memory containing the tables
347 remains available for as long as it is needed.
348
349
350
351 INFORMATION ABOUT A PATTERN
352 The pcre_fullinfo() function returns information about a
353 compiled pattern. It replaces the obsolete pcre_info() func-
354 tion, which is nevertheless retained for backwards compatibil-
355 ity (and is documented below).
356
357 The first argument for pcre_fullinfo() is a pointer to the
358 compiled pattern. The second argument is the result of
359 pcre_study(), or NULL if the pattern was not studied. The
360 third argument specifies which piece of information is
361 required, while the fourth argument is a pointer to a vari-
362 able to receive the data. The yield of the function is zero
363 for success, or one of the following negative numbers:
364
365 PCRE_ERROR_NULL the argument code was NULL
366 the argument where was NULL
367 PCRE_ERROR_BADMAGIC the "magic number" was not found
368 PCRE_ERROR_BADOPTION the value of what was invalid
369
370 Here is a typical call of pcre_fullinfo(), to obtain the
371 length of the compiled pattern:
372
373 int rc;
374 size_t length;
375 rc = pcre_fullinfo(
376 re, /* result of pcre_compile() */
377 pe, /* result of pcre_study(), or NULL */
378 PCRE_INFO_SIZE, /* what is required */
379 &length); /* where to put the data */
380
381 The possible values for the third argument are defined in
382 pcre.h, and are as follows:
383
384 PCRE_INFO_OPTIONS
385
386 Return a copy of the options with which the pattern was com-
387 piled. The fourth argument should point to an unsigned long
388 int variable. These option bits are those specified in the
389 call to pcre_compile(), modified by any top-level option
390 settings within the pattern itself, and with the
391 PCRE_ANCHORED bit forcibly set if the form of the pattern
392 implies that it can match only at the start of a subject
393 string.
394
395 PCRE_INFO_SIZE
396
397 Return the size of the compiled pattern, that is, the value
398 that was passed as the argument to pcre_malloc() when PCRE
399 was getting memory in which to place the compiled data. The
400 fourth argument should point to a size_t variable.
401
402 PCRE_INFO_CAPTURECOUNT
403
404 Return the number of capturing subpatterns in the pattern.
405 The fourth argument should point to an int variable.
406
407 PCRE_INFO_BACKREFMAX
408
409 Return the number of the highest back reference in the pat-
410 tern. The fourth argument should point to an int variable.
411 Zero is returned if there are no back references.
412
413 PCRE_INFO_FIRSTCHAR
414
415 Return information about the first character of any matched
416 string, for a non-anchored pattern. If there is a fixed
417 first character, e.g. from a pattern such as
418 (cat|cow|coyote), it is returned in the integer pointed to
419 by where. Otherwise, if either
420
421 (a) the pattern was compiled with the PCRE_MULTILINE option,
422 and every branch starts with "^", or
423
424 (b) every branch of the pattern starts with ".*" and
425 PCRE_DOTALL is not set (if it were set, the pattern would be
426 anchored),
427
428 -1 is returned, indicating that the pattern matches only at
429 the start of a subject string or after any "\n" within the
430 string. Otherwise -2 is returned. For anchored patterns, -2
431 is returned.
432
433 PCRE_INFO_FIRSTTABLE
434
435 If the pattern was studied, and this resulted in the con-
436 struction of a 256-bit table indicating a fixed set of char-
437 acters for the first character in any matching string, a
438 pointer to the table is returned. Otherwise NULL is
439 returned. The fourth argument should point to an unsigned
440 char * variable.
441
442 PCRE_INFO_LASTLITERAL
443
444 For a non-anchored pattern, return the value of the right-
445 most literal character which must exist in any matched
446 string, other than at its start. The fourth argument should
447 point to an int variable. If there is no such character, or
448 if the pattern is anchored, -1 is returned. For example, for
449 the pattern /a\d+z\d+/ the returned value is 'z'.
450
451 The pcre_info() function is now obsolete because its inter-
452 face is too restrictive to return all the available data
453 about a compiled pattern. New programs should use
454 pcre_fullinfo() instead. The yield of pcre_info() is the
455 number of capturing subpatterns, or one of the following
456 negative numbers:
457
458 PCRE_ERROR_NULL the argument code was NULL
459 PCRE_ERROR_BADMAGIC the "magic number" was not found
460
461 If the optptr argument is not NULL, a copy of the options
462 with which the pattern was compiled is placed in the integer
463 it points to (see PCRE_INFO_OPTIONS above).
464
465 If the pattern is not anchored and the firstcharptr argument
466 is not NULL, it is used to pass back information about the
467 first character of any matched string (see
468 PCRE_INFO_FIRSTCHAR above).
469
470
471
472 MATCHING A PATTERN
473 The function pcre_exec() is called to match a subject string
474
475
476
477
478
479 SunOS 5.8 Last change: 9
480
481
482
483 against a pre-compiled pattern, which is passed in the code
484 argument. If the pattern has been studied, the result of the
485 study should be passed in the extra argument. Otherwise this
486 must be NULL.
487
488 Here is an example of a simple call to pcre_exec():
489
490 int rc;
491 int ovector[30];
492 rc = pcre_exec(
493 re, /* result of pcre_compile() */
494 NULL, /* we didn't study the pattern */
495 "some string", /* the subject string */
496 11, /* the length of the subject string */
497 0, /* start at offset 0 in the subject */
498 0, /* default options */
499 ovector, /* vector for substring information */
500 30); /* number of elements in the vector */
501
502 The PCRE_ANCHORED option can be passed in the options argu-
503 ment, whose unused bits must be zero. However, if a pattern
504 was compiled with PCRE_ANCHORED, or turned out to be
505 anchored by virtue of its contents, it cannot be made
506 unanchored at matching time.
507
508 There are also three further options that can be set only at
509 matching time:
510
511 PCRE_NOTBOL
512
513 The first character of the string is not the beginning of a
514 line, so the circumflex metacharacter should not match
515 before it. Setting this without PCRE_MULTILINE (at compile
516 time) causes circumflex never to match.
517
518 PCRE_NOTEOL
519
520 The end of the string is not the end of a line, so the dol-
521 lar metacharacter should not match it nor (except in multi-
522 line mode) a newline immediately before it. Setting this
523 without PCRE_MULTILINE (at compile time) causes dollar never
524 to match.
525
526 PCRE_NOTEMPTY
527
528 An empty string is not considered to be a valid match if
529 this option is set. If there are alternatives in the pat-
530 tern, they are tried. If all the alternatives match the
531 empty string, the entire match fails. For example, if the
532 pattern
533
534 a?b?
535
536 is applied to a string not beginning with "a" or "b", it
537 matches the empty string at the start of the subject. With
538 PCRE_NOTEMPTY set, this match is not valid, so PCRE searches
539 further into the string for occurrences of "a" or "b".
540
541 Perl has no direct equivalent of PCRE_NOTEMPTY, but it does
542 make a special case of a pattern match of the empty string
543 within its split() function, and when using the /g modifier.
544 It is possible to emulate Perl's behaviour after matching a
545 null string by first trying the match again at the same
546 offset with PCRE_NOTEMPTY set, and then if that fails by
547 advancing the starting offset (see below) and trying an
548 ordinary match again.
549
550 The subject string is passed as a pointer in subject, a
551 length in length, and a starting offset in startoffset.
552 Unlike the pattern string, the subject may contain binary
553 zero characters. When the starting offset is zero, the
554 search for a match starts at the beginning of the subject,
555 and this is by far the most common case.
556
557 A non-zero starting offset is useful when searching for
558 another match in the same subject by calling pcre_exec()
559 again after a previous success. Setting startoffset differs
560 from just passing over a shortened string and setting
561 PCRE_NOTBOL in the case of a pattern that begins with any
562 kind of lookbehind. For example, consider the pattern
563
564 \Biss\B
565
566 which finds occurrences of "iss" in the middle of words. (\B
567 matches only if the current position in the subject is not a
568 word boundary.) When applied to the string "Mississipi" the
569 first call to pcre_exec() finds the first occurrence. If
570 pcre_exec() is called again with just the remainder of the
571 subject, namely "issipi", it does not match, because \B is
572 always false at the start of the subject, which is deemed to
573 be a word boundary. However, if pcre_exec() is passed the
574 entire string again, but with startoffset set to 4, it finds
575 the second occurrence of "iss" because it is able to look
576 behind the starting point to discover that it is preceded by
577 a letter.
578
579 If a non-zero starting offset is passed when the pattern is
580 anchored, one attempt to match at the given offset is tried.
581 This can only succeed if the pattern does not require the
582 match to be at the start of the subject.
583
584 In general, a pattern matches a certain portion of the sub-
585 ject, and in addition, further substrings from the subject
586 may be picked out by parts of the pattern. Following the
587 usage in Jeffrey Friedl's book, this is called "capturing"
588 in what follows, and the phrase "capturing subpattern" is
589 used for a fragment of a pattern that picks out a substring.
590 PCRE supports several other kinds of parenthesized subpat-
591 tern that do not cause substrings to be captured.
592
593 Captured substrings are returned to the caller via a vector
594 of integer offsets whose address is passed in ovector. The
595 number of elements in the vector is passed in ovecsize. The
596 first two-thirds of the vector is used to pass back captured
597 substrings, each substring using a pair of integers. The
598 remaining third of the vector is used as workspace by
599 pcre_exec() while matching capturing subpatterns, and is not
600 available for passing back information. The length passed in
601 ovecsize should always be a multiple of three. If it is not,
602 it is rounded down.
603
604 When a match has been successful, information about captured
605 substrings is returned in pairs of integers, starting at the
606 beginning of ovector, and continuing up to two-thirds of its
607 length at the most. The first element of a pair is set to
608 the offset of the first character in a substring, and the
609 second is set to the offset of the first character after the
610 end of a substring. The first pair, ovector[0] and ovec-
611 tor[1], identify the portion of the subject string matched
612 by the entire pattern. The next pair is used for the first
613 capturing subpattern, and so on. The value returned by
614 pcre_exec() is the number of pairs that have been set. If
615 there are no capturing subpatterns, the return value from a
616 successful match is 1, indicating that just the first pair
617 of offsets has been set.
618
619 Some convenience functions are provided for extracting the
620 captured substrings as separate strings. These are described
621 in the following section.
622
623 It is possible for a capturing subpattern number n+1 to
624 match some part of the subject when subpattern n has not
625 been used at all. For example, if the string "abc" is
626 matched against the pattern (a|(z))(bc) subpatterns 1 and 3
627 are matched, but 2 is not. When this happens, both offset
628 values corresponding to the unused subpattern are set to -1.
629
630 If a capturing subpattern is matched repeatedly, it is the
631 last portion of the string that it matched that gets
632 returned.
633
634 If the vector is too small to hold all the captured sub-
635 strings, it is used as far as possible (up to two-thirds of
636 its length), and the function returns a value of zero. In
637 particular, if the substring offsets are not of interest,
638 pcre_exec() may be called with ovector passed as NULL and
639 ovecsize as zero. However, if the pattern contains back
640 references and the ovector isn't big enough to remember the
641 related substrings, PCRE has to get additional memory for
642 use during matching. Thus it is usually advisable to supply
643 an ovector.
644
645 Note that pcre_info() can be used to find out how many cap-
646 turing subpatterns there are in a compiled pattern. The
647 smallest size for ovector that will allow for n captured
648 substrings in addition to the offsets of the substring
649 matched by the whole pattern is (n+1)*3.
650
651 If pcre_exec() fails, it returns a negative number. The fol-
652 lowing are defined in the header file:
653
654 PCRE_ERROR_NOMATCH (-1)
655
656 The subject string did not match the pattern.
657
658 PCRE_ERROR_NULL (-2)
659
660 Either code or subject was passed as NULL, or ovector was
661 NULL and ovecsize was not zero.
662
663 PCRE_ERROR_BADOPTION (-3)
664
665 An unrecognized bit was set in the options argument.
666
667 PCRE_ERROR_BADMAGIC (-4)
668
669 PCRE stores a 4-byte "magic number" at the start of the com-
670 piled code, to catch the case when it is passed a junk
671 pointer. This is the error it gives when the magic number
672 isn't present.
673
674 PCRE_ERROR_UNKNOWN_NODE (-5)
675
676 While running the pattern match, an unknown item was encoun-
677 tered in the compiled pattern. This error could be caused by
678 a bug in PCRE or by overwriting of the compiled pattern.
679
680 PCRE_ERROR_NOMEMORY (-6)
681
682 If a pattern contains back references, but the ovector that
683 is passed to pcre_exec() is not big enough to remember the
684 referenced substrings, PCRE gets a block of memory at the
685 start of matching to use for this purpose. If the call via
686 pcre_malloc() fails, this error is given. The memory is
687 freed at the end of matching.
688
689
690
691
692 EXTRACTING CAPTURED SUBSTRINGS
693 Captured substrings can be accessed directly by using the
694 offsets returned by pcre_exec() in ovector. For convenience,
695 the functions pcre_copy_substring(), pcre_get_substring(),
696 and pcre_get_substring_list() are provided for extracting
697 captured substrings as new, separate, zero-terminated
698 strings. A substring that contains a binary zero is
699 correctly extracted and has a further zero added on the end,
700 but the result does not, of course, function as a C string.
701
702 The first three arguments are the same for all three func-
703 tions: subject is the subject string which has just been
704 successfully matched, ovector is a pointer to the vector of
705 integer offsets that was passed to pcre_exec(), and
706 stringcount is the number of substrings that were captured
707 by the match, including the substring that matched the
708 entire regular expression. This is the value returned by
709 pcre_exec if it is greater than zero. If pcre_exec()
710 returned zero, indicating that it ran out of space in ovec-
711 tor, the value passed as stringcount should be the size of
712 the vector divided by three.
713
714 The functions pcre_copy_substring() and pcre_get_substring()
715 extract a single substring, whose number is given as string-
716 number. A value of zero extracts the substring that matched
717 the entire pattern, while higher values extract the captured
718 substrings. For pcre_copy_substring(), the string is placed
719 in buffer, whose length is given by buffersize, while for
720 pcre_get_substring() a new block of memory is obtained via
721 pcre_malloc, and its address is returned via stringptr. The
722 yield of the function is the length of the string, not
723 including the terminating zero, or one of
724
725 PCRE_ERROR_NOMEMORY (-6)
726
727 The buffer was too small for pcre_copy_substring(), or the
728 attempt to get memory failed for pcre_get_substring().
729
730 PCRE_ERROR_NOSUBSTRING (-7)
731
732 There is no substring whose number is stringnumber.
733
734 The pcre_get_substring_list() function extracts all avail-
735 able substrings and builds a list of pointers to them. All
736 this is done in a single block of memory which is obtained
737 via pcre_malloc. The address of the memory block is returned
738 via listptr, which is also the start of the list of string
739 pointers. The end of the list is marked by a NULL pointer.
740 The yield of the function is zero if all went well, or
741
742 PCRE_ERROR_NOMEMORY (-6)
743
744 if the attempt to get the memory block failed.
745
746 When any of these functions encounters a substring that is
747 unset, which can happen when capturing subpattern number n+1
748 matches some part of the subject, but subpattern n has not
749 been used at all, they return an empty string. This can be
750 distinguished from a genuine zero-length substring by
751 inspecting the appropriate offset in ovector, which is nega-
752 tive for unset substrings.
753
754 The two convenience functions pcre_free_substring() and
755 pcre_free_substring_list() can be used to free the memory
756 returned by a previous call of pcre_get_substring() or
757 pcre_get_substring_list(), respectively. They do nothing
758 more than call the function pointed to by pcre_free, which
759 of course could be called directly from a C program. How-
760 ever, PCRE is used in some situations where it is linked via
761 a special interface to another programming language which
762 cannot use pcre_free directly; it is for these cases that
763 the functions are provided.
764
765
766
767 LIMITATIONS
768 There are some size limitations in PCRE but it is hoped that
769 they will never in practice be relevant. The maximum length
770 of a compiled pattern is 65539 (sic) bytes. All values in
771 repeating quantifiers must be less than 65536. The max-
772 imum number of capturing subpatterns is 65535. There is no
773 limit to the number of non-capturing subpatterns, but the
774 maximum depth of nesting of all kinds of parenthesized sub-
775 pattern, including capturing subpatterns, assertions, and
776 other types of subpattern, is 200.
777
778 The maximum length of a subject string is the largest posi-
779 tive number that an integer variable can hold. However, PCRE
780 uses recursion to handle subpatterns and indefinite repeti-
781 tion. This means that the available stack space may limit
782 the size of a subject string that can be processed by cer-
783 tain patterns.
784
785
786
787 DIFFERENCES FROM PERL
788 The differences described here are with respect to Perl
789 5.005.
790
791 1. By default, a whitespace character is any character that
792 the C library function isspace() recognizes, though it is
793 possible to compile PCRE with alternative character type
794 tables. Normally isspace() matches space, formfeed, newline,
795 carriage return, horizontal tab, and vertical tab. Perl 5 no
796 longer includes vertical tab in its set of whitespace char-
797 acters. The \v escape that was in the Perl documentation for
798 a long time was never in fact recognized. However, the char-
799 acter itself was treated as whitespace at least up to 5.002.
800 In 5.004 and 5.005 it does not match \s.
801
802 2. PCRE does not allow repeat quantifiers on lookahead
803 assertions. Perl permits them, but they do not mean what you
804 might think. For example, (?!a){3} does not assert that the
805 next three characters are not "a". It just asserts that the
806 next character is not "a" three times.
807
808 3. Capturing subpatterns that occur inside negative looka-
809 head assertions are counted, but their entries in the
810 offsets vector are never set. Perl sets its numerical vari-
811 ables from any such patterns that are matched before the
812 assertion fails to match something (thereby succeeding), but
813 only if the negative lookahead assertion contains just one
814 branch.
815
816 4. Though binary zero characters are supported in the sub-
817 ject string, they are not allowed in a pattern string
818 because it is passed as a normal C string, terminated by
819 zero. The escape sequence "\0" can be used in the pattern to
820 represent a binary zero.
821
822 5. The following Perl escape sequences are not supported:
823 \l, \u, \L, \U, \E, \Q. In fact these are implemented by
824 Perl's general string-handling and are not part of its pat-
825 tern matching engine.
826
827 6. The Perl \G assertion is not supported as it is not
828 relevant to single pattern matches.
829
830 7. Fairly obviously, PCRE does not support the (?{code}) and
831 (?p{code}) constructions. However, there is some experimen-
832 tal support for recursive patterns using the non-Perl item
833 (?R).
834
835 8. There are at the time of writing some oddities in Perl
836 5.005_02 concerned with the settings of captured strings
837 when part of a pattern is repeated. For example, matching
838 "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
839 "b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2
840 unset. However, if the pattern is changed to
841 /^(aa(b(b))?)+$/ then $2 (and $3) are set.
842
843 In Perl 5.004 $2 is set in both cases, and that is also true
844 of PCRE. If in the future Perl changes to a consistent state
845 that is different, PCRE may change to follow.
846
847 9. Another as yet unresolved discrepancy is that in Perl
848 5.005_02 the pattern /^(a)?(?(1)a|b)+$/ matches the string
849 "a", whereas in PCRE it does not. However, in both Perl and
850 PCRE /^(a)?a/ matched against "a" leaves $1 unset.
851
852 10. PCRE provides some extensions to the Perl regular
853 expression facilities:
854
855 (a) Although lookbehind assertions must match fixed length
856 strings, each alternative branch of a lookbehind assertion
857 can match a different length of string. Perl 5.005 requires
858 them all to have the same length.
859
860 (b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not
861 set, the $ meta-character matches only at the very end of
862 the string.
863
864 (c) If PCRE_EXTRA is set, a backslash followed by a letter
865 with no special meaning is faulted.
866
867 (d) If PCRE_UNGREEDY is set, the greediness of the repeti-
868 tion quantifiers is inverted, that is, by default they are
869 not greedy, but if followed by a question mark they are.
870
871 (e) PCRE_ANCHORED can be used to force a pattern to be tried
872 only at the start of the subject.
873
874 (f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options
875 for pcre_exec() have no Perl equivalents.
876
877 (g) The (?R) construct allows for recursive pattern matching
878 (Perl 5.6 can do this using the (?p{code}) construct, which
879 PCRE cannot of course support.)
880
881
882
883 REGULAR EXPRESSION DETAILS
884 The syntax and semantics of the regular expressions sup-
885 ported by PCRE are described below. Regular expressions are
886 also described in the Perl documentation and in a number of
887 other books, some of which have copious examples. Jeffrey
888 Friedl's "Mastering Regular Expressions", published by
889 O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
890
891 The description here is intended as reference documentation.
892 The basic operation of PCRE is on strings of bytes. However,
893 there are the beginnings of some support for UTF-8 character
894 strings. To use this support you must configure PCRE to
895 include it, and then call pcre_compile() with the PCRE_UTF8
896 option. How this affects the pattern matching is described
897 in the final section of this document.
898
899 A regular expression is a pattern that is matched against a
900 subject string from left to right. Most characters stand for
901 themselves in a pattern, and match the corresponding charac-
902 ters in the subject. As a trivial example, the pattern
903
904 The quick brown fox
905
906 matches a portion of a subject string that is identical to
907 itself. The power of regular expressions comes from the
908 ability to include alternatives and repetitions in the pat-
909 tern. These are encoded in the pattern by the use of meta-
910 characters, which do not stand for themselves but instead
911 are interpreted in some special way.
912
913 There are two different sets of meta-characters: those that
914 are recognized anywhere in the pattern except within square
915 brackets, and those that are recognized in square brackets.
916 Outside square brackets, the meta-characters are as follows:
917
918 \ general escape character with several uses
919 ^ assert start of subject (or line, in multiline
920 mode)
921 $ assert end of subject (or line, in multiline mode)
922 . match any character except newline (by default)
923 [ start character class definition
924 | start of alternative branch
925 ( start subpattern
926 ) end subpattern
927 ? extends the meaning of (
928 also 0 or 1 quantifier
929 also quantifier minimizer
930 * 0 or more quantifier
931 + 1 or more quantifier
932 { start min/max quantifier
933
934 Part of a pattern that is in square brackets is called a
935 "character class". In a character class the only meta-
936 characters are:
937
938 \ general escape character
939 ^ negate the class, but only if the first character
940 - indicates character range
941 ] terminates the character class
942
943 The following sections describe the use of each of the
944 meta-characters.
945
946
947
948 BACKSLASH
949 The backslash character has several uses. Firstly, if it is
950 followed by a non-alphameric character, it takes away any
951 special meaning that character may have. This use of
952
953 backslash as an escape character applies both inside and
954 outside character classes.
955
956 For example, if you want to match a "*" character, you write
957 "\*" in the pattern. This applies whether or not the follow-
958 ing character would otherwise be interpreted as a meta-
959 character, so it is always safe to precede a non-alphameric
960 with "\" to specify that it stands for itself. In particu-
961 lar, if you want to match a backslash, you write "\\".
962
963 If a pattern is compiled with the PCRE_EXTENDED option, white-
964 space in the pattern (other than in a character class) and
965 characters between a "#" outside a character class and the
966 next newline character are ignored. An escaping backslash
967 can be used to include a whitespace or "#" character as part
968 of the pattern.
969
970 A second use of backslash provides a way of encoding non-
971 printing characters in patterns in a visible manner. There
972 is no restriction on the appearance of non-printing charac-
973 ters, apart from the binary zero that terminates a pattern,
974 but when a pattern is being prepared by text editing, it is
975 usually easier to use one of the following escape sequences
976 than the binary character it represents:
977
978 \a alarm, that is, the BEL character (hex 07)
979 \cx "control-x", where x is any character
980 \e escape (hex 1B)
981 \f formfeed (hex 0C)
982 \n newline (hex 0A)
983 \r carriage return (hex 0D)
984 \t tab (hex 09)
985 \xhh character with hex code hh
986 \ddd character with octal code ddd, or backreference
987
988 The precise effect of "\cx" is as follows: if "x" is a lower
989 case letter, it is converted to upper case. Then bit 6 of
990 the character (hex 40) is inverted. Thus "\cz" becomes hex
991 1A, but "\c{" becomes hex 3B, while "\c;" becomes hex 7B.
992
993 After "\x", up to two hexadecimal digits are read (letters
994 can be in upper or lower case).
995
996 After "\0" up to two further octal digits are read. In both
997 cases, if there are fewer than two digits, just those that
998 are present are used. Thus the sequence "\0\x\07" specifies
999 two binary zeros followed by a BEL character. Make sure you
1000 supply two digits after the initial zero if the character
1001 that follows is itself an octal digit.
1002
1003 The handling of a backslash followed by a digit other than 0
1004 is complicated. Outside a character class, PCRE reads it
1005 and any following digits as a decimal number. If the number
1006 is less than 10, or if there have been at least that many
1007 previous capturing left parentheses in the expression, the
1008 entire sequence is taken as a back reference. A description
1009 of how this works is given later, following the discussion
1010 of parenthesized subpatterns.
1011
1012 Inside a character class, or if the decimal number is
1013 greater than 9 and there have not been that many capturing
1014 subpatterns, PCRE re-reads up to three octal digits follow-
1015 ing the backslash, and generates a single byte from the
1016 least significant 8 bits of the value. Any subsequent digits
1017 stand for themselves. For example:
1018
1019 \040 is another way of writing a space
1020 \40 is the same, provided there are fewer than 40
1021 previous capturing subpatterns
1022 \7 is always a back reference
1023 \11 might be a back reference, or another way of
1024 writing a tab
1025 \011 is always a tab
1026 \0113 is a tab followed by the character "3"
1027 \113 might be a back reference, otherwise the
1028 character with octal code 113
1029 \377 is a byte consisting entirely of 1 bits
1030 \81 is either a back reference, or a binary zero
1031 followed by the two characters "8" and "1"
1032
1033 Note that octal values of 100 or greater must not be intro-
1034 duced by a leading zero, because no more than three octal
1035 digits are ever read.
1036
1037 All the sequences that define a single byte value can be
1038 used both inside and outside character classes. In addition,
1039 inside a character class, the sequence "\b" is interpreted
1040 as the backspace character (hex 08). Outside a character
1041 class it has a different meaning (see below).
1042
1043 The third use of backslash is for specifying generic charac-
1044 ter types:
1045
1046 \d any decimal digit
1047 \D any character that is not a decimal digit
1048 \s any whitespace character
1049 \S any character that is not a whitespace character
1050 \w any "word" character
1051 \W any "non-word" character
1052
1053 Each pair of escape sequences partitions the complete set of
1054 characters into two disjoint sets. Any given character
1055 matches one, and only one, of each pair.
1056
1057 A "word" character is any letter or digit or the underscore
1058 character, that is, any character which can be part of a
1059 Perl "word". The definition of letters and digits is con-
1060 trolled by PCRE's character tables, and may vary if locale-
1061 specific matching is taking place (see "Locale support"
1062 above). For example, in the "fr" (French) locale, some char-
1063 acter codes greater than 128 are used for accented letters,
1064 and these are matched by \w.
1065
1066 These character type sequences can appear both inside and
1067 outside character classes. They each match one character of
1068 the appropriate type. If the current matching point is at
1069 the end of the subject string, all of them fail, since there
1070 is no character to match.
1071
1072 The fourth use of backslash is for certain simple asser-
1073 tions. An assertion specifies a condition that has to be met
1074 at a particular point in a match, without consuming any
1075 characters from the subject string. The use of subpatterns
1076 for more complicated assertions is described below. The
1077 backslashed assertions are
1078
1079 \b word boundary
1080 \B not a word boundary
1081 \A start of subject (independent of multiline mode)
1082 \Z end of subject or newline at end (independent of
1083 multiline mode)
1084 \z end of subject (independent of multiline mode)
1085
1086 These assertions may not appear in character classes (but
1087 note that "\b" has a different meaning, namely the backspace
1088 character, inside a character class).
1089
1090 A word boundary is a position in the subject string where
1091 the current character and the previous character do not both
1092 match \w or \W (i.e. one matches \w and the other matches
1093 \W), or the start or end of the string if the first or last
1094 character matches \w, respectively.
1095
1096 The \A, \Z, and \z assertions differ from the traditional
1097 circumflex and dollar (described below) in that they only
1098 ever match at the very start and end of the subject string,
1099 whatever options are set. They are not affected by the
1100 PCRE_NOTBOL or PCRE_NOTEOL options. If the startoffset argu-
1101 ment of pcre_exec() is non-zero, \A can never match. The
1102 difference between \Z and \z is that \Z matches before a
1103 newline that is the last character of the string as well as
1104 at the end of the string, whereas \z matches only at the
1105 end.
1106
1107
1108
1109 CIRCUMFLEX AND DOLLAR
1110 Outside a character class, in the default matching mode, the
1111 circumflex character is an assertion which is true only if
1112 the current matching point is at the start of the subject
1113 string. If the startoffset argument of pcre_exec() is non-
1114 zero, circumflex can never match. Inside a character class,
1115 circumflex has an entirely different meaning (see below).
1116
1117 Circumflex need not be the first character of the pattern if
1118 a number of alternatives are involved, but it should be the
1119 first thing in each alternative in which it appears if the
1120 pattern is ever to match that branch. If all possible alter-
1121 natives start with a circumflex, that is, if the pattern is
1122 constrained to match only at the start of the subject, it is
1123 said to be an "anchored" pattern. (There are also other con-
1124 structs that can cause a pattern to be anchored.)
1125
1126 A dollar character is an assertion which is true only if the
1127 current matching point is at the end of the subject string,
1128 or immediately before a newline character that is the last
1129 character in the string (by default). Dollar need not be the
1130 last character of the pattern if a number of alternatives
1131 are involved, but it should be the last item in any branch
1132 in which it appears. Dollar has no special meaning in a
1133 character class.
1134
1135 The meaning of dollar can be changed so that it matches only
1136 at the very end of the string, by setting the
1137 PCRE_DOLLAR_ENDONLY option at compile or matching time. This
1138 does not affect the \Z assertion.
1139
1140 The meanings of the circumflex and dollar characters are
1141 changed if the PCRE_MULTILINE option is set. When this is
1142 the case, they match immediately after and immediately
1143 before an internal "\n" character, respectively, in addition
1144 to matching at the start and end of the subject string. For
1145 example, the pattern /^abc$/ matches the subject string
1146 "def\nabc" in multiline mode, but not otherwise. Conse-
1147 quently, patterns that are anchored in single line mode
1148 because all branches start with "^" are not anchored in mul-
1149 tiline mode, and a match for circumflex is possible when the
1150 startoffset argument of pcre_exec() is non-zero. The
1151 PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is
1152 set.
1153
1154 Note that the sequences \A, \Z, and \z can be used to match
1155 the start and end of the subject in both modes, and if all
1156 branches of a pattern start with \A it is always anchored,
1157 whether PCRE_MULTILINE is set or not.
1158
1159
1160
1161 FULL STOP (PERIOD, DOT)
1162 Outside a character class, a dot in the pattern matches any
1163 one character in the subject, including a non-printing char-
1164 acter, but not (by default) newline. If the PCRE_DOTALL
1165 option is set, dots match newlines as well. The handling of
1166 dot is entirely independent of the handling of circumflex
1167 and dollar, the only relationship being that they both
1168 involve newline characters. Dot has no special meaning in a
1169 character class.
1170
1171
1172
1173 SQUARE BRACKETS
1174 An opening square bracket introduces a character class, ter-
1175 minated by a closing square bracket. A closing square
1176 bracket on its own is not special. If a closing square
1177 bracket is required as a member of the class, it should be
1178 the first data character in the class (after an initial cir-
1179 cumflex, if present) or escaped with a backslash.
1180
1181 A character class matches a single character in the subject;
1182 the character must be in the set of characters defined by
1183 the class, unless the first character in the class is a cir-
1184 cumflex, in which case the subject character must not be in
1185 the set defined by the class. If a circumflex is actually
1186 required as a member of the class, ensure it is not the
1187 first character, or escape it with a backslash.
1188
1189 For example, the character class [aeiou] matches any lower
1190 case vowel, while [^aeiou] matches any character that is not
1191 a lower case vowel. Note that a circumflex is just a con-
1192 venient notation for specifying the characters which are in
1193 the class by enumerating those that are not. It is not an
1194 assertion: it still consumes a character from the subject
1195 string, and fails if the current pointer is at the end of
1196 the string.
1197
1198 When caseless matching is set, any letters in a class
1199 represent both their upper case and lower case versions, so
1200 for example, a caseless [aeiou] matches "A" as well as "a",
1201 and a caseless [^aeiou] does not match "A", whereas a case-
1202 ful version would.
1203
1204 The newline character is never treated in any special way in
1205 character classes, whatever the setting of the PCRE_DOTALL
1206 or PCRE_MULTILINE options is. A class such as [^a] will
1207 always match a newline.
1208
1209 The minus (hyphen) character can be used to specify a range
1210 of characters in a character class. For example, [d-m]
1211 matches any letter between d and m, inclusive. If a minus
1212 character is required in a class, it must be escaped with a
1213 backslash or appear in a position where it cannot be inter-
1214 preted as indicating a range, typically as the first or last
1215 character in the class.
1216
1217 It is not possible to have the literal character "]" as the
1218 end character of a range. A pattern such as [W-]46] is
1219 interpreted as a class of two characters ("W" and "-") fol-
1220 lowed by a literal string "46]", so it would match "W46]" or
1221 "-46]". However, if the "]" is escaped with a backslash it
1222 is interpreted as the end of range, so [W-\]46] is inter-
1223 preted as a single class containing a range followed by two
1224 separate characters. The octal or hexadecimal representation
1225 of "]" can also be used to end a range.
1226
1227 Ranges operate in ASCII collating sequence. They can also be
1228 used for characters specified numerically, for example
1229 [\000-\037]. If a range that includes letters is used when
1230 caseless matching is set, it matches the letters in either
1231 case. For example, [W-c] is equivalent to [][\^_`wxyzabc],
1232 matched caselessly, and if character tables for the "fr"
1233 locale are in use, [\xc8-\xcb] matches accented E characters
1234 in both cases.
1235
1236 The character types \d, \D, \s, \S, \w, and \W may also
1237 appear in a character class, and add the characters that
1238 they match to the class. For example, [\dABCDEF] matches any
1239 hexadecimal digit. A circumflex can conveniently be used
1240 with the upper case character types to specify a more res-
1241 tricted set of characters than the matching lower case type.
1242 For example, the class [^\W_] matches any letter or digit,
1243 but not underscore.
1244
1245 All non-alphameric characters other than \, -, ^ (at the
1246 start) and the terminating ] are non-special in character
1247 classes, but it does no harm if they are escaped.
1248
1249
1250
1251 POSIX CHARACTER CLASSES
1252 Perl 5.6 (not yet released at the time of writing) is going
1253 to support the POSIX notation for character classes, which
1254 uses names enclosed by [: and :] within the enclosing
1255 square brackets. PCRE supports this notation. For example,
1256
1257 [01[:alpha:]%]
1258
1259 matches "0", "1", any alphabetic character, or "%". The sup-
1260 ported class names are
1261
1262 alnum letters and digits
1263 alpha letters
1264 ascii character codes 0 - 127
1265 cntrl control characters
1266 digit decimal digits (same as \d)
1267 graph printing characters, excluding space
1268 lower lower case letters
1269 print printing characters, including space
1270 punct printing characters, excluding letters and digits
1271 space white space (same as \s)
1272 upper upper case letters
1273 word "word" characters (same as \w)
1274 xdigit hexadecimal digits
1275
1276 The names "ascii" and "word" are Perl extensions. Another
1277 Perl extension is negation, which is indicated by a ^ char-
1278 acter after the colon. For example,
1279
1280 [12[:^digit:]]
1281
1282 matches "1", "2", or any non-digit. PCRE (and Perl) also
1283 recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a
1284 "collating element", but these are not supported, and an
1285 error is given if they are encountered.
1286
1287
1288
1289 VERTICAL BAR
1290 Vertical bar characters are used to separate alternative
1291 patterns. For example, the pattern
1292
1293 gilbert|sullivan
1294
1295 matches either "gilbert" or "sullivan". Any number of alter-
1296 natives may appear, and an empty alternative is permitted
1297 (matching the empty string). The matching process tries
1298 each alternative in turn, from left to right, and the first
1299 one that succeeds is used. If the alternatives are within a
1300 subpattern (defined below), "succeeds" means matching the
1301 rest of the main pattern as well as the alternative in the
1302 subpattern.
1303
1304
1305
1306 INTERNAL OPTION SETTING
1307 The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL,
1308 and PCRE_EXTENDED can be changed from within the pattern by
1309 a sequence of Perl option letters enclosed between "(?" and
1310 ")". The option letters are
1311
1312 i for PCRE_CASELESS
1313 m for PCRE_MULTILINE
1314 s for PCRE_DOTALL
1315 x for PCRE_EXTENDED
1316
1317 For example, (?im) sets caseless, multiline matching. It is
1318 also possible to unset these options by preceding the letter
1319 with a hyphen, and a combined setting and unsetting such as
1320 (?im-sx), which sets PCRE_CASELESS and PCRE_MULTILINE while
1321 unsetting PCRE_DOTALL and PCRE_EXTENDED, is also permitted.
1322 If a letter appears both before and after the hyphen, the
1323 option is unset.
1324
1325 The scope of these option changes depends on where in the
1326 pattern the setting occurs. For settings that are outside
1327 any subpattern (defined below), the effect is the same as if
1328 the options were set or unset at the start of matching. The
1329 following patterns all behave in exactly the same way:
1330
1331 (?i)abc
1332 a(?i)bc
1333 ab(?i)c
1334 abc(?i)
1335
1336 which in turn is the same as compiling the pattern abc with
1337 PCRE_CASELESS set. In other words, such "top level" set-
1338 tings apply to the whole pattern (unless there are other
1339 changes inside subpatterns). If there is more than one set-
1340 ting of the same option at top level, the rightmost setting
1341 is used.
1342
1343 If an option change occurs inside a subpattern, the effect
1344 is different. This is a change of behaviour in Perl 5.005.
1345 An option change inside a subpattern affects only that part
1346 of the subpattern that follows it, so
1347
1348 (a(?i)b)c
1349
1350 matches abc and aBc and no other strings (assuming
1351 PCRE_CASELESS is not used). By this means, options can be
1352 made to have different settings in different parts of the
1353 pattern. Any changes made in one alternative do carry on
1354 into subsequent branches within the same subpattern. For
1355 example,
1356
1357 (a(?i)b|c)
1358
1359 matches "ab", "aB", "c", and "C", even though when matching
1360 "C" the first branch is abandoned before the option setting.
1361 This is because the effects of option settings happen at
1362 compile time. There would be some very weird behaviour oth-
1363 erwise.
1364
1365 The PCRE-specific options PCRE_UNGREEDY and PCRE_EXTRA can
1366 be changed in the same way as the Perl-compatible options by
1367 using the characters U and X respectively. The (?X) flag
1368 setting is special in that it must always occur earlier in
1369 the pattern than any of the additional features it turns on,
1370 even when it is at top level. It is best put at the start.
1371
1372
1373
1374 SUBPATTERNS
1375 Subpatterns are delimited by parentheses (round brackets),
1376 which can be nested. Marking part of a pattern as a subpat-
1377 tern does two things:
1378
1379 1. It localizes a set of alternatives. For example, the pat-
1380 tern
1381
1382 cat(aract|erpillar|)
1383
1384 matches one of the words "cat", "cataract", or "caterpil-
1385 lar". Without the parentheses, it would match "cataract",
1386 "erpillar" or the empty string.
1387
1388 2. It sets up the subpattern as a capturing subpattern (as
1389 defined above). When the whole pattern matches, that por-
1390 tion of the subject string that matched the subpattern is
1391 passed back to the caller via the ovector argument of
1392 pcre_exec(). Opening parentheses are counted from left to
1393 right (starting from 1) to obtain the numbers of the captur-
1394 ing subpatterns.
1395
1396 For example, if the string "the red king" is matched against
1397 the pattern
1398
1399 the ((red|white) (king|queen))
1400
1401 the captured substrings are "red king", "red", and "king",
1402 and are numbered 1, 2, and 3, respectively.
1403
1404 The fact that plain parentheses fulfil two functions is not
1405 always helpful. There are often times when a grouping sub-
1406 pattern is required without a capturing requirement. If an
1407 opening parenthesis is followed by "?:", the subpattern does
1408 not do any capturing, and is not counted when computing the
1409 number of any subsequent capturing subpatterns. For example,
1410 if the string "the white queen" is matched against the pat-
1411 tern
1412
1413 the ((?:red|white) (king|queen))
1414
1415 the captured substrings are "white queen" and "queen", and
1416 are numbered 1 and 2. The maximum number of capturing sub-
1417 patterns is 65535, and the maximum nesting depth of all sub-
1418 patterns, both capturing and non-capturing, is 200.
1419
1420 As a convenient shorthand, if any option settings are
1421 required at the start of a non-capturing subpattern, the
1422 option letters may appear between the "?" and the ":". Thus
1423 the two patterns
1424
1425 (?i:saturday|sunday)
1426 (?:(?i)saturday|sunday)
1427
1428 match exactly the same set of strings. Because alternative
1429 branches are tried from left to right, and options are not
1430 reset until the end of the subpattern is reached, an option
1431 setting in one branch does affect subsequent branches, so
1432 the above patterns match "SUNDAY" as well as "Saturday".
1433
1434
1435
1436 REPETITION
1437 Repetition is specified by quantifiers, which can follow any
1438 of the following items:
1439
1440 a single character, possibly escaped
1441 the . metacharacter
1442 a character class
1443 a back reference (see next section)
1444 a parenthesized subpattern (unless it is an assertion -
1445 see below)
1446
1447 The general repetition quantifier specifies a minimum and
1448 maximum number of permitted matches, by giving the two
1449 numbers in curly brackets (braces), separated by a comma.
1450 The numbers must be less than 65536, and the first must be
1451 less than or equal to the second. For example:
1452
1453 z{2,4}
1454
1455 matches "zz", "zzz", or "zzzz". A closing brace on its own
1456 is not a special character. If the second number is omitted,
1457 but the comma is present, there is no upper limit; if the
1458 second number and the comma are both omitted, the quantifier
1459 specifies an exact number of required matches. Thus
1460
1461 [aeiou]{3,}
1462
1463 matches at least 3 successive vowels, but may match many
1464 more, while
1465
1466 \d{8}
1467
1468 matches exactly 8 digits. An opening curly bracket that
1469 appears in a position where a quantifier is not allowed, or
1470 one that does not match the syntax of a quantifier, is taken
1471 as a literal character. For example, {,6} is not a quantif-
1472 ier, but a literal string of four characters.
1473 The quantifier {0} is permitted, causing the expression to
1474 behave as if the previous item and the quantifier were not
1475 present.
1476
1477 For convenience (and historical compatibility) the three
1478 most common quantifiers have single-character abbreviations:
1479
1480 * is equivalent to {0,}
1481 + is equivalent to {1,}
1482 ? is equivalent to {0,1}
1483
1484 It is possible to construct infinite loops by following a
1485 subpattern that can match no characters with a quantifier
1486 that has no upper limit, for example:
1487
1488 (a?)*
1489
1490 Earlier versions of Perl and PCRE used to give an error at
1491 compile time for such patterns. However, because there are
1492 cases where this can be useful, such patterns are now
1493 accepted, but if any repetition of the subpattern does in
1494 fact match no characters, the loop is forcibly broken.
1495
1496 By default, the quantifiers are "greedy", that is, they
1497 match as much as possible (up to the maximum number of per-
1498 mitted times), without causing the rest of the pattern to
1499 fail. The classic example of where this gives problems is in
1500 trying to match comments in C programs. These appear between
1501 the sequences /* and */ and within the sequence, individual
1502 * and / characters may appear. An attempt to match C com-
1503 ments by applying the pattern
1504
1505 /\*.*\*/
1506
1507 to the string
1508
1509 /* first comment */ not comment /* second comment */
1510
1511 fails, because it matches the entire string owing to the
1512 greediness of the .* item.
1513
1514 However, if a quantifier is followed by a question mark, it
1515 ceases to be greedy, and instead matches the minimum number
1516 of times possible, so the pattern
1517
1518 /\*.*?\*/
1519
1520 does the right thing with the C comments. The meaning of the
1521 various quantifiers is not otherwise changed, just the pre-
1522 ferred number of matches. Do not confuse this use of ques-
1523 tion mark with its use as a quantifier in its own right.
1524 Because it has two uses, it can sometimes appear doubled, as
1525 in
1526
1527 \d??\d
1528
1529 which matches one digit by preference, but can match two if
1530 that is the only way the rest of the pattern matches.
1531
1532 If the PCRE_UNGREEDY option is set (an option which is not
1533 available in Perl), the quantifiers are not greedy by
1534 default, but individual ones can be made greedy by following
1535 them with a question mark. In other words, it inverts the
1536 default behaviour.
1537
1538 When a parenthesized subpattern is quantified with a minimum
1539 repeat count that is greater than 1 or with a limited max-
1540 imum, more store is required for the compiled pattern, in
1541 proportion to the size of the minimum or maximum.
1542
1543 If a pattern starts with .* or .{0,} and the PCRE_DOTALL
1544 option (equivalent to Perl's /s) is set, thus allowing the .
1545 to match newlines, the pattern is implicitly anchored,
1546 because whatever follows will be tried against every charac-
1547 ter position in the subject string, so there is no point in
1548 retrying the overall match at any position after the first.
1549 PCRE treats such a pattern as though it were preceded by \A.
1550 In cases where it is known that the subject string contains
1551 no newlines, it is worth setting PCRE_DOTALL when the pat-
1552 tern begins with .* in order to obtain this optimization, or
1553 alternatively using ^ to indicate anchoring explicitly.
1554
1555 When a capturing subpattern is repeated, the value captured
1556 is the substring that matched the final iteration. For exam-
1557 ple, after
1558
1559 (tweedle[dume]{3}\s*)+
1560
1561 has matched "tweedledum tweedledee" the value of the cap-
1562 tured substring is "tweedledee". However, if there are
1563 nested capturing subpatterns, the corresponding captured
1564 values may have been set in previous iterations. For exam-
1565 ple, after
1566
1567 /(a|(b))+/
1568
1569 matches "aba" the value of the second captured substring is
1570 "b".
1571
1572
1573
1574 BACK REFERENCES
1575 Outside a character class, a backslash followed by a digit
1576 greater than 0 (and possibly further digits) is a back
1577
1578
1579
1580
1581 SunOS 5.8 Last change: 30
1582
1583
1584
1585 reference to a capturing subpattern earlier (i.e. to its
1586 left) in the pattern, provided there have been that many
1587 previous capturing left parentheses.
1588
1589 However, if the decimal number following the backslash is
1590 less than 10, it is always taken as a back reference, and
1591 causes an error only if there are not that many capturing
1592 left parentheses in the entire pattern. In other words, the
1593 parentheses that are referenced need not be to the left of
1594 the reference for numbers less than 10. See the section
1595 entitled "Backslash" above for further details of the han-
1596 dling of digits following a backslash.
1597
1598 A back reference matches whatever actually matched the cap-
1599 turing subpattern in the current subject string, rather than
1600 anything matching the subpattern itself. So the pattern
1601
1602 (sens|respons)e and \1ibility
1603
1604 matches "sense and sensibility" and "response and responsi-
1605 bility", but not "sense and responsibility". If caseful
1606 matching is in force at the time of the back reference, the
1607 case of letters is relevant. For example,
1608
1609 ((?i)rah)\s+\1
1610
1611 matches "rah rah" and "RAH RAH", but not "RAH rah", even
1612 though the original capturing subpattern is matched case-
1613 lessly.
1614
1615 There may be more than one back reference to the same sub-
1616 pattern. If a subpattern has not actually been used in a
1617 particular match, any back references to it always fail. For
1618 example, the pattern
1619
1620 (a|(bc))\2
1621
1622 always fails if it starts to match "a" rather than "bc".
1623 Because there may be up to 99 back references, all digits
1624 following the backslash are taken as part of a potential
1625 back reference number. If the pattern continues with a digit
1626 character, some delimiter must be used to terminate the back
1627 reference. If the PCRE_EXTENDED option is set, this can be
1628 whitespace. Otherwise an empty comment can be used.
1629
1630 A back reference that occurs inside the parentheses to which
1631 it refers fails when the subpattern is first used, so, for
1632 example, (a\1) never matches. However, such references can
1633 be useful inside repeated subpatterns. For example, the pat-
1634 tern
1635
1636 (a|b\1)+
1637
1638 matches any number of "a"s and also "aba", "ababbaa" etc. At
1639 each iteration of the subpattern, the back reference matches
1640 the character string corresponding to the previous itera-
1641 tion. In order for this to work, the pattern must be such
1642 that the first iteration does not need to match the back
1643 reference. This can be done using alternation, as in the
1644 example above, or by a quantifier with a minimum of zero.
1645
1646
1647
1648 ASSERTIONS
1649 An assertion is a test on the characters following or
1650 preceding the current matching point that does not actually
1651 consume any characters. The simple assertions coded as \b,
1652 \B, \A, \Z, \z, ^ and $ are described above. More compli-
1653 cated assertions are coded as subpatterns. There are two
1654 kinds: those that look ahead of the current position in the
1655 subject string, and those that look behind it.
1656
1657 An assertion subpattern is matched in the normal way, except
1658 that it does not cause the current matching position to be
1659 changed. Lookahead assertions start with (?= for positive
1660 assertions and (?! for negative assertions. For example,
1661
1662 \w+(?=;)
1663
1664 matches a word followed by a semicolon, but does not include
1665 the semicolon in the match, and
1666
1667 foo(?!bar)
1668
1669 matches any occurrence of "foo" that is not followed by
1670 "bar". Note that the apparently similar pattern
1671
1672 (?!foo)bar
1673
1674 does not find an occurrence of "bar" that is preceded by
1675 something other than "foo"; it finds any occurrence of "bar"
1676 whatsoever, because the assertion (?!foo) is always true
1677 when the next three characters are "bar". A lookbehind
1678 assertion is needed to achieve this effect.
1679
1680 Lookbehind assertions start with (?<= for positive asser-
1681 tions and (?<! for negative assertions. For example,
1682
1683 (?<!foo)bar
1684
1685 does find an occurrence of "bar" that is not preceded by
1686 "foo". The contents of a lookbehind assertion are restricted
1687 such that all the strings it matches must have a fixed
1688 length. However, if there are several alternatives, they do
1689 not all have to have the same fixed length. Thus
1690
1691 (?<=bullock|donkey)
1692
1693 is permitted, but
1694
1695 (?<!dogs?|cats?)
1696
1697 causes an error at compile time. Branches that match dif-
1698 ferent length strings are permitted only at the top level of
1699 a lookbehind assertion. This is an extension compared with
1700 Perl 5.005, which requires all branches to match the same
1701 length of string. An assertion such as
1702
1703 (?<=ab(c|de))
1704
1705 is not permitted, because its single top-level branch can
1706 match two different lengths, but it is acceptable if rewrit-
1707 ten to use two top-level branches:
1708
1709 (?<=abc|abde)
1710
1711 The implementation of lookbehind assertions is, for each
1712 alternative, to temporarily move the current position back
1713 by the fixed width and then try to match. If there are
1714 insufficient characters before the current position, the
1715 match is deemed to fail. Lookbehinds in conjunction with
1716 once-only subpatterns can be particularly useful for match-
1717 ing at the ends of strings; an example is given at the end
1718 of the section on once-only subpatterns.
1719
1720 Several assertions (of any sort) may occur in succession.
1721 For example,
1722
1723 (?<=\d{3})(?<!999)foo
1724
1725 matches "foo" preceded by three digits that are not "999".
1726 Notice that each of the assertions is applied independently
1727 at the same point in the subject string. First there is a
1728 check that the previous three characters are all digits, and
1729 then there is a check that the same three characters are not
1730 "999". This pattern does not match "foo" preceded by six
1731 characters, the first three of which are digits and the last three
1732 of which are not "999". For example, it doesn't match
1733 "123abcfoo". A pattern to do that is
1734
1735 (?<=\d{3}...)(?<!999)foo
1736
1737 This time the first assertion looks at the preceding six
1738 characters, checking that the first three are digits, and
1739 then the second assertion checks that the preceding three
1740 characters are not "999".
1741
1742 Assertions can be nested in any combination. For example,
1743
1744 (?<=(?<!foo)bar)baz
1745
1746 matches an occurrence of "baz" that is preceded by "bar"
1747 which in turn is not preceded by "foo", while
1748
1749 (?<=\d{3}(?!999)...)foo
1750
1751 is another pattern which matches "foo" preceded by three
1752 digits and any three characters that are not "999".
1753
1754 Assertion subpatterns are not capturing subpatterns, and may
1755 not be repeated, because it makes no sense to assert the
1756 same thing several times. If any kind of assertion contains
1757 capturing subpatterns within it, these are counted for the
1758 purposes of numbering the capturing subpatterns in the whole
1759 pattern. However, substring capturing is carried out only
1760 for positive assertions, because it does not make sense for
1761 negative assertions.
1762
1763 Assertions count towards the maximum of 200 parenthesized
1764 subpatterns.
1765
1766
1767
1768 ONCE-ONLY SUBPATTERNS
1769 With both maximizing and minimizing repetition, failure of
1770 what follows normally causes the repeated item to be re-
1771 evaluated to see if a different number of repeats allows the
1772 rest of the pattern to match. Sometimes it is useful to
1773 prevent this, either to change the nature of the match, or
1774 to cause it to fail earlier than it otherwise might, when the
1775 author of the pattern knows there is no point in carrying
1776 on.
1777
1778 Consider, for example, the pattern \d+foo when applied to
1779 the subject line
1780
1781 123456bar
1782
1783 After matching all 6 digits and then failing to match "foo",
1784 the normal action of the matcher is to try again with only 5
1785 digits matching the \d+ item, and then with 4, and so on,
1786 before ultimately failing. Once-only subpatterns provide the
1787 means for specifying that once a portion of the pattern has
1788 matched, it is not to be re-evaluated in this way, so the
1789 matcher would give up immediately on failing to match "foo"
1790 the first time. The notation is another kind of special
1791 parenthesis, starting with (?> as in this example:
1792
1793 (?>\d+)foo
1794
1795 This kind of parenthesis "locks up" the part of the pattern
1796 it contains once it has matched, and a failure further into
1797 the pattern is prevented from backtracking into it. Back-
1798 tracking past it to previous items, however, works as nor-
1799 mal.
1800
1801 An alternative description is that a subpattern of this type
1802 matches the string of characters that an identical stan-
1803 dalone pattern would match, if anchored at the current point
1804 in the subject string.
1805
1806 Once-only subpatterns are not capturing subpatterns. Simple
1807 cases such as the above example can be thought of as a max-
1808 imizing repeat that must swallow everything it can. So,
1809 while both \d+ and \d+? are prepared to adjust the number of
1810 digits they match in order to make the rest of the pattern
1811 match, (?>\d+) can only match an entire sequence of digits.
1812
1813 This construction can of course contain arbitrarily compli-
1814 cated subpatterns, and it can be nested.
1815
1816 Once-only subpatterns can be used in conjunction with look-
1817 behind assertions to specify efficient matching at the end
1818 of the subject string. Consider a simple pattern such as
1819
1820 abcd$
1821
1822 when applied to a long string which does not match. Because
1823 matching proceeds from left to right, PCRE will look for
1824 each "a" in the subject and then see if what follows matches
1825 the rest of the pattern. If the pattern is specified as
1826
1827 ^.*abcd$
1828
1829 the initial .* matches the entire string at first, but when
1830 this fails (because there is no following "a"), it back-
1831 tracks to match all but the last character, then all but the
1832 last two characters, and so on. Once again the search for
1833 "a" covers the entire string, from right to left, so we are
1834 no better off. However, if the pattern is written as
1835
1836 ^(?>.*)(?<=abcd)
1837
1838 there can be no backtracking for the .* item; it can match
1839 only the entire string. The subsequent lookbehind assertion
1840 does a single test on the last four characters. If it fails,
1841 the match fails immediately. For long strings, this approach
1842 makes a significant difference to the processing time.
1843
1844 When a pattern contains an unlimited repeat inside a subpat-
1845 tern that can itself be repeated an unlimited number of
1846 times, the use of a once-only subpattern is the only way to
1847 avoid some failing matches taking a very long time indeed.
1848 The pattern
1849
1850 (\D+|<\d+>)*[!?]
1851
1852 matches an unlimited number of substrings that either con-
1853 sist of non-digits, or digits enclosed in <>, followed by
1854 either ! or ?. When it matches, it runs quickly. However, if
1855 it is applied to
1856
1857 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
1858
1859 it takes a long time before reporting failure. This is
1860 because the string can be divided between the two repeats in
1861 a large number of ways, and all have to be tried. (The exam-
1862 ple used [!?] rather than a single character at the end,
1863 because both PCRE and Perl have an optimization that allows
1864 for fast failure when a single character is used. They
1865 remember the last single character that is required for a
1866 match, and fail early if it is not present in the string.)
1867 If the pattern is changed to
1868
1869 ((?>\D+)|<\d+>)*[!?]
1870
1871 sequences of non-digits cannot be broken, and failure hap-
1872 pens quickly.
1873
1874
1875
1876 CONDITIONAL SUBPATTERNS
1877 It is possible to cause the matching process to obey a sub-
1878 pattern conditionally or to choose between two alternative
1879 subpatterns, depending on the result of an assertion, or
1880 whether a previous capturing subpattern matched or not. The
1881 two possible forms of conditional subpattern are
1882
1883 (?(condition)yes-pattern)
1884 (?(condition)yes-pattern|no-pattern)
1885
1886 If the condition is satisfied, the yes-pattern is used; oth-
1887 erwise the no-pattern (if present) is used. If there are
1888 more than two alternatives in the subpattern, a compile-time
1889 error occurs.
1890
1891 There are two kinds of condition. If the text between the
1892 parentheses consists of a sequence of digits, the condition
1893 is satisfied if the capturing subpattern of that number has
1894 previously matched. The number must be greater than zero.
1895 Consider the following pattern, which contains non-
1896 significant white space to make it more readable (assume the
1897 PCRE_EXTENDED option) and to divide it into three parts for
1898 ease of discussion:
1899
1900 ( \( )? [^()]+ (?(1) \) )
1901
1902 The first part matches an optional opening parenthesis, and
1903 if that character is present, sets it as the first captured
1904 substring. The second part matches one or more characters
1905 that are not parentheses. The third part is a conditional
1906 subpattern that tests whether the first set of parentheses
1907 matched or not. If they did, that is, if the subject started
1908 with an opening parenthesis, the condition is true, and so
1909 the yes-pattern is executed and a closing parenthesis is
1910 required. Otherwise, since no-pattern is not present, the
1911 subpattern matches nothing. In other words, this pattern
1912 matches a sequence of non-parentheses, optionally enclosed
1913 in parentheses.
1914
1915 If the condition is not a sequence of digits, it must be an
1916 assertion. This may be a positive or negative lookahead or
1917 lookbehind assertion. Consider this pattern, again contain-
1918 ing non-significant white space, and with the two alterna-
1919 tives on the second line:
1920
1921 (?(?=[^a-z]*[a-z])
1922 \d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
1923
1924 The condition is a positive lookahead assertion that matches
1925 an optional sequence of non-letters followed by a letter. In
1926 other words, it tests for the presence of at least one
1927 letter in the subject. If a letter is found, the subject is
1928 matched against the first alternative; otherwise it is
1929 matched against the second. This pattern matches strings in
1930 one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
1931 letters and dd are digits.
1932
1933
1934
1935 COMMENTS
1936 The sequence (?# marks the start of a comment which contin-
1937 ues up to the next closing parenthesis. Nested parentheses
1938 are not permitted. The characters that make up a comment
1939 play no part in the pattern matching at all.
1940
1941 If the PCRE_EXTENDED option is set, an unescaped # character
1942 outside a character class introduces a comment that contin-
1943 ues up to the next newline character in the pattern.
1944
1945
1946
1947 RECURSIVE PATTERNS
1948 Consider the problem of matching a string in parentheses,
1949 allowing for unlimited nested parentheses. Without the use
1950 of recursion, the best that can be done is to use a pattern
1951 that matches up to some fixed depth of nesting. It is not
1952 possible to handle an arbitrary nesting depth. Perl 5.6 has
1953 provided an experimental facility that allows regular
1954 expressions to recurse (amongst other things). It does this
1955 by interpolating Perl code in the expression at run time,
1956 and the code can refer to the expression itself. A Perl pat-
1957 tern to solve the parentheses problem can be created like
1958 this:
1959
1960 $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
1961
1962 The (?p{...}) item interpolates Perl code at run time, and
1963 in this case refers recursively to the pattern in which it
1964 appears. Obviously, PCRE cannot support the interpolation of
1965 Perl code. Instead, the special item (?R) is provided for
1966 the specific case of recursion. This PCRE pattern solves the
1967 parentheses problem (assume the PCRE_EXTENDED option is set
1968 so that white space is ignored):
1969
1970 \( ( (?>[^()]+) | (?R) )* \)
1971
1972 First it matches an opening parenthesis. Then it matches any
1973 number of substrings which can either be a sequence of non-
1974 parentheses, or a recursive match of the pattern itself
1975 (i.e. a correctly parenthesized substring). Finally there is
1976 a closing parenthesis.
1977
1978 This particular example pattern contains nested unlimited
1979 repeats, and so the use of a once-only subpattern for match-
1980 ing strings of non-parentheses is important when applying
1981 the pattern to strings that do not match. For example, when
1982 it is applied to
1983
1984 (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
1985
1986 it yields "no match" quickly. However, if a once-only sub-
1987 pattern is not used, the match runs for a very long time
1988 indeed because there are so many different ways the + and *
1989 repeats can carve up the subject, and all have to be tested
1990 before failure can be reported.
1991
1992 The values set for any capturing subpatterns are those from
1993 the outermost level of the recursion at which the subpattern
1994 value is set. If the pattern above is matched against
1995
1996 (ab(cd)ef)
1997
1998 the value for the capturing parentheses is "ef", which is
1999 the last value taken on at the top level. If additional
2000 parentheses are added, giving
2001
2002 \( ( ( (?>[^()]+) | (?R) )* ) \)
2003    ^                        ^
2004    ^                        ^   the string they capture is
2005 "ab(cd)ef", the contents of the top level parentheses. If
2006 there are more than 15 capturing parentheses in a pattern,
2007 PCRE has to obtain extra memory to store data during a
2008 recursion, which it does by using pcre_malloc, freeing it
2009 via pcre_free afterwards. If no memory can be obtained, it
2010 saves data for the first 15 capturing parentheses only, as
2011 there is no way to give an out-of-memory error from within a
2012 recursion.
2013
2014
2015
2016 PERFORMANCE
2017 Certain items that may appear in patterns are more efficient
2018 than others. It is more efficient to use a character class
2019 like [aeiou] than a set of alternatives such as (a|e|i|o|u).
2020 In general, the simplest construction that provides the
2021 required behaviour is usually the most efficient. Jeffrey
2022 Friedl's book contains a lot of discussion about optimizing
2023 regular expressions for efficient performance.
2024
2025 When a pattern begins with .* and the PCRE_DOTALL option is
2026 set, the pattern is implicitly anchored by PCRE, since it
2027 can match only at the start of a subject string. However, if
2028 PCRE_DOTALL is not set, PCRE cannot make this optimization,
2029 because the . metacharacter does not then match a newline,
2030 and if the subject string contains newlines, the pattern may
2031 match from the character immediately following one of them
2032 instead of from the very start. For example, the pattern
2033
2034 (.*) second
2035
2036 matches the subject "first\nand second" (where \n stands for
2037 a newline character) with the first captured substring being
2038 "and". In order to do this, PCRE has to retry the match
2039 starting after every newline in the subject.
2040
2041 If you are using such a pattern with subject strings that do
2042 not contain newlines, the best performance is obtained by
2043 setting PCRE_DOTALL, or starting the pattern with ^.* to
2044 indicate explicit anchoring. That saves PCRE from having to
2045 scan along the subject looking for a newline to restart at.
2046
2047 Beware of patterns that contain nested indefinite repeats.
2048 These can take a long time to run when applied to a string
2049 that does not match. Consider the pattern fragment
2050
2051 (a+)*
2052
2053 This can match "aaaa" in 16 different ways, and this number
2054 increases very rapidly as the string gets longer. (The *
2055 repeat can match 0, 1, 2, 3, or 4 times, and for each of
2056 those cases other than 0, the + repeats can match different
2057 numbers of times.) When the remainder of the pattern is such
2058 that the entire match is going to fail, PCRE has in princi-
2059 ple to try every possible variation, and this can take an
2060 extremely long time.
2061
2062 An optimization catches some of the more simple cases such
2063 as
2064
2065 (a+)*b
2066
2067 where a literal character follows. Before embarking on the
2068 standard matching procedure, PCRE checks that there is a "b"
2069 later in the subject string, and if there is not, it fails
2070 the match immediately. However, when there is no following
2071 literal this optimization cannot be used. You can see the
2072 difference by comparing the behaviour of
2073
2074 (a+)*\d
2075
2076 with the pattern above. The pattern above, (a+)*b, fails almost
2077 instantly when applied to a whole line of "a" characters,
2078 whereas (a+)*\d takes an appreciable time with strings
2079 longer than about 20 characters.
2080
2081
2082
2083 UTF-8 SUPPORT
2084 Starting at release 3.3, PCRE has some support for character
2085 strings encoded in the UTF-8 format. This is incomplete, and
2086 is regarded as experimental. In order to use it, you must
2087 configure PCRE to include UTF-8 support in the code, and, in
2088 addition, you must call pcre_compile() with the PCRE_UTF8
2089 option flag. When you do this, both the pattern and any sub-
2090 ject strings that are matched against it are treated as
2091 UTF-8 strings instead of just strings of bytes, but only in
2092 the cases that are mentioned below.
2093
2094 If you compile PCRE with UTF-8 support, but do not use it at
2095 run time, the library will be a bit bigger, but the addi-
2096 tional run time overhead is limited to testing the PCRE_UTF8
2097 flag in several places, so should not be very large.
2098
2099 PCRE assumes that the strings it is given contain valid
2100 UTF-8 codes. It does not diagnose invalid UTF-8 strings. If
2101 you pass invalid UTF-8 strings to PCRE, the results are
2102 undefined.
2103
2104 Running with PCRE_UTF8 set causes these changes in the way
2105 PCRE works:
2106
2107 1. In a pattern, the escape sequence \x{...}, where the
2108 contents of the braces is a string of hexadecimal digits, is
2109 interpreted as a UTF-8 character whose code number is the
2110 given hexadecimal number, for example: \x{1234}. This
2111 inserts from one to six literal bytes into the pattern,
2112 using the UTF-8 encoding. If a non-hexadecimal digit appears
2113 between the braces, the item is not recognized.
2114
2115 2. The original hexadecimal escape sequence, \xhh, generates
2116 a two-byte UTF-8 character if its value is greater than 127.
2117
2118 3. Repeat quantifiers are NOT correctly handled if they fol-
2119 low a multibyte character. For example, \x{100}* and \xc3+
2120 do not work. If you want to repeat such characters, you must
2121 enclose them in non-capturing parentheses, for example
2122 (?:\x{100}), at present.
2123
2124 4. The dot metacharacter matches one UTF-8 character instead
2125 of a single byte.
2126
2127 5. Unlike literal UTF-8 characters, the dot metacharacter
2128 followed by a repeat quantifier does operate correctly on
2129 UTF-8 characters instead of single bytes.
2130
2131 6. Although the \x{...} escape is permitted in a character
2132 class, characters whose values are greater than 255 cannot
2133 be included in a class.
2134
2135 7. A class is matched against a UTF-8 character instead of
2136 just a single byte, but it can match only characters whose
2137 values are less than 256. Characters with greater values
2138 always fail to match a class.
2139
2140 8. Repeated classes work correctly on multiple characters.
2141
2142 9. Classes containing just a single character whose value is
2143 greater than 127 (but less than 256), for example, [\x80] or
2144 [^\x{93}], do not work because these are optimized into sin-
2145 gle byte matches. In the first case, of course, the class
2146 brackets are just redundant.
2147
2148 10. Lookbehind assertions move backwards in the subject by a
2149 fixed number of characters instead of a fixed number of
2150 bytes. Simple cases have been tested to work correctly, but
2151 there may be hidden gotchas herein.
2152
2153 11. The character types such as \d and \w do not work
2154 correctly with UTF-8 characters. They continue to test a
2155 single byte.
2156
2157 12. Anything not explicitly mentioned here continues to work
2158 in bytes rather than in characters.
2159
2160 The following UTF-8 features of Perl 5.6 are not imple-
2161 mented:
2162
2163 1. The escape sequence \C to match a single byte.
2164
2165 2. The use of Unicode tables and properties and escapes \p,
2166 \P, and \X.
2167
2168
2169
2170 SAMPLE PROGRAM
2171 The code below is a simple, complete demonstration program,
2172 to get you started with using PCRE. This code is also sup-
2173 plied in the file pcredemo.c in the PCRE distribution.
2174
2175 The program compiles the regular expression that is its
2176 first argument, and matches it against the subject string in
2177 its second argument. No options are set, and default charac-
2178 ter tables are used. If matching succeeds, the program out-
2179 puts the portion of the subject that matched, together with
2180 the contents of any captured substrings.
2181
2182 On a Unix system that has PCRE installed in /usr/local, you
2183 can compile the demonstration program using a command like
2184 this:
2185
2186 gcc -o pcredemo pcredemo.c -I/usr/local/include \
2187 -L/usr/local/lib -lpcre
2188
2189 Then you can run simple tests like this:
2190
2191 ./pcredemo 'cat|dog' 'the cat sat on the mat'
2192
2193 Note that there is a much more comprehensive test program,
2194 called pcretest, which supports many more facilities for
2195 testing regular expressions. The pcredemo program is pro-
2196 vided as a simple coding example.
2197
2198 On some operating systems (e.g. Solaris) you may get an
2199 error like this when you try to run pcredemo:
2200
2201 ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such
2202 file or directory
2203
2204 This is caused by the way shared library support works on
2205 those systems. You need to add
2206
2207 -R/usr/local/lib
2208
2209 to the compile command to get round this problem. Here's the
2210 code:
2211
2212 #include <stdio.h>
2213 #include <string.h>
2214 #include <pcre.h>
2215
2216 #define OVECCOUNT 30 /* should be a multiple of 3 */
2217
2218 int main(int argc, char **argv)
2219 {
2220 pcre *re;
2221 const char *error;
2222 int erroffset;
2223 int ovector[OVECCOUNT];
2224 int rc, i;
2225
2226 if (argc != 3)
2227 {
2228 printf("Two arguments required: a regex and a "
2229 "subject string\n");
2230 return 1;
2231 }
2232
2233 /* Compile the regular expression in the first argument */
2234
2235 re = pcre_compile(
2236 argv[1], /* the pattern */
2237 0, /* default options */
2238 &error, /* for error message */
2239 &erroffset, /* for error offset */
2240 NULL); /* use default character tables */
2241
2242 /* Compilation failed: print the error message and exit */
2243
2244 if (re == NULL)
2245 {
2246 printf("PCRE compilation failed at offset %d: %s\n",
2247 erroffset, error);
2248 return 1;
2249 }
2250
2251 /* Compilation succeeded: match the subject in the second
2252 argument */
2253
2254 rc = pcre_exec(
2255 re, /* the compiled pattern */
2256 NULL, /* we didn't study the pattern */
2257 argv[2], /* the subject string */
2258 (int)strlen(argv[2]), /* the length of the subject */
2259 0, /* start at offset 0 in the subject */
2260 0, /* default options */
2261 ovector, /* vector for substring information */
2262 OVECCOUNT); /* number of elements in the vector */
2263
2264 /* Matching failed: handle error cases */
2265
2266 if (rc < 0)
2267 {
2268 switch(rc)
2269 {
2270 case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
2271 /*
2272 Handle other special cases if you like
2273 */
2274 default: printf("Matching error %d\n", rc); break;
2275 }
2276 return 1;
2277 }
2278
2279 /* Match succeeded */
2280
2281 printf("Match succeeded\n");
2282
2283 /* The output vector wasn't big enough */
2284
2285 if (rc == 0)
2286 {
2287 rc = OVECCOUNT/3;
2288 printf("ovector only has room for %d captured "
2289 "substrings\n", rc - 1);
2290 }
2291
2292 /* Show substrings stored in the output vector */
2293
2294 for (i = 0; i < rc; i++)
2295 {
2296 char *substring_start = argv[2] + ovector[2*i];
2297 int substring_length = ovector[2*i+1] - ovector[2*i];
2298 printf("%2d: %.*s\n", i, substring_length,
2299 substring_start);
2300 }
2301
2302 return 0;
2303 }
2304
2305
2306
2307 AUTHOR
2308 Philip Hazel <ph10@cam.ac.uk>
2309 University Computing Service,
2310 New Museums Site,
2311 Cambridge CB2 3QG, England.
2312 Phone: +44 1223 334714
2313
2314 Last updated: 15 August 2001
2315 Copyright (c) 1997-2001 University of Cambridge.

  ViewVC Help
Powered by ViewVC 1.1.5