/[pcre]/code/trunk/doc/pcre.3
ViewVC logotype

Diff of /code/trunk/doc/pcre.3

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 41 by nigel, Sat Feb 24 21:39:17 2007 UTC revision 53 by nigel, Sat Feb 24 21:39:42 2007 UTC
# Line 44  pcre - Perl-compatible regular expressio Line 44  pcre - Perl-compatible regular expressio
44  .B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);"  .B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);"
45  .PP  .PP
46  .br  .br
47    .B void pcre_free_substring(const char *\fIstringptr\fR);
48    .PP
49    .br
50    .B void pcre_free_substring_list(const char **\fIstringptr\fR);
51    .PP
52    .br
53  .B const unsigned char *pcre_maketables(void);  .B const unsigned char *pcre_maketables(void);
54  .PP  .PP
55  .br  .br
56    .B int pcre_fullinfo(const pcre *\fIcode\fR, "const pcre_extra *\fIextra\fR,"
57    .ti +5n
58    .B int \fIwhat\fR, void *\fIwhere\fR);
59    .PP
60    .br
61  .B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int  .B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int
62  .B *\fIfirstcharptr\fR);  .B *\fIfirstcharptr\fR);
63  .PP  .PP
# Line 64  pcre - Perl-compatible regular expressio Line 75  pcre - Perl-compatible regular expressio
75  .SH DESCRIPTION  .SH DESCRIPTION
76  The PCRE library is a set of functions that implement regular expression  The PCRE library is a set of functions that implement regular expression
77  pattern matching using the same syntax and semantics as Perl 5, with just a few  pattern matching using the same syntax and semantics as Perl 5, with just a few
78  differences (see below). The current implementation corresponds to Perl 5.005.  differences (see below). The current implementation corresponds to Perl 5.005,
79    with some additional features from later versions. This includes some
80    experimental, incomplete support for UTF-8 encoded strings. Details of exactly
81    what is and what is not supported are given below.
82    
83  PCRE has its own native API, which is described in this document. There is also  PCRE has its own native API, which is described in this document. There is also
84  a set of wrapper functions that correspond to the POSIX API. These are  a set of wrapper functions that correspond to the POSIX regular expression API.
85  described in the \fBpcreposix\fR documentation.  These are described in the \fBpcreposix\fR documentation.
86    
87  The native API function prototypes are defined in the header file \fBpcre.h\fR,  The native API function prototypes are defined in the header file \fBpcre.h\fR,
88  and on Unix systems the library itself is called \fBlibpcre.a\fR, so can be  and on Unix systems the library itself is called \fBlibpcre.a\fR, so can be
89  accessed by adding \fB-lpcre\fR to the command for linking an application which  accessed by adding \fB-lpcre\fR to the command for linking an application which
90  calls it.  calls it. The header file defines the macros PCRE_MAJOR and PCRE_MINOR to
91    contain the major and minor release numbers for the library. Applications can
92    use these to include support for different releases.
93    
94  The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR  The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR
95  are used for compiling and matching regular expressions, while  are used for compiling and matching regular expressions. A sample program that
96  \fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and  demonstrates the simplest way of using them is given in the file
97    \fIpcredemo.c\fR. The last section of this man page describes how to run it.
98    
99    The functions \fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and
100  \fBpcre_get_substring_list()\fR are convenience functions for extracting  \fBpcre_get_substring_list()\fR are convenience functions for extracting
101  captured substrings from a matched subject string. The function  captured substrings from a matched subject string; \fBpcre_free_substring()\fR
102  \fBpcre_maketables()\fR is used (optionally) to build a set of character tables  and \fBpcre_free_substring_list()\fR are also provided, to free the memory used
103  in the current locale for passing to \fBpcre_compile()\fR.  for extracted strings.
104    
105  The function \fBpcre_info()\fR is used to find out information about a compiled  The function \fBpcre_maketables()\fR is used (optionally) to build a set of
106  pattern, while the function \fBpcre_version()\fR returns a pointer to a string  character tables in the current locale for passing to \fBpcre_compile()\fR.
107  containing the version of PCRE and its date of release.  
108    The function \fBpcre_fullinfo()\fR is used to find out information about a
109    compiled pattern; \fBpcre_info()\fR is an obsolete version which returns only
110    some of the available information, but is retained for backwards compatibility.
111    The function \fBpcre_version()\fR returns a pointer to a string containing the
112    version of PCRE and its date of release.
113    
114  The global variables \fBpcre_malloc\fR and \fBpcre_free\fR initially contain  The global variables \fBpcre_malloc\fR and \fBpcre_free\fR initially contain
115  the entry points of the standard \fBmalloc()\fR and \fBfree()\fR functions  the entry points of the standard \fBmalloc()\fR and \fBfree()\fR functions
# Line 107  the same compiled pattern can safely be Line 131  the same compiled pattern can safely be
131  The function \fBpcre_compile()\fR is called to compile a pattern into an  The function \fBpcre_compile()\fR is called to compile a pattern into an
132  internal form. The pattern is a C string terminated by a binary zero, and  internal form. The pattern is a C string terminated by a binary zero, and
133  is passed in the argument \fIpattern\fR. A pointer to a single block of memory  is passed in the argument \fIpattern\fR. A pointer to a single block of memory
134  that is obtained via \fBpcre_malloc\fR is returned. This contains the  that is obtained via \fBpcre_malloc\fR is returned. This contains the compiled
135  compiled code and related data. The \fBpcre\fR type is defined for this for  code and related data. The \fBpcre\fR type is defined for the returned block;
136  convenience, but in fact \fBpcre\fR is just a typedef for \fBvoid\fR, since the  this is a typedef for a structure whose contents are not externally defined. It
137  contents of the block are not externally defined. It is up to the caller to  is up to the caller to free the memory when it is no longer required.
138  free the memory when it is no longer required.  
139  .PP  Although the compiled code of a PCRE regex is relocatable, that is, it does not
140    depend on memory location, the complete \fBpcre\fR data block is not
141    fully relocatable, because it contains a copy of the \fItableptr\fR argument,
142    which is an address (see below).
143    
144  The size of a compiled pattern is roughly proportional to the length of the  The size of a compiled pattern is roughly proportional to the length of the
145  pattern string, except that each character class (other than those containing  pattern string, except that each character class (other than those containing
146  just a single character, negated or not) requires 33 bytes, and repeat  just a single character, negated or not) requires 33 bytes, and repeat
147  quantifiers with a minimum greater than one or a bounded maximum cause the  quantifiers with a minimum greater than one or a bounded maximum cause the
148  relevant portions of the compiled pattern to be replicated.  relevant portions of the compiled pattern to be replicated.
149  .PP  
150  The \fIoptions\fR argument contains independent bits that affect the  The \fIoptions\fR argument contains independent bits that affect the
151  compilation. It should be zero if no options are required. Some of the options,  compilation. It should be zero if no options are required. Some of the options,
152  in particular, those that are compatible with Perl, can also be set and unset  in particular, those that are compatible with Perl, can also be set and unset
# Line 127  below). For these options, the contents Line 155  below). For these options, the contents
155  their initial settings at the start of compilation and execution. The  their initial settings at the start of compilation and execution. The
156  PCRE_ANCHORED option can be set at the time of matching as well as at compile  PCRE_ANCHORED option can be set at the time of matching as well as at compile
157  time.  time.
158  .PP  
159  If \fIerrptr\fR is NULL, \fBpcre_compile()\fR returns NULL immediately.  If \fIerrptr\fR is NULL, \fBpcre_compile()\fR returns NULL immediately.
160  Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fR returns  Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fR returns
161  NULL, and sets the variable pointed to by \fIerrptr\fR to point to a textual  NULL, and sets the variable pointed to by \fIerrptr\fR to point to a textual
162  error message. The offset from the start of the pattern to the character where  error message. The offset from the start of the pattern to the character where
163  the error was discovered is placed in the variable pointed to by  the error was discovered is placed in the variable pointed to by
164  \fIerroffset\fR, which must not be NULL. If it is, an immediate error is given.  \fIerroffset\fR, which must not be NULL. If it is, an immediate error is given.
165  .PP  
166  If the final argument, \fItableptr\fR, is NULL, PCRE uses a default set of  If the final argument, \fItableptr\fR, is NULL, PCRE uses a default set of
167  character tables which are built when it is compiled, using the default C  character tables which are built when it is compiled, using the default C
168  locale. Otherwise, \fItableptr\fR must be the result of a call to  locale. Otherwise, \fItableptr\fR must be the result of a call to
169  \fBpcre_maketables()\fR. See the section on locale support below.  \fBpcre_maketables()\fR. See the section on locale support below.
170  .PP  
171    This code fragment shows a typical straightforward call to \fBpcre_compile()\fR:
172    
173      pcre *re;
174      const char *error;
175      int erroffset;
176      re = pcre_compile(
177        "^A.*Z",          /* the pattern */
178        0,                /* default options */
179        &error,           /* for error message */
180        &erroffset,       /* for error offset */
181        NULL);            /* use default character tables */
182    
183  The following option bits are defined in the header file:  The following option bits are defined in the header file:
184    
185    PCRE_ANCHORED    PCRE_ANCHORED
# Line 182  sequence (?( which introduces a conditio Line 222  sequence (?( which introduces a conditio
222    
223    PCRE_EXTRA    PCRE_EXTRA
224    
225  This option turns on additional functionality of PCRE that is incompatible with  This option was invented in order to turn on additional functionality of PCRE
226  Perl. Any backslash in a pattern that is followed by a letter that has no  that is incompatible with Perl, but it is currently of very little use. When
227    set, any backslash in a pattern that is followed by a letter that has no
228  special meaning causes an error, thus reserving these combinations for future  special meaning causes an error, thus reserving these combinations for future
229  expansion. By default, as in Perl, a backslash followed by a letter with no  expansion. By default, as in Perl, a backslash followed by a letter with no
230  special meaning is treated as a literal. There are at present no other features  special meaning is treated as a literal. There are at present no other features
231  controlled by this option.  controlled by this option. It can also be set by a (?X) option setting within a
232    pattern.
233    
234    PCRE_MULTILINE    PCRE_MULTILINE
235    
# Line 211  This option inverts the "greediness" of Line 253  This option inverts the "greediness" of
253  greedy by default, but become greedy if followed by "?". It is not compatible  greedy by default, but become greedy if followed by "?". It is not compatible
254  with Perl. It can also be set by a (?U) option setting within the pattern.  with Perl. It can also be set by a (?U) option setting within the pattern.
255    
256      PCRE_UTF8
257    
258    This option causes PCRE to regard both the pattern and the subject as strings
259    of UTF-8 characters instead of just byte strings. However, it is available only
260    if PCRE has been built to include UTF-8 support. If not, the use of this option
261    provokes an error. Support for UTF-8 is new, experimental, and incomplete.
262    Details of exactly what it entails are given below.
263    
264    
265  .SH STUDYING A PATTERN  .SH STUDYING A PATTERN
266  When a pattern is going to be used several times, it is worth spending more  When a pattern is going to be used several times, it is worth spending more
267  time analyzing it in order to speed up the time taken for matching. The  time analyzing it in order to speed up the time taken for matching. The
268  function \fBpcre_study()\fR takes a pointer to a compiled pattern as its first  function \fBpcre_study()\fR takes a pointer to a compiled pattern as its first
269  argument, and returns a pointer to a \fBpcre_extra\fR block (another \fBvoid\fR  argument, and returns a pointer to a \fBpcre_extra\fR block (another typedef
270  typedef) containing additional information about the pattern; this can be  for a structure with hidden contents) containing additional information about
271  passed to \fBpcre_exec()\fR. If no additional information is available, NULL  the pattern; this can be passed to \fBpcre_exec()\fR. If no additional
272  is returned.  information is available, NULL is returned.
273    
274  The second argument contains option bits. At present, no options are defined  The second argument contains option bits. At present, no options are defined
275  for \fBpcre_study()\fR, and this argument should always be zero.  for \fBpcre_study()\fR, and this argument should always be zero.
# Line 228  The third argument for \fBpcre_study()\f Line 278  The third argument for \fBpcre_study()\f
278  studying succeeds (even if no data is returned), the variable it points to is  studying succeeds (even if no data is returned), the variable it points to is
279  set to NULL. Otherwise it points to a textual error message.  set to NULL. Otherwise it points to a textual error message.
280    
281    This is a typical call to \fBpcre_study()\fR:
282    
283      pcre_extra *pe;
284      pe = pcre_study(
285        re,             /* result of pcre_compile() */
286        0,              /* no options exist */
287        &error);        /* set to NULL or points to a message */
288    
289  At present, studying a pattern is useful only for non-anchored patterns that do  At present, studying a pattern is useful only for non-anchored patterns that do
290  not have a single fixed starting character. A bitmap of possible starting  not have a single fixed starting character. A bitmap of possible starting
291  characters is created.  characters is created.
# Line 261  memory containing the tables remains ava Line 319  memory containing the tables remains ava
319    
320    
321  .SH INFORMATION ABOUT A PATTERN  .SH INFORMATION ABOUT A PATTERN
322  The \fBpcre_info()\fR function returns information about a compiled pattern.  The \fBpcre_fullinfo()\fR function returns information about a compiled
323  Its yield is the number of capturing subpatterns, or one of the following  pattern. It replaces the obsolete \fBpcre_info()\fR function, which is
324  negative numbers:  nevertheless retained for backwards compatibility (and is documented below).
325    
326    The first argument for \fBpcre_fullinfo()\fR is a pointer to the compiled
327    pattern. The second argument is the result of \fBpcre_study()\fR, or NULL if
328    the pattern was not studied. The third argument specifies which piece of
329    information is required, while the fourth argument is a pointer to a variable
330    to receive the data. The yield of the function is zero for success, or one of
331    the following negative numbers:
332    
333    PCRE_ERROR_NULL       the argument \fIcode\fR was NULL    PCRE_ERROR_NULL       the argument \fIcode\fR was NULL
334                            the argument \fIwhere\fR was NULL
335    PCRE_ERROR_BADMAGIC   the "magic number" was not found    PCRE_ERROR_BADMAGIC   the "magic number" was not found
336      PCRE_ERROR_BADOPTION  the value of \fIwhat\fR was invalid
337    
338  If the \fIoptptr\fR argument is not NULL, a copy of the options with which the  Here is a typical call of \fBpcre_fullinfo()\fR, to obtain the length of the
339  pattern was compiled is placed in the integer it points to. These option bits  compiled pattern:
340    
341      int rc;
342      size_t length;
343      rc = pcre_fullinfo(
344        re,               /* result of pcre_compile() */
345        pe,               /* result of pcre_study(), or NULL */
346        PCRE_INFO_SIZE,   /* what is required */
347        &length);         /* where to put the data */
348    
349    The possible values for the third argument are defined in \fBpcre.h\fR, and are
350    as follows:
351    
352      PCRE_INFO_OPTIONS
353    
354    Return a copy of the options with which the pattern was compiled. The fourth
355    argument should point to an \fBunsigned long int\fR variable. These option bits
356  are those specified in the call to \fBpcre_compile()\fR, modified by any  are those specified in the call to \fBpcre_compile()\fR, modified by any
357  top-level option settings within the pattern itself, and with the PCRE_ANCHORED  top-level option settings within the pattern itself, and with the PCRE_ANCHORED
358  bit set if the form of the pattern implies that it can match only at the start  bit forcibly set if the form of the pattern implies that it can match only at
359  of a subject string.  the start of a subject string.
360    
361  If the pattern is not anchored and the \fIfirstcharptr\fR argument is not NULL,    PCRE_INFO_SIZE
362  it is used to pass back information about the first character of any matched  
363  string. If there is a fixed first character, e.g. from a pattern such as  Return the size of the compiled pattern, that is, the value that was passed as
364  (cat|cow|coyote), then it is returned in the integer pointed to by  the argument to \fBpcre_malloc()\fR when PCRE was getting memory in which to
365  \fIfirstcharptr\fR. Otherwise, if either  place the compiled data. The fourth argument should point to a \fBsize_t\fR
366    variable.
367    
368      PCRE_INFO_CAPTURECOUNT
369    
370    Return the number of capturing subpatterns in the pattern. The fourth argument
371    should point to an \fBint\fR variable.
372    
373      PCRE_INFO_BACKREFMAX
374    
375    Return the number of the highest back reference in the pattern. The fourth
376    argument should point to an \fBint\fR variable. Zero is returned if there are
377    no back references.
378    
379      PCRE_INFO_FIRSTCHAR
380    
381    Return information about the first character of any matched string, for a
382    non-anchored pattern. If there is a fixed first character, e.g. from a pattern
383    such as (cat|cow|coyote), it is returned in the integer pointed to by
384    \fIwhere\fR. Otherwise, if either
385    
386  (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch  (a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
387  starts with "^", or  starts with "^", or
# Line 287  starts with "^", or Line 389  starts with "^", or
389  (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set  (b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
390  (if it were set, the pattern would be anchored),  (if it were set, the pattern would be anchored),
391    
392  then -1 is returned, indicating that the pattern matches only at the  -1 is returned, indicating that the pattern matches only at the start of a
393  start of a subject string or after any "\\n" within the string. Otherwise -2 is  subject string or after any "\\n" within the string. Otherwise -2 is returned.
394  returned.  For anchored patterns, -2 is returned.
395    
396      PCRE_INFO_FIRSTTABLE
397    
398    If the pattern was studied, and this resulted in the construction of a 256-bit
399    table indicating a fixed set of characters for the first character in any
400    matching string, a pointer to the table is returned. Otherwise NULL is
401    returned. The fourth argument should point to an \fBunsigned char *\fR
402    variable.
403    
404      PCRE_INFO_LASTLITERAL
405    
406    For a non-anchored pattern, return the value of the rightmost literal character
407    which must exist in any matched string, other than at its start. The fourth
408    argument should point to an \fBint\fR variable. If there is no such character,
409    or if the pattern is anchored, -1 is returned. For example, for the pattern
410    /a\\d+z\\d+/ the returned value is 'z'.
411    
412    The \fBpcre_info()\fR function is now obsolete because its interface is too
413    restrictive to return all the available data about a compiled pattern. New
414    programs should use \fBpcre_fullinfo()\fR instead. The yield of
415    \fBpcre_info()\fR is the number of capturing subpatterns, or one of the
416    following negative numbers:
417    
418      PCRE_ERROR_NULL       the argument \fIcode\fR was NULL
419      PCRE_ERROR_BADMAGIC   the "magic number" was not found
420    
421    If the \fIoptptr\fR argument is not NULL, a copy of the options with which the
422    pattern was compiled is placed in the integer it points to (see
423    PCRE_INFO_OPTIONS above).
424    
425    If the pattern is not anchored and the \fIfirstcharptr\fR argument is not NULL,
426    it is used to pass back information about the first character of any matched
427    string (see PCRE_INFO_FIRSTCHAR above).
428    
429    
430  .SH MATCHING A PATTERN  .SH MATCHING A PATTERN
# Line 298  pre-compiled pattern, which is passed in Line 433  pre-compiled pattern, which is passed in
433  pattern has been studied, the result of the study should be passed in the  pattern has been studied, the result of the study should be passed in the
434  \fIextra\fR argument. Otherwise this must be NULL.  \fIextra\fR argument. Otherwise this must be NULL.
435    
436    Here is an example of a simple call to \fBpcre_exec()\fR:
437    
438      int rc;
439      int ovector[30];
440      rc = pcre_exec(
441        re,             /* result of pcre_compile() */
442        NULL,           /* we didn't study the pattern */
443        "some string",  /* the subject string */
444        11,             /* the length of the subject string */
445        0,              /* start at offset 0 in the subject */
446        0,              /* default options */
447        ovector,        /* vector for substring information */
448        30);            /* number of elements in the vector */
449    
450  The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose  The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose
451  unused bits must be zero. However, if a pattern was compiled with  unused bits must be zero. However, if a pattern was compiled with
452  PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it  PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it
# Line 339  below) and trying an ordinary match agai Line 488  below) and trying an ordinary match agai
488    
489  The subject string is passed as a pointer in \fIsubject\fR, a length in  The subject string is passed as a pointer in \fIsubject\fR, a length in
490  \fIlength\fR, and a starting offset in \fIstartoffset\fR. Unlike the pattern  \fIlength\fR, and a starting offset in \fIstartoffset\fR. Unlike the pattern
491  string, it may contain binary zero characters. When the starting offset is  string, the subject may contain binary zero characters. When the starting
492  zero, the search for a match starts at the beginning of the subject, and this  offset is zero, the search for a match starts at the beginning of the subject,
493  is by far the most common case.  and this is by far the most common case.
494    
495  A non-zero starting offset is useful when searching for another match in the  A non-zero starting offset is useful when searching for another match in the
496  same subject by calling \fBpcre_exec()\fR again after a previous success.  same subject by calling \fBpcre_exec()\fR again after a previous success.
# Line 472  is a pointer to the vector of integer of Line 621  is a pointer to the vector of integer of
621  were captured by the match, including the substring that matched the entire  were captured by the match, including the substring that matched the entire
622  regular expression. This is the value returned by \fBpcre_exec\fR if it  regular expression. This is the value returned by \fBpcre_exec\fR if it
623  is greater than zero. If \fBpcre_exec()\fR returned zero, indicating that it  is greater than zero. If \fBpcre_exec()\fR returned zero, indicating that it
624  ran out of space in \fIovector\fR, then the value passed as  ran out of space in \fIovector\fR, the value passed as \fIstringcount\fR should
625  \fIstringcount\fR should be the size of the vector divided by three.  be the size of the vector divided by three.
626    
627  The functions \fBpcre_copy_substring()\fR and \fBpcre_get_substring()\fR  The functions \fBpcre_copy_substring()\fR and \fBpcre_get_substring()\fR
628  extract a single substring, whose number is given as \fIstringnumber\fR. A  extract a single substring, whose number is given as \fIstringnumber\fR. A
629  value of zero extracts the substring that matched the entire pattern, while  value of zero extracts the substring that matched the entire pattern, while
630  higher values extract the captured substrings. For \fBpcre_copy_substring()\fR,  higher values extract the captured substrings. For \fBpcre_copy_substring()\fR,
631  the string is placed in \fIbuffer\fR, whose length is given by  the string is placed in \fIbuffer\fR, whose length is given by
632  \fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of store is  \fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of memory is
633  obtained via \fBpcre_malloc\fR, and its address is returned via  obtained via \fBpcre_malloc\fR, and its address is returned via
634  \fIstringptr\fR. The yield of the function is the length of the string, not  \fIstringptr\fR. The yield of the function is the length of the string, not
635  including the terminating zero, or one of  including the terminating zero, or one of
# Line 512  string. This can be distinguished from a Line 661  string. This can be distinguished from a
661  inspecting the appropriate offset in \fIovector\fR, which is negative for unset  inspecting the appropriate offset in \fIovector\fR, which is negative for unset
662  substrings.  substrings.
663    
664    The two convenience functions \fBpcre_free_substring()\fR and
665    \fBpcre_free_substring_list()\fR can be used to free the memory returned by
666    a previous call of \fBpcre_get_substring()\fR or
667    \fBpcre_get_substring_list()\fR, respectively. They do nothing more than call
668    the function pointed to by \fBpcre_free\fR, which of course could be called
669    directly from a C program. However, PCRE is used in some situations where it is
670    linked via a special interface to another programming language which cannot use
671    \fBpcre_free\fR directly; it is for these cases that the functions are
672    provided.
673    
674    
675  .SH LIMITATIONS  .SH LIMITATIONS
# Line 519  There are some size limitations in PCRE Line 677  There are some size limitations in PCRE
677  practice be relevant.  practice be relevant.
678  The maximum length of a compiled pattern is 65539 (sic) bytes.  The maximum length of a compiled pattern is 65539 (sic) bytes.
679  All values in repeating quantifiers must be less than 65536.  All values in repeating quantifiers must be less than 65536.
680  The maximum number of capturing subpatterns is 99.  The maximum number of capturing subpatterns is 65535.
681  The maximum number of all parenthesized subpatterns, including capturing  There is no limit to the number of non-capturing subpatterns, but the maximum
682    depth of nesting of all kinds of parenthesized subpattern, including capturing
683  subpatterns, assertions, and other types of subpattern, is 200.  subpatterns, assertions, and other types of subpattern, is 200.
684    
685  The maximum length of a subject string is the largest positive number that an  The maximum length of a subject string is the largest positive number that an
# Line 564  are not part of its pattern matching eng Line 723  are not part of its pattern matching eng
723  6. The Perl \\G assertion is not supported as it is not relevant to single  6. The Perl \\G assertion is not supported as it is not relevant to single
724  pattern matches.  pattern matches.
725    
726  7. Fairly obviously, PCRE does not support the (?{code}) construction.  7. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
727    constructions. However, there is some experimental support for recursive
728    patterns using the non-Perl item (?R).
729    
730  8. There are at the time of writing some oddities in Perl 5.005_02 concerned  8. There are at the time of writing some oddities in Perl 5.005_02 concerned
731  with the settings of captured strings when part of a pattern is repeated. For  with the settings of captured strings when part of a pattern is repeated. For
732  example, matching "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value  example, matching "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
733  "b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 unset. However, if  "b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 unset. However, if
734  the pattern is changed to /^(aa(b(b))?)+$/ then $2 (and $3) get set.  the pattern is changed to /^(aa(b(b))?)+$/ then $2 (and $3) are set.
735    
736  In Perl 5.004 $2 is set in both cases, and that is also true of PCRE. If in the  In Perl 5.004 $2 is set in both cases, and that is also true of PCRE. If in the
737  future Perl changes to a consistent state that is different, PCRE may change to  future Perl changes to a consistent state that is different, PCRE may change to
# Line 602  of the subject. Line 763  of the subject.
763  (f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options for  (f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options for
764  \fBpcre_exec()\fR have no Perl equivalents.  \fBpcre_exec()\fR have no Perl equivalents.
765    
766    (g) The (?R) construct allows for recursive pattern matching (Perl 5.6 can do
767    this using the (?p{code}) construct, which PCRE cannot of course support).
768    
769    
770  .SH REGULAR EXPRESSION DETAILS  .SH REGULAR EXPRESSION DETAILS
771  The syntax and semantics of the regular expressions supported by PCRE are  The syntax and semantics of the regular expressions supported by PCRE are
772  described below. Regular expressions are also described in the Perl  described below. Regular expressions are also described in the Perl
773  documentation and in a number of other books, some of which have copious  documentation and in a number of other books, some of which have copious
774  examples. Jeffrey Friedl's "Mastering Regular Expressions", published by  examples. Jeffrey Friedl's "Mastering Regular Expressions", published by
775  O'Reilly (ISBN 1-56592-257-3), covers them in great detail. The description  O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
776  here is intended as reference documentation.  
777    The description here is intended as reference documentation. The basic
778    operation of PCRE is on strings of bytes. However, there are the beginnings of
779    some support for UTF-8 character strings. To use this support you must
780    configure PCRE to include it, and then call \fBpcre_compile()\fR with the
781    PCRE_UTF8 option. How this affects the pattern matching is described in the
782    final section of this document.
783    
784  A regular expression is a pattern that is matched against a subject string from  A regular expression is a pattern that is matched against a subject string from
785  left to right. Most characters stand for themselves in a pattern, and match the  left to right. Most characters stand for themselves in a pattern, and match the
# Line 831  PCRE_MULTILINE is set. Line 1001  PCRE_MULTILINE is set.
1001    
1002  Note that the sequences \\A, \\Z, and \\z can be used to match the start and  Note that the sequences \\A, \\Z, and \\z can be used to match the start and
1003  end of the subject in both modes, and if all branches of a pattern start with  end of the subject in both modes, and if all branches of a pattern start with
1004  \\A is it always anchored, whether PCRE_MULTILINE is set or not.  \\A it is always anchored, whether PCRE_MULTILINE is set or not.
1005    
1006    
1007  .SH FULL STOP (PERIOD, DOT)  .SH FULL STOP (PERIOD, DOT)
1008  Outside a character class, a dot in the pattern matches any one character in  Outside a character class, a dot in the pattern matches any one character in
1009  the subject, including a non-printing character, but not (by default) newline.  the subject, including a non-printing character, but not (by default) newline.
1010  If the PCRE_DOTALL option is set, then dots match newlines as well. The  If the PCRE_DOTALL option is set, dots match newlines as well. The handling of
1011  handling of dot is entirely independent of the handling of circumflex and  dot is entirely independent of the handling of circumflex and dollar, the only
1012  dollar, the only relationship being that they both involve newline characters.  relationship being that they both involve newline characters. Dot has no
1013  Dot has no special meaning in a character class.  special meaning in a character class.
1014    
1015    
1016  .SH SQUARE BRACKETS  .SH SQUARE BRACKETS
# Line 906  terminating ] are non-special in charact Line 1076  terminating ] are non-special in charact
1076  are escaped.  are escaped.
1077    
1078    
1079    .SH POSIX CHARACTER CLASSES
1080    Perl 5.6 (not yet released at the time of writing) is going to support the
1081    POSIX notation for character classes, which uses names enclosed by [: and :]
1082    within the enclosing square brackets. PCRE supports this notation. For example,
1083    
1084      [01[:alpha:]%]
1085    
1086    matches "0", "1", any alphabetic character, or "%". The supported class names
1087    are
1088    
1089      alnum    letters and digits
1090      alpha    letters
1091      ascii    character codes 0 - 127
1092      cntrl    control characters
1093      digit    decimal digits (same as \\d)
1094      graph    printing characters, excluding space
1095      lower    lower case letters
1096      print    printing characters, including space
1097      punct    printing characters, excluding letters and digits
1098      space    white space (same as \\s)
1099      upper    upper case letters
1100      word     "word" characters (same as \\w)
1101      xdigit   hexadecimal digits
1102    
1103    The names "ascii" and "word" are Perl extensions. Another Perl extension is
1104    negation, which is indicated by a ^ character after the colon. For example,
1105    
1106      [12[:^digit:]]
1107    
1108    matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
1109    syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
1110    supported, and an error is given if they are encountered.
1111    
1112    
1113  .SH VERTICAL BAR  .SH VERTICAL BAR
1114  Vertical bar characters are used to separate alternative patterns. For example,  Vertical bar characters are used to separate alternative patterns. For example,
1115  the pattern  the pattern
# Line 999  For example, if the string "the red king Line 1203  For example, if the string "the red king
1203    the ((red|white) (king|queen))    the ((red|white) (king|queen))
1204    
1205  the captured substrings are "red king", "red", and "king", and are numbered 1,  the captured substrings are "red king", "red", and "king", and are numbered 1,
1206  2, and 3.  2, and 3, respectively.
1207    
1208  The fact that plain parentheses fulfil two functions is not always helpful.  The fact that plain parentheses fulfil two functions is not always helpful.
1209  There are often times when a grouping subpattern is required without a  There are often times when a grouping subpattern is required without a
# Line 1093  to the string Line 1297  to the string
1297    
1298    /* first command */  not comment  /* second comment */    /* first command */  not comment  /* second comment */
1299    
1300  fails, because it matches the entire string due to the greediness of the .*  fails, because it matches the entire string owing to the greediness of the .*
1301  item.  item.
1302    
1303  However, if a quantifier is followed by a question mark, then it ceases to be  However, if a quantifier is followed by a question mark, it ceases to be
1304  greedy, and instead matches the minimum number of times possible, so the  greedy, and instead matches the minimum number of times possible, so the
1305  pattern  pattern
1306    
# Line 1112  own right. Because it has two uses, it c Line 1316  own right. Because it has two uses, it c
1316  which matches one digit by preference, but can match two if that is the only  which matches one digit by preference, but can match two if that is the only
1317  way the rest of the pattern matches.  way the rest of the pattern matches.
1318    
1319  If the PCRE_UNGREEDY option is set (an option which is not available in Perl)  If the PCRE_UNGREEDY option is set (an option which is not available in Perl),
1320  then the quantifiers are not greedy by default, but individual ones can be made  the quantifiers are not greedy by default, but individual ones can be made
1321  greedy by following them with a question mark. In other words, it inverts the  greedy by following them with a question mark. In other words, it inverts the
1322  default behaviour.  default behaviour.
1323    
# Line 1122  is greater than 1 or with a limited maxi Line 1326  is greater than 1 or with a limited maxi
1326  compiled pattern, in proportion to the size of the minimum or maximum.  compiled pattern, in proportion to the size of the minimum or maximum.
1327    
1328  If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent  If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
1329  to Perl's /s) is set, thus allowing the . to match newlines, then the pattern  to Perl's /s) is set, thus allowing the . to match newlines, the pattern is
1330  is implicitly anchored, because whatever follows will be tried against every  implicitly anchored, because whatever follows will be tried against every
1331  character position in the subject string, so there is no point in retrying the  character position in the subject string, so there is no point in retrying the
1332  overall match at any position after the first. PCRE treats such a pattern as  overall match at any position after the first. PCRE treats such a pattern as
1333  though it were preceded by \\A. In cases where it is known that the subject  though it were preceded by \\A. In cases where it is known that the subject
# Line 1167  itself. So the pattern Line 1371  itself. So the pattern
1371    
1372  matches "sense and sensibility" and "response and responsibility", but not  matches "sense and sensibility" and "response and responsibility", but not
1373  "sense and responsibility". If caseful matching is in force at the time of the  "sense and responsibility". If caseful matching is in force at the time of the
1374  back reference, then the case of letters is relevant. For example,  back reference, the case of letters is relevant. For example,
1375    
1376    ((?i)rah)\\s+\\1    ((?i)rah)\\s+\\1
1377    
# Line 1175  matches "rah rah" and "RAH RAH", but not Line 1379  matches "rah rah" and "RAH RAH", but not
1379  capturing subpattern is matched caselessly.  capturing subpattern is matched caselessly.
1380    
1381  There may be more than one back reference to the same subpattern. If a  There may be more than one back reference to the same subpattern. If a
1382  subpattern has not actually been used in a particular match, then any back  subpattern has not actually been used in a particular match, any back
1383  references to it always fail. For example, the pattern  references to it always fail. For example, the pattern
1384    
1385    (a|(bc))\\2    (a|(bc))\\2
# Line 1183  references to it always fail. For exampl Line 1387  references to it always fail. For exampl
1387  always fails if it starts to match "a" rather than "bc". Because there may be  always fails if it starts to match "a" rather than "bc". Because there may be
1388  up to 99 back references, all digits following the backslash are taken  up to 99 back references, all digits following the backslash are taken
1389  as part of a potential back reference number. If the pattern continues with a  as part of a potential back reference number. If the pattern continues with a
1390  digit character, then some delimiter must be used to terminate the back  digit character, some delimiter must be used to terminate the back reference.
1391  reference. If the PCRE_EXTENDED option is set, this can be whitespace.  If the PCRE_EXTENDED option is set, this can be whitespace. Otherwise an empty
1392  Otherwise an empty comment can be used.  comment can be used.
1393    
1394  A back reference that occurs inside the parentheses to which it refers fails  A back reference that occurs inside the parentheses to which it refers fails
1395  when the subpattern is first used, so, for example, (a\\1) never matches.  when the subpattern is first used, so, for example, (a\\1) never matches.
# Line 1194  example, the pattern Line 1398  example, the pattern
1398    
1399    (a|b\\1)+    (a|b\\1)+
1400    
1401  matches any number of "a"s and also "aba", "ababaa" etc. At each iteration of  matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
1402  the subpattern, the back reference matches the character string corresponding  the subpattern, the back reference matches the character string corresponding
1403  to the previous iteration. In order for this to work, the pattern must be such  to the previous iteration. In order for this to work, the pattern must be such
1404  that the first iteration does not need to match the back reference. This can be  that the first iteration does not need to match the back reference. This can be
# Line 1273  Several assertions (of any sort) may occ Line 1477  Several assertions (of any sort) may occ
1477  matches "foo" preceded by three digits that are not "999". Notice that each of  matches "foo" preceded by three digits that are not "999". Notice that each of
1478  the assertions is applied independently at the same point in the subject  the assertions is applied independently at the same point in the subject
1479  string. First there is a check that the previous three characters are all  string. First there is a check that the previous three characters are all
1480  digits, then there is a check that the same three characters are not "999".  digits, and then there is a check that the same three characters are not "999".
1481  This pattern does \fInot\fR match "foo" preceded by six characters, the first  This pattern does \fInot\fR match "foo" preceded by six characters, the first
1482  three of which are digits and the last three of which are not "999". For example, it  three of which are digits and the last three of which are not "999". For example, it
1483  doesn't match "123abcfoo". A pattern to do that is  doesn't match "123abcfoo". A pattern to do that is
# Line 1352  pattern such as Line 1556  pattern such as
1556    
1557    abcd$    abcd$
1558    
1559  when applied to a long string which does not match it. Because matching  when applied to a long string which does not match. Because matching proceeds
1560  proceeds from left to right, PCRE will look for each "a" in the subject and  from left to right, PCRE will look for each "a" in the subject and then see if
1561  then see if what follows matches the rest of the pattern. If the pattern is  what follows matches the rest of the pattern. If the pattern is specified as
 specified as  
1562    
1563    ^.*abcd$    ^.*abcd$
1564    
1565  then the initial .* matches the entire string at first, but when this fails, it  the initial .* matches the entire string at first, but when this fails (because
1566  backtracks to match all but the last character, then all but the last two  there is no following "a"), it backtracks to match all but the last character,
1567  characters, and so on. Once again the search for "a" covers the entire string,  then all but the last two characters, and so on. Once again the search for "a"
1568  from right to left, so we are no better off. However, if the pattern is written  covers the entire string, from right to left, so we are no better off. However,
1569  as  if the pattern is written as
1570    
1571    ^(?>.*)(?<=abcd)    ^(?>.*)(?<=abcd)
1572    
1573  then there can be no backtracking for the .* item; it can match only the entire  there can be no backtracking for the .* item; it can match only the entire
1574  string. The subsequent lookbehind assertion does a single test on the last four  string. The subsequent lookbehind assertion does a single test on the last four
1575  characters. If it fails, the match fails immediately. For long strings, this  characters. If it fails, the match fails immediately. For long strings, this
1576  approach makes a significant difference to the processing time.  approach makes a significant difference to the processing time.
1577    
1578    When a pattern contains an unlimited repeat inside a subpattern that can itself
1579    be repeated an unlimited number of times, the use of a once-only subpattern is
1580    the only way to avoid some failing matches taking a very long time indeed.
1581    The pattern
1582    
1583      (\\D+|<\\d+>)*[!?]
1584    
1585    matches an unlimited number of substrings that either consist of non-digits, or
1586    digits enclosed in <>, followed by either ! or ?. When it matches, it runs
1587    quickly. However, if it is applied to
1588    
1589      aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
1590    
1591    it takes a long time before reporting failure. This is because the string can
1592    be divided between the two repeats in a large number of ways, and all have to
1593    be tried. (The example used [!?] rather than a single character at the end,
1594    because both PCRE and Perl have an optimization that allows for fast failure
1595    when a single character is used. They remember the last single character that
1596    is required for a match, and fail early if it is not present in the string.)
1597    If the pattern is changed to
1598    
1599      ((?>\\D+)|<\\d+>)*[!?]
1600    
1601    sequences of non-digits cannot be broken, and failure happens quickly.
1602    
1603    
1604  .SH CONDITIONAL SUBPATTERNS  .SH CONDITIONAL SUBPATTERNS
1605  It is possible to cause the matching process to obey a subpattern  It is possible to cause the matching process to obey a subpattern
# Line 1387  no-pattern (if present) is used. If ther Line 1615  no-pattern (if present) is used. If ther
1615  subpattern, a compile-time error occurs.  subpattern, a compile-time error occurs.
1616    
1617  There are two kinds of condition. If the text between the parentheses consists  There are two kinds of condition. If the text between the parentheses consists
1618  of a sequence of digits, then the condition is satisfied if the capturing  of a sequence of digits, the condition is satisfied if the capturing subpattern
1619  subpattern of that number has previously matched. Consider the following  of that number has previously matched. The number must be greater than zero.
1620  pattern, which contains non-significant white space to make it more readable  Consider the following pattern, which contains non-significant white space to
1621  (assume the PCRE_EXTENDED option) and to divide it into three parts for ease  make it more readable (assume the PCRE_EXTENDED option) and to divide it into
1622  of discussion:  three parts for ease of discussion:
1623    
1624    ( \\( )?    [^()]+    (?(1) \\) )    ( \\( )?    [^()]+    (?(1) \\) )
1625    
# Line 1431  character class introduces a comment tha Line 1659  character class introduces a comment tha
1659  character in the pattern.  character in the pattern.
1660    
1661    
1662    .SH RECURSIVE PATTERNS
1663    Consider the problem of matching a string in parentheses, allowing for
1664    unlimited nested parentheses. Without the use of recursion, the best that can
1665    be done is to use a pattern that matches up to some fixed depth of nesting. It
1666    is not possible to handle an arbitrary nesting depth. Perl 5.6 has provided an
1667    experimental facility that allows regular expressions to recurse (amongst other
1668    things). It does this by interpolating Perl code in the expression at run time,
1669    and the code can refer to the expression itself. A Perl pattern to solve the
1670    parentheses problem can be created like this:
1671    
1672      $re = qr{\\( (?: (?>[^()]+) | (?p{$re}) )* \\)}x;
1673    
1674    The (?p{...}) item interpolates Perl code at run time, and in this case refers
1675    recursively to the pattern in which it appears. Obviously, PCRE cannot support
1676    the interpolation of Perl code. Instead, the special item (?R) is provided for
1677    the specific case of recursion. This PCRE pattern solves the parentheses
1678    problem (assume the PCRE_EXTENDED option is set so that white space is
1679    ignored):
1680    
1681      \\( ( (?>[^()]+) | (?R) )* \\)
1682    
1683    First it matches an opening parenthesis. Then it matches any number of
1684    substrings which can either be a sequence of non-parentheses, or a recursive
1685    match of the pattern itself (i.e. a correctly parenthesized substring). Finally
1686    there is a closing parenthesis.
1687    
1688    This particular example pattern contains nested unlimited repeats, and so the
1689    use of a once-only subpattern for matching strings of non-parentheses is
1690    important when applying the pattern to strings that do not match. For example,
1691    when it is applied to
1692    
1693      (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
1694    
1695    it yields "no match" quickly. However, if a once-only subpattern is not used,
1696    the match runs for a very long time indeed because there are so many different
1697    ways the + and * repeats can carve up the subject, and all have to be tested
1698    before failure can be reported.
1699    
1700    The values set for any capturing subpatterns are those from the outermost level
1701    of the recursion at which the subpattern value is set. If the pattern above is
1702    matched against
1703    
1704      (ab(cd)ef)
1705    
1706    the value for the capturing parentheses is "ef", which is the last value taken
1707    on at the top level. If additional parentheses are added, giving
1708    
1709      \\( ( ( (?>[^()]+) | (?R) )* ) \\)
1710         ^                        ^
1711         ^                        ^
1712    the string they capture is "ab(cd)ef", the contents of the top level
1713    parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
1714    has to obtain extra memory to store data during a recursion, which it does by
1715    using \fBpcre_malloc\fR, freeing it via \fBpcre_free\fR afterwards. If no
1716    memory can be obtained, it saves data for the first 15 capturing parentheses
1717    only, as there is no way to give an out-of-memory error from within a
1718    recursion.
1719    
1720    
1721  .SH PERFORMANCE  .SH PERFORMANCE
1722  Certain items that may appear in patterns are more efficient than others. It is  Certain items that may appear in patterns are more efficient than others. It is
1723  more efficient to use a character class like [aeiou] than a set of alternatives  more efficient to use a character class like [aeiou] than a set of alternatives
# Line 1486  with the pattern above. The former gives Line 1773  with the pattern above. The former gives
1773  applied to a whole line of "a" characters, whereas the latter takes an  applied to a whole line of "a" characters, whereas the latter takes an
1774  appreciable time with strings longer than about 20 characters.  appreciable time with strings longer than about 20 characters.
1775    
1776    
1777    .SH UTF-8 SUPPORT
1778    Starting at release 3.3, PCRE has some support for character strings encoded
1779    in the UTF-8 format. This is incomplete, and is regarded as experimental. In
1780    order to use it, you must configure PCRE to include UTF-8 support in the code,
1781    and, in addition, you must call \fBpcre_compile()\fR with the PCRE_UTF8 option
1782    flag. When you do this, both the pattern and any subject strings that are
1783    matched against it are treated as UTF-8 strings instead of just strings of
1784    bytes, but only in the cases that are mentioned below.
1785    
1786    If you compile PCRE with UTF-8 support, but do not use it at run time, the
1787    library will be a bit bigger, but the additional run time overhead is limited
1788    to testing the PCRE_UTF8 flag in several places, so should not be very large.
1789    
1790    PCRE assumes that the strings it is given contain valid UTF-8 codes. It does
1791    not diagnose invalid UTF-8 strings. If you pass invalid UTF-8 strings to PCRE,
1792    the results are undefined.
1793    
1794    Running with PCRE_UTF8 set causes these changes in the way PCRE works:
1795    
1796    1. In a pattern, the escape sequence \\x{...}, where the contents of the braces
1797    is a string of hexadecimal digits, is interpreted as a UTF-8 character whose
1798    code number is the given hexadecimal number, for example: \\x{1234}. This
1799    inserts from one to six literal bytes into the pattern, using the UTF-8
1800    encoding. If a non-hexadecimal digit appears between the braces, the item is
1801    not recognized.
1802    
1803    2. The original hexadecimal escape sequence, \\xhh, generates a two-byte UTF-8
1804    character if its value is greater than 127.
1805    
1806    3. Repeat quantifiers are NOT correctly handled if they follow a multibyte
1807    character. For example, \\x{100}* and \\xc3+ do not work. If you want to
1808    repeat such characters, you must enclose them in non-capturing parentheses,
1809    for example (?:\\x{100}), at present.
1810    
1811    4. The dot metacharacter matches one UTF-8 character instead of a single byte.
1812    
1813    5. Unlike literal UTF-8 characters, the dot metacharacter followed by a
1814    repeat quantifier does operate correctly on UTF-8 characters instead of
1815    single bytes.
1816    
1817    6. Although the \\x{...} escape is permitted in a character class, characters
1818    whose values are greater than 255 cannot be included in a class.
1819    
1820    7. A class is matched against a UTF-8 character instead of just a single byte,
1821    but it can match only characters whose values are less than 256. Characters
1822    with greater values always fail to match a class.
1823    
1824    8. Repeated classes work correctly on multiple characters.
1825    
1826    9. Classes containing just a single character whose value is greater than 127
1827    (but less than 256), for example, [\\x80] or [^\\x{93}], do not work because
1828    these are optimized into single byte matches. In the first case, of course,
1829    the class brackets are just redundant.
1830    
1831    10. Lookbehind assertions move backwards in the subject by a fixed number of
1832    characters instead of a fixed number of bytes. Simple cases have been tested
1833    to work correctly, but there may be hidden gotchas herein.
1834    
1835    11. The character types such as \\d and \\w do not work correctly with UTF-8
1836    characters. They continue to test a single byte.
1837    
1838    12. Anything not explicitly mentioned here continues to work in bytes rather
1839    than in characters.
1840    
1841    The following UTF-8 features of Perl 5.6 are not implemented:
1842    
1843    1. The escape sequence \\C to match a single byte.
1844    
1845    2. The use of Unicode tables and properties and escapes \\p, \\P, and \\X.
1846    
1847    
1848    .SH SAMPLE PROGRAM
1849    The code below is a simple, complete demonstration program, to get you started
1850    with using PCRE. This code is also supplied in the file \fIpcredemo.c\fR in the
1851    PCRE distribution.
1852    
1853    The program compiles the regular expression that is its first argument, and
1854    matches it against the subject string in its second argument. No options are
1855    set, and default character tables are used. If matching succeeds, the program
1856    outputs the portion of the subject that matched, together with the contents of
1857    any captured substrings.
1858    
1859    On a Unix system that has PCRE installed in \fI/usr/local\fR, you can compile
1860    the demonstration program using a command like this:
1861    
1862      gcc -o pcredemo pcredemo.c -I/usr/local/include -L/usr/local/lib -lpcre
1863    
1864    Then you can run simple tests like this:
1865    
1866      ./pcredemo 'cat|dog' 'the cat sat on the mat'
1867    
1868    Note that there is a much more comprehensive test program, called
1869    \fBpcretest\fR, which supports many more facilities for testing regular
1870    expressions. The \fBpcredemo\fR program is provided as a simple coding example.
1871    
1872    On some operating systems (e.g. Solaris) you may get an error like this when
1873    you try to run \fBpcredemo\fR:
1874    
1875      ld.so.1: a.out: fatal: libpcre.so.0: open failed: No such file or directory
1876    
1877    This is caused by the way shared library support works on those systems. You
1878    need to add
1879    
1880      -R/usr/local/lib
1881    
1882    to the compile command to get round this problem. Here's the code:
1883    
1884      #include <stdio.h>
1885      #include <string.h>
1886      #include <pcre.h>
1887    
1888      #define OVECCOUNT 30    /* should be a multiple of 3 */
1889    
1890      int main(int argc, char **argv)
1891      {
1892      pcre *re;
1893      const char *error;
1894      int erroffset;
1895      int ovector[OVECCOUNT];
1896      int rc, i;
1897    
1898      if (argc != 3)
1899        {
1900        printf("Two arguments required: a regex and a "
1901          "subject string\\n");
1902        return 1;
1903        }
1904    
1905      /* Compile the regular expression in the first argument */
1906    
1907      re = pcre_compile(
1908        argv[1],     /* the pattern */
1909        0,           /* default options */
1910        &error,      /* for error message */
1911        &erroffset,  /* for error offset */
1912        NULL);       /* use default character tables */
1913    
1914      /* Compilation failed: print the error message and exit */
1915    
1916      if (re == NULL)
1917        {
1918        printf("PCRE compilation failed at offset %d: %s\\n",
1919          erroffset, error);
1920        return 1;
1921        }
1922    
1923      /* Compilation succeeded: match the subject in the second
1924         argument */
1925    
1926      rc = pcre_exec(
1927        re,          /* the compiled pattern */
1928        NULL,        /* we didn't study the pattern */
1929        argv[2],     /* the subject string */
1930        (int)strlen(argv[2]), /* the length of the subject */
1931        0,           /* start at offset 0 in the subject */
1932        0,           /* default options */
1933        ovector,     /* vector for substring information */
1934        OVECCOUNT);  /* number of elements in the vector */
1935    
1936      /* Matching failed: handle error cases */
1937    
1938      if (rc < 0)
1939        {
1940        switch(rc)
1941          {
1942          case PCRE_ERROR_NOMATCH: printf("No match\\n"); break;
1943          /*
1944          Handle other special cases if you like
1945          */
1946          default: printf("Matching error %d\\n", rc); break;
1947          }
1948        return 1;
1949        }
1950    
1951      /* Match succeeded */
1952    
1953      printf("Match succeeded\\n");
1954    
1955      /* The output vector wasn't big enough */
1956    
1957      if (rc == 0)
1958        {
1959        rc = OVECCOUNT/3;
1960        printf("ovector only has room for %d captured "
1961          "substrings\\n", rc - 1);
1962        }
1963    
1964      /* Show substrings stored in the output vector */
1965    
1966      for (i = 0; i < rc; i++)
1967        {
1968        char *substring_start = argv[2] + ovector[2*i];
1969        int substring_length = ovector[2*i+1] - ovector[2*i];
1970        printf("%2d: %.*s\\n", i, substring_length,
1971          substring_start);
1972        }
1973    
1974      return 0;
1975      }
1976    
1977    
1978  .SH AUTHOR  .SH AUTHOR
1979  Philip Hazel <ph10@cam.ac.uk>  Philip Hazel <ph10@cam.ac.uk>
1980  .br  .br
# Line 1497  Cambridge CB2 3QG, England. Line 1986  Cambridge CB2 3QG, England.
1986  .br  .br
1987  Phone: +44 1223 334714  Phone: +44 1223 334714
1988    
1989  Last updated: 29 July 1999  Last updated: 15 August 2001
1990  .br  .br
1991  Copyright (c) 1997-1999 University of Cambridge.  Copyright (c) 1997-2001 University of Cambridge.

Legend:
Removed from v.41  
changed lines
  Added in v.53

  ViewVC Help
Powered by ViewVC 1.1.5