/[pcre]/code/tags/pcre-5.0/doc/pcre.txt
ViewVC logotype

Diff of /code/tags/pcre-5.0/doc/pcre.txt

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 41 by nigel, Sat Feb 24 21:39:17 2007 UTC revision 53 by nigel, Sat Feb 24 21:39:42 2007 UTC
# Line 28  SYNOPSIS Line 28  SYNOPSIS
28       int pcre_get_substring_list(const char *subject,       int pcre_get_substring_list(const char *subject,
29            int *ovector, int stringcount, const char ***listptr);            int *ovector, int stringcount, const char ***listptr);
30    
31         void pcre_free_substring(const char *stringptr);
32    
33         void pcre_free_substring_list(const char **stringptr);
34    
35       const unsigned char *pcre_maketables(void);       const unsigned char *pcre_maketables(void);
36    
37         int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
38              int what, void *where);
39    
40       int pcre_info(const pcre *code, int *optptr, int *firstcharptr);       int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
41    
42       char *pcre_version(void);       char *pcre_version(void);
# Line 45  DESCRIPTION Line 52  DESCRIPTION
52       The PCRE library is a set of functions that implement  regu-       The PCRE library is a set of functions that implement  regu-
53       lar  expression  pattern  matching using the same syntax and       lar  expression  pattern  matching using the same syntax and
54       semantics as Perl  5,  with  just  a  few  differences  (see       semantics as Perl  5,  with  just  a  few  differences  (see
55    
56       below).  The  current  implementation  corresponds  to  Perl       below).  The  current  implementation  corresponds  to  Perl
57       5.005.       5.005, with some additional features  from  later  versions.
58         This  includes  some  experimental,  incomplete  support for
59         UTF-8 encoded strings. Details of exactly what is  and  what
60         is not supported are given below.
61    
62       PCRE has its own native API,  which  is  described  in  this       PCRE has its own native API,  which  is  described  in  this
63       document.  There  is  also  a  set of wrapper functions that       document.  There  is  also  a  set of wrapper functions that
64       correspond to the POSIX API.  These  are  described  in  the       correspond to the POSIX regular expression API.   These  are
65       pcreposix documentation.       described in the pcreposix documentation.
66    
67       The native API function prototypes are defined in the header       The native API function prototypes are defined in the header
68       file  pcre.h,  and  on  Unix  systems  the library itself is       file  pcre.h,  and  on  Unix  systems  the library itself is
69       called libpcre.a, so can be accessed by adding -lpcre to the       called libpcre.a, so can be accessed by adding -lpcre to the
70       command for linking an application which calls it.       command  for  linking  an  application  which  calls it. The
71         header file defines the macros PCRE_MAJOR and PCRE_MINOR  to
72         contain the major and minor release numbers for the library.
73         Applications can use these to include support for  different
74         releases.
75    
76       The functions pcre_compile(), pcre_study(), and  pcre_exec()       The functions pcre_compile(), pcre_study(), and  pcre_exec()
77       are  used  for  compiling  and matching regular expressions,       are  used  for compiling and matching regular expressions. A
78       while   pcre_copy_substring(),   pcre_get_substring(),   and       sample program that demonstrates the simplest way  of  using
79       pcre_get_substring_list()   are  convenience  functions  for       them  is  given  in the file pcredemo.c. The last section of
80         this man page describes how to run it.
81    
82         The functions  pcre_copy_substring(),  pcre_get_substring(),
83         and  pcre_get_substring_list() are convenience functions for
84       extracting  captured  substrings  from  a  matched   subject       extracting  captured  substrings  from  a  matched   subject
85       string.  The function pcre_maketables() is used (optionally)       string; pcre_free_substring() and pcre_free_substring_list()
86       to build a set of character tables in the current locale for       are also provided, to free the  memory  used  for  extracted
87       passing to pcre_compile().       strings.
88    
89       The function pcre_info() is used  to  find  out  information       The function pcre_maketables() is used (optionally) to build
90       about  a compiled pattern, while the function pcre_version()       a  set of character tables in the current locale for passing
91       returns a pointer to a string containing the version of PCRE       to pcre_compile().
92       and its date of release.  
93         The function pcre_fullinfo() is used to find out information
94         about a compiled pattern; pcre_info() is an obsolete version
95         which returns only some of the available information, but is
96         retained   for   backwards   compatibility.    The  function
97         pcre_version() returns a pointer to a string containing  the
98         version of PCRE and its date of release.
99    
100       The global variables  pcre_malloc  and  pcre_free  initially       The global variables  pcre_malloc  and  pcre_free  initially
101       contain the entry points of the standard malloc() and free()       contain the entry points of the standard malloc() and free()
# Line 98  COMPILING A PATTERN Line 124  COMPILING A PATTERN
124       by a binary zero, and is passed in the argument  pattern.  A       by a binary zero, and is passed in the argument  pattern.  A
125       pointer  to  a  single  block of memory that is obtained via       pointer  to  a  single  block of memory that is obtained via
126       pcre_malloc is returned. This contains the compiled code and       pcre_malloc is returned. This contains the compiled code and
127       related data. The pcre type is defined for this for conveni-       related  data.  The  pcre  type  is defined for the returned
128       ence, but in fact pcre is just a typedef for void, since the       block; this is a typedef for a structure whose contents  are
129       contents  of  the block are not externally defined. It is up       not  externally  defined. It is up to the caller to free the
130       to the caller to free  the  memory  when  it  is  no  longer       memory when it is no longer required.
131       required.  
132         Although the compiled code of a PCRE regex  is  relocatable,
133         that is, it does not depend on memory location, the complete
134         pcre data block is not fully relocatable,  because  it  con-
135         tains  a  copy of the tableptr argument, which is an address
136         (see below).
137    
138       The size of a compiled pattern is  roughly  proportional  to       The size of a compiled pattern is  roughly  proportional  to
139       the length of the pattern string, except that each character       the length of the pattern string, except that each character
# Line 137  COMPILING A PATTERN Line 168  COMPILING A PATTERN
168       must  be  the result of a call to pcre_maketables(). See the       must  be  the result of a call to pcre_maketables(). See the
169       section on locale support below.       section on locale support below.
170    
171         This code fragment shows a typical straightforward  call  to
172         pcre_compile():
173    
174           pcre *re;
175           const char *error;
176           int erroffset;
177           re = pcre_compile(
178             "^A.*Z",          /* the pattern */
179             0,                /* default options */
180             &error,           /* for error message */
181             &erroffset,       /* for error offset */
182             NULL);            /* use default character tables */
183    
184       The following option bits are defined in the header file:       The following option bits are defined in the header file:
185    
186         PCRE_ANCHORED         PCRE_ANCHORED
# Line 187  COMPILING A PATTERN Line 231  COMPILING A PATTERN
231    
232         PCRE_EXTRA         PCRE_EXTRA
233    
234       This option turns on additional functionality of  PCRE  that       This option was invented in  order  to  turn  on  additional
235       is  incompatible  with Perl. Any backslash in a pattern that       functionality of PCRE that is incompatible with Perl, but it
236       is followed by a letter that has no special  meaning  causes       is currently of very little use. When set, any backslash  in
237       an  error,  thus  reserving  these  combinations  for future       a  pattern  that is followed by a letter that has no special
238       expansion. By default, as in Perl, a backslash followed by a       meaning causes an error, thus reserving  these  combinations
239       letter  with  no  special  meaning  is treated as a literal.       for  future  expansion.  By default, as in Perl, a backslash
240       There are at present no other features  controlled  by  this       followed by a letter with no special meaning is treated as a
241       option.       literal.  There  are at present no other features controlled
242         by this option. It can also be set by a (?X) option  setting
243         within a pattern.
244    
245         PCRE_MULTILINE         PCRE_MULTILINE
246    
# Line 207  COMPILING A PATTERN Line 253  COMPILING A PATTERN
253       PCRE_DOLLAR_ENDONLY is set). This is the same as Perl.       PCRE_DOLLAR_ENDONLY is set). This is the same as Perl.
254    
255       When PCRE_MULTILINE is set,  the  "start of line"  and  "end       When PCRE_MULTILINE is set,  the  "start of line"  and  "end
256       of   line"   constructs   match   immediately  following  or       of  line"  constructs match immediately following or immedi-
257       immediately  before  any  newline  in  the  subject  string,       ately before any newline  in  the  subject  string,  respec-
258       respectively,  as well as at the very start and end. This is       tively,  as  well  as  at  the  very  start and end. This is
259       equivalent to Perl's /m option. If there are no "\n" charac-       equivalent to Perl's /m option. If there are no "\n" charac-
260       ters  in  a subject string, or no occurrences of ^ or $ in a       ters  in  a subject string, or no occurrences of ^ or $ in a
261       pattern, setting PCRE_MULTILINE has no effect.       pattern, setting PCRE_MULTILINE has no effect.
# Line 221  COMPILING A PATTERN Line 267  COMPILING A PATTERN
267       followed by "?". It is not compatible with Perl. It can also       followed by "?". It is not compatible with Perl. It can also
268       be set by a (?U) option setting within the pattern.       be set by a (?U) option setting within the pattern.
269    
270           PCRE_UTF8
271    
272         This option causes PCRE to regard both the pattern  and  the
273         subject  as strings of UTF-8 characters instead of just byte
274         strings. However, it is available  only  if  PCRE  has  been
275         built  to  include  UTF-8  support.  If not, the use of this
276         option provokes an error. Support for UTF-8 is new,  experi-
277         mental,  and incomplete.  Details of exactly what it entails
278         are given below.
279    
280    
281    
282  STUDYING A PATTERN  STUDYING A PATTERN
# Line 228  STUDYING A PATTERN Line 284  STUDYING A PATTERN
284       worth  spending  more time analyzing it in order to speed up       worth  spending  more time analyzing it in order to speed up
285       the time taken for matching. The function pcre_study() takes       the time taken for matching. The function pcre_study() takes
286       a  pointer  to a compiled pattern as its first argument, and       a  pointer  to a compiled pattern as its first argument, and
287       returns a  pointer  to  a  pcre_extra  block  (another  void       returns a pointer to a pcre_extra block (another typedef for
288       typedef)  containing  additional  information about the pat-       a  structure  with  hidden  contents)  containing additional
289       tern; this can be passed to pcre_exec().  If  no  additional       information  about  the  pattern;  this  can  be  passed  to
290       information is available, NULL is returned.       pcre_exec(). If no additional information is available, NULL
291         is returned.
292    
293       The second argument contains option  bits.  At  present,  no       The second argument contains option  bits.  At  present,  no
294       options  are  defined  for  pcre_study(),  and this argument       options  are  defined  for  pcre_study(),  and this argument
# Line 242  STUDYING A PATTERN Line 299  STUDYING A PATTERN
299       the variable it points to  is  set  to  NULL.  Otherwise  it       the variable it points to  is  set  to  NULL.  Otherwise  it
300       points to a textual error message.       points to a textual error message.
301    
302         This is a typical call to pcre_study():
303    
304           pcre_extra *pe;
305           pe = pcre_study(
306             re,             /* result of pcre_compile() */
307             0,              /* no options exist */
308             &error);        /* set to NULL or points to a message */
309    
310       At present, studying a  pattern  is  useful  only  for  non-       At present, studying a  pattern  is  useful  only  for  non-
311       anchored  patterns  that do not have a single fixed starting       anchored  patterns  that do not have a single fixed starting
312       character. A  bitmap  of  possible  starting  characters  is       character. A  bitmap  of  possible  starting  characters  is
# Line 284  LOCALE SUPPORT Line 349  LOCALE SUPPORT
349    
350    
351  INFORMATION ABOUT A PATTERN  INFORMATION ABOUT A PATTERN
352       The pcre_info() function returns information  about  a  com-       The pcre_fullinfo() function  returns  information  about  a
353       piled pattern.  Its yield is the number of capturing subpat-       compiled pattern. It replaces the obsolete pcre_info() func-
354       terns, or one of the following negative numbers:       tion, which is nevertheless retained for backwards compatibil-
355         ity (and is documented below).
356    
357         The first argument for pcre_fullinfo() is a pointer  to  the
358         compiled  pattern.  The  second  argument  is  the result of
359         pcre_study(), or NULL if the pattern was  not  studied.  The
360         third  argument  specifies  which  piece  of  information is
361         required, while the fourth argument is a pointer to a  vari-
362         able  to receive the data. The yield of the function is zero
363         for success, or one of the following negative numbers:
364    
365         PCRE_ERROR_NULL       the argument code was NULL         PCRE_ERROR_NULL       the argument code was NULL
366                                 the argument where was NULL
367         PCRE_ERROR_BADMAGIC   the "magic number" was not found         PCRE_ERROR_BADMAGIC   the "magic number" was not found
368           PCRE_ERROR_BADOPTION  the value of what was invalid
369    
370       If the optptr argument is not NULL, a copy  of  the  options       Here is a typical call of  pcre_fullinfo(),  to  obtain  the
371       with which the pattern was compiled is placed in the integer       length of the compiled pattern:
372       it points to. These option bits are those specified  in  the  
373           int rc;
374           size_t length;
375           rc = pcre_fullinfo(
376             re,               /* result of pcre_compile() */
377             pe,               /* result of pcre_study(), or NULL */
378             PCRE_INFO_SIZE,   /* what is required */
379             &length);         /* where to put the data */
380    
381         The possible values for the third argument  are  defined  in
382         pcre.h, and are as follows:
383    
384           PCRE_INFO_OPTIONS
385    
386         Return a copy of the options with which the pattern was com-
387         piled.  The fourth argument should point to an unsigned long
388         int variable. These option bits are those specified  in  the
389       call  to  pcre_compile(),  modified  by any top-level option       call  to  pcre_compile(),  modified  by any top-level option
390       settings  within  the   pattern   itself,   and   with   the       settings  within  the   pattern   itself,   and   with   the
391       PCRE_ANCHORED  bit  set  if  the form of the pattern implies       PCRE_ANCHORED  bit  forcibly  set if the form of the pattern
392       that it can match only at the start of a subject string.       implies that it can match only at the  start  of  a  subject
393         string.
394    
395       If the pattern is not anchored and the firstcharptr argument         PCRE_INFO_SIZE
396       is  not  NULL, it is used to pass back information about the  
397       first character of any matched string. If there is  a  fixed       Return the size of the compiled pattern, that is, the  value
398       first    character,    e.g.   from   a   pattern   such   as       that  was  passed as the argument to pcre_malloc() when PCRE
399       (cat|cow|coyote), then it is returned in the integer pointed       was getting memory in which to place the compiled data.  The
400       to by firstcharptr. Otherwise, if either       fourth argument should point to a size_t variable.
401    
402           PCRE_INFO_CAPTURECOUNT
403    
404         Return the number of capturing subpatterns in  the  pattern.
405         The fourth argument should point to an int variable.
406    
407           PCRE_INFO_BACKREFMAX
408    
409         Return the number of the highest back reference in the  pat-
410         tern.  The  fourth argument should point to an int variable.
411         Zero is returned if there are no back references.
412    
413           PCRE_INFO_FIRSTCHAR
414    
415         Return information about the first character of any  matched
416         string,  for  a  non-anchored  pattern.  If there is a fixed
417         first   character,   e.g.   from   a   pattern    such    as
418         (cat|cow|coyote),  it  is returned in the integer pointed to
419         by where. Otherwise, if either
420    
421       (a) the pattern was compiled with the PCRE_MULTILINE option,       (a) the pattern was compiled with the PCRE_MULTILINE option,
422       and every branch starts with "^", or       and every branch starts with "^", or
# Line 312  INFORMATION ABOUT A PATTERN Line 424  INFORMATION ABOUT A PATTERN
424       (b) every  branch  of  the  pattern  starts  with  ".*"  and       (b) every  branch  of  the  pattern  starts  with  ".*"  and
425       PCRE_DOTALL is not set (if it were set, the pattern would be       PCRE_DOTALL is not set (if it were set, the pattern would be
426       anchored),       anchored),
427       then -1 is returned, indicating  that  the  pattern  matches  
428       only  at  the  start  of  a subject string or after any "\n"       -1 is returned, indicating that the pattern matches only  at
429       within the string. Otherwise -2 is returned.       the  start  of a subject string or after any "\n" within the
430         string. Otherwise -2 is returned.  For anchored patterns, -2
431         is returned.
432    
433           PCRE_INFO_FIRSTTABLE
434    
435         If the pattern was studied, and this resulted  in  the  con-
436         struction of a 256-bit table indicating a fixed set of char-
437         acters for the first character in  any  matching  string,  a
438         pointer   to  the  table  is  returned.  Otherwise  NULL  is
439         returned. The fourth argument should point  to  an  unsigned
440         char * variable.
441    
442           PCRE_INFO_LASTLITERAL
443    
444         For a non-anchored pattern, return the value of  the  right-
445         most  literal  character  which  must  exist  in any matched
446         string, other than at its start. The fourth argument  should
447         point  to an int variable. If there is no such character, or
448         if the pattern is anchored, -1 is returned. For example, for
449         the pattern /a\d+z\d+/ the returned value is 'z'.
450    
451         The pcre_info() function is now obsolete because its  inter-
452         face  is  too  restrictive  to return all the available data
453         about  a  compiled  pattern.   New   programs   should   use
454         pcre_fullinfo()  instead.  The  yield  of pcre_info() is the
455         number of capturing subpatterns, or  one  of  the  following
456         negative numbers:
457    
458           PCRE_ERROR_NULL       the argument code was NULL
459           PCRE_ERROR_BADMAGIC   the "magic number" was not found
460    
461         If the optptr argument is not NULL, a copy  of  the  options
462         with which the pattern was compiled is placed in the integer
463         it points to (see PCRE_INFO_OPTIONS above).
464    
465         If the pattern is not anchored and the firstcharptr argument
466         is  not  NULL, it is used to pass back information about the
467         first    character    of    any    matched    string    (see
468         PCRE_INFO_FIRSTCHAR above).
469    
470    
471    
472  MATCHING A PATTERN  MATCHING A PATTERN
473       The function pcre_exec() is called to match a subject string       The function pcre_exec() is called to match a subject string
474    
475    
476    
477    
478    
479    SunOS 5.8                 Last change:                          9
480    
481    
482    
483       against  a pre-compiled pattern, which is passed in the code       against  a pre-compiled pattern, which is passed in the code
484       argument. If the pattern has been studied, the result of the       argument. If the pattern has been studied, the result of the
485       study should be passed in the extra argument. Otherwise this       study should be passed in the extra argument. Otherwise this
486       must be NULL.       must be NULL.
487    
488         Here is an example of a simple call to pcre_exec():
489    
490           int rc;
491           int ovector[30];
492           rc = pcre_exec(
493             re,             /* result of pcre_compile() */
494             NULL,           /* we didn't study the pattern */
495             "some string",  /* the subject string */
496             11,             /* the length of the subject string */
497             0,              /* start at offset 0 in the subject */
498             0,              /* default options */
499             ovector,        /* vector for substring information */
500             30);            /* number of elements in the vector */
501    
502       The PCRE_ANCHORED option can be passed in the options  argu-       The PCRE_ANCHORED option can be passed in the options  argu-
503       ment,  whose unused bits must be zero. However, if a pattern       ment,  whose unused bits must be zero. However, if a pattern
504       was  compiled  with  PCRE_ANCHORED,  or  turned  out  to  be       was  compiled  with  PCRE_ANCHORED,  or  turned  out  to  be
# Line 375  MATCHING A PATTERN Line 549  MATCHING A PATTERN
549    
550       The subject string is passed as  a  pointer  in  subject,  a       The subject string is passed as  a  pointer  in  subject,  a
551       length  in  length,  and  a  starting offset in startoffset.       length  in  length,  and  a  starting offset in startoffset.
552       Unlike the pattern string, it may contain binary zero  char-       Unlike the pattern string, the subject  may  contain  binary
553       acters.  When  the starting offset is zero, the search for a       zero  characters.  When  the  starting  offset  is zero, the
554       match starts at the beginning of the subject, and this is by       search for a match starts at the beginning of  the  subject,
555       far the most common case.       and this is by far the most common case.
556    
557       A non-zero starting offset  is  useful  when  searching  for       A non-zero starting offset  is  useful  when  searching  for
558       another  match  in  the  same subject by calling pcre_exec()       another  match  in  the  same subject by calling pcre_exec()
# Line 514  MATCHING A PATTERN Line 688  MATCHING A PATTERN
688    
689    
690    
691    
692  EXTRACTING CAPTURED SUBSTRINGS  EXTRACTING CAPTURED SUBSTRINGS
693       Captured substrings can be accessed directly  by  using  the       Captured substrings can be accessed directly  by  using  the
694       offsets returned by pcre_exec() in ovector. For convenience,       offsets returned by pcre_exec() in ovector. For convenience,
# Line 533  EXTRACTING CAPTURED SUBSTRINGS Line 708  EXTRACTING CAPTURED SUBSTRINGS
708       entire regular expression. This is  the  value  returned  by       entire regular expression. This is  the  value  returned  by
709       pcre_exec  if  it  is  greater  than  zero.  If  pcre_exec()       pcre_exec  if  it  is  greater  than  zero.  If  pcre_exec()
710       returned zero, indicating that it ran out of space in  ovec-       returned zero, indicating that it ran out of space in  ovec-
711       tor, then the value passed as stringcount should be the size       tor,  the  value passed as stringcount should be the size of
712       of the vector divided by three.       the vector divided by three.
713    
714       The functions pcre_copy_substring() and pcre_get_substring()       The functions pcre_copy_substring() and pcre_get_substring()
715       extract a single substring, whose number is given as string-       extract a single substring, whose number is given as string-
# Line 542  EXTRACTING CAPTURED SUBSTRINGS Line 717  EXTRACTING CAPTURED SUBSTRINGS
717       the entire pattern, while higher values extract the captured       the entire pattern, while higher values extract the captured
718       substrings. For pcre_copy_substring(), the string is  placed       substrings. For pcre_copy_substring(), the string is  placed
719       in  buffer,  whose  length is given by buffersize, while for       in  buffer,  whose  length is given by buffersize, while for
720       pcre_get_substring() a new block of store  is  obtained  via       pcre_get_substring() a new block of memory is  obtained  via
721       pcre_malloc,  and its address is returned via stringptr. The       pcre_malloc,  and its address is returned via stringptr. The
722       yield of the function is  the  length  of  the  string,  not       yield of the function is  the  length  of  the  string,  not
723       including the terminating zero, or one of       including the terminating zero, or one of
# Line 576  EXTRACTING CAPTURED SUBSTRINGS Line 751  EXTRACTING CAPTURED SUBSTRINGS
751       inspecting the appropriate offset in ovector, which is nega-       inspecting the appropriate offset in ovector, which is nega-
752       tive for unset substrings.       tive for unset substrings.
753    
754         The  two  convenience  functions  pcre_free_substring()  and
755         pcre_free_substring_list()  can  be  used to free the memory
756         returned by  a  previous  call  of  pcre_get_substring()  or
757         pcre_get_substring_list(),  respectively.  They  do  nothing
758         more than call the function pointed to by  pcre_free,  which
759         of  course  could  be called directly from a C program. How-
760         ever, PCRE is used in some situations where it is linked via
761         a  special  interface  to another programming language which
762         cannot use pcre_free directly; it is for  these  cases  that
763         the functions are provided.
764    
765    
766    
# Line 583  LIMITATIONS Line 768  LIMITATIONS
768       There are some size limitations in PCRE but it is hoped that       There are some size limitations in PCRE but it is hoped that
769       they will never in practice be relevant.  The maximum length       they will never in practice be relevant.  The maximum length
770       of a compiled pattern is 65539 (sic) bytes.  All  values  in       of a compiled pattern is 65539 (sic) bytes.  All  values  in
771       repeating  quantifiers must be less than 65536.  The maximum       repeating  quantifiers  must be less than 65536.  The  max-
772       number of capturing subpatterns is 99.  The  maximum  number       imum number of capturing subpatterns is 65535.  There is  no
773       of  all  parenthesized subpatterns, including capturing sub-       limit  to  the  number of non-capturing subpatterns, but the
774       patterns, assertions, and other types of subpattern, is 200.       maximum depth of nesting of all kinds of parenthesized  sub-
775         pattern,  including  capturing  subpatterns, assertions, and
776         other types of subpattern, is 200.
777    
778       The maximum length of a subject string is the largest  posi-       The maximum length of a subject string is the largest  posi-
779       tive number that an integer variable can hold. However, PCRE       tive number that an integer variable can hold. However, PCRE
# Line 640  DIFFERENCES FROM PERL Line 827  DIFFERENCES FROM PERL
827       6. The Perl \G assertion is  not  supported  as  it  is  not       6. The Perl \G assertion is  not  supported  as  it  is  not
828       relevant to single pattern matches.       relevant to single pattern matches.
829    
830       7. Fairly obviously, PCRE does  not  support  the  (?{code})       7. Fairly obviously, PCRE does not support the (?{code}) and
831       construction.       (?p{code})  constructions. However, there is some experimen-
832         tal support for recursive patterns using the  non-Perl  item
833         (?R).
834    
835       8. There are at the time of writing some  oddities  in  Perl       8. There are at the time of writing some  oddities  in  Perl
836       5.005_02  concerned  with  the  settings of captured strings       5.005_02  concerned  with  the  settings of captured strings
# Line 649  DIFFERENCES FROM PERL Line 838  DIFFERENCES FROM PERL
838       "aba"  against the pattern /^(a(b)?)+$/ sets $2 to the value       "aba"  against the pattern /^(a(b)?)+$/ sets $2 to the value
839       "b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves  $2       "b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves  $2
840       unset.    However,    if   the   pattern   is   changed   to       unset.    However,    if   the   pattern   is   changed   to
841       /^(aa(b(b))?)+$/ then $2 (and $3) get set.       /^(aa(b(b))?)+$/ then $2 (and $3) are set.
842    
843       In Perl 5.004 $2 is set in both cases, and that is also true       In Perl 5.004 $2 is set in both cases, and that is also true
844       of PCRE. If in the future Perl changes to a consistent state       of PCRE. If in the future Perl changes to a consistent state
# Line 675  DIFFERENCES FROM PERL Line 864  DIFFERENCES FROM PERL
864       (c) If PCRE_EXTRA is set, a backslash followed by  a  letter       (c) If PCRE_EXTRA is set, a backslash followed by  a  letter
865       with no special meaning is faulted.       with no special meaning is faulted.
866    
867       (d)  If  PCRE_UNGREEDY  is  set,  the  greediness   of   the       (d) If PCRE_UNGREEDY is set, the greediness of  the  repeti-
868       repetition quantifiers is inverted, that is, by default they       tion  quantifiers  is inverted, that is, by default they are
869       are not greedy, but if followed by a question mark they are.       not greedy, but if followed by a question mark they are.
870    
871       (e) PCRE_ANCHORED can be used to force a pattern to be tried       (e) PCRE_ANCHORED can be used to force a pattern to be tried
872       only at the start of the subject.       only at the start of the subject.
# Line 685  DIFFERENCES FROM PERL Line 874  DIFFERENCES FROM PERL
874       (f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY  options       (f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY  options
875       for pcre_exec() have no Perl equivalents.       for pcre_exec() have no Perl equivalents.
876    
877         (g) The (?R) construct allows for recursive pattern matching
878         (Perl  5.6 can do this using the (?p{code}) construct, which
879         PCRE cannot of course support.)
880    
881    
882    
883  REGULAR EXPRESSION DETAILS  REGULAR EXPRESSION DETAILS
# Line 693  REGULAR EXPRESSION DETAILS Line 886  REGULAR EXPRESSION DETAILS
886       also described in the Perl documentation and in a number  of       also described in the Perl documentation and in a number  of
887       other  books,  some  of which have copious examples. Jeffrey       other  books,  some  of which have copious examples. Jeffrey
888       Friedl's  "Mastering  Regular  Expressions",  published   by       Friedl's  "Mastering  Regular  Expressions",  published   by
889       O'Reilly  (ISBN 1-56592-257-3), covers them in great detail.       O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
890    
891       The description here is intended as reference documentation.       The description here is intended as reference documentation.
892         The basic operation of PCRE is on strings of bytes. However,
893         there are the beginnings of some support for UTF-8 character
894         strings.  To  use  this  support  you must configure PCRE to
895         include it, and then call pcre_compile() with the  PCRE_UTF8
896         option.  How  this affects the pattern matching is described
897         in the final section of this document.
898    
899       A regular expression is a pattern that is matched against  a       A regular expression is a pattern that is matched against  a
900       subject string from left to right. Most characters stand for       subject string from left to right. Most characters stand for
# Line 749  BACKSLASH Line 949  BACKSLASH
949       The backslash character has several uses. Firstly, if it  is       The backslash character has several uses. Firstly, if it  is
950       followed  by  a  non-alphameric character, it takes away any       followed  by  a  non-alphameric character, it takes away any
951       special  meaning  that  character  may  have.  This  use  of       special  meaning  that  character  may  have.  This  use  of
952    
953       backslash  as  an  escape  character applies both inside and       backslash  as  an  escape  character applies both inside and
954       outside character classes.       outside character classes.
955    
# Line 780  BACKSLASH Line 981  BACKSLASH
981         \f     formfeed (hex 0C)         \f     formfeed (hex 0C)
982         \n     newline (hex 0A)         \n     newline (hex 0A)
983         \r     carriage return (hex 0D)         \r     carriage return (hex 0D)
984           \t     tab (hex 09)
             tab (hex 09)  
985         \xhh   character with hex code hh         \xhh   character with hex code hh
986         \ddd   character with octal code ddd, or backreference         \ddd   character with octal code ddd, or backreference
987    
# Line 833  BACKSLASH Line 1033  BACKSLASH
1033       Note that octal values of 100 or greater must not be  intro-       Note that octal values of 100 or greater must not be  intro-
1034       duced  by  a  leading zero, because no more than three octal       duced  by  a  leading zero, because no more than three octal
1035       digits are ever read.       digits are ever read.
1036    
1037       All the sequences that define a single  byte  value  can  be       All the sequences that define a single  byte  value  can  be
1038       used both inside and outside character classes. In addition,       used both inside and outside character classes. In addition,
1039       inside a character class, the sequence "\b"  is  interpreted       inside a character class, the sequence "\b"  is  interpreted
# Line 885  BACKSLASH Line 1086  BACKSLASH
1086       These assertions may not appear in  character  classes  (but       These assertions may not appear in  character  classes  (but
1087       note that "\b" has a different meaning, namely the backspace       note that "\b" has a different meaning, namely the backspace
1088       character, inside a character class).       character, inside a character class).
1089    
1090       A word boundary is a position in the  subject  string  where       A word boundary is a position in the  subject  string  where
1091       the current character and the previous character do not both       the current character and the previous character do not both
1092       match \w or \W (i.e. one matches \w and  the  other  matches       match \w or \W (i.e. one matches \w and  the  other  matches
# Line 951  CIRCUMFLEX AND DOLLAR Line 1153  CIRCUMFLEX AND DOLLAR
1153    
1154       Note that the sequences \A, \Z, and \z can be used to  match       Note that the sequences \A, \Z, and \z can be used to  match
1155       the  start  and end of the subject in both modes, and if all       the  start  and end of the subject in both modes, and if all
1156       branches of a pattern start with \A is it  always  anchored,       branches of a pattern start with \A it is  always  anchored,
1157       whether PCRE_MULTILINE is set or not.       whether PCRE_MULTILINE is set or not.
1158    
1159    
# Line 960  FULL STOP (PERIOD, DOT) Line 1162  FULL STOP (PERIOD, DOT)
1162       Outside a character class, a dot in the pattern matches  any       Outside a character class, a dot in the pattern matches  any
1163       one character in the subject, including a non-printing char-       one character in the subject, including a non-printing char-
1164       acter, but not (by default)  newline.   If  the  PCRE_DOTALL       acter, but not (by default)  newline.   If  the  PCRE_DOTALL
1165       option  is  set,  then dots match newlines as well. The han-       option  is set, dots match newlines as well. The handling of
1166       dling of dot is entirely independent of the handling of cir-       dot is entirely independent of the  handling  of  circumflex
1167       cumflex  and  dollar,  the only relationship being that they       and  dollar,  the  only  relationship  being  that they both
1168       both involve newline characters.  Dot has no special meaning       involve newline characters. Dot has no special meaning in  a
1169       in a character class.       character class.
1170    
1171    
1172    
# Line 1046  SQUARE BRACKETS Line 1248  SQUARE BRACKETS
1248    
1249    
1250    
1251    POSIX CHARACTER CLASSES
1252         Perl 5.6 (not yet released at the time of writing) is  going
1253         to  support  the POSIX notation for character classes, which
1254         uses names enclosed by  [:  and  :]   within  the  enclosing
1255         square brackets. PCRE supports this notation. For example,
1256    
1257           [01[:alpha:]%]
1258    
1259         matches "0", "1", any alphabetic character, or "%". The sup-
1260         ported class names are
1261    
1262           alnum    letters and digits
1263           alpha    letters
1264           ascii    character codes 0 - 127
1265           cntrl    control characters
1266           digit    decimal digits (same as \d)
1267           graph    printing characters, excluding space
1268           lower    lower case letters
1269           print    printing characters, including space
1270           punct    printing characters, excluding letters and digits
1271           space    white space (same as \s)
1272           upper    upper case letters
1273           word     "word" characters (same as \w)
1274           xdigit   hexadecimal digits
1275    
1276         The names "ascii" and "word" are  Perl  extensions.  Another
1277         Perl  extension is negation, which is indicated by a ^ char-
1278         acter after the colon. For example,
1279    
1280           [12[:^digit:]]
1281    
1282         matches "1", "2", or any non-digit.  PCRE  (and  Perl)  also
1283         recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a
1284         "collating element", but these are  not  supported,  and  an
1285         error is given if they are encountered.
1286    
1287    
1288    
1289  VERTICAL BAR  VERTICAL BAR
1290       Vertical bar characters are  used  to  separate  alternative       Vertical bar characters are  used  to  separate  alternative
1291       patterns. For example, the pattern       patterns. For example, the pattern
# Line 1159  SUBPATTERNS Line 1399  SUBPATTERNS
1399         the ((red|white) (king|queen))         the ((red|white) (king|queen))
1400    
1401       the captured substrings are "red king", "red",  and  "king",       the captured substrings are "red king", "red",  and  "king",
1402       and are numbered 1, 2, and 3.       and are numbered 1, 2, and 3, respectively.
1403    
1404       The fact that plain parentheses fulfil two functions is  not       The fact that plain parentheses fulfil two functions is  not
1405       always  helpful.  There are often times when a grouping sub-       always  helpful.  There are often times when a grouping sub-
# Line 1197  REPETITION Line 1437  REPETITION
1437       Repetition is specified by quantifiers, which can follow any       Repetition is specified by quantifiers, which can follow any
1438       of the following items:       of the following items:
1439    
   
1440         a single character, possibly escaped         a single character, possibly escaped
1441         the . metacharacter         the . metacharacter
1442         a character class         a character class
# Line 1231  REPETITION Line 1470  REPETITION
1470       one that does not match the syntax of a quantifier, is taken       one that does not match the syntax of a quantifier, is taken
1471       as  a literal character. For example, {,6} is not a quantif-       as  a literal character. For example, {,6} is not a quantif-
1472       ier, but a literal string of four characters.       ier, but a literal string of four characters.
   
1473       The quantifier {0} is permitted, causing the  expression  to       The quantifier {0} is permitted, causing the  expression  to
1474       behave  as  if the previous item and the quantifier were not       behave  as  if the previous item and the quantifier were not
1475       present.       present.
# Line 1270  REPETITION Line 1508  REPETITION
1508    
1509         /* first comment */  not comment  /* second comment */         /* first comment */  not comment  /* second comment */
1510    
1511       fails, because it matches  the  entire  string  due  to  the       fails, because it matches the entire  string  owing  to  the
1512       greediness of the .*  item.       greediness of the .*  item.
1513    
1514       However, if a quantifier is followed  by  a  question  mark,       However, if a quantifier is followed by a question mark,  it
1515       then it ceases to be greedy, and instead matches the minimum       ceases  to be greedy, and instead matches the minimum number
1516       number of times possible, so the pattern       of times possible, so the pattern
1517    
1518         /\*.*?\*/         /\*.*?\*/
1519    
# Line 1292  REPETITION Line 1530  REPETITION
1530       that is the only way the rest of the pattern matches.       that is the only way the rest of the pattern matches.
1531    
1532       If the PCRE_UNGREEDY option is set (an option which  is  not       If the PCRE_UNGREEDY option is set (an option which  is  not
1533       available  in  Perl)  then the quantifiers are not greedy by       available  in  Perl),  the  quantifiers  are  not  greedy by
1534       default, but individual ones can be made greedy by following       default, but individual ones can be made greedy by following
1535       them  with  a  question mark. In other words, it inverts the       them  with  a  question mark. In other words, it inverts the
1536       default behaviour.       default behaviour.
# Line 1304  REPETITION Line 1542  REPETITION
1542    
1543       If a pattern starts with .* or  .{0,}  and  the  PCRE_DOTALL       If a pattern starts with .* or  .{0,}  and  the  PCRE_DOTALL
1544       option (equivalent to Perl's /s) is set, thus allowing the .       option (equivalent to Perl's /s) is set, thus allowing the .
1545       to match newlines, then the pattern is implicitly  anchored,       to match  newlines,  the  pattern  is  implicitly  anchored,
1546       because whatever follows will be tried against every charac-       because whatever follows will be tried against every charac-
1547       ter position in the subject string, so there is no point  in       ter position in the subject string, so there is no point  in
1548       retrying  the overall match at any position after the first.       retrying  the overall match at any position after the first.
# Line 1336  REPETITION Line 1574  REPETITION
1574  BACK REFERENCES  BACK REFERENCES
1575       Outside a character class, a backslash followed by  a  digit       Outside a character class, a backslash followed by  a  digit
1576       greater  than  0  (and  possibly  further  digits) is a back       greater  than  0  (and  possibly  further  digits) is a back
1577    
1578    
1579    
1580    
1581    SunOS 5.8                 Last change:                         30
1582    
1583    
1584    
1585       reference to a capturing subpattern  earlier  (i.e.  to  its       reference to a capturing subpattern  earlier  (i.e.  to  its
1586       left)  in  the  pattern,  provided there have been that many       left)  in  the  pattern,  provided there have been that many
1587       previous capturing left parentheses.       previous capturing left parentheses.
# Line 1357  BACK REFERENCES Line 1603  BACK REFERENCES
1603    
1604       matches "sense and sensibility" and "response and  responsi-       matches "sense and sensibility" and "response and  responsi-
1605       bility",  but  not  "sense  and  responsibility". If caseful       bility",  but  not  "sense  and  responsibility". If caseful
1606       matching is in force at the time of the back reference, then       matching is in force at the time of the back reference,  the
1607       the case of letters is relevant. For example,       case of letters is relevant. For example,
1608    
1609         ((?i)rah)\s+\1         ((?i)rah)\s+\1
1610    
# Line 1368  BACK REFERENCES Line 1614  BACK REFERENCES
1614    
1615       There may be more than one back reference to the  same  sub-       There may be more than one back reference to the  same  sub-
1616       pattern.  If  a  subpattern  has not actually been used in a       pattern.  If  a  subpattern  has not actually been used in a
1617       particular match, then any  back  references  to  it  always       particular match, any back references to it always fail. For
1618       fail. For example, the pattern       example, the pattern
1619    
1620         (a|(bc))\2         (a|(bc))\2
1621    
# Line 1377  BACK REFERENCES Line 1623  BACK REFERENCES
1623       Because  there  may  be up to 99 back references, all digits       Because  there  may  be up to 99 back references, all digits
1624       following the backslash are taken as  part  of  a  potential       following the backslash are taken as  part  of  a  potential
1625       back reference number. If the pattern continues with a digit       back reference number. If the pattern continues with a digit
1626       character, then some delimiter must be used to terminate the       character, some delimiter must be used to terminate the back
1627       back reference. If the PCRE_EXTENDED option is set, this can       reference.   If the PCRE_EXTENDED option is set, this can be
1628       be whitespace.  Otherwise an empty comment can be used.       whitespace. Otherwise an empty comment can be used.
1629    
1630       A back reference that occurs inside the parentheses to which       A back reference that occurs inside the parentheses to which
1631       it  refers  fails when the subpattern is first used, so, for       it  refers  fails when the subpattern is first used, so, for
# Line 1389  BACK REFERENCES Line 1635  BACK REFERENCES
1635    
1636         (a|b\1)+         (a|b\1)+
1637    
1638       matches any number of "a"s and also "aba", "ababaa" etc.  At       matches any number of "a"s and also "aba", "ababbaa" etc. At
1639       each iteration of the subpattern, the back reference matches       each iteration of the subpattern, the back reference matches
1640       the character string corresponding to  the  previous  itera-       the character string corresponding to  the  previous  itera-
1641       tion.  In  order  for this to work, the pattern must be such       tion.  In  order  for this to work, the pattern must be such
# Line 1407  ASSERTIONS Line 1653  ASSERTIONS
1653       cated assertions are coded as  subpatterns.  There  are  two       cated assertions are coded as  subpatterns.  There  are  two
1654       kinds:  those that look ahead of the current position in the       kinds:  those that look ahead of the current position in the
1655       subject string, and those that look behind it.       subject string, and those that look behind it.
1656    
1657       An assertion subpattern is matched in the normal way, except       An assertion subpattern is matched in the normal way, except
1658       that  it  does not cause the current matching position to be       that  it  does not cause the current matching position to be
1659       changed. Lookahead assertions start with  (?=  for  positive       changed. Lookahead assertions start with  (?=  for  positive
# Line 1478  ASSERTIONS Line 1725  ASSERTIONS
1725       matches "foo" preceded by three digits that are  not  "999".       matches "foo" preceded by three digits that are  not  "999".
1726       Notice  that each of the assertions is applied independently       Notice  that each of the assertions is applied independently
1727       at the same point in the subject string. First  there  is  a       at the same point in the subject string. First  there  is  a
1728       check  that  the  previous  three characters are all digits,       check that the previous three characters are all digits, and
1729       then there is a check that the same three characters are not       then there is a check that the same three characters are not
1730       "999".   This  pattern  does not match "foo" preceded by six       "999".   This  pattern  does not match "foo" preceded by six
1731       characters, the first of which are digits and the last three       characters, the first of which are digits and the last three
# Line 1572  ONCE-ONLY SUBPATTERNS Line 1819  ONCE-ONLY SUBPATTERNS
1819    
1820         abcd$         abcd$
1821    
1822       when applied to a long  string  which  does  not  match  it.       when applied to a long string which does not match.  Because
1823       Because matching proceeds from left to right, PCRE will look       matching  proceeds  from  left  to right, PCRE will look for
1824       for each "a" in the subject and then  see  if  what  follows       each "a" in the subject and then see if what follows matches
1825       matches the rest of the pattern. If the pattern is specified       the rest of the pattern. If the pattern is specified as
      as  
1826    
1827         ^.*abcd$         ^.*abcd$
1828    
1829       then the initial .* matches the entire string at first,  but       the initial .* matches the entire string at first, but  when
1830       when  this  fails,  it  backtracks to match all but the last       this  fails  (because  there  is no following "a"), it back-
1831       character, then all but the last two characters, and so  on.       tracks to match all but the last character, then all but the
1832       Once again the search for "a" covers the entire string, from       last  two  characters,  and so on. Once again the search for
1833       right to left, so we are no better off. However, if the pat-       "a" covers the entire string, from right to left, so we  are
1834       tern is written as       no better off. However, if the pattern is written as
1835    
1836         ^(?>.*)(?<=abcd)         ^(?>.*)(?<=abcd)
1837    
1838       then there can be no backtracking for the .*  item;  it  can       there can be no backtracking for the .* item; it  can  match
1839       match  only  the  entire  string.  The subsequent lookbehind       only  the entire string. The subsequent lookbehind assertion
1840       assertion does a single test on the last four characters. If       does a single test on the last four characters. If it fails,
1841       it  fails,  the  match  fails immediately. For long strings,       the match fails immediately. For long strings, this approach
1842       this approach makes a significant difference to the process-       makes a significant difference to the processing time.
1843       ing time.  
1844         When a pattern contains an unlimited repeat inside a subpat-
1845         tern  that  can  itself  be  repeated an unlimited number of
1846         times, the use of a once-only subpattern is the only way  to
1847         avoid  some  failing matches taking a very long time indeed.
1848         The pattern
1849    
1850           (\D+|<\d+>)*[!?]
1851    
1852         matches an unlimited number of substrings that  either  con-
1853         sist  of  non-digits,  or digits enclosed in <>, followed by
1854         either ! or ?. When it matches, it runs quickly. However, if
1855         it is applied to
1856    
1857           aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
1858    
1859         it takes a long  time  before  reporting  failure.  This  is
1860         because the string can be divided between the two repeats in
1861         a large number of ways, and all have to be tried. (The exam-
1862         ple  used  [!?]  rather  than a single character at the end,
1863         because both PCRE and Perl have an optimization that  allows
1864         for  fast  failure  when  a  single  character is used. They
1865         remember the last single character that is  required  for  a
1866         match,  and  fail early if it is not present in the string.)
1867         If the pattern is changed to
1868    
1869           ((?>\D+)|<\d+>)*[!?]
1870    
1871         sequences of non-digits cannot be broken, and  failure  hap-
1872         pens quickly.
1873    
1874    
1875    
# Line 1614  CONDITIONAL SUBPATTERNS Line 1889  CONDITIONAL SUBPATTERNS
1889       error occurs.       error occurs.
1890    
1891       There are two kinds of condition. If the  text  between  the       There are two kinds of condition. If the  text  between  the
1892       parentheses  consists  of  a  sequence  of  digits, then the       parentheses  consists of a sequence of digits, the condition
1893       condition is satisfied if the capturing subpattern  of  that       is satisfied if the capturing subpattern of that number  has
1894       number  has  previously matched. Consider the following pat-       previously  matched.  The  number must be greater than zero.
1895       tern, which contains non-significant white space to make  it       Consider  the  following  pattern,   which   contains   non-
1896       more  readable  (assume  the  PCRE_EXTENDED  option)  and to       significant white space to make it more readable (assume the
1897       divide it into three parts for ease of discussion:       PCRE_EXTENDED option) and to divide it into three parts  for
1898         ease of discussion:
1899    
1900         ( \( )?    [^()]+    (?(1) \) )         ( \( )?    [^()]+    (?(1) \) )
1901    
# Line 1668  COMMENTS Line 1944  COMMENTS
1944    
1945    
1946    
1947    RECURSIVE PATTERNS
1948         Consider the problem of matching a  string  in  parentheses,
1949         allowing  for  unlimited nested parentheses. Without the use
1950         of recursion, the best that can be done is to use a  pattern
1951         that  matches  up  to some fixed depth of nesting. It is not
1952         possible to handle an arbitrary nesting depth. Perl 5.6  has
1953         provided   an  experimental  facility  that  allows  regular
1954         expressions to recurse (amongst other things). It does  this
1955         by  interpolating  Perl  code in the expression at run time,
1956         and the code can refer to the expression itself. A Perl pat-
1957         tern  to  solve  the parentheses problem can be created like
1958         this:
1959    
1960           $re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
1961    
1962         The (?p{...}) item interpolates Perl code at run  time,  and
1963         in  this  case refers recursively to the pattern in which it
1964         appears. Obviously, PCRE cannot support the interpolation of
1965         Perl  code.  Instead,  the special item (?R) is provided for
1966         the specific case of recursion. This PCRE pattern solves the
1967         parentheses  problem (assume the PCRE_EXTENDED option is set
1968         so that white space is ignored):
1969    
1970           \( ( (?>[^()]+) | (?R) )* \)
1971    
1972         First it matches an opening parenthesis. Then it matches any
1973         number  of substrings which can either be a sequence of non-
1974         parentheses, or a recursive  match  of  the  pattern  itself
1975         (i.e. a correctly parenthesized substring). Finally there is
1976         a closing parenthesis.
1977    
1978         This particular example pattern  contains  nested  unlimited
1979         repeats, and so the use of a once-only subpattern for match-
1980         ing strings of non-parentheses is  important  when  applying
1981         the  pattern to strings that do not match. For example, when
1982         it is applied to
1983    
1984           (aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
1985    
1986         it yields "no match" quickly. However, if a  once-only  sub-
1987         pattern  is  not  used,  the match runs for a very long time
1988         indeed because there are so many different ways the + and  *
1989         repeats  can carve up the subject, and all have to be tested
1990         before failure can be reported.
1991    
1992         The values set for any capturing subpatterns are those  from
1993         the outermost level of the recursion at which the subpattern
1994         value is set. If the pattern above is matched against
1995    
1996           (ab(cd)ef)
1997    
1998         the value for the capturing parentheses is  "ef",  which  is
1999         the  last  value  taken  on  at the top level. If additional
2000         parentheses are added, giving
2001    
2002           \( ( ( (?>[^()]+) | (?R) )* ) \)
2003              ^                        ^
2004              ^                        ^
2005         the string they capture is "ab(cd)ef", the contents of the
2006         top level parentheses. If there are more than 15 capturing
2007         parentheses in a pattern, PCRE has to obtain extra memory to
2008         store data during a recursion, which it does by using
2009         pcre_malloc, freeing it via pcre_free afterwards. If no
2010         memory can be obtained, it saves data for the first 15
2011         capturing parentheses only, as there is no way to give an
2012         out-of-memory error from within a recursion.
2013    
2014    
2015    
2016  PERFORMANCE  PERFORMANCE
2017       Certain items that may appear in patterns are more efficient       Certain items that may appear in patterns are more efficient
2018       than  others.  It is more efficient to use a character class       than  others.  It is more efficient to use a character class
# Line 1735  PERFORMANCE Line 2080  PERFORMANCE
2080    
2081    
2082    
2083    UTF-8 SUPPORT
2084         Starting at release 3.3, PCRE has some support for character
2085         strings encoded in the UTF-8 format. This is incomplete, and
2086         is regarded as experimental. In order to use  it,  you  must
2087         configure PCRE to include UTF-8 support in the code, and, in
2088         addition, you must call pcre_compile()  with  the  PCRE_UTF8
2089         option flag. When you do this, both the pattern and any sub-
2090         ject strings that are matched  against  it  are  treated  as
2091         UTF-8  strings instead of just strings of bytes, but only in
2092         the cases that are mentioned below.
2093    
2094         If you compile PCRE with UTF-8 support, but do not use it at
2095         run  time,  the  library will be a bit bigger, but the addi-
2096         tional run time overhead is limited to testing the PCRE_UTF8
2097         flag in several places, so should not be very large.
2098    
2099         PCRE assumes that the strings  it  is  given  contain  valid
2100         UTF-8  codes. It does not diagnose invalid UTF-8 strings. If
2101         you pass invalid UTF-8 strings  to  PCRE,  the  results  are
2102         undefined.
2103    
2104         Running with PCRE_UTF8 set causes these changes in  the  way
2105         PCRE works:
2106    
2107         1. In a pattern, the  escape  sequence  \x{...},  where  the
2108         contents of the braces is a string of hexadecimal digits, is
2109         interpreted as a UTF-8 character whose code  number  is  the
2110         given   hexadecimal  number,  for  example:  \x{1234}.  This
2111         inserts from one to six  literal  bytes  into  the  pattern,
2112         using the UTF-8 encoding. If a non-hexadecimal digit appears
2113         between the braces, the item is not recognized.
2114    
2115         2. The original hexadecimal escape sequence, \xhh, generates
2116         a two-byte UTF-8 character if its value is greater than 127.
2117    
2118         3. Repeat quantifiers are NOT correctly handled if they fol-
2119         low  a  multibyte character. For example, \x{100}* and \xc3+
2120         do not work. If you want to repeat such characters, you must
2121         enclose  them  in  non-capturing  parentheses,  for  example
2122         (?:\x{100}), at present.
2123    
2124         4. The dot metacharacter matches one UTF-8 character instead
2125         of a single byte.
2126    
2127         5. Unlike literal UTF-8 characters,  the  dot  metacharacter
2128         followed  by  a  repeat quantifier does operate correctly on
2129         UTF-8 characters instead of single bytes.
2130    
2131         6. Although the \x{...} escape is permitted in  a  character
2132         class,  characters  whose values are greater than 255 cannot
2133         be included in a class.
2134    
2135         7. A class is matched against a UTF-8 character  instead  of
2136         just  a  single byte, but it can match only characters whose
2137         values are less than 256.  Characters  with  greater  values
2138         always fail to match a class.
2139    
2140         8. Repeated classes work correctly on multiple characters.
2141    
2142         9. Classes containing just a single character whose value is
2143         greater than 127 (but less than 256), for example, [\x80] or
2144         [^\x{93}], do not work because these are optimized into sin-
2145         gle  byte  matches.  In the first case, of course, the class
2146         brackets are just redundant.
2147    
2148         10. Lookbehind assertions move backwards in the subject by a
2149         fixed  number  of  characters  instead  of a fixed number of
2150         bytes. Simple cases have been tested to work correctly,  but
2151         there may be hidden gotchas herein.
2152    
2153         11. The character types such  as  \d  and  \w  do  not  work
2154         correctly  with  UTF-8  characters.  They continue to test a
2155         single byte.
2156    
2157         12. Anything not explicitly mentioned here continues to work
2158         in bytes rather than in characters.
2159    
2160         The following UTF-8 features of  Perl  5.6  are  not  imple-
2161         mented:
2162    
2163         1. The escape sequence \C to match a single byte.
2164    
2165         2. The use of Unicode tables and properties and escapes  \p,
2166         \P, and \X.
2167    
2168    
2169    
2170    SAMPLE PROGRAM
2171         The code below is a simple, complete demonstration  program,
2172         to  get  you started with using PCRE. This code is also sup-
2173         plied in the file pcredemo.c in the PCRE distribution.
2174    
2175         The program compiles the  regular  expression  that  is  its
2176         first argument, and matches it against the subject string in
2177         its second argument. No options are set, and default charac-
2178         ter  tables are used. If matching succeeds, the program out-
2179         puts the portion of the subject that matched, together  with
2180         the contents of any captured substrings.
2181    
2182         On a Unix system that has PCRE installed in /usr/local,  you
2183         can  compile  the demonstration program using a command like
2184         this:
2185    
2186           gcc   -o    pcredemo    pcredemo.c    -I/usr/local/include
2187         -L/usr/local/lib -lpcre
2188    
2189         Then you can run simple tests like this:
2190    
2191           ./pcredemo 'cat|dog' 'the cat sat on the mat'
2192    
2193         Note that there is a much more comprehensive  test  program,
2194         called  pcretest,  which  supports  many more facilities for
2195         testing regular expressions. The pcredemo  program  is  pro-
2196         vided as a simple coding example.
2197    
2198         On some operating systems (e.g.  Solaris)  you  may  get  an
2199         error like this when you try to run pcredemo:
2200    
2201           ld.so.1: a.out: fatal: libpcre.so.0: open failed: No  such
2202         file or directory
2203    
2204         This is caused by the way shared library  support  works  on
2205         those systems. You need to add
2206    
2207           -R/usr/local/lib
2208    
2209         to the compile command to get round this problem. Here's the
2210         code:
2211    
2212           #include <stdio.h>
2213           #include <string.h>
2214           #include <pcre.h>
2215    
2216           #define OVECCOUNT 30    /* should be a multiple of 3 */
2217    
2218           int main(int argc, char **argv)
2219           {
2220           pcre *re;
2221           const char *error;
2222           int erroffset;
2223           int ovector[OVECCOUNT];
2224           int rc, i;
2225    
2226           if (argc != 3)
2227             {
2228             printf("Two arguments required: a regex and a "
2229               "subject string\n");
2230             return 1;
2231             }
2232    
2233           /* Compile the regular expression in the first argument */
2234    
2235           re = pcre_compile(
2236             argv[1],     /* the pattern */
2237             0,           /* default options */
2238             &error,      /* for error message */
2239             &erroffset,  /* for error offset */
2240             NULL);       /* use default character tables */
2241    
2242           /* Compilation failed: print the error message and exit */
2243    
2244           if (re == NULL)
2245             {
2246             printf("PCRE compilation failed at offset %d: %s\n",
2247               erroffset, error);
2248             return 1;
2249             }
2250    
2251           /* Compilation succeeded: match the subject in the second
2252              argument */
2253    
2254           rc = pcre_exec(
2255             re,          /* the compiled pattern */
2256             NULL,        /* we didn't study the pattern */
2257             argv[2],     /* the subject string */
2258             (int)strlen(argv[2]), /* the length of the subject */
2259             0,           /* start at offset 0 in the subject */
2260             0,           /* default options */
2261             ovector,     /* vector for substring information */
2262             OVECCOUNT);  /* number of elements in the vector */
2263    
2264           /* Matching failed: handle error cases */
2265    
2266           if (rc < 0)
2267             {
2268             switch(rc)
2269               {
2270               case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
2271               /*
2272               Handle other special cases if you like
2273               */
2274               default: printf("Matching error %d\n", rc); break;
2275               }
2276             return 1;
2277             }
2278    
2279           /* Match succeeded */
2280    
2281           printf("Match succeeded\n");
2282    
2283           /* The output vector wasn't big enough */
2284    
2285           if (rc == 0)
2286             {
2287             rc = OVECCOUNT/3;
2288             printf("ovector only has room for %d captured "
2289               "substrings\n", rc - 1);
2290             }
2291    
2292           /* Show substrings stored in the output vector */
2293    
2294           for (i = 0; i < rc; i++)
2295             {
2296             char *substring_start = argv[2] + ovector[2*i];
2297             int substring_length = ovector[2*i+1] - ovector[2*i];
2298             printf("%2d: %.*s\n", i, substring_length,
2299               substring_start);
2300             }
2301    
2302           return 0;
2303           }
2304    
2305    
2306    
2307  AUTHOR  AUTHOR
2308       Philip Hazel <ph10@cam.ac.uk>       Philip Hazel <ph10@cam.ac.uk>
2309       University Computing Service,       University Computing Service,
# Line 1742  AUTHOR Line 2311  AUTHOR
2311       Cambridge CB2 3QG, England.       Cambridge CB2 3QG, England.
2312       Phone: +44 1223 334714       Phone: +44 1223 334714
2313    
2314       Last updated: 29 July 1999       Last updated: 15 August 2001
2315       Copyright (c) 1997-1999 University of Cambridge.       Copyright (c) 1997-2001 University of Cambridge.

Legend:
Removed from v.41  
changed lines
  Added in v.53

  ViewVC Help
Powered by ViewVC 1.1.5