/[pcre]/code/trunk/doc/pcre.txt
ViewVC logotype

Diff of /code/trunk/doc/pcre.txt

Parent Directory | Revision Log | View Patch

revision 47 by nigel, Sat Feb 24 21:39:29 2007 UTC revision 53 by nigel, Sat Feb 24 21:39:42 2007 UTC
# Line 28  SYNOPSIS Line 28  SYNOPSIS
28       int pcre_get_substring_list(const char *subject,       int pcre_get_substring_list(const char *subject,
29            int *ovector, int stringcount, const char ***listptr);            int *ovector, int stringcount, const char ***listptr);
30    
31         void pcre_free_substring(const char *stringptr);
32    
33         void pcre_free_substring_list(const char **stringptr);
34    
35       const unsigned char *pcre_maketables(void);       const unsigned char *pcre_maketables(void);
36    
37       int pcre_fullinfo(const pcre *code, const pcre_extra *extra,       int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
# Line 48  DESCRIPTION Line 52  DESCRIPTION
52       The PCRE library is a set of functions that implement  regu-       The PCRE library is a set of functions that implement  regu-
53       lar  expression  pattern  matching using the same syntax and       lar  expression  pattern  matching using the same syntax and
54       semantics as Perl  5,  with  just  a  few  differences  (see       semantics as Perl  5,  with  just  a  few  differences  (see
55    
56       below).  The  current  implementation  corresponds  to  Perl       below).  The  current  implementation  corresponds  to  Perl
57       5.005, with some additional features from the Perl  develop-       5.005, with some additional features  from  later  versions.
58       ment release.       This  includes  some  experimental,  incomplete  support for
59         UTF-8 encoded strings. Details of exactly what is  and  what
60         is not supported are given below.
61    
62       PCRE has its own native API,  which  is  described  in  this       PCRE has its own native API,  which  is  described  in  this
63       document.  There  is  also  a  set of wrapper functions that       document.  There  is  also  a  set of wrapper functions that
# Line 67  DESCRIPTION Line 74  DESCRIPTION
74       releases.       releases.
75    
76       The functions pcre_compile(), pcre_study(), and  pcre_exec()       The functions pcre_compile(), pcre_study(), and  pcre_exec()
77       are  used  for  compiling  and matching regular expressions,       are  used  for compiling and matching regular expressions. A
78       while   pcre_copy_substring(),   pcre_get_substring(),   and       sample program that demonstrates the simplest way  of  using
79       pcre_get_substring_list()   are  convenience  functions  for       them  is  given  in the file pcredemo.c. The last section of
80         this man page describes how to run it.
81    
82         The functions  pcre_copy_substring(),  pcre_get_substring(),
83         and  pcre_get_substring_list() are convenience functions for
84       extracting  captured  substrings  from  a  matched   subject       extracting  captured  substrings  from  a  matched   subject
85       string.  The function pcre_maketables() is used (optionally)       string; pcre_free_substring() and pcre_free_substring_list()
86       to build a set of character tables in the current locale for       are also provided, to free the  memory  used  for  extracted
87       passing to pcre_compile().       strings.
88    
89         The function pcre_maketables() is used (optionally) to build
90         a  set of character tables in the current locale for passing
91         to pcre_compile().
92    
93       The function pcre_fullinfo() is used to find out information       The function pcre_fullinfo() is used to find out information
94       about a compiled pattern; pcre_info() is an obsolete version       about a compiled pattern; pcre_info() is an obsolete version
# Line 103  MULTI-THREADING Line 118  MULTI-THREADING
118    
119    
120    
   
121  COMPILING A PATTERN  COMPILING A PATTERN
122       The function pcre_compile() is called to compile  a  pattern       The function pcre_compile() is called to compile  a  pattern
123       into  an internal form. The pattern is a C string terminated       into  an internal form. The pattern is a C string terminated
124       by a binary zero, and is passed in the argument  pattern.  A       by a binary zero, and is passed in the argument  pattern.  A
125       pointer  to  a  single  block of memory that is obtained via       pointer  to  a  single  block of memory that is obtained via
126       pcre_malloc is returned. This contains the compiled code and       pcre_malloc is returned. This contains the compiled code and
127       related data. The pcre type is defined for this for conveni-       related  data.  The  pcre  type  is defined for the returned
128       ence, but in fact pcre is just a typedef for void, since the       block; this is a typedef for a structure whose contents  are
129       contents  of  the block are not externally defined. It is up       not  externally  defined. It is up to the caller to free the
130       to the caller to free  the  memory  when  it  is  no  longer       memory when it is no longer required.
131       required.  
132         Although the compiled code of a PCRE regex  is  relocatable,
133         that is, it does not depend on memory location, the complete
134         pcre data block is not fully relocatable,  because  it  con-
135         tains  a  copy of the tableptr argument, which is an address
136         (see below).
137    
138       The size of a compiled pattern is  roughly  proportional  to       The size of a compiled pattern is  roughly  proportional  to
139       the length of the pattern string, except that each character       the length of the pattern string, except that each character
# Line 149  COMPILING A PATTERN Line 168  COMPILING A PATTERN
168       must  be  the result of a call to pcre_maketables(). See the       must  be  the result of a call to pcre_maketables(). See the
169       section on locale support below.       section on locale support below.
170    
171         This code fragment shows a typical straightforward  call  to
172         pcre_compile():
173    
174           pcre *re;
175           const char *error;
176           int erroffset;
177           re = pcre_compile(
178             "^A.*Z",          /* the pattern */
179             0,                /* default options */
180             &error,           /* for error message */
181             &erroffset,       /* for error offset */
182             NULL);            /* use default character tables */
183    
184       The following option bits are defined in the header file:       The following option bits are defined in the header file:
185    
186         PCRE_ANCHORED         PCRE_ANCHORED
# Line 235  COMPILING A PATTERN Line 267  COMPILING A PATTERN
267       followed by "?". It is not compatible with Perl. It can also       followed by "?". It is not compatible with Perl. It can also
268       be set by a (?U) option setting within the pattern.       be set by a (?U) option setting within the pattern.
269    
270           PCRE_UTF8
271    
272         This option causes PCRE to regard both the pattern  and  the
273         subject  as strings of UTF-8 characters instead of just byte
274         strings. However, it is available  only  if  PCRE  has  been
275         built  to  include  UTF-8  support.  If not, the use of this
276         option provokes an error. Support for UTF-8 is new,  experi-
277         mental,  and incomplete.  Details of exactly what it entails
278         are given below.
279    
280    
281    
282  STUDYING A PATTERN  STUDYING A PATTERN
# Line 242  STUDYING A PATTERN Line 284  STUDYING A PATTERN
284       worth  spending  more time analyzing it in order to speed up       worth  spending  more time analyzing it in order to speed up
285       the time taken for matching. The function pcre_study() takes       the time taken for matching. The function pcre_study() takes
286       a  pointer  to a compiled pattern as its first argument, and       a  pointer  to a compiled pattern as its first argument, and
287       returns a  pointer  to  a  pcre_extra  block  (another  void       returns a pointer to a pcre_extra block (another typedef for
288       typedef)  containing  additional  information about the pat-       a  structure  with  hidden  contents)  containing additional
289       tern; this can be passed to pcre_exec().  If  no  additional       information  about  the  pattern;  this  can  be  passed  to
290       information is available, NULL is returned.       pcre_exec(). If no additional information is available, NULL
291         is returned.
292    
293       The second argument contains option  bits.  At  present,  no       The second argument contains option  bits.  At  present,  no
294       options  are  defined  for  pcre_study(),  and this argument       options  are  defined  for  pcre_study(),  and this argument
# Line 256  STUDYING A PATTERN Line 299  STUDYING A PATTERN
299       the variable it points to  is  set  to  NULL.  Otherwise  it       the variable it points to  is  set  to  NULL.  Otherwise  it
300       points to a textual error message.       points to a textual error message.
301    
302         This is a typical call to pcre_study():
303    
304           pcre_extra *pe;
305           pe = pcre_study(
306             re,             /* result of pcre_compile() */
307             0,              /* no options exist */
308             &error);        /* set to NULL or points to a message */
309    
310       At present, studying a  pattern  is  useful  only  for  non-       At present, studying a  pattern  is  useful  only  for  non-
311       anchored  patterns  that do not have a single fixed starting       anchored  patterns  that do not have a single fixed starting
312       character. A  bitmap  of  possible  starting  characters  is       character. A  bitmap  of  possible  starting  characters  is
# Line 316  INFORMATION ABOUT A PATTERN Line 367  INFORMATION ABOUT A PATTERN
367         PCRE_ERROR_BADMAGIC   the "magic number" was not found         PCRE_ERROR_BADMAGIC   the "magic number" was not found
368         PCRE_ERROR_BADOPTION  the value of what was invalid         PCRE_ERROR_BADOPTION  the value of what was invalid
369    
370         Here is a typical call of  pcre_fullinfo(),  to  obtain  the
371         length of the compiled pattern:
372    
373           int rc;
374           unsigned long int length;
375           rc = pcre_fullinfo(
376             re,               /* result of pcre_compile() */
377             pe,               /* result of pcre_study(), or NULL */
378             PCRE_INFO_SIZE,   /* what is required */
379             &length);         /* where to put the data */
380    
381       The possible values for the third argument  are  defined  in       The possible values for the third argument  are  defined  in
382       pcre.h, and are as follows:       pcre.h, and are as follows:
383    
384         PCRE_INFO_OPTIONS         PCRE_INFO_OPTIONS
385    
386       Return a copy of the options with which the pattern was com-       Return a copy of the options with which the pattern was com-
387       piled.  The fourth argument should point to au unsigned long       piled.  The fourth argument should point to an unsigned long
388       int variable. These option bits are those specified  in  the       int variable. These option bits are those specified  in  the
389       call  to  pcre_compile(),  modified  by any top-level option       call  to  pcre_compile(),  modified  by any top-level option
390       settings  within  the   pattern   itself,   and   with   the       settings  within  the   pattern   itself,   and   with   the
# Line 409  INFORMATION ABOUT A PATTERN Line 471  INFORMATION ABOUT A PATTERN
471    
472  MATCHING A PATTERN  MATCHING A PATTERN
473       The function pcre_exec() is called to match a subject string       The function pcre_exec() is called to match a subject string
474    
475    
476    
477    
478    
479    SunOS 5.8                 Last change:                          9
480    
481    
482    
483       against  a pre-compiled pattern, which is passed in the code       against  a pre-compiled pattern, which is passed in the code
484       argument. If the pattern has been studied, the result of the       argument. If the pattern has been studied, the result of the
485       study should be passed in the extra argument. Otherwise this       study should be passed in the extra argument. Otherwise this
486       must be NULL.       must be NULL.
487    
488         Here is an example of a simple call to pcre_exec():
489    
490           int rc;
491           int ovector[30];
492           rc = pcre_exec(
493             re,             /* result of pcre_compile() */
494             NULL,           /* we didn't study the pattern */
495             "some string",  /* the subject string */
496             11,             /* the length of the subject string */
497             0,              /* start at offset 0 in the subject */
498             0,              /* default options */
499             ovector,        /* vector for substring information */
500             30);            /* number of elements in the vector */
501    
502       The PCRE_ANCHORED option can be passed in the options  argu-       The PCRE_ANCHORED option can be passed in the options  argu-
503       ment,  whose unused bits must be zero. However, if a pattern       ment,  whose unused bits must be zero. However, if a pattern
504       was  compiled  with  PCRE_ANCHORED,  or  turned  out  to  be       was  compiled  with  PCRE_ANCHORED,  or  turned  out  to  be
# Line 464  MATCHING A PATTERN Line 549  MATCHING A PATTERN
549    
550       The subject string is passed as  a  pointer  in  subject,  a       The subject string is passed as  a  pointer  in  subject,  a
551       length  in  length,  and  a  starting offset in startoffset.       length  in  length,  and  a  starting offset in startoffset.
552       Unlike the pattern string, it may contain binary zero  char-       Unlike the pattern string, the subject  may  contain  binary
553       acters.  When  the starting offset is zero, the search for a       zero  characters.  When  the  starting  offset  is zero, the
554       match starts at the beginning of the subject, and this is by       search for a match starts at the beginning of  the  subject,
555       far the most common case.       and this is by far the most common case.
556    
557       A non-zero starting offset  is  useful  when  searching  for       A non-zero starting offset  is  useful  when  searching  for
558       another  match  in  the  same subject by calling pcre_exec()       another  match  in  the  same subject by calling pcre_exec()
# Line 603  MATCHING A PATTERN Line 688  MATCHING A PATTERN
688    
689    
690    
691    
692  EXTRACTING CAPTURED SUBSTRINGS  EXTRACTING CAPTURED SUBSTRINGS
693       Captured substrings can be accessed directly  by  using  the       Captured substrings can be accessed directly  by  using  the
694       offsets returned by pcre_exec() in ovector. For convenience,       offsets returned by pcre_exec() in ovector. For convenience,
# Line 631  EXTRACTING CAPTURED SUBSTRINGS Line 717  EXTRACTING CAPTURED SUBSTRINGS
717       the entire pattern, while higher values extract the captured       the entire pattern, while higher values extract the captured
718       substrings. For pcre_copy_substring(), the string is  placed       substrings. For pcre_copy_substring(), the string is  placed
719       in  buffer,  whose  length is given by buffersize, while for       in  buffer,  whose  length is given by buffersize, while for
720       pcre_get_substring() a new block of store  is  obtained  via       pcre_get_substring() a new block of memory is  obtained  via
721       pcre_malloc,  and its address is returned via stringptr. The       pcre_malloc,  and its address is returned via stringptr. The
722       yield of the function is  the  length  of  the  string,  not       yield of the function is  the  length  of  the  string,  not
723       including the terminating zero, or one of       including the terminating zero, or one of
# Line 665  EXTRACTING CAPTURED SUBSTRINGS Line 751  EXTRACTING CAPTURED SUBSTRINGS
751       inspecting the appropriate offset in ovector, which is nega-       inspecting the appropriate offset in ovector, which is nega-
752       tive for unset substrings.       tive for unset substrings.
753    
754         The  two  convenience  functions  pcre_free_substring()  and
755         pcre_free_substring_list()  can  be  used to free the memory
756         returned by  a  previous  call  of  pcre_get_substring()  or
757         pcre_get_substring_list(),  respectively.  They  do  nothing
758         more than call the function pointed to by  pcre_free,  which
759         of  course  could  be called directly from a C program. How-
760         ever, PCRE is used in some situations where it is linked via
761         a  special  interface  to another programming language which
762         cannot use pcre_free directly; it is for  these  cases  that
763         the functions are provided.
764    
765    
766    
# Line 672  LIMITATIONS Line 768  LIMITATIONS
768       There are some size limitations in PCRE but it is hoped that       There are some size limitations in PCRE but it is hoped that
769       they will never in practice be relevant.  The maximum length       they will never in practice be relevant.  The maximum length
770       of a compiled pattern is 65539 (sic) bytes.  All  values  in       of a compiled pattern is 65539 (sic) bytes.  All  values  in
771       repeating  quantifiers must be less than 65536.  The maximum       repeating  quantifiers  must be less than 65536.  The  max-
772       number of capturing subpatterns is 99.  The  maximum  number       imum number of capturing subpatterns is 65535.  There is  no
773       of  all  parenthesized subpatterns, including capturing sub-       limit  to  the  number of non-capturing subpatterns, but the
774       patterns, assertions, and other types of subpattern, is 200.       maximum depth of nesting of all kinds of parenthesized  sub-
775         pattern,  including  capturing  subpatterns, assertions, and
776         other types of subpattern, is 200.
777    
778       The maximum length of a subject string is the largest  posi-       The maximum length of a subject string is the largest  posi-
779       tive number that an integer variable can hold. However, PCRE       tive number that an integer variable can hold. However, PCRE
# Line 733  DIFFERENCES FROM PERL Line 831  DIFFERENCES FROM PERL
831       (?p{code})  constructions. However, there is some experimen-       (?p{code})  constructions. However, there is some experimen-
832       tal support for recursive patterns using the  non-Perl  item       tal support for recursive patterns using the  non-Perl  item
833       (?R).       (?R).
834    
835       8. There are at the time of writing some  oddities  in  Perl       8. There are at the time of writing some  oddities  in  Perl
836       5.005_02  concerned  with  the  settings of captured strings       5.005_02  concerned  with  the  settings of captured strings
837       when part of a pattern is repeated.  For  example,  matching       when part of a pattern is repeated.  For  example,  matching
# Line 785  REGULAR EXPRESSION DETAILS Line 884  REGULAR EXPRESSION DETAILS
884       The syntax and semantics of  the  regular  expressions  sup-       The syntax and semantics of  the  regular  expressions  sup-
885       ported  by PCRE are described below. Regular expressions are       ported  by PCRE are described below. Regular expressions are
886       also described in the Perl documentation and in a number  of       also described in the Perl documentation and in a number  of
   
887       other  books,  some  of which have copious examples. Jeffrey       other  books,  some  of which have copious examples. Jeffrey
888       Friedl's  "Mastering  Regular  Expressions",  published   by       Friedl's  "Mastering  Regular  Expressions",  published   by
889       O'Reilly  (ISBN  1-56592-257),  covers them in great detail.       O'Reilly (ISBN 1-56592-257), covers them in great detail.
890    
891       The description here is intended as reference documentation.       The description here is intended as reference documentation.
892         The basic operation of PCRE is on strings of bytes. However,
893         there are the beginnings of some support for UTF-8 character
894         strings.  To  use  this  support  you must configure PCRE to
895         include it, and then call pcre_compile() with the  PCRE_UTF8
896         option.  How  this affects the pattern matching is described
897         in the final section of this document.
898    
899       A regular expression is a pattern that is matched against  a       A regular expression is a pattern that is matched against  a
900       subject string from left to right. Most characters stand for       subject string from left to right. Most characters stand for
# Line 844  BACKSLASH Line 949  BACKSLASH
949       The backslash character has several uses. Firstly, if it  is       The backslash character has several uses. Firstly, if it  is
950       followed  by  a  non-alphameric character, it takes away any       followed  by  a  non-alphameric character, it takes away any
951       special  meaning  that  character  may  have.  This  use  of       special  meaning  that  character  may  have.  This  use  of
952    
953       backslash  as  an  escape  character applies both inside and       backslash  as  an  escape  character applies both inside and
954       outside character classes.       outside character classes.
955    
# Line 1047  CIRCUMFLEX AND DOLLAR Line 1153  CIRCUMFLEX AND DOLLAR
1153    
1154       Note that the sequences \A, \Z, and \z can be used to  match       Note that the sequences \A, \Z, and \z can be used to  match
1155       the  start  and end of the subject in both modes, and if all       the  start  and end of the subject in both modes, and if all
1156       branches of a pattern start with \A is it  always  anchored,       branches of a pattern start with \A it is  always  anchored,
1157       whether PCRE_MULTILINE is set or not.       whether PCRE_MULTILINE is set or not.
1158    
1159    
# Line 1174  POSIX CHARACTER CLASSES Line 1280  POSIX CHARACTER CLASSES
1280         [12[:^digit:]]         [12[:^digit:]]
1281    
1282       matches "1", "2", or any non-digit.  PCRE  (and  Perl)  also       matches "1", "2", or any non-digit.  PCRE  (and  Perl)  also
1283       recogize  the POSIX syntax [.ch.] and [=ch=] where "ch" is a       recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a
1284       "collating element", but these are  not  supported,  and  an       "collating element", but these are  not  supported,  and  an
1285       error is given if they are encountered.       error is given if they are encountered.
1286    
# Line 1293  SUBPATTERNS Line 1399  SUBPATTERNS
1399         the ((red|white) (king|queen))         the ((red|white) (king|queen))
1400    
1401       the captured substrings are "red king", "red",  and  "king",       the captured substrings are "red king", "red",  and  "king",
1402       and are numbered 1, 2, and 3.       and are numbered 1, 2, and 3, respectively.
1403    
1404       The fact that plain parentheses fulfil two functions is  not       The fact that plain parentheses fulfil two functions is  not
1405       always  helpful.  There are often times when a grouping sub-       always  helpful.  There are often times when a grouping sub-
# Line 1364  REPETITION Line 1470  REPETITION
1470       one that does not match the syntax of a quantifier, is taken       one that does not match the syntax of a quantifier, is taken
1471       as  a literal character. For example, {,6} is not a quantif-       as  a literal character. For example, {,6} is not a quantif-
1472       ier, but a literal string of four characters.       ier, but a literal string of four characters.
   
1473       The quantifier {0} is permitted, causing the  expression  to       The quantifier {0} is permitted, causing the  expression  to
1474       behave  as  if the previous item and the quantifier were not       behave  as  if the previous item and the quantifier were not
1475       present.       present.
# Line 1403  REPETITION Line 1508  REPETITION
1508    
1509         /* first comment */  not comment  /* second comment */         /* first comment */  not comment  /* second comment */
1510    
1511       fails, because it matches  the  entire  string  due  to  the       fails, because it matches the entire  string  owing  to  the
1512       greediness of the .*  item.       greediness of the .*  item.
1513    
1514       However, if a quantifier is followed by a question mark,  it       However, if a quantifier is followed by a question mark,  it
# Line 1469  REPETITION Line 1574  REPETITION
1574  BACK REFERENCES  BACK REFERENCES
1575       Outside a character class, a backslash followed by  a  digit       Outside a character class, a backslash followed by  a  digit
1576       greater  than  0  (and  possibly  further  digits) is a back       greater  than  0  (and  possibly  further  digits) is a back
1577    
1578    
1579    
1580    
1581    SunOS 5.8                 Last change:                         30
1582    
1583    
1584    
1585       reference to a capturing subpattern  earlier  (i.e.  to  its       reference to a capturing subpattern  earlier  (i.e.  to  its
1586       left)  in  the  pattern,  provided there have been that many       left)  in  the  pattern,  provided there have been that many
1587       previous capturing left parentheses.       previous capturing left parentheses.
# Line 1517  BACK REFERENCES Line 1630  BACK REFERENCES
1630       A back reference that occurs inside the parentheses to which       A back reference that occurs inside the parentheses to which
1631       it  refers  fails when the subpattern is first used, so, for       it  refers  fails when the subpattern is first used, so, for
1632       example, (a\1) never matches.  However, such references  can       example, (a\1) never matches.  However, such references  can
1633       be  useful  inside  repeated  subpatterns.  For example, the       be useful inside repeated subpatterns. For example, the pat-
1634       pattern       tern
1635    
1636         (a|b\1)+         (a|b\1)+
1637    
1638       matches any number of "a"s and also "aba", "ababaa" etc.  At       matches any number of "a"s and also "aba", "ababbaa" etc. At
1639       each iteration of the subpattern, the back reference matches       each iteration of the subpattern, the back reference matches
1640       the character string corresponding to  the  previous  itera-       the character string corresponding to  the  previous  itera-
1641       tion.  In  order  for this to work, the pattern must be such       tion.  In  order  for this to work, the pattern must be such
# Line 1778  CONDITIONAL SUBPATTERNS Line 1891  CONDITIONAL SUBPATTERNS
1891       There are two kinds of condition. If the  text  between  the       There are two kinds of condition. If the  text  between  the
1892       parentheses  consists of a sequence of digits, the condition       parentheses  consists of a sequence of digits, the condition
1893       is satisfied if the capturing subpattern of that number  has       is satisfied if the capturing subpattern of that number  has
1894       previously  matched.  Consider  the following pattern, which       previously  matched.  The  number must be greater than zero.
1895       contains non-significant white space to make it  more  read-       Consider  the  following  pattern,   which   contains   non-
1896       able (assume the PCRE_EXTENDED option) and to divide it into       significant white space to make it more readable (assume the
1897       three parts for ease of discussion:       PCRE_EXTENDED option) and to divide it into three parts  for
1898         ease of discussion:
1899    
1900         ( \( )?    [^()]+    (?(1) \) )         ( \( )?    [^()]+    (?(1) \) )
1901    
# Line 1966  PERFORMANCE Line 2080  PERFORMANCE
2080    
2081    
2082    
2083    UTF-8 SUPPORT
2084         Starting at release 3.3, PCRE has some support for character
2085         strings encoded in the UTF-8 format. This is incomplete, and
2086         is regarded as experimental. In order to use  it,  you  must
2087         configure PCRE to include UTF-8 support in the code, and, in
2088         addition, you must call pcre_compile()  with  the  PCRE_UTF8
2089         option flag. When you do this, both the pattern and any sub-
2090         ject strings that are matched  against  it  are  treated  as
2091         UTF-8  strings instead of just strings of bytes, but only in
2092         the cases that are mentioned below.
2093    
2094         If you compile PCRE with UTF-8 support, but do not use it at
2095         run  time,  the  library will be a bit bigger, but the addi-
2096         tional run time overhead is limited to testing the PCRE_UTF8
2097         flag in several places, so should not be very large.
2098    
2099         PCRE assumes that the strings  it  is  given  contain  valid
2100         UTF-8  codes. It does not diagnose invalid UTF-8 strings. If
2101         you pass invalid UTF-8 strings  to  PCRE,  the  results  are
2102         undefined.
2103    
2104         Running with PCRE_UTF8 set causes these changes in  the  way
2105         PCRE works:
2106    
2107         1. In a pattern, the  escape  sequence  \x{...},  where  the
2108         contents of the braces is a string of hexadecimal digits, is
2109         interpreted as a UTF-8 character whose code  number  is  the
2110         given   hexadecimal  number,  for  example:  \x{1234}.  This
2111         inserts from one to six  literal  bytes  into  the  pattern,
2112         using the UTF-8 encoding. If a non-hexadecimal digit appears
2113         between the braces, the item is not recognized.
2114    
2115         2. The original hexadecimal escape sequence, \xhh, generates
2116         a two-byte UTF-8 character if its value is greater than 127.
2117    
2118         3. Repeat quantifiers are NOT correctly handled if they fol-
2119         low  a  multibyte character. For example, \x{100}* and \xc3+
2120         do not work. If you want to repeat such characters, you must
2121         enclose  them  in  non-capturing  parentheses,  for  example
2122         (?:\x{100}), at present.
2123    
2124         4. The dot metacharacter matches one UTF-8 character instead
2125         of a single byte.
2126    
2127         5. Unlike literal UTF-8 characters,  the  dot  metacharacter
2128         followed  by  a  repeat quantifier does operate correctly on
2129         UTF-8 characters instead of single bytes.
2130    
2131         6. Although the \x{...} escape is permitted in  a  character
2132         class,  characters  whose values are greater than 255 cannot
2133         be included in a class.
2134    
2135         7. A class is matched against a UTF-8 character  instead  of
2136         just  a  single byte, but it can match only characters whose
2137         values are less than 256.  Characters  with  greater  values
2138         always fail to match a class.
2139    
2140         8. Repeated classes work correctly on multiple characters.
2141    
2142         9. Classes containing just a single character whose value is
2143         greater than 127 (but less than 256), for example, [\x80] or
2144         [^\x{93}], do not work because these are optimized into sin-
2145         gle  byte  matches.  In the first case, of course, the class
2146         brackets are just redundant.
2147    
2148         10. Lookbehind assertions move backwards in the subject by a
2149         fixed  number  of  characters  instead  of a fixed number of
2150         bytes. Simple cases have been tested to work correctly,  but
2151         there may be hidden gotchas herein.
2152    
2153         11. The character types such  as  \d  and  \w  do  not  work
2154         correctly  with  UTF-8  characters.  They continue to test a
2155         single byte.
2156    
2157         12. Anything not explicitly mentioned here continues to work
2158         in bytes rather than in characters.
2159    
2160         The following UTF-8 features of  Perl  5.6  are  not  imple-
2161         mented:
2162    
2163         1. The escape sequence \C to match a single byte.
2164    
2165         2. The use of Unicode tables and properties and escapes  \p,
2166         \P, and \X.
2167    
2168    
2169    
2170    SAMPLE PROGRAM
2171         The code below is a simple, complete demonstration  program,
2172         to  get  you started with using PCRE. This code is also sup-
2173         plied in the file pcredemo.c in the PCRE distribution.
2174    
2175         The program compiles the  regular  expression  that  is  its
2176         first argument, and matches it against the subject string in
2177         its second argument. No options are set, and default charac-
2178         ter  tables are used. If matching succeeds, the program out-
2179         puts the portion of the subject that matched, together  with
2180         the contents of any captured substrings.
2181    
2182         On a Unix system that has PCRE installed in /usr/local,  you
2183         can  compile  the demonstration program using a command like
2184         this:
2185    
2186           gcc   -o    pcredemo    pcredemo.c    -I/usr/local/include
2187         -L/usr/local/lib -lpcre
2188    
2189         Then you can run simple tests like this:
2190    
2191           ./pcredemo 'cat|dog' 'the cat sat on the mat'
2192    
2193         Note that there is a much more comprehensive  test  program,
2194         called  pcretest,  which  supports  many more facilities for
2195         testing regular expressions. The pcredemo  program  is  pro-
2196         vided as a simple coding example.
2197    
2198         On some operating systems (e.g.  Solaris)  you  may  get  an
2199         error like this when you try to run pcredemo:
2200    
2201           ld.so.1: a.out: fatal: libpcre.so.0: open failed: No  such
2202         file or directory
2203    
2204         This is caused by the way shared library  support  works  on
2205         those systems. You need to add
2206    
2207           -R/usr/local/lib
2208    
2209         to the compile command to get round this problem. Here's the
2210         code:
2211    
2212           #include <stdio.h>
2213           #include <string.h>
2214           #include <pcre.h>
2215    
2216           #define OVECCOUNT 30    /* should be a multiple of 3 */
2217    
2218           int main(int argc, char **argv)
2219           {
2220           pcre *re;
2221           const char *error;
2222           int erroffset;
2223           int ovector[OVECCOUNT];
2224           int rc, i;
2225    
2226           if (argc != 3)
2227             {
2228             printf("Two arguments required: a regex and a "
2229               "subject string\n");
2230             return 1;
2231             }
2232    
2233           /* Compile the regular expression in the first argument */
2234    
2235           re = pcre_compile(
2236             argv[1],     /* the pattern */
2237             0,           /* default options */
2238             &error,      /* for error message */
2239             &erroffset,  /* for error offset */
2240             NULL);       /* use default character tables */
2241    
2242           /* Compilation failed: print the error message and exit */
2243    
2244           if (re == NULL)
2245             {
2246             printf("PCRE compilation failed at offset %d: %s\n",
2247               erroffset, error);
2248             return 1;
2249             }
2250    
2251           /* Compilation succeeded: match the subject in the second
2252              argument */
2253    
2254           rc = pcre_exec(
2255             re,          /* the compiled pattern */
2256             NULL,        /* we didn't study the pattern */
2257             argv[2],     /* the subject string */
2258             (int)strlen(argv[2]), /* the length of the subject */
2259             0,           /* start at offset 0 in the subject */
2260             0,           /* default options */
2261             ovector,     /* vector for substring information */
2262             OVECCOUNT);  /* number of elements in the vector */
2263    
2264           /* Matching failed: handle error cases */
2265    
2266           if (rc < 0)
2267             {
2268             switch(rc)
2269               {
2270               case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
2271               /*
2272               Handle other special cases if you like
2273               */
2274               default: printf("Matching error %d\n", rc); break;
2275               }
2276             return 1;
2277             }
2278    
2279           /* Match succeeded */
2280    
2281           printf("Match succeeded\n");
2282    
2283           /* The output vector wasn't big enough */
2284    
2285           if (rc == 0)
2286             {
2287             rc = OVECCOUNT/3;
2288             printf("ovector only has room for %d captured "
2289               "substrings\n", rc - 1);
2290             }
2291    
2292           /* Show substrings stored in the output vector */
2293    
2294           for (i = 0; i < rc; i++)
2295             {
2296             char *substring_start = argv[2] + ovector[2*i];
2297             int substring_length = ovector[2*i+1] - ovector[2*i];
2298             printf("%2d: %.*s\n", i, substring_length,
2299               substring_start);
2300             }
2301    
2302           return 0;
2303           }
2304    
2305    
2306    
2307  AUTHOR  AUTHOR
2308       Philip Hazel <ph10@cam.ac.uk>       Philip Hazel <ph10@cam.ac.uk>
2309       University Computing Service,       University Computing Service,
# Line 1973  AUTHOR Line 2311  AUTHOR
2311       Cambridge CB2 3QG, England.       Cambridge CB2 3QG, England.
2312       Phone: +44 1223 334714       Phone: +44 1223 334714
2313    
2314       Last updated: 27 January 2000       Last updated: 15 August 2001
2315       Copyright (c) 1997-2000 University of Cambridge.       Copyright (c) 1997-2001 University of Cambridge.

Legend:
Removed from v.47  
changed lines
  Added in v.53

  ViewVC Help
Powered by ViewVC 1.1.5