/[pcre]/code/trunk/pcredemo.c
ViewVC logotype

Diff of /code/trunk/pcredemo.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 53 by nigel, Sat Feb 24 21:39:42 2007 UTC revision 945 by ph10, Wed Feb 29 09:37:15 2012 UTC
# Line 1  Line 1 
1    /*************************************************
2    *           PCRE DEMONSTRATION PROGRAM           *
3    *************************************************/
4    
5    /* This is a demonstration program to illustrate the most straightforward ways
6    of calling the PCRE regular expression library from a C program. See the
7    pcresample documentation for a short discussion ("man pcresample" if you have
8    the PCRE man pages installed).
9    
10    In Unix-like environments, if PCRE is installed in your standard system
11    libraries, you should be able to compile this program using this command:
12    
13    gcc -Wall pcredemo.c -lpcre -o pcredemo
14    
15    If PCRE is not installed in a standard place, it is likely to be installed with
16    support for the pkg-config mechanism. If you have pkg-config, you can compile
17    this program using this command:
18    
19    gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo
20    
21    If you do not have pkg-config, you may have to use this:
22    
23    gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
24      -R/usr/local/lib -lpcre -o pcredemo
25    
26    Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
27    library files for PCRE are installed on your system. Only some operating
28    systems (e.g. Solaris) use the -R option.
29    
30    Building under Windows:
31    
32    If you want to statically link this program against a non-dll .a file, you must
33    define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
34    pcre_free() exported functions will be declared __declspec(dllimport), with
35    unwanted results. So in this environment, uncomment the following line. */
36    
37    /* #define PCRE_STATIC */
38    
39  #include <stdio.h>  #include <stdio.h>
40  #include <string.h>  #include <string.h>
41  #include <pcre.h>  #include <pcre.h>
42    
 /* Compile thuswise:  
   gcc -Wall pcredemo.c -I/opt/local/include -L/opt/local/lib \  
     -R/opt/local/lib -lpcre  
 */  
   
43  #define OVECCOUNT 30    /* should be a multiple of 3 */  #define OVECCOUNT 30    /* should be a multiple of 3 */
44    
45    
46  int main(int argc, char **argv)  int main(int argc, char **argv)
47  {  {
48  pcre *re;  pcre *re;
49  const char *error;  const char *error;
50    char *pattern;
51    char *subject;
52    unsigned char *name_table;
53    unsigned int option_bits;
54  int erroffset;  int erroffset;
55    int find_all;
56    int crlf_is_newline;
57    int namecount;
58    int name_entry_size;
59  int ovector[OVECCOUNT];  int ovector[OVECCOUNT];
60    int subject_length;
61  int rc, i;  int rc, i;
62    int utf8;
63    
64    
65    /**************************************************************************
66    * First, sort out the command line. There is only one possible option at  *
67    * the moment, "-g" to request repeated matching to find all occurrences,  *
68    * like Perl's /g option. We set the variable find_all to a non-zero value *
69    * if the -g option is present. Apart from that, there must be exactly two *
70    * arguments.                                                              *
71    **************************************************************************/
72    
73  if (argc != 3)  find_all = 0;
74    for (i = 1; i < argc; i++)
75      {
76      if (strcmp(argv[i], "-g") == 0) find_all = 1;
77        else break;
78      }
79    
80    /* After the options, we require exactly two arguments, which are the pattern,
81    and the subject string. */
82    
83    if (argc - i != 2)
84    {    {
85    printf("Two arguments required: a regex and a subject string\n");    printf("Two arguments required: a regex and a subject string\n");
86    return 1;    return 1;
87    }    }
88    
89  /* Compile the regular expression in the first argument */  pattern = argv[i];
90    subject = argv[i+1];
91    subject_length = (int)strlen(subject);
92    
93    
94    /*************************************************************************
95    * Now we are going to compile the regular expression pattern, and handle *
96    * any errors that are detected.                                          *
97    *************************************************************************/
98    
99  re = pcre_compile(  re = pcre_compile(
100    argv[1],              /* the pattern */    pattern,              /* the pattern */
101    0,                    /* default options */    0,                    /* default options */
102    &error,               /* for error message */    &error,               /* for error message */
103    &erroffset,           /* for error offset */    &erroffset,           /* for error offset */
# Line 40  if (re == NULL) Line 111  if (re == NULL)
111    return 1;    return 1;
112    }    }
113    
114  /* Compilation succeeded: match the subject in the second argument */  
115    /*************************************************************************
116    * If the compilation succeeded, we call PCRE again, in order to do a     *
117    * pattern match against the subject string. This does just ONE match. If *
118    * further matching is needed, it will be done below.                     *
119    *************************************************************************/
120    
121  rc = pcre_exec(  rc = pcre_exec(
122    re,                   /* the compiled pattern */    re,                   /* the compiled pattern */
123    NULL,                 /* no extra data - we didn't study the pattern */    NULL,                 /* no extra data - we didn't study the pattern */
124    argv[2],              /* the subject string */    subject,              /* the subject string */
125    (int)strlen(argv[2]), /* the length of the subject */    subject_length,       /* the length of the subject */
126    0,                    /* start at offset 0 in the subject */    0,                    /* start at offset 0 in the subject */
127    0,                    /* default options */    0,                    /* default options */
128    ovector,              /* output vector for substring information */    ovector,              /* output vector for substring information */
# Line 64  if (rc < 0) Line 140  if (rc < 0)
140      */      */
141      default: printf("Matching error %d\n", rc); break;      default: printf("Matching error %d\n", rc); break;
142      }      }
143      pcre_free(re);     /* Release memory used for the compiled pattern */
144    return 1;    return 1;
145    }    }
146    
147  /* Match succeeded */  /* Match succeeded */
148    
149  printf("Match succeeded\n");  printf("\nMatch succeeded at offset %d\n", ovector[0]);
150    
151    
152    /*************************************************************************
153    * We have found the first match within the subject string. If the output *
154    * vector wasn't big enough, say so. Then output any substrings that were *
155    * captured.                                                              *
156    *************************************************************************/
157    
158  /* The output vector wasn't big enough */  /* The output vector wasn't big enough */
159    
# Line 79  if (rc == 0) Line 163  if (rc == 0)
163    printf("ovector only has room for %d captured substrings\n", rc - 1);    printf("ovector only has room for %d captured substrings\n", rc - 1);
164    }    }
165    
166  /* Show substrings stored in the output vector */  /* Show substrings stored in the output vector by number. Obviously, in a real
167    application you might want to do things other than print them. */
168    
169  for (i = 0; i < rc; i++)  for (i = 0; i < rc; i++)
170    {    {
171    char *substring_start = argv[2] + ovector[2*i];    char *substring_start = subject + ovector[2*i];
172    int substring_length = ovector[2*i+1] - ovector[2*i];    int substring_length = ovector[2*i+1] - ovector[2*i];
173    printf("%2d: %.*s\n", i, substring_length, substring_start);    printf("%2d: %.*s\n", i, substring_length, substring_start);
174    }    }
175    
176    
177    /**************************************************************************
178    * That concludes the basic part of this demonstration program. We have    *
179    * compiled a pattern, and performed a single match. The code that follows *
180    * shows first how to access named substrings, and then how to code for    *
181    * repeated matches on the same subject.                                   *
182    **************************************************************************/
183    
184    /* See if there are any named substrings, and if so, show them by name. First
185    we have to extract the count of named parentheses from the pattern. */
186    
187    (void)pcre_fullinfo(
188      re,                   /* the compiled pattern */
189      NULL,                 /* no extra data - we didn't study the pattern */
190      PCRE_INFO_NAMECOUNT,  /* number of named substrings */
191      &namecount);          /* where to put the answer */
192    
193    if (namecount <= 0) printf("No named substrings\n"); else
194      {
195      unsigned char *tabptr;
196      printf("Named substrings\n");
197    
198      /* Before we can access the substrings, we must extract the table for
199      translating names to numbers, and the size of each entry in the table. */
200    
201      (void)pcre_fullinfo(
202        re,                       /* the compiled pattern */
203        NULL,                     /* no extra data - we didn't study the pattern */
204        PCRE_INFO_NAMETABLE,      /* address of the table */
205        &name_table);             /* where to put the answer */
206    
207      (void)pcre_fullinfo(
208        re,                       /* the compiled pattern */
209        NULL,                     /* no extra data - we didn't study the pattern */
210        PCRE_INFO_NAMEENTRYSIZE,  /* size of each entry in the table */
211        &name_entry_size);        /* where to put the answer */
212    
213      /* Now we can scan the table and, for each entry, print the number, the name,
214      and the substring itself. */
215    
216      tabptr = name_table;
217      for (i = 0; i < namecount; i++)
218        {
219        int n = (tabptr[0] << 8) | tabptr[1];
220        printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
221          ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
222        tabptr += name_entry_size;
223        }
224      }
225    
226    
227    /*************************************************************************
228    * If the "-g" option was given on the command line, we want to continue  *
229    * to search for additional matches in the subject string, in a similar   *
230    * way to the /g option in Perl. This turns out to be trickier than you   *
231    * might think because of the possibility of matching an empty string.    *
232    * What happens is as follows:                                            *
233    *                                                                        *
234    * If the previous match was NOT for an empty string, we can just start   *
235    * the next match at the end of the previous one.                         *
236    *                                                                        *
237    * If the previous match WAS for an empty string, we can't do that, as it *
238    * would lead to an infinite loop. Instead, a special call of pcre_exec() *
239    * is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set.    *
240    * The first of these tells PCRE that an empty string at the start of the *
241    * subject is not a valid match; other possibilities must be tried. The   *
242    * second flag restricts PCRE to one match attempt at the initial string  *
243    * position. If this match succeeds, an alternative to the empty string   *
244    * match has been found, and we can print it and proceed round the loop,  *
245    * advancing by the length of whatever was found. If this match does not  *
246    * succeed, we still stay in the loop, advancing by just one character.   *
247    * In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
248    * more than one byte.                                                    *
249    *                                                                        *
250    * However, there is a complication concerned with newlines. When the     *
251    * newline convention is such that CRLF is a valid newline, we must       *
252    * advance by two characters rather than one. The newline convention can  *
253    * be set in the regex by (*CR), etc.; if not, we must find the default.  *
254    *************************************************************************/
255    
256    if (!find_all)     /* Check for -g */
257      {
258      pcre_free(re);   /* Release the memory used for the compiled pattern */
259      return 0;        /* Finish unless -g was given */
260      }
261    
262    /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
263    sequence. First, find the options with which the regex was compiled; extract
264    the UTF-8 state, and mask off all but the newline options. */
265    
266    (void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
267    utf8 = option_bits & PCRE_UTF8;
268    option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
269                   PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
270    
271    /* If no newline options were set, find the default newline convention from the
272    build configuration. */
273    
274    if (option_bits == 0)
275      {
276      int d;
277      (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
278      /* Note that these values are always the ASCII ones, even in
279      EBCDIC environments. CR = 13, NL = 10. */
280      option_bits = (d == 13)? PCRE_NEWLINE_CR :
281              (d == 10)? PCRE_NEWLINE_LF :
282              (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
283              (d == -2)? PCRE_NEWLINE_ANYCRLF :
284              (d == -1)? PCRE_NEWLINE_ANY : 0;
285      }
286    
287    /* See if CRLF is a valid newline sequence. */
288    
289    crlf_is_newline =
290         option_bits == PCRE_NEWLINE_ANY ||
291         option_bits == PCRE_NEWLINE_CRLF ||
292         option_bits == PCRE_NEWLINE_ANYCRLF;
293    
294    /* Loop for second and subsequent matches */
295    
296    for (;;)
297      {
298      int options = 0;                 /* Normally no options */
299      int start_offset = ovector[1];   /* Start at end of previous match */
300    
301      /* If the previous match was for an empty string, we are finished if we are
302      at the end of the subject. Otherwise, arrange to run another match at the
303      same point to see if a non-empty match can be found. */
304    
305      if (ovector[0] == ovector[1])
306        {
307        if (ovector[0] == subject_length) break;
308        options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
309        }
310    
311      /* Run the next matching operation */
312    
313      rc = pcre_exec(
314        re,                   /* the compiled pattern */
315        NULL,                 /* no extra data - we didn't study the pattern */
316        subject,              /* the subject string */
317        subject_length,       /* the length of the subject */
318        start_offset,         /* starting offset in the subject */
319        options,              /* options */
320        ovector,              /* output vector for substring information */
321        OVECCOUNT);           /* number of elements in the output vector */
322    
323      /* This time, a result of NOMATCH isn't an error. If the value in "options"
324      is zero, it just means we have found all possible matches, so the loop ends.
325      Otherwise, it means we have failed to find a non-empty-string match at a
326      point where there was a previous empty-string match. In this case, we do what
327      Perl does: advance the matching position by one character, and continue. We
328      do this by setting the "end of previous match" offset, because that is picked
329      up at the top of the loop as the point at which to start again.
330    
331      There are two complications: (a) When CRLF is a valid newline sequence, and
332      the current position is just before it, advance by an extra byte. (b)
333      Otherwise we must ensure that we skip an entire UTF-8 character if we are in
334      UTF-8 mode. */
335    
336      if (rc == PCRE_ERROR_NOMATCH)
337        {
338        if (options == 0) break;                    /* All matches found */
339        ovector[1] = start_offset + 1;              /* Advance one byte */
340        if (crlf_is_newline &&                      /* If CRLF is newline & */
341            start_offset < subject_length - 1 &&    /* we are at CRLF, */
342            subject[start_offset] == '\r' &&
343            subject[start_offset + 1] == '\n')
344          ovector[1] += 1;                          /* Advance by one more. */
345        else if (utf8)                              /* Otherwise, ensure we */
346          {                                         /* advance a whole UTF-8 */
347          while (ovector[1] < subject_length)       /* character. */
348            {
349            if ((subject[ovector[1]] & 0xc0) != 0x80) break;
350            ovector[1] += 1;
351            }
352          }
353        continue;    /* Go round the loop again */
354        }
355    
356      /* Other matching errors are not recoverable. */
357    
358      if (rc < 0)
359        {
360        printf("Matching error %d\n", rc);
361        pcre_free(re);    /* Release memory used for the compiled pattern */
362        return 1;
363        }
364    
365      /* Match succeeded */
366    
367      printf("\nMatch succeeded again at offset %d\n", ovector[0]);
368    
369      /* The match succeeded, but the output vector wasn't big enough. */
370    
371      if (rc == 0)
372        {
373        rc = OVECCOUNT/3;
374        printf("ovector only has room for %d captured substrings\n", rc - 1);
375        }
376    
377      /* As before, show substrings stored in the output vector by number, and then
378      also any named substrings. */
379    
380      for (i = 0; i < rc; i++)
381        {
382        char *substring_start = subject + ovector[2*i];
383        int substring_length = ovector[2*i+1] - ovector[2*i];
384        printf("%2d: %.*s\n", i, substring_length, substring_start);
385        }
386    
387      if (namecount <= 0) printf("No named substrings\n"); else
388        {
389        unsigned char *tabptr = name_table;
390        printf("Named substrings\n");
391        for (i = 0; i < namecount; i++)
392          {
393          int n = (tabptr[0] << 8) | tabptr[1];
394          printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
395            ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
396          tabptr += name_entry_size;
397          }
398        }
399      }      /* End of loop to find second and subsequent matches */
400    
401    printf("\n");
402    pcre_free(re);       /* Release memory used for the compiled pattern */
403  return 0;  return 0;
404  }  }
405    
406    /* End of pcredemo.c */

Legend:
Removed from v.53  
changed lines
  Added in v.945

  ViewVC Help
Powered by ViewVC 1.1.5