/[pcre]/code/trunk/pcredemo.c
ViewVC logotype

Diff of /code/trunk/pcredemo.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 75 by nigel, Sat Feb 24 21:40:37 2007 UTC revision 566 by ph10, Wed Nov 3 18:32:55 2010 UTC
# Line 4  Line 4 
4    
5  /* This is a demonstration program to illustrate the most straightforward ways  /* This is a demonstration program to illustrate the most straightforward ways
6  of calling the PCRE regular expression library from a C program. See the  of calling the PCRE regular expression library from a C program. See the
7  pcresample documentation for a short discussion.  pcresample documentation for a short discussion ("man pcresample" if you have
8    the PCRE man pages installed).
9    
10  Compile thuswise:  In Unix-like environments, if PCRE is installed in your standard system
11    gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \  libraries, you should be able to compile this program using this command:
12      -R/usr/local/lib -lpcre  
13    gcc -Wall pcredemo.c -lpcre -o pcredemo
14    
15    If PCRE is not installed in a standard place, it is likely to be installed with
16    support for the pkg-config mechanism. If you have pkg-config, you can compile
17    this program using this command:
18    
19    gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo
20    
21    If you do not have pkg-config, you may have to use this:
22    
23    gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
24      -R/usr/local/lib -lpcre -o pcredemo
25    
26  Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and  Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
27  library files for PCRE are installed on your system. Only some operating  library files for PCRE are installed on your system. Only some operating
28  systems (e.g. Solaris) use the -R option.  systems (e.g. Solaris) use the -R option.
 */  
29    
30    Building under Windows:
31    
32    If you want to statically link this program against a non-dll .a file, you must
33    define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
34    pcre_free() exported functions will be declared __declspec(dllimport), with
35    unwanted results. So in this environment, uncomment the following line. */
36    
37    /* #define PCRE_STATIC */
38    
39  #include <stdio.h>  #include <stdio.h>
40  #include <string.h>  #include <string.h>
# Line 30  const char *error; Line 50  const char *error;
50  char *pattern;  char *pattern;
51  char *subject;  char *subject;
52  unsigned char *name_table;  unsigned char *name_table;
53    unsigned int option_bits;
54  int erroffset;  int erroffset;
55  int find_all;  int find_all;
56    int crlf_is_newline;
57  int namecount;  int namecount;
58  int name_entry_size;  int name_entry_size;
59  int ovector[OVECCOUNT];  int ovector[OVECCOUNT];
60  int subject_length;  int subject_length;
61  int rc, i;  int rc, i;
62    int utf8;
63    
64    
65  /**************************************************************************  /**************************************************************************
# Line 117  if (rc < 0) Line 140  if (rc < 0)
140      */      */
141      default: printf("Matching error %d\n", rc); break;      default: printf("Matching error %d\n", rc); break;
142      }      }
143    free(re);     /* Release memory used for the compiled pattern */    pcre_free(re);     /* Release memory used for the compiled pattern */
144    return 1;    return 1;
145    }    }
146    
# Line 128  printf("\nMatch succeeded at offset %d\n Line 151  printf("\nMatch succeeded at offset %d\n
151    
152  /*************************************************************************  /*************************************************************************
153  * We have found the first match within the subject string. If the output *  * We have found the first match within the subject string. If the output *
154  * vector wasn't big enough, set its size to the maximum. Then output any *  * vector wasn't big enough, say so. Then output any substrings that were *
155  * substrings that were captured.                                         *  * captured.                                                              *
156  *************************************************************************/  *************************************************************************/
157    
158  /* The output vector wasn't big enough */  /* The output vector wasn't big enough */
# Line 154  for (i = 0; i < rc; i++) Line 177  for (i = 0; i < rc; i++)
177  /**************************************************************************  /**************************************************************************
178  * That concludes the basic part of this demonstration program. We have    *  * That concludes the basic part of this demonstration program. We have    *
179  * compiled a pattern, and performed a single match. The code that follows *  * compiled a pattern, and performed a single match. The code that follows *
180  * first shows how to access named substrings, and then how to code for    *  * shows first how to access named substrings, and then how to code for    *
181  * repeated matches on the same subject.                                   *  * repeated matches on the same subject.                                   *
182  **************************************************************************/  **************************************************************************/
183    
# Line 213  if (namecount <= 0) printf("No named sub Line 236  if (namecount <= 0) printf("No named sub
236  *                                                                        *  *                                                                        *
237  * If the previous match WAS for an empty string, we can't do that, as it *  * If the previous match WAS for an empty string, we can't do that, as it *
238  * would lead to an infinite loop. Instead, a special call of pcre_exec() *  * would lead to an infinite loop. Instead, a special call of pcre_exec() *
239  * is made with the PCRE_NOTEMPTY and PCRE_ANCHORED flags set. The first  *  * is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set.    *
240  * of these tells PCRE that an empty string is not a valid match; other   *  * The first of these tells PCRE that an empty string at the start of the *
241  * possibilities must be tried. The second flag restricts PCRE to one     *  * subject is not a valid match; other possibilities must be tried. The   *
242  * match attempt at the initial string position. If this match succeeds,  *  * second flag restricts PCRE to one match attempt at the initial string  *
243  * an alternative to the empty string match has been found, and we can    *  * position. If this match succeeds, an alternative to the empty string   *
244  * proceed round the loop.                                                *  * match has been found, and we can print it and proceed round the loop,  *
245    * advancing by the length of whatever was found. If this match does not  *
246    * succeed, we still stay in the loop, advancing by just one character.   *
247    * In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
248    * more than one byte.                                                    *
249    *                                                                        *
250    * However, there is a complication concerned with newlines. When the     *
251    * newline convention is such that CRLF is a valid newline, we must       *
252    * advance by two characters rather than one. The newline convention can  *
253    * be set in the regex by (*CR), etc.; if not, we must find the default.  *
254  *************************************************************************/  *************************************************************************/
255    
256  if (!find_all)  if (!find_all)     /* Check for -g */
257    {    {
258    free(re);   /* Release the memory used for the compiled pattern */    pcre_free(re);   /* Release the memory used for the compiled pattern */
259    return 0;   /* Finish unless -g was given */    return 0;        /* Finish unless -g was given */
260    }    }
261    
262    /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
263    sequence. First, find the options with which the regex was compiled; extract
264    the UTF-8 state, and mask off all but the newline options. */
265    
266    (void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
267    utf8 = option_bits & PCRE_UTF8;
268    option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
269                   PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
270    
271    /* If no newline options were set, find the default newline convention from the
272    build configuration. */
273    
274    if (option_bits == 0)
275      {
276      int d;
277      (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
278      /* Note that these values are always the ASCII ones, even in
279      EBCDIC environments. CR = 13, NL = 10. */
280      option_bits = (d == 13)? PCRE_NEWLINE_CR :
281              (d == 10)? PCRE_NEWLINE_LF :
282              (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
283              (d == -2)? PCRE_NEWLINE_ANYCRLF :
284              (d == -1)? PCRE_NEWLINE_ANY : 0;
285      }
286    
287    /* See if CRLF is a valid newline sequence. */
288    
289    crlf_is_newline =
290         option_bits == PCRE_NEWLINE_ANY ||
291         option_bits == PCRE_NEWLINE_CRLF ||
292         option_bits == PCRE_NEWLINE_ANYCRLF;
293    
294  /* Loop for second and subsequent matches */  /* Loop for second and subsequent matches */
295    
296  for (;;)  for (;;)
# Line 241  for (;;) Line 305  for (;;)
305    if (ovector[0] == ovector[1])    if (ovector[0] == ovector[1])
306      {      {
307      if (ovector[0] == subject_length) break;      if (ovector[0] == subject_length) break;
308      options = PCRE_NOTEMPTY | PCRE_ANCHORED;      options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
309      }      }
310    
311    /* Run the next matching operation */    /* Run the next matching operation */
# Line 260  for (;;) Line 324  for (;;)
324    is zero, it just means we have found all possible matches, so the loop ends.    is zero, it just means we have found all possible matches, so the loop ends.
325    Otherwise, it means we have failed to find a non-empty-string match at a    Otherwise, it means we have failed to find a non-empty-string match at a
326    point where there was a previous empty-string match. In this case, we do what    point where there was a previous empty-string match. In this case, we do what
327    Perl does: advance the matching position by one, and continue. We do this by    Perl does: advance the matching position by one character, and continue. We
328    setting the "end of previous match" offset, because that is picked up at the    do this by setting the "end of previous match" offset, because that is picked
329    top of the loop as the point at which to start again. */    up at the top of the loop as the point at which to start again.
330    
331      There are two complications: (a) When CRLF is a valid newline sequence, and
332      the current position is just before it, advance by an extra byte. (b)
333      Otherwise we must ensure that we skip an entire UTF-8 character if we are in
334      UTF-8 mode. */
335    
336    if (rc == PCRE_ERROR_NOMATCH)    if (rc == PCRE_ERROR_NOMATCH)
337      {      {
338      if (options == 0) break;      if (options == 0) break;                    /* All matches found */
339      ovector[1] = start_offset + 1;      ovector[1] = start_offset + 1;              /* Advance one byte */
340        if (crlf_is_newline &&                      /* If CRLF is newline & */
341            start_offset < subject_length - 1 &&    /* we are at CRLF, */
342            subject[start_offset] == '\r' &&
343            subject[start_offset + 1] == '\n')
344          ovector[1] += 1;                          /* Advance by one more. */
345        else if (utf8)                              /* Otherwise, ensure we */
346          {                                         /* advance a whole UTF-8 */
347          while (ovector[1] < subject_length)       /* character. */
348            {
349            if ((subject[ovector[1]] & 0xc0) != 0x80) break;
350            ovector[1] += 1;
351            }
352          }
353      continue;    /* Go round the loop again */      continue;    /* Go round the loop again */
354      }      }
355    
# Line 276  for (;;) Line 358  for (;;)
358    if (rc < 0)    if (rc < 0)
359      {      {
360      printf("Matching error %d\n", rc);      printf("Matching error %d\n", rc);
361      free(re);    /* Release memory used for the compiled pattern */      pcre_free(re);    /* Release memory used for the compiled pattern */
362      return 1;      return 1;
363      }      }
364    
# Line 317  for (;;) Line 399  for (;;)
399    }      /* End of loop to find second and subsequent matches */    }      /* End of loop to find second and subsequent matches */
400    
401  printf("\n");  printf("\n");
402  free(re);       /* Release memory used for the compiled pattern */  pcre_free(re);       /* Release memory used for the compiled pattern */
403  return 0;  return 0;
404  }  }
405    

Legend:
Removed from v.75  
changed lines
  Added in v.566

  ViewVC Help
Powered by ViewVC 1.1.5