/[pcre]/code/trunk/doc/html/pcredemo.html
ViewVC logotype

Diff of /code/trunk/doc/html/pcredemo.html

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 566 by ph10, Wed Jan 6 10:26:55 2010 UTC revision 567 by ph10, Sat Nov 6 17:10:00 2010 UTC
# Line 67  const char *error; Line 67  const char *error;
67  char *pattern;  char *pattern;
68  char *subject;  char *subject;
69  unsigned char *name_table;  unsigned char *name_table;
70    unsigned int option_bits;
71  int erroffset;  int erroffset;
72  int find_all;  int find_all;
73    int crlf_is_newline;
74  int namecount;  int namecount;
75  int name_entry_size;  int name_entry_size;
76  int ovector[OVECCOUNT];  int ovector[OVECCOUNT];
77  int subject_length;  int subject_length;
78  int rc, i;  int rc, i;
79    int utf8;
80    
81    
82  /**************************************************************************  /**************************************************************************
# Line 255  if (namecount <= 0) printf("No named Line 258  if (namecount <= 0) printf("No named
258  * subject is not a valid match; other possibilities must be tried. The   *  * subject is not a valid match; other possibilities must be tried. The   *
259  * second flag restricts PCRE to one match attempt at the initial string  *  * second flag restricts PCRE to one match attempt at the initial string  *
260  * position. If this match succeeds, an alternative to the empty string   *  * position. If this match succeeds, an alternative to the empty string   *
261  * match has been found, and we can proceed round the loop.               *  * match has been found, and we can print it and proceed round the loop,  *
262    * advancing by the length of whatever was found. If this match does not  *
263    * succeed, we still stay in the loop, advancing by just one character.   *
264    * In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be *
265    * more than one byte.                                                    *
266    *                                                                        *
267    * However, there is a complication concerned with newlines. When the     *
268    * newline convention is such that CRLF is a valid newline, we must       *
269    * advance by two characters rather than one. The newline convention can  *
270    * be set in the regex by (*CR), etc.; if not, we must find the default.  *
271  *************************************************************************/  *************************************************************************/
272    
273  if (!find_all)  if (!find_all)     /* Check for -g */
274    {    {
275    pcre_free(re);   /* Release the memory used for the compiled pattern */    pcre_free(re);   /* Release the memory used for the compiled pattern */
276    return 0;        /* Finish unless -g was given */    return 0;        /* Finish unless -g was given */
277    }    }
278    
279    /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
280    sequence. First, find the options with which the regex was compiled; extract
281    the UTF-8 state, and mask off all but the newline options. */
282    
283    (void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits);
284    utf8 = option_bits & PCRE_UTF8;
285    option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF|
286                   PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF;
287    
288    /* If no newline options were set, find the default newline convention from the
289    build configuration. */
290    
291    if (option_bits == 0)
292      {
293      int d;
294      (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
295      /* Note that these values are always the ASCII ones, even in
296      EBCDIC environments. CR = 13, NL = 10. */
297      option_bits = (d == 13)? PCRE_NEWLINE_CR :
298              (d == 10)? PCRE_NEWLINE_LF :
299              (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF :
300              (d == -2)? PCRE_NEWLINE_ANYCRLF :
301              (d == -1)? PCRE_NEWLINE_ANY : 0;
302      }
303    
304    /* See if CRLF is a valid newline sequence. */
305    
306    crlf_is_newline =
307         option_bits == PCRE_NEWLINE_ANY ||
308         option_bits == PCRE_NEWLINE_CRLF ||
309         option_bits == PCRE_NEWLINE_ANYCRLF;
310    
311  /* Loop for second and subsequent matches */  /* Loop for second and subsequent matches */
312    
313  for (;;)  for (;;)
# Line 297  for (;;) Line 341  for (;;)
341    is zero, it just means we have found all possible matches, so the loop ends.    is zero, it just means we have found all possible matches, so the loop ends.
342    Otherwise, it means we have failed to find a non-empty-string match at a    Otherwise, it means we have failed to find a non-empty-string match at a
343    point where there was a previous empty-string match. In this case, we do what    point where there was a previous empty-string match. In this case, we do what
344    Perl does: advance the matching position by one, and continue. We do this by    Perl does: advance the matching position by one character, and continue. We
345    setting the "end of previous match" offset, because that is picked up at the    do this by setting the "end of previous match" offset, because that is picked
346    top of the loop as the point at which to start again. */    up at the top of the loop as the point at which to start again.
347    
348      There are two complications: (a) When CRLF is a valid newline sequence, and
349      the current position is just before it, advance by an extra byte. (b)
350      Otherwise we must ensure that we skip an entire UTF-8 character if we are in
351      UTF-8 mode. */
352    
353    if (rc == PCRE_ERROR_NOMATCH)    if (rc == PCRE_ERROR_NOMATCH)
354      {      {
355      if (options == 0) break;      if (options == 0) break;                    /* All matches found */
356      ovector[1] = start_offset + 1;      ovector[1] = start_offset + 1;              /* Advance one byte */
357        if (crlf_is_newline &&                      /* If CRLF is newline & */
358            start_offset < subject_length - 1 &&    /* we are at CRLF, */
359            subject[start_offset] == '\r' &&
360            subject[start_offset + 1] == '\n')
361          ovector[1] += 1;                          /* Advance by one more. */
362        else if (utf8)                              /* Otherwise, ensure we */
363          {                                         /* advance a whole UTF-8 */
364          while (ovector[1] < subject_length)       /* character. */
365            {
366            if ((subject[ovector[1]] & 0xc0) != 0x80) break;
367            ovector[1] += 1;
368            }
369          }
370      continue;    /* Go round the loop again */      continue;    /* Go round the loop again */
371      }      }
372    

Legend:
Removed from v.566  
changed lines
  Added in v.567

  ViewVC Help
Powered by ViewVC 1.1.5