/[pcre]/code/trunk/pcretest.c
ViewVC logotype

Diff of /code/trunk/pcretest.c

Parent Directory | Revision Log | View Patch

revision 142 by ph10, Fri Mar 30 15:55:18 2007 UTC revision 149 by ph10, Mon Apr 16 15:28:08 2007 UTC
# Line 67  input mode under Windows. */ Line 67  input mode under Windows. */
67  #endif  #endif
68    
69    
70  #define PCRE_SPY        /* For Win32 build, import data, not export */  /* We have to include pcre_internal.h because we need the internal info for
71    displaying the results of pcre_study() and we also need to know about the
72  /* We include pcre_internal.h because we need the internal info for displaying  internal macros, structures, and other internal data values; pcretest has
73  the results of pcre_study() and we also need to know about the internal  "inside information" compared to a program that strictly follows the PCRE API.
74  macros, structures, and other internal data values; pcretest has "inside  
75  information" compared to a program that strictly follows the PCRE API. */  Although pcre_internal.h does itself include pcre.h, we explicitly include it
76    here before pcre_internal.h so that the PCRE_EXP_xxx macros get set
77    appropriately for an application, not for building PCRE. */
78    
79    #include "pcre.h"
80  #include "pcre_internal.h"  #include "pcre_internal.h"
81    
82  /* We need access to the data tables that PCRE uses. So as not to have to keep  /* We need access to the data tables that PCRE uses. So as not to have to keep
# Line 660  return count; Line 663  return count;
663  *************************************************/  *************************************************/
664    
665  /* This is used both at compile and run-time to check for <xxx> escapes, where  /* This is used both at compile and run-time to check for <xxx> escapes, where
666  xxx is LF, CR, CRLF, or ANY. Print a message and return 0 if there is no match.  xxx is LF, CR, CRLF, ANYCRLF, or ANY. Print a message and return 0 if there is
667    no match.
668    
669  Arguments:  Arguments:
670    p           points after the leading '<'    p           points after the leading '<'
# Line 675  check_newline(uschar *p, FILE *f) Line 679  check_newline(uschar *p, FILE *f)
679  if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR;  if (strncmp((char *)p, "cr>", 3) == 0) return PCRE_NEWLINE_CR;
680  if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF;  if (strncmp((char *)p, "lf>", 3) == 0) return PCRE_NEWLINE_LF;
681  if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;  if (strncmp((char *)p, "crlf>", 5) == 0) return PCRE_NEWLINE_CRLF;
682    if (strncmp((char *)p, "anycrlf>", 8) == 0) return PCRE_NEWLINE_ANYCRLF;
683  if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY;  if (strncmp((char *)p, "any>", 4) == 0) return PCRE_NEWLINE_ANY;
684  fprintf(f, "Unknown newline type at: <%s\n", p);  fprintf(f, "Unknown newline type at: <%s\n", p);
685  return 0;  return 0;
# Line 847  while (argc > 1 && argv[op][0] == '-') Line 852  while (argc > 1 && argv[op][0] == '-')
852      (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);      (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
853      printf("  Newline sequence is %s\n", (rc == '\r')? "CR" :      printf("  Newline sequence is %s\n", (rc == '\r')? "CR" :
854        (rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" :        (rc == '\n')? "LF" : (rc == ('\r'<<8 | '\n'))? "CRLF" :
855          (rc == -2)? "ANYCRLF" :
856        (rc == -1)? "ANY" : "???");        (rc == -1)? "ANY" : "???");
857      (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);      (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
858      printf("  Internal link size = %d\n", rc);      printf("  Internal link size = %d\n", rc);
# Line 1442  while (!done) Line 1448  while (!done)
1448          fprintf(outfile, "Forced newline sequence: CRLF\n");          fprintf(outfile, "Forced newline sequence: CRLF\n");
1449          break;          break;
1450    
1451            case PCRE_NEWLINE_ANYCRLF:
1452            fprintf(outfile, "Forced newline sequence: ANYCRLF\n");
1453            break;
1454    
1455          case PCRE_NEWLINE_ANY:          case PCRE_NEWLINE_ANY:
1456          fprintf(outfile, "Forced newline sequence: ANY\n");          fprintf(outfile, "Forced newline sequence: ANY\n");
1457          break;          break;
# Line 1591  while (!done) Line 1601  while (!done)
1601    for (;;)    for (;;)
1602      {      {
1603      uschar *q;      uschar *q;
1604      uschar *bptr = dbuffer;      uschar *bptr;
1605      int *use_offsets = offsets;      int *use_offsets = offsets;
1606      int use_size_offsets = size_offsets;      int use_size_offsets = size_offsets;
1607      int callout_data = 0;      int callout_data = 0;
# Line 1647  while (!done) Line 1657  while (!done)
1657      p = buffer;      p = buffer;
1658      while (isspace(*p)) p++;      while (isspace(*p)) p++;
1659    
1660      q = dbuffer;      bptr = q = dbuffer;
1661      while ((c = *p++) != 0)      while ((c = *p++) != 0)
1662        {        {
1663        int i = 0;        int i = 0;
# Line 1972  while (!done) Line 1982  while (!done)
1982    
1983      for (;; gmatched++)    /* Loop for /g or /G */      for (;; gmatched++)    /* Loop for /g or /G */
1984        {        {
       int gany_fudge;  
1985        if (timeitm > 0)        if (timeitm > 0)
1986          {          {
1987          register int i;          register int i;
# Line 2212  while (!done) Line 2221  while (!done)
2221          }          }
2222    
2223        /* Failed to match. If this is a /g or /G loop and we previously set        /* Failed to match. If this is a /g or /G loop and we previously set
2224        g_notempty after a null match, this is not necessarily the end.        g_notempty after a null match, this is not necessarily the end. We want
2225        We want to advance the start offset, and continue. In the case of UTF-8        to advance the start offset, and continue. We won't be at the end of the
2226        matching, the advance must be one character, not one byte. Fudge the        string - that was checked before setting g_notempty.
2227        offset values to achieve this. We won't be at the end of the string -  
2228        that was checked before setting g_notempty. */        Complication arises in the case when the newline option is "any" or
2229          "anycrlf". If the previous match was at the end of a line terminated by
2230          CRLF, an advance of one character just passes the \r, whereas we should
2231          prefer the longer newline sequence, as does the code in pcre_exec().
2232          Fudge the offset value to achieve this.
2233    
2234          Otherwise, in the case of UTF-8 matching, the advance must be one
2235          character, not one byte. */
2236    
2237        else        else
2238          {          {
2239          if (g_notempty != 0)          if (g_notempty != 0)
2240            {            {
2241            int onechar = 1;            int onechar = 1;
2242              unsigned int obits = ((real_pcre *)re)->options;
2243            use_offsets[0] = start_offset;            use_offsets[0] = start_offset;
2244            if (use_utf8)            if ((obits & PCRE_NEWLINE_BITS) == 0)
2245                {
2246                int d;
2247                (void)pcre_config(PCRE_CONFIG_NEWLINE, &d);
2248                obits = (d == '\r')? PCRE_NEWLINE_CR :
2249                        (d == '\n')? PCRE_NEWLINE_LF :
2250                        (d == ('\r'<<8 | '\n'))? PCRE_NEWLINE_CRLF :
2251                        (d == -2)? PCRE_NEWLINE_ANYCRLF :
2252                        (d == -1)? PCRE_NEWLINE_ANY : 0;
2253                }
2254              if (((obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY ||
2255                   (obits & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANYCRLF)
2256                  &&
2257                  start_offset < len - 1 &&
2258                  bptr[start_offset] == '\r' &&
2259                  bptr[start_offset+1] == '\n')
2260                onechar++;
2261              else if (use_utf8)
2262              {              {
2263              while (start_offset + onechar < len)              while (start_offset + onechar < len)
2264                {                {
# Line 2256  while (!done) Line 2290  while (!done)
2290        what Perl's /g option does. This turns out to be rather cunning. First        what Perl's /g option does. This turns out to be rather cunning. First
2291        we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the        we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
2292        same point. If this fails (picked up above) we advance to the next        same point. If this fails (picked up above) we advance to the next
2293        character.        character. */
   
       Yet more complication arises in the case when the newline option is  
       "any" and a pattern in multiline mode has to match at the start of a  
       line. If a previous match was at the end of a line, and advance of one  
       character just passes the \r, whereas we should prefer the longer newline  
       sequence, as does the code in pcre_exec(). So we fudge it. */  
2294    
2295        g_notempty = 0;        g_notempty = 0;
       gany_fudge = 0;  
2296    
2297        if (use_offsets[0] == use_offsets[1])        if (use_offsets[0] == use_offsets[1])
2298          {          {
2299          if (use_offsets[0] == len) break;          if (use_offsets[0] == len) break;
2300          g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;          g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
         if ((((real_pcre *)re)->options & PCRE_STARTLINE) != 0 &&  
             (((real_pcre *)re)->options & PCRE_NEWLINE_BITS) == PCRE_NEWLINE_ANY &&  
             use_offsets[0] < len - 1 &&  
             bptr[use_offsets[0]] == '\r' &&  
             bptr[use_offsets[0]+1] == '\n')  
           gany_fudge = 1;  
2301          }          }
2302    
2303        /* For /g, update the start offset, leaving the rest alone */        /* For /g, update the start offset, leaving the rest alone */
2304    
2305        if (do_g) start_offset = use_offsets[1] + gany_fudge;        if (do_g) start_offset = use_offsets[1];
2306    
2307        /* For /G, update the pointer and length */        /* For /G, update the pointer and length */
2308    
2309        else        else
2310          {          {
2311          bptr += use_offsets[1] + gany_fudge;          bptr += use_offsets[1];
2312          len -= use_offsets[1] + gany_fudge;          len -= use_offsets[1];
2313          }          }
2314        }  /* End of loop for /g and /G */        }  /* End of loop for /g and /G */
2315    

Legend:
Removed from v.142  
changed lines
  Added in v.149

  ViewVC Help
Powered by ViewVC 1.1.5