/[pcre]/code/trunk/pcretest.c
ViewVC logotype

Diff of /code/trunk/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 41 by nigel, Sat Feb 24 21:39:17 2007 UTC revision 49 by nigel, Sat Feb 24 21:39:33 2007 UTC
# Line 34  Makefile. */ Line 34  Makefile. */
34    
35  static FILE *outfile;  static FILE *outfile;
36  static int log_store = 0;  static int log_store = 0;
37    static size_t gotten_store;
38    
39    
40    
/* Tables used by the UTF-8 conversion functions below. Entry i of each table
describes a character that is encoded as one lead byte followed by i
continuation bytes. */

/* Largest character value that fits in an encoding of i+1 bytes */

static const int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

/* Marker bits that are ORed into the lead byte of an i+1 byte encoding */

static const int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

/* Mask that extracts the data bits from the lead byte of an i+1 byte
encoding */

static const int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* Number of entries in utf8_table1, as a signed value so that it can be
compared with int loop counters without a signed/unsigned mismatch. */

enum { utf8_table1_size = sizeof(utf8_table1)/sizeof(utf8_table1[0]) };


/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes. The most significant
bits of the value go into the lead byte, and each following continuation
byte carries the next 6 bits, so the least significant 6 bits end up in the
last byte.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer
             -1 if input character is negative (nothing is written)
             0 if input character is positive but too big (only when
             int is longer than 32 bits; nothing is written)
*/

static int
ord2utf8(int cvalue, unsigned char *buffer)
{
int i, j;

if (cvalue < 0) return -1;

/* Find the shortest encoding that can hold the value */

for (i = 0; i < utf8_table1_size; i++)
  if (cvalue <= utf8_table1[i]) break;
if (i >= utf8_table1_size) return 0;

/* Fill in the continuation bytes working backwards from the last one, so
that the low-order bits land at the end of the sequence. What is left of the
value after i shifts is exactly the data for the lead byte. */

buffer += i;
for (j = i; j > 0; j--)
  {
  *buffer-- = (unsigned char)(0x80 | (cvalue & 0x3f));
  cvalue >>= 6;
  }
*buffer = (unsigned char)(utf8_table2[i] | cvalue);
return i + 1;
}


/*************************************************
*            Convert UTF-8 string to value       *
*************************************************/

/* This function takes one or more bytes that represent a UTF-8 character,
and returns the value of the character.

Arguments:
  buffer   a pointer to the byte vector
  vptr     a pointer to an int to receive the value; it is written only
           when the return value is greater than zero

Returns:   >  0 => the number of bytes consumed
           0    => the first byte cannot start a UTF-8 character (it is a
                   continuation byte, or 0xfe/0xff)
           -1 to -5 => the byte at offset (-return) is not a continuation
                   byte
           -2 to -6 => alternatively, all bytes were well formed but the
                   value could have been encoded in fewer bytes; the
                   return is then minus the length of the sequence
*/

int
utf82ord(unsigned char *buffer, int *vptr)
{
int c = *buffer++;
int d = c;
int i, j, s;

/* Count the leading 1-bits of the first byte */

for (i = -1; i < 6; i++)               /* i is number of additional bytes */
  {
  if ((d & 0x80) == 0) break;
  d <<= 1;
  }

if (i == -1) { *vptr = c; return 1; }  /* ascii character */
if (i == 0 || i == 6) return 0;        /* invalid UTF-8 */

/* i now has a value in the range 1-5. The lead byte holds the most
significant bits, so it is shifted up by 6 bits for every continuation byte
that follows. */

s = 6*i;
d = (c & utf8_table3[i]) << s;

for (j = 0; j < i; j++)
  {
  c = *buffer++;
  if ((c & 0xc0) != 0x80) return -(j+1);
  s -= 6;
  d |= (c & 0x3f) << s;
  }

/* Check that encoding was the correct unique one */

for (j = 0; j < utf8_table1_size; j++)
  if (d <= utf8_table1[j]) break;
if (j != i) return -(i+1);

/* Valid value */

*vptr = d;
return i+1;
}
142    
143    
144    
145    
146    
147    
# Line 48  static const char *OP_names[] = { Line 156  static const char *OP_names[] = {
156    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
157    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
158    "*", "*?", "+", "+?", "?", "??", "{", "{",    "*", "*?", "+", "+?", "?", "??", "{", "{",
159    "class", "Ref",    "class", "Ref", "Recurse",
160    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",    "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
161    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",    "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
162    "Brazero", "Braminzero", "Bra"    "Brazero", "Braminzero", "Bra"
# Line 264  for(;;) Line 372  for(;;)
372    
373    
374    
375  /* Character string printing function. */  /* Character string printing function. A "normal" and a UTF-8 version. */
376    
377  static void pchars(unsigned char *p, int length)  static void pchars(unsigned char *p, int length, int utf8)
378  {  {
379  int c;  int c;
380  while (length-- > 0)  while (length-- > 0)
381      {
382      if (utf8)
383        {
384        int rc = utf82ord(p, &c);
385        if (rc > 0)
386          {
387          length -= rc - 1;
388          p += rc;
389          if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
390            else fprintf(outfile, "\\x{%02x}", c);
391          continue;
392          }
393        }
394    
395       /* Not UTF-8, or malformed UTF-8  */
396    
397    if (isprint(c = *(p++))) fprintf(outfile, "%c", c);    if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
398      else fprintf(outfile, "\\x%02x", c);      else fprintf(outfile, "\\x%02x", c);
399      }
400  }  }
401    
402    
# Line 281  compiled re. */ Line 406  compiled re. */
406    
407  static void *new_malloc(size_t size)  static void *new_malloc(size_t size)
408  {  {
409    gotten_store = size;
410  if (log_store)  if (log_store)
411    fprintf(outfile, "Memory allocation (code space): %d\n",    fprintf(outfile, "Memory allocation (code space): %d\n",
412      (int)((int)size - offsetof(real_pcre, code[0])));      (int)((int)size - offsetof(real_pcre, code[0])));
# Line 289  return malloc(size); Line 415  return malloc(size);
415    
416    
417    
418    
419    /* Get one piece of information from the pcre_fullinfo() function */
420    
421    static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
422    {
423    int rc;
424    if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
425      fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
426    }
427    
428    
429    
430    
431  /* Read lines from named file or stdin and write to named file or stdout; lines  /* Read lines from named file or stdin and write to named file or stdout; lines
432  consist of a regular expression, in delimiters and optionally followed by  consist of a regular expression, in delimiters and optionally followed by
433  options, followed by a set of test data, terminated by an empty line. */  options, followed by a set of test data, terminated by an empty line. */
# Line 376  while (!done) Line 515  while (!done)
515    
516  #if !defined NOPOSIX  /* There are still compilers that require no indent */  #if !defined NOPOSIX  /* There are still compilers that require no indent */
517    regex_t preg;    regex_t preg;
518      int do_posix = 0;
519  #endif  #endif
520    
521    const char *error;    const char *error;
# Line 387  while (!done) Line 527  while (!done)
527    int do_g = 0;    int do_g = 0;
528    int do_showinfo = showinfo;    int do_showinfo = showinfo;
529    int do_showrest = 0;    int do_showrest = 0;
530    int do_posix = 0;    int utf8 = 0;
531    int erroroffset, len, delimiter;    int erroroffset, len, delimiter;
532    
533    if (infile == stdin) printf("  re> ");    if (infile == stdin) printf("  re> ");
# Line 479  while (!done) Line 619  while (!done)
619        case 'S': do_study = 1; break;        case 'S': do_study = 1; break;
620        case 'U': options |= PCRE_UNGREEDY; break;        case 'U': options |= PCRE_UNGREEDY; break;
621        case 'X': options |= PCRE_EXTRA; break;        case 'X': options |= PCRE_EXTRA; break;
622          case '8': options |= PCRE_UTF8; utf8 = 1; break;
623    
624        case 'L':        case 'L':
625        ppp = pp;        ppp = pp;
# Line 573  while (!done) Line 714  while (!done)
714        goto CONTINUE;        goto CONTINUE;
715        }        }
716    
717      /* Compilation succeeded; print data if required */      /* Compilation succeeded; print data if required. There are now two
718        info-returning functions. The old one has a limited interface and
719        returns only limited data. Check that it agrees with the newer one. */
720    
721      if (do_showinfo)      if (do_showinfo)
722        {        {
723        int first_char, count;        int old_first_char, old_options, old_count;
724          int count, backrefmax, first_char, need_char;
725          size_t size;
726    
727        if (do_debug) print_internals(re);        if (do_debug) print_internals(re);
728    
729        count = pcre_info(re, &options, &first_char);        new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
730          new_info(re, NULL, PCRE_INFO_SIZE, &size);
731          new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
732          new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
733          new_info(re, NULL, PCRE_INFO_FIRSTCHAR, &first_char);
734          new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
735    
736          old_count = pcre_info(re, &old_options, &old_first_char);
737        if (count < 0) fprintf(outfile,        if (count < 0) fprintf(outfile,
738          "Error %d while reading info\n", count);          "Error %d from pcre_info()\n", count);
739        else        else
740          {          {
741          fprintf(outfile, "Identifying subpattern count = %d\n", count);          if (old_count != count) fprintf(outfile,
742          if (options == 0) fprintf(outfile, "No options\n");            "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
743            else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s\n",              old_count);
744              ((options & PCRE_ANCHORED) != 0)? " anchored" : "",  
745              ((options & PCRE_CASELESS) != 0)? " caseless" : "",          if (old_first_char != first_char) fprintf(outfile,
746              ((options & PCRE_EXTENDED) != 0)? " extended" : "",            "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
747              ((options & PCRE_MULTILINE) != 0)? " multiline" : "",              first_char, old_first_char);
748              ((options & PCRE_DOTALL) != 0)? " dotall" : "",  
749              ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",          if (old_options != options) fprintf(outfile,
750              ((options & PCRE_EXTRA) != 0)? " extra" : "",            "Options disagreement: pcre_fullinfo=%d pcre_info=%d\n", options,
751              ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "");              old_options);
752            }
         if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)  
           fprintf(outfile, "Case state changes\n");  
753    
754          if (first_char == -1)        if (size != gotten_store) fprintf(outfile,
755            {          "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
756            fprintf(outfile, "First char at start or follows \\n\n");          size, gotten_store);
757            }  
758          else if (first_char < 0)        fprintf(outfile, "Capturing subpattern count = %d\n", count);
759            {        if (backrefmax > 0)
760            fprintf(outfile, "No first char\n");          fprintf(outfile, "Max back reference = %d\n", backrefmax);
761            }        if (options == 0) fprintf(outfile, "No options\n");
762            else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
763              ((options & PCRE_ANCHORED) != 0)? " anchored" : "",
764              ((options & PCRE_CASELESS) != 0)? " caseless" : "",
765              ((options & PCRE_EXTENDED) != 0)? " extended" : "",
766              ((options & PCRE_MULTILINE) != 0)? " multiline" : "",
767              ((options & PCRE_DOTALL) != 0)? " dotall" : "",
768              ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
769              ((options & PCRE_EXTRA) != 0)? " extra" : "",
770              ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
771              ((options & PCRE_UTF8) != 0)? " utf8" : "");
772    
773          if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
774            fprintf(outfile, "Case state changes\n");
775    
776          if (first_char == -1)
777            {
778            fprintf(outfile, "First char at start or follows \\n\n");
779            }
780          else if (first_char < 0)
781            {
782            fprintf(outfile, "No first char\n");
783            }
784          else
785            {
786            if (isprint(first_char))
787              fprintf(outfile, "First char = \'%c\'\n", first_char);
788          else          else
789            {            fprintf(outfile, "First char = %d\n", first_char);
790            if (isprint(first_char))          }
             fprintf(outfile, "First char = \'%c\'\n", first_char);  
           else  
             fprintf(outfile, "First char = %d\n", first_char);  
           }  
791    
792          if (((((real_pcre *)re)->options) & PCRE_REQCHSET) != 0)        if (need_char < 0)
793            {          {
794            int req_char = ((real_pcre *)re)->req_char;          fprintf(outfile, "No need char\n");
795            if (isprint(req_char))          }
796              fprintf(outfile, "Req char = \'%c\'\n", req_char);        else
797            else          {
798              fprintf(outfile, "Req char = %d\n", req_char);          if (isprint(need_char))
799            }            fprintf(outfile, "Need char = \'%c\'\n", need_char);
800          else fprintf(outfile, "No req char\n");          else
801              fprintf(outfile, "Need char = %d\n", need_char);
802          }          }
803        }        }
804    
# Line 654  while (!done) Line 827  while (!done)
827        else if (extra == NULL)        else if (extra == NULL)
828          fprintf(outfile, "Study returned NULL\n");          fprintf(outfile, "Study returned NULL\n");
829    
       /* This looks at internal information. A bit kludgy to do it this  
       way, but it is useful for testing. */  
   
830        else if (do_showinfo)        else if (do_showinfo)
831          {          {
832          real_pcre_extra *xx = (real_pcre_extra *)extra;          uschar *start_bits = NULL;
833          if ((xx->options & PCRE_STUDY_MAPPED) == 0)          new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
834            if (start_bits == NULL)
835            fprintf(outfile, "No starting character set\n");            fprintf(outfile, "No starting character set\n");
836          else          else
837            {            {
# Line 669  while (!done) Line 840  while (!done)
840            fprintf(outfile, "Starting character set: ");            fprintf(outfile, "Starting character set: ");
841            for (i = 0; i < 256; i++)            for (i = 0; i < 256; i++)
842              {              {
843              if ((xx->start_bits[i/8] & (1<<(i%8))) != 0)              if ((start_bits[i/8] & (1<<(i%8))) != 0)
844                {                {
845                if (c > 75)                if (c > 75)
846                  {                  {
# Line 752  while (!done) Line 923  while (!done)
923          break;          break;
924    
925          case 'x':          case 'x':
926    
927            /* Handle \x{..} specially - new Perl thing for utf8 */
928    
929            if (*p == '{')
930              {
931              unsigned char *pt = p;
932              c = 0;
933              while (isxdigit(*(++pt)))
934                c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
935              if (*pt == '}')
936                {
937                unsigned char buffer[8];
938                int ii, utn;
939                utn = ord2utf8(c, buffer);
940                for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
941                c = buffer[ii];   /* Last byte */
942                p = pt + 1;
943                break;
944                }
945              /* Not correct form; fall through */
946              }
947    
948            /* Ordinary \x */
949    
950          c = 0;          c = 0;
951          while (i++ < 2 && isxdigit(*p))          while (i++ < 2 && isxdigit(*p))
952            {            {
# Line 832  while (!done) Line 1027  while (!done)
1027              {              {
1028              fprintf(outfile, "%2d: ", (int)i);              fprintf(outfile, "%2d: ", (int)i);
1029              pchars(dbuffer + pmatch[i].rm_so,              pchars(dbuffer + pmatch[i].rm_so,
1030                pmatch[i].rm_eo - pmatch[i].rm_so);                pmatch[i].rm_eo - pmatch[i].rm_so, utf8);
1031              fprintf(outfile, "\n");              fprintf(outfile, "\n");
1032              if (i == 0 && do_showrest)              if (i == 0 && do_showrest)
1033                {                {
1034                fprintf(outfile, " 0+ ");                fprintf(outfile, " 0+ ");
1035                pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo);                pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf8);
1036                fprintf(outfile, "\n");                fprintf(outfile, "\n");
1037                }                }
1038              }              }
# Line 887  while (!done) Line 1082  while (!done)
1082            else            else
1083              {              {
1084              fprintf(outfile, "%2d: ", i/2);              fprintf(outfile, "%2d: ", i/2);
1085              pchars(bptr + offsets[i], offsets[i+1] - offsets[i]);              pchars(bptr + offsets[i], offsets[i+1] - offsets[i], utf8);
1086              fprintf(outfile, "\n");              fprintf(outfile, "\n");
1087              if (i == 0)              if (i == 0)
1088                {                {
1089                if (do_showrest)                if (do_showrest)
1090                  {                  {
1091                  fprintf(outfile, " 0+ ");                  fprintf(outfile, " 0+ ");
1092                  pchars(bptr + offsets[i+1], len - offsets[i+1]);                  pchars(bptr + offsets[i+1], len - offsets[i+1], utf8);
1093                  fprintf(outfile, "\n");                  fprintf(outfile, "\n");
1094                  }                  }
1095                }                }
# Line 927  while (!done) Line 1122  while (!done)
1122              else              else
1123                {                {
1124                fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);                fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1125                free((void *)substring);                /* free((void *)substring); */
1126                  pcre_free_substring(substring);
1127                }                }
1128              }              }
1129            }            }
# Line 945  while (!done) Line 1141  while (!done)
1141                fprintf(outfile, "%2dL %s\n", i, stringlist[i]);                fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1142              if (stringlist[i] != NULL)              if (stringlist[i] != NULL)
1143                fprintf(outfile, "string list not terminated by NULL\n");                fprintf(outfile, "string list not terminated by NULL\n");
1144              free((void *)stringlist);              /* free((void *)stringlist); */
1145                pcre_free_substring_list(stringlist);
1146              }              }
1147            }            }
1148          }          }
1149    
1150        /* Failed to match. If this is a /g or /G loop and we previously set        /* Failed to match. If this is a /g or /G loop and we previously set
1151        PCRE_NOTEMPTY after a null match, this is not necessarily the end.        g_notempty after a null match, this is not necessarily the end.
1152        We want to advance the start offset, and continue. Fudge the offset        We want to advance the start offset, and continue. Fudge the offset
1153        values to achieve this. We won't be at the end of the string - that        values to achieve this. We won't be at the end of the string - that
1154        was checked before setting PCRE_NOTEMPTY. */        was checked before setting g_notempty. */
1155    
1156        else        else
1157          {          {
# Line 981  while (!done) Line 1178  while (!done)
1178        /* If we have matched an empty string, first check to see if we are at        /* If we have matched an empty string, first check to see if we are at
1179        the end of the subject. If so, the /g loop is over. Otherwise, mimic        the end of the subject. If so, the /g loop is over. Otherwise, mimic
1180        what Perl's /g options does. This turns out to be rather cunning. First        what Perl's /g options does. This turns out to be rather cunning. First
1181        we set PCRE_NOTEMPTY and try the match again at the same point. If this        we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1182        fails (picked up above) we advance to the next character. */        same point. If this fails (picked up above) we advance to the next
1183          character. */
1184    
1185        g_notempty = 0;        g_notempty = 0;
1186        if (offsets[0] == offsets[1])        if (offsets[0] == offsets[1])
1187          {          {
1188          if (offsets[0] == len) break;          if (offsets[0] == len) break;
1189          g_notempty = PCRE_NOTEMPTY;          g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1190          }          }
1191    
1192        /* For /g, update the start offset, leaving the rest alone */        /* For /g, update the start offset, leaving the rest alone */

Legend:
Removed from v.41  
changed lines
  Added in v.49

  ViewVC Help
Powered by ViewVC 1.1.5