/[pcre]/code/tags/pcre-6.0/pcretest.c
ViewVC logotype

Diff of /code/tags/pcre-6.0/pcretest.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 43 by nigel, Sat Feb 24 21:39:21 2007 UTC revision 49 by nigel, Sat Feb 24 21:39:33 2007 UTC
# Line 38  static size_t gotten_store; Line 38  static size_t gotten_store;
38    
39    
40    
/* Tables shared by the UTF-8 encoder and decoder below. Index i corresponds
to a character encoded with i additional (continuation) bytes, i.e. i+1 bytes
in total.

  utf8_table1[i]  largest character value that fits in i+1 bytes
  utf8_table2[i]  marker bits to OR into the first byte of an i+1 byte form
  utf8_table3[i]  mask for the data bits held in the first byte
*/

static int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

static int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

static int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};


/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes. The most significant
bits of the value go into the first byte, and each following byte carries the
next six bits, as UTF-8 requires.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of characters placed in the buffer
             -1 if input character is negative
             0 if input character is positive but too big (only when
             int is longer than 32 bits)

Nothing is written to the buffer when the result is -1 or 0.
*/

static int
ord2utf8(int cvalue, unsigned char *buffer)
{
int i, j;
int tablesize = (int)(sizeof(utf8_table1)/sizeof(int));

if (cvalue < 0) return -1;

/* Find the shortest form that can hold the value */

for (i = 0; i < tablesize; i++)
  if (cvalue <= utf8_table1[i]) break;
if (i >= tablesize) return 0;

/* Fill in the continuation bytes from the last one backwards, consuming the
low-order six bits each time; what is left over belongs in the first byte. */

buffer += i;
for (j = i; j > 0; j--)
  {
  *buffer-- = (unsigned char)(0x80 | (cvalue & 0x3f));
  cvalue >>= 6;
  }
*buffer = (unsigned char)(utf8_table2[i] | cvalue);
return i + 1;
}


/*************************************************
*            Convert UTF-8 string to value       *
*************************************************/

/* This function takes one or more bytes that represents a UTF-8 character,
and returns the value of the character. It reads at most 6 bytes, and stops
reading at the first byte that is not a valid continuation byte.

Argument:
  buffer   a pointer to the byte vector
  vptr     a pointer to an int to receive the value; it is written only
             when the return value is greater than zero

Returns:   >  0 => the number of bytes consumed
           -6 to 0 => malformed UTF-8 character at offset = (-return)
*/

int
utf82ord(unsigned char *buffer, int *vptr)
{
int c = *buffer++;
int d = c;
int i, j, s;

/* Count the leading one bits in the first byte */

for (i = -1; i < 6; i++)               /* i is number of additional bytes */
  {
  if ((d & 0x80) == 0) break;
  d <<= 1;
  }

if (i == -1) { *vptr = c; return 1; }  /* ascii character */
if (i == 0 || i == 6) return 0;        /* invalid UTF-8 */

/* i now has a value in the range 1-5. The data bits of the first byte are
the most significant ones, so they are shifted up past all the six-bit groups
that follow. */

s = 6*i;
d = (c & utf8_table3[i]) << s;

for (j = 0; j < i; j++)
  {
  c = *buffer++;
  if ((c & 0xc0) != 0x80) return -(j+1);
  s -= 6;
  d |= (c & 0x3f) << s;
  }

/* Check that encoding was the correct unique one */

for (j = 0; j < (int)(sizeof(utf8_table1)/sizeof(int)); j++)
  if (d <= utf8_table1[j]) break;
if (j != i) return -(i+1);

/* Valid value */

*vptr = d;
return i+1;
}
142    
143    
144    
145    
146    
147    
148  /* Debugging function to print the internal form of the regex. This is the same  /* Debugging function to print the internal form of the regex. This is the same
149  code as contained in pcre.c under the DEBUG macro. */  code as contained in pcre.c under the DEBUG macro. */
150    
# Line 265  for(;;) Line 372  for(;;)
372    
373    
374    
375  /* Character string printing function. */  /* Character string printing function. A "normal" and a UTF-8 version. */
376    
377  static void pchars(unsigned char *p, int length)  static void pchars(unsigned char *p, int length, int utf8)
378  {  {
379  int c;  int c;
380  while (length-- > 0)  while (length-- > 0)
381      {
382      if (utf8)
383        {
384        int rc = utf82ord(p, &c);
385        if (rc > 0)
386          {
387          length -= rc - 1;
388          p += rc;
389          if (c < 256 && isprint(c)) fprintf(outfile, "%c", c);
390            else fprintf(outfile, "\\x{%02x}", c);
391          continue;
392          }
393        }
394    
395       /* Not UTF-8, or malformed UTF-8  */
396    
397    if (isprint(c = *(p++))) fprintf(outfile, "%c", c);    if (isprint(c = *(p++))) fprintf(outfile, "%c", c);
398      else fprintf(outfile, "\\x%02x", c);      else fprintf(outfile, "\\x%02x", c);
399      }
400  }  }
401    
402    
# Line 391  while (!done) Line 515  while (!done)
515    
516  #if !defined NOPOSIX  /* There are still compilers that require no indent */  #if !defined NOPOSIX  /* There are still compilers that require no indent */
517    regex_t preg;    regex_t preg;
518      int do_posix = 0;
519  #endif  #endif
520    
521    const char *error;    const char *error;
# Line 402  while (!done) Line 527  while (!done)
527    int do_g = 0;    int do_g = 0;
528    int do_showinfo = showinfo;    int do_showinfo = showinfo;
529    int do_showrest = 0;    int do_showrest = 0;
530    int do_posix = 0;    int utf8 = 0;
531    int erroroffset, len, delimiter;    int erroroffset, len, delimiter;
532    
533    if (infile == stdin) printf("  re> ");    if (infile == stdin) printf("  re> ");
# Line 494  while (!done) Line 619  while (!done)
619        case 'S': do_study = 1; break;        case 'S': do_study = 1; break;
620        case 'U': options |= PCRE_UNGREEDY; break;        case 'U': options |= PCRE_UNGREEDY; break;
621        case 'X': options |= PCRE_EXTRA; break;        case 'X': options |= PCRE_EXTRA; break;
622          case '8': options |= PCRE_UTF8; utf8 = 1; break;
623    
624        case 'L':        case 'L':
625        ppp = pp;        ppp = pp;
# Line 633  while (!done) Line 759  while (!done)
759        if (backrefmax > 0)        if (backrefmax > 0)
760          fprintf(outfile, "Max back reference = %d\n", backrefmax);          fprintf(outfile, "Max back reference = %d\n", backrefmax);
761        if (options == 0) fprintf(outfile, "No options\n");        if (options == 0) fprintf(outfile, "No options\n");
762          else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s\n",          else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s\n",
763            ((options & PCRE_ANCHORED) != 0)? " anchored" : "",            ((options & PCRE_ANCHORED) != 0)? " anchored" : "",
764            ((options & PCRE_CASELESS) != 0)? " caseless" : "",            ((options & PCRE_CASELESS) != 0)? " caseless" : "",
765            ((options & PCRE_EXTENDED) != 0)? " extended" : "",            ((options & PCRE_EXTENDED) != 0)? " extended" : "",
# Line 641  while (!done) Line 767  while (!done)
767            ((options & PCRE_DOTALL) != 0)? " dotall" : "",            ((options & PCRE_DOTALL) != 0)? " dotall" : "",
768            ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",            ((options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
769            ((options & PCRE_EXTRA) != 0)? " extra" : "",            ((options & PCRE_EXTRA) != 0)? " extra" : "",
770            ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "");            ((options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
771              ((options & PCRE_UTF8) != 0)? " utf8" : "");
772    
773        if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)        if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
774          fprintf(outfile, "Case state changes\n");          fprintf(outfile, "Case state changes\n");
# Line 796  while (!done) Line 923  while (!done)
923          break;          break;
924    
925          case 'x':          case 'x':
926    
927            /* Handle \x{..} specially - new Perl thing for utf8 */
928    
929            if (*p == '{')
930              {
931              unsigned char *pt = p;
932              c = 0;
933              while (isxdigit(*(++pt)))
934                c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
935              if (*pt == '}')
936                {
937                unsigned char buffer[8];
938                int ii, utn;
939                utn = ord2utf8(c, buffer);
940                for (ii = 0; ii < utn - 1; ii++) *q++ = buffer[ii];
941                c = buffer[ii];   /* Last byte */
942                p = pt + 1;
943                break;
944                }
945              /* Not correct form; fall through */
946              }
947    
948            /* Ordinary \x */
949    
950          c = 0;          c = 0;
951          while (i++ < 2 && isxdigit(*p))          while (i++ < 2 && isxdigit(*p))
952            {            {
# Line 876  while (!done) Line 1027  while (!done)
1027              {              {
1028              fprintf(outfile, "%2d: ", (int)i);              fprintf(outfile, "%2d: ", (int)i);
1029              pchars(dbuffer + pmatch[i].rm_so,              pchars(dbuffer + pmatch[i].rm_so,
1030                pmatch[i].rm_eo - pmatch[i].rm_so);                pmatch[i].rm_eo - pmatch[i].rm_so, utf8);
1031              fprintf(outfile, "\n");              fprintf(outfile, "\n");
1032              if (i == 0 && do_showrest)              if (i == 0 && do_showrest)
1033                {                {
1034                fprintf(outfile, " 0+ ");                fprintf(outfile, " 0+ ");
1035                pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo);                pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo, utf8);
1036                fprintf(outfile, "\n");                fprintf(outfile, "\n");
1037                }                }
1038              }              }
# Line 931  while (!done) Line 1082  while (!done)
1082            else            else
1083              {              {
1084              fprintf(outfile, "%2d: ", i/2);              fprintf(outfile, "%2d: ", i/2);
1085              pchars(bptr + offsets[i], offsets[i+1] - offsets[i]);              pchars(bptr + offsets[i], offsets[i+1] - offsets[i], utf8);
1086              fprintf(outfile, "\n");              fprintf(outfile, "\n");
1087              if (i == 0)              if (i == 0)
1088                {                {
1089                if (do_showrest)                if (do_showrest)
1090                  {                  {
1091                  fprintf(outfile, " 0+ ");                  fprintf(outfile, " 0+ ");
1092                  pchars(bptr + offsets[i+1], len - offsets[i+1]);                  pchars(bptr + offsets[i+1], len - offsets[i+1], utf8);
1093                  fprintf(outfile, "\n");                  fprintf(outfile, "\n");
1094                  }                  }
1095                }                }
# Line 971  while (!done) Line 1122  while (!done)
1122              else              else
1123                {                {
1124                fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);                fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1125                free((void *)substring);                /* free((void *)substring); */
1126                  pcre_free_substring(substring);
1127                }                }
1128              }              }
1129            }            }
# Line 989  while (!done) Line 1141  while (!done)
1141                fprintf(outfile, "%2dL %s\n", i, stringlist[i]);                fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1142              if (stringlist[i] != NULL)              if (stringlist[i] != NULL)
1143                fprintf(outfile, "string list not terminated by NULL\n");                fprintf(outfile, "string list not terminated by NULL\n");
1144              free((void *)stringlist);              /* free((void *)stringlist); */
1145                pcre_free_substring_list(stringlist);
1146              }              }
1147            }            }
1148          }          }
1149    
1150        /* Failed to match. If this is a /g or /G loop and we previously set        /* Failed to match. If this is a /g or /G loop and we previously set
1151        PCRE_NOTEMPTY after a null match, this is not necessarily the end.        g_notempty after a null match, this is not necessarily the end.
1152        We want to advance the start offset, and continue. Fudge the offset        We want to advance the start offset, and continue. Fudge the offset
1153        values to achieve this. We won't be at the end of the string - that        values to achieve this. We won't be at the end of the string - that
1154        was checked before setting PCRE_NOTEMPTY. */        was checked before setting g_notempty. */
1155    
1156        else        else
1157          {          {
# Line 1025  while (!done) Line 1178  while (!done)
1178        /* If we have matched an empty string, first check to see if we are at        /* If we have matched an empty string, first check to see if we are at
1179        the end of the subject. If so, the /g loop is over. Otherwise, mimic        the end of the subject. If so, the /g loop is over. Otherwise, mimic
1180        what Perl's /g options does. This turns out to be rather cunning. First        what Perl's /g options does. This turns out to be rather cunning. First
1181        we set PCRE_NOTEMPTY and try the match again at the same point. If this        we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1182        fails (picked up above) we advance to the next character. */        same point. If this fails (picked up above) we advance to the next
1183          character. */
1184    
1185        g_notempty = 0;        g_notempty = 0;
1186        if (offsets[0] == offsets[1])        if (offsets[0] == offsets[1])
1187          {          {
1188          if (offsets[0] == len) break;          if (offsets[0] == len) break;
1189          g_notempty = PCRE_NOTEMPTY;          g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1190          }          }
1191    
1192        /* For /g, update the start offset, leaving the rest alone */        /* For /g, update the start offset, leaving the rest alone */

Legend:
Removed from v.43  
changed lines
  Added in v.49

  ViewVC Help
Powered by ViewVC 1.1.5