/[pcre]/code/trunk/maint/utf8.c
ViewVC logotype

Diff of /code/trunk/maint/utf8.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 945 by ph10, Sat Apr 30 17:37:37 2011 UTC revision 946 by ph10, Wed Feb 29 18:00:55 2012 UTC
# Line 1  Line 1 
1  /* A program for converting characters to UTF-8 and vice versa */  /* A test program for converting characters to UTF-8 and vice versa. Note that
2    this program conforms to the original definition of UTF-8, which allows
3    codepoints up to 7fffffff. The more recent definition limits the validity of
4    UTF-8 codepoints to a maximum of 10ffff.
5    
6    The arguments are either single codepoint values, written as 0xhhhh, for
7    conversion to UTF-8, or sequences of hex values, written without 0x and
8    optionally including spaces (but such arguments must be quoted), for conversion
9    from UTF-8 to codepoints. For example:
10    
11    ./utf8 0x1234
12    0x00001234 => e1 88 b4
13    
14    ./utf8 "e1 88 b4"
15    0x00001234 <= e1 88 b4
16    
17    In the second case, a number of characters can be present in one argument:
18    
19    ./utf8 "65 e188b4 77"
20    0x00000065 <= 65
21    0x00001234 <= e1 88 b4
22    0x00000077 <= 77
23    
24    If the option -s is given, the sequence of UTF-8 bytes is written out between
25    angle brackets at the end of the line. On a UTF-8 terminal, this will show the
26    appropriate graphic for the codepoint. */
27    
28  #include <stdio.h>  #include <stdio.h>
29  #include <stdlib.h>  #include <stdlib.h>
# Line 137  main(int argc, char **argv) Line 162  main(int argc, char **argv)
162  {  {
163  int i = 1;  int i = 1;
164  int show = 0;  int show = 0;
165  unsigned char buffer[8];  unsigned char buffer[64];
166    
167  if (strcmp(argv[1], "-s") == 0)  if (argc > 1 && strcmp(argv[1], "-s") == 0)
168    {    {
169    show = 1;    show = 1;
170    i = 2;    i = 2;
# Line 171  for (; i < argc; i++) Line 196  for (; i < argc; i++)
196      int d, rc;      int d, rc;
197      int j = 0;      int j = 0;
198      int y = 0;      int y = 0;
199      int z = 0;      int z = 0;
200        unsigned char *bptr;
201    
202      for (;;)      for (;;)
203        {        {
204        while (*x == ' ') x++;        while (*x == ' ') x++;
# Line 191  for (; i < argc; i++) Line 218  for (; i < argc; i++)
218          }          }
219        z ^= 1;        z ^= 1;
220        }        }
221      if (j < 0) continue;      buffer[j] = 0;
222      buffer[j] = 0;      bptr = buffer;
223      rc = utf82ord(buffer, &d);  
224      if (rc > 0) printf("0x%08x <= %s\n", d, argv[i]);      while (*bptr != 0)
225        else printf("Error %d <= %s\n", rc, argv[i]);        {
226          rc = utf82ord(bptr, &d);
227          if (rc > 0)
228            {
229            printf("0x%08x <= ", d);
230            for (j = 0; j < rc; j++) printf("%02x ", bptr[j]);
231            if (show)
232              {
233              printf(">");
234              for (j = 0; j < rc; j++) printf("%c", bptr[j]);
235              printf("<");
236              }
237            printf("\n");
238            bptr += rc;
239            }
240          else
241            {
242            printf("Malformed UTF-8 at offset %d <= ", -rc);
243            while (*bptr != 0) printf("%02x ", *bptr++);
244            printf("\n");
245            break;
246            }
247          }
248      }      }
249    }    }
250  return 0;  return 0;

Legend:
Removed from v.945  
changed lines
  Added in v.946

  ViewVC Help
Powered by ViewVC 1.1.5