/* A program for converting characters to UTF-8 and vice versa */
2 |
|
3 |
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
6 |
|
7 |
/* The valid ranges for UTF-8 characters are:

0000 0000  to  0000 007f   1 byte (ascii)
0000 0080  to  0000 07ff   2 bytes
0000 0800  to  0000 ffff   3 bytes
0001 0000  to  001f ffff   4 bytes
0020 0000  to  03ff ffff   5 bytes
0400 0000  to  7fff ffff   6 bytes
*/
|
16 |
|
17 |
|
18 |
/* Lookup tables used by ord2utf8() and utf82ord(). For utf8_table1,
utf8_table2 and utf8_table3 the index is the number of additional bytes that
follow the first byte of an encoded character (0 to 5). */

/* Largest character value that can be encoded with that many additional
bytes. */

static const int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

/* Marker bits that are ORed into the first byte when encoding. */

static const int utf8_table2[] = {
  0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

/* Mask that selects the data bits of the first byte when decoding. */

static const int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* This table is not referenced by the code in this file.
NOTE(review): its 64 entries look like the number of additional bytes, indexed
by the low six bits of a first byte in the range 0xc0-0xff, with 6 standing
for the bytes 0xfe and 0xff - confirm before relying on it. */

static const unsigned char utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6 };
|
32 |
|
33 |
|
34 |
/*************************************************
*       Convert character value to UTF-8         *
*************************************************/
|
37 |
|
38 |
/* This function takes an integer value in the range 0 - 0x7fffffff
|
39 |
and encodes it as a UTF-8 character in 1 to 6 bytes.
|
40 |
|
41 |
Arguments:
|
42 |
cvalue the character value
|
43 |
buffer pointer to buffer for result - at least 6 bytes long
|
44 |
|
45 |
Returns: number of characters placed in the buffer
|
46 |
-1 if input character is negative
|
47 |
0 if input character is positive but too big (only when
|
48 |
int is longer than 32 bits)
|
49 |
*/
|
50 |
|
51 |
int
|
52 |
ord2utf8(int cvalue, unsigned char *buffer)
|
53 |
{
|
54 |
register int i, j;
|
55 |
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
|
56 |
if (cvalue <= utf8_table1[i]) break;
|
57 |
if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
|
58 |
if (cvalue < 0) return -1;
|
59 |
buffer += i;
|
60 |
for (j = i; j > 0; j--)
|
61 |
{
|
62 |
*buffer-- = 0x80 | (cvalue & 0x3f);
|
63 |
cvalue >>= 6;
|
64 |
}
|
65 |
*buffer = utf8_table2[i] | cvalue;
|
66 |
return i + 1;
|
67 |
}
|
68 |
|
69 |
|
70 |
|
71 |
/*************************************************
*         Convert UTF-8 string to value          *
*************************************************/
|
74 |
|
75 |
/* This function takes one or more bytes that represents a UTF-8 character,
|
76 |
and returns the value of the character.
|
77 |
|
78 |
Argument:
|
79 |
buffer a pointer to the byte vector
|
80 |
vptr a pointer to an int to receive the value
|
81 |
|
82 |
Returns: > 0 => the number of bytes consumed
|
83 |
-6 to 0 => malformed UTF-8 character at offset = (-return)
|
84 |
*/
|
85 |
|
86 |
int
|
87 |
utf82ord(unsigned char *buffer, int *vptr)
|
88 |
{
|
89 |
int c = *buffer++;
|
90 |
int d = c;
|
91 |
int i, j, s;
|
92 |
|
93 |
for (i = -1; i < 6; i++) /* i is number of additional bytes */
|
94 |
{
|
95 |
if ((d & 0x80) == 0) break;
|
96 |
d <<= 1;
|
97 |
}
|
98 |
|
99 |
if (i == -1) { *vptr = c; return 1; } /* ascii character */
|
100 |
if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
|
101 |
|
102 |
/* i now has a value in the range 1-5 */
|
103 |
|
104 |
s = 6*i;
|
105 |
d = (c & utf8_table3[i]) << s;
|
106 |
|
107 |
for (j = 0; j < i; j++)
|
108 |
{
|
109 |
c = *buffer++;
|
110 |
if ((c & 0xc0) != 0x80) return -(j+1);
|
111 |
s -= 6;
|
112 |
d |= (c & 0x3f) << s;
|
113 |
}
|
114 |
|
115 |
/* Check that encoding was the correct unique one */
|
116 |
|
117 |
for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
|
118 |
if (d <= utf8_table1[j]) break;
|
119 |
if (j != i) return -(i+1);
|
120 |
|
121 |
/* Valid value */
|
122 |
|
123 |
*vptr = d;
|
124 |
return i+1;
|
125 |
}
|
126 |
|
127 |
|
128 |
|
129 |
|
130 |
/*************************************************
*                Main Program                    *
*************************************************/
|
133 |
|
134 |
|
135 |
/* Each command-line argument is processed on its own:

  . An argument that starts with "0x" is read as a hexadecimal character
    value, encoded with ord2utf8(), and the resulting bytes are printed in
    hex.

  . Any other argument is read as a string of two-digit hex bytes (spaces
    are skipped), decoded with utf82ord(), and the character value is
    printed. Only the first character in the string is decoded.

Problems are reported on stdout and processing carries on with the next
argument.

Returns:     0 always
*/

int
main(int argc, char **argv)
{
  unsigned char buffer[8];
  int i;

  for (i = 1; i < argc; i++)
  {
    const char *x = argv[i];

    if (strncmp(x, "0x", 2) == 0)
    {
      char *end;
      long value;
      int d, rc, j;

      /* Reject strings with no hex digits or with trailing junk, and values
      that do not fit in an int, instead of silently converting them. */

      errno = 0;
      value = strtol(x + 2, &end, 16);
      if (end == x + 2 || *end != 0)
      {
        printf("Malformed hex value: %s\n", argv[i]);
        continue;
      }
      if (errno == ERANGE || value > INT_MAX || value < INT_MIN)
      {
        printf("Hex value out of range: %s\n", argv[i]);
        continue;
      }

      d = (int)value;
      rc = ord2utf8(d, buffer);
      printf("0x%08x => ", (unsigned int)d);
      if (rc <= 0)
        printf("*** Error %d ***", rc);
      else
        for (j = 0; j < rc; j++) printf("%02x ", (unsigned int)buffer[j]);
      printf("\n");
    }
    else
    {
      /* One slot is kept for the terminating zero that stops utf82ord()
      from reading past the data. */

      const size_t max_bytes = sizeof(buffer) - 1;
      size_t nbytes = 0;
      int byte = 0;
      int half = 0;        /* non-zero when one digit of a pair has been read */
      int malformed = 0;
      int d = 0;
      int rc;

      for (;;)
      {
        unsigned char ch;

        while (*x == ' ') x++;
        if (*x == 0 && !half) break;

        /* The end of the string in the middle of a pair fails this test
        too, so an odd number of digits is reported as malformed. */

        ch = (unsigned char)*x;
        if (!isxdigit(ch))
        {
          printf("Malformed hex string: %s\n", argv[i]);
          malformed = 1;
          break;
        }
        x++;

        byte = byte * 16 + (isdigit(ch) ? ch - '0' : tolower(ch) - 'a' + 10);
        if (half)
        {
          /* Bytes that do not fit are checked but not stored. utf82ord()
          reads at most six bytes, so they could not affect the result. */

          if (nbytes < max_bytes) buffer[nbytes++] = (unsigned char)byte;
          byte = 0;
        }
        half = !half;
      }

      if (malformed) continue;

      buffer[nbytes] = 0;
      rc = utf82ord(buffer, &d);
      if (rc > 0)
        printf("0x%08x <= %s\n", (unsigned int)d, argv[i]);
      else
        printf("Error %d <= %s\n", rc, argv[i]);
    }
  }

  return 0;
}
|
187 |
|
188 |
/* End */
|