/[pcre]/code/trunk/maintain/utf8.c
ViewVC logotype

Contents of /code/trunk/maintain/utf8.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 97 - (show annotations)
Mon Mar 5 12:36:47 2007 UTC (13 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 4396 byte(s)
Applied Bob and Daniel's patches to convert the build system to automake. Added 
the maintain directory, containing files that are used for maintenance, but are 
not distributed. This is an intermediate step.
/* A program for converting characters to UTF-8 and vice versa */
2
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <ctype.h>
6
/* The valid ranges for UTF-8 characters are:

0000 0000  to  0000 007f   1 byte (ascii)
0000 0080  to  0000 07ff   2 bytes
0000 0800  to  0000 ffff   3 bytes
0001 0000  to  001f ffff   4 bytes
0020 0000  to  03ff ffff   5 bytes
0400 0000  to  7fff ffff   6 bytes
*/
16
17
/* Largest character value that can be encoded in (index + 1) bytes. The
encoder and the decoder both scan this table to find the shortest encoding
for a value. */

static const int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

/* Marker bits that are ORed into the first byte of an encoding, indexed by
the number of additional bytes that follow it. */

static const int utf8_table2[] = {
  0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

/* Mask that extracts the data bits from the first byte of an encoding,
indexed by the number of additional bytes that follow it. */

static const int utf8_table3[] = {
  0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* This table is not referenced anywhere in this file. NOTE(review): the
values look like the number of additional bytes that follow a first byte,
indexed by the bottom 6 bits of that byte (i.e. for first bytes 0xc0 to
0xff) - confirm before relying on it. */

static const unsigned char utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,6 };
32
33
34 /*************************************************
35 * Convert character value to UTF-8 *
36 *************************************************/
37
38 /* This function takes an integer value in the range 0 - 0x7fffffff
39 and encodes it as a UTF-8 character in 0 to 6 bytes.
40
41 Arguments:
42 cvalue the character value
43 buffer pointer to buffer for result - at least 6 bytes long
44
45 Returns: number of characters placed in the buffer
46 -1 if input character is negative
47 0 if input character is positive but too big (only when
48 int is longer than 32 bits)
49 */
50
51 int
52 ord2utf8(int cvalue, unsigned char *buffer)
53 {
54 register int i, j;
55 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
56 if (cvalue <= utf8_table1[i]) break;
57 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
58 if (cvalue < 0) return -1;
59 buffer += i;
60 for (j = i; j > 0; j--)
61 {
62 *buffer-- = 0x80 | (cvalue & 0x3f);
63 cvalue >>= 6;
64 }
65 *buffer = utf8_table2[i] | cvalue;
66 return i + 1;
67 }
68
69
70
71 /*************************************************
72 * Convert UTF-8 string to value *
73 *************************************************/
74
75 /* This function takes one or more bytes that represents a UTF-8 character,
76 and returns the value of the character.
77
78 Argument:
79 buffer a pointer to the byte vector
80 vptr a pointer to an int to receive the value
81
82 Returns: > 0 => the number of bytes consumed
83 -6 to 0 => malformed UTF-8 character at offset = (-return)
84 */
85
86 int
87 utf82ord(unsigned char *buffer, int *vptr)
88 {
89 int c = *buffer++;
90 int d = c;
91 int i, j, s;
92
93 for (i = -1; i < 6; i++) /* i is number of additional bytes */
94 {
95 if ((d & 0x80) == 0) break;
96 d <<= 1;
97 }
98
99 if (i == -1) { *vptr = c; return 1; } /* ascii character */
100 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
101
102 /* i now has a value in the range 1-5 */
103
104 s = 6*i;
105 d = (c & utf8_table3[i]) << s;
106
107 for (j = 0; j < i; j++)
108 {
109 c = *buffer++;
110 if ((c & 0xc0) != 0x80) return -(j+1);
111 s -= 6;
112 d |= (c & 0x3f) << s;
113 }
114
115 /* Check that encoding was the correct unique one */
116
117 for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
118 if (d <= utf8_table1[j]) break;
119 if (j != i) return -(i+1);
120
121 /* Valid value */
122
123 *vptr = d;
124 return i+1;
125 }
126
127
128
129
/*************************************************
*                Main Program                    *
*************************************************/

/* Each command line argument is processed independently, and one line is
written to stdout for it:

  . An argument that starts with "0x" is a character value in hexadecimal.
    It is encoded by ord2utf8() and the resulting bytes are output in hex.
    The digits must make up the whole of the rest of the argument. A value
    greater than 0x7fffffff is reported as error 0 ("too big") without
    being truncated to an int first.

  . Any other argument is a sequence of pairs of hex digits, each pair giving
    one byte of a UTF-8 character. Spaces are ignored. The bytes are decoded
    by utf82ord() and the character value is output. Only the first character
    is decoded; bytes that do not fit in the work buffer are checked for
    syntax but are otherwise ignored.

Returns:     0 always; errors are reported on stdout
*/

int
main(int argc, char **argv)
{
  int i;
  unsigned char buffer[8];

  for (i = 1; i < argc; i++)
    {
    /* Use unsigned char for the argument so that the bytes can be passed
    safely to the <ctype.h> functions. */

    const unsigned char *x = (const unsigned char *)argv[i];

    /* The prefix is tested directly rather than by strncmp() because this
    file does not include <string.h>. */

    if (x[0] == '0' && x[1] == 'x')
      {
      int j, rc;
      char *end;
      unsigned long value;

      /* Insist on a hex digit straight after the prefix. This rejects an
      empty number, and also the sign or white space that strtoul() would
      otherwise accept. */

      if (!isxdigit(x[2]))
        {
        printf("Malformed hex number: %s\n", argv[i]);
        continue;
        }

      value = strtoul(argv[i] + 2, &end, 16);
      if (*end != 0)
        {
        printf("Malformed hex number: %s\n", argv[i]);
        continue;
        }

      /* strtoul() yields ULONG_MAX on overflow, which is also caught by this
      range test. Zero is the "too big" return code of ord2utf8(). */

      rc = (value > 0x7fffffffUL)? 0 : ord2utf8((int)value, buffer);

      printf("0x%08lx => ", value);
      if (rc <= 0) printf("*** Error %d ***", rc);
      else for (j = 0; j < rc; j++) printf("%02x ", buffer[j]);
      printf("\n");
      }

    else
      {
      int d, rc;
      int j = 0;    /* number of bytes stored, or -1 after a syntax error */
      int y = 0;    /* the byte that is being assembled */
      int z = 0;    /* non-zero when the first digit of a pair has been read */

      for (;;)
        {
        while (*x == ' ') x++;
        if (*x == 0 && !z) break;

        /* This test also catches the end of the string in the middle of a
        pair of digits. */

        if (!isxdigit(*x))
          {
          printf("Malformed hex string: %s\n", argv[i]);
          j = -1;
          break;
          }

        y = y * 16 + tolower(*x) - ((isdigit(*x))? '0' : ('a' - 10));
        x++;

        if (z)
          {
          /* Keep one byte free for the terminating zero. The 7 bytes that
          can be stored are more than the 6 that utf82ord() ever reads, so
          discarding the excess cannot change the result. */

          if (j < (int)sizeof(buffer) - 1) buffer[j++] = (unsigned char)y;
          y = 0;
          }
        z ^= 1;
        }

      if (j < 0) continue;

      /* The zero byte stops utf82ord() from reading past the data when the
      sequence is too short. */

      buffer[j] = 0;
      rc = utf82ord(buffer, &d);
      if (rc > 0) printf("0x%08x <= %s\n", (unsigned int)d, argv[i]);
      else printf("Error %d <= %s\n", rc, argv[i]);
      }
    }

  return 0;
}
187
/* End */

  ViewVC Help
Powered by ViewVC 1.1.5