/[pcre]/code/trunk/maint/ucptest.c
ViewVC logotype

Contents of /code/trunk/maint/ucptest.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 943 - (show annotations)
Tue Feb 28 15:02:51 2012 UTC (7 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 10801 byte(s)
Error occurred while calculating annotation data.
Update ucptest maintenance program for Unicode 6.1.0.
1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4
5 /* Copyright (c) University of Cambridge 2008 */
6
7 /* Compile thus:
8 gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
9 */
10
11 /* The program expects to read commands on stdin, and it writes output
12 to stdout. There is only one command, "findprop", followed by a list of Unicode
13 code points as hex numbers (without any prefixes). The output is one line per
14 character, giving its Unicode properties followed by its other case if there is
15 one. */
16
17 #ifdef HAVE_CONFIG_H
18 #include "../config.h"
19 #endif
20
21 #ifndef SUPPORT_UCP
22 #define SUPPORT_UCP
23 #endif
24
25 #include <ctype.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include "../pcre_internal.h"
30 #include "../ucp.h"
31
32
33 /* -------------------------------------------------------------------*/
34
/* Cast shorthands. Each macro expands to a bare pointer cast and is written
directly in front of the expression it converts, for example US"??" or
CS buffer. The "C" prefix gives the const-qualified form and the doubled final
letter gives the pointer-to-pointer form. */
35 #define CS (char *)
36 #define CCS (const char *)
37 #define CSS (char **)
38 #define US (unsigned char *)
39 #define CUS (const unsigned char *)
40 #define USS (unsigned char **)
41
42 /* -------------------------------------------------------------------*/
43
44
45
46
47 /*************************************************
48 * Print Unicode property info for a char *
49 *************************************************/
50
51 static void
52 print_prop(int c)
53 {
54 int type = UCD_CATEGORY(c);
55 int fulltype = UCD_CHARTYPE(c);
56 int script = UCD_SCRIPT(c);
57 int othercase = UCD_OTHERCASE(c);
58
59 unsigned char *fulltypename = US"??";
60 unsigned char *typename = US"??";
61 unsigned char *scriptname = US"??";
62
63 switch (type)
64 {
65 case ucp_C: typename = US"Control"; break;
66 case ucp_L: typename = US"Letter"; break;
67 case ucp_M: typename = US"Mark"; break;
68 case ucp_N: typename = US"Number"; break;
69 case ucp_P: typename = US"Punctuation"; break;
70 case ucp_S: typename = US"Symbol"; break;
71 case ucp_Z: typename = US"Separator"; break;
72 }
73
74 switch (fulltype)
75 {
76 case ucp_Cc: fulltypename = US"Control"; break;
77 case ucp_Cf: fulltypename = US"Format"; break;
78 case ucp_Cn: fulltypename = US"Unassigned"; break;
79 case ucp_Co: fulltypename = US"Private use"; break;
80 case ucp_Cs: fulltypename = US"Surrogate"; break;
81 case ucp_Ll: fulltypename = US"Lower case letter"; break;
82 case ucp_Lm: fulltypename = US"Modifier letter"; break;
83 case ucp_Lo: fulltypename = US"Other letter"; break;
84 case ucp_Lt: fulltypename = US"Title case letter"; break;
85 case ucp_Lu: fulltypename = US"Upper case letter"; break;
86 case ucp_Mc: fulltypename = US"Spacing mark"; break;
87 case ucp_Me: fulltypename = US"Enclosing mark"; break;
88 case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
89 case ucp_Nd: fulltypename = US"Decimal number"; break;
90 case ucp_Nl: fulltypename = US"Letter number"; break;
91 case ucp_No: fulltypename = US"Other number"; break;
92 case ucp_Pc: fulltypename = US"Connector punctuation"; break;
93 case ucp_Pd: fulltypename = US"Dash punctuation"; break;
94 case ucp_Pe: fulltypename = US"Close punctuation"; break;
95 case ucp_Pf: fulltypename = US"Final punctuation"; break;
96 case ucp_Pi: fulltypename = US"Initial punctuation"; break;
97 case ucp_Po: fulltypename = US"Other punctuation"; break;
98 case ucp_Ps: fulltypename = US"Open punctuation"; break;
99 case ucp_Sc: fulltypename = US"Currency symbol"; break;
100 case ucp_Sk: fulltypename = US"Modifier symbol"; break;
101 case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
102 case ucp_So: fulltypename = US"Other symbol"; break;
103 case ucp_Zl: fulltypename = US"Line separator"; break;
104 case ucp_Zp: fulltypename = US"Paragraph separator"; break;
105 case ucp_Zs: fulltypename = US"Space separator"; break;
106 }
107
108 switch(script)
109 {
110 case ucp_Arabic: scriptname = US"Arabic"; break;
111 case ucp_Armenian: scriptname = US"Armenian"; break;
112 case ucp_Balinese: scriptname = US"Balinese"; break;
113 case ucp_Bengali: scriptname = US"Bengali"; break;
114 case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
115 case ucp_Braille: scriptname = US"Braille"; break;
116 case ucp_Buginese: scriptname = US"Buginese"; break;
117 case ucp_Buhid: scriptname = US"Buhid"; break;
118 case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
119 case ucp_Cherokee: scriptname = US"Cherokee"; break;
120 case ucp_Common: scriptname = US"Common"; break;
121 case ucp_Coptic: scriptname = US"Coptic"; break;
122 case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
123 case ucp_Cypriot: scriptname = US"Cypriot"; break;
124 case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
125 case ucp_Deseret: scriptname = US"Deseret"; break;
126 case ucp_Devanagari: scriptname = US"Devanagari"; break;
127 case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
128 case ucp_Georgian: scriptname = US"Georgian"; break;
129 case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
130 case ucp_Gothic: scriptname = US"Gothic"; break;
131 case ucp_Greek: scriptname = US"Greek"; break;
132 case ucp_Gujarati: scriptname = US"Gujarati"; break;
133 case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
134 case ucp_Han: scriptname = US"Han"; break;
135 case ucp_Hangul: scriptname = US"Hangul"; break;
136 case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
137 case ucp_Hebrew: scriptname = US"Hebrew"; break;
138 case ucp_Hiragana: scriptname = US"Hiragana"; break;
139 case ucp_Inherited: scriptname = US"Inherited"; break;
140 case ucp_Kannada: scriptname = US"Kannada"; break;
141 case ucp_Katakana: scriptname = US"Katakana"; break;
142 case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
143 case ucp_Khmer: scriptname = US"Khmer"; break;
144 case ucp_Lao: scriptname = US"Lao"; break;
145 case ucp_Latin: scriptname = US"Latin"; break;
146 case ucp_Limbu: scriptname = US"Limbu"; break;
147 case ucp_Linear_B: scriptname = US"Linear_B"; break;
148 case ucp_Malayalam: scriptname = US"Malayalam"; break;
149 case ucp_Mongolian: scriptname = US"Mongolian"; break;
150 case ucp_Myanmar: scriptname = US"Myanmar"; break;
151 case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
152 case ucp_Nko: scriptname = US"Nko"; break;
153 case ucp_Ogham: scriptname = US"Ogham"; break;
154 case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
155 case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
156 case ucp_Oriya: scriptname = US"Oriya"; break;
157 case ucp_Osmanya: scriptname = US"Osmanya"; break;
158 case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
159 case ucp_Phoenician: scriptname = US"Phoenician"; break;
160 case ucp_Runic: scriptname = US"Runic"; break;
161 case ucp_Shavian: scriptname = US"Shavian"; break;
162 case ucp_Sinhala: scriptname = US"Sinhala"; break;
163 case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
164 case ucp_Syriac: scriptname = US"Syriac"; break;
165 case ucp_Tagalog: scriptname = US"Tagalog"; break;
166 case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
167 case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
168 case ucp_Tamil: scriptname = US"Tamil"; break;
169 case ucp_Telugu: scriptname = US"Telugu"; break;
170 case ucp_Thaana: scriptname = US"Thaana"; break;
171 case ucp_Thai: scriptname = US"Thai"; break;
172 case ucp_Tibetan: scriptname = US"Tibetan"; break;
173 case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
174 case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
175 case ucp_Yi: scriptname = US"Yi"; break;
176 /* New for Unicode 5.1: */
177 case ucp_Carian: scriptname = US"Carian"; break;
178 case ucp_Cham: scriptname = US"Cham"; break;
179 case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break;
180 case ucp_Lepcha: scriptname = US"Lepcha"; break;
181 case ucp_Lycian: scriptname = US"Lycian"; break;
182 case ucp_Lydian: scriptname = US"Lydian"; break;
183 case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break;
184 case ucp_Rejang: scriptname = US"Rejang"; break;
185 case ucp_Saurashtra: scriptname = US"Saurashtra"; break;
186 case ucp_Sundanese: scriptname = US"Sundanese"; break;
187 case ucp_Vai: scriptname = US"Vai"; break;
188 /* New for Unicode 5.2: */
189 case ucp_Avestan: scriptname = US"Avestan"; break;
190 case ucp_Bamum: scriptname = US"Bamum"; break;
191 case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
192 case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
193 case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
194 case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
195 case ucp_Javanese: scriptname = US"Javanese"; break;
196 case ucp_Kaithi: scriptname = US"Kaithi"; break;
197 case ucp_Lisu: scriptname = US"Lisu"; break;
198 case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
199 case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
200 case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break;
201 case ucp_Samaritan: scriptname = US"Samaritan"; break;
202 case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break;
203 case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break;
204 /* New for Unicode 6.0.0 */
205 case ucp_Batak: scriptname = US"Batak"; break;
206 case ucp_Brahmi: scriptname = US"Brahmi"; break;
207 case ucp_Mandaic: scriptname = US"Mandaic"; break;
208
209 /* New for Unicode 6.1.0 */
210 case ucp_Chakma: scriptname = US"Chakma"; break;
211 case ucp_Meroitic_Cursive: scriptname = US"Meroitic_Cursive"; break;
212 case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
213 case ucp_Miao: scriptname = US"Miao"; break;
214 case ucp_Sharada: scriptname = US"Sharada"; break;
215 case ucp_Sora_Sompeng: scriptname = US"Sora Sompent"; break;
216 case ucp_Takri: scriptname = US"Takri"; break;
217
218 }
219
220 printf("%04x %s: %s %s", c, typename, fulltypename, scriptname);
221 if (othercase != c) printf(" %04x", othercase);
222 printf("\n");
223 }
224
225
226
227 /*************************************************
228 * Main program *
229 *************************************************/
230
231 int
232 main(void)
233 {
234 unsigned char buffer[1024];
235 while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
236 {
237 unsigned char name[24];
238 unsigned char *s, *t;
239
240 printf("%s", buffer);
241 s = buffer;
242 while (isspace(*s)) s++;
243 if (*s == 0) continue;
244
245 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
246 *t = 0;
247 while (isspace(*s)) s++;
248
249 if (strcmp(CS name, "findprop") == 0)
250 {
251 while (*s != 0)
252 {
253 unsigned char *endptr;
254 int c = strtoul(CS s, CSS(&endptr), 16);
255 print_prop(c);
256 s = endptr;
257 while (isspace(*s)) s++;
258 }
259 }
260
261 else printf("Unknown test command %s\n", name);
262 }
263
264 return 0;
265 }
266
267 /* End */

  ViewVC Help
Powered by ViewVC 1.1.5