/[pcre]/code/trunk/maint/ucptest.c
ViewVC logotype

Contents of /code/trunk/maint/ucptest.c

Parent Directory | Revision Log


Revision 1011 - (show annotations)
Sat Aug 25 11:36:15 2012 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 11702 byte(s)
Error occurred while calculating annotation data.
Upgrade \X to match an extended grapheme cluster
1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4
5 /* Copyright (c) University of Cambridge 2008 */
6
7 /* Compile thus:
8 gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
9 */
10
11 /* The program expects to read commands on stdin, and it writes output
12 to stdout. There is only one command, "findprop", followed by a list of Unicode
13 code points as hex numbers (without any prefixes). The output is one line per
14 character, giving its Unicode properties followed by its other case if there is
15 one. */
16
17 #ifdef HAVE_CONFIG_H
18 #include "../config.h"
19 #endif
20
21 #ifndef SUPPORT_UCP
22 #define SUPPORT_UCP
23 #endif
24
25 #include <ctype.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include "../pcre_internal.h"
30 #include "../ucp.h"
31
32
33 /* -------------------------------------------------------------------*/
34
35 #define CS (char *)                 /* cast to char *; used below when handing unsigned char buffers to fgets/strcmp/strtoul */
36 #define CCS (const char *)          /* cast to const char * */
37 #define CSS (char **)               /* cast to char **; used below for strtoul's end-pointer argument */
38 #define US (unsigned char *)        /* cast to unsigned char *; used below on string literals */
39 #define CUS (const unsigned char *) /* cast to const unsigned char * */
40 #define USS (unsigned char **)      /* cast to unsigned char ** */
41
42 /* -------------------------------------------------------------------*/
43
44
45
46
47 /*************************************************
48 * Print Unicode property info for a char *
49 *************************************************/
50
51 static void
52 print_prop(int c)
53 {
54 int type = UCD_CATEGORY(c);
55 int fulltype = UCD_CHARTYPE(c);
56 int script = UCD_SCRIPT(c);
57 int gbprop = UCD_GRAPHBREAK(c);
58 int othercase = UCD_OTHERCASE(c);
59
60 unsigned char *fulltypename = US"??";
61 unsigned char *typename = US"??";
62 unsigned char *scriptname = US"??";
63 unsigned char *graphbreak = US"??";
64
65 switch (type)
66 {
67 case ucp_C: typename = US"Control"; break;
68 case ucp_L: typename = US"Letter"; break;
69 case ucp_M: typename = US"Mark"; break;
70 case ucp_N: typename = US"Number"; break;
71 case ucp_P: typename = US"Punctuation"; break;
72 case ucp_S: typename = US"Symbol"; break;
73 case ucp_Z: typename = US"Separator"; break;
74 }
75
76 switch (fulltype)
77 {
78 case ucp_Cc: fulltypename = US"Control"; break;
79 case ucp_Cf: fulltypename = US"Format"; break;
80 case ucp_Cn: fulltypename = US"Unassigned"; break;
81 case ucp_Co: fulltypename = US"Private use"; break;
82 case ucp_Cs: fulltypename = US"Surrogate"; break;
83 case ucp_Ll: fulltypename = US"Lower case letter"; break;
84 case ucp_Lm: fulltypename = US"Modifier letter"; break;
85 case ucp_Lo: fulltypename = US"Other letter"; break;
86 case ucp_Lt: fulltypename = US"Title case letter"; break;
87 case ucp_Lu: fulltypename = US"Upper case letter"; break;
88 case ucp_Mc: fulltypename = US"Spacing mark"; break;
89 case ucp_Me: fulltypename = US"Enclosing mark"; break;
90 case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
91 case ucp_Nd: fulltypename = US"Decimal number"; break;
92 case ucp_Nl: fulltypename = US"Letter number"; break;
93 case ucp_No: fulltypename = US"Other number"; break;
94 case ucp_Pc: fulltypename = US"Connector punctuation"; break;
95 case ucp_Pd: fulltypename = US"Dash punctuation"; break;
96 case ucp_Pe: fulltypename = US"Close punctuation"; break;
97 case ucp_Pf: fulltypename = US"Final punctuation"; break;
98 case ucp_Pi: fulltypename = US"Initial punctuation"; break;
99 case ucp_Po: fulltypename = US"Other punctuation"; break;
100 case ucp_Ps: fulltypename = US"Open punctuation"; break;
101 case ucp_Sc: fulltypename = US"Currency symbol"; break;
102 case ucp_Sk: fulltypename = US"Modifier symbol"; break;
103 case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
104 case ucp_So: fulltypename = US"Other symbol"; break;
105 case ucp_Zl: fulltypename = US"Line separator"; break;
106 case ucp_Zp: fulltypename = US"Paragraph separator"; break;
107 case ucp_Zs: fulltypename = US"Space separator"; break;
108 }
109
110 switch(gbprop)
111 {
112 case ucp_gbCR: graphbreak = US"CR"; break;
113 case ucp_gbLF: graphbreak = US"LF"; break;
114 case ucp_gbControl: graphbreak = US"Control"; break;
115 case ucp_gbExtend: graphbreak = US"Extend"; break;
116 case ucp_gbPrepend: graphbreak = US"Prepend"; break;
117 case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
118 case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
119 case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
120 case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
121 case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
122 case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
123 case ucp_gbOther: graphbreak = US"Other"; break;
124 }
125
126 switch(script)
127 {
128 case ucp_Arabic: scriptname = US"Arabic"; break;
129 case ucp_Armenian: scriptname = US"Armenian"; break;
130 case ucp_Balinese: scriptname = US"Balinese"; break;
131 case ucp_Bengali: scriptname = US"Bengali"; break;
132 case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
133 case ucp_Braille: scriptname = US"Braille"; break;
134 case ucp_Buginese: scriptname = US"Buginese"; break;
135 case ucp_Buhid: scriptname = US"Buhid"; break;
136 case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
137 case ucp_Cherokee: scriptname = US"Cherokee"; break;
138 case ucp_Common: scriptname = US"Common"; break;
139 case ucp_Coptic: scriptname = US"Coptic"; break;
140 case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
141 case ucp_Cypriot: scriptname = US"Cypriot"; break;
142 case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
143 case ucp_Deseret: scriptname = US"Deseret"; break;
144 case ucp_Devanagari: scriptname = US"Devanagari"; break;
145 case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
146 case ucp_Georgian: scriptname = US"Georgian"; break;
147 case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
148 case ucp_Gothic: scriptname = US"Gothic"; break;
149 case ucp_Greek: scriptname = US"Greek"; break;
150 case ucp_Gujarati: scriptname = US"Gujarati"; break;
151 case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
152 case ucp_Han: scriptname = US"Han"; break;
153 case ucp_Hangul: scriptname = US"Hangul"; break;
154 case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
155 case ucp_Hebrew: scriptname = US"Hebrew"; break;
156 case ucp_Hiragana: scriptname = US"Hiragana"; break;
157 case ucp_Inherited: scriptname = US"Inherited"; break;
158 case ucp_Kannada: scriptname = US"Kannada"; break;
159 case ucp_Katakana: scriptname = US"Katakana"; break;
160 case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
161 case ucp_Khmer: scriptname = US"Khmer"; break;
162 case ucp_Lao: scriptname = US"Lao"; break;
163 case ucp_Latin: scriptname = US"Latin"; break;
164 case ucp_Limbu: scriptname = US"Limbu"; break;
165 case ucp_Linear_B: scriptname = US"Linear_B"; break;
166 case ucp_Malayalam: scriptname = US"Malayalam"; break;
167 case ucp_Mongolian: scriptname = US"Mongolian"; break;
168 case ucp_Myanmar: scriptname = US"Myanmar"; break;
169 case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
170 case ucp_Nko: scriptname = US"Nko"; break;
171 case ucp_Ogham: scriptname = US"Ogham"; break;
172 case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
173 case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
174 case ucp_Oriya: scriptname = US"Oriya"; break;
175 case ucp_Osmanya: scriptname = US"Osmanya"; break;
176 case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
177 case ucp_Phoenician: scriptname = US"Phoenician"; break;
178 case ucp_Runic: scriptname = US"Runic"; break;
179 case ucp_Shavian: scriptname = US"Shavian"; break;
180 case ucp_Sinhala: scriptname = US"Sinhala"; break;
181 case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
182 case ucp_Syriac: scriptname = US"Syriac"; break;
183 case ucp_Tagalog: scriptname = US"Tagalog"; break;
184 case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
185 case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
186 case ucp_Tamil: scriptname = US"Tamil"; break;
187 case ucp_Telugu: scriptname = US"Telugu"; break;
188 case ucp_Thaana: scriptname = US"Thaana"; break;
189 case ucp_Thai: scriptname = US"Thai"; break;
190 case ucp_Tibetan: scriptname = US"Tibetan"; break;
191 case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
192 case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
193 case ucp_Yi: scriptname = US"Yi"; break;
194 /* New for Unicode 5.1: */
195 case ucp_Carian: scriptname = US"Carian"; break;
196 case ucp_Cham: scriptname = US"Cham"; break;
197 case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break;
198 case ucp_Lepcha: scriptname = US"Lepcha"; break;
199 case ucp_Lycian: scriptname = US"Lycian"; break;
200 case ucp_Lydian: scriptname = US"Lydian"; break;
201 case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break;
202 case ucp_Rejang: scriptname = US"Rejang"; break;
203 case ucp_Saurashtra: scriptname = US"Saurashtra"; break;
204 case ucp_Sundanese: scriptname = US"Sundanese"; break;
205 case ucp_Vai: scriptname = US"Vai"; break;
206 /* New for Unicode 5.2: */
207 case ucp_Avestan: scriptname = US"Avestan"; break;
208 case ucp_Bamum: scriptname = US"Bamum"; break;
209 case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
210 case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
211 case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
212 case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
213 case ucp_Javanese: scriptname = US"Javanese"; break;
214 case ucp_Kaithi: scriptname = US"Kaithi"; break;
215 case ucp_Lisu: scriptname = US"Lisu"; break;
216 case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
217 case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
218 case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break;
219 case ucp_Samaritan: scriptname = US"Samaritan"; break;
220 case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break;
221 case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break;
222 /* New for Unicode 6.0.0 */
223 case ucp_Batak: scriptname = US"Batak"; break;
224 case ucp_Brahmi: scriptname = US"Brahmi"; break;
225 case ucp_Mandaic: scriptname = US"Mandaic"; break;
226
227 /* New for Unicode 6.1.0 */
228 case ucp_Chakma: scriptname = US"Chakma"; break;
229 case ucp_Meroitic_Cursive: scriptname = US"Meroitic_Cursive"; break;
230 case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
231 case ucp_Miao: scriptname = US"Miao"; break;
232 case ucp_Sharada: scriptname = US"Sharada"; break;
233 case ucp_Sora_Sompeng: scriptname = US"Sora Sompent"; break;
234 case ucp_Takri: scriptname = US"Takri"; break;
235
236 }
237
238 printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
239 if (othercase != c) printf(", %04x", othercase);
240 printf("\n");
241 }
242
243
244
245 /*************************************************
246 * Main program *
247 *************************************************/
248
249 int
250 main(void)
251 {
252 unsigned char buffer[1024];
253 while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
254 {
255 unsigned char name[24];
256 unsigned char *s, *t;
257
258 printf("%s", buffer);
259 s = buffer;
260 while (isspace(*s)) s++;
261 if (*s == 0) continue;
262
263 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
264 *t = 0;
265 while (isspace(*s)) s++;
266
267 if (strcmp(CS name, "findprop") == 0)
268 {
269 while (*s != 0)
270 {
271 unsigned char *endptr;
272 int c = strtoul(CS s, CSS(&endptr), 16);
273 print_prop(c);
274 s = endptr;
275 while (isspace(*s)) s++;
276 }
277 }
278
279 else printf("Unknown test command %s\n", name);
280 }
281
282 return 0;
283 }
284
285 /* End */

  ViewVC Help
Powered by ViewVC 1.1.5