/[pcre]/code/trunk/maint/ucptest.c
ViewVC logotype

Contents of /code/trunk/maint/ucptest.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1045 - (show annotations)
Sun Sep 23 16:50:00 2012 UTC (7 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 11935 byte(s)
Update character class handling to use new character case information; rework 
\h, \H, \v, and \V to use the same apparatus with centrally defined lists.
1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4
5 /* Copyright (c) University of Cambridge 2008 */
6
7 /* Compile thus:
8 gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
9 */
10
11 /* The program expects to read commands on stdin, and it writes output
12 to stdout. There is only one command, "findprop", followed by a list of Unicode
13 code points as hex numbers (without any prefixes). The output is one line per
14 character, giving its Unicode properties followed by its other case if there is
15 one. */
16
17 #ifdef HAVE_CONFIG_H
18 #include "../config.h"
19 #endif
20
21 #ifndef SUPPORT_UCP
22 #define SUPPORT_UCP
23 #endif
24
25 #include <ctype.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include "../pcre_internal.h"
30 #include "../ucp.h"
31
32
33 /* -------------------------------------------------------------------*/
34
35 #define CS (char *) /* cast prefix: to char * (used for fgets/strcmp/strtoul arguments) */
36 #define CCS (const char *) /* cast prefix: to const char * */
37 #define CSS (char **) /* cast prefix: to char ** (used for the strtoul end pointer) */
38 #define US (unsigned char *) /* cast prefix: to unsigned char * (used on string literals) */
39 #define CUS (const unsigned char *) /* cast prefix: to const unsigned char * */
40 #define USS (unsigned char **) /* cast prefix: to unsigned char ** */
41
42 /* -------------------------------------------------------------------*/
43
44
45
46
47 /*************************************************
48 * Print Unicode property info for a char *
49 *************************************************/
50
51 static void
52 print_prop(int c)
53 {
54 int type = UCD_CATEGORY(c);
55 int fulltype = UCD_CHARTYPE(c);
56 int script = UCD_SCRIPT(c);
57 int gbprop = UCD_GRAPHBREAK(c);
58 int othercase = UCD_OTHERCASE(c);
59 int caseset = UCD_CASESET(c);
60
61 unsigned char *fulltypename = US"??";
62 unsigned char *typename = US"??";
63 unsigned char *scriptname = US"??";
64 unsigned char *graphbreak = US"??";
65
66 switch (type)
67 {
68 case ucp_C: typename = US"Control"; break;
69 case ucp_L: typename = US"Letter"; break;
70 case ucp_M: typename = US"Mark"; break;
71 case ucp_N: typename = US"Number"; break;
72 case ucp_P: typename = US"Punctuation"; break;
73 case ucp_S: typename = US"Symbol"; break;
74 case ucp_Z: typename = US"Separator"; break;
75 }
76
77 switch (fulltype)
78 {
79 case ucp_Cc: fulltypename = US"Control"; break;
80 case ucp_Cf: fulltypename = US"Format"; break;
81 case ucp_Cn: fulltypename = US"Unassigned"; break;
82 case ucp_Co: fulltypename = US"Private use"; break;
83 case ucp_Cs: fulltypename = US"Surrogate"; break;
84 case ucp_Ll: fulltypename = US"Lower case letter"; break;
85 case ucp_Lm: fulltypename = US"Modifier letter"; break;
86 case ucp_Lo: fulltypename = US"Other letter"; break;
87 case ucp_Lt: fulltypename = US"Title case letter"; break;
88 case ucp_Lu: fulltypename = US"Upper case letter"; break;
89 case ucp_Mc: fulltypename = US"Spacing mark"; break;
90 case ucp_Me: fulltypename = US"Enclosing mark"; break;
91 case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
92 case ucp_Nd: fulltypename = US"Decimal number"; break;
93 case ucp_Nl: fulltypename = US"Letter number"; break;
94 case ucp_No: fulltypename = US"Other number"; break;
95 case ucp_Pc: fulltypename = US"Connector punctuation"; break;
96 case ucp_Pd: fulltypename = US"Dash punctuation"; break;
97 case ucp_Pe: fulltypename = US"Close punctuation"; break;
98 case ucp_Pf: fulltypename = US"Final punctuation"; break;
99 case ucp_Pi: fulltypename = US"Initial punctuation"; break;
100 case ucp_Po: fulltypename = US"Other punctuation"; break;
101 case ucp_Ps: fulltypename = US"Open punctuation"; break;
102 case ucp_Sc: fulltypename = US"Currency symbol"; break;
103 case ucp_Sk: fulltypename = US"Modifier symbol"; break;
104 case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
105 case ucp_So: fulltypename = US"Other symbol"; break;
106 case ucp_Zl: fulltypename = US"Line separator"; break;
107 case ucp_Zp: fulltypename = US"Paragraph separator"; break;
108 case ucp_Zs: fulltypename = US"Space separator"; break;
109 }
110
111 switch(gbprop)
112 {
113 case ucp_gbCR: graphbreak = US"CR"; break;
114 case ucp_gbLF: graphbreak = US"LF"; break;
115 case ucp_gbControl: graphbreak = US"Control"; break;
116 case ucp_gbExtend: graphbreak = US"Extend"; break;
117 case ucp_gbPrepend: graphbreak = US"Prepend"; break;
118 case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
119 case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
120 case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
121 case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
122 case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
123 case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
124 case ucp_gbOther: graphbreak = US"Other"; break;
125 }
126
127 switch(script)
128 {
129 case ucp_Arabic: scriptname = US"Arabic"; break;
130 case ucp_Armenian: scriptname = US"Armenian"; break;
131 case ucp_Balinese: scriptname = US"Balinese"; break;
132 case ucp_Bengali: scriptname = US"Bengali"; break;
133 case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
134 case ucp_Braille: scriptname = US"Braille"; break;
135 case ucp_Buginese: scriptname = US"Buginese"; break;
136 case ucp_Buhid: scriptname = US"Buhid"; break;
137 case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
138 case ucp_Cherokee: scriptname = US"Cherokee"; break;
139 case ucp_Common: scriptname = US"Common"; break;
140 case ucp_Coptic: scriptname = US"Coptic"; break;
141 case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
142 case ucp_Cypriot: scriptname = US"Cypriot"; break;
143 case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
144 case ucp_Deseret: scriptname = US"Deseret"; break;
145 case ucp_Devanagari: scriptname = US"Devanagari"; break;
146 case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
147 case ucp_Georgian: scriptname = US"Georgian"; break;
148 case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
149 case ucp_Gothic: scriptname = US"Gothic"; break;
150 case ucp_Greek: scriptname = US"Greek"; break;
151 case ucp_Gujarati: scriptname = US"Gujarati"; break;
152 case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
153 case ucp_Han: scriptname = US"Han"; break;
154 case ucp_Hangul: scriptname = US"Hangul"; break;
155 case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
156 case ucp_Hebrew: scriptname = US"Hebrew"; break;
157 case ucp_Hiragana: scriptname = US"Hiragana"; break;
158 case ucp_Inherited: scriptname = US"Inherited"; break;
159 case ucp_Kannada: scriptname = US"Kannada"; break;
160 case ucp_Katakana: scriptname = US"Katakana"; break;
161 case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
162 case ucp_Khmer: scriptname = US"Khmer"; break;
163 case ucp_Lao: scriptname = US"Lao"; break;
164 case ucp_Latin: scriptname = US"Latin"; break;
165 case ucp_Limbu: scriptname = US"Limbu"; break;
166 case ucp_Linear_B: scriptname = US"Linear_B"; break;
167 case ucp_Malayalam: scriptname = US"Malayalam"; break;
168 case ucp_Mongolian: scriptname = US"Mongolian"; break;
169 case ucp_Myanmar: scriptname = US"Myanmar"; break;
170 case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
171 case ucp_Nko: scriptname = US"Nko"; break;
172 case ucp_Ogham: scriptname = US"Ogham"; break;
173 case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
174 case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
175 case ucp_Oriya: scriptname = US"Oriya"; break;
176 case ucp_Osmanya: scriptname = US"Osmanya"; break;
177 case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
178 case ucp_Phoenician: scriptname = US"Phoenician"; break;
179 case ucp_Runic: scriptname = US"Runic"; break;
180 case ucp_Shavian: scriptname = US"Shavian"; break;
181 case ucp_Sinhala: scriptname = US"Sinhala"; break;
182 case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
183 case ucp_Syriac: scriptname = US"Syriac"; break;
184 case ucp_Tagalog: scriptname = US"Tagalog"; break;
185 case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
186 case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
187 case ucp_Tamil: scriptname = US"Tamil"; break;
188 case ucp_Telugu: scriptname = US"Telugu"; break;
189 case ucp_Thaana: scriptname = US"Thaana"; break;
190 case ucp_Thai: scriptname = US"Thai"; break;
191 case ucp_Tibetan: scriptname = US"Tibetan"; break;
192 case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
193 case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
194 case ucp_Yi: scriptname = US"Yi"; break;
195 /* New for Unicode 5.1: */
196 case ucp_Carian: scriptname = US"Carian"; break;
197 case ucp_Cham: scriptname = US"Cham"; break;
198 case ucp_Kayah_Li: scriptname = US"Kayah_Li"; break;
199 case ucp_Lepcha: scriptname = US"Lepcha"; break;
200 case ucp_Lycian: scriptname = US"Lycian"; break;
201 case ucp_Lydian: scriptname = US"Lydian"; break;
202 case ucp_Ol_Chiki: scriptname = US"Ol_Chiki"; break;
203 case ucp_Rejang: scriptname = US"Rejang"; break;
204 case ucp_Saurashtra: scriptname = US"Saurashtra"; break;
205 case ucp_Sundanese: scriptname = US"Sundanese"; break;
206 case ucp_Vai: scriptname = US"Vai"; break;
207 /* New for Unicode 5.2: */
208 case ucp_Avestan: scriptname = US"Avestan"; break;
209 case ucp_Bamum: scriptname = US"Bamum"; break;
210 case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
211 case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
212 case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
213 case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
214 case ucp_Javanese: scriptname = US"Javanese"; break;
215 case ucp_Kaithi: scriptname = US"Kaithi"; break;
216 case ucp_Lisu: scriptname = US"Lisu"; break;
217 case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
218 case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
219 case ucp_Old_Turkic: scriptname = US"Old_Turkic"; break;
220 case ucp_Samaritan: scriptname = US"Samaritan"; break;
221 case ucp_Tai_Tham: scriptname = US"Tai_Tham"; break;
222 case ucp_Tai_Viet: scriptname = US"Tai_Viet"; break;
223 /* New for Unicode 6.0.0 */
224 case ucp_Batak: scriptname = US"Batak"; break;
225 case ucp_Brahmi: scriptname = US"Brahmi"; break;
226 case ucp_Mandaic: scriptname = US"Mandaic"; break;
227
228 /* New for Unicode 6.1.0 */
229 case ucp_Chakma: scriptname = US"Chakma"; break;
230 case ucp_Meroitic_Cursive: scriptname = US"Meroitic_Cursive"; break;
231 case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
232 case ucp_Miao: scriptname = US"Miao"; break;
233 case ucp_Sharada: scriptname = US"Sharada"; break;
234 case ucp_Sora_Sompeng: scriptname = US"Sora Sompent"; break;
235 case ucp_Takri: scriptname = US"Takri"; break;
236
237 }
238
239 printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
240 if (othercase != c)
241 {
242 printf(", %04x", othercase);
243 if (caseset != 0)
244 {
245 const pcre_uint32 *p = PRIV(ucd_caseless_sets) + caseset - 1;
246 while (*(++p) < NOTACHAR)
247 if (*p != othercase && *p != c) printf(", %04x", *p);
248 }
249 }
250 printf("\n");
251 }
252
253
254
255 /*************************************************
256 * Main program *
257 *************************************************/
258
259 int
260 main(void)
261 {
262 unsigned char buffer[1024];
263 while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
264 {
265 unsigned char name[24];
266 unsigned char *s, *t;
267
268 printf("%s", buffer);
269 s = buffer;
270 while (isspace(*s)) s++;
271 if (*s == 0) continue;
272
273 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
274 *t = 0;
275 while (isspace(*s)) s++;
276
277 if (strcmp(CS name, "findprop") == 0)
278 {
279 while (*s != 0)
280 {
281 unsigned char *endptr;
282 int c = strtoul(CS s, CSS(&endptr), 16);
283 print_prop(c);
284 s = endptr;
285 while (isspace(*s)) s++;
286 }
287 }
288
289 else printf("Unknown test command %s\n", name);
290 }
291
292 return 0;
293 }
294
295 /* End */

  ViewVC Help
Powered by ViewVC 1.1.5