1 |
/***************************************************
|
2 |
* A program for testing the Unicode property table *
|
3 |
***************************************************/
|
4 |
|
5 |
/* Copyright (c) University of Cambridge 2008 */
|
6 |
|
7 |
/* Compile thus:
|
8 |
gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
|
9 |
*/
|
10 |
|
11 |
/* The program expects to read commands on stdin, and it writes output
to stdout. Each input line is echoed before it is processed. There is only one
command, "findprop", followed by a list of Unicode code points as hex numbers
(without any prefixes). The output is one line per code point, giving its
Unicode properties followed by its other case if there is one. Any other
command word is reported as an unknown test command. */
|
16 |
|
17 |
#ifdef HAVE_CONFIG_H
|
18 |
#include "../config.h"
|
19 |
#endif
|
20 |
|
21 |
#include <ctype.h>
|
22 |
#include <stdio.h>
|
23 |
#include <stdlib.h>
|
24 |
#include <string.h>
|
25 |
#include "../pcre_internal.h"
|
26 |
#include "../ucp.h"
|
27 |
|
28 |
|
29 |
/* -------------------------------------------------------------------*/
|
30 |
|
31 |
/* Cast shorthands. Each macro expands to a parenthesized pointer type, so
writing it directly in front of an expression casts that expression:
  CS  -> char *            CCS -> const char *           CSS -> char **
  US  -> unsigned char *   CUS -> const unsigned char *  USS -> unsigned char **
In this file they are used to pass uschar buffers to the C library (CS, CSS)
and to assign string literals to uschar pointers (US). CCS, CUS and USS are
defined but not used in this file. */
#define CS (char *)
|
32 |
#define CCS (const char *)
|
33 |
#define CSS (char **)
|
34 |
#define US (unsigned char *)
|
35 |
#define CUS (const unsigned char *)
|
36 |
#define USS (unsigned char **)
|
37 |
|
38 |
/* -------------------------------------------------------------------*/
|
39 |
|
40 |
|
41 |
|
42 |
|
43 |
/*************************************************
|
44 |
* Print Unicode property info for a char *
|
45 |
*************************************************/
|
46 |
|
47 |
static void
|
48 |
print_prop(int c)
|
49 |
{
|
50 |
int type = UCD_CATEGORY(c);
|
51 |
int fulltype = UCD_CHARTYPE(c);
|
52 |
int script = UCD_SCRIPT(c);
|
53 |
int othercase = UCD_OTHERCASE(c);
|
54 |
|
55 |
uschar *fulltypename = US"??";
|
56 |
uschar *typename = US"??";
|
57 |
uschar *scriptname = US"??";
|
58 |
|
59 |
switch (type)
|
60 |
{
|
61 |
case ucp_C: typename = US"Control"; break;
|
62 |
case ucp_L: typename = US"Letter"; break;
|
63 |
case ucp_M: typename = US"Mark"; break;
|
64 |
case ucp_N: typename = US"Number"; break;
|
65 |
case ucp_P: typename = US"Punctuation"; break;
|
66 |
case ucp_S: typename = US"Symbol"; break;
|
67 |
case ucp_Z: typename = US"Separator"; break;
|
68 |
}
|
69 |
|
70 |
switch (fulltype)
|
71 |
{
|
72 |
case ucp_Cc: fulltypename = US"Control"; break;
|
73 |
case ucp_Cf: fulltypename = US"Format"; break;
|
74 |
case ucp_Cn: fulltypename = US"Unassigned"; break;
|
75 |
case ucp_Co: fulltypename = US"Private use"; break;
|
76 |
case ucp_Cs: fulltypename = US"Surrogate"; break;
|
77 |
case ucp_Ll: fulltypename = US"Lower case letter"; break;
|
78 |
case ucp_Lm: fulltypename = US"Modifier letter"; break;
|
79 |
case ucp_Lo: fulltypename = US"Other letter"; break;
|
80 |
case ucp_Lt: fulltypename = US"Title case letter"; break;
|
81 |
case ucp_Lu: fulltypename = US"Upper case letter"; break;
|
82 |
case ucp_Mc: fulltypename = US"Spacing mark"; break;
|
83 |
case ucp_Me: fulltypename = US"Enclosing mark"; break;
|
84 |
case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
|
85 |
case ucp_Nd: fulltypename = US"Decimal number"; break;
|
86 |
case ucp_Nl: fulltypename = US"Letter number"; break;
|
87 |
case ucp_No: fulltypename = US"Other number"; break;
|
88 |
case ucp_Pc: fulltypename = US"Connector punctuation"; break;
|
89 |
case ucp_Pd: fulltypename = US"Dash punctuation"; break;
|
90 |
case ucp_Pe: fulltypename = US"Close punctuation"; break;
|
91 |
case ucp_Pf: fulltypename = US"Final punctuation"; break;
|
92 |
case ucp_Pi: fulltypename = US"Initial punctuation"; break;
|
93 |
case ucp_Po: fulltypename = US"Other punctuation"; break;
|
94 |
case ucp_Ps: fulltypename = US"Open punctuation"; break;
|
95 |
case ucp_Sc: fulltypename = US"Currency symbol"; break;
|
96 |
case ucp_Sk: fulltypename = US"Modifier symbol"; break;
|
97 |
case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
|
98 |
case ucp_So: fulltypename = US"Other symbol"; break;
|
99 |
case ucp_Zl: fulltypename = US"Line separator"; break;
|
100 |
case ucp_Zp: fulltypename = US"Paragraph separator"; break;
|
101 |
case ucp_Zs: fulltypename = US"Space separator"; break;
|
102 |
}
|
103 |
|
104 |
switch(script)
|
105 |
{
|
106 |
case ucp_Arabic: scriptname = US"Arabic"; break;
|
107 |
case ucp_Armenian: scriptname = US"Armenian"; break;
|
108 |
case ucp_Balinese: scriptname = US"Balinese"; break;
|
109 |
case ucp_Bengali: scriptname = US"Bengali"; break;
|
110 |
case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
|
111 |
case ucp_Braille: scriptname = US"Braille"; break;
|
112 |
case ucp_Buginese: scriptname = US"Buginese"; break;
|
113 |
case ucp_Buhid: scriptname = US"Buhid"; break;
|
114 |
case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
|
115 |
case ucp_Cherokee: scriptname = US"Cherokee"; break;
|
116 |
case ucp_Common: scriptname = US"Common"; break;
|
117 |
case ucp_Coptic: scriptname = US"Coptic"; break;
|
118 |
case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
|
119 |
case ucp_Cypriot: scriptname = US"Cypriot"; break;
|
120 |
case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
|
121 |
case ucp_Deseret: scriptname = US"Deseret"; break;
|
122 |
case ucp_Devanagari: scriptname = US"Devanagari"; break;
|
123 |
case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
|
124 |
case ucp_Georgian: scriptname = US"Georgian"; break;
|
125 |
case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
|
126 |
case ucp_Gothic: scriptname = US"Gothic"; break;
|
127 |
case ucp_Greek: scriptname = US"Greek"; break;
|
128 |
case ucp_Gujarati: scriptname = US"Gujarati"; break;
|
129 |
case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
|
130 |
case ucp_Han: scriptname = US"Han"; break;
|
131 |
case ucp_Hangul: scriptname = US"Hangul"; break;
|
132 |
case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
|
133 |
case ucp_Hebrew: scriptname = US"Hebrew"; break;
|
134 |
case ucp_Hiragana: scriptname = US"Hiragana"; break;
|
135 |
case ucp_Inherited: scriptname = US"Inherited"; break;
|
136 |
case ucp_Kannada: scriptname = US"Kannada"; break;
|
137 |
case ucp_Katakana: scriptname = US"Katakana"; break;
|
138 |
case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
|
139 |
case ucp_Khmer: scriptname = US"Khmer"; break;
|
140 |
case ucp_Lao: scriptname = US"Lao"; break;
|
141 |
case ucp_Latin: scriptname = US"Latin"; break;
|
142 |
case ucp_Limbu: scriptname = US"Limbu"; break;
|
143 |
case ucp_Linear_B: scriptname = US"Linear_B"; break;
|
144 |
case ucp_Malayalam: scriptname = US"Malayalam"; break;
|
145 |
case ucp_Mongolian: scriptname = US"Mongolian"; break;
|
146 |
case ucp_Myanmar: scriptname = US"Myanmar"; break;
|
147 |
case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
|
148 |
case ucp_Nko: scriptname = US"Nko"; break;
|
149 |
case ucp_Ogham: scriptname = US"Ogham"; break;
|
150 |
case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
|
151 |
case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
|
152 |
case ucp_Oriya: scriptname = US"Oriya"; break;
|
153 |
case ucp_Osmanya: scriptname = US"Osmanya"; break;
|
154 |
case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
|
155 |
case ucp_Phoenician: scriptname = US"Phoenician"; break;
|
156 |
case ucp_Runic: scriptname = US"Runic"; break;
|
157 |
case ucp_Shavian: scriptname = US"Shavian"; break;
|
158 |
case ucp_Sinhala: scriptname = US"Sinhala"; break;
|
159 |
case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
|
160 |
case ucp_Syriac: scriptname = US"Syriac"; break;
|
161 |
case ucp_Tagalog: scriptname = US"Tagalog"; break;
|
162 |
case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
|
163 |
case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
|
164 |
case ucp_Tamil: scriptname = US"Tamil"; break;
|
165 |
case ucp_Telugu: scriptname = US"Telugu"; break;
|
166 |
case ucp_Thaana: scriptname = US"Thaana"; break;
|
167 |
case ucp_Thai: scriptname = US"Thai"; break;
|
168 |
case ucp_Tibetan: scriptname = US"Tibetan"; break;
|
169 |
case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
|
170 |
case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
|
171 |
case ucp_Yi: scriptname = US"Yi"; break;
|
172 |
}
|
173 |
|
174 |
printf("%04x %s: %s %s", c, typename, fulltypename, scriptname);
|
175 |
if (othercase != c) printf(" %04x", othercase);
|
176 |
printf("\n");
|
177 |
}
|
178 |
|
179 |
|
180 |
|
181 |
/*************************************************
|
182 |
* Main program *
|
183 |
*************************************************/
|
184 |
|
185 |
int
|
186 |
main(void)
|
187 |
{
|
188 |
uschar buffer[1024];
|
189 |
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
|
190 |
{
|
191 |
uschar name[24];
|
192 |
uschar *s, *t;
|
193 |
|
194 |
printf("%s", buffer);
|
195 |
s = buffer;
|
196 |
while (isspace(*s)) s++;
|
197 |
if (*s == 0) continue;
|
198 |
|
199 |
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
200 |
*t = 0;
|
201 |
while (isspace(*s)) s++;
|
202 |
|
203 |
if (strcmp(CS name, "findprop") == 0)
|
204 |
{
|
205 |
while (*s != 0)
|
206 |
{
|
207 |
uschar *endptr;
|
208 |
int c = strtoul(CS s, CSS(&endptr), 16);
|
209 |
print_prop(c);
|
210 |
s = endptr;
|
211 |
while (isspace(*s)) s++;
|
212 |
}
|
213 |
}
|
214 |
|
215 |
else printf("Unknown test command %s\n", name);
|
216 |
}
|
217 |
|
218 |
return 0;
|
219 |
}
|
220 |
|
221 |
/* End */
|