1 |
/***************************************************
|
2 |
* A program for testing the Unicode property table *
|
3 |
***************************************************/
|
4 |
|
5 |
/* Copyright (c) University of Cambridge 2006 */
|
6 |
|
7 |
/* Compile thus:
|
8 |
gcc -o ucptest maintain/ucptest.c pcre_ucp_searchfuncs.c
|
9 |
*/
|
10 |
|
11 |
#include <ctype.h>
|
12 |
#include <stdio.h>
|
13 |
#include <stdlib.h>
|
14 |
#include <string.h>
|
15 |
#include "pcre_internal.h"
|
16 |
#include "ucp.h"
|
17 |
#include "ucpinternal.h"
|
18 |
|
19 |
|
20 |
/* -------------------------------------------------------------------*/
|
21 |
|
22 |
/* Cast shorthands. Each macro expands to a parenthesized pointer type, so
placing one directly in front of an expression (for example "CS buffer")
casts that expression to the named type:
  CS  -> char *            CCS -> const char *            CSS -> char **
  US  -> unsigned char *   CUS -> const unsigned char *   USS -> unsigned char **
*/
#define CS (char *)
|
23 |
#define CCS (const char *)
|
24 |
#define CSS (char **)
|
25 |
#define US (unsigned char *)
|
26 |
#define CUS (const unsigned char *)
|
27 |
#define USS (unsigned char **)
|
28 |
|
29 |
/* -------------------------------------------------------------------*/
|
30 |
|
31 |
|
32 |
|
33 |
|
34 |
/*************************************************
|
35 |
* Print Unicode property info for a char *
|
36 |
*************************************************/
|
37 |
|
38 |
static void
|
39 |
print_prop(int c)
|
40 |
{
|
41 |
int fulltype, script, othercase;
|
42 |
int type = _pcre_ucp_findprop(c, &fulltype, &script);
|
43 |
|
44 |
printf("%04x ", c);
|
45 |
if (type < 0) printf("not found\n"); else
|
46 |
{
|
47 |
uschar *fulltypename = US"??";
|
48 |
uschar *typename = US"??";
|
49 |
uschar *scriptname = US"??";
|
50 |
switch (type)
|
51 |
{
|
52 |
case ucp_C: typename = US"Control"; break;
|
53 |
case ucp_L: typename = US"Letter"; break;
|
54 |
case ucp_M: typename = US"Mark"; break;
|
55 |
case ucp_N: typename = US"Number"; break;
|
56 |
case ucp_P: typename = US"Punctuation"; break;
|
57 |
case ucp_S: typename = US"Symbol"; break;
|
58 |
case ucp_Z: typename = US"Separator"; break;
|
59 |
}
|
60 |
switch (fulltype)
|
61 |
{
|
62 |
case ucp_Cc: fulltypename = US"Control"; break;
|
63 |
case ucp_Cf: fulltypename = US"Format"; break;
|
64 |
case ucp_Cn: fulltypename = US"Unassigned"; break;
|
65 |
case ucp_Co: fulltypename = US"Private use"; break;
|
66 |
case ucp_Cs: fulltypename = US"Surrogate"; break;
|
67 |
case ucp_Ll: fulltypename = US"Lower case letter"; break;
|
68 |
case ucp_Lm: fulltypename = US"Modifier letter"; break;
|
69 |
case ucp_Lo: fulltypename = US"Other letter"; break;
|
70 |
case ucp_Lt: fulltypename = US"Title case letter"; break;
|
71 |
case ucp_Lu: fulltypename = US"Upper case letter"; break;
|
72 |
case ucp_Mc: fulltypename = US"Spacing mark"; break;
|
73 |
case ucp_Me: fulltypename = US"Enclosing mark"; break;
|
74 |
case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
|
75 |
case ucp_Nd: fulltypename = US"Decimal number"; break;
|
76 |
case ucp_Nl: fulltypename = US"Letter number"; break;
|
77 |
case ucp_No: fulltypename = US"Other number"; break;
|
78 |
case ucp_Pc: fulltypename = US"Connector punctuation"; break;
|
79 |
case ucp_Pd: fulltypename = US"Dash punctuation"; break;
|
80 |
case ucp_Pe: fulltypename = US"Close punctuation"; break;
|
81 |
case ucp_Pf: fulltypename = US"Final punctuation"; break;
|
82 |
case ucp_Pi: fulltypename = US"Initial punctuation"; break;
|
83 |
case ucp_Po: fulltypename = US"Other punctuation"; break;
|
84 |
case ucp_Ps: fulltypename = US"Open punctuation"; break;
|
85 |
case ucp_Sc: fulltypename = US"Currency symbol"; break;
|
86 |
case ucp_Sk: fulltypename = US"Modifier symbol"; break;
|
87 |
case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
|
88 |
case ucp_So: fulltypename = US"Other symbol"; break;
|
89 |
case ucp_Zl: fulltypename = US"Line separator"; break;
|
90 |
case ucp_Zp: fulltypename = US"Paragraph separator"; break;
|
91 |
case ucp_Zs: fulltypename = US"Space separator"; break;
|
92 |
}
|
93 |
switch(script)
|
94 |
{
|
95 |
case ucp_Arabic: scriptname = US"Arabic"; break;
|
96 |
case ucp_Armenian: scriptname = US"Armenian"; break;
|
97 |
case ucp_Balinese: scriptname = US"Balinese"; break;
|
98 |
case ucp_Bengali: scriptname = US"Bengali"; break;
|
99 |
case ucp_Bopomofo: scriptname = US"Bopomofo"; break;
|
100 |
case ucp_Braille: scriptname = US"Braille"; break;
|
101 |
case ucp_Buginese: scriptname = US"Buginese"; break;
|
102 |
case ucp_Buhid: scriptname = US"Buhid"; break;
|
103 |
case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
|
104 |
case ucp_Cherokee: scriptname = US"Cherokee"; break;
|
105 |
case ucp_Common: scriptname = US"Common"; break;
|
106 |
case ucp_Coptic: scriptname = US"Coptic"; break;
|
107 |
case ucp_Cuneiform: scriptname = US"Cuneiform"; break;
|
108 |
case ucp_Cypriot: scriptname = US"Cypriot"; break;
|
109 |
case ucp_Cyrillic: scriptname = US"Cyrillic"; break;
|
110 |
case ucp_Deseret: scriptname = US"Deseret"; break;
|
111 |
case ucp_Devanagari: scriptname = US"Devanagari"; break;
|
112 |
case ucp_Ethiopic: scriptname = US"Ethiopic"; break;
|
113 |
case ucp_Georgian: scriptname = US"Georgian"; break;
|
114 |
case ucp_Glagolitic: scriptname = US"Glagolitic"; break;
|
115 |
case ucp_Gothic: scriptname = US"Gothic"; break;
|
116 |
case ucp_Greek: scriptname = US"Greek"; break;
|
117 |
case ucp_Gujarati: scriptname = US"Gujarati"; break;
|
118 |
case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break;
|
119 |
case ucp_Han: scriptname = US"Han"; break;
|
120 |
case ucp_Hangul: scriptname = US"Hangul"; break;
|
121 |
case ucp_Hanunoo: scriptname = US"Hanunoo"; break;
|
122 |
case ucp_Hebrew: scriptname = US"Hebrew"; break;
|
123 |
case ucp_Hiragana: scriptname = US"Hiragana"; break;
|
124 |
case ucp_Inherited: scriptname = US"Inherited"; break;
|
125 |
case ucp_Kannada: scriptname = US"Kannada"; break;
|
126 |
case ucp_Katakana: scriptname = US"Katakana"; break;
|
127 |
case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break;
|
128 |
case ucp_Khmer: scriptname = US"Khmer"; break;
|
129 |
case ucp_Lao: scriptname = US"Lao"; break;
|
130 |
case ucp_Latin: scriptname = US"Latin"; break;
|
131 |
case ucp_Limbu: scriptname = US"Limbu"; break;
|
132 |
case ucp_Linear_B: scriptname = US"Linear_B"; break;
|
133 |
case ucp_Malayalam: scriptname = US"Malayalam"; break;
|
134 |
case ucp_Mongolian: scriptname = US"Mongolian"; break;
|
135 |
case ucp_Myanmar: scriptname = US"Myanmar"; break;
|
136 |
case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
|
137 |
case ucp_Nko: scriptname = US"Nko"; break;
|
138 |
case ucp_Ogham: scriptname = US"Ogham"; break;
|
139 |
case ucp_Old_Italic: scriptname = US"Old_Italic"; break;
|
140 |
case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
|
141 |
case ucp_Oriya: scriptname = US"Oriya"; break;
|
142 |
case ucp_Osmanya: scriptname = US"Osmanya"; break;
|
143 |
case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break;
|
144 |
case ucp_Phoenician: scriptname = US"Phoenician"; break;
|
145 |
case ucp_Runic: scriptname = US"Runic"; break;
|
146 |
case ucp_Shavian: scriptname = US"Shavian"; break;
|
147 |
case ucp_Sinhala: scriptname = US"Sinhala"; break;
|
148 |
case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
|
149 |
case ucp_Syriac: scriptname = US"Syriac"; break;
|
150 |
case ucp_Tagalog: scriptname = US"Tagalog"; break;
|
151 |
case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break;
|
152 |
case ucp_Tai_Le: scriptname = US"Tai_Le"; break;
|
153 |
case ucp_Tamil: scriptname = US"Tamil"; break;
|
154 |
case ucp_Telugu: scriptname = US"Telugu"; break;
|
155 |
case ucp_Thaana: scriptname = US"Thaana"; break;
|
156 |
case ucp_Thai: scriptname = US"Thai"; break;
|
157 |
case ucp_Tibetan: scriptname = US"Tibetan"; break;
|
158 |
case ucp_Tifinagh: scriptname = US"Tifinagh"; break;
|
159 |
case ucp_Ugaritic: scriptname = US"Ugaritic"; break;
|
160 |
case ucp_Yi: scriptname = US"Yi"; break;
|
161 |
}
|
162 |
|
163 |
printf("%s: %s %s", typename, fulltypename, scriptname);
|
164 |
othercase = _pcre_ucp_othercase(c);
|
165 |
if (othercase >= 0) printf(" %04x", othercase);
|
166 |
printf("\n");
|
167 |
}
|
168 |
}
|
169 |
|
170 |
|
171 |
|
172 |
/*************************************************
|
173 |
* Main program *
|
174 |
*************************************************/
|
175 |
|
176 |
int
|
177 |
main(void)
|
178 |
{
|
179 |
uschar buffer[1024];
|
180 |
while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
|
181 |
{
|
182 |
uschar name[24];
|
183 |
uschar *s, *t;
|
184 |
|
185 |
printf("%s", buffer);
|
186 |
s = buffer;
|
187 |
while (isspace(*s)) s++;
|
188 |
if (*s == 0) continue;
|
189 |
|
190 |
for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
|
191 |
*t = 0;
|
192 |
while (isspace(*s)) s++;
|
193 |
|
194 |
if (strcmp(CS name, "findprop") == 0)
|
195 |
{
|
196 |
while (*s != 0)
|
197 |
{
|
198 |
uschar *endptr;
|
199 |
int c = strtoul(CS s, CSS(&endptr), 16);
|
200 |
print_prop(c);
|
201 |
s = endptr;
|
202 |
while (isspace(*s)) s++;
|
203 |
}
|
204 |
}
|
205 |
|
206 |
else printf("Unknown test command %s\n", name);
|
207 |
}
|
208 |
|
209 |
return 0;
|
210 |
}
|
211 |
|
212 |
/* End */
|