2 |
* A program for testing the Unicode property table * |
* A program for testing the Unicode property table * |
3 |
***************************************************/ |
***************************************************/ |
4 |
|
|
5 |
/* Copyright (c) University of Cambridge 2006 */ |
/* Copyright (c) University of Cambridge 2008 */ |
6 |
|
|
7 |
/* Compile thus: |
/* Compile thus: |
8 |
gcc -o ucptest ucptest.c ../pcre_ucp_searchfuncs.c |
gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c |
9 |
*/ |
*/ |
10 |
|
|
11 |
|
/* The program expects to read commands on stdin, and it writes output |
12 |
|
to stdout. There is only one command, "findprop", followed by a list of Unicode |
13 |
|
code points as hex numbers (without any prefixes). The output is one line per |
14 |
|
character, giving its Unicode properties followed by its other case if there is |
15 |
|
one. */ |
16 |
|
|
17 |
|
#ifdef HAVE_CONFIG_H |
18 |
|
#include "../config.h" |
19 |
|
#endif |
20 |
|
|
21 |
#include <ctype.h> |
#include <ctype.h> |
22 |
#include <stdio.h> |
#include <stdio.h> |
23 |
#include <stdlib.h> |
#include <stdlib.h> |
24 |
#include <string.h> |
#include <string.h> |
25 |
#include "../pcre_internal.h" |
#include "../pcre_internal.h" |
26 |
#include "../ucp.h" |
#include "../ucp.h" |
|
#include "../ucpinternal.h" |
|
27 |
|
|
28 |
|
|
29 |
/* -------------------------------------------------------------------*/ |
/* -------------------------------------------------------------------*/ |
47 |
static void |
static void |
48 |
print_prop(int c) |
print_prop(int c) |
49 |
{ |
{ |
50 |
int fulltype, script, othercase; |
int type = UCD_CATEGORY(c); |
51 |
int type = _pcre_ucp_findprop(c, &fulltype, &script); |
int fulltype = UCD_CHARTYPE(c); |
52 |
|
int script = UCD_SCRIPT(c); |
53 |
|
int othercase = UCD_OTHERCASE(c); |
54 |
|
|
55 |
|
uschar *fulltypename = US"??"; |
56 |
|
uschar *typename = US"??"; |
57 |
|
uschar *scriptname = US"??"; |
58 |
|
|
59 |
|
switch (type) |
60 |
|
{ |
61 |
|
case ucp_C: typename = US"Control"; break; |
62 |
|
case ucp_L: typename = US"Letter"; break; |
63 |
|
case ucp_M: typename = US"Mark"; break; |
64 |
|
case ucp_N: typename = US"Number"; break; |
65 |
|
case ucp_P: typename = US"Punctuation"; break; |
66 |
|
case ucp_S: typename = US"Symbol"; break; |
67 |
|
case ucp_Z: typename = US"Separator"; break; |
68 |
|
} |
69 |
|
|
70 |
printf("%04x ", c); |
switch (fulltype) |
|
if (type < 0) printf("not found\n"); else |
|
71 |
{ |
{ |
72 |
uschar *fulltypename = US"??"; |
case ucp_Cc: fulltypename = US"Control"; break; |
73 |
uschar *typename = US"??"; |
case ucp_Cf: fulltypename = US"Format"; break; |
74 |
uschar *scriptname = US"??"; |
case ucp_Cn: fulltypename = US"Unassigned"; break; |
75 |
switch (type) |
case ucp_Co: fulltypename = US"Private use"; break; |
76 |
{ |
case ucp_Cs: fulltypename = US"Surrogate"; break; |
77 |
case ucp_C: typename = US"Control"; break; |
case ucp_Ll: fulltypename = US"Lower case letter"; break; |
78 |
case ucp_L: typename = US"Letter"; break; |
case ucp_Lm: fulltypename = US"Modifier letter"; break; |
79 |
case ucp_M: typename = US"Mark"; break; |
case ucp_Lo: fulltypename = US"Other letter"; break; |
80 |
case ucp_N: typename = US"Number"; break; |
case ucp_Lt: fulltypename = US"Title case letter"; break; |
81 |
case ucp_P: typename = US"Punctuation"; break; |
case ucp_Lu: fulltypename = US"Upper case letter"; break; |
82 |
case ucp_S: typename = US"Symbol"; break; |
case ucp_Mc: fulltypename = US"Spacing mark"; break; |
83 |
case ucp_Z: typename = US"Separator"; break; |
case ucp_Me: fulltypename = US"Enclosing mark"; break; |
84 |
} |
case ucp_Mn: fulltypename = US"Non-spacing mark"; break; |
85 |
switch (fulltype) |
case ucp_Nd: fulltypename = US"Decimal number"; break; |
86 |
{ |
case ucp_Nl: fulltypename = US"Letter number"; break; |
87 |
case ucp_Cc: fulltypename = US"Control"; break; |
case ucp_No: fulltypename = US"Other number"; break; |
88 |
case ucp_Cf: fulltypename = US"Format"; break; |
case ucp_Pc: fulltypename = US"Connector punctuation"; break; |
89 |
case ucp_Cn: fulltypename = US"Unassigned"; break; |
case ucp_Pd: fulltypename = US"Dash punctuation"; break; |
90 |
case ucp_Co: fulltypename = US"Private use"; break; |
case ucp_Pe: fulltypename = US"Close punctuation"; break; |
91 |
case ucp_Cs: fulltypename = US"Surrogate"; break; |
case ucp_Pf: fulltypename = US"Final punctuation"; break; |
92 |
case ucp_Ll: fulltypename = US"Lower case letter"; break; |
case ucp_Pi: fulltypename = US"Initial punctuation"; break; |
93 |
case ucp_Lm: fulltypename = US"Modifier letter"; break; |
case ucp_Po: fulltypename = US"Other punctuation"; break; |
94 |
case ucp_Lo: fulltypename = US"Other letter"; break; |
case ucp_Ps: fulltypename = US"Open punctuation"; break; |
95 |
case ucp_Lt: fulltypename = US"Title case letter"; break; |
case ucp_Sc: fulltypename = US"Currency symbol"; break; |
96 |
case ucp_Lu: fulltypename = US"Upper case letter"; break; |
case ucp_Sk: fulltypename = US"Modifier symbol"; break; |
97 |
case ucp_Mc: fulltypename = US"Spacing mark"; break; |
case ucp_Sm: fulltypename = US"Mathematical symbol"; break; |
98 |
case ucp_Me: fulltypename = US"Enclosing mark"; break; |
case ucp_So: fulltypename = US"Other symbol"; break; |
99 |
case ucp_Mn: fulltypename = US"Non-spacing mark"; break; |
case ucp_Zl: fulltypename = US"Line separator"; break; |
100 |
case ucp_Nd: fulltypename = US"Decimal number"; break; |
case ucp_Zp: fulltypename = US"Paragraph separator"; break; |
101 |
case ucp_Nl: fulltypename = US"Letter number"; break; |
case ucp_Zs: fulltypename = US"Space separator"; break; |
|
case ucp_No: fulltypename = US"Other number"; break; |
|
|
case ucp_Pc: fulltypename = US"Connector punctuation"; break; |
|
|
case ucp_Pd: fulltypename = US"Dash punctuation"; break; |
|
|
case ucp_Pe: fulltypename = US"Close punctuation"; break; |
|
|
case ucp_Pf: fulltypename = US"Final punctuation"; break; |
|
|
case ucp_Pi: fulltypename = US"Initial punctuation"; break; |
|
|
case ucp_Po: fulltypename = US"Other punctuation"; break; |
|
|
case ucp_Ps: fulltypename = US"Open punctuation"; break; |
|
|
case ucp_Sc: fulltypename = US"Currency symbol"; break; |
|
|
case ucp_Sk: fulltypename = US"Modifier symbol"; break; |
|
|
case ucp_Sm: fulltypename = US"Mathematical symbol"; break; |
|
|
case ucp_So: fulltypename = US"Other symbol"; break; |
|
|
case ucp_Zl: fulltypename = US"Line separator"; break; |
|
|
case ucp_Zp: fulltypename = US"Paragraph separator"; break; |
|
|
case ucp_Zs: fulltypename = US"Space separator"; break; |
|
|
} |
|
|
switch(script) |
|
|
{ |
|
|
case ucp_Arabic: scriptname = US"Arabic"; break; |
|
|
case ucp_Armenian: scriptname = US"Armenian"; break; |
|
|
case ucp_Balinese: scriptname = US"Balinese"; break; |
|
|
case ucp_Bengali: scriptname = US"Bengali"; break; |
|
|
case ucp_Bopomofo: scriptname = US"Bopomofo"; break; |
|
|
case ucp_Braille: scriptname = US"Braille"; break; |
|
|
case ucp_Buginese: scriptname = US"Buginese"; break; |
|
|
case ucp_Buhid: scriptname = US"Buhid"; break; |
|
|
case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break; |
|
|
case ucp_Cherokee: scriptname = US"Cherokee"; break; |
|
|
case ucp_Common: scriptname = US"Common"; break; |
|
|
case ucp_Coptic: scriptname = US"Coptic"; break; |
|
|
case ucp_Cuneiform: scriptname = US"Cuneiform"; break; |
|
|
case ucp_Cypriot: scriptname = US"Cypriot"; break; |
|
|
case ucp_Cyrillic: scriptname = US"Cyrillic"; break; |
|
|
case ucp_Deseret: scriptname = US"Deseret"; break; |
|
|
case ucp_Devanagari: scriptname = US"Devanagari"; break; |
|
|
case ucp_Ethiopic: scriptname = US"Ethiopic"; break; |
|
|
case ucp_Georgian: scriptname = US"Georgian"; break; |
|
|
case ucp_Glagolitic: scriptname = US"Glagolitic"; break; |
|
|
case ucp_Gothic: scriptname = US"Gothic"; break; |
|
|
case ucp_Greek: scriptname = US"Greek"; break; |
|
|
case ucp_Gujarati: scriptname = US"Gujarati"; break; |
|
|
case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break; |
|
|
case ucp_Han: scriptname = US"Han"; break; |
|
|
case ucp_Hangul: scriptname = US"Hangul"; break; |
|
|
case ucp_Hanunoo: scriptname = US"Hanunoo"; break; |
|
|
case ucp_Hebrew: scriptname = US"Hebrew"; break; |
|
|
case ucp_Hiragana: scriptname = US"Hiragana"; break; |
|
|
case ucp_Inherited: scriptname = US"Inherited"; break; |
|
|
case ucp_Kannada: scriptname = US"Kannada"; break; |
|
|
case ucp_Katakana: scriptname = US"Katakana"; break; |
|
|
case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break; |
|
|
case ucp_Khmer: scriptname = US"Khmer"; break; |
|
|
case ucp_Lao: scriptname = US"Lao"; break; |
|
|
case ucp_Latin: scriptname = US"Latin"; break; |
|
|
case ucp_Limbu: scriptname = US"Limbu"; break; |
|
|
case ucp_Linear_B: scriptname = US"Linear_B"; break; |
|
|
case ucp_Malayalam: scriptname = US"Malayalam"; break; |
|
|
case ucp_Mongolian: scriptname = US"Mongolian"; break; |
|
|
case ucp_Myanmar: scriptname = US"Myanmar"; break; |
|
|
case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break; |
|
|
case ucp_Nko: scriptname = US"Nko"; break; |
|
|
case ucp_Ogham: scriptname = US"Ogham"; break; |
|
|
case ucp_Old_Italic: scriptname = US"Old_Italic"; break; |
|
|
case ucp_Old_Persian: scriptname = US"Old_Persian"; break; |
|
|
case ucp_Oriya: scriptname = US"Oriya"; break; |
|
|
case ucp_Osmanya: scriptname = US"Osmanya"; break; |
|
|
case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break; |
|
|
case ucp_Phoenician: scriptname = US"Phoenician"; break; |
|
|
case ucp_Runic: scriptname = US"Runic"; break; |
|
|
case ucp_Shavian: scriptname = US"Shavian"; break; |
|
|
case ucp_Sinhala: scriptname = US"Sinhala"; break; |
|
|
case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break; |
|
|
case ucp_Syriac: scriptname = US"Syriac"; break; |
|
|
case ucp_Tagalog: scriptname = US"Tagalog"; break; |
|
|
case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break; |
|
|
case ucp_Tai_Le: scriptname = US"Tai_Le"; break; |
|
|
case ucp_Tamil: scriptname = US"Tamil"; break; |
|
|
case ucp_Telugu: scriptname = US"Telugu"; break; |
|
|
case ucp_Thaana: scriptname = US"Thaana"; break; |
|
|
case ucp_Thai: scriptname = US"Thai"; break; |
|
|
case ucp_Tibetan: scriptname = US"Tibetan"; break; |
|
|
case ucp_Tifinagh: scriptname = US"Tifinagh"; break; |
|
|
case ucp_Ugaritic: scriptname = US"Ugaritic"; break; |
|
|
case ucp_Yi: scriptname = US"Yi"; break; |
|
|
} |
|
|
|
|
|
printf("%s: %s %s", typename, fulltypename, scriptname); |
|
|
othercase = _pcre_ucp_othercase(c); |
|
|
if (othercase >= 0) printf(" %04x", othercase); |
|
|
printf("\n"); |
|
102 |
} |
} |
103 |
|
|
104 |
|
switch(script) |
105 |
|
{ |
106 |
|
case ucp_Arabic: scriptname = US"Arabic"; break; |
107 |
|
case ucp_Armenian: scriptname = US"Armenian"; break; |
108 |
|
case ucp_Balinese: scriptname = US"Balinese"; break; |
109 |
|
case ucp_Bengali: scriptname = US"Bengali"; break; |
110 |
|
case ucp_Bopomofo: scriptname = US"Bopomofo"; break; |
111 |
|
case ucp_Braille: scriptname = US"Braille"; break; |
112 |
|
case ucp_Buginese: scriptname = US"Buginese"; break; |
113 |
|
case ucp_Buhid: scriptname = US"Buhid"; break; |
114 |
|
case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break; |
115 |
|
case ucp_Cherokee: scriptname = US"Cherokee"; break; |
116 |
|
case ucp_Common: scriptname = US"Common"; break; |
117 |
|
case ucp_Coptic: scriptname = US"Coptic"; break; |
118 |
|
case ucp_Cuneiform: scriptname = US"Cuneiform"; break; |
119 |
|
case ucp_Cypriot: scriptname = US"Cypriot"; break; |
120 |
|
case ucp_Cyrillic: scriptname = US"Cyrillic"; break; |
121 |
|
case ucp_Deseret: scriptname = US"Deseret"; break; |
122 |
|
case ucp_Devanagari: scriptname = US"Devanagari"; break; |
123 |
|
case ucp_Ethiopic: scriptname = US"Ethiopic"; break; |
124 |
|
case ucp_Georgian: scriptname = US"Georgian"; break; |
125 |
|
case ucp_Glagolitic: scriptname = US"Glagolitic"; break; |
126 |
|
case ucp_Gothic: scriptname = US"Gothic"; break; |
127 |
|
case ucp_Greek: scriptname = US"Greek"; break; |
128 |
|
case ucp_Gujarati: scriptname = US"Gujarati"; break; |
129 |
|
case ucp_Gurmukhi: scriptname = US"Gurmukhi"; break; |
130 |
|
case ucp_Han: scriptname = US"Han"; break; |
131 |
|
case ucp_Hangul: scriptname = US"Hangul"; break; |
132 |
|
case ucp_Hanunoo: scriptname = US"Hanunoo"; break; |
133 |
|
case ucp_Hebrew: scriptname = US"Hebrew"; break; |
134 |
|
case ucp_Hiragana: scriptname = US"Hiragana"; break; |
135 |
|
case ucp_Inherited: scriptname = US"Inherited"; break; |
136 |
|
case ucp_Kannada: scriptname = US"Kannada"; break; |
137 |
|
case ucp_Katakana: scriptname = US"Katakana"; break; |
138 |
|
case ucp_Kharoshthi: scriptname = US"Kharoshthi"; break; |
139 |
|
case ucp_Khmer: scriptname = US"Khmer"; break; |
140 |
|
case ucp_Lao: scriptname = US"Lao"; break; |
141 |
|
case ucp_Latin: scriptname = US"Latin"; break; |
142 |
|
case ucp_Limbu: scriptname = US"Limbu"; break; |
143 |
|
case ucp_Linear_B: scriptname = US"Linear_B"; break; |
144 |
|
case ucp_Malayalam: scriptname = US"Malayalam"; break; |
145 |
|
case ucp_Mongolian: scriptname = US"Mongolian"; break; |
146 |
|
case ucp_Myanmar: scriptname = US"Myanmar"; break; |
147 |
|
case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break; |
148 |
|
case ucp_Nko: scriptname = US"Nko"; break; |
149 |
|
case ucp_Ogham: scriptname = US"Ogham"; break; |
150 |
|
case ucp_Old_Italic: scriptname = US"Old_Italic"; break; |
151 |
|
case ucp_Old_Persian: scriptname = US"Old_Persian"; break; |
152 |
|
case ucp_Oriya: scriptname = US"Oriya"; break; |
153 |
|
case ucp_Osmanya: scriptname = US"Osmanya"; break; |
154 |
|
case ucp_Phags_Pa: scriptname = US"Phags_Pa"; break; |
155 |
|
case ucp_Phoenician: scriptname = US"Phoenician"; break; |
156 |
|
case ucp_Runic: scriptname = US"Runic"; break; |
157 |
|
case ucp_Shavian: scriptname = US"Shavian"; break; |
158 |
|
case ucp_Sinhala: scriptname = US"Sinhala"; break; |
159 |
|
case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break; |
160 |
|
case ucp_Syriac: scriptname = US"Syriac"; break; |
161 |
|
case ucp_Tagalog: scriptname = US"Tagalog"; break; |
162 |
|
case ucp_Tagbanwa: scriptname = US"Tagbanwa"; break; |
163 |
|
case ucp_Tai_Le: scriptname = US"Tai_Le"; break; |
164 |
|
case ucp_Tamil: scriptname = US"Tamil"; break; |
165 |
|
case ucp_Telugu: scriptname = US"Telugu"; break; |
166 |
|
case ucp_Thaana: scriptname = US"Thaana"; break; |
167 |
|
case ucp_Thai: scriptname = US"Thai"; break; |
168 |
|
case ucp_Tibetan: scriptname = US"Tibetan"; break; |
169 |
|
case ucp_Tifinagh: scriptname = US"Tifinagh"; break; |
170 |
|
case ucp_Ugaritic: scriptname = US"Ugaritic"; break; |
171 |
|
case ucp_Yi: scriptname = US"Yi"; break; |
172 |
|
} |
173 |
|
|
174 |
|
printf("%04x %s: %s %s", c, typename, fulltypename, scriptname); |
175 |
|
if (othercase != c) printf(" %04x", othercase); |
176 |
|
printf("\n"); |
177 |
} |
} |
178 |
|
|
179 |
|
|