/[pcre]/code/trunk/maint/ucptest.c
ViewVC logotype

Diff of /code/trunk/maint/ucptest.c

Parent Directory | Revision Log | View Patch

code/trunk/maintain/ucptest.c revision 97 by ph10, Mon Mar 5 12:36:47 2007 UTC | code/trunk/maint/ucptest.c revision 1045 by ph10, Sun Sep 23 16:50:00 2012 UTC
# Line 2  Line 2 
2  * A program for testing the Unicode property table *  * A program for testing the Unicode property table *
3  ***************************************************/  ***************************************************/
4    
5  /* Copyright (c) University of Cambridge 2006 */  /* Copyright (c) University of Cambridge 2008 */
6    
7  /* Compile thus:  /* Compile thus:
8     gcc -o ucptest maintain/ucptest.c pcre_ucp_searchfuncs.c     gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
9  */  */
10    
11    /* The program expects to read commands on stdin, and it writes output
12    to stdout. There is only one command, "findprop", followed by a list of Unicode
13    code points as hex numbers (without any prefixes). The output is one line per
14    character, giving its Unicode properties followed by its other case if there is
15    one. */
16    
17    #ifdef HAVE_CONFIG_H
18    #include "../config.h"
19    #endif
20    
21    #ifndef SUPPORT_UCP
22    #define SUPPORT_UCP
23    #endif
24    
25  #include <ctype.h>  #include <ctype.h>
26  #include <stdio.h>  #include <stdio.h>
27  #include <stdlib.h>  #include <stdlib.h>
28  #include <string.h>  #include <string.h>
29  #include "pcre_internal.h"  #include "../pcre_internal.h"
30  #include "ucp.h"  #include "../ucp.h"
 #include "ucpinternal.h"  
31    
32    
33  /* -------------------------------------------------------------------*/  /* -------------------------------------------------------------------*/
# Line 38  Line 51 
51  static void  static void
52  print_prop(int c)  print_prop(int c)
53  {  {
54  int fulltype, script, othercase;  int type = UCD_CATEGORY(c);
55  int type = _pcre_ucp_findprop(c, &fulltype, &script);  int fulltype = UCD_CHARTYPE(c);
56    int script = UCD_SCRIPT(c);
57    int gbprop = UCD_GRAPHBREAK(c);
58    int othercase = UCD_OTHERCASE(c);
59    int caseset = UCD_CASESET(c);
60    
61    unsigned char *fulltypename = US"??";
62    unsigned char *typename = US"??";
63    unsigned char *scriptname = US"??";
64    unsigned char *graphbreak = US"??";
65    
66  printf("%04x ", c);  switch (type)
 if (type < 0) printf("not found\n"); else  
67    {    {
68    uschar *fulltypename = US"??";    case ucp_C: typename = US"Control"; break;
69    uschar *typename = US"??";    case ucp_L: typename = US"Letter"; break;
70    uschar *scriptname = US"??";    case ucp_M: typename = US"Mark"; break;
71    switch (type)    case ucp_N: typename = US"Number"; break;
72      {    case ucp_P: typename = US"Punctuation"; break;
73      case ucp_C: typename = US"Control"; break;    case ucp_S: typename = US"Symbol"; break;
74      case ucp_L: typename = US"Letter"; break;    case ucp_Z: typename = US"Separator"; break;
75      case ucp_M: typename = US"Mark"; break;    }
76      case ucp_N: typename = US"Number"; break;  
77      case ucp_P: typename = US"Punctuation"; break;  switch (fulltype)
78      case ucp_S: typename = US"Symbol"; break;    {
79      case ucp_Z: typename = US"Separator"; break;    case ucp_Cc: fulltypename = US"Control"; break;
80      }    case ucp_Cf: fulltypename = US"Format"; break;
81    switch (fulltype)    case ucp_Cn: fulltypename = US"Unassigned"; break;
82      {    case ucp_Co: fulltypename = US"Private use"; break;
83      case ucp_Cc: fulltypename = US"Control"; break;    case ucp_Cs: fulltypename = US"Surrogate"; break;
84      case ucp_Cf: fulltypename = US"Format"; break;    case ucp_Ll: fulltypename = US"Lower case letter"; break;
85      case ucp_Cn: fulltypename = US"Unassigned"; break;    case ucp_Lm: fulltypename = US"Modifier letter"; break;
86      case ucp_Co: fulltypename = US"Private use"; break;    case ucp_Lo: fulltypename = US"Other letter"; break;
87      case ucp_Cs: fulltypename = US"Surrogate"; break;    case ucp_Lt: fulltypename = US"Title case letter"; break;
88      case ucp_Ll: fulltypename = US"Lower case letter"; break;    case ucp_Lu: fulltypename = US"Upper case letter"; break;
89      case ucp_Lm: fulltypename = US"Modifier letter"; break;    case ucp_Mc: fulltypename = US"Spacing mark"; break;
90      case ucp_Lo: fulltypename = US"Other letter"; break;    case ucp_Me: fulltypename = US"Enclosing mark"; break;
91      case ucp_Lt: fulltypename = US"Title case letter"; break;    case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
92      case ucp_Lu: fulltypename = US"Upper case letter"; break;    case ucp_Nd: fulltypename = US"Decimal number"; break;
93      case ucp_Mc: fulltypename = US"Spacing mark"; break;    case ucp_Nl: fulltypename = US"Letter number"; break;
94      case ucp_Me: fulltypename = US"Enclosing mark"; break;    case ucp_No: fulltypename = US"Other number"; break;
95      case ucp_Mn: fulltypename = US"Non-spacing mark"; break;    case ucp_Pc: fulltypename = US"Connector punctuation"; break;
96      case ucp_Nd: fulltypename = US"Decimal number"; break;    case ucp_Pd: fulltypename = US"Dash punctuation"; break;
97      case ucp_Nl: fulltypename = US"Letter number"; break;    case ucp_Pe: fulltypename = US"Close punctuation"; break;
98      case ucp_No: fulltypename = US"Other number"; break;    case ucp_Pf: fulltypename = US"Final punctuation"; break;
99      case ucp_Pc: fulltypename = US"Connector punctuation"; break;    case ucp_Pi: fulltypename = US"Initial punctuation"; break;
100      case ucp_Pd: fulltypename = US"Dash punctuation"; break;    case ucp_Po: fulltypename = US"Other punctuation"; break;
101      case ucp_Pe: fulltypename = US"Close punctuation"; break;    case ucp_Ps: fulltypename = US"Open punctuation"; break;
102      case ucp_Pf: fulltypename = US"Final punctuation"; break;    case ucp_Sc: fulltypename = US"Currency symbol"; break;
103      case ucp_Pi: fulltypename = US"Initial punctuation"; break;    case ucp_Sk: fulltypename = US"Modifier symbol"; break;
104      case ucp_Po: fulltypename = US"Other punctuation"; break;    case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
105      case ucp_Ps: fulltypename = US"Open punctuation"; break;    case ucp_So: fulltypename = US"Other symbol"; break;
106      case ucp_Sc: fulltypename = US"Currency symbol"; break;    case ucp_Zl: fulltypename = US"Line separator"; break;
107      case ucp_Sk: fulltypename = US"Modifier symbol"; break;    case ucp_Zp: fulltypename = US"Paragraph separator"; break;
108      case ucp_Sm: fulltypename = US"Mathematical symbol"; break;    case ucp_Zs: fulltypename = US"Space separator"; break;
109      case ucp_So: fulltypename = US"Other symbol"; break;    }
110      case ucp_Zl: fulltypename = US"Line separator"; break;  
111      case ucp_Zp: fulltypename = US"Paragraph separator"; break;  switch(gbprop)
112      case ucp_Zs: fulltypename = US"Space separator"; break;    {
113      }    case ucp_gbCR:           graphbreak = US"CR"; break;
114    switch(script)    case ucp_gbLF:           graphbreak = US"LF"; break;
115      {    case ucp_gbControl:      graphbreak = US"Control"; break;
116      case ucp_Arabic:      scriptname = US"Arabic"; break;    case ucp_gbExtend:       graphbreak = US"Extend"; break;
117      case ucp_Armenian:    scriptname = US"Armenian"; break;    case ucp_gbPrepend:      graphbreak = US"Prepend"; break;
118      case ucp_Balinese:    scriptname = US"Balinese"; break;    case ucp_gbSpacingMark:  graphbreak = US"SpacingMark"; break;
119      case ucp_Bengali:     scriptname = US"Bengali"; break;    case ucp_gbL:            graphbreak = US"Hangul syllable type L"; break;
120      case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;    case ucp_gbV:            graphbreak = US"Hangul syllable type V"; break;
121      case ucp_Braille:     scriptname = US"Braille"; break;    case ucp_gbT:            graphbreak = US"Hangul syllable type T"; break;
122      case ucp_Buginese:    scriptname = US"Buginese"; break;    case ucp_gbLV:           graphbreak = US"Hangul syllable type LV"; break;
123      case ucp_Buhid:       scriptname = US"Buhid"; break;    case ucp_gbLVT:          graphbreak = US"Hangul syllable type LVT"; break;
124      case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;    case ucp_gbOther:        graphbreak = US"Other"; break;
125      case ucp_Cherokee:    scriptname = US"Cherokee"; break;    }
126      case ucp_Common:      scriptname = US"Common"; break;  
127      case ucp_Coptic:      scriptname = US"Coptic"; break;  switch(script)
128      case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;    {
129      case ucp_Cypriot:     scriptname = US"Cypriot"; break;    case ucp_Arabic:      scriptname = US"Arabic"; break;
130      case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;    case ucp_Armenian:    scriptname = US"Armenian"; break;
131      case ucp_Deseret:     scriptname = US"Deseret"; break;    case ucp_Balinese:    scriptname = US"Balinese"; break;
132      case ucp_Devanagari:  scriptname = US"Devanagari"; break;    case ucp_Bengali:     scriptname = US"Bengali"; break;
133      case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;    case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;
134      case ucp_Georgian:    scriptname = US"Georgian"; break;    case ucp_Braille:     scriptname = US"Braille"; break;
135      case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;    case ucp_Buginese:    scriptname = US"Buginese"; break;
136      case ucp_Gothic:      scriptname = US"Gothic"; break;    case ucp_Buhid:       scriptname = US"Buhid"; break;
137      case ucp_Greek:       scriptname = US"Greek"; break;    case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
138      case ucp_Gujarati:    scriptname = US"Gujarati"; break;    case ucp_Cherokee:    scriptname = US"Cherokee"; break;
139      case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;    case ucp_Common:      scriptname = US"Common"; break;
140      case ucp_Han:         scriptname = US"Han"; break;    case ucp_Coptic:      scriptname = US"Coptic"; break;
141      case ucp_Hangul:      scriptname = US"Hangul"; break;    case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;
142      case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;    case ucp_Cypriot:     scriptname = US"Cypriot"; break;
143      case ucp_Hebrew:      scriptname = US"Hebrew"; break;    case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;
144      case ucp_Hiragana:    scriptname = US"Hiragana"; break;    case ucp_Deseret:     scriptname = US"Deseret"; break;
145      case ucp_Inherited:   scriptname = US"Inherited"; break;    case ucp_Devanagari:  scriptname = US"Devanagari"; break;
146      case ucp_Kannada:     scriptname = US"Kannada"; break;    case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;
147      case ucp_Katakana:    scriptname = US"Katakana"; break;    case ucp_Georgian:    scriptname = US"Georgian"; break;
148      case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;    case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;
149      case ucp_Khmer:       scriptname = US"Khmer"; break;    case ucp_Gothic:      scriptname = US"Gothic"; break;
150      case ucp_Lao:         scriptname = US"Lao"; break;    case ucp_Greek:       scriptname = US"Greek"; break;
151      case ucp_Latin:       scriptname = US"Latin"; break;    case ucp_Gujarati:    scriptname = US"Gujarati"; break;
152      case ucp_Limbu:       scriptname = US"Limbu"; break;    case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;
153      case ucp_Linear_B:    scriptname = US"Linear_B"; break;    case ucp_Han:         scriptname = US"Han"; break;
154      case ucp_Malayalam:   scriptname = US"Malayalam"; break;    case ucp_Hangul:      scriptname = US"Hangul"; break;
155      case ucp_Mongolian:   scriptname = US"Mongolian"; break;    case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;
156      case ucp_Myanmar:     scriptname = US"Myanmar"; break;    case ucp_Hebrew:      scriptname = US"Hebrew"; break;
157      case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;    case ucp_Hiragana:    scriptname = US"Hiragana"; break;
158      case ucp_Nko:         scriptname = US"Nko"; break;    case ucp_Inherited:   scriptname = US"Inherited"; break;
159      case ucp_Ogham:       scriptname = US"Ogham"; break;    case ucp_Kannada:     scriptname = US"Kannada"; break;
160      case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;    case ucp_Katakana:    scriptname = US"Katakana"; break;
161      case ucp_Old_Persian: scriptname = US"Old_Persian"; break;    case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;
162      case ucp_Oriya:       scriptname = US"Oriya"; break;    case ucp_Khmer:       scriptname = US"Khmer"; break;
163      case ucp_Osmanya:     scriptname = US"Osmanya"; break;    case ucp_Lao:         scriptname = US"Lao"; break;
164      case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;    case ucp_Latin:       scriptname = US"Latin"; break;
165      case ucp_Phoenician:  scriptname = US"Phoenician"; break;    case ucp_Limbu:       scriptname = US"Limbu"; break;
166      case ucp_Runic:       scriptname = US"Runic"; break;    case ucp_Linear_B:    scriptname = US"Linear_B"; break;
167      case ucp_Shavian:     scriptname = US"Shavian"; break;    case ucp_Malayalam:   scriptname = US"Malayalam"; break;
168      case ucp_Sinhala:     scriptname = US"Sinhala"; break;    case ucp_Mongolian:   scriptname = US"Mongolian"; break;
169      case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;    case ucp_Myanmar:     scriptname = US"Myanmar"; break;
170      case ucp_Syriac:      scriptname = US"Syriac"; break;    case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
171      case ucp_Tagalog:     scriptname = US"Tagalog"; break;    case ucp_Nko:         scriptname = US"Nko"; break;
172      case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;    case ucp_Ogham:       scriptname = US"Ogham"; break;
173      case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;    case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;
174      case ucp_Tamil:       scriptname = US"Tamil"; break;    case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
175      case ucp_Telugu:      scriptname = US"Telugu"; break;    case ucp_Oriya:       scriptname = US"Oriya"; break;
176      case ucp_Thaana:      scriptname = US"Thaana"; break;    case ucp_Osmanya:     scriptname = US"Osmanya"; break;
177      case ucp_Thai:        scriptname = US"Thai"; break;    case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;
178      case ucp_Tibetan:     scriptname = US"Tibetan"; break;    case ucp_Phoenician:  scriptname = US"Phoenician"; break;
179      case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;    case ucp_Runic:       scriptname = US"Runic"; break;
180      case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;    case ucp_Shavian:     scriptname = US"Shavian"; break;
181      case ucp_Yi:          scriptname = US"Yi"; break;    case ucp_Sinhala:     scriptname = US"Sinhala"; break;
182      }    case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
183      case ucp_Syriac:      scriptname = US"Syriac"; break;
184    printf("%s: %s %s", typename, fulltypename, scriptname);    case ucp_Tagalog:     scriptname = US"Tagalog"; break;
185    othercase = _pcre_ucp_othercase(c);    case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;
186    if (othercase >= 0) printf(" %04x", othercase);    case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;
187    printf("\n");    case ucp_Tamil:       scriptname = US"Tamil"; break;
188      case ucp_Telugu:      scriptname = US"Telugu"; break;
189      case ucp_Thaana:      scriptname = US"Thaana"; break;
190      case ucp_Thai:        scriptname = US"Thai"; break;
191      case ucp_Tibetan:     scriptname = US"Tibetan"; break;
192      case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;
193      case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;
194      case ucp_Yi:          scriptname = US"Yi"; break;
195      /* New for Unicode 5.1: */
196      case ucp_Carian:      scriptname = US"Carian"; break;
197      case ucp_Cham:        scriptname = US"Cham"; break;
198      case ucp_Kayah_Li:    scriptname = US"Kayah_Li"; break;
199      case ucp_Lepcha:      scriptname = US"Lepcha"; break;
200      case ucp_Lycian:      scriptname = US"Lycian"; break;
201      case ucp_Lydian:      scriptname = US"Lydian"; break;
202      case ucp_Ol_Chiki:    scriptname = US"Ol_Chiki"; break;
203      case ucp_Rejang:      scriptname = US"Rejang"; break;
204      case ucp_Saurashtra:  scriptname = US"Saurashtra"; break;
205      case ucp_Sundanese:   scriptname = US"Sundanese"; break;
206      case ucp_Vai:         scriptname = US"Vai"; break;
207      /* New for Unicode 5.2: */
208      case ucp_Avestan:     scriptname = US"Avestan"; break;
209      case ucp_Bamum:       scriptname = US"Bamum"; break;
210      case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
211      case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
212      case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
213      case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
214      case ucp_Javanese:    scriptname = US"Javanese"; break;
215      case ucp_Kaithi:      scriptname = US"Kaithi"; break;
216      case ucp_Lisu:        scriptname = US"Lisu"; break;
217      case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
218      case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
219      case ucp_Old_Turkic:  scriptname = US"Old_Turkic"; break;
220      case ucp_Samaritan:   scriptname = US"Samaritan"; break;
221      case ucp_Tai_Tham:    scriptname = US"Tai_Tham"; break;
222      case ucp_Tai_Viet:    scriptname = US"Tai_Viet"; break;
223      /* New for Unicode 6.0.0 */
224      case ucp_Batak:       scriptname = US"Batak"; break;
225      case ucp_Brahmi:      scriptname = US"Brahmi"; break;
226      case ucp_Mandaic:     scriptname = US"Mandaic"; break;
227    
228      /* New for Unicode 6.1.0 */
229      case ucp_Chakma:               scriptname = US"Chakma"; break;
230      case ucp_Meroitic_Cursive:     scriptname = US"Meroitic_Cursive"; break;
231      case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
232      case ucp_Miao:                 scriptname = US"Miao"; break;
233      case ucp_Sharada:              scriptname = US"Sharada"; break;
234      case ucp_Sora_Sompeng:         scriptname = US"Sora_Sompeng"; break;
235      case ucp_Takri:                scriptname = US"Takri"; break;
236    
237    }    }
238    
239    printf("%04x %s: %s, %s, %s", c, typename, fulltypename, scriptname, graphbreak);
240    if (othercase != c)
241      {
242      printf(", %04x", othercase);
243      if (caseset != 0)
244        {
245        const pcre_uint32 *p = PRIV(ucd_caseless_sets) + caseset - 1;
246        while (*(++p) < NOTACHAR)
247          if (*p != othercase && *p != c) printf(", %04x", *p);
248        }
249      }
250    printf("\n");
251  }  }
252    
253    
# Line 176  if (type < 0) printf("not found\n"); els Line 259  if (type < 0) printf("not found\n"); els
259  int  int
260  main(void)  main(void)
261  {  {
262  uschar buffer[1024];  unsigned char buffer[1024];
263  while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)  while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
264    {    {
265    uschar name[24];    unsigned char name[24];
266    uschar *s, *t;    unsigned char *s, *t;
267    
268    printf("%s", buffer);    printf("%s", buffer);
269    s = buffer;    s = buffer;
# Line 195  while (fgets(CS buffer, sizeof(buffer), Line 278  while (fgets(CS buffer, sizeof(buffer),
278      {      {
279      while (*s != 0)      while (*s != 0)
280        {        {
281        uschar *endptr;        unsigned char *endptr;
282        int c = strtoul(CS s, CSS(&endptr), 16);        int c = strtoul(CS s, CSS(&endptr), 16);
283        print_prop(c);        print_prop(c);
284        s = endptr;        s = endptr;

Legend:
Removed from v.97  
changed lines
  Added in v.1045

  ViewVC Help
Powered by ViewVC 1.1.5