/[pcre]/code/trunk/maint/ucptest.c
ViewVC logotype

Diff of /code/trunk/maint/ucptest.c

Parent Directory | Revision Log | View Patch

revision 114 by ph10, Fri Mar 9 10:15:12 2007 UTC | revision 351 by ph10, Fri Jul 4 18:27:16 2008 UTC
# Line 2  Line 2 
2  * A program for testing the Unicode property table *  * A program for testing the Unicode property table *
3  ***************************************************/  ***************************************************/
4    
5  /* Copyright (c) University of Cambridge 2006 */  /* Copyright (c) University of Cambridge 2008 */
6    
7  /* Compile thus:  /* Compile thus:
8     gcc -o ucptest ucptest.c ../pcre_ucp_searchfuncs.c     gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
9  */  */
10    
11    /* The program expects to read commands on stdin, and it writes output
12    to stdout. There is only one command, "findprop", followed by a list of Unicode
13    code points as hex numbers (without any prefixes). The output is one line per
14    character, giving its Unicode properties followed by its other case if there is
15    one. */
16    
17    #ifdef HAVE_CONFIG_H
18    #include "../config.h"
19    #endif
20    
21  #include <ctype.h>  #include <ctype.h>
22  #include <stdio.h>  #include <stdio.h>
23  #include <stdlib.h>  #include <stdlib.h>
24  #include <string.h>  #include <string.h>
25  #include "../pcre_internal.h"  #include "../pcre_internal.h"
26  #include "../ucp.h"  #include "../ucp.h"
 #include "../ucpinternal.h"  
27    
28    
29  /* -------------------------------------------------------------------*/  /* -------------------------------------------------------------------*/
# Line 38  Line 47 
47  static void  static void
48  print_prop(int c)  print_prop(int c)
49  {  {
50  int fulltype, script, othercase;  int type = UCD_CATEGORY(c);
51  int type = _pcre_ucp_findprop(c, &fulltype, &script);  int fulltype = UCD_CHARTYPE(c);
52    int script = UCD_SCRIPT(c);
53    int othercase = UCD_OTHERCASE(c);
54    
55    uschar *fulltypename = US"??";
56    uschar *typename = US"??";
57    uschar *scriptname = US"??";
58    
59    switch (type)
60      {
61      case ucp_C: typename = US"Control"; break;
62      case ucp_L: typename = US"Letter"; break;
63      case ucp_M: typename = US"Mark"; break;
64      case ucp_N: typename = US"Number"; break;
65      case ucp_P: typename = US"Punctuation"; break;
66      case ucp_S: typename = US"Symbol"; break;
67      case ucp_Z: typename = US"Separator"; break;
68      }
69    
70  printf("%04x ", c);  switch (fulltype)
 if (type < 0) printf("not found\n"); else  
71    {    {
72    uschar *fulltypename = US"??";    case ucp_Cc: fulltypename = US"Control"; break;
73    uschar *typename = US"??";    case ucp_Cf: fulltypename = US"Format"; break;
74    uschar *scriptname = US"??";    case ucp_Cn: fulltypename = US"Unassigned"; break;
75    switch (type)    case ucp_Co: fulltypename = US"Private use"; break;
76      {    case ucp_Cs: fulltypename = US"Surrogate"; break;
77      case ucp_C: typename = US"Control"; break;    case ucp_Ll: fulltypename = US"Lower case letter"; break;
78      case ucp_L: typename = US"Letter"; break;    case ucp_Lm: fulltypename = US"Modifier letter"; break;
79      case ucp_M: typename = US"Mark"; break;    case ucp_Lo: fulltypename = US"Other letter"; break;
80      case ucp_N: typename = US"Number"; break;    case ucp_Lt: fulltypename = US"Title case letter"; break;
81      case ucp_P: typename = US"Punctuation"; break;    case ucp_Lu: fulltypename = US"Upper case letter"; break;
82      case ucp_S: typename = US"Symbol"; break;    case ucp_Mc: fulltypename = US"Spacing mark"; break;
83      case ucp_Z: typename = US"Separator"; break;    case ucp_Me: fulltypename = US"Enclosing mark"; break;
84      }    case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
85    switch (fulltype)    case ucp_Nd: fulltypename = US"Decimal number"; break;
86      {    case ucp_Nl: fulltypename = US"Letter number"; break;
87      case ucp_Cc: fulltypename = US"Control"; break;    case ucp_No: fulltypename = US"Other number"; break;
88      case ucp_Cf: fulltypename = US"Format"; break;    case ucp_Pc: fulltypename = US"Connector punctuation"; break;
89      case ucp_Cn: fulltypename = US"Unassigned"; break;    case ucp_Pd: fulltypename = US"Dash punctuation"; break;
90      case ucp_Co: fulltypename = US"Private use"; break;    case ucp_Pe: fulltypename = US"Close punctuation"; break;
91      case ucp_Cs: fulltypename = US"Surrogate"; break;    case ucp_Pf: fulltypename = US"Final punctuation"; break;
92      case ucp_Ll: fulltypename = US"Lower case letter"; break;    case ucp_Pi: fulltypename = US"Initial punctuation"; break;
93      case ucp_Lm: fulltypename = US"Modifier letter"; break;    case ucp_Po: fulltypename = US"Other punctuation"; break;
94      case ucp_Lo: fulltypename = US"Other letter"; break;    case ucp_Ps: fulltypename = US"Open punctuation"; break;
95      case ucp_Lt: fulltypename = US"Title case letter"; break;    case ucp_Sc: fulltypename = US"Currency symbol"; break;
96      case ucp_Lu: fulltypename = US"Upper case letter"; break;    case ucp_Sk: fulltypename = US"Modifier symbol"; break;
97      case ucp_Mc: fulltypename = US"Spacing mark"; break;    case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
98      case ucp_Me: fulltypename = US"Enclosing mark"; break;    case ucp_So: fulltypename = US"Other symbol"; break;
99      case ucp_Mn: fulltypename = US"Non-spacing mark"; break;    case ucp_Zl: fulltypename = US"Line separator"; break;
100      case ucp_Nd: fulltypename = US"Decimal number"; break;    case ucp_Zp: fulltypename = US"Paragraph separator"; break;
101      case ucp_Nl: fulltypename = US"Letter number"; break;    case ucp_Zs: fulltypename = US"Space separator"; break;
     case ucp_No: fulltypename = US"Other number"; break;  
     case ucp_Pc: fulltypename = US"Connector punctuation"; break;  
     case ucp_Pd: fulltypename = US"Dash punctuation"; break;  
     case ucp_Pe: fulltypename = US"Close punctuation"; break;  
     case ucp_Pf: fulltypename = US"Final punctuation"; break;  
     case ucp_Pi: fulltypename = US"Initial punctuation"; break;  
     case ucp_Po: fulltypename = US"Other punctuation"; break;  
     case ucp_Ps: fulltypename = US"Open punctuation"; break;  
     case ucp_Sc: fulltypename = US"Currency symbol"; break;  
     case ucp_Sk: fulltypename = US"Modifier symbol"; break;  
     case ucp_Sm: fulltypename = US"Mathematical symbol"; break;  
     case ucp_So: fulltypename = US"Other symbol"; break;  
     case ucp_Zl: fulltypename = US"Line separator"; break;  
     case ucp_Zp: fulltypename = US"Paragraph separator"; break;  
     case ucp_Zs: fulltypename = US"Space separator"; break;  
     }  
   switch(script)  
     {  
     case ucp_Arabic:      scriptname = US"Arabic"; break;  
     case ucp_Armenian:    scriptname = US"Armenian"; break;  
     case ucp_Balinese:    scriptname = US"Balinese"; break;  
     case ucp_Bengali:     scriptname = US"Bengali"; break;  
     case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;  
     case ucp_Braille:     scriptname = US"Braille"; break;  
     case ucp_Buginese:    scriptname = US"Buginese"; break;  
     case ucp_Buhid:       scriptname = US"Buhid"; break;  
     case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;  
     case ucp_Cherokee:    scriptname = US"Cherokee"; break;  
     case ucp_Common:      scriptname = US"Common"; break;  
     case ucp_Coptic:      scriptname = US"Coptic"; break;  
     case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;  
     case ucp_Cypriot:     scriptname = US"Cypriot"; break;  
     case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;  
     case ucp_Deseret:     scriptname = US"Deseret"; break;  
     case ucp_Devanagari:  scriptname = US"Devanagari"; break;  
     case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;  
     case ucp_Georgian:    scriptname = US"Georgian"; break;  
     case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;  
     case ucp_Gothic:      scriptname = US"Gothic"; break;  
     case ucp_Greek:       scriptname = US"Greek"; break;  
     case ucp_Gujarati:    scriptname = US"Gujarati"; break;  
     case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;  
     case ucp_Han:         scriptname = US"Han"; break;  
     case ucp_Hangul:      scriptname = US"Hangul"; break;  
     case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;  
     case ucp_Hebrew:      scriptname = US"Hebrew"; break;  
     case ucp_Hiragana:    scriptname = US"Hiragana"; break;  
     case ucp_Inherited:   scriptname = US"Inherited"; break;  
     case ucp_Kannada:     scriptname = US"Kannada"; break;  
     case ucp_Katakana:    scriptname = US"Katakana"; break;  
     case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;  
     case ucp_Khmer:       scriptname = US"Khmer"; break;  
     case ucp_Lao:         scriptname = US"Lao"; break;  
     case ucp_Latin:       scriptname = US"Latin"; break;  
     case ucp_Limbu:       scriptname = US"Limbu"; break;  
     case ucp_Linear_B:    scriptname = US"Linear_B"; break;  
     case ucp_Malayalam:   scriptname = US"Malayalam"; break;  
     case ucp_Mongolian:   scriptname = US"Mongolian"; break;  
     case ucp_Myanmar:     scriptname = US"Myanmar"; break;  
     case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;  
     case ucp_Nko:         scriptname = US"Nko"; break;  
     case ucp_Ogham:       scriptname = US"Ogham"; break;  
     case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;  
     case ucp_Old_Persian: scriptname = US"Old_Persian"; break;  
     case ucp_Oriya:       scriptname = US"Oriya"; break;  
     case ucp_Osmanya:     scriptname = US"Osmanya"; break;  
     case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;  
     case ucp_Phoenician:  scriptname = US"Phoenician"; break;  
     case ucp_Runic:       scriptname = US"Runic"; break;  
     case ucp_Shavian:     scriptname = US"Shavian"; break;  
     case ucp_Sinhala:     scriptname = US"Sinhala"; break;  
     case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;  
     case ucp_Syriac:      scriptname = US"Syriac"; break;  
     case ucp_Tagalog:     scriptname = US"Tagalog"; break;  
     case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;  
     case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;  
     case ucp_Tamil:       scriptname = US"Tamil"; break;  
     case ucp_Telugu:      scriptname = US"Telugu"; break;  
     case ucp_Thaana:      scriptname = US"Thaana"; break;  
     case ucp_Thai:        scriptname = US"Thai"; break;  
     case ucp_Tibetan:     scriptname = US"Tibetan"; break;  
     case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;  
     case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;  
     case ucp_Yi:          scriptname = US"Yi"; break;  
     }  
   
   printf("%s: %s %s", typename, fulltypename, scriptname);  
   othercase = _pcre_ucp_othercase(c);  
   if (othercase >= 0) printf(" %04x", othercase);  
   printf("\n");  
102    }    }
103    
104    switch(script)
105      {
106      case ucp_Arabic:      scriptname = US"Arabic"; break;
107      case ucp_Armenian:    scriptname = US"Armenian"; break;
108      case ucp_Balinese:    scriptname = US"Balinese"; break;
109      case ucp_Bengali:     scriptname = US"Bengali"; break;
110      case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;
111      case ucp_Braille:     scriptname = US"Braille"; break;
112      case ucp_Buginese:    scriptname = US"Buginese"; break;
113      case ucp_Buhid:       scriptname = US"Buhid"; break;
114      case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
115      case ucp_Cherokee:    scriptname = US"Cherokee"; break;
116      case ucp_Common:      scriptname = US"Common"; break;
117      case ucp_Coptic:      scriptname = US"Coptic"; break;
118      case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;
119      case ucp_Cypriot:     scriptname = US"Cypriot"; break;
120      case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;
121      case ucp_Deseret:     scriptname = US"Deseret"; break;
122      case ucp_Devanagari:  scriptname = US"Devanagari"; break;
123      case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;
124      case ucp_Georgian:    scriptname = US"Georgian"; break;
125      case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;
126      case ucp_Gothic:      scriptname = US"Gothic"; break;
127      case ucp_Greek:       scriptname = US"Greek"; break;
128      case ucp_Gujarati:    scriptname = US"Gujarati"; break;
129      case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;
130      case ucp_Han:         scriptname = US"Han"; break;
131      case ucp_Hangul:      scriptname = US"Hangul"; break;
132      case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;
133      case ucp_Hebrew:      scriptname = US"Hebrew"; break;
134      case ucp_Hiragana:    scriptname = US"Hiragana"; break;
135      case ucp_Inherited:   scriptname = US"Inherited"; break;
136      case ucp_Kannada:     scriptname = US"Kannada"; break;
137      case ucp_Katakana:    scriptname = US"Katakana"; break;
138      case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;
139      case ucp_Khmer:       scriptname = US"Khmer"; break;
140      case ucp_Lao:         scriptname = US"Lao"; break;
141      case ucp_Latin:       scriptname = US"Latin"; break;
142      case ucp_Limbu:       scriptname = US"Limbu"; break;
143      case ucp_Linear_B:    scriptname = US"Linear_B"; break;
144      case ucp_Malayalam:   scriptname = US"Malayalam"; break;
145      case ucp_Mongolian:   scriptname = US"Mongolian"; break;
146      case ucp_Myanmar:     scriptname = US"Myanmar"; break;
147      case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
148      case ucp_Nko:         scriptname = US"Nko"; break;
149      case ucp_Ogham:       scriptname = US"Ogham"; break;
150      case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;
151      case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
152      case ucp_Oriya:       scriptname = US"Oriya"; break;
153      case ucp_Osmanya:     scriptname = US"Osmanya"; break;
154      case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;
155      case ucp_Phoenician:  scriptname = US"Phoenician"; break;
156      case ucp_Runic:       scriptname = US"Runic"; break;
157      case ucp_Shavian:     scriptname = US"Shavian"; break;
158      case ucp_Sinhala:     scriptname = US"Sinhala"; break;
159      case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
160      case ucp_Syriac:      scriptname = US"Syriac"; break;
161      case ucp_Tagalog:     scriptname = US"Tagalog"; break;
162      case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;
163      case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;
164      case ucp_Tamil:       scriptname = US"Tamil"; break;
165      case ucp_Telugu:      scriptname = US"Telugu"; break;
166      case ucp_Thaana:      scriptname = US"Thaana"; break;
167      case ucp_Thai:        scriptname = US"Thai"; break;
168      case ucp_Tibetan:     scriptname = US"Tibetan"; break;
169      case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;
170      case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;
171      case ucp_Yi:          scriptname = US"Yi"; break;
172      }
173    
174    printf("%04x %s: %s %s", c, typename, fulltypename, scriptname);
175    if (othercase != c) printf(" %04x", othercase);
176    printf("\n");
177  }  }
178    
179    

Legend:
Removed from v.114  
changed lines
  Added in v.351

  ViewVC Help
Powered by ViewVC 1.1.5