/[pcre]/code/trunk/maint/ucptest.c
ViewVC logotype

Diff of /code/trunk/maint/ucptest.c

Parent Directory | Revision Log | View Patch

code/trunk/maintain/ucptest.c revision 97 by ph10, Mon Mar 5 12:36:47 2007 UTC code/trunk/maint/ucptest.c revision 943 by ph10, Tue Feb 28 15:02:51 2012 UTC
# Line 2  Line 2 
2  * A program for testing the Unicode property table *  * A program for testing the Unicode property table *
3  ***************************************************/  ***************************************************/
4    
5  /* Copyright (c) University of Cambridge 2006 */  /* Copyright (c) University of Cambridge 2008 */
6    
7  /* Compile thus:  /* Compile thus:
8     gcc -o ucptest maintain/ucptest.c pcre_ucp_searchfuncs.c     gcc -DHAVE_CONFIG_H -o ucptest ucptest.c ../pcre_ucd.c ../pcre_tables.c
9  */  */
10    
11    /* The program expects to read commands on stdin, and it writes output
12    to stdout. There is only one command, "findprop", followed by a list of Unicode
13    code points as hex numbers (without any prefixes). The output is one line per
14    character, giving its Unicode properties followed by its other case if there is
15    one. */
16    
17    #ifdef HAVE_CONFIG_H
18    #include "../config.h"
19    #endif
20    
21    #ifndef SUPPORT_UCP
22    #define SUPPORT_UCP
23    #endif
24    
25  #include <ctype.h>  #include <ctype.h>
26  #include <stdio.h>  #include <stdio.h>
27  #include <stdlib.h>  #include <stdlib.h>
28  #include <string.h>  #include <string.h>
29  #include "pcre_internal.h"  #include "../pcre_internal.h"
30  #include "ucp.h"  #include "../ucp.h"
 #include "ucpinternal.h"  
31    
32    
33  /* -------------------------------------------------------------------*/  /* -------------------------------------------------------------------*/
# Line 38  Line 51 
51  static void  static void
52  print_prop(int c)  print_prop(int c)
53  {  {
54  int fulltype, script, othercase;  int type = UCD_CATEGORY(c);
55  int type = _pcre_ucp_findprop(c, &fulltype, &script);  int fulltype = UCD_CHARTYPE(c);
56    int script = UCD_SCRIPT(c);
57    int othercase = UCD_OTHERCASE(c);
58    
59    unsigned char *fulltypename = US"??";
60    unsigned char *typename = US"??";
61    unsigned char *scriptname = US"??";
62    
63  printf("%04x ", c);  switch (type)
 if (type < 0) printf("not found\n"); else  
64    {    {
65    uschar *fulltypename = US"??";    case ucp_C: typename = US"Control"; break;
66    uschar *typename = US"??";    case ucp_L: typename = US"Letter"; break;
67    uschar *scriptname = US"??";    case ucp_M: typename = US"Mark"; break;
68    switch (type)    case ucp_N: typename = US"Number"; break;
69      {    case ucp_P: typename = US"Punctuation"; break;
70      case ucp_C: typename = US"Control"; break;    case ucp_S: typename = US"Symbol"; break;
71      case ucp_L: typename = US"Letter"; break;    case ucp_Z: typename = US"Separator"; break;
72      case ucp_M: typename = US"Mark"; break;    }
73      case ucp_N: typename = US"Number"; break;  
74      case ucp_P: typename = US"Punctuation"; break;  switch (fulltype)
75      case ucp_S: typename = US"Symbol"; break;    {
76      case ucp_Z: typename = US"Separator"; break;    case ucp_Cc: fulltypename = US"Control"; break;
77      }    case ucp_Cf: fulltypename = US"Format"; break;
78    switch (fulltype)    case ucp_Cn: fulltypename = US"Unassigned"; break;
79      {    case ucp_Co: fulltypename = US"Private use"; break;
80      case ucp_Cc: fulltypename = US"Control"; break;    case ucp_Cs: fulltypename = US"Surrogate"; break;
81      case ucp_Cf: fulltypename = US"Format"; break;    case ucp_Ll: fulltypename = US"Lower case letter"; break;
82      case ucp_Cn: fulltypename = US"Unassigned"; break;    case ucp_Lm: fulltypename = US"Modifier letter"; break;
83      case ucp_Co: fulltypename = US"Private use"; break;    case ucp_Lo: fulltypename = US"Other letter"; break;
84      case ucp_Cs: fulltypename = US"Surrogate"; break;    case ucp_Lt: fulltypename = US"Title case letter"; break;
85      case ucp_Ll: fulltypename = US"Lower case letter"; break;    case ucp_Lu: fulltypename = US"Upper case letter"; break;
86      case ucp_Lm: fulltypename = US"Modifier letter"; break;    case ucp_Mc: fulltypename = US"Spacing mark"; break;
87      case ucp_Lo: fulltypename = US"Other letter"; break;    case ucp_Me: fulltypename = US"Enclosing mark"; break;
88      case ucp_Lt: fulltypename = US"Title case letter"; break;    case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
89      case ucp_Lu: fulltypename = US"Upper case letter"; break;    case ucp_Nd: fulltypename = US"Decimal number"; break;
90      case ucp_Mc: fulltypename = US"Spacing mark"; break;    case ucp_Nl: fulltypename = US"Letter number"; break;
91      case ucp_Me: fulltypename = US"Enclosing mark"; break;    case ucp_No: fulltypename = US"Other number"; break;
92      case ucp_Mn: fulltypename = US"Non-spacing mark"; break;    case ucp_Pc: fulltypename = US"Connector punctuation"; break;
93      case ucp_Nd: fulltypename = US"Decimal number"; break;    case ucp_Pd: fulltypename = US"Dash punctuation"; break;
94      case ucp_Nl: fulltypename = US"Letter number"; break;    case ucp_Pe: fulltypename = US"Close punctuation"; break;
95      case ucp_No: fulltypename = US"Other number"; break;    case ucp_Pf: fulltypename = US"Final punctuation"; break;
96      case ucp_Pc: fulltypename = US"Connector punctuation"; break;    case ucp_Pi: fulltypename = US"Initial punctuation"; break;
97      case ucp_Pd: fulltypename = US"Dash punctuation"; break;    case ucp_Po: fulltypename = US"Other punctuation"; break;
98      case ucp_Pe: fulltypename = US"Close punctuation"; break;    case ucp_Ps: fulltypename = US"Open punctuation"; break;
99      case ucp_Pf: fulltypename = US"Final punctuation"; break;    case ucp_Sc: fulltypename = US"Currency symbol"; break;
100      case ucp_Pi: fulltypename = US"Initial punctuation"; break;    case ucp_Sk: fulltypename = US"Modifier symbol"; break;
101      case ucp_Po: fulltypename = US"Other punctuation"; break;    case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
102      case ucp_Ps: fulltypename = US"Open punctuation"; break;    case ucp_So: fulltypename = US"Other symbol"; break;
103      case ucp_Sc: fulltypename = US"Currency symbol"; break;    case ucp_Zl: fulltypename = US"Line separator"; break;
104      case ucp_Sk: fulltypename = US"Modifier symbol"; break;    case ucp_Zp: fulltypename = US"Paragraph separator"; break;
105      case ucp_Sm: fulltypename = US"Mathematical symbol"; break;    case ucp_Zs: fulltypename = US"Space separator"; break;
106      case ucp_So: fulltypename = US"Other symbol"; break;    }
107      case ucp_Zl: fulltypename = US"Line separator"; break;  
108      case ucp_Zp: fulltypename = US"Paragraph separator"; break;  switch(script)
109      case ucp_Zs: fulltypename = US"Space separator"; break;    {
110      }    case ucp_Arabic:      scriptname = US"Arabic"; break;
111    switch(script)    case ucp_Armenian:    scriptname = US"Armenian"; break;
112      {    case ucp_Balinese:    scriptname = US"Balinese"; break;
113      case ucp_Arabic:      scriptname = US"Arabic"; break;    case ucp_Bengali:     scriptname = US"Bengali"; break;
114      case ucp_Armenian:    scriptname = US"Armenian"; break;    case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;
115      case ucp_Balinese:    scriptname = US"Balinese"; break;    case ucp_Braille:     scriptname = US"Braille"; break;
116      case ucp_Bengali:     scriptname = US"Bengali"; break;    case ucp_Buginese:    scriptname = US"Buginese"; break;
117      case ucp_Bopomofo:    scriptname = US"Bopomofo"; break;    case ucp_Buhid:       scriptname = US"Buhid"; break;
118      case ucp_Braille:     scriptname = US"Braille"; break;    case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;
119      case ucp_Buginese:    scriptname = US"Buginese"; break;    case ucp_Cherokee:    scriptname = US"Cherokee"; break;
120      case ucp_Buhid:       scriptname = US"Buhid"; break;    case ucp_Common:      scriptname = US"Common"; break;
121      case ucp_Canadian_Aboriginal: scriptname = US"Canadian_Aboriginal"; break;    case ucp_Coptic:      scriptname = US"Coptic"; break;
122      case ucp_Cherokee:    scriptname = US"Cherokee"; break;    case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;
123      case ucp_Common:      scriptname = US"Common"; break;    case ucp_Cypriot:     scriptname = US"Cypriot"; break;
124      case ucp_Coptic:      scriptname = US"Coptic"; break;    case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;
125      case ucp_Cuneiform:   scriptname = US"Cuneiform"; break;    case ucp_Deseret:     scriptname = US"Deseret"; break;
126      case ucp_Cypriot:     scriptname = US"Cypriot"; break;    case ucp_Devanagari:  scriptname = US"Devanagari"; break;
127      case ucp_Cyrillic:    scriptname = US"Cyrillic"; break;    case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;
128      case ucp_Deseret:     scriptname = US"Deseret"; break;    case ucp_Georgian:    scriptname = US"Georgian"; break;
129      case ucp_Devanagari:  scriptname = US"Devanagari"; break;    case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;
130      case ucp_Ethiopic:    scriptname = US"Ethiopic"; break;    case ucp_Gothic:      scriptname = US"Gothic"; break;
131      case ucp_Georgian:    scriptname = US"Georgian"; break;    case ucp_Greek:       scriptname = US"Greek"; break;
132      case ucp_Glagolitic:  scriptname = US"Glagolitic"; break;    case ucp_Gujarati:    scriptname = US"Gujarati"; break;
133      case ucp_Gothic:      scriptname = US"Gothic"; break;    case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;
134      case ucp_Greek:       scriptname = US"Greek"; break;    case ucp_Han:         scriptname = US"Han"; break;
135      case ucp_Gujarati:    scriptname = US"Gujarati"; break;    case ucp_Hangul:      scriptname = US"Hangul"; break;
136      case ucp_Gurmukhi:    scriptname = US"Gurmukhi"; break;    case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;
137      case ucp_Han:         scriptname = US"Han"; break;    case ucp_Hebrew:      scriptname = US"Hebrew"; break;
138      case ucp_Hangul:      scriptname = US"Hangul"; break;    case ucp_Hiragana:    scriptname = US"Hiragana"; break;
139      case ucp_Hanunoo:     scriptname = US"Hanunoo"; break;    case ucp_Inherited:   scriptname = US"Inherited"; break;
140      case ucp_Hebrew:      scriptname = US"Hebrew"; break;    case ucp_Kannada:     scriptname = US"Kannada"; break;
141      case ucp_Hiragana:    scriptname = US"Hiragana"; break;    case ucp_Katakana:    scriptname = US"Katakana"; break;
142      case ucp_Inherited:   scriptname = US"Inherited"; break;    case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;
143      case ucp_Kannada:     scriptname = US"Kannada"; break;    case ucp_Khmer:       scriptname = US"Khmer"; break;
144      case ucp_Katakana:    scriptname = US"Katakana"; break;    case ucp_Lao:         scriptname = US"Lao"; break;
145      case ucp_Kharoshthi:  scriptname = US"Kharoshthi"; break;    case ucp_Latin:       scriptname = US"Latin"; break;
146      case ucp_Khmer:       scriptname = US"Khmer"; break;    case ucp_Limbu:       scriptname = US"Limbu"; break;
147      case ucp_Lao:         scriptname = US"Lao"; break;    case ucp_Linear_B:    scriptname = US"Linear_B"; break;
148      case ucp_Latin:       scriptname = US"Latin"; break;    case ucp_Malayalam:   scriptname = US"Malayalam"; break;
149      case ucp_Limbu:       scriptname = US"Limbu"; break;    case ucp_Mongolian:   scriptname = US"Mongolian"; break;
150      case ucp_Linear_B:    scriptname = US"Linear_B"; break;    case ucp_Myanmar:     scriptname = US"Myanmar"; break;
151      case ucp_Malayalam:   scriptname = US"Malayalam"; break;    case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;
152      case ucp_Mongolian:   scriptname = US"Mongolian"; break;    case ucp_Nko:         scriptname = US"Nko"; break;
153      case ucp_Myanmar:     scriptname = US"Myanmar"; break;    case ucp_Ogham:       scriptname = US"Ogham"; break;
154      case ucp_New_Tai_Lue: scriptname = US"New_Tai_Lue"; break;    case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;
155      case ucp_Nko:         scriptname = US"Nko"; break;    case ucp_Old_Persian: scriptname = US"Old_Persian"; break;
156      case ucp_Ogham:       scriptname = US"Ogham"; break;    case ucp_Oriya:       scriptname = US"Oriya"; break;
157      case ucp_Old_Italic:  scriptname = US"Old_Italic"; break;    case ucp_Osmanya:     scriptname = US"Osmanya"; break;
158      case ucp_Old_Persian: scriptname = US"Old_Persian"; break;    case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;
159      case ucp_Oriya:       scriptname = US"Oriya"; break;    case ucp_Phoenician:  scriptname = US"Phoenician"; break;
160      case ucp_Osmanya:     scriptname = US"Osmanya"; break;    case ucp_Runic:       scriptname = US"Runic"; break;
161      case ucp_Phags_Pa:    scriptname = US"Phags_Pa"; break;    case ucp_Shavian:     scriptname = US"Shavian"; break;
162      case ucp_Phoenician:  scriptname = US"Phoenician"; break;    case ucp_Sinhala:     scriptname = US"Sinhala"; break;
163      case ucp_Runic:       scriptname = US"Runic"; break;    case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;
164      case ucp_Shavian:     scriptname = US"Shavian"; break;    case ucp_Syriac:      scriptname = US"Syriac"; break;
165      case ucp_Sinhala:     scriptname = US"Sinhala"; break;    case ucp_Tagalog:     scriptname = US"Tagalog"; break;
166      case ucp_Syloti_Nagri: scriptname = US"Syloti_Nagri"; break;    case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;
167      case ucp_Syriac:      scriptname = US"Syriac"; break;    case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;
168      case ucp_Tagalog:     scriptname = US"Tagalog"; break;    case ucp_Tamil:       scriptname = US"Tamil"; break;
169      case ucp_Tagbanwa:    scriptname = US"Tagbanwa"; break;    case ucp_Telugu:      scriptname = US"Telugu"; break;
170      case ucp_Tai_Le:      scriptname = US"Tai_Le"; break;    case ucp_Thaana:      scriptname = US"Thaana"; break;
171      case ucp_Tamil:       scriptname = US"Tamil"; break;    case ucp_Thai:        scriptname = US"Thai"; break;
172      case ucp_Telugu:      scriptname = US"Telugu"; break;    case ucp_Tibetan:     scriptname = US"Tibetan"; break;
173      case ucp_Thaana:      scriptname = US"Thaana"; break;    case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;
174      case ucp_Thai:        scriptname = US"Thai"; break;    case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;
175      case ucp_Tibetan:     scriptname = US"Tibetan"; break;    case ucp_Yi:          scriptname = US"Yi"; break;
176      case ucp_Tifinagh:    scriptname = US"Tifinagh"; break;    /* New for Unicode 5.1: */
177      case ucp_Ugaritic:    scriptname = US"Ugaritic"; break;    case ucp_Carian:      scriptname = US"Carian"; break;
178      case ucp_Yi:          scriptname = US"Yi"; break;    case ucp_Cham:        scriptname = US"Cham"; break;
179      }    case ucp_Kayah_Li:    scriptname = US"Kayah_Li"; break;
180      case ucp_Lepcha:      scriptname = US"Lepcha"; break;
181    printf("%s: %s %s", typename, fulltypename, scriptname);    case ucp_Lycian:      scriptname = US"Lycian"; break;
182    othercase = _pcre_ucp_othercase(c);    case ucp_Lydian:      scriptname = US"Lydian"; break;
183    if (othercase >= 0) printf(" %04x", othercase);    case ucp_Ol_Chiki:    scriptname = US"Ol_Chiki"; break;
184    printf("\n");    case ucp_Rejang:      scriptname = US"Rejang"; break;
185      case ucp_Saurashtra:  scriptname = US"Saurashtra"; break;
186      case ucp_Sundanese:   scriptname = US"Sundanese"; break;
187      case ucp_Vai:         scriptname = US"Vai"; break;
188      /* New for Unicode 5.2: */
189      case ucp_Avestan:     scriptname = US"Avestan"; break;
190      case ucp_Bamum:       scriptname = US"Bamum"; break;
191      case ucp_Egyptian_Hieroglyphs: scriptname = US"Egyptian_Hieroglyphs"; break;
192      case ucp_Imperial_Aramaic: scriptname = US"Imperial_Aramaic"; break;
193      case ucp_Inscriptional_Pahlavi: scriptname = US"Inscriptional_Pahlavi"; break;
194      case ucp_Inscriptional_Parthian: scriptname = US"Inscriptional_Parthian"; break;
195      case ucp_Javanese:    scriptname = US"Javanese"; break;
196      case ucp_Kaithi:      scriptname = US"Kaithi"; break;
197      case ucp_Lisu:        scriptname = US"Lisu"; break;
198      case ucp_Meetei_Mayek: scriptname = US"Meetei_Mayek"; break;
199      case ucp_Old_South_Arabian: scriptname = US"Old_South_Arabian"; break;
200      case ucp_Old_Turkic:  scriptname = US"Old_Turkic"; break;
201      case ucp_Samaritan:   scriptname = US"Samaritan"; break;
202      case ucp_Tai_Tham:    scriptname = US"Tai_Tham"; break;
203      case ucp_Tai_Viet:    scriptname = US"Tai_Viet"; break;
204      /* New for Unicode 6.0.0 */
205      case ucp_Batak:       scriptname = US"Batak"; break;
206      case ucp_Brahmi:      scriptname = US"Brahmi"; break;
207      case ucp_Mandaic:     scriptname = US"Mandaic"; break;
208    
209      /* New for Unicode 6.1.0 */
210      case ucp_Chakma:               scriptname = US"Chakma"; break;
211      case ucp_Meroitic_Cursive:     scriptname = US"Meroitic_Cursive"; break;
212      case ucp_Meroitic_Hieroglyphs: scriptname = US"Meroitic_Hieroglyphs"; break;
213      case ucp_Miao:                 scriptname = US"Miao"; break;
214      case ucp_Sharada:              scriptname = US"Sharada"; break;
215      case ucp_Sora_Sompeng:         scriptname = US"Sora Sompent"; break;
216      case ucp_Takri:                scriptname = US"Takri"; break;
217    
218    }    }
219    
220    printf("%04x %s: %s %s", c, typename, fulltypename, scriptname);
221    if (othercase != c) printf(" %04x", othercase);
222    printf("\n");
223  }  }
224    
225    
# Line 176  if (type < 0) printf("not found\n"); els Line 231  if (type < 0) printf("not found\n"); els
231  int  int
232  main(void)  main(void)
233  {  {
234  uschar buffer[1024];  unsigned char buffer[1024];
235  while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)  while (fgets(CS buffer, sizeof(buffer), stdin) != NULL)
236    {    {
237    uschar name[24];    unsigned char name[24];
238    uschar *s, *t;    unsigned char *s, *t;
239    
240    printf("%s", buffer);    printf("%s", buffer);
241    s = buffer;    s = buffer;
# Line 195  while (fgets(CS buffer, sizeof(buffer), Line 250  while (fgets(CS buffer, sizeof(buffer),
250      {      {
251      while (*s != 0)      while (*s != 0)
252        {        {
253        uschar *endptr;        unsigned char *endptr;
254        int c = strtoul(CS s, CSS(&endptr), 16);        int c = strtoul(CS s, CSS(&endptr), 16);
255        print_prop(c);        print_prop(c);
256        s = endptr;        s = endptr;

Legend:
Removed from v.97  
changed lines
  Added in v.943

  ViewVC Help
Powered by ViewVC 1.1.5