#! /usr/bin/python

# Generate utt tables.

# The source file pcre_tables.c contains (amongst other things) a table that
# is indexed by script name. In order to reduce the number of relocations when
# loading the library, the names are held as a single large string, with
# offsets into that string in the table. This is tedious to maintain by hand,
# so this script is used to generate the table. The output is sent to stdout.

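# For orientation, the output has roughly this shape once the STR_ macro
# layer (described below) is expanded. This is an illustrative excerpt, not
# the verbatim output:
#
#   const char _pcre_utt_names[] =
#     "Any\0"
#     "Arabic\0"
#     ...
#
#   const ucp_type_table _pcre_utt[] = {
#     {   0, PT_ANY, 0 },
#     {   4, PT_SC, ucp_Arabic },
#     ...
#   };
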
# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts from Unicode 5.2.0.

script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
  'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
  'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
  'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
  'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
  'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
  'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
  # New for Unicode 5.0
  'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
  # New for Unicode 5.1
  'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
  # New for Unicode 5.2
  'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
  'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
  'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
  'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet'
  ]

category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]

general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']

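# Pair each name with its property type: scripts, particular categories,
# general categories, and then the two special entries.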
utt_table = zip(script_names, ['PT_SC'] * len(script_names))
utt_table += zip(category_names, ['PT_PC'] * len(category_names))
utt_table += zip(general_category_names, ['PT_GC'] * len(general_category_names))
utt_table.append(('L&', 'PT_LAMP'))
utt_table.append(('Any', 'PT_ANY'))

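# Sort the whole table by name, so that an entry can be found at run time
# with a binary chop.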
utt_table.sort()

# We have to use STR_ macros to define the strings so that it all works in
# UTF-8 mode on EBCDIC platforms.
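# For example, the entry for 'L&' is emitted as:
#
#   #define STRING_L_AMPERSAND0 STR_L STR_AMPERSAND "\0"
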
for utt in utt_table:
  print '#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')),
  for c in utt[0]:
    if c == '_':
      print 'STR_UNDERSCORE',
    elif c == '&':
      print 'STR_AMPERSAND',
    else:
      print 'STR_%s' % c,
  print '"\\0"'

# Print the actual table, using the string names

print ''
print 'const char _pcre_utt_names[] = '
last = ''
for utt in utt_table:
  if utt == utt_table[-1]:
    last = ';'
  print '  STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)
# This was how it was done before the EBCDIC-compatible modification.
#   print '  "%s\\0"%s' % (utt[0], last)

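# Each entry records the offset of its name within the big string, so the
# running offset advances by the name length plus one for the trailing '\0'.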
print '\nconst ucp_type_table _pcre_utt[] = { '
offset = 0
last = ','
for utt in utt_table:
  if utt[1] in ('PT_ANY', 'PT_LAMP'):
    value = '0'
  else:
    value = 'ucp_' + utt[0]
  if utt == utt_table[-1]:
    last = ''
  print '  { %3d, %s, %s }%s ' % (offset, utt[1], value, last)
  offset += len(utt[0]) + 1
print '};'
|