1 |
#! /usr/bin/python
|
2 |
|
3 |
# Multistage table builder
|
4 |
# (c) Peter Kankowski, 2008
|
5 |
|
6 |
##############################################################################
|
7 |
# This script was submitted to the PCRE project by Peter Kankowski as part of
|
8 |
# the upgrading of Unicode property support. The new code speeds up property
|
9 |
# matching many times. The script is for the use of PCRE maintainers, to
|
10 |
# generate the pcre_ucd.c file that contains a digested form of the Unicode
|
11 |
# data tables.
|
12 |
#
|
13 |
# The script should be run in the maint subdirectory, using the command
|
14 |
#
|
15 |
# ./MultiStage2.py >../pcre_ucd.c
|
16 |
#
|
17 |
# It requires three Unicode data tables, DerivedGeneralCategory.txt,
|
18 |
# Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory.
|
19 |
#
|
20 |
# Minor modifications made to this script:
|
21 |
# Added #! line at start
|
22 |
# Removed tabs
|
23 |
# Made it work with Python 2.4 by rewriting two statements that needed 2.5
|
24 |
# Consequent code tidy
|
25 |
# Adjusted data file names to take from the Unicode.tables directory
|
26 |
# Adjusted global table names by prefixing _pcre_.
|
27 |
# Commented out stuff relating to the casefolding table, which isn't used.
|
28 |
# Corrected size calculation
|
29 |
# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
|
30 |
#
|
31 |
# The tables generated by this script are used by macros defined in
|
32 |
# pcre_internal.h. They look up Unicode character properties using short
|
33 |
# sequences of code that contains no branches, which makes for greater speed.
|
34 |
#
|
35 |
# Conceptually, there is a table of records (of type ucd_record), containing a
|
36 |
# script number, character type, and offset to the character's other case for
|
37 |
# every character. However, a real table covering all Unicode characters would
|
38 |
# be far too big. It can be efficiently compressed by observing that many
|
39 |
# characters have the same record, and many blocks of characters (taking 128
|
40 |
# characters in a block) have the same set of records as other blocks. This
|
41 |
# leads to a 2-stage lookup process.
|
42 |
#
|
43 |
# This script constructs three tables. The _pcre_ucd_records table contains
|
44 |
# one instance of every unique record that is required. The _pcre_ucd_stage1
|
45 |
# table is indexed by a character's block number, and yields what is in effect
|
46 |
# a "virtual" block number. The _pcre_ucd_stage2 table is a table of "virtual"
|
47 |
# blocks; each block is indexed by the offset of a character within its own
|
48 |
# block, and the result is the offset of the required record.
|
49 |
#
|
50 |
# Example: lowercase "a" (U+0061) is in block 0
|
51 |
# lookup 0 in stage1 table yields 0
|
52 |
# lookup 97 in the first table in stage2 yields 12
|
53 |
# record 12 is { 33, 5, -32 } (Latin, lowercase, upper is U+0041)
|
54 |
#
|
55 |
# All lowercase latin characters resolve to the same record.
|
56 |
#
|
57 |
# Example: hiragana letter A (U+3042) is in block 96 (0x60)
|
58 |
# lookup 96 in stage1 table yields 83
|
59 |
# lookup 66 in the 83rd table in stage2 yields 348
|
60 |
# record 348 is { 26, 7, 0 } (Hiragana, other letter, no other case)
|
61 |
#
|
62 |
# In these examples, no other blocks resolve to the same "virtual" block, as it
|
63 |
# happens, but plenty of other blocks do share "virtual" blocks.
|
64 |
#
|
65 |
# There is a fourth table, maintained by hand, which translates from the
|
66 |
# individual character types such as ucp_Cc to the general types like ucp_C.
|
67 |
#
|
68 |
# Philip Hazel, 03 July 2008
|
69 |
##############################################################################
|
70 |
|
71 |
|
72 |
import re
|
73 |
import string
|
74 |
import sys
|
75 |
|
76 |
MAX_UNICODE = 0x110000
|
77 |
NOTACHAR = 0xffffffff
|
78 |
|
79 |
# Parse a line of CaseFolding.txt, Scripts.txt, and DerivedGeneralCategory.txt file
|
80 |
def make_get_names(enum):
|
81 |
return lambda chardata: enum.index(chardata[1])
|
82 |
|
83 |
#def get_case_folding_value(chardata):
|
84 |
# if chardata[1] != 'C' and chardata[1] != 'S':
|
85 |
# return 0
|
86 |
# return int(chardata[2], 16) - int(chardata[0], 16)
|
87 |
|
88 |
def get_other_case(chardata):
|
89 |
if chardata[12] != '':
|
90 |
return int(chardata[12], 16) - int(chardata[0], 16)
|
91 |
if chardata[13] != '':
|
92 |
return int(chardata[13], 16) - int(chardata[0], 16)
|
93 |
return 0
|
94 |
|
95 |
# Read the whole table in memory
|
96 |
def read_table(file_name, get_value, default_value):
|
97 |
file = open(file_name, 'r')
|
98 |
table = [default_value] * MAX_UNICODE
|
99 |
for line in file:
|
100 |
line = re.sub(r'#.*', '', line)
|
101 |
chardata = map(string.strip, line.split(';'))
|
102 |
if len(chardata) <= 1:
|
103 |
continue
|
104 |
value = get_value(chardata)
|
105 |
|
106 |
m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
|
107 |
char = int(m.group(1), 16)
|
108 |
if m.group(3) is None:
|
109 |
last = char
|
110 |
else:
|
111 |
last = int(m.group(3), 16)
|
112 |
for i in range(char, last + 1):
|
113 |
table[i] = value
|
114 |
file.close()
|
115 |
return table
|
116 |
|
117 |
# Get the smallest possible C language type for the values
|
118 |
def get_type_size(table):
|
119 |
type_size = [("uschar", 1), ("pcre_uint16", 2), ("pcre_uint32", 4),
|
120 |
("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
|
121 |
limits = [(0, 255), (0, 65535), (0, 4294967295),
|
122 |
(-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
|
123 |
minval = min(table)
|
124 |
maxval = max(table)
|
125 |
for num, (minlimit, maxlimit) in enumerate(limits):
|
126 |
if minlimit <= minval and maxval <= maxlimit:
|
127 |
return type_size[num]
|
128 |
else:
|
129 |
raise OverflowError, "Too large to fit into C types"
|
130 |
|
131 |
def get_tables_size(*tables):
|
132 |
total_size = 0
|
133 |
for table in tables:
|
134 |
type, size = get_type_size(table)
|
135 |
total_size += size * len(table)
|
136 |
return total_size
|
137 |
|
138 |
# Compress the table into the two stages
|
139 |
def compress_table(table, block_size):
|
140 |
blocks = {} # Dictionary for finding identical blocks
|
141 |
stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
|
142 |
stage2 = [] # Stage 2 table contains the blocks with property values
|
143 |
table = tuple(table)
|
144 |
for i in range(0, len(table), block_size):
|
145 |
block = table[i:i+block_size]
|
146 |
start = blocks.get(block)
|
147 |
if start is None:
|
148 |
# Allocate a new block
|
149 |
start = len(stage2) / block_size
|
150 |
stage2 += block
|
151 |
blocks[block] = start
|
152 |
stage1.append(start)
|
153 |
|
154 |
return stage1, stage2
|
155 |
|
156 |
# Print a table
|
157 |
def print_table(table, table_name, block_size = None):
|
158 |
type, size = get_type_size(table)
|
159 |
ELEMS_PER_LINE = 16
|
160 |
|
161 |
s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
|
162 |
if block_size:
|
163 |
s += ", block = %d" % block_size
|
164 |
print s + " */"
|
165 |
table = tuple(table)
|
166 |
if block_size is None:
|
167 |
fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
|
168 |
mult = MAX_UNICODE / len(table)
|
169 |
for i in range(0, len(table), ELEMS_PER_LINE):
|
170 |
print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
|
171 |
else:
|
172 |
if block_size > ELEMS_PER_LINE:
|
173 |
el = ELEMS_PER_LINE
|
174 |
else:
|
175 |
el = block_size
|
176 |
fmt = "%3d," * el + "\n"
|
177 |
if block_size > ELEMS_PER_LINE:
|
178 |
fmt = fmt * (block_size / ELEMS_PER_LINE)
|
179 |
for i in range(0, len(table), block_size):
|
180 |
print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
|
181 |
print "};\n"
|
182 |
|
183 |
# Extract the unique combinations of properties into records
|
184 |
def combine_tables(*tables):
|
185 |
records = {}
|
186 |
index = []
|
187 |
for t in zip(*tables):
|
188 |
i = records.get(t)
|
189 |
if i is None:
|
190 |
i = records[t] = len(records)
|
191 |
index.append(i)
|
192 |
return index, records
|
193 |
|
194 |
def get_record_size_struct(records):
|
195 |
size = 0
|
196 |
structure = '/* When recompiling tables with a new Unicode version,\n' + \
|
197 |
'please check types in the structure definition from pcre_internal.h:\ntypedef struct {\n'
|
198 |
for i in range(len(records[0])):
|
199 |
record_slice = map(lambda record: record[i], records)
|
200 |
slice_type, slice_size = get_type_size(record_slice)
|
201 |
# add padding: round up to the nearest power of slice_size
|
202 |
size = (size + slice_size - 1) & -slice_size
|
203 |
size += slice_size
|
204 |
structure += '%s property_%d;\n' % (slice_type, i)
|
205 |
|
206 |
# round up to the first item of the next structure in array
|
207 |
record_slice = map(lambda record: record[0], records)
|
208 |
slice_type, slice_size = get_type_size(record_slice)
|
209 |
size = (size + slice_size - 1) & -slice_size
|
210 |
|
211 |
structure += '} ucd_record; */\n\n'
|
212 |
return size, structure
|
213 |
|
214 |
def test_record_size():
|
215 |
tests = [ \
|
216 |
( [(3,), (6,), (6,), (1,)], 1 ), \
|
217 |
( [(300,), (600,), (600,), (100,)], 2 ), \
|
218 |
( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
|
219 |
( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
|
220 |
( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
221 |
( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
|
222 |
( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
|
223 |
( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
|
224 |
]
|
225 |
for test in tests:
|
226 |
size, struct = get_record_size_struct(test[0])
|
227 |
assert(size == test[1])
|
228 |
#print struct
|
229 |
|
230 |
def print_records(records, record_size):
|
231 |
print 'const ucd_record _pcre_ucd_records[] = { ' + \
|
232 |
'/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
|
233 |
records = zip(records.keys(), records.values())
|
234 |
records.sort(None, lambda x: x[1])
|
235 |
for i, record in enumerate(records):
|
236 |
print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
|
237 |
print '};\n'
|
238 |
|
239 |
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
|
240 |
'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
|
241 |
'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
|
242 |
'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
|
243 |
'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
|
244 |
'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
|
245 |
'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
|
246 |
# New for Unicode 5.0
|
247 |
'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
|
248 |
# New for Unicode 5.1
|
249 |
'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai']
|
250 |
|
251 |
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
|
252 |
'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
|
253 |
'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
|
254 |
|
255 |
test_record_size()
|
256 |
|
257 |
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
|
258 |
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
|
259 |
other_case = read_table('Unicode.tables/UnicodeData.txt', get_other_case, 0)
|
260 |
# case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0)
|
261 |
|
262 |
table, records = combine_tables(script, category, other_case)
|
263 |
record_size, record_struct = get_record_size_struct(records.keys())
|
264 |
|
265 |
# Find the optimum block size for the two-stage table
|
266 |
min_size = sys.maxint
|
267 |
for block_size in [2 ** i for i in range(5,10)]:
|
268 |
size = len(records) * record_size
|
269 |
stage1, stage2 = compress_table(table, block_size)
|
270 |
size += get_tables_size(stage1, stage2)
|
271 |
#print "/* block size %5d => %5d bytes */" % (block_size, size)
|
272 |
if size < min_size:
|
273 |
min_size = size
|
274 |
min_stage1, min_stage2 = stage1, stage2
|
275 |
min_block_size = block_size
|
276 |
|
277 |
print "#ifdef HAVE_CONFIG_H"
|
278 |
print "#include \"config.h\""
|
279 |
print "#endif"
|
280 |
print
|
281 |
print "#include \"pcre_internal.h\""
|
282 |
print
|
283 |
print "/* Unicode character database. */"
|
284 |
print "/* This file was autogenerated by the MultiStage2.py script. */"
|
285 |
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
|
286 |
print
|
287 |
print "/* The tables herein are needed only when UCP support is built */"
|
288 |
print "/* into PCRE. This module should not be referenced otherwise, so */"
|
289 |
print "/* it should not matter whether it is compiled or not. However */"
|
290 |
print "/* a comment was received about space saving - maybe the guy linked */"
|
291 |
print "/* all the modules rather than using a library - so we include a */"
|
292 |
print "/* condition to cut out the tables when not needed. But don't leave */"
|
293 |
print "/* a totally empty module because some compilers barf at that. */"
|
294 |
print "/* Instead, just supply small dummy tables. */"
|
295 |
print
|
296 |
print "#ifndef SUPPORT_UCP"
|
297 |
print "const ucd_record _pcre_ucd_records[] = {{0,0,0 }};"
|
298 |
print "const uschar _pcre_ucd_stage1[] = {0};"
|
299 |
print "const pcre_uint16 _pcre_ucd_stage2[] = {0};"
|
300 |
print "#else"
|
301 |
print
|
302 |
print record_struct
|
303 |
print_records(records, record_size)
|
304 |
print_table(min_stage1, '_pcre_ucd_stage1')
|
305 |
print_table(min_stage2, '_pcre_ucd_stage2', min_block_size)
|
306 |
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
|
307 |
print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
|
308 |
print "#endif"
|
309 |
print "#endif /* SUPPORT_UCP */"
|
310 |
|
311 |
"""
|
312 |
|
313 |
# Three-stage tables:
|
314 |
|
315 |
# Find the optimum block size for 3-stage table
|
316 |
min_size = sys.maxint
|
317 |
for stage3_block in [2 ** i for i in range(2,6)]:
|
318 |
stage_i, stage3 = compress_table(table, stage3_block)
|
319 |
for stage2_block in [2 ** i for i in range(5,10)]:
|
320 |
size = len(records) * 4
|
321 |
stage1, stage2 = compress_table(stage_i, stage2_block)
|
322 |
size += get_tables_size(stage1, stage2, stage3)
|
323 |
# print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
|
324 |
if size < min_size:
|
325 |
min_size = size
|
326 |
min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
|
327 |
min_stage2_block, min_stage3_block = stage2_block, stage3_block
|
328 |
|
329 |
print "/* Total size: %d bytes" % min_size */
|
330 |
print_records(records)
|
331 |
print_table(min_stage1, 'ucd_stage1')
|
332 |
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
|
333 |
print_table(min_stage3, 'ucd_stage3', min_stage3_block)
|
334 |
|
335 |
"""
|