/[pcre]/code/trunk/maint/MultiStage2.py
ViewVC logotype

Contents of /code/trunk/maint/MultiStage2.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1490 - (show annotations)
Thu Jun 19 07:51:39 2014 UTC (5 years, 5 months ago) by chpe
File MIME type: text/x-python
File size: 21614 byte(s)
Update to Unicode 7.0.0 release
1 #! /usr/bin/python
2
3 # Multistage table builder
4 # (c) Peter Kankowski, 2008
5
6 ##############################################################################
7 # This script was submitted to the PCRE project by Peter Kankowski as part of
8 # the upgrading of Unicode property support. The new code speeds up property
9 # matching many times. The script is for the use of PCRE maintainers, to
10 # generate the pcre_ucd.c file that contains a digested form of the Unicode
11 # data tables.
12 #
13 # The script should be run in the maint subdirectory, using the command
14 #
15 # ./MultiStage2.py >../pcre_ucd.c
16 #
17 # It requires four Unicode data tables, DerivedGeneralCategory.txt,
18 # GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
19 # Unicode.tables subdirectory. The first of these is found in the "extracted"
20 # subdirectory of the Unicode database (UCD) on the Unicode web site; the
21 # second is in the "auxiliary" subdirectory; the other two are directly in the
22 # UCD directory.
23 #
24 # Minor modifications made to this script:
25 # Added #! line at start
26 # Removed tabs
27 # Made it work with Python 2.4 by rewriting two statements that needed 2.5
28 # Consequent code tidy
29 # Adjusted data file names to take from the Unicode.tables directory
30 # Adjusted global table names by prefixing _pcre_.
31 # Commented out stuff relating to the casefolding table, which isn't used;
32 # removed completely in 2012.
33 # Corrected size calculation
34 # Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
35 #
36 # Major modifications made to this script:
37 # Added code to add a grapheme break property field to records.
38 #
39 # Added code to search for sets of more than two characters that must match
40 # each other caselessly. A new table is output containing these sets, and
41 # offsets into the table are added to the main output records. This new
42 # code scans CaseFolding.txt instead of UnicodeData.txt.
43 #
44 # The main tables generated by this script are used by macros defined in
45 # pcre_internal.h. They look up Unicode character properties using short
46 # sequences of code that contains no branches, which makes for greater speed.
47 #
48 # Conceptually, there is a table of records (of type ucd_record), containing a
49 # script number, character type, grapheme break type, offset to caseless
50 # matching set, and offset to the character's other case for every character.
51 # However, a real table covering all Unicode characters would be far too big.
52 # It can be efficiently compressed by observing that many characters have the
53 # same record, and many blocks of characters (taking 128 characters in a block)
54 # have the same set of records as other blocks. This leads to a 2-stage lookup
55 # process.
56 #
57 # This script constructs four tables. The ucd_caseless_sets table contains
58 # lists of characters that all match each other caselessly. Each list is
59 # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
60 # any valid character. The first list is empty; this is used for characters
61 # that are not part of any list.
62 #
63 # The ucd_records table contains one instance of every unique record that is
64 # required. The ucd_stage1 table is indexed by a character's block number, and
65 # yields what is in effect a "virtual" block number. The ucd_stage2 table is a
66 # table of "virtual" blocks; each block is indexed by the offset of a character
67 # within its own block, and the result is the offset of the required record.
68 #
69 # Example: lowercase "a" (U+0061) is in block 0
70 # lookup 0 in stage1 table yields 0
71 # lookup 97 in the first table in stage2 yields 16
72 # record 17 is { 33, 5, 11, 0, -32 }
73 # 33 = ucp_Latin => Latin script
74 # 5 = ucp_Ll => Lower case letter
75 # 11 = ucp_gbOther => Grapheme break property "Other"
76 # 0 => not part of a caseless set
77 # -32 => Other case is U+0041
78 #
79 # Almost all lowercase latin characters resolve to the same record. One or two
80 # are different because they are part of a multi-character caseless set (for
81 # example, k, K and the Kelvin symbol are such a set).
82 #
83 # Example: hiragana letter A (U+3042) is in block 96 (0x60)
84 # lookup 96 in stage1 table yields 88
85 # lookup 66 in the 88th table in stage2 yields 467
86 # record 470 is { 26, 7, 11, 0, 0 }
87 # 26 = ucp_Hiragana => Hiragana script
88 # 7 = ucp_Lo => Other letter
89 # 11 = ucp_gbOther => Grapheme break property "Other"
90 # 0 => not part of a caseless set
91 # 0 => No other case
92 #
93 # In these examples, no other blocks resolve to the same "virtual" block, as it
94 # happens, but plenty of other blocks do share "virtual" blocks.
95 #
96 # There is a fourth table, maintained by hand, which translates from the
97 # individual character types such as ucp_Cc to the general types like ucp_C.
98 #
99 # Philip Hazel, 03 July 2008
100 #
101 # 01-March-2010: Updated list of scripts for Unicode 5.2.0
102 # 30-April-2011: Updated list of scripts for Unicode 6.0.0
103 # July-2012: Updated list of scripts for Unicode 6.1.0
104 # 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
105 # field in the record to hold the value. Luckily, the
106 # structure had a hole in it, so the resulting table is
107 # not much bigger than before.
108 # 18-September-2012: Added code for multiple caseless sets. This uses the
109 # final hole in the structure.
110 # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
111 ##############################################################################
112
113
114 import re
115 import string
116 import sys
117
118 MAX_UNICODE = 0x110000
119 NOTACHAR = 0xffffffff
120
121 # Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
def make_get_names(enum):
    """Build a parser for a line of Scripts.txt, GraphemeBreakProperty.txt
    or DerivedGeneralCategory.txt: given the ';'-separated fields of one
    line, return the index of the property name (field 1) within *enum*."""
    def get_name_index(chardata):
        return enum.index(chardata[1])
    return get_name_index
124
125 # Parse a line of CaseFolding.txt
def get_other_case(chardata):
    """Parse a line of CaseFolding.txt (already split into fields) and
    return the signed offset from the character to its folded case.
    Only Common ('C') and Simple ('S') foldings are used; all other
    statuses yield 0 (no other case)."""
    status = chardata[1]
    if status != 'C' and status != 'S':
        return 0
    return int(chardata[2], 16) - int(chardata[0], 16)
130
131
132 # Read the whole table in memory
# Read the whole table in memory
def read_table(file_name, get_value, default_value):
    """Read a Unicode data file into a per-code-point list.

    file_name     -- path of the data file
    get_value     -- maps the stripped ';'-separated fields of one line to
                     the value stored for each code point on that line
    default_value -- value for code points the file does not mention

    Returns a list of MAX_UNICODE entries indexed by code point.
    """
    table = [default_value] * MAX_UNICODE
    # Renamed from 'file', which shadowed the builtin of the same name.
    f = open(file_name, 'r')
    for line in f:
        line = re.sub(r'#.*', '', line)        # strip trailing comment
        # str.strip in a comprehension replaces map(string.strip, ...):
        # string.strip is long deprecated, and on Python 3 map() returns
        # an iterator, which would break the len() test below.
        chardata = [field.strip() for field in line.split(';')]
        if len(chardata) <= 1:
            continue                           # blank or comment-only line
        value = get_value(chardata)
        # First field is a single code point or an XXXX..YYYY range.
        m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
        char = int(m.group(1), 16)
        if m.group(3) is None:
            last = char
        else:
            last = int(m.group(3), 16)
        for i in range(char, last + 1):
            # It is important not to overwrite a previously set
            # value because in the CaseFolding file there are lines
            # to be ignored (returning the default value of 0)
            # which often come after a line which has already set
            # data.
            if table[i] == default_value:
                table[i] = value
    f.close()
    return table
158
159 # Get the smallest possible C language type for the values
# Get the smallest possible C language type for the values
def get_type_size(table):
    """Return (c_type_name, size_in_bytes) for the smallest C type that can
    hold every value in *table*.  Unsigned types are tried before signed
    ones.  Raises OverflowError when no 32-bit type is wide enough."""
    type_size = [("pcre_uint8", 1), ("pcre_uint16", 2), ("pcre_uint32", 4),
      ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
    limits = [(0, 255), (0, 65535), (0, 4294967295),
      (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
    minval = min(table)
    maxval = max(table)
    for num, (minlimit, maxlimit) in enumerate(limits):
        if minlimit <= minval and maxval <= maxlimit:
            return type_size[num]
    # Call form instead of the old "raise OverflowError, msg" statement,
    # which is a syntax error on Python 3; identical behavior on Python 2.
    raise OverflowError("Too large to fit into C types")
172
def get_tables_size(*tables):
    """Return the total storage, in bytes, needed for all the given tables,
    each assumed stored with the smallest C type holding its values."""
    return sum(get_type_size(table)[1] * len(table) for table in tables)
179
180 # Compress the table into the two stages
# Compress the table into the two stages
def compress_table(table, block_size):
    """Split *table* into fixed-size blocks, sharing identical blocks.

    Returns (stage1, stage2): stage1 maps each block number to a "virtual"
    block number, and stage2 holds the virtual blocks back to back, so that
    table[c] == stage2[stage1[c // block_size] * block_size + c % block_size].
    """
    blocks = {}   # maps a block (as a tuple) to its virtual block number
    stage1 = []   # stage 1 table contains block numbers (indices into stage 2 table)
    stage2 = []   # stage 2 table contains the blocks with property values
    table = tuple(table)          # tuples are hashable, usable as dict keys
    for i in range(0, len(table), block_size):
        block = table[i:i+block_size]
        start = blocks.get(block)
        if start is None:
            # Allocate a new virtual block.  Floor division: the old "/"
            # was integer division on Python 2 but yields a float on
            # Python 3, corrupting the stage 1 indices.
            start = len(stage2) // block_size
            stage2 += block
            blocks[block] = start
        stage1.append(start)

    return stage1, stage2
197
198 # Print a table
def print_table(table, table_name, block_size = None):
    # Write *table* to stdout as a C array definition named *table_name*,
    # using the smallest C type that can hold its values.  When block_size
    # is given the table is a stage 2 table made of fixed-size blocks and
    # is printed one block per group; otherwise each output line is
    # annotated with the code point of its first entry.
    type, size = get_type_size(table)
    ELEMS_PER_LINE = 16

    s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
    if block_size:
        s += ", block = %d" % block_size
    print s + " */"
    table = tuple(table)
    if block_size is None:
        fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
        # NOTE(review): Python 2 integer division; mult is the number of
        # code points represented by each table entry -- assumed to divide
        # MAX_UNICODE exactly.  TODO confirm if ever run under Python 3.
        mult = MAX_UNICODE / len(table)
        for i in range(0, len(table), ELEMS_PER_LINE):
            print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
    else:
        # One "%3d," cell per element, at most ELEMS_PER_LINE per line;
        # a block larger than one line is printed as several lines.
        if block_size > ELEMS_PER_LINE:
            el = ELEMS_PER_LINE
        else:
            el = block_size
        fmt = "%3d," * el + "\n"
        if block_size > ELEMS_PER_LINE:
            fmt = fmt * (block_size / ELEMS_PER_LINE)
        for i in range(0, len(table), block_size):
            print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
    print "};\n"
224
225 # Extract the unique combinations of properties into records
# Extract the unique combinations of properties into records
def combine_tables(*tables):
    """Zip several per-code-point tables into unique records.

    Returns (index, records): *index* maps each code point to a record
    number; *records* maps each distinct property tuple to its number,
    numbered in order of first appearance.
    """
    records = {}
    index = []
    for combination in zip(*tables):
        # setdefault assigns the next free number on first sighting and
        # returns the previously assigned number afterwards.
        index.append(records.setdefault(combination, len(records)))
    return index, records
235
def get_record_size_struct(records):
    """Compute the aligned size in bytes of a C struct holding one record,
    plus a commented-out C definition of that struct.

    records -- a sequence of equal-length property tuples

    Returns (size, structure_text).
    """
    size = 0
    parts = ['/* When recompiling tables with a new Unicode version, please check the\n'
             'types in this structure definition from pcre_internal.h (the actual\n'
             'field names will be different):\n\ntypedef struct {\n']
    for i in range(len(records[0])):
        column = [record[i] for record in records]
        slice_type, slice_size = get_type_size(column)
        # Add padding: round size up to the next multiple of slice_size.
        size = (size + slice_size - 1) & -slice_size
        size += slice_size
        parts.append('%s property_%d;\n' % (slice_type, i))

    # Round up to the alignment of the first field, so consecutive structs
    # in an array stay correctly aligned.
    first_column = [record[0] for record in records]
    slice_type, slice_size = get_type_size(first_column)
    size = (size + slice_size - 1) & -slice_size

    parts.append('} ucd_record;\n*/\n\n')
    return size, ''.join(parts)
256
def test_record_size():
    """Self-test: check get_record_size_struct against known record shapes
    and their expected aligned struct sizes."""
    cases = [
        ([(3,), (6,), (6,), (1,)], 1),
        ([(300,), (600,), (600,), (100,)], 2),
        ([(25, 3), (6, 6), (34, 6), (68, 1)], 2),
        ([(300, 3), (6, 6), (340, 6), (690, 1)], 4),
        ([(3, 300), (6, 6), (6, 340), (1, 690)], 4),
        ([(300, 300), (6, 6), (6, 340), (1, 690)], 4),
        ([(3, 100000), (6, 6), (6, 123456), (1, 690)], 8),
        ([(100000, 300), (6, 6), (123456, 6), (1, 690)], 8),
    ]
    for records, expected_size in cases:
        size, struct = get_record_size_struct(records)
        assert(size == expected_size)
272
def print_records(records, record_size):
    # Write the ucd_records array as C source.  *records* maps each unique
    # property tuple to its record number; *record_size* is the aligned
    # struct size in bytes (used only in the size comment).
    print 'const ucd_record PRIV(ucd_records)[] = { ' + \
    '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
    # Pair each tuple with its number and sort by number, so that a
    # record's position in the emitted array equals its record number.
    records = zip(records.keys(), records.values())
    # Python 2 list.sort(cmp, key) signature: no cmp, key = record number.
    records.sort(None, lambda x: x[1])
    for i, record in enumerate(records):
        print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
    print '};\n'
281
# Script names, in the order that defines their numeric codes: the position
# of each name in this list is the value stored in the generated tables, so
# it must stay in step with the corresponding enumeration in the C sources.
# Never reorder this list; new scripts are appended at the end, grouped by
# the Unicode version that introduced them.
script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
# New for Unicode 5.0
 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
# New for Unicode 5.1
 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
# New for Unicode 5.2
 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
# New for Unicode 6.0.0
 'Batak', 'Brahmi', 'Mandaic', \
# New for Unicode 6.1.0
 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
# New for Unicode 7.0.0
 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi'
 ]
308
# General category names: as with script_names, the position of each name
# is its numeric code in the generated tables -- never reorder.
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]

# Grapheme break property names: position is the numeric code -- never
# reorder.
break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
  'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other' ]
315
# Run the self-test of the struct-size computation before any real work.
test_record_size()

# Read the four per-code-point source tables.  Unmentioned code points
# default to script 'Common', category 'Cn', break property 'Other', and a
# case offset of 0 (no other case).
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
322
323
# This block of code was added by PH in September 2012. It scans the
# other_case table to find sets of more than two characters that must all
# match each other caselessly. Later in this script a table of these sets is
# written out. However, we have to do this work here in order to compute the
# offsets in the table that are inserted into the main table.

# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.

# NOTE: the loops below originally used range(0x10ffff), which missed the
# last code point U+10FFFF; range(MAX_UNICODE) covers all code points.
# (Harmless in practice, since U+10FFFF has no case folding, but correct.)

for c in range(MAX_UNICODE):
    if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
        other_case[c + other_case[c]] = -other_case[c]

# Now scan again and create equivalence sets.

sets = []

for c in range(MAX_UNICODE):
    o = c + other_case[c]

    # Trigger when this character's other case does not point back here. We
    # now have three characters that are case-equivalent.

    if other_case[o] != -other_case[c]:
        t = o + other_case[o]

        # Scan the existing sets to see if any of the three characters are
        # already part of a set. If so, add the missing ones to that set.
        # (Membership tests replace the original hand-rolled scan, which
        # failed to reset its "found" flag between candidates and so could
        # skip adding a later missing member.)

        appended = 0
        for s in sets:
            if c in s or o in s or t in s:
                for y in [c, o, t]:
                    if y not in s:
                        s.append(y)
                appended = 1

        # If we have not added to an existing set, create a new one.

        if not appended:
            sets.append([c, o, t])

# End of loop looking for caseless sets.

# Now scan the sets and set appropriate offsets for the characters. Offset 0
# is reserved for "not in any set"; each set occupies len(s) + 1 slots in
# the output table because it is terminated by NOTACHAR.

caseless_offsets = [0] * MAX_UNICODE

offset = 1
for s in sets:
    for x in s:
        caseless_offsets[x] = offset
    offset += len(s) + 1

# End of block of code for creating offsets for caseless matching sets.
392
393
# Combine the tables

# Zip the five per-code-point tables into one index table plus the set of
# unique records, then derive the C struct layout (aligned size and a
# commented-out definition).
table, records = combine_tables(script, category, break_props,
  caseless_offsets, other_case)

record_size, record_struct = get_record_size_struct(records.keys())

# Find the optimum block size for the two-stage table
# Try power-of-two block sizes from 32 to 512 and keep whichever yields the
# smallest total footprint (records plus both stage tables).
# NOTE: sys.maxint exists on Python 2 only.
min_size = sys.maxint
for block_size in [2 ** i for i in range(5,10)]:
    size = len(records) * record_size
    stage1, stage2 = compress_table(table, block_size)
    size += get_tables_size(stage1, stage2)
    #print "/* block size %5d => %5d bytes */" % (block_size, size)
    if size < min_size:
        min_size = size
        min_stage1, min_stage2 = stage1, stage2
        min_block_size = block_size
412
# Emit the generated C source (pcre_ucd.c) on stdout: a file header, dummy
# tables for non-UCP builds, then the real caseless-set, record, stage 1
# and stage 2 tables.
print "/* This module is generated by the maint/MultiStage2.py script."
print "Do not modify it by hand. Instead modify the script and run it"
print "to regenerate this code."
print
print "As well as being part of the PCRE library, this module is #included"
print "by the pcretest program, which redefines the PRIV macro to change"
print "table names from _pcre_xxx to xxxx, thereby avoiding name clashes"
print "with the library. At present, just one of these tables is actually"
print "needed. */"
print
print "#ifndef PCRE_INCLUDED"
print
print "#ifdef HAVE_CONFIG_H"
print "#include \"config.h\""
print "#endif"
print
print "#include \"pcre_internal.h\""
print
print "#endif /* PCRE_INCLUDED */"
print
print "/* Unicode character database. */"
print "/* This file was autogenerated by the MultiStage2.py script. */"
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
print
print "/* The tables herein are needed only when UCP support is built"
print "into PCRE. This module should not be referenced otherwise, so"
print "it should not matter whether it is compiled or not. However"
print "a comment was received about space saving - maybe the guy linked"
print "all the modules rather than using a library - so we include a"
print "condition to cut out the tables when not needed. But don't leave"
print "a totally empty module because some compilers barf at that."
print "Instead, just supply small dummy tables. */"
print
print "#ifndef SUPPORT_UCP"
print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
print "const pcre_uint8 PRIV(ucd_stage1)[] = {0};"
print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
print "#else"
print
print record_struct

# --- Added by PH: output the table of caseless character sets ---

# Each set is emitted sorted and NOTACHAR-terminated; the first entry of
# the table is a lone NOTACHAR, so offset 0 means "not in any set".
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {"
print " NOTACHAR,"
for s in sets:
    s = sorted(s)
    for x in s:
        # Trailing comma: Python 2 print statement, suppresses the newline.
        print ' 0x%04x,' % x,
    print ' NOTACHAR,'
print '};'
print

# ------

print "/* When #included in pcretest, we don't need this large table. */"
print
print "#ifndef PCRE_INCLUDED"
print
print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
# Fail the C build if pcre_internal.h disagrees with the chosen block size.
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
print "#endif"
print "#endif /* SUPPORT_UCP */"
print
print "#endif /* PCRE_INCLUDED */"
482
# NOTE(review): everything below is a single inactive triple-quoted string:
# experimental three-stage table code, kept for reference only.  The bare
# string expression is never executed and produces no output.
"""

# Three-stage tables:

# Find the optimum block size for 3-stage table
min_size = sys.maxint
for stage3_block in [2 ** i for i in range(2,6)]:
  stage_i, stage3 = compress_table(table, stage3_block)
  for stage2_block in [2 ** i for i in range(5,10)]:
    size = len(records) * 4
    stage1, stage2 = compress_table(stage_i, stage2_block)
    size += get_tables_size(stage1, stage2, stage3)
    # print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
    if size < min_size:
      min_size = size
      min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
      min_stage2_block, min_stage3_block = stage2_block, stage3_block

print "/* Total size: %d bytes" % min_size */
print_records(records)
print_table(min_stage1, 'ucd_stage1')
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
print_table(min_stage3, 'ucd_stage3', min_stage3_block)

"""

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.5