/[pcre]/code/trunk/maint/MultiStage2.py
ViewVC logotype

Contents of /code/trunk/maint/MultiStage2.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1050 - (show annotations)
Sun Sep 30 18:20:10 2012 UTC (7 years, 2 months ago) by chpe
File MIME type: text/x-python
File size: 21276 byte(s)
unicode: Update to Unicode 6.2
1 #! /usr/bin/python
2
3 # Multistage table builder
4 # (c) Peter Kankowski, 2008
5
6 ##############################################################################
7 # This script was submitted to the PCRE project by Peter Kankowski as part of
8 # the upgrading of Unicode property support. The new code speeds up property
9 # matching many times. The script is for the use of PCRE maintainers, to
10 # generate the pcre_ucd.c file that contains a digested form of the Unicode
11 # data tables.
12 #
13 # The script should be run in the maint subdirectory, using the command
14 #
15 # ./MultiStage2.py >../pcre_ucd.c
16 #
17 # It requires four Unicode data tables, DerivedGeneralCategory.txt,
18 # GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
19 # Unicode.tables subdirectory. The first of these is found in the "extracted"
20 # subdirectory of the Unicode database (UCD) on the Unicode web site; the
21 # second is in the "auxiliary" subdirectory; the other two are directly in the
22 # UCD directory.
23 #
24 # Minor modifications made to this script:
25 # Added #! line at start
26 # Removed tabs
27 # Made it work with Python 2.4 by rewriting two statements that needed 2.5
28 # Consequent code tidy
29 # Adjusted data file names to take from the Unicode.tables directory
30 # Adjusted global table names by prefixing _pcre_.
31 # Commented out stuff relating to the casefolding table, which isn't used;
32 # removed completely in 2012.
33 # Corrected size calculation
34 # Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
35 #
36 # Major modifications made to this script:
37 # Added code to add a grapheme break property field to records.
38 #
39 # Added code to search for sets of more than two characters that must match
40 # each other caselessly. A new table is output containing these sets, and
41 # offsets into the table are added to the main output records. This new
42 # code scans CaseFolding.txt instead of UnicodeData.txt.
43 #
44 # The main tables generated by this script are used by macros defined in
45 # pcre_internal.h. They look up Unicode character properties using short
46 # sequences of code that contains no branches, which makes for greater speed.
47 #
48 # Conceptually, there is a table of records (of type ucd_record), containing a
49 # script number, character type, grapheme break type, offset to caseless
50 # matching set, and offset to the character's other case for every character.
51 # However, a real table covering all Unicode characters would be far too big.
52 # It can be efficiently compressed by observing that many characters have the
53 # same record, and many blocks of characters (taking 128 characters in a block)
54 # have the same set of records as other blocks. This leads to a 2-stage lookup
55 # process.
56 #
57 # This script constructs four tables. The ucd_caseless_sets table contains
58 # lists of characters that all match each other caselessly. Each list is
59 # in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
60 # any valid character. The first list is empty; this is used for characters
61 # that are not part of any list.
62 #
63 # The ucd_records table contains one instance of every unique record that is
64 # required. The ucd_stage1 table is indexed by a character's block number, and
65 # yields what is in effect a "virtual" block number. The ucd_stage2 table is a
66 # table of "virtual" blocks; each block is indexed by the offset of a character
67 # within its own block, and the result is the offset of the required record.
68 #
69 # Example: lowercase "a" (U+0061) is in block 0
70 # lookup 0 in stage1 table yields 0
71 # lookup 97 in the first table in stage2 yields 16
72 # record 17 is { 33, 5, 11, 0, -32 }
73 # 33 = ucp_Latin => Latin script
74 # 5 = ucp_Ll => Lower case letter
75 # 11 = ucp_gbOther => Grapheme break property "Other"
76 # 0 => not part of a caseless set
77 # -32 => Other case is U+0041
78 #
79 # Almost all lowercase latin characters resolve to the same record. One or two
80 # are different because they are part of a multi-character caseless set (for
81 # example, k, K and the Kelvin symbol are such a set).
82 #
83 # Example: hiragana letter A (U+3042) is in block 96 (0x60)
84 # lookup 96 in stage1 table yields 88
85 # lookup 66 in the 88th table in stage2 yields 467
86 # record 470 is { 26, 7, 11, 0, 0 }
87 # 26 = ucp_Hiragana => Hiragana script
88 # 7 = ucp_Lo => Other letter
89 # 11 = ucp_gbOther => Grapheme break property "Other"
90 # 0 => not part of a caseless set
91 # 0 => No other case
92 #
93 # In these examples, no other blocks resolve to the same "virtual" block, as it
94 # happens, but plenty of other blocks do share "virtual" blocks.
95 #
96 # There is a fourth table, maintained by hand, which translates from the
97 # individual character types such as ucp_Cc to the general types like ucp_C.
98 #
99 # Philip Hazel, 03 July 2008
100 #
101 # 01-March-2010: Updated list of scripts for Unicode 5.2.0
102 # 30-April-2011: Updated list of scripts for Unicode 6.0.0
103 # July-2012: Updated list of scripts for Unicode 6.1.0
104 # 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new
105 # field in the record to hold the value. Luckily, the
106 # structure had a hole in it, so the resulting table is
107 # not much bigger than before.
108 # 18-September-2012: Added code for multiple caseless sets. This uses the
109 # final hole in the structure.
110 # 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
111 ##############################################################################
112
113
114 import re
115 import string
116 import sys
117
MAX_UNICODE = 0x110000    # number of Unicode code points: U+0000..U+10FFFF
NOTACHAR = 0xffffffff     # caseless-set terminator; larger than any code point
120
# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
def make_get_names(enum):
  """Build a parser callback for property files.

  The returned function takes one semicolon-split data line and maps the
  property name in field 1 to its index within *enum* (the enum order must
  match the corresponding ucp_xx values in pcre_internal.h/ucp.h).
  """
  def get_name(chardata):
    return enum.index(chardata[1])
  return get_name
124
# Parse a line of CaseFolding.txt
def get_other_case(chardata):
  """Return the signed offset from a character to its case-folded partner.

  *chardata* is a semicolon-split CaseFolding.txt line: code point, status,
  mapping. Only the C (common) and S (simple) foldings are wanted; full and
  Turkic foldings yield 0 (no simple other case).
  """
  status = chardata[1]
  if status in ('C', 'S'):
    return int(chardata[2], 16) - int(chardata[0], 16)
  return 0
130
131
# Read the whole table in memory
def read_table(file_name, get_value, default_value):
  """Parse a Unicode data file into a per-code-point list.

  file_name     -- path of the semicolon-separated Unicode data file
  get_value     -- callback turning one split line into a table value
  default_value -- value used for code points the file does not mention

  Returns a list of MAX_UNICODE entries indexed by code point.
  """
  table = [default_value] * MAX_UNICODE
  f = open(file_name, 'r')
  try:
    for line in f:
      # Strip trailing comments, then split into stripped fields. A list
      # comprehension with str.strip (rather than map + the deprecated
      # string.strip function) behaves identically on Python 2 and also
      # works on Python 3, where map returns an iterator without len().
      line = re.sub(r'#.*', '', line)
      chardata = [field.strip() for field in line.split(';')]
      if len(chardata) <= 1:
        continue                    # blank or comment-only line
      value = get_value(chardata)
      # Field 0 is either a single code point or a FIRST..LAST range.
      m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
      char = int(m.group(1), 16)
      if m.group(3) is None:
        last = char
      else:
        last = int(m.group(3), 16)
      for i in range(char, last + 1):
        # It is important not to overwrite a previously set
        # value because in the CaseFolding file there are lines
        # to be ignored (returning the default value of 0)
        # which often come after a line which has already set
        # data.
        if table[i] == default_value:
          table[i] = value
  finally:
    f.close()          # close even if a malformed line raises
  return table
158
# Get the smallest possible C language type for the values
def get_type_size(table):
  """Return (C type name, size in bytes) of the narrowest C type that can
  hold every value in *table*.

  Unsigned types are tried first, then signed ones, mirroring the order
  of the two parallel lists below. Raises OverflowError if no 32-bit
  type is wide enough.
  """
  type_size = [("pcre_uint8", 1), ("pcre_uint16", 2), ("pcre_uint32", 4),
    ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
  limits = [(0, 255), (0, 65535), (0, 4294967295),
    (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
  minval = min(table)
  maxval = max(table)
  for num, (minlimit, maxlimit) in enumerate(limits):
    if minlimit <= minval and maxval <= maxlimit:
      return type_size[num]
  # The parenthesized raise form is valid on both Python 2 and 3; the
  # original "raise OverflowError, msg" statement is Python-2-only syntax.
  raise OverflowError("Too large to fit into C types")
172
def get_tables_size(*tables):
  """Return the total size in bytes of all *tables*, assuming each is
  stored with the narrowest C type that get_type_size() allows for it."""
  total = 0
  for t in tables:
    _, elem_size = get_type_size(t)   # avoid shadowing the builtin 'type'
    total += elem_size * len(t)
  return total
179
# Compress the table into the two stages
def compress_table(table, block_size):
  """Compress a per-code-point table into a two-stage lookup.

  The table is split into blocks of *block_size* entries and identical
  blocks are shared. Returns (stage1, stage2) such that
  table[c] == stage2[stage1[c // block_size] * block_size + c % block_size].
  """
  blocks = {} # Dictionary for finding identical blocks
  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
  stage2 = [] # Stage 2 table contains the blocks with property values
  table = tuple(table)
  for i in range(0, len(table), block_size):
    block = table[i:i+block_size]
    start = blocks.get(block)
    if start is None:
      # Allocate a new block. len(stage2) is always an exact multiple of
      # block_size here; floor division (//) gives the same result as /
      # on Python 2 but stays an int on Python 3.
      start = len(stage2) // block_size
      stage2 += block
      blocks[block] = start
    stage1.append(start)

  return stage1, stage2
197
# Print a table as C source. Emits "const <type> <name>[] = { ... };" with
# 16 values per line. Without a block size the table is assumed to span the
# whole Unicode range and each line is annotated with its starting code
# point; with a block size the entries are grouped and labelled per block.
# NOTE: Python-2-only code (print statements; integer semantics of /).
def print_table(table, table_name, block_size = None):
  # Pick the narrowest C type that holds every value.
  type, size = get_type_size(table)
  ELEMS_PER_LINE = 16

  s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
  if block_size:
    s += ", block = %d" % block_size
  print s + " */"
  table = tuple(table)
  if block_size is None:
    # Stage-1 style output: annotate each line with the code point it
    # corresponds to. mult scales a table index back to a code point
    # (presumably len(table) divides MAX_UNICODE exactly -- true for the
    # stage 1 table).
    fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
    mult = MAX_UNICODE / len(table)
    for i in range(0, len(table), ELEMS_PER_LINE):
      print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
  else:
    # Stage-2 style output: one "/* block N */" header per block, with the
    # block's entries wrapped at ELEMS_PER_LINE values per line.
    if block_size > ELEMS_PER_LINE:
      el = ELEMS_PER_LINE
    else:
      el = block_size
    fmt = "%3d," * el + "\n"
    if block_size > ELEMS_PER_LINE:
      fmt = fmt * (block_size / ELEMS_PER_LINE)
    for i in range(0, len(table), block_size):
      print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
  print "};\n"
224
# Extract the unique combinations of properties into records
def combine_tables(*tables):
  """Zip several per-code-point tables into unique records.

  Returns (index, records): *records* maps each distinct property tuple to
  a record number (assigned in order of first appearance), and *index*
  gives the record number for every code point.
  """
  records = {}
  index = []
  for combo in zip(*tables):
    # setdefault assigns the next record number on first sight and
    # returns the existing one otherwise.
    index.append(records.setdefault(combo, len(records)))
  return index, records
235
def get_record_size_struct(records):
  """Compute the C-aligned size of one ucd_record plus a commented-out C
  structure definition describing its fields.

  records -- sequence of property tuples (one element per field)

  Returns (size_in_bytes, structure_text). Each field gets the narrowest
  C type that holds its whole column, padded to that type's alignment;
  the final size is rounded up to the alignment of the first field so
  records pack correctly in an array.
  """
  size = 0
  structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
    'types in this structure definition from pcre_internal.h (the actual\n' + \
    'field names will be different):\n\ntypedef struct {\n'
  for i in range(len(records[0])):
    # A list comprehension rather than map(): get_type_size scans the
    # slice twice (min and max), which would exhaust a Python 3 map
    # iterator; on Python 2 this is identical.
    record_slice = [record[i] for record in records]
    slice_type, slice_size = get_type_size(record_slice)
    # add padding: round up to the nearest power of slice_size
    # (slice_size is always a power of two, so -slice_size is the mask)
    size = (size + slice_size - 1) & -slice_size
    size += slice_size
    structure += '%s property_%d;\n' % (slice_type, i)

  # round up to the first item of the next structure in array
  record_slice = [record[0] for record in records]
  slice_type, slice_size = get_type_size(record_slice)
  size = (size + slice_size - 1) & -slice_size

  structure += '} ucd_record;\n*/\n\n'
  return size, structure
256
def test_record_size():
  """Self-test: check get_record_size_struct() against hand-computed
  sizes covering 1-, 2-, 4- and 8-byte field widths and padding."""
  tests = [
    ([(3,), (6,), (6,), (1,)], 1),
    ([(300,), (600,), (600,), (100,)], 2),
    ([(25, 3), (6, 6), (34, 6), (68, 1)], 2),
    ([(300, 3), (6, 6), (340, 6), (690, 1)], 4),
    ([(3, 300), (6, 6), (6, 340), (1, 690)], 4),
    ([(300, 300), (6, 6), (6, 340), (1, 690)], 4),
    ([(3, 100000), (6, 6), (6, 123456), (1, 690)], 8),
    ([(100000, 300), (6, 6), (123456, 6), (1, 690)], 8),
  ]
  for records, expected_size in tests:
    size, struct = get_record_size_struct(records)
    assert size == expected_size
    #print struct
272
# Emit the ucd_records table as C source. *records* maps property tuples to
# record numbers; the rows are printed in record-number order so that array
# positions match the indices stored in the stage 2 table.
# NOTE: Python-2-only code (print statements; list.sort(cmp, key) call,
# and zip() returning a list).
def print_records(records, record_size):
  print 'const ucd_record PRIV(ucd_records)[] = { ' + \
  '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
  records = zip(records.keys(), records.values())
  # sort(None, key): cmp=None, key=record number (Python 2 signature)
  records.sort(None, lambda x: x[1])
  for i, record in enumerate(records):
    print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
  print '};\n'
281
# Property name tables. The value stored for each code point is the index
# into these lists, so the order here must match the corresponding ucp_xx
# enum values in ucp.h -- do not reorder, only append.

script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
# New for Unicode 5.0
 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
# New for Unicode 5.1
 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \
# New for Unicode 5.2
 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \
 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \
 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \
 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \
# New for Unicode 6.0.0
 'Batak', 'Brahmi', 'Mandaic', \
# New for Unicode 6.1.0
 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri'
 ]

# General category names (two-letter Unicode general categories).
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]

# Grapheme break property names (from GraphemeBreakProperty.txt).
break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
  'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other' ]

# Run the self-test before doing any real work.
test_record_size()

# Read the four Unicode data files into per-code-point tables.
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
317
318
# This block of code was added by PH in September 2012. I am not a Python
# programmer, so the style is probably dreadful, but it does the job. It scans
# the other_case table to find sets of more than two characters that must all
# match each other caselessly. Later in this script a table of these sets is
# written out. However, we have to do this work here in order to compute the
# offsets in the table that are inserted into the main table.

# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.

# Scan the full range (MAX_UNICODE, i.e. up to and including U+10FFFF;
# range(0x10ffff) stopped one code point short -- harmless only because
# U+10FFFF has no case folding).
for c in range(MAX_UNICODE):
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
    other_case[c + other_case[c]] = -other_case[c]

# Now scan again and create equivalence sets.

sets = []

for c in range(MAX_UNICODE):
  o = c + other_case[c]

  # Trigger when this character's other case does not point back here. We
  # now have three characters that are case-equivalent.

  if other_case[o] != -other_case[c]:
    t = o + other_case[o]

    # Scan the existing sets to see if any of the three characters are
    # already part of a set. If so, unite the existing set with the new
    # one by adding whichever of c, o, t it lacks.

    appended = 0
    for s in sets:
      if c in s or o in s or t in s:
        for y in [c, o, t]:
          if y not in s:
            s.append(y)
        appended = 1

    # If we have not added to an existing set, create a new one.

    if not appended:
      sets.append([c, o, t])

# End of loop looking for caseless sets.

# Now scan the sets and set appropriate offsets for the characters.
# Offset 0 is reserved for "not part of any set"; each set occupies
# len(s) + 1 slots in the output table (the +1 is its NOTACHAR terminator).

caseless_offsets = [0] * MAX_UNICODE

offset = 1
for s in sets:
  for x in s:
    caseless_offsets[x] = offset
  offset += len(s) + 1

# End of block of code for creating offsets for caseless matching sets.
387
388
# Combine the tables

# Build one property tuple per code point and de-duplicate into records.
table, records = combine_tables(script, category, break_props,
  caseless_offsets, other_case)

record_size, record_struct = get_record_size_struct(records.keys())

# Find the optimum block size for the two-stage table by trying every
# power-of-two block size from 32 to 512 and keeping the smallest total.
min_size = sys.maxint        # Python-2-only sentinel (py3 uses sys.maxsize)
for block_size in [2 ** i for i in range(5,10)]:
  size = len(records) * record_size
  stage1, stage2 = compress_table(table, block_size)
  size += get_tables_size(stage1, stage2)
  #print "/* block size %5d => %5d bytes */" % (block_size, size)
  if size < min_size:
    min_size = size
    min_stage1, min_stage2 = stage1, stage2
    min_block_size = block_size
407
# Emit the generated C source (pcre_ucd.c) on stdout, starting with the
# fixed header and the dummy tables used when UCP support is not compiled in.
# NOTE: Python-2-only code (print statements).
print "/* This module is generated by the maint/MultiStage2.py script."
print "Do not modify it by hand. Instead modify the script and run it"
print "to regenerate this code."
print
print "As well as being part of the PCRE library, this module is #included"
print "by the pcretest program, which redefines the PRIV macro to change"
print "table names from _pcre_xxx to xxxx, thereby avoiding name clashes"
print "with the library. At present, just one of these tables is actually"
print "needed. */"
print
print "#ifndef PCRE_INCLUDED"
print
print "#ifdef HAVE_CONFIG_H"
print "#include \"config.h\""
print "#endif"
print
print "#include \"pcre_internal.h\""
print
print "#endif /* PCRE_INCLUDED */"
print
print "/* Unicode character database. */"
print "/* This file was autogenerated by the MultiStage2.py script. */"
print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
print
print "/* The tables herein are needed only when UCP support is built"
print "into PCRE. This module should not be referenced otherwise, so"
print "it should not matter whether it is compiled or not. However"
print "a comment was received about space saving - maybe the guy linked"
print "all the modules rather than using a library - so we include a"
print "condition to cut out the tables when not needed. But don't leave"
print "a totally empty module because some compilers barf at that."
print "Instead, just supply small dummy tables. */"
print
print "#ifndef SUPPORT_UCP"
print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0 }};"
print "const pcre_uint8 PRIV(ucd_stage1)[] = {0};"
print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"
print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
print "#else"
print
print record_struct

# --- Added by PH: output the table of caseless character sets ---
# The first entry (offset 0) is a lone NOTACHAR: the empty set used for
# characters that belong to no caseless set. Each real set is sorted and
# terminated by NOTACHAR, matching the offsets computed earlier.

print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {"
print " NOTACHAR,"
for s in sets:
  s = sorted(s)
  for x in s:
    print ' 0x%04x,' % x,
  print ' NOTACHAR,'
print '};'
print

# ------

print "/* When #included in pcretest, we don't need this large table. */"
print
print "#ifndef PCRE_INCLUDED"
print
# The main record table and the two lookup stages, using the block size
# found to give the smallest total.
print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
print "#if UCD_BLOCK_SIZE != %d" % min_block_size
print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
print "#endif"
print "#endif /* SUPPORT_UCP */"
print
print "#endif /* PCRE_INCLUDED */"
477
478 """
479
480 # Three-stage tables:
481
482 # Find the optimum block size for 3-stage table
483 min_size = sys.maxint
484 for stage3_block in [2 ** i for i in range(2,6)]:
485 stage_i, stage3 = compress_table(table, stage3_block)
486 for stage2_block in [2 ** i for i in range(5,10)]:
487 size = len(records) * 4
488 stage1, stage2 = compress_table(stage_i, stage2_block)
489 size += get_tables_size(stage1, stage2, stage3)
490 # print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
491 if size < min_size:
492 min_size = size
493 min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
494 min_stage2_block, min_stage3_block = stage2_block, stage3_block
495
496 print "/* Total size: %d bytes" % min_size */
497 print_records(records)
498 print_table(min_stage1, 'ucd_stage1')
499 print_table(min_stage2, 'ucd_stage2', min_stage2_block)
500 print_table(min_stage3, 'ucd_stage3', min_stage3_block)
501
502 """

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.5