/[pcre]/code/trunk/maint/MultiStage2.py
ViewVC logotype

Contents of /code/trunk/maint/MultiStage2.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 350 - (show annotations)
Wed Jul 2 19:18:41 2008 UTC (12 years, 7 months ago) by ph10
File MIME type: text/x-python
File size: 9745 byte(s)
Error occurred while calculating annotation data.
More of the UCP speedup update.
1 #! /usr/bin/python
2
3 # Multistage table builder
4 # (c) Peter Kankowski, 2008
5
6 # This script was submitted to the PCRE project by Peter Kankowski as part of
7 # the upgrading of Unicode property support. The new code speeds up property
8 # matching many times. The script is for the use of PCRE maintainers, to
9 # generate the pcre_ucd.c file that contains a digested form of the Unicode
10 # data tables.
11
12 # The script should be run in the maint subdirectory, using the command
13 #
14 # ./MultiStage2.py >../pcre_ucd.c
15 #
16 # It requires three Unicode data tables, DerivedGeneralCategory.txt,
17 # Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory.
18
19 # Added with minor modifications:
20 # Added #! line at start
21 # Removed tabs
22 # Made it work with Python 2.4 by rewriting two statements that needed 2.5
23 # Consequent code tidy
24 # Adjusted file names to Unicode.tables directory
25 #
26 # Philip Hazel, 02 July 2008
27
28
29 import re
30 import string
31 import sys
32
33 MAX_UNICODE = 0x110000
34 NOTACHAR = 0xffffffff
35
36 # Parse a line of CaseFolding.txt, Scripts.txt, and DerivedGeneralCategory.txt file
37 def make_get_names(enum):
38 return lambda chardata: enum.index(chardata[1])
39
40 def get_case_folding_value(chardata):
41 if chardata[1] != 'C' and chardata[1] != 'S':
42 return 0
43 return int(chardata[2], 16) - int(chardata[0], 16)
44
45 def get_other_case(chardata):
46 if chardata[12] != '':
47 return int(chardata[12], 16) - int(chardata[0], 16)
48 if chardata[13] != '':
49 return int(chardata[13], 16) - int(chardata[0], 16)
50 return 0
51
52 # Read the whole table in memory
53 def read_table(file_name, get_value, default_value):
54 file = open(file_name, 'r')
55 table = [default_value] * MAX_UNICODE
56 for line in file:
57 line = re.sub(r'#.*', '', line)
58 chardata = map(string.strip, line.split(';'))
59 if len(chardata) <= 1:
60 continue
61 value = get_value(chardata)
62
63 m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
64 char = int(m.group(1), 16)
65 # PH last = char if m.group(3) is None else int(m.group(3), 16)
66 if m.group(3) is None:
67 last = char
68 else:
69 last = int(m.group(3), 16)
70 for i in range(char, last + 1):
71 table[i] = value
72 file.close()
73 return table
74
75 # Get the smallest possible C language type for the values
76 def get_type_size(table):
77 type_size = [("uschar", 1), ("pcre_uint16", 2), ("pcre_uint32", 4),
78 ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
79 limits = [(0, 255), (0, 65535), (0, 4294967295),
80 (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
81 minval = min(table)
82 maxval = max(table)
83 for num, (minlimit, maxlimit) in enumerate(limits):
84 if minlimit <= minval and maxval <= maxlimit:
85 return type_size[num]
86 else:
87 raise OverflowError, "Too large to fit into C types"
88
89 def get_tables_size(*tables):
90 total_size = 0
91 for table in tables:
92 type, size = get_type_size(table)
93 total_size += size * len(table)
94 return total_size
95
96 # Compress the table into the two stages
97 def compress_table(table, block_size):
98 blocks = {} # Dictionary for finding identical blocks
99 stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
100 stage2 = [] # Stage 2 table contains the blocks with property values
101 table = tuple(table)
102 for i in range(0, len(table), block_size):
103 block = table[i:i+block_size]
104 start = blocks.get(block)
105 if start is None:
106 # Allocate a new block
107 start = len(stage2) / block_size
108 stage2 += block
109 blocks[block] = start
110 stage1.append(start)
111
112 return stage1, stage2
113
114 # Print a table
115 def print_table(table, table_name, block_size = None):
116 type, size = get_type_size(table)
117 ELEMS_PER_LINE = 16
118
119 s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
120 if block_size:
121 s += ", block = %d" % block_size
122 print s + " */"
123 table = tuple(table)
124 if block_size is None:
125 fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
126 mult = MAX_UNICODE / len(table)
127 for i in range(0, len(table), ELEMS_PER_LINE):
128 print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
129 else:
130 # PH fmt = "%3d," * (ELEMS_PER_LINE if block_size > ELEMS_PER_LINE else block_size) + "\n"
131 if block_size > ELEMS_PER_LINE:
132 fmt = "%3d," * ELEMS_PER_LINE + "\n"
133 fmt = fmt * (block_size / ELEMS_PER_LINE)
134 else:
135 fmt = "%3d," * block_size + "\n"
136 # PH if block_size > ELEMS_PER_LINE:
137 # PH fmt = fmt * (block_size / ELEMS_PER_LINE)
138 for i in range(0, len(table), block_size):
139 print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
140 print "};\n"
141
142 # Extract the unique combinations of properties into records
143 def combine_tables(*tables):
144 records = {}
145 index = []
146 for t in zip(*tables):
147 i = records.get(t)
148 if i is None:
149 i = records[t] = len(records)
150 index.append(i)
151 return index, records
152
153 def print_records(records):
154 print 'const ucd_record ucd_records[] = { /* %d bytes */' % (len(records) * 4)
155 records = zip(records.keys(), records.values())
156 records.sort(None, lambda x: x[1])
157 for i, record in enumerate(records):
158 print (' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))
159 print '};\n'
160
161 script_names = ['Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \
162 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \
163 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \
164 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \
165 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
166 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
167 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
168 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician']
169
170 category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
171 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
172 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
173
174
175 script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
176 category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
177 other_case = read_table('Unicode.tables/UnicodeData.txt', get_other_case, 0)
178 # case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0)
179
180 table, records = combine_tables(script, category, other_case)
181
182 # Find the optimum block size for the two-stage table
183 min_size = sys.maxint
184 for block_size in [2 ** i for i in range(5,10)]:
185 size = len(records) * 4
186 stage1, stage2 = compress_table(table, block_size)
187 size += get_tables_size(stage1, stage2)
188 #print "/* block size %5d => %5d bytes */" % (block_size, size)
189 if size < min_size:
190 min_size = size
191 min_stage1, min_stage2 = stage1, stage2
192 min_block_size = block_size
193
194 print "#ifdef HAVE_CONFIG_H"
195 print "#include \"config.h\""
196 print "#endif"
197 print "#include \"pcre_internal.h\""
198 print
199 print "/* Unicode character database. */"
200 print "/* This file was autogenerated by MultiStage2.py script. */"
201 print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
202 print_records(records)
203 print_table(min_stage1, 'ucd_stage1')
204 print_table(min_stage2, 'ucd_stage2', min_block_size)
205 print "#if UCD_BLOCK_SIZE != %d" % min_block_size
206 print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
207 print "#endif"
208
209 """
210
211 # Three-stage tables:
212
213 # Find the optimum block size for 3-stage table
214 min_size = sys.maxint
215 for stage3_block in [2 ** i for i in range(2,6)]:
216 stage_i, stage3 = compress_table(table, stage3_block)
217 for stage2_block in [2 ** i for i in range(5,10)]:
218 size = len(records) * 4
219 stage1, stage2 = compress_table(stage_i, stage2_block)
220 size += get_tables_size(stage1, stage2, stage3)
221 # print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
222 if size < min_size:
223 min_size = size
224 min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
225 min_stage2_block, min_stage3_block = stage2_block, stage3_block
226
227 print "/* Total size: %d bytes" % min_size */
228 print_records(records)
229 print_table(min_stage1, 'ucd_stage1')
230 print_table(min_stage2, 'ucd_stage2', min_stage2_block)
231 print_table(min_stage3, 'ucd_stage3', min_stage3_block)
232
233 """

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.5