/[pcre]/code/trunk/maint/MultiStage2.py
ViewVC logotype

Diff of /code/trunk/maint/MultiStage2.py

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 350 by ph10, Wed Jul 2 19:18:41 2008 UTC revision 352 by ph10, Mon Jul 7 15:12:56 2008 UTC
# Line 3  Line 3 
3  # Multistage table builder  # Multistage table builder
4  # (c) Peter Kankowski, 2008  # (c) Peter Kankowski, 2008
5    
6    ##############################################################################
7  # This script was submitted to the PCRE project by Peter Kankowski as part of  # This script was submitted to the PCRE project by Peter Kankowski as part of
8  # the upgrading of Unicode property support. The new code speeds up property  # the upgrading of Unicode property support. The new code speeds up property
9  # matching many times. The script is for the use of PCRE maintainers, to  # matching many times. The script is for the use of PCRE maintainers, to
10  # generate the pcre_ucd.c file that contains a digested form of the Unicode  # generate the pcre_ucd.c file that contains a digested form of the Unicode
11  # data tables.  # data tables.
12    #
13  # The script should be run in the maint subdirectory, using the command  # The script should be run in the maint subdirectory, using the command
14  #  #
15  # ./MultiStage2.py >../pcre_ucd.c  # ./MultiStage2.py >../pcre_ucd.c
16  #  #
17  # It requires three Unicode data tables, DerivedGeneralCategory.txt,  # It requires three Unicode data tables, DerivedGeneralCategory.txt,
18  # Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory.  # Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory.
19    #
20  # Added with minor modifications:  # Minor modifications made to this script:
21  #  Added #! line at start  #  Added #! line at start
22  #  Removed tabs  #  Removed tabs
23  #  Made it work with Python 2.4 by rewriting two statements that needed 2.5  #  Made it work with Python 2.4 by rewriting two statements that needed 2.5
24  #  Consequent code tidy  #  Consequent code tidy
25  #  Adjusted file names to Unicode.tables directory  #  Adjusted data file names to take from the Unicode.tables directory
26    #  Adjusted global table names by prefixing _pcre_.
27    #  Commented out stuff relating to the casefolding table, which isn't used.
28    #  Corrected size calculation
29    #
30    # The tables generated by this script are used by macros defined in
31    # pcre_internal.h. They look up Unicode character properties using short
32    # sequences of code that contains no branches, which makes for greater speed.
33    #
34    # Conceptually, there is a table of records (of type ucd_record), containing a
35    # script number, character type, and offset to the character's other case for
36    # every character. However, a real table covering all Unicode characters would
37    # be far too big. It can be efficiently compressed by observing that many
38    # characters have the same record, and many blocks of characters (taking 128
39    # characters in a block) have the same set of records as other blocks. This
40    # leads to a 2-stage lookup process.
41    #
42    # This script constructs three tables. The _pcre_ucd_records table contains
43    # one instance of every unique record that is required. The _pcre_ucd_stage1
44    # table is indexed by a character's block number, and yields what is in effect
45    # a "virtual" block number. The _pcre_ucd_stage2 table is a table of "virtual"
46    # blocks; each block is indexed by the offset of a character within its own
47    # block, and the result is the offset of the required record.
48    #
49    # Example: lowercase "a" (U+0061) is in block 0
50    #          lookup 0 in stage1 table yields 0
51    #          lookup 97 in the first table in stage2 yields 12
52    #          record 12 is { 33, 5, -32 } (Latin, lowercase, upper is U+0041)
53    #
54    # All lowercase latin characters resolve to the same record.
55    #
56    # Example: hiragana letter A (U+3042) is in block 96 (0x60)
57    #          lookup 96 in stage1 table yields 83
58    #          lookup 66 in the 83rd table in stage2 yields 348
59    #          record 348 is { 26, 7, 0 } (Hiragana, other letter, no other case)
60  #  #
61  #  Philip Hazel, 02 July 2008  # In these examples, no other blocks resolve to the same "virtual" block, as it
62    # happens, but plenty of other blocks do share "virtual" blocks.
63    #
64    # There is a fourth table, maintained by hand, which translates from the
65    # individual character types such as ucp_Cc to the general types like ucp_C.
66    #
67    #  Philip Hazel, 03 July 2008
68    ##############################################################################
69    
70    
71  import re  import re
# Line 37  NOTACHAR = 0xffffffff Line 79  NOTACHAR = 0xffffffff
79  def make_get_names(enum):  def make_get_names(enum):
80          return lambda chardata: enum.index(chardata[1])          return lambda chardata: enum.index(chardata[1])
81    
82  def get_case_folding_value(chardata):  #def get_case_folding_value(chardata):
83          if chardata[1] != 'C' and chardata[1] != 'S':  #        if chardata[1] != 'C' and chardata[1] != 'S':
84                  return 0  #                return 0
85          return int(chardata[2], 16) - int(chardata[0], 16)  #        return int(chardata[2], 16) - int(chardata[0], 16)
86    
87  def get_other_case(chardata):  def get_other_case(chardata):
88          if chardata[12] != '':          if chardata[12] != '':
# Line 62  def read_table(file_name, get_value, def Line 104  def read_table(file_name, get_value, def
104    
105                  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])                  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
106                  char = int(m.group(1), 16)                  char = int(m.group(1), 16)
 # PH            last = char if m.group(3) is None else int(m.group(3), 16)  
107                  if m.group(3) is None:                  if m.group(3) is None:
108                          last = char                          last = char
109                  else:                  else:
# Line 127  def print_table(table, table_name, block Line 168  def print_table(table, table_name, block
168                  for i in range(0, len(table), ELEMS_PER_LINE):                  for i in range(0, len(table), ELEMS_PER_LINE):
169                          print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))                          print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
170          else:          else:
 # PH            fmt = "%3d," * (ELEMS_PER_LINE if block_size > ELEMS_PER_LINE else block_size) + "\n"  
171                  if block_size > ELEMS_PER_LINE:                  if block_size > ELEMS_PER_LINE:
172                          fmt = "%3d," * ELEMS_PER_LINE + "\n"                          el = ELEMS_PER_LINE
                         fmt = fmt * (block_size / ELEMS_PER_LINE)  
173                  else:                  else:
174                          fmt = "%3d," * block_size + "\n"                          el = block_size
175  # PH            if block_size > ELEMS_PER_LINE:                  fmt = "%3d," * el + "\n"
176  # PH                    fmt = fmt * (block_size / ELEMS_PER_LINE)                  if block_size > ELEMS_PER_LINE:
177                            fmt = fmt * (block_size / ELEMS_PER_LINE)
178                  for i in range(0, len(table), block_size):                  for i in range(0, len(table), block_size):
179                          print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])                          print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
180          print "};\n"          print "};\n"
# Line 150  def combine_tables(*tables): Line 190  def combine_tables(*tables):
190                  index.append(i)                  index.append(i)
191          return index, records          return index, records
192    
193  def print_records(records):  def get_record_size_struct(records):
194          print 'const ucd_record ucd_records[] = { /* %d bytes */' % (len(records) * 4)          size = 0
195            structure = '/* When recompiling tables with a new Unicode version,\n' + \
196            'please check types in the structure definition from pcre_internal.h:\ntypedef struct {\n'
197            for i in range(len(records[0])):
198                    record_slice = map(lambda record: record[i], records)
199                    slice_type, slice_size = get_type_size(record_slice)
200                    # add padding: round up to the nearest power of slice_size
201                    size = (size + slice_size - 1) & -slice_size
202                    size += slice_size
203                    structure += '%s property_%d;\n' % (slice_type, i)
204    
205            # round up to the first item of the next structure in array
206            record_slice = map(lambda record: record[0], records)
207            slice_type, slice_size = get_type_size(record_slice)
208            size = (size + slice_size - 1) & -slice_size
209    
210            structure += '} ucd_record; */\n\n'
211            return size, structure
212    
213    def test_record_size():
214            tests = [ \
215              ( [(3,), (6,), (6,), (1,)], 1 ), \
216              ( [(300,), (600,), (600,), (100,)], 2 ), \
217              ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
218              ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
219              ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
220              ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
221              ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
222              ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
223            ]
224            for test in tests:
225                size, struct = get_record_size_struct(test[0])
226                assert(size == test[1])
227                #print struct
228    
229    def print_records(records, record_size):
230            print 'const ucd_record _pcre_ucd_records[] = { ' + \
231                  '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
232          records = zip(records.keys(), records.values())          records = zip(records.keys(), records.values())
233          records.sort(None, lambda x: x[1])          records.sort(None, lambda x: x[1])
234          for i, record in enumerate(records):          for i, record in enumerate(records):
# Line 165  script_names = ['Arabic', 'Armenian', 'B Line 242  script_names = ['Arabic', 'Armenian', 'B
242   'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \   'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
243   'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \   'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
244   'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \   'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
245   'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician']  # New for Unicode 5.0
246     'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
247    # New for Unicode 5.1
248     'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai']
249    
250  category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',  category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
251    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
252    'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]    'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
253    
254    test_record_size()
255    
256  script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))  script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
257  category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))  category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
# Line 178  other_case = read_table('Unicode.tables/ Line 259  other_case = read_table('Unicode.tables/
259  # case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0)  # case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0)
260    
261  table, records = combine_tables(script, category, other_case)  table, records = combine_tables(script, category, other_case)
262    record_size, record_struct = get_record_size_struct(records.keys())
263    
264  # Find the optimum block size for the two-stage table  # Find the optimum block size for the two-stage table
265  min_size = sys.maxint  min_size = sys.maxint
266  for block_size in [2 ** i for i in range(5,10)]:  for block_size in [2 ** i for i in range(5,10)]:
267          size = len(records) * 4          size = len(records) * record_size
268          stage1, stage2 = compress_table(table, block_size)          stage1, stage2 = compress_table(table, block_size)
269          size += get_tables_size(stage1, stage2)          size += get_tables_size(stage1, stage2)
270          #print "/* block size %5d  => %5d bytes */" % (block_size, size)          #print "/* block size %5d  => %5d bytes */" % (block_size, size)
# Line 197  print "#endif" Line 279  print "#endif"
279  print "#include \"pcre_internal.h\""  print "#include \"pcre_internal.h\""
280  print  print
281  print "/* Unicode character database. */"  print "/* Unicode character database. */"
282  print "/* This file was autogenerated by MultiStage2.py script. */"  print "/* This file was autogenerated by the MultiStage2.py script. */"
283  print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)  print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
284  print_records(records)  print record_struct
285  print_table(min_stage1, 'ucd_stage1')  print_records(records, record_size)
286  print_table(min_stage2, 'ucd_stage2', min_block_size)  print_table(min_stage1, '_pcre_ucd_stage1')
287    print_table(min_stage2, '_pcre_ucd_stage2', min_block_size)
288  print "#if UCD_BLOCK_SIZE != %d" % min_block_size  print "#if UCD_BLOCK_SIZE != %d" % min_block_size
289  print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"  print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
290  print "#endif"  print "#endif"

Legend:
Removed from v.350  
changed lines
  Added in v.352

  ViewVC Help
Powered by ViewVC 1.1.5