--- code/trunk/maint/MultiStage2.py 2008/07/04 18:27:16 351
+++ code/trunk/maint/MultiStage2.py 2008/07/07 15:12:56 352
@@ -25,6 +25,7 @@
 # Adjusted data file names to take from the Unicode.tables directory
 # Adjusted global table names by prefixing _pcre_.
 # Commented out stuff relating to the casefolding table, which isn't used.
+# Corrected size calculation
 #
 # The tables generated by this script are used by macros defined in
 # pcre_internal.h. They look up Unicode character properties using short
@@ -189,8 +190,45 @@
                 index.append(i)
         return index, records
 
-def print_records(records):
-        print 'const ucd_record _pcre_ucd_records[] = { /* %d bytes */' % (len(records) * 4)
+def get_record_size_struct(records):
+        size = 0
+        structure = '/* When recompiling tables with a new Unicode version,\n' + \
+        'please check types in the structure definition from pcre_internal.h:\ntypedef struct {\n'
+        for i in range(len(records[0])):
+                record_slice = map(lambda record: record[i], records)
+                slice_type, slice_size = get_type_size(record_slice)
+                # add padding: round up to the nearest power of slice_size
+                size = (size + slice_size - 1) & -slice_size
+                size += slice_size
+                structure += '%s property_%d;\n' % (slice_type, i)
+
+        # round up to the first item of the next structure in array
+        record_slice = map(lambda record: record[0], records)
+        slice_type, slice_size = get_type_size(record_slice)
+        size = (size + slice_size - 1) & -slice_size
+
+        structure += '} ucd_record; */\n\n'
+        return size, structure
+
+def test_record_size():
+        tests = [ \
+          ( [(3,), (6,), (6,), (1,)], 1 ), \
+          ( [(300,), (600,), (600,), (100,)], 2 ), \
+          ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
+          ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
+          ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+          ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
+          ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
+          ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
+        ]
+        for test in tests:
+                size, struct = get_record_size_struct(test[0])
+                assert(size == test[1])
+                #print struct
+
+def print_records(records, record_size):
+        print 'const ucd_record _pcre_ucd_records[] = { ' + \
+              '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)
         records = zip(records.keys(), records.values())
         records.sort(None, lambda x: x[1])
         for i, record in enumerate(records):
@@ -213,6 +251,7 @@
   'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
   'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]
 
+test_record_size()
 
 script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
 category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
@@ -220,11 +259,12 @@
 # case_fold = read_table('CaseFolding.txt', get_case_folding_value, 0)
 
 table, records = combine_tables(script, category, other_case)
+record_size, record_struct = get_record_size_struct(records.keys())
 
 # Find the optimum block size for the two-stage table
 min_size = sys.maxint
 for block_size in [2 ** i for i in range(5,10)]:
-        size = len(records) * 4
+        size = len(records) * record_size
         stage1, stage2 = compress_table(table, block_size)
         size += get_tables_size(stage1, stage2)
         #print "/* block size %5d => %5d bytes */" % (block_size, size)
@@ -241,7 +281,8 @@
 print "/* Unicode character database. */"
 print "/* This file was autogenerated by the MultiStage2.py script. */"
 print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
-print_records(records)
+print record_struct
+print_records(records, record_size)
 print_table(min_stage1, '_pcre_ucd_stage1')
 print_table(min_stage2, '_pcre_ucd_stage2', min_block_size)
 print "#if UCD_BLOCK_SIZE != %d" % min_block_size