/[pcre]/code/trunk/maint/MultiStage2.py
ViewVC logotype

Diff of /code/trunk/maint/MultiStage2.py

Parent Directory | Revision Log | View Patch

revision 350 by ph10, Wed Jul 2 19:18:41 2008 UTC revision 351 by ph10, Fri Jul 4 18:27:16 2008 UTC
# Line 3  Line 3 
3  # Multistage table builder  # Multistage table builder
4  # (c) Peter Kankowski, 2008  # (c) Peter Kankowski, 2008
5    
6    ##############################################################################
7  # This script was submitted to the PCRE project by Peter Kankowski as part of  # This script was submitted to the PCRE project by Peter Kankowski as part of
8  # the upgrading of Unicode property support. The new code speeds up property  # the upgrading of Unicode property support. The new code speeds up property
9  # matching many times. The script is for the use of PCRE maintainers, to  # matching many times. The script is for the use of PCRE maintainers, to
10  # generate the pcre_ucd.c file that contains a digested form of the Unicode  # generate the pcre_ucd.c file that contains a digested form of the Unicode
11  # data tables.  # data tables.
12    #
13  # The script should be run in the maint subdirectory, using the command  # The script should be run in the maint subdirectory, using the command
14  #  #
15  # ./MultiStage2.py >../pcre_ucd.c  # ./MultiStage2.py >../pcre_ucd.c
16  #  #
17  # It requires three Unicode data tables, DerivedGeneralCategory.txt,  # It requires three Unicode data tables, DerivedGeneralCategory.txt,
18  # Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory.  # Scripts.txt, and UnicodeData.txt, to be in the Unicode.tables subdirectory.
19    #
20  # Added with minor modifications:  # Minor modifications made to this script:
21  #  Added #! line at start  #  Added #! line at start
22  #  Removed tabs  #  Removed tabs
23  #  Made it work with Python 2.4 by rewriting two statements that needed 2.5  #  Made it work with Python 2.4 by rewriting two statements that needed 2.5
24  #  Consequent code tidy  #  Consequent code tidy
25  #  Adjusted file names to Unicode.tables directory  #  Adjusted data file names to take from the Unicode.tables directory
26    #  Adjusted global table names by prefixing _pcre_.
27    #  Commented out stuff relating to the casefolding table, which isn't used.
28    #
29    # The tables generated by this script are used by macros defined in
30    # pcre_internal.h. They look up Unicode character properties using short
31    # sequences of code that contain no branches, which makes for greater speed.
32    #
33    # Conceptually, there is a table of records (of type ucd_record), containing a
34    # script number, character type, and offset to the character's other case for
35    # every character. However, a real table covering all Unicode characters would
36    # be far too big. It can be efficiently compressed by observing that many
37    # characters have the same record, and many blocks of characters (taking 128
38    # characters in a block) have the same set of records as other blocks. This
39    # leads to a 2-stage lookup process.
40    #
41    # This script constructs three tables. The _pcre_ucd_records table contains
42    # one instance of every unique record that is required. The _pcre_ucd_stage1
43    # table is indexed by a character's block number, and yields what is in effect
44    # a "virtual" block number. The _pcre_ucd_stage2 table is a table of "virtual"
45    # blocks; each block is indexed by the offset of a character within its own
46    # block, and the result is the offset of the required record.
47  #  #
48  #  Philip Hazel, 02 July 2008  # Example: lowercase "a" (U+0061) is in block 0
49    #          lookup 0 in stage1 table yields 0
50    #          lookup 97 in the first table in stage2 yields 12
51    #          record 12 is { 33, 5, -32 } (Latin, lowercase, upper is U+0041)
52    #
53    # All lowercase Latin characters resolve to the same record.
54    #
55    # Example: hiragana letter A (U+3042) is in block 96 (0x60)
56    #          lookup 96 in stage1 table yields 83
57    #          lookup 66 in the 83rd table in stage2 yields 348
58    #          record 348 is { 26, 7, 0 } (Hiragana, other letter, no other case)
59    #
60    # In these examples, no other blocks resolve to the same "virtual" block, as it
61    # happens, but plenty of other blocks do share "virtual" blocks.
62    #
63    # There is a fourth table, maintained by hand, which translates from the
64    # individual character types such as ucp_Cc to the general types like ucp_C.
65    #
66    #  Philip Hazel, 03 July 2008
67    ##############################################################################
68    
69    
70  import re  import re
# Line 37  NOTACHAR = 0xffffffff Line 78  NOTACHAR = 0xffffffff
78  def make_get_names(enum):  def make_get_names(enum):
79          return lambda chardata: enum.index(chardata[1])          return lambda chardata: enum.index(chardata[1])
80    
81  def get_case_folding_value(chardata):  #def get_case_folding_value(chardata):
82          if chardata[1] != 'C' and chardata[1] != 'S':  #        if chardata[1] != 'C' and chardata[1] != 'S':
83                  return 0  #                return 0
84          return int(chardata[2], 16) - int(chardata[0], 16)  #        return int(chardata[2], 16) - int(chardata[0], 16)
85    
86  def get_other_case(chardata):  def get_other_case(chardata):
87          if chardata[12] != '':          if chardata[12] != '':
# Line 62  def read_table(file_name, get_value, def Line 103  def read_table(file_name, get_value, def
103    
104                  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])                  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
105                  char = int(m.group(1), 16)                  char = int(m.group(1), 16)
 # PH            last = char if m.group(3) is None else int(m.group(3), 16)  
106                  if m.group(3) is None:                  if m.group(3) is None:
107                          last = char                          last = char
108                  else:                  else:
# Line 127  def print_table(table, table_name, block Line 167  def print_table(table, table_name, block
167                  for i in range(0, len(table), ELEMS_PER_LINE):                  for i in range(0, len(table), ELEMS_PER_LINE):
168                          print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))                          print fmt % (table[i:i+ELEMS_PER_LINE] + (i * mult,))
169          else:          else:
 # PH            fmt = "%3d," * (ELEMS_PER_LINE if block_size > ELEMS_PER_LINE else block_size) + "\n"  
170                  if block_size > ELEMS_PER_LINE:                  if block_size > ELEMS_PER_LINE:
171                          fmt = "%3d," * ELEMS_PER_LINE + "\n"                          el = ELEMS_PER_LINE
                         fmt = fmt * (block_size / ELEMS_PER_LINE)  
172                  else:                  else:
173                          fmt = "%3d," * block_size + "\n"                          el = block_size
174  # PH            if block_size > ELEMS_PER_LINE:                  fmt = "%3d," * el + "\n"
175  # PH                    fmt = fmt * (block_size / ELEMS_PER_LINE)                  if block_size > ELEMS_PER_LINE:
176                            fmt = fmt * (block_size / ELEMS_PER_LINE)
177                  for i in range(0, len(table), block_size):                  for i in range(0, len(table), block_size):
178                          print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])                          print ("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])
179          print "};\n"          print "};\n"
# Line 151  def combine_tables(*tables): Line 190  def combine_tables(*tables):
190          return index, records          return index, records
191    
192  def print_records(records):  def print_records(records):
193          print 'const ucd_record ucd_records[] = { /* %d bytes */' % (len(records) * 4)          print 'const ucd_record _pcre_ucd_records[] = { /* %d bytes */' % (len(records) * 4)
194          records = zip(records.keys(), records.values())          records = zip(records.keys(), records.values())
195          records.sort(None, lambda x: x[1])          records.sort(None, lambda x: x[1])
196          for i, record in enumerate(records):          for i, record in enumerate(records):
# Line 165  script_names = ['Arabic', 'Armenian', 'B Line 204  script_names = ['Arabic', 'Armenian', 'B
204   'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \   'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \
205   'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \   'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \
206   'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \   'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \
207   'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician']  # New for Unicode 5.0
208     'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \
209    # New for Unicode 5.1
210     'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai']
211    
212  category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',  category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
213    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
# Line 197  print "#endif" Line 239  print "#endif"
239  print "#include \"pcre_internal.h\""  print "#include \"pcre_internal.h\""
240  print  print
241  print "/* Unicode character database. */"  print "/* Unicode character database. */"
242  print "/* This file was autogenerated by MultiStage2.py script. */"  print "/* This file was autogenerated by the MultiStage2.py script. */"
243  print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)  print "/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)
244  print_records(records)  print_records(records)
245  print_table(min_stage1, 'ucd_stage1')  print_table(min_stage1, '_pcre_ucd_stage1')
246  print_table(min_stage2, 'ucd_stage2', min_block_size)  print_table(min_stage2, '_pcre_ucd_stage2', min_block_size)
247  print "#if UCD_BLOCK_SIZE != %d" % min_block_size  print "#if UCD_BLOCK_SIZE != %d" % min_block_size
248  print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"  print "#error Please correct UCD_BLOCK_SIZE in pcre_internal.h"
249  print "#endif"  print "#endif"

Legend:
Removed from v.350  
changed lines
  Added in v.351

  ViewVC Help
Powered by ViewVC 1.1.5