/[pcre]/code/trunk/maint/MultiStage2.py
ViewVC logotype

Diff of /code/trunk/maint/MultiStage2.py

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1043 by ph10, Sat Aug 25 11:36:15 2012 UTC revision 1044 by ph10, Thu Sep 20 16:23:57 2012 UTC
# Line 15  Line 15 
15  # ./MultiStage2.py >../pcre_ucd.c  # ./MultiStage2.py >../pcre_ucd.c
16  #  #
17  # It requires four Unicode data tables, DerivedGeneralCategory.txt,  # It requires four Unicode data tables, DerivedGeneralCategory.txt,
18  # GraphemeBreakProperty.txt, Scripts.txt, and UnicodeData.txt, to be in the  # GraphemeBreakProperty.txt, Scripts.txt, and CaseFolding.txt, to be in the
19  # Unicode.tables subdirectory. The first of these is found in the "extracted"  # Unicode.tables subdirectory. The first of these is found in the "extracted"
20  # subdirectory of the Unicode database (UCD) on the Unicode web site; the  # subdirectory of the Unicode database (UCD) on the Unicode web site; the
21  # second is in the "auxiliary" subdirectory; the other two are directly in the  # second is in the "auxiliary" subdirectory; the other two are directly in the
# Line 33  Line 33 
33  #  Corrected size calculation  #  Corrected size calculation
34  #  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.  #  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
35  #  #
36  # The tables generated by this script are used by macros defined in  # Major modifications made to this script:
37    #  Added code to add a grapheme break property field to records.
38    #
39    #  Added code to search for sets of more than two characters that must match
40    #  each other caselessly. A new table is output containing these sets, and
41    #  offsets into the table are added to the main output records. This new
42    #  code scans CaseFolding.txt instead of UnicodeData.txt.
43    #
44    # The main tables generated by this script are used by macros defined in
45  # pcre_internal.h. They look up Unicode character properties using short  # pcre_internal.h. They look up Unicode character properties using short
46  # sequences of code that contains no branches, which makes for greater speed.  # sequences of code that contains no branches, which makes for greater speed.
47  #  #
48  # Conceptually, there is a table of records (of type ucd_record), containing a  # Conceptually, there is a table of records (of type ucd_record), containing a
49  # script number, character type, and offset to the character's other case for  # script number, character type, grapheme break type, offset to caseless
50  # every character. However, a real table covering all Unicode characters would  # matching set, and offset to the character's other case for every character.
51  # be far too big. It can be efficiently compressed by observing that many  # However, a real table covering all Unicode characters would be far too big.
52  # characters have the same record, and many blocks of characters (taking 128  # It can be efficiently compressed by observing that many characters have the
53  # characters in a block) have the same set of records as other blocks. This  # same record, and many blocks of characters (taking 128 characters in a block)
54  # leads to a 2-stage lookup process.  # have the same set of records as other blocks. This leads to a 2-stage lookup
55  #  # process.
56  # This script constructs three tables. The _pcre_ucd_records table contains  #
57  # one instance of every unique record that is required. The _pcre_ucd_stage1  # This script constructs four tables. The ucd_caseless_sets table contains
58  # table is indexed by a character's block number, and yields what is in effect  # lists of characters that all match each other caselessly. Each list is
59  # a "virtual" block number. The _pcre_ucd_stage2 table is a table of "virtual"  # in order, and is terminated by 0xffffffff, which is of course larger than any
60  # blocks; each block is indexed by the offset of a character within its own  # valid character. The first list is empty; this is used for characters that
61  # block, and the result is the offset of the required record.  # are not part of any list.
62    #
63    # The ucd_records table contains one instance of every unique record that is
64    # required. The ucd_stage1 table is indexed by a character's block number, and
65    # yields what is in effect a "virtual" block number. The ucd_stage2 table is a
66    # table of "virtual" blocks; each block is indexed by the offset of a character
67    # within its own block, and the result is the offset of the required record.
68  #  #
69  # Example: lowercase "a" (U+0061) is in block 0  # Example: lowercase "a" (U+0061) is in block 0
70  #          lookup 0 in stage1 table yields 0  #          lookup 0 in stage1 table yields 0
71  #          lookup 97 in the first table in stage2 yields 14  #          lookup 97 in the first table in stage2 yields 16
72  #          record 14 is { 33, 5, 11, -32 }  #          record 17 is { 33, 5, 11, 0, -32 }
73  #            33 = ucp_Latin   => Latin script  #            33 = ucp_Latin   => Latin script
74  #             5 = ucp_Ll      => Lower case letter  #             5 = ucp_Ll      => Lower case letter
75  #            11 = ucp_gbOther => Grapheme break property "Other"  #            11 = ucp_gbOther => Grapheme break property "Other"
76    #             0               => not part of a caseless set
77  #           -32               => Other case is U+0041  #           -32               => Other case is U+0041
78  #  #
79  # All lowercase latin characters resolve to the same record.  # Almost all lowercase latin characters resolve to the same record. One or two
80    # are different because they are part of a multi-character caseless set (for
81    # example, k, K and the Kelvin symbol are such a set).
82  #  #
83  # Example: hiragana letter A (U+3042) is in block 96 (0x60)  # Example: hiragana letter A (U+3042) is in block 96 (0x60)
84  #          lookup 96 in stage1 table yields 88  #          lookup 96 in stage1 table yields 88
85  #          lookup 66 in the 88th table in stage2 yields 428  #          lookup 66 in the 88th table in stage2 yields 467
86  #          record 428 is { 26, 7, 11, 0 }  #          record 470 is { 26, 7, 11, 0, 0 }
87  #            26 = ucp_Hiragana => Hiragana script  #            26 = ucp_Hiragana => Hiragana script
88  #             7 = ucp_Lo       => Other letter  #             7 = ucp_Lo       => Other letter
89  #            11 = ucp_gbOther  => Grapheme break property "Other"  #            11 = ucp_gbOther  => Grapheme break property "Other"
90    #             0                => not part of a caseless set
91  #             0                => No other case  #             0                => No other case
92  #  #
93  # In these examples, no other blocks resolve to the same "virtual" block, as it  # In these examples, no other blocks resolve to the same "virtual" block, as it
# Line 80  Line 98 
98  #  #
99  #  Philip Hazel, 03 July 2008  #  Philip Hazel, 03 July 2008
100  #  #
101  # 01-March-2010:  Updated list of scripts for Unicode 5.2.0  # 01-March-2010:     Updated list of scripts for Unicode 5.2.0
102  # 30-April-2011:  Updated list of scripts for Unicode 6.0.0  # 30-April-2011:     Updated list of scripts for Unicode 6.0.0
103  #     July-2012:  Updated list of scripts for Unicode 6.1.0  #     July-2012:     Updated list of scripts for Unicode 6.1.0
104  # 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new  # 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new
105  #                   field in the record to hold the value. Luckily, the  #                      field in the record to hold the value. Luckily, the
106  #                   structure had a hole in it, so the resulting table is  #                      structure had a hole in it, so the resulting table is
107  #                   no bigger than before.  #                      not much bigger than before.
108    # 18-September-2012: Added code for multiple caseless sets. This uses the
109    #                      final hole in the structure.
110  ##############################################################################  ##############################################################################
111    
112    
# Line 101  NOTACHAR = 0xffffffff Line 121  NOTACHAR = 0xffffffff
121  def make_get_names(enum):  def make_get_names(enum):
122          return lambda chardata: enum.index(chardata[1])          return lambda chardata: enum.index(chardata[1])
123    
124    # Parse a line of CaseFolding.txt
125  def get_other_case(chardata):  def get_other_case(chardata):
126          if chardata[12] != '':          if chardata[1] == 'C' or chardata[1] == 'S':
127                  return int(chardata[12], 16) - int(chardata[0], 16)            return int(chardata[2], 16) - int(chardata[0], 16)
         if chardata[13] != '':  
                 return int(chardata[13], 16) - int(chardata[0], 16)  
128          return 0          return 0
129    
130    
131  # Read the whole table in memory  # Read the whole table in memory
132  def read_table(file_name, get_value, default_value):  def read_table(file_name, get_value, default_value):
133          file = open(file_name, 'r')          file = open(file_name, 'r')
# Line 125  def read_table(file_name, get_value, def Line 145  def read_table(file_name, get_value, def
145                  else:                  else:
146                          last = int(m.group(3), 16)                          last = int(m.group(3), 16)
147                  for i in range(char, last + 1):                  for i in range(char, last + 1):
148                          table[i] = value                          # It is important not to overwrite a previously set
149                            # value because in the CaseFolding file there are lines
150                            # to be ignored (returning the default value of 0)
151                            # which often come after a line which has already set
152                            # data.
153                            if table[i] == default_value:
154                              table[i] = value
155          file.close()          file.close()
156          return table          return table
157    
# Line 286  test_record_size() Line 312  test_record_size()
312  script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))  script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Common'))
313  category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))  category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
314  break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))  break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
315  other_case = read_table('Unicode.tables/UnicodeData.txt', get_other_case, 0)  other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
316    
317    
318    # This block of code was added by PH in September 2012. I am not a Python
319    # programmer, so the style is probably dreadful, but it does the job. It scans
320    # the other_case table to find sets of more than two characters that must all
321    # match each other caselessly. Later in this script a table of these sets is
322    # written out. However, we have to do this work here in order to compute the
323    # offsets in the table that are inserted into the main table.
324    
325    # The CaseFolding.txt file lists pairs, but the common logic for reading data
326    # sets only one value, so first we go through the table and set "return"
327    # offsets for those that are not already set.
328    
329    for c in range(0x10ffff):
330      if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
331        other_case[c + other_case[c]] = -other_case[c]
332    
333    # Now scan again and create equivalence sets.
334    
335    sets = []
336    
337    for c in range(0x10ffff):
338      o = c + other_case[c]
339    
340      # Trigger when this character's other case does not point back here. We
341      # now have three characters that are case-equivalent.
342    
343      if other_case[o] != -other_case[c]:
344        t = o + other_case[o]
345    
346        # Scan the existing sets to see if any of the three characters are already
347        # part of a set. If so, unite the existing set with the new set.
348    
349        appended = 0
350        for s in sets:
351          found = 0
352          for x in s:
353            if x == c or x == o or x == t:
354              found = 1
355    
356          # Add new characters to an existing set
357    
358          if found:
359            found = 0
360            for y in [c, o, t]:
361              for x in s:
362                if x == y:
363                  found = 1
364              if not found:
365                s.append(y)
366            appended = 1
367    
368        # If we have not added to an existing set, create a new one.
369    
370        if not appended:
371          sets.append([c, o, t])
372    
373    # End of loop looking for caseless sets.
374    
375    # Now scan the sets and set appropriate offsets for the characters.
376    
377    caseless_offsets = [0] * MAX_UNICODE
378    
379    offset = 1;
380    for s in sets:
381      for x in s:
382        caseless_offsets[x] = offset
383      offset += len(s) + 1
384    
385    # End of block of code for creating offsets for caseless matching sets.
386    
387    
388    # Combine the tables
389    
390    table, records = combine_tables(script, category, break_props,
391      caseless_offsets, other_case)
392    
 table, records = combine_tables(script, category, break_props, other_case)  
393  record_size, record_struct = get_record_size_struct(records.keys())  record_size, record_struct = get_record_size_struct(records.keys())
394    
395  # Find the optimum block size for the two-stage table  # Find the optimum block size for the two-stage table
# Line 323  print "a totally empty module because so Line 424  print "a totally empty module because so
424  print "Instead, just supply small dummy tables. */"  print "Instead, just supply small dummy tables. */"
425  print  print
426  print "#ifndef SUPPORT_UCP"  print "#ifndef SUPPORT_UCP"
427  print "const ucd_record PRIV(ucd_records)[] = {{0,0,0 }};"  print "const ucd_record PRIV(ucd_records)[] = {{0,0,0,0 }};"
428  print "const pcre_uint8 PRIV(ucd_stage1)[] = {0};"  print "const pcre_uint8 PRIV(ucd_stage1)[] = {0};"
429  print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"  print "const pcre_uint16 PRIV(ucd_stage2)[] = {0};"
430    print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {0};"
431  print "#else"  print "#else"
432  print  print
433  print record_struct  print record_struct
434    
435    # --- Added by PH: output the table of caseless character sets ---
436    
437    print "const pcre_uint32 PRIV(ucd_caseless_sets)[] = {"
438    print "  0xffffffff,"
439    for s in sets:
440      s = sorted(s)
441      for x in s:
442        print '  0x%04x,' % x,
443      print '  0xffffffff,'
444    print '};'
445    print
446    
447    # ------
448    
449  print_records(records, record_size)  print_records(records, record_size)
450  print_table(min_stage1, 'PRIV(ucd_stage1)')  print_table(min_stage1, 'PRIV(ucd_stage1)')
451  print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)  print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)

Legend:
Removed from v.1043  
changed lines
  Added in v.1044

  ViewVC Help
Powered by ViewVC 1.1.5