1 |
#! /usr/bin/perl -w
|
2 |
|
3 |
# This is a Perl script to create the table of character properties. For
|
4 |
# information on the format, see ucpinternal.h. The Unicode files are expected
|
5 |
# to be in Unicode.tables/{Scripts,UnicodeData}.txt. The ../ucp.h file is also
|
6 |
# required. The table is written to the standard output.
|
7 |
|
8 |
# The script is rather slow because it just searches linearly through the
|
9 |
# Scripts data in order to find the script for each character or character
|
10 |
# range. It could be made faster by sorting that data, or something, but hey,
|
11 |
# it is only ever run once in a blue moon.
|
12 |
|
13 |
# Subroutine: Given a character number, return the script number. The
|
14 |
# Scripts.txt file has been read into an array, keeping just the codepoints
|
15 |
# and the script name. The lines are in one of two formats:
|
16 |
#
|
17 |
# xxxx name
|
18 |
# xxxx..yyyy name
|
19 |
#
|
20 |
# where xxxx and yyyy are code points.
|
21 |
|
22 |
sub script{
|
23 |
my($n) = $_[0];
|
24 |
foreach $line (@scriptlist)
|
25 |
{
|
26 |
my($a,$z,$s);
|
27 |
|
28 |
if ($line =~ /\.\./)
|
29 |
{
|
30 |
($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
|
31 |
}
|
32 |
else
|
33 |
{
|
34 |
($a,$s) = $line =~ /^(\S+)\s+(.*)/;
|
35 |
$z = $a;
|
36 |
}
|
37 |
|
38 |
die "Problem on this scripts data line:\n$line"
|
39 |
if (!defined $a || !defined $z);
|
40 |
|
41 |
if ($n >= hex($a) && $n <= hex($z))
|
42 |
{
|
43 |
my($x) = $scriptnum{$s};
|
44 |
return $x if defined $x;
|
45 |
die "Can't find script number for $s\n";
|
46 |
}
|
47 |
}
|
48 |
|
49 |
# All code points not explicitly listed are "Common"
|
50 |
|
51 |
return $scriptnum{"Common"};
|
52 |
}
|
53 |
|
54 |
|
55 |
# Subroutine: given a category name, return its number
|
56 |
|
57 |
sub category {
|
58 |
my($x) = $gencat{$_[0]};
|
59 |
return $x if defined $x;
|
60 |
die "Can't find number for general category $_[0]\n";
|
61 |
}
|
62 |
|
63 |
|
64 |
# Subroutine: output an entry for a range, unless it isn't really a range,
|
65 |
# in which case just output a single entry.
|
66 |
|
67 |
sub outrange{
|
68 |
my($cp,$ncp,$gc) = @_;
|
69 |
my($flag) = ($cp != $ncp)? 0x00800000 : 0;
|
70 |
printf " { 0x%08x, 0x%08x },\n",
|
71 |
$cp | $flag | (script($cp) << 24),
|
72 |
(category($gc) << 26) | $ncp - $cp;
|
73 |
}
|
74 |
|
75 |
|
76 |
# Entry point: An argument giving the Unicode version is required.
|
77 |
|
78 |
die "Require a single argument, the Unicode version"
|
79 |
if ($#ARGV != 0);
|
80 |
$Uversion = shift @ARGV;
|
81 |
|
82 |
|
83 |
# Read in the Scripts.txt file, retaining only the code points
|
84 |
# and script names.
|
85 |
|
86 |
open(IN, "Unicode.tables/Scripts.txt") ||
|
87 |
die "Can't open Unicode.tables/Scripts.txt: $!\n";
|
88 |
|
89 |
while (<IN>)
|
90 |
{
|
91 |
next if !/^[0-9A-Z]/;
|
92 |
my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
|
93 |
push @scriptlist, "$range $name";
|
94 |
}
|
95 |
close(IN);
|
96 |
|
97 |
|
98 |
# Now read the ucp.h file to get the values for the general categories
|
99 |
# and for the scripts.
|
100 |
|
101 |
open(IN, "../ucp.h") || die "Can't open ../ucp.h: $!\n";
|
102 |
|
103 |
while (<IN>) { last if /^enum/; }
|
104 |
while (<IN>) { last if /^enum/; }
|
105 |
|
106 |
|
107 |
# The second enum are the general categories.
|
108 |
|
109 |
$count = 0;
|
110 |
while (<IN>)
|
111 |
{
|
112 |
last if $_ !~ /^\s+ucp_(..)/;
|
113 |
$gencat{$1} = $count++;
|
114 |
}
|
115 |
|
116 |
while (<IN>) { last if /^enum/; }
|
117 |
|
118 |
# The third enum are script names.
|
119 |
|
120 |
$count = 0;
|
121 |
while (<IN>)
|
122 |
{
|
123 |
last if $_ !~ /^\s+ucp_(\w+)/;
|
124 |
$scriptnum{$1} = $count++;
|
125 |
}
|
126 |
|
127 |
close(IN);
|
128 |
|
129 |
# Write out the initial boilerplace.
|
130 |
|
131 |
print "/* This source module is automatically generated from the Unicode\n" .
|
132 |
"property table. See ucpinternal.h for a description of the layout.\n" .
|
133 |
"This version was made from the Unicode $Uversion tables. */\n\n" .
|
134 |
"static const cnode ucp_table[] = {\n";
|
135 |
|
136 |
# Now read the input file and generate the output.
|
137 |
|
138 |
open(IN, "Unicode.tables/UnicodeData.txt") ||
|
139 |
die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
|
140 |
|
141 |
while (<IN>)
|
142 |
{
|
143 |
@fields = split /;/;
|
144 |
|
145 |
$cp = hex($fields[0]);
|
146 |
$gc = $fields[2];
|
147 |
$uc = $fields[12];
|
148 |
$lc = $fields[13];
|
149 |
|
150 |
# If this line has no "other case" data, it might be the start or end of
|
151 |
# a range, either one that is explicit in the data, or one that we can
|
152 |
# create by scanning forwards.
|
153 |
|
154 |
if ($uc eq "" && $lc eq "")
|
155 |
{
|
156 |
if ($fields[1] =~ /First>$/)
|
157 |
{
|
158 |
$_ = <IN>;
|
159 |
@fields = split /;/;
|
160 |
die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
|
161 |
die "First and last have different categories\n",
|
162 |
if $gc ne $fields[2];
|
163 |
outrange($cp, hex($fields[0]), $gc);
|
164 |
}
|
165 |
|
166 |
else
|
167 |
{
|
168 |
my($ncp) = $cp + 1;
|
169 |
while (<IN>)
|
170 |
{
|
171 |
@fields = split /;/;
|
172 |
last if (hex($fields[0]) != $ncp ||
|
173 |
$fields[2] ne $gc ||
|
174 |
$fields[12] ne "" ||
|
175 |
$fields[13] ne "");
|
176 |
|
177 |
$ncp++;
|
178 |
}
|
179 |
|
180 |
$ncp--;
|
181 |
outrange($cp, $ncp, $gc);
|
182 |
redo if defined $_; # Reprocess terminating line
|
183 |
}
|
184 |
}
|
185 |
|
186 |
# If there is an "other case" character, we output a single-char line
|
187 |
|
188 |
else
|
189 |
{
|
190 |
my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
|
191 |
printf " { 0x%08x, 0x%08x },\n",
|
192 |
$cp | (script($cp) << 24), (category($gc) << 26) | $co;
|
193 |
}
|
194 |
}
|
195 |
|
196 |
close(IN);
|
197 |
|
198 |
# Final boilerplate
|
199 |
|
200 |
print "};\n";
|
201 |
|
202 |
# End
|