1 |
#! /usr/bin/perl -w
|
2 |
|
3 |
# This is a Perl script to create the table of character properties. For
|
4 |
# information on the format, see ucpinternal.h. The Unicode files are expected
|
5 |
# to be in Unicode.tables/{Scripts,UnicodeData}.txt. The ../ucp.h file is also
|
6 |
# required. The table is written to the standard output.
|
7 |
|
8 |
# The script is rather slow because it just searches linearly through the
|
9 |
# Scripts data in order to find the script for each character or character
|
10 |
# range. It could be made faster by sorting that data, or something, but hey,
|
11 |
# it is only ever run once in a blue moon. (It's even slower after I mended the
|
12 |
# "forgot to check for script number before amalgamation" bug, but even so,
|
13 |
# the effort of improving it isn't worth it.)
|
14 |
|
15 |
# Subroutine: Given a character number, return the script number. The
|
16 |
# Scripts.txt file has been read into an array, keeping just the codepoints
|
17 |
# and the script name. The lines are in one of two formats:
|
18 |
#
|
19 |
# xxxx name
|
20 |
# xxxx..yyyy name
|
21 |
#
|
22 |
# where xxxx and yyyy are code points.
|
23 |
|
24 |
sub script{
|
25 |
my($n) = $_[0];
|
26 |
foreach $line (@scriptlist)
|
27 |
{
|
28 |
my($a,$z,$s);
|
29 |
|
30 |
if ($line =~ /\.\./)
|
31 |
{
|
32 |
($a,$z,$s) = $line =~ /^([^\.]+)\.\.(\S+)\s+(.*)/;
|
33 |
}
|
34 |
else
|
35 |
{
|
36 |
($a,$s) = $line =~ /^(\S+)\s+(.*)/;
|
37 |
$z = $a;
|
38 |
}
|
39 |
|
40 |
die "Problem on this scripts data line:\n$line"
|
41 |
if (!defined $a || !defined $z);
|
42 |
|
43 |
if ($n >= hex($a) && $n <= hex($z))
|
44 |
{
|
45 |
my($x) = $scriptnum{$s};
|
46 |
return $x if defined $x;
|
47 |
die "Can't find script number for $s\n";
|
48 |
}
|
49 |
}
|
50 |
|
51 |
# All code points not explicitly listed are "Common"
|
52 |
|
53 |
return $scriptnum{"Common"};
|
54 |
}
|
55 |
|
56 |
|
57 |
# Subroutine: given a category name, return its number
|
58 |
|
59 |
sub category {
|
60 |
my($x) = $gencat{$_[0]};
|
61 |
return $x if defined $x;
|
62 |
die "Can't find number for general category $_[0]\n";
|
63 |
}
|
64 |
|
65 |
|
66 |
# Subroutine: output an entry for a range, unless it isn't really a range,
|
67 |
# in which case just output a single entry.
|
68 |
|
69 |
sub outrange{
|
70 |
my($cp,$ncp,$gc) = @_;
|
71 |
my($flag) = ($cp != $ncp)? 0x00800000 : 0;
|
72 |
printf " { 0x%08x, 0x%08x },\n",
|
73 |
$cp | $flag | (script($cp) << 24),
|
74 |
(category($gc) << 26) | $ncp - $cp;
|
75 |
}
|
76 |
|
77 |
|
78 |
# Entry point: An argument giving the Unicode version is required.
|
79 |
|
80 |
die "Require a single argument, the Unicode version"
|
81 |
if ($#ARGV != 0);
|
82 |
$Uversion = shift @ARGV;
|
83 |
|
84 |
|
85 |
# Read in the Scripts.txt file, retaining only the code points
|
86 |
# and script names.
|
87 |
|
88 |
open(IN, "Unicode.tables/Scripts.txt") ||
|
89 |
die "Can't open Unicode.tables/Scripts.txt: $!\n";
|
90 |
|
91 |
while (<IN>)
|
92 |
{
|
93 |
next if !/^[0-9A-Z]/;
|
94 |
my($range,$name) = $_ =~ /^(\S+)\s*;\s*(\S+)/;
|
95 |
push @scriptlist, "$range $name";
|
96 |
}
|
97 |
close(IN);
|
98 |
|
99 |
|
100 |
# Now read the ucp.h file to get the values for the general categories
|
101 |
# and for the scripts.
|
102 |
|
103 |
open(IN, "../ucp.h") || die "Can't open ../ucp.h: $!\n";
|
104 |
|
105 |
while (<IN>) { last if /^enum/; }
|
106 |
while (<IN>) { last if /^enum/; }
|
107 |
|
108 |
|
109 |
# The second enum are the general categories.
|
110 |
|
111 |
$count = 0;
|
112 |
while (<IN>)
|
113 |
{
|
114 |
last if $_ !~ /^\s+ucp_(..)/;
|
115 |
$gencat{$1} = $count++;
|
116 |
}
|
117 |
|
118 |
while (<IN>) { last if /^enum/; }
|
119 |
|
120 |
# The third enum are script names.
|
121 |
|
122 |
$count = 0;
|
123 |
while (<IN>)
|
124 |
{
|
125 |
last if $_ !~ /^\s+ucp_(\w+)/;
|
126 |
$scriptnum{$1} = $count++;
|
127 |
}
|
128 |
|
129 |
close(IN);
|
130 |
|
131 |
# Write out the initial boilerplace.
|
132 |
|
133 |
print "/* This source module is automatically generated from the Unicode\n" .
|
134 |
"property table. See ucpinternal.h for a description of the layout.\n" .
|
135 |
"This version was made from the Unicode $Uversion tables. */\n\n" .
|
136 |
"static const cnode ucp_table[] = {\n";
|
137 |
|
138 |
# Now read the input file and generate the output.
|
139 |
|
140 |
open(IN, "Unicode.tables/UnicodeData.txt") ||
|
141 |
die "Can't open Unicode.tables/UnicodeData.txt: $!\n";
|
142 |
|
143 |
while (<IN>)
|
144 |
{
|
145 |
@fields = split /;/;
|
146 |
|
147 |
$cp = hex($fields[0]);
|
148 |
$gc = $fields[2];
|
149 |
$uc = $fields[12];
|
150 |
$lc = $fields[13];
|
151 |
|
152 |
# If this line has no "other case" data, it might be the start or end of
|
153 |
# a range, either one that is explicit in the data, or one that we can
|
154 |
# create by scanning forwards.
|
155 |
|
156 |
if ($uc eq "" && $lc eq "")
|
157 |
{
|
158 |
if ($fields[1] =~ /First>$/)
|
159 |
{
|
160 |
$_ = <IN>;
|
161 |
@fields = split /;/;
|
162 |
die "First not followed by Last\n", if $fields[1] !~ /Last>$/;
|
163 |
die "First and last have different categories\n",
|
164 |
if $gc ne $fields[2];
|
165 |
outrange($cp, hex($fields[0]), $gc);
|
166 |
}
|
167 |
|
168 |
else
|
169 |
{
|
170 |
my($startscript) = script($cp);
|
171 |
my($ncp) = $cp + 1;
|
172 |
while (<IN>)
|
173 |
{
|
174 |
@fields = split /;/;
|
175 |
last if (hex($fields[0]) != $ncp ||
|
176 |
$fields[2] ne $gc ||
|
177 |
$fields[12] ne "" ||
|
178 |
$fields[13] ne "" ||
|
179 |
script($ncp) != $startscript);
|
180 |
|
181 |
$ncp++;
|
182 |
}
|
183 |
|
184 |
$ncp--;
|
185 |
outrange($cp, $ncp, $gc);
|
186 |
redo if defined $_; # Reprocess terminating line
|
187 |
}
|
188 |
}
|
189 |
|
190 |
# If there is an "other case" character, we output a single-char line
|
191 |
|
192 |
else
|
193 |
{
|
194 |
my($co) = (hex(($uc eq "")? $lc : $uc) - $cp) & 0xffff;
|
195 |
printf " { 0x%08x, 0x%08x },\n",
|
196 |
$cp | (script($cp) << 24), (category($gc) << 26) | $co;
|
197 |
}
|
198 |
}
|
199 |
|
200 |
close(IN);
|
201 |
|
202 |
# Final boilerplate
|
203 |
|
204 |
print "};\n";
|
205 |
|
206 |
# End
|