1 |
/*************************************************
|
2 |
* libucp - Unicode Property Table handler *
|
3 |
*************************************************/
|
4 |
|
5 |
/* Internal header file defining the layout of compact nodes in the tree. */
|
6 |
|
7 |
typedef struct cnode {
|
8 |
unsigned short int f0;
|
9 |
unsigned short int f1;
|
10 |
unsigned short int f2;
|
11 |
} cnode;
|
12 |
|
13 |
/* Things for the f0 field */
|
14 |
|
15 |
#define f0_leftexists 0x8000 /* Left child exists */
|
16 |
#define f0_typemask 0x3f00 /* Type bits */
|
17 |
#define f0_typeshift 8 /* Type shift */
|
18 |
#define f0_chhmask 0x00ff /* Character high bits */
|
19 |
|
20 |
/* Things for the f2 field */
|
21 |
|
22 |
#define f2_rightmask 0xf000 /* Mask for right offset bits */
|
23 |
#define f2_rightshift 12 /* Shift for right offset */
|
24 |
#define f2_casemask 0x0fff /* Mask for case offset */
|
25 |
|
26 |
/* The tree consists of a vector of structures of type cnode, with the root
|
27 |
node as the first element. The three short ints (16-bits) are used as follows:
|
28 |
|
29 |
(f0) (1) The 0x8000 bit of f0 is set if a left child exists. The child's node
|
30 |
is the next node in the vector.
|
31 |
(2) The 0x4000 bits of f0 is spare.
|
32 |
(3) The 0x3f00 bits of f0 contain the character type; this is a number
|
33 |
defined by the enumeration in ucp.h (e.g. ucp_Lu).
|
34 |
(4) The bottom 8 bits of f0 contain the most significant byte of the
|
35 |
character's 24-bit codepoint.
|
36 |
|
37 |
(f1) (1) The f1 field contains the two least significant bytes of the
|
38 |
codepoint.
|
39 |
|
40 |
(f2) (1) The 0xf000 bits of f2 contain zero if there is no right child of this
|
41 |
node. Otherwise, they contain one plus the exponent of the power of
|
42 |
two of the offset to the right node (e.g. a value of 3 means 8). The
|
43 |
units of the offset are node items.
|
44 |
|
45 |
(2) The 0x0fff bits of f2 contain the signed offset from this character to
|
46 |
its alternate cased value. They are zero if there is no such
|
47 |
character.
|
48 |
|
49 |
|
50 |
-----------------------------------------------------------------------------
|
51 |
||.|.| type (6) | ms char (8) || ls char (16) ||....| case offset (12) ||
|
52 |
-----------------------------------------------------------------------------
|
53 |
| | |
|
54 |
| |-> spare |
|
55 |
| exponent of right
|
56 |
|-> left child exists child offset
|
57 |
|
58 |
|
59 |
The upper/lower casing information is set only for characters that come in
|
60 |
pairs. There are (at present) four non-one-to-one mappings in the Unicode data.
|
61 |
These are ignored. They are:
|
62 |
|
63 |
1FBE Greek Prosgegrammeni (lower, with upper -> capital iota)
|
64 |
2126 Ohm
|
65 |
212A Kelvin
|
66 |
212B Angstrom
|
67 |
|
68 |
Certainly for the last three, having an alternate case would seem to be a
|
69 |
mistake. I don't know any Greek, so cannot comment on the first one.
|
70 |
|
71 |
|
72 |
When searching the tree, proceed as follows:
|
73 |
|
74 |
(1) Start at the first node.
|
75 |
|
76 |
(2) Extract the character value from f1 and the bottom 8 bits of f0;
|
77 |
|
78 |
(3) Compare with the character being sought. If equal, we are done.
|
79 |
|
80 |
(4) If the test character is smaller, inspect the f0_leftexists flag. If it is
|
81 |
not set, the character is not in the tree. If it is set, move to the next
|
82 |
node, and go to (2).
|
83 |
|
84 |
(5) If the test character is bigger, extract the f2_rightmask bits from f2, and
|
85 |
shift them right by f2_rightshift. If the result is zero, the character is
|
86 |
not in the tree. Otherwise, calculate the number of nodes to skip by
|
87 |
shifting the value 1 left by this number minus one. Go to (2).
|
88 |
*/
|
89 |
|
90 |
|
91 |
/* End of internal.h */
|