9 |
|
|
10 |
Written by: Philip Hazel <ph10@cam.ac.uk> |
Written by: Philip Hazel <ph10@cam.ac.uk> |
11 |
|
|
12 |
Copyright (c) 1997-2002 University of Cambridge |
Copyright (c) 1997-2003 University of Cambridge |
13 |
|
|
14 |
----------------------------------------------------------------------------- |
----------------------------------------------------------------------------- |
15 |
Permission is granted to anyone to use this software for any purpose on any |
Permission is granted to anyone to use this software for any purpose on any |
297 |
/* Character class where all the information is in a bit map: set the |
/* Character class where all the information is in a bit map: set the |
298 |
bits and either carry on or not, according to the repeat count. If it was |
bits and either carry on or not, according to the repeat count. If it was |
299 |
a negative class, and we are operating with UTF-8 characters, any byte |
a negative class, and we are operating with UTF-8 characters, any byte |
300 |
with the top-bit set is a potentially valid starter because it may start |
with a value >= 0xc4 is a potentially valid starter because it starts a |
301 |
a character with a value > 255. (This is sub-optimal in that the |
character with a value > 255. */ |
|
character may be in the range 128-255, and those characters might be |
|
|
unwanted, but that's as far as we go for the moment.) */ |
|
302 |
|
|
303 |
case OP_NCLASS: |
case OP_NCLASS: |
304 |
if (utf8) memset(start_bits+16, 0xff, 16); |
if (utf8) |
305 |
|
{ |
306 |
|
start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc7 */ |
307 |
|
memset(start_bits+25, 0xff, 7); /* Bits for 0xc8 - 0xff */ |
308 |
|
} |
309 |
/* Fall through */ |
/* Fall through */ |
310 |
|
|
311 |
case OP_CLASS: |
case OP_CLASS: |
312 |
{ |
{ |
313 |
tcode++; |
tcode++; |
314 |
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; |
|
315 |
|
/* In UTF-8 mode, the bits in a bit map correspond to character |
316 |
|
values, not to byte values. However, the bit map we are constructing is |
317 |
|
for byte values. So we have to do a conversion for characters whose |
318 |
|
value is > 127. In fact, there are only two possible starting bytes for |
319 |
|
characters in the range 128 - 255. */ |
320 |
|
|
321 |
|
if (utf8) |
322 |
|
{ |
323 |
|
for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; |
324 |
|
for (c = 128; c < 256; c++) |
325 |
|
{ |
326 |
|
if ((tcode[c/8] & (1 << (c&7))) != 0) /* Is this character's own bit set? */ |
327 |
|
{ |
328 |
|
int d = (c >> 6) | 0xc0; /* Set bit for this starter */ |
329 |
|
start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ |
330 |
|
c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ |
331 |
|
} |
332 |
|
} |
333 |
|
} |
334 |
|
|
335 |
|
/* In non-UTF-8 mode, the two bit maps are completely compatible. */ |
336 |
|
|
337 |
|
else |
338 |
|
{ |
339 |
|
for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; |
340 |
|
} |
341 |
|
|
342 |
|
/* Advance past the bit map, and act on what follows */ |
343 |
|
|
344 |
tcode += 32; |
tcode += 32; |
345 |
switch (*tcode) |
switch (*tcode) |
346 |
{ |
{ |