--- code/trunk/pcre_tables.c 2008/07/02 19:18:41 350 +++ code/trunk/pcre_tables.c 2008/07/04 18:27:16 351 @@ -87,6 +87,19 @@ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; +/* Table to translate from particular type value to the general value. Each entry is the general category (ucp_C, ucp_L, ...) for one particular type; the comment at the end of each row names the particular types that row covers. NOTE(review): the labels on the ucp_L row and on the "Ps, Po" row are not in the alphabetical order used by the other rows. All entries in a row are equal, so the table contents do not depend on it, but confirm the labels against the declaration order of the particular type constants. */ + +const int _pcre_ucp_gentype[] = { + ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */ + ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */ + ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */ + ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */ + ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */ + ucp_P, ucp_P, /* Ps, Po */ + ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */ + ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */ +}; + /* The pcre_utt[] table below translates Unicode property names into type and code values. It is searched by binary chop, so must be in collating sequence of name. Originally, the table contained pointers to the name strings in the first @@ -94,7 +107,10 @@ a shared library is dynamically loaded. A significant reduction is made by putting all the names into a single, large string and then using offsets in the table itself. Maintenance is more error-prone, but frequent changes to this -data is unlikely. */ +data are unlikely. + +July 2008: There is now a script called maint/GenerateUtt.py which can be used +to generate this data instead of maintaining it entirely by hand. 
*/ const char _pcre_utt_names[] = "Any\0" @@ -108,8 +124,10 @@ "Buhid\0" "C\0" "Canadian_Aboriginal\0" + "Carian\0" "Cc\0" "Cf\0" + "Cham\0" "Cherokee\0" "Cn\0" "Co\0" @@ -136,12 +154,14 @@ "Inherited\0" "Kannada\0" "Katakana\0" + "Kayah_Li\0" "Kharoshthi\0" "Khmer\0" "L\0" "L&\0" "Lao\0" "Latin\0" + "Lepcha\0" "Limbu\0" "Linear_B\0" "Ll\0" @@ -149,6 +169,8 @@ "Lo\0" "Lt\0" "Lu\0" + "Lycian\0" + "Lydian\0" "M\0" "Malayalam\0" "Mc\0" @@ -163,6 +185,7 @@ "Nl\0" "No\0" "Ogham\0" + "Ol_Chiki\0" "Old_Italic\0" "Old_Persian\0" "Oriya\0" @@ -177,14 +200,17 @@ "Pi\0" "Po\0" "Ps\0" + "Rejang\0" "Runic\0" "S\0" + "Saurashtra\0" "Sc\0" "Shavian\0" "Sinhala\0" "Sk\0" "Sm\0" "So\0" + "Sundanese\0" "Syloti_Nagri\0" "Syriac\0" "Tagalog\0" @@ -197,6 +223,7 @@ "Tibetan\0" "Tifinagh\0" "Ugaritic\0" + "Vai\0" "Yi\0" "Z\0" "Zl\0" @@ -204,111 +231,122 @@ "Zs\0"; /* One entry per property name, in the same order as the names in _pcre_utt_names. The first value in each entry is the offset of the name within that string (each name occupies its length plus one byte for the terminating \0); the second is a PT_ property type and the third is a ucp_ value, or 0 for the "Any" and "L&" entries. Inserting a name shifts the offset of every later entry, which is why this change replaces the whole table. */ const ucp_type_table _pcre_utt[] = { - { 0, PT_ANY, 0 }, - { 4, PT_SC, ucp_Arabic }, - { 11, PT_SC, ucp_Armenian }, - { 20, PT_SC, ucp_Balinese }, - { 29, PT_SC, ucp_Bengali }, - { 37, PT_SC, ucp_Bopomofo }, - { 46, PT_SC, ucp_Braille }, - { 54, PT_SC, ucp_Buginese }, - { 63, PT_SC, ucp_Buhid }, - { 69, PT_GC, ucp_C }, - { 71, PT_SC, ucp_Canadian_Aboriginal }, - { 91, PT_PC, ucp_Cc }, - { 94, PT_PC, ucp_Cf }, - { 97, PT_SC, ucp_Cherokee }, - { 106, PT_PC, ucp_Cn }, - { 109, PT_PC, ucp_Co }, - { 112, PT_SC, ucp_Common }, - { 119, PT_SC, ucp_Coptic }, - { 126, PT_PC, ucp_Cs }, - { 129, PT_SC, ucp_Cuneiform }, - { 139, PT_SC, ucp_Cypriot }, - { 147, PT_SC, ucp_Cyrillic }, - { 156, PT_SC, ucp_Deseret }, - { 164, PT_SC, ucp_Devanagari }, - { 175, PT_SC, ucp_Ethiopic }, - { 184, PT_SC, ucp_Georgian }, - { 193, PT_SC, ucp_Glagolitic }, - { 204, PT_SC, ucp_Gothic }, - { 211, PT_SC, ucp_Greek }, - { 217, PT_SC, ucp_Gujarati }, - { 226, PT_SC, ucp_Gurmukhi }, - { 235, PT_SC, ucp_Han }, - { 239, PT_SC, ucp_Hangul }, - { 246, PT_SC, ucp_Hanunoo }, - { 254, PT_SC, ucp_Hebrew }, - { 261, PT_SC, ucp_Hiragana }, - { 270, PT_SC, ucp_Inherited 
}, - { 280, PT_SC, ucp_Kannada }, - { 288, PT_SC, ucp_Katakana }, - { 297, PT_SC, ucp_Kharoshthi }, - { 308, PT_SC, ucp_Khmer }, - { 314, PT_GC, ucp_L }, - { 316, PT_LAMP, 0 }, - { 319, PT_SC, ucp_Lao }, - { 323, PT_SC, ucp_Latin }, - { 329, PT_SC, ucp_Limbu }, - { 335, PT_SC, ucp_Linear_B }, - { 344, PT_PC, ucp_Ll }, - { 347, PT_PC, ucp_Lm }, - { 350, PT_PC, ucp_Lo }, - { 353, PT_PC, ucp_Lt }, - { 356, PT_PC, ucp_Lu }, - { 359, PT_GC, ucp_M }, - { 361, PT_SC, ucp_Malayalam }, - { 371, PT_PC, ucp_Mc }, - { 374, PT_PC, ucp_Me }, - { 377, PT_PC, ucp_Mn }, - { 380, PT_SC, ucp_Mongolian }, - { 390, PT_SC, ucp_Myanmar }, - { 398, PT_GC, ucp_N }, - { 400, PT_PC, ucp_Nd }, - { 403, PT_SC, ucp_New_Tai_Lue }, - { 415, PT_SC, ucp_Nko }, - { 419, PT_PC, ucp_Nl }, - { 422, PT_PC, ucp_No }, - { 425, PT_SC, ucp_Ogham }, - { 431, PT_SC, ucp_Old_Italic }, - { 442, PT_SC, ucp_Old_Persian }, - { 454, PT_SC, ucp_Oriya }, - { 460, PT_SC, ucp_Osmanya }, - { 468, PT_GC, ucp_P }, - { 470, PT_PC, ucp_Pc }, - { 473, PT_PC, ucp_Pd }, - { 476, PT_PC, ucp_Pe }, - { 479, PT_PC, ucp_Pf }, - { 482, PT_SC, ucp_Phags_Pa }, - { 491, PT_SC, ucp_Phoenician }, - { 502, PT_PC, ucp_Pi }, - { 505, PT_PC, ucp_Po }, - { 508, PT_PC, ucp_Ps }, - { 511, PT_SC, ucp_Runic }, - { 517, PT_GC, ucp_S }, - { 519, PT_PC, ucp_Sc }, - { 522, PT_SC, ucp_Shavian }, - { 530, PT_SC, ucp_Sinhala }, - { 538, PT_PC, ucp_Sk }, - { 541, PT_PC, ucp_Sm }, - { 544, PT_PC, ucp_So }, - { 547, PT_SC, ucp_Syloti_Nagri }, - { 560, PT_SC, ucp_Syriac }, - { 567, PT_SC, ucp_Tagalog }, - { 575, PT_SC, ucp_Tagbanwa }, - { 584, PT_SC, ucp_Tai_Le }, - { 591, PT_SC, ucp_Tamil }, - { 597, PT_SC, ucp_Telugu }, - { 604, PT_SC, ucp_Thaana }, - { 611, PT_SC, ucp_Thai }, - { 616, PT_SC, ucp_Tibetan }, - { 624, PT_SC, ucp_Tifinagh }, - { 633, PT_SC, ucp_Ugaritic }, - { 642, PT_SC, ucp_Yi }, - { 645, PT_GC, ucp_Z }, - { 647, PT_PC, ucp_Zl }, - { 650, PT_PC, ucp_Zp }, - { 653, PT_PC, ucp_Zs } + { 0, PT_ANY, 0 }, + { 4, PT_SC, ucp_Arabic }, + { 11, 
PT_SC, ucp_Armenian }, + { 20, PT_SC, ucp_Balinese }, + { 29, PT_SC, ucp_Bengali }, + { 37, PT_SC, ucp_Bopomofo }, + { 46, PT_SC, ucp_Braille }, + { 54, PT_SC, ucp_Buginese }, + { 63, PT_SC, ucp_Buhid }, + { 69, PT_GC, ucp_C }, + { 71, PT_SC, ucp_Canadian_Aboriginal }, + { 91, PT_SC, ucp_Carian }, + { 98, PT_PC, ucp_Cc }, + { 101, PT_PC, ucp_Cf }, + { 104, PT_SC, ucp_Cham }, + { 109, PT_SC, ucp_Cherokee }, + { 118, PT_PC, ucp_Cn }, + { 121, PT_PC, ucp_Co }, + { 124, PT_SC, ucp_Common }, + { 131, PT_SC, ucp_Coptic }, + { 138, PT_PC, ucp_Cs }, + { 141, PT_SC, ucp_Cuneiform }, + { 151, PT_SC, ucp_Cypriot }, + { 159, PT_SC, ucp_Cyrillic }, + { 168, PT_SC, ucp_Deseret }, + { 176, PT_SC, ucp_Devanagari }, + { 187, PT_SC, ucp_Ethiopic }, + { 196, PT_SC, ucp_Georgian }, + { 205, PT_SC, ucp_Glagolitic }, + { 216, PT_SC, ucp_Gothic }, + { 223, PT_SC, ucp_Greek }, + { 229, PT_SC, ucp_Gujarati }, + { 238, PT_SC, ucp_Gurmukhi }, + { 247, PT_SC, ucp_Han }, + { 251, PT_SC, ucp_Hangul }, + { 258, PT_SC, ucp_Hanunoo }, + { 266, PT_SC, ucp_Hebrew }, + { 273, PT_SC, ucp_Hiragana }, + { 282, PT_SC, ucp_Inherited }, + { 292, PT_SC, ucp_Kannada }, + { 300, PT_SC, ucp_Katakana }, + { 309, PT_SC, ucp_Kayah_Li }, + { 318, PT_SC, ucp_Kharoshthi }, + { 329, PT_SC, ucp_Khmer }, + { 335, PT_GC, ucp_L }, + { 337, PT_LAMP, 0 }, + { 340, PT_SC, ucp_Lao }, + { 344, PT_SC, ucp_Latin }, + { 350, PT_SC, ucp_Lepcha }, + { 357, PT_SC, ucp_Limbu }, + { 363, PT_SC, ucp_Linear_B }, + { 372, PT_PC, ucp_Ll }, + { 375, PT_PC, ucp_Lm }, + { 378, PT_PC, ucp_Lo }, + { 381, PT_PC, ucp_Lt }, + { 384, PT_PC, ucp_Lu }, + { 387, PT_SC, ucp_Lycian }, + { 394, PT_SC, ucp_Lydian }, + { 401, PT_GC, ucp_M }, + { 403, PT_SC, ucp_Malayalam }, + { 413, PT_PC, ucp_Mc }, + { 416, PT_PC, ucp_Me }, + { 419, PT_PC, ucp_Mn }, + { 422, PT_SC, ucp_Mongolian }, + { 432, PT_SC, ucp_Myanmar }, + { 440, PT_GC, ucp_N }, + { 442, PT_PC, ucp_Nd }, + { 445, PT_SC, ucp_New_Tai_Lue }, + { 457, PT_SC, ucp_Nko }, + { 461, PT_PC, ucp_Nl }, + { 
464, PT_PC, ucp_No }, + { 467, PT_SC, ucp_Ogham }, + { 473, PT_SC, ucp_Ol_Chiki }, + { 482, PT_SC, ucp_Old_Italic }, + { 493, PT_SC, ucp_Old_Persian }, + { 505, PT_SC, ucp_Oriya }, + { 511, PT_SC, ucp_Osmanya }, + { 519, PT_GC, ucp_P }, + { 521, PT_PC, ucp_Pc }, + { 524, PT_PC, ucp_Pd }, + { 527, PT_PC, ucp_Pe }, + { 530, PT_PC, ucp_Pf }, + { 533, PT_SC, ucp_Phags_Pa }, + { 542, PT_SC, ucp_Phoenician }, + { 553, PT_PC, ucp_Pi }, + { 556, PT_PC, ucp_Po }, + { 559, PT_PC, ucp_Ps }, + { 562, PT_SC, ucp_Rejang }, + { 569, PT_SC, ucp_Runic }, + { 575, PT_GC, ucp_S }, + { 577, PT_SC, ucp_Saurashtra }, + { 588, PT_PC, ucp_Sc }, + { 591, PT_SC, ucp_Shavian }, + { 599, PT_SC, ucp_Sinhala }, + { 607, PT_PC, ucp_Sk }, + { 610, PT_PC, ucp_Sm }, + { 613, PT_PC, ucp_So }, + { 616, PT_SC, ucp_Sundanese }, + { 626, PT_SC, ucp_Syloti_Nagri }, + { 639, PT_SC, ucp_Syriac }, + { 646, PT_SC, ucp_Tagalog }, + { 654, PT_SC, ucp_Tagbanwa }, + { 663, PT_SC, ucp_Tai_Le }, + { 670, PT_SC, ucp_Tamil }, + { 676, PT_SC, ucp_Telugu }, + { 683, PT_SC, ucp_Thaana }, + { 690, PT_SC, ucp_Thai }, + { 695, PT_SC, ucp_Tibetan }, + { 703, PT_SC, ucp_Tifinagh }, + { 712, PT_SC, ucp_Ugaritic }, + { 721, PT_SC, ucp_Vai }, + { 725, PT_SC, ucp_Yi }, + { 728, PT_GC, ucp_Z }, + { 730, PT_PC, ucp_Zl }, + { 733, PT_PC, ucp_Zp }, + { 736, PT_PC, ucp_Zs } }; /* Number of entries in _pcre_utt, computed from the array size so it tracks the table automatically. */ const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);