[XML-DEV Mailing List Archive Home] [By Thread] [By Date] [Recent Entries] [Reply To This Message] Character classification
I've been working on making Lark really do Unicode. JDK 1.1 is supposed to have, unlike 1.0, a usable input method; thus the problem is to check, when you're reading a GI or Attribute name, whether the characters are legal namestart/name characters. It turns out to be quite a lot of work, so this is an offer to share. I wrote a program (based on Lark) that pulls the relevant character classes out of the XML spec, picks apart the markup, and writes another Java class that has some static arrays and offers two methods: package textuality.lark; public class CharClasses { public static boolean isNameC(char c) public static boolean isNameStart(char c) } It needs about 4k of tables (which it binary-searches); it might be faster with 128k of byte-addressable tables or 16K of bitmaps, neither of which would be hard to implement. (a) is this a waste of time, i.e. are there Unicode library calls that do it? (b) if not, has everyone else already done this? (c) if not, if I'm going to publish this, is the API above OK? I've attached the current Java source file for those who find the explanation above insufficiently clear. // Synthetically generated; do not edit! // package textuality.lark; import java.util.*; public class CharClasses { static final char[] sNameStart = { 170,170, 181,181, 186,186, 192,214, 216,246, 248,501, 506,535, 592,680, 688,696, 699,705, 736,740, 890,890, 902,902, 904,906, 908,908, 910,929, 931,974, 976,982, 986,986, 988,988, 990,990, 992,992, 994,1011, 1025,1036, 1038,1103, 1105,1116, 1118,1153, 1168,1220, 1223,1224, 1227,1228, 1232,1259, 1262,1269, 1272,1273, 1329,1366, 1369,1369, 1377,1415, 1488,1514, 1520,1522, 1569,1594, 1601,1610, 1649,1719, 1722,1726, 1728,1742, 1744,1747, 1749,1749, 1765,1766, 2309,2361, 2365,2365, 2392,2401, 2437,2444, 2447,2448, 2451,2472, 2474,2480, 2482,2482, 2486,2489, 2524,2525, 2527,2529, 2544,2545, 2565,2570, 2575,2576, 2579,2600, 2602,2608, 2610,2611, 2613,2614, 2616,2617, 2649,2652, 2654,2654, 2674,2676, 2693,2699, 2701,2701, 2703,2705, 2707,2728, 2730,2736, 2738,2739, 2741,2745, 2749,2749, 2784,2784, 2821,2828, 2831,2832, 2835,2856, 2858,2864, 2866,2867, 2870,2873, 2877,2877, 2908,2909, 2911,2913, 2949,2954, 2958,2960, 2962,2965, 2969,2970, 2972,2972, 2974,2975, 2979,2980, 2984,2986, 2990,2997, 2999,3001, 3077,3084, 3086,3088, 3090,3112, 3114,3123, 3125,3129, 3168,3169, 3205,3212, 3214,3216, 3218,3240, 3242,3251, 3253,3257, 3294,3294, 3296,3297, 3333,3340, 3342,3344, 3346,3368, 3370,3385, 3424,3425, 3585,3630, 3632,3632, 3634,3635, 3648,3653, 3713,3714, 3716,3716, 3719,3720, 3722,3722, 3725,3725, 3732,3735, 3737,3743, 3745,3747, 3749,3749, 3751,3751, 3754,3755, 3757,3758, 3760,3760, 3762,3763, 3773,3773, 3776,3780, 3804,3805, 3904,3911, 3913,3945, 4256,4293, 4304,4342, 4352,4441, 4447,4514, 4520,4601, 7680,7835, 7840,7929, 7936,7957, 7960,7965, 7968,8005, 8008,8013, 8016,8023, 8025,8025, 8027,8027, 8029,8029, 8031,8061, 8064,8116, 8118,8124, 8126,8126, 8130,8132, 8134,8140, 8144,8147, 8150,8155, 8160,8172, 8178,8180, 8182,8188, 8319,8319, 8450,8450, 8455,8455, 8458,8467, 8469,8469, 8472,8477, 8484,8484, 8486,8486, 8488,8488, 8490,8497, 8499,8504, 8544,8578, 12295,12295, 12321,12329, 12353,12436, 12449,12538, 12549,12588, 12593,12686, 19968,40869, 44032,55203, 63744,64045, 64256,64262, 64275,64279, 64287,64296, 64298,64310, 64312,64316, 64318,64318, 64320,64321, 64323,64324, 64326,64433, 64467,64829, 64848,64911, 64914,64967, 65008,65019, 65136,65437, 65440,65470, 65474,65479, 65482,65487, 65490,65495, 65498,65500 }; static final char[] sNameC = { 170,170, 181,181, 183,183, 186,186, 192,214, 216,246, 248,501, 506,535, 592,680, 688,696, 699,705, 720,721, 736,740, 768,837, 864,865, 890,890, 902,906, 908,908, 910,929, 931,974, 976,982, 986,986, 988,988, 990,990, 992,992, 994,1011, 1025,1036, 1038,1103, 1105,1116, 1118,1153, 1155,1158, 1168,1220, 1223,1224, 1227,1228, 1232,1259, 1262,1269, 1272,1273, 1329,1366, 1369,1369, 1377,1415, 1425,1441, 1443,1465, 1467,1469, 1471,1471, 1473,1474, 1476,1476, 1488,1514, 1520,1522, 1569,1594, 1600,1618, 1632,1641, 1648,1719, 1722,1726, 1728,1742, 1744,1747, 1749,1768, 1770,1773, 1776,1785, 2305,2307, 2309,2361, 2364,2381, 2385,2388, 2392,2403, 2406,2415, 2433,2435, 2437,2444, 2447,2448, 2451,2472, 2474,2480, 2482,2482, 2486,2489, 2492,2492, 2494,2500, 2503,2504, 2507,2509, 2519,2519, 2524,2525, 2527,2531, 2534,2545, 2562,2562, 2565,2570, 2575,2576, 2579,2600, 2602,2608, 2610,2611, 2613,2614, 2616,2617, 2620,2620, 2622,2626, 2631,2632, 2635,2637, 2649,2652, 2654,2654, 2662,2676, 2689,2691, 2693,2699, 2701,2701, 2703,2705, 2707,2728, 2730,2736, 2738,2739, 2741,2745, 2748,2757, 2759,2761, 2763,2765, 2784,2784, 2790,2799, 2817,2819, 2821,2828, 2831,2832, 2835,2856, 2858,2864, 2866,2867, 2870,2873, 2876,2883, 2887,2888, 2891,2893, 2902,2903, 2908,2909, 2911,2913, 2918,2927, 2946,2947, 2949,2954, 2958,2960, 2962,2965, 2969,2970, 2972,2972, 2974,2975, 2979,2980, 2984,2986, 2990,2997, 2999,3001, 3006,3010, 3014,3016, 3018,3021, 3031,3031, 3047,3055, 3073,3075, 3077,3084, 3086,3088, 3090,3112, 3114,3123, 3125,3129, 3134,3140, 3142,3144, 3146,3149, 3157,3158, 3168,3169, 3174,3183, 3202,3203, 3205,3212, 3214,3216, 3218,3240, 3242,3251, 3253,3257, 3262,3268, 3270,3272, 3274,3277, 3285,3286, 3294,3294, 3296,3297, 3302,3311, 3330,3331, 3333,3340, 3342,3344, 3346,3368, 3370,3385, 3390,3395, 3398,3400, 3402,3405, 3415,3415, 3424,3425, 3430,3439, 3585,3630, 3632,3642, 3648,3662, 3664,3673, 3713,3714, 3716,3716, 3719,3720, 3722,3722, 3725,3725, 3732,3735, 3737,3743, 3745,3747, 3749,3749, 3751,3751, 3754,3755, 3757,3758, 3760,3769, 3771,3773, 3776,3780, 3782,3782, 3784,3789, 3792,3801, 3804,3805, 3864,3865, 3872,3881, 3893,3893, 3895,3895, 3897,3897, 3902,3911, 3913,3945, 3953,3972, 3974,3979, 3984,3989, 3991,3991, 3993,4013, 4017,4023, 4025,4025, 4256,4293, 4304,4342, 4352,4441, 4447,4514, 4520,4601, 7680,7835, 7840,7929, 7936,7957, 7960,7965, 7968,8005, 8008,8013, 8016,8023, 8025,8025, 8027,8027, 8029,8029, 8031,8061, 8064,8116, 8118,8124, 8126,8126, 8130,8132, 8134,8140, 8144,8147, 8150,8155, 8160,8172, 8178,8180, 8182,8188, 8204,8207, 8234,8238, 8298,8303, 8319,8319, 8400,8412, 8417,8417, 8450,8450, 8455,8455, 8458,8467, 8469,8469, 8472,8477, 8484,8484, 8486,8486, 8488,8488, 8490,8497, 8499,8504, 8544,8578, 12293,12293, 12295,12295, 12321,12335, 12337,12341, 12353,12436, 12441,12446, 12449,12538, 12540,12542, 12549,12588, 12593,12686, 19968,40869, 44032,55203, 63744,64045, 64256,64262, 64275,64279, 64286,64296, 64298,64310, 64312,64316, 64318,64318, 64320,64321, 64323,64324, 64326,64433, 64467,64829, 64848,64911, 64914,64967, 65008,65019, 65056,65059, 65136,65470, 65474,65479, 65482,65487, 65490,65495, 65498,65500 }; public static boolean isNameC(char c) { return find(c, sNameC); } public static boolean isNameStart(char c) { return find(c, sNameStart); } // binary-search to find out if C is in one of the ranges in the // map. Remember that the map consists of pairs, not individuals. // If this turns into a horrible performance bottleneck, we could // put the maps into a 64k byte array or as a compromise 2 * 8k bitmaps; the // pair-array trick uses about 4k for both, at the cost of all this // binary searching private static boolean find(char c, char[] map) { int high, low, probe; low = -1; high = map.length/2; while ((high - low) > 1) { // invariant (modulo division by 2): // map[high] is strictly greater than c probe = (high + low) / 2; if (c < map[probe * 2]) high = probe; else low = probe; } return (low != -1 && c >= map[low*2] && c <= map[(low*2) + 1]); } } Cheers, Tim Bray tbray@t... http://www.textuality.com/ +1-604-708-9592
|
PURCHASE STYLUS STUDIO ONLINE TODAY!Purchasing Stylus Studio from our online shop is Easy, Secure and Value Priced! Download The World's Best XML IDE!Accelerate XML development with our award-winning XML IDE - Download a free trial today! Subscribe in XML format
|