2 *******************************************************************************
\r
3 * Copyright (C) 2000-2004, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.translit;
\r
9 import com.ibm.icu.text.*;
\r
11 public final class TestUtility {
\r
13 public static String hex(char ch) {
\r
14 String foo = Integer.toString(ch,16).toUpperCase();
\r
15 return "0000".substring(0,4-foo.length()) + foo;
\r
18 public static String hex(int ch) {
\r
19 String foo = Integer.toString(ch,16).toUpperCase();
\r
20 return "00000000".substring(0,4-foo.length()) + foo;
\r
23 public static String hex(String s) {
\r
27 public static String hex(String s, String sep) {
\r
28 if (s.length() == 0) return "";
\r
29 String result = hex(s.charAt(0));
\r
30 for (int i = 1; i < s.length(); ++i) {
\r
32 result += hex(s.charAt(i));
\r
37 public static String replace(String source, String toBeReplaced, String replacement) {
\r
38 StringBuffer results = new StringBuffer();
\r
39 int len = toBeReplaced.length();
\r
40 for (int i = 0; i < source.length(); ++i) {
\r
41 if (source.regionMatches(false, i, toBeReplaced, 0, len)) {
\r
42 results.append(replacement);
\r
43 i += len - 1; // minus one, since we will increment
\r
45 results.append(source.charAt(i));
\r
48 return results.toString();
\r
51 public static String replaceAll(String source, UnicodeSet set, String replacement) {
\r
52 StringBuffer results = new StringBuffer();
\r
54 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
\r
55 cp = UTF16.charAt(source,i);
\r
56 if (set.contains(cp)) {
\r
57 results.append(replacement);
\r
59 UTF16.append(results, cp);
\r
62 return results.toString();
\r
65 // COMMENTED OUT ALL THE OLD SCRIPT STUFF
\r
67 public static byte getScript(char c) {
\r
68 return getScript(getBlock(c));
\r
71 public static byte getScript(byte block) {
\r
72 return blockToScript[block];
\r
75 public static byte getBlock(char c) {
\r
77 byte block = charToBlock[index];
\r
78 while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
\r
79 int[] tuple = split[-block-1];
\r
80 if (c < tuple[0]) block = (byte)tuple[1];
\r
81 else block = (byte)tuple[2];
\r
86 // returns next letter of script, or 0xFFFF if done
\r
88 public static char getNextLetter(char c, byte script) {
\r
89 while (c < 0xFFFF) {
\r
91 if (getScript(c) == script && Character.isLetter(c)) {
\r
98 // Supplements to Character methods; these methods go through
\r
99 // UCharacter if possible. If not, they fall back to Character.
\r
101 public static boolean isUnassigned(char c) {
\r
103 return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED;
\r
104 } catch (NullPointerException e) {
\r
105 System.out.print("");
\r
107 return Character.getType(c) == Character.UNASSIGNED;
\r
110 public static boolean isLetter(char c) {
\r
112 return UCharacter.isLetter(c);
\r
113 } catch (NullPointerException e) {
\r
114 System.out.print("");
\r
116 return Character.isLetter(c);
\r
119 public static void main(String[] args) {
\r
120 System.out.println("Blocks: ");
\r
121 byte lastblock = -128;
\r
122 for (char cc = 0; cc < 0xFFFF; ++cc) {
\r
123 byte block = TestUtility.getBlock(cc);
\r
124 if (block != lastblock) {
\r
125 System.out.println(TestUtility.hex(cc) + "\t" + block);
\r
129 System.out.println();
\r
130 System.out.println("Scripts: ");
\r
131 byte lastScript = -128;
\r
132 for (char cc = 0; cc < 0xFFFF; ++cc) {
\r
133 byte script = TestUtility.getScript(cc);
\r
134 if (script != lastScript) {
\r
135 System.out.println(TestUtility.hex(cc) + "\t" + script);
\r
136 lastScript = script;
\r
143 public static final byte // SCRIPT CODE
\r
147 CYRILLIC_SCRIPT = 3,
\r
148 ARMENIAN_SCRIPT = 4,
\r
152 THAANA_SCRIPT = 8,
\r
153 DEVANAGARI_SCRIPT = 9,
\r
154 BENGALI_SCRIPT = 10,
\r
155 GURMUKHI_SCRIPT = 11,
\r
156 GUJARATI_SCRIPT = 12,
\r
159 TELUGU_SCRIPT = 15,
\r
160 KANNADA_SCRIPT = 16,
\r
161 MALAYALAM_SCRIPT = 17,
\r
162 SINHALA_SCRIPT = 18,
\r
165 TIBETAN_SCRIPT = 21,
\r
166 MYANMAR_SCRIPT = 22,
\r
167 GEORGIAN_SCRIPT = 23,
\r
169 HANGUL_SCRIPT = 25,
\r
170 ETHIOPIC_SCRIPT = 26,
\r
171 CHEROKEE_SCRIPT = 27,
\r
172 ABORIGINAL_SCRIPT = 28,
\r
176 MONGOLIAN_SCRIPT = 32,
\r
177 HIRAGANA_SCRIPT = 33,
\r
178 KATAKANA_SCRIPT = 34,
\r
179 BOPOMOFO_SCRIPT = 35,
\r
183 public static final byte // block code
\r
184 RESERVED_BLOCK = 0,
\r
186 LATIN_1_SUPPLEMENT = 2,
\r
187 LATIN_EXTENDED_A = 3,
\r
188 LATIN_EXTENDED_B = 4,
\r
189 IPA_EXTENSIONS = 5,
\r
190 SPACING_MODIFIER_LETTERS = 6,
\r
191 COMBINING_DIACRITICAL_MARKS = 7,
\r
217 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
\r
222 LATIN_EXTENDED_ADDITIONAL = 38,
\r
223 GREEK_EXTENDED = 39,
\r
224 GENERAL_PUNCTUATION = 40,
\r
225 SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
\r
226 CURRENCY_SYMBOLS = 42,
\r
227 COMBINING_MARKS_FOR_SYMBOLS = 43,
\r
228 LETTERLIKE_SYMBOLS = 44,
\r
231 MATHEMATICAL_OPERATORS = 47,
\r
232 MISCELLANEOUS_TECHNICAL = 48,
\r
233 CONTROL_PICTURES = 49,
\r
234 OPTICAL_CHARACTER_RECOGNITION = 50,
\r
235 ENCLOSED_ALPHANUMERICS = 51,
\r
237 BLOCK_ELEMENTS = 53,
\r
238 GEOMETRIC_SHAPES = 54,
\r
239 MISCELLANEOUS_SYMBOLS = 55,
\r
241 BRAILLE_PATTERNS = 57,
\r
242 CJK_RADICALS_SUPPLEMENT = 58,
\r
243 KANGXI_RADICALS = 59,
\r
244 IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
\r
245 CJK_SYMBOLS_AND_PUNCTUATION = 61,
\r
249 HANGUL_COMPATIBILITY_JAMO = 65,
\r
251 BOPOMOFO_EXTENDED = 67,
\r
252 ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
\r
253 CJK_COMPATIBILITY = 69,
\r
254 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
\r
255 CJK_UNIFIED_IDEOGRAPHS = 71,
\r
258 HANGUL_SYLLABLES = 74,
\r
259 HIGH_SURROGATES = 75,
\r
260 HIGH_PRIVATE_USE_SURROGATES = 76,
\r
261 LOW_SURROGATES = 77,
\r
263 CJK_COMPATIBILITY_IDEOGRAPHS = 79,
\r
264 ALPHABETIC_PRESENTATION_FORMS = 80,
\r
265 ARABIC_PRESENTATION_FORMS_A = 81,
\r
266 COMBINING_HALF_MARKS = 82,
\r
267 CJK_COMPATIBILITY_FORMS = 83,
\r
268 SMALL_FORM_VARIANTS = 84,
\r
269 ARABIC_PRESENTATION_FORMS_B = 85,
\r
271 HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
\r
273 static final byte[] blockToScript = {
\r
274 COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
\r
275 LATIN_SCRIPT, // 1, BASIC_LATIN
\r
276 LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
\r
277 LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
\r
278 LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
\r
279 LATIN_SCRIPT, // 5, IPA_EXTENSIONS
\r
280 COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
\r
281 COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
\r
282 GREEK_SCRIPT, // 8, GREEK
\r
283 CYRILLIC_SCRIPT, // 9, CYRILLIC
\r
284 ARMENIAN_SCRIPT, // 10, ARMENIAN
\r
285 HEBREW_SCRIPT, // 11, HEBREW
\r
286 ARABIC_SCRIPT, // 12, ARABIC
\r
287 SYRIAC_SCRIPT, // 13, SYRIAC
\r
288 THAANA_SCRIPT, // 14, THAANA
\r
289 DEVANAGARI_SCRIPT, // 15, DEVANAGARI
\r
290 BENGALI_SCRIPT, // 16, BENGALI
\r
291 GURMUKHI_SCRIPT, // 17, GURMUKHI
\r
292 GUJARATI_SCRIPT, // 18, GUJARATI
\r
293 ORIYA_SCRIPT, // 19, ORIYA
\r
294 TAMIL_SCRIPT, // 20, TAMIL
\r
295 TELUGU_SCRIPT, // 21, TELUGU
\r
296 KANNADA_SCRIPT, // 22, KANNADA
\r
297 MALAYALAM_SCRIPT, // 23, MALAYALAM
\r
298 SINHALA_SCRIPT, // 24, SINHALA
\r
299 THAI_SCRIPT, // 25, THAI
\r
300 LAO_SCRIPT, // 26, LAO
\r
301 TIBETAN_SCRIPT, // 27, TIBETAN
\r
302 MYANMAR_SCRIPT, // 28, MYANMAR
\r
303 GEORGIAN_SCRIPT, // 29, GEORGIAN
\r
304 JAMO_SCRIPT, // 30, HANGUL_JAMO
\r
305 ETHIOPIC_SCRIPT, // 31, ETHIOPIC
\r
306 CHEROKEE_SCRIPT, // 32, CHEROKEE
\r
307 ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
\r
308 OGHAM_SCRIPT, // 34, OGHAM
\r
309 RUNIC_SCRIPT, // 35, RUNIC
\r
310 KHMER_SCRIPT, // 36, KHMER
\r
311 MONGOLIAN_SCRIPT, // 37, MONGOLIAN
\r
312 LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
\r
313 GREEK_SCRIPT, // 39, GREEK_EXTENDED
\r
314 COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
\r
315 COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
\r
316 COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
\r
317 COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
\r
318 COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
\r
319 COMMON_SCRIPT, // 45, NUMBER_FORMS
\r
320 COMMON_SCRIPT, // 46, ARROWS
\r
321 COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
\r
322 COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
\r
323 COMMON_SCRIPT, // 49, CONTROL_PICTURES
\r
324 COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
\r
325 COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
\r
326 COMMON_SCRIPT, // 52, BOX_DRAWING
\r
327 COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
\r
328 COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
\r
329 COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
\r
330 COMMON_SCRIPT, // 56, DINGBATS
\r
331 COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
\r
332 HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
\r
333 HAN_SCRIPT, // 59, KANGXI_RADICALS
\r
334 HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
\r
335 COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
\r
336 HIRAGANA_SCRIPT, // 62, HIRAGANA
\r
337 KATAKANA_SCRIPT, // 63, KATAKANA
\r
338 BOPOMOFO_SCRIPT, // 64, BOPOMOFO
\r
339 JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
\r
340 HAN_SCRIPT, // 66, KANBUN
\r
341 BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
\r
342 COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
\r
343 COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
\r
344 HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
\r
345 HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
\r
346 YI_SCRIPT, // 72, YI_SYLLABLES
\r
347 YI_SCRIPT, // 73, YI_RADICALS
\r
348 HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
\r
349 COMMON_SCRIPT, // 75, HIGH_SURROGATES
\r
350 COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
\r
351 COMMON_SCRIPT, // 77, LOW_SURROGATES
\r
352 COMMON_SCRIPT, // 78, PRIVATE_USE
\r
353 HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
\r
354 COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
\r
355 ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
\r
356 COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
\r
357 COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
\r
358 COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
\r
359 ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
\r
360 COMMON_SCRIPT, // 86, SPECIALS
\r
361 COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
\r
362 COMMON_SCRIPT, // 88, SPECIALS
\r
365 // could be further reduced to a byte array, but I didn't bother.
\r
366 static final int[][] split = {
\r
367 {0x0250, 4, 5}, // -1
\r
368 {0x02B0, 5, 6}, // -2
\r
369 {0x0370, 7, 8}, // -3
\r
370 {0x0530, 0, 10}, // -4
\r
371 {0x0590, 10, 11}, // -5
\r
372 {0x0750, 13, 0}, // -6
\r
373 {0x07C0, 14, 0}, // -7
\r
374 {0x10A0, 28, 29}, // -8
\r
375 {0x13A0, 0, 32}, // -9
\r
376 {0x16A0, 34, 35}, // -10
\r
377 {0x18B0, 37, 0}, // -11
\r
378 {0x2070, 40, 41}, // -12
\r
379 {0x20A0, 41, -31}, // -13
\r
380 {0x2150, 44, 45}, // -14
\r
381 {0x2190, 45, 46}, // -15
\r
382 {0x2440, 49, -32}, // -16
\r
383 {0x25A0, 53, 54}, // -17
\r
384 {0x27C0, 56, 0}, // -18
\r
385 {0x2FE0, 59, -33}, // -19
\r
386 {0x3040, 61, 62}, // -20
\r
387 {0x30A0, 62, 63}, // -21
\r
388 {0x3130, 64, 65}, // -22
\r
389 {0x3190, 65, -34}, // -23
\r
390 {0x4DB6, 70, 0}, // -24
\r
391 {0xA490, 72, -35}, // -25
\r
392 {0xD7A4, 74, 0}, // -26
\r
393 {0xFB50, 80, 81}, // -27
\r
394 {0xFE20, 0, -36}, // -28
\r
395 {0xFEFF, 85, 86}, // -29
\r
396 {0xFFF0, 87, -37}, // -30
\r
397 {0x20D0, 42, 43}, // -31
\r
398 {0x2460, 50, 51}, // -32
\r
399 {0x2FF0, 0, 60}, // -33
\r
400 {0x31A0, 66, -38}, // -34
\r
401 {0xA4D0, 73, 0}, //-35
\r
402 {0xFE30, 82, -39}, //-36
\r
403 {0xFFFE, 88, 0}, //-37
\r
404 {0x31C0, 67, 0}, // -38
\r
405 {0xFE50, 83, -40}, //-39
\r
406 {0xFE70, 84, 85} // -40
\r
409 static final byte[] charToBlock = {
\r
410 1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
\r
411 0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
\r
412 28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
\r
413 37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
\r
414 -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
\r
415 57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
\r
416 -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
\r
417 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
\r
418 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
\r
419 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
\r
420 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
421 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
422 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
423 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
424 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
425 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
426 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
427 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
428 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
429 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
430 72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
\r
431 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
\r
432 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
\r
433 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
\r
434 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
\r
435 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
\r
436 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
\r
437 75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
\r
438 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
\r
439 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
\r
440 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
\r
441 78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
\r