2 *******************************************************************************
3 * Copyright (C) 2000-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.dev.test.translit;
9 import com.ibm.icu.text.UTF16;
10 import com.ibm.icu.text.UnicodeSet;
12 public final class TestUtility {
14 public static String hex(char ch) {
15 String foo = Integer.toString(ch,16).toUpperCase();
16 return "0000".substring(0,4-foo.length()) + foo;
19 public static String hex(int ch) {
20 String foo = Integer.toString(ch,16).toUpperCase();
21 return "00000000".substring(0,4-foo.length()) + foo;
// Single-argument overload of hex for strings.
// NOTE(review): this overload's body is not visible in this view, so the
// separator it uses (if it delegates to hex(String, String)) cannot be
// confirmed from here -- check against the full file.
24 public static String hex(String s) {
28 public static String hex(String s, String sep) {
29 if (s.length() == 0) return "";
30 String result = hex(s.charAt(0));
31 for (int i = 1; i < s.length(); ++i) {
33 result += hex(s.charAt(i));
38 public static String replace(String source, String toBeReplaced, String replacement) {
39 StringBuffer results = new StringBuffer();
40 int len = toBeReplaced.length();
41 for (int i = 0; i < source.length(); ++i) {
42 if (source.regionMatches(false, i, toBeReplaced, 0, len)) {
43 results.append(replacement);
44 i += len - 1; // minus one, since we will increment
46 results.append(source.charAt(i));
49 return results.toString();
52 public static String replaceAll(String source, UnicodeSet set, String replacement) {
53 StringBuffer results = new StringBuffer();
55 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
56 cp = UTF16.charAt(source,i);
57 if (set.contains(cp)) {
58 results.append(replacement);
60 UTF16.append(results, cp);
63 return results.toString();
// All of the old script-related code has been commented out.
// Returns the script code for c: looks up the block code with getBlock(c)
// and passes it to getScript(byte).
68 public static byte getScript(char c) {
69 return getScript(getBlock(c));
// Maps a block code to a script code by indexing the blockToScript table.
// NOTE(review): block is used as an array index with no range check here --
// confirm callers only pass values that are valid blockToScript indices.
72 public static byte getScript(byte block) {
73 return blockToScript[block];
// Resolves the block code for c, starting from an entry of charToBlock.
// NOTE(review): the statement that computes `index` and the final return
// are not visible in this view -- confirm against the full file.
76 public static byte getBlock(char c) {
78 byte block = charToBlock[index];
// A negative value is not a block code; -block-1 selects a row of `split`.
79 while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
80 int[] tuple = split[-block-1];
// Row layout: {boundary, value when c < boundary, value otherwise}. The
// chosen value may itself be negative, in which case the loop runs again.
81 if (c < tuple[0]) block = (byte)tuple[1];
82 else block = (byte)tuple[2];
87 // returns next letter of script, or 0xFFFF if done
// NOTE(review): most of this method's body is not visible in this view.
// The one visible test accepts c when getScript(c) equals `script` and
// Character.isLetter(c) is true; how c is advanced and what is returned
// must be confirmed against the full file.
89 public static char getNextLetter(char c, byte script) {
92 if (getScript(c) == script && Character.isLetter(c)) {
99 // Supplements to Character methods; these methods go through
100 // UCharacter if possible. If not, they fall back to Character.
// Reports whether c is unassigned. The first check compares
// UCharacter.getType(c) with UCharacterCategory.UNASSIGNED.
102 public static boolean isUnassigned(char c) {
104 return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED;
// A NullPointerException from the UCharacter lookup is caught here.
105 } catch (NullPointerException e) {
// Prints an empty string, i.e. produces no visible output.
// NOTE(review): purpose unclear (possibly a placeholder) -- confirm.
106 System.out.print("");
// java.lang.Character-based check.
// NOTE(review): presumably reached only after the catch above -- the
// lines closing the try/catch are not visible here; confirm.
108 return Character.getType(c) == Character.UNASSIGNED;
// Reports whether c is a letter. The first check asks UCharacter.isLetter(c).
111 public static boolean isLetter(char c) {
113 return UCharacter.isLetter(c);
// A NullPointerException from the UCharacter lookup is caught here.
114 } catch (NullPointerException e) {
// Prints an empty string, i.e. produces no visible output.
// NOTE(review): purpose unclear (possibly a placeholder) -- confirm.
115 System.out.print("");
// java.lang.Character-based check.
// NOTE(review): presumably reached only after the catch above -- the
// lines closing the try/catch are not visible here; confirm.
117 return Character.isLetter(c);
// Diagnostic entry point. Prints "Blocks: " and then, for each char whose
// getBlock value differs from lastblock, a line of hex(cc), a tab and the
// block code; afterwards does the same for getScript under "Scripts: ".
// Both loops run while cc < 0xFFFF, so 0xFFFF itself is not examined.
// NOTE(review): the statements that update lastblock / lastScript are not
// visible in this view -- confirm against the full file.
120 public static void main(String[] args) {
121 System.out.println("Blocks: ");
// -128 is used as the initial "previous" value for the comparison below.
122 byte lastblock = -128;
123 for (char cc = 0; cc < 0xFFFF; ++cc) {
124 byte block = TestUtility.getBlock(cc);
125 if (block != lastblock) {
126 System.out.println(TestUtility.hex(cc) + "\t" + block);
130 System.out.println();
131 System.out.println("Scripts: ");
132 byte lastScript = -128;
133 for (char cc = 0; cc < 0xFFFF; ++cc) {
134 byte script = TestUtility.getScript(cc);
135 if (script != lastScript) {
136 System.out.println(TestUtility.hex(cc) + "\t" + script);
// Script codes: byte constants naming the values stored in blockToScript.
// NOTE(review): the visible values have gaps (9, 11, 12, 17, ...), and
// blockToScript uses names such as COMMON_SCRIPT and LATIN_SCRIPT that do
// not appear here, so part of this declaration is not visible in this view.
144 public static final byte // SCRIPT CODE
154 DEVANAGARI_SCRIPT = 9,
156 GURMUKHI_SCRIPT = 11,
157 GUJARATI_SCRIPT = 12,
162 MALAYALAM_SCRIPT = 17,
168 GEORGIAN_SCRIPT = 23,
171 ETHIOPIC_SCRIPT = 26,
172 CHEROKEE_SCRIPT = 27,
173 ABORIGINAL_SCRIPT = 28,
177 MONGOLIAN_SCRIPT = 32,
178 HIRAGANA_SCRIPT = 33,
179 KATAKANA_SCRIPT = 34,
180 BOPOMOFO_SCRIPT = 35,
// Block codes: byte constants whose values match the index annotated next to
// the same name in blockToScript (e.g. LATIN_1_SUPPLEMENT = 2 and the
// "// 2, LATIN_1_SUPPLEMENT" row there).
// NOTE(review): the visible values have gaps (2, 3, 4, 6, 7, 33, ...), so
// part of this declaration is not visible in this view.
184 public static final byte // block code
187 LATIN_1_SUPPLEMENT = 2,
188 LATIN_EXTENDED_A = 3,
189 LATIN_EXTENDED_B = 4,
191 SPACING_MODIFIER_LETTERS = 6,
192 COMBINING_DIACRITICAL_MARKS = 7,
218 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
223 LATIN_EXTENDED_ADDITIONAL = 38,
225 GENERAL_PUNCTUATION = 40,
226 SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
227 CURRENCY_SYMBOLS = 42,
228 COMBINING_MARKS_FOR_SYMBOLS = 43,
229 LETTERLIKE_SYMBOLS = 44,
232 MATHEMATICAL_OPERATORS = 47,
233 MISCELLANEOUS_TECHNICAL = 48,
234 CONTROL_PICTURES = 49,
235 OPTICAL_CHARACTER_RECOGNITION = 50,
236 ENCLOSED_ALPHANUMERICS = 51,
239 GEOMETRIC_SHAPES = 54,
240 MISCELLANEOUS_SYMBOLS = 55,
242 BRAILLE_PATTERNS = 57,
243 CJK_RADICALS_SUPPLEMENT = 58,
244 KANGXI_RADICALS = 59,
245 IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
246 CJK_SYMBOLS_AND_PUNCTUATION = 61,
250 HANGUL_COMPATIBILITY_JAMO = 65,
252 BOPOMOFO_EXTENDED = 67,
253 ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
254 CJK_COMPATIBILITY = 69,
255 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
256 CJK_UNIFIED_IDEOGRAPHS = 71,
259 HANGUL_SYLLABLES = 74,
260 HIGH_SURROGATES = 75,
261 HIGH_PRIVATE_USE_SURROGATES = 76,
264 CJK_COMPATIBILITY_IDEOGRAPHS = 79,
265 ALPHABETIC_PRESENTATION_FORMS = 80,
266 ARABIC_PRESENTATION_FORMS_A = 81,
267 COMBINING_HALF_MARKS = 82,
268 CJK_COMPATIBILITY_FORMS = 83,
269 SMALL_FORM_VARIANTS = 84,
270 ARABIC_PRESENTATION_FORMS_B = 85,
272 HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
// Lookup table from block code to script code; getScript(byte) returns
// blockToScript[block]. Each row's trailing comment gives the row's index
// and the name of the block it stands for.
274 static final byte[] blockToScript = {
275 COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
276 LATIN_SCRIPT, // 1, BASIC_LATIN
277 LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
278 LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
279 LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
280 LATIN_SCRIPT, // 5, IPA_EXTENSIONS
281 COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
282 COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
283 GREEK_SCRIPT, // 8, GREEK
284 CYRILLIC_SCRIPT, // 9, CYRILLIC
285 ARMENIAN_SCRIPT, // 10, ARMENIAN
286 HEBREW_SCRIPT, // 11, HEBREW
287 ARABIC_SCRIPT, // 12, ARABIC
288 SYRIAC_SCRIPT, // 13, SYRIAC
289 THAANA_SCRIPT, // 14, THAANA
290 DEVANAGARI_SCRIPT, // 15, DEVANAGARI
291 BENGALI_SCRIPT, // 16, BENGALI
292 GURMUKHI_SCRIPT, // 17, GURMUKHI
293 GUJARATI_SCRIPT, // 18, GUJARATI
294 ORIYA_SCRIPT, // 19, ORIYA
295 TAMIL_SCRIPT, // 20, TAMIL
296 TELUGU_SCRIPT, // 21, TELUGU
297 KANNADA_SCRIPT, // 22, KANNADA
298 MALAYALAM_SCRIPT, // 23, MALAYALAM
299 SINHALA_SCRIPT, // 24, SINHALA
300 THAI_SCRIPT, // 25, THAI
301 LAO_SCRIPT, // 26, LAO
302 TIBETAN_SCRIPT, // 27, TIBETAN
303 MYANMAR_SCRIPT, // 28, MYANMAR
304 GEORGIAN_SCRIPT, // 29, GEORGIAN
305 JAMO_SCRIPT, // 30, HANGUL_JAMO
306 ETHIOPIC_SCRIPT, // 31, ETHIOPIC
307 CHEROKEE_SCRIPT, // 32, CHEROKEE
308 ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
309 OGHAM_SCRIPT, // 34, OGHAM
310 RUNIC_SCRIPT, // 35, RUNIC
311 KHMER_SCRIPT, // 36, KHMER
312 MONGOLIAN_SCRIPT, // 37, MONGOLIAN
313 LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
314 GREEK_SCRIPT, // 39, GREEK_EXTENDED
315 COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
316 COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
317 COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
318 COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
319 COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
320 COMMON_SCRIPT, // 45, NUMBER_FORMS
321 COMMON_SCRIPT, // 46, ARROWS
322 COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
323 COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
324 COMMON_SCRIPT, // 49, CONTROL_PICTURES
325 COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
326 COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
327 COMMON_SCRIPT, // 52, BOX_DRAWING
328 COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
329 COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
330 COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
331 COMMON_SCRIPT, // 56, DINGBATS
332 COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
333 HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
334 HAN_SCRIPT, // 59, KANGXI_RADICALS
335 HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
336 COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
337 HIRAGANA_SCRIPT, // 62, HIRAGANA
338 KATAKANA_SCRIPT, // 63, KATAKANA
339 BOPOMOFO_SCRIPT, // 64, BOPOMOFO
340 JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
341 HAN_SCRIPT, // 66, KANBUN
342 BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
343 COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
344 COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
345 HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
346 HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
347 YI_SCRIPT, // 72, YI_SYLLABLES
348 YI_SCRIPT, // 73, YI_RADICALS
349 HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
350 COMMON_SCRIPT, // 75, HIGH_SURROGATES
351 COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
352 COMMON_SCRIPT, // 77, LOW_SURROGATES
353 COMMON_SCRIPT, // 78, PRIVATE_USE
354 HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
355 COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
356 ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
357 COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
358 COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
359 COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
360 ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
361 COMMON_SCRIPT, // 86, SPECIALS
362 COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
363 COMMON_SCRIPT, // 88, SPECIALS
366 // could be further reduced to a byte array, but I didn't bother.
// Exception table consulted by getBlock when a looked-up value is negative.
// A negative value v selects row split[-v-1]; the trailing "// -N" comment on
// each row is the value that selects it. Row layout:
//   {boundary, result when c < boundary, result otherwise}
// A result may itself be negative, which chains to another row of this table.
367 static final int[][] split = {
368 {0x0250, 4, 5}, // -1
369 {0x02B0, 5, 6}, // -2
370 {0x0370, 7, 8}, // -3
371 {0x0530, 0, 10}, // -4
372 {0x0590, 10, 11}, // -5
373 {0x0750, 13, 0}, // -6
374 {0x07C0, 14, 0}, // -7
375 {0x10A0, 28, 29}, // -8
376 {0x13A0, 0, 32}, // -9
377 {0x16A0, 34, 35}, // -10
378 {0x18B0, 37, 0}, // -11
379 {0x2070, 40, 41}, // -12
380 {0x20A0, 41, -31}, // -13
381 {0x2150, 44, 45}, // -14
382 {0x2190, 45, 46}, // -15
383 {0x2440, 49, -32}, // -16
384 {0x25A0, 53, 54}, // -17
385 {0x27C0, 56, 0}, // -18
386 {0x2FE0, 59, -33}, // -19
387 {0x3040, 61, 62}, // -20
388 {0x30A0, 62, 63}, // -21
389 {0x3130, 64, 65}, // -22
390 {0x3190, 65, -34}, // -23
391 {0x4DB6, 70, 0}, // -24
392 {0xA490, 72, -35}, // -25
393 {0xD7A4, 74, 0}, // -26
394 {0xFB50, 80, 81}, // -27
395 {0xFE20, 0, -36}, // -28
396 {0xFEFF, 85, 86}, // -29
397 {0xFFF0, 87, -37}, // -30
398 {0x20D0, 42, 43}, // -31
399 {0x2460, 50, 51}, // -32
400 {0x2FF0, 0, 60}, // -33
401 {0x31A0, 66, -38}, // -34
402 {0xA4D0, 73, 0}, //-35
403 {0xFE30, 82, -39}, //-36
404 {0xFFFE, 88, 0}, //-37
405 {0x31C0, 67, 0}, // -38
406 {0xFE50, 83, -40}, //-39
407 {0xFE70, 84, 85} // -40
410 static final byte[] charToBlock = {
411 1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
412 0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
413 28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
414 37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
415 -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
416 57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
417 -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
418 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
419 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
420 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
421 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
422 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
423 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
424 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
425 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
426 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
427 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
428 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
429 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
430 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
431 72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
432 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
433 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
434 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
435 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
436 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
437 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
438 75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
439 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
440 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
441 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
442 78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30