2 *******************************************************************************
\r
3 * Copyright (C) 2000-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.translit;
\r
9 import com.ibm.icu.text.UTF16;
\r
10 import com.ibm.icu.text.UnicodeSet;
\r
12 public final class TestUtility {
\r
14 public static String hex(char ch) {
\r
15 String foo = Integer.toString(ch,16).toUpperCase();
\r
16 return "0000".substring(0,4-foo.length()) + foo;
\r
19 public static String hex(int ch) {
\r
20 String foo = Integer.toString(ch,16).toUpperCase();
\r
21 return "00000000".substring(0,4-foo.length()) + foo;
\r
24 public static String hex(String s) {
\r
28 public static String hex(String s, String sep) {
\r
29 if (s.length() == 0) return "";
\r
30 String result = hex(s.charAt(0));
\r
31 for (int i = 1; i < s.length(); ++i) {
\r
33 result += hex(s.charAt(i));
\r
38 public static String replace(String source, String toBeReplaced, String replacement) {
\r
39 StringBuffer results = new StringBuffer();
\r
40 int len = toBeReplaced.length();
\r
41 for (int i = 0; i < source.length(); ++i) {
\r
42 if (source.regionMatches(false, i, toBeReplaced, 0, len)) {
\r
43 results.append(replacement);
\r
44 i += len - 1; // minus one, since we will increment
\r
46 results.append(source.charAt(i));
\r
49 return results.toString();
\r
52 public static String replaceAll(String source, UnicodeSet set, String replacement) {
\r
53 StringBuffer results = new StringBuffer();
\r
55 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
\r
56 cp = UTF16.charAt(source,i);
\r
57 if (set.contains(cp)) {
\r
58 results.append(replacement);
\r
60 UTF16.append(results, cp);
\r
63 return results.toString();
\r
66 // COMMENTED OUT ALL THE OLD SCRIPT STUFF
\r
68 public static byte getScript(char c) {
\r
69 return getScript(getBlock(c));
\r
72 public static byte getScript(byte block) {
\r
73 return blockToScript[block];
\r
76 public static byte getBlock(char c) {
\r
78 byte block = charToBlock[index];
\r
79 while (block < 0) { // take care of exceptions, blocks split across 128 boundaries
\r
80 int[] tuple = split[-block-1];
\r
81 if (c < tuple[0]) block = (byte)tuple[1];
\r
82 else block = (byte)tuple[2];
\r
87 // returns next letter of script, or 0xFFFF if done
\r
89 public static char getNextLetter(char c, byte script) {
\r
90 while (c < 0xFFFF) {
\r
92 if (getScript(c) == script && Character.isLetter(c)) {
\r
99 // Supplements to Character methods; these methods go through
\r
100 // UCharacter if possible. If not, they fall back to Character.
\r
102 public static boolean isUnassigned(char c) {
\r
104 return UCharacter.getType(c) == UCharacterCategory.UNASSIGNED;
\r
105 } catch (NullPointerException e) {
\r
106 System.out.print("");
\r
108 return Character.getType(c) == Character.UNASSIGNED;
\r
111 public static boolean isLetter(char c) {
\r
113 return UCharacter.isLetter(c);
\r
114 } catch (NullPointerException e) {
\r
115 System.out.print("");
\r
117 return Character.isLetter(c);
\r
120 public static void main(String[] args) {
\r
121 System.out.println("Blocks: ");
\r
122 byte lastblock = -128;
\r
123 for (char cc = 0; cc < 0xFFFF; ++cc) {
\r
124 byte block = TestUtility.getBlock(cc);
\r
125 if (block != lastblock) {
\r
126 System.out.println(TestUtility.hex(cc) + "\t" + block);
\r
130 System.out.println();
\r
131 System.out.println("Scripts: ");
\r
132 byte lastScript = -128;
\r
133 for (char cc = 0; cc < 0xFFFF; ++cc) {
\r
134 byte script = TestUtility.getScript(cc);
\r
135 if (script != lastScript) {
\r
136 System.out.println(TestUtility.hex(cc) + "\t" + script);
\r
137 lastScript = script;
\r
144 public static final byte // SCRIPT CODE
\r
148 CYRILLIC_SCRIPT = 3,
\r
149 ARMENIAN_SCRIPT = 4,
\r
153 THAANA_SCRIPT = 8,
\r
154 DEVANAGARI_SCRIPT = 9,
\r
155 BENGALI_SCRIPT = 10,
\r
156 GURMUKHI_SCRIPT = 11,
\r
157 GUJARATI_SCRIPT = 12,
\r
160 TELUGU_SCRIPT = 15,
\r
161 KANNADA_SCRIPT = 16,
\r
162 MALAYALAM_SCRIPT = 17,
\r
163 SINHALA_SCRIPT = 18,
\r
166 TIBETAN_SCRIPT = 21,
\r
167 MYANMAR_SCRIPT = 22,
\r
168 GEORGIAN_SCRIPT = 23,
\r
170 HANGUL_SCRIPT = 25,
\r
171 ETHIOPIC_SCRIPT = 26,
\r
172 CHEROKEE_SCRIPT = 27,
\r
173 ABORIGINAL_SCRIPT = 28,
\r
177 MONGOLIAN_SCRIPT = 32,
\r
178 HIRAGANA_SCRIPT = 33,
\r
179 KATAKANA_SCRIPT = 34,
\r
180 BOPOMOFO_SCRIPT = 35,
\r
184 public static final byte // block code
\r
185 RESERVED_BLOCK = 0,
\r
187 LATIN_1_SUPPLEMENT = 2,
\r
188 LATIN_EXTENDED_A = 3,
\r
189 LATIN_EXTENDED_B = 4,
\r
190 IPA_EXTENSIONS = 5,
\r
191 SPACING_MODIFIER_LETTERS = 6,
\r
192 COMBINING_DIACRITICAL_MARKS = 7,
\r
218 UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS = 33,
\r
223 LATIN_EXTENDED_ADDITIONAL = 38,
\r
224 GREEK_EXTENDED = 39,
\r
225 GENERAL_PUNCTUATION = 40,
\r
226 SUPERSCRIPTS_AND_SUBSCRIPTS = 41,
\r
227 CURRENCY_SYMBOLS = 42,
\r
228 COMBINING_MARKS_FOR_SYMBOLS = 43,
\r
229 LETTERLIKE_SYMBOLS = 44,
\r
232 MATHEMATICAL_OPERATORS = 47,
\r
233 MISCELLANEOUS_TECHNICAL = 48,
\r
234 CONTROL_PICTURES = 49,
\r
235 OPTICAL_CHARACTER_RECOGNITION = 50,
\r
236 ENCLOSED_ALPHANUMERICS = 51,
\r
238 BLOCK_ELEMENTS = 53,
\r
239 GEOMETRIC_SHAPES = 54,
\r
240 MISCELLANEOUS_SYMBOLS = 55,
\r
242 BRAILLE_PATTERNS = 57,
\r
243 CJK_RADICALS_SUPPLEMENT = 58,
\r
244 KANGXI_RADICALS = 59,
\r
245 IDEOGRAPHIC_DESCRIPTION_CHARACTERS = 60,
\r
246 CJK_SYMBOLS_AND_PUNCTUATION = 61,
\r
250 HANGUL_COMPATIBILITY_JAMO = 65,
\r
252 BOPOMOFO_EXTENDED = 67,
\r
253 ENCLOSED_CJK_LETTERS_AND_MONTHS = 68,
\r
254 CJK_COMPATIBILITY = 69,
\r
255 CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A = 70,
\r
256 CJK_UNIFIED_IDEOGRAPHS = 71,
\r
259 HANGUL_SYLLABLES = 74,
\r
260 HIGH_SURROGATES = 75,
\r
261 HIGH_PRIVATE_USE_SURROGATES = 76,
\r
262 LOW_SURROGATES = 77,
\r
264 CJK_COMPATIBILITY_IDEOGRAPHS = 79,
\r
265 ALPHABETIC_PRESENTATION_FORMS = 80,
\r
266 ARABIC_PRESENTATION_FORMS_A = 81,
\r
267 COMBINING_HALF_MARKS = 82,
\r
268 CJK_COMPATIBILITY_FORMS = 83,
\r
269 SMALL_FORM_VARIANTS = 84,
\r
270 ARABIC_PRESENTATION_FORMS_B = 85,
\r
272 HALFWIDTH_AND_FULLWIDTH_FORMS = 87;
\r
274 static final byte[] blockToScript = {
\r
275 COMMON_SCRIPT, // 0, <RESERVED_BLOCK>
\r
276 LATIN_SCRIPT, // 1, BASIC_LATIN
\r
277 LATIN_SCRIPT, // 2, LATIN_1_SUPPLEMENT
\r
278 LATIN_SCRIPT, // 3, LATIN_EXTENDED_A
\r
279 LATIN_SCRIPT, // 4, LATIN_EXTENDED_B
\r
280 LATIN_SCRIPT, // 5, IPA_EXTENSIONS
\r
281 COMMON_SCRIPT, // 6, SPACING_MODIFIER_LETTERS
\r
282 COMMON_SCRIPT, // 7, COMBINING_DIACRITICAL_MARKS
\r
283 GREEK_SCRIPT, // 8, GREEK
\r
284 CYRILLIC_SCRIPT, // 9, CYRILLIC
\r
285 ARMENIAN_SCRIPT, // 10, ARMENIAN
\r
286 HEBREW_SCRIPT, // 11, HEBREW
\r
287 ARABIC_SCRIPT, // 12, ARABIC
\r
288 SYRIAC_SCRIPT, // 13, SYRIAC
\r
289 THAANA_SCRIPT, // 14, THAANA
\r
290 DEVANAGARI_SCRIPT, // 15, DEVANAGARI
\r
291 BENGALI_SCRIPT, // 16, BENGALI
\r
292 GURMUKHI_SCRIPT, // 17, GURMUKHI
\r
293 GUJARATI_SCRIPT, // 18, GUJARATI
\r
294 ORIYA_SCRIPT, // 19, ORIYA
\r
295 TAMIL_SCRIPT, // 20, TAMIL
\r
296 TELUGU_SCRIPT, // 21, TELUGU
\r
297 KANNADA_SCRIPT, // 22, KANNADA
\r
298 MALAYALAM_SCRIPT, // 23, MALAYALAM
\r
299 SINHALA_SCRIPT, // 24, SINHALA
\r
300 THAI_SCRIPT, // 25, THAI
\r
301 LAO_SCRIPT, // 26, LAO
\r
302 TIBETAN_SCRIPT, // 27, TIBETAN
\r
303 MYANMAR_SCRIPT, // 28, MYANMAR
\r
304 GEORGIAN_SCRIPT, // 29, GEORGIAN
\r
305 JAMO_SCRIPT, // 30, HANGUL_JAMO
\r
306 ETHIOPIC_SCRIPT, // 31, ETHIOPIC
\r
307 CHEROKEE_SCRIPT, // 32, CHEROKEE
\r
308 ABORIGINAL_SCRIPT, // 33, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS
\r
309 OGHAM_SCRIPT, // 34, OGHAM
\r
310 RUNIC_SCRIPT, // 35, RUNIC
\r
311 KHMER_SCRIPT, // 36, KHMER
\r
312 MONGOLIAN_SCRIPT, // 37, MONGOLIAN
\r
313 LATIN_SCRIPT, // 38, LATIN_EXTENDED_ADDITIONAL
\r
314 GREEK_SCRIPT, // 39, GREEK_EXTENDED
\r
315 COMMON_SCRIPT, // 40, GENERAL_PUNCTUATION
\r
316 COMMON_SCRIPT, // 41, SUPERSCRIPTS_AND_SUBSCRIPTS
\r
317 COMMON_SCRIPT, // 42, CURRENCY_SYMBOLS
\r
318 COMMON_SCRIPT, // 43, COMBINING_MARKS_FOR_SYMBOLS
\r
319 COMMON_SCRIPT, // 44, LETTERLIKE_SYMBOLS
\r
320 COMMON_SCRIPT, // 45, NUMBER_FORMS
\r
321 COMMON_SCRIPT, // 46, ARROWS
\r
322 COMMON_SCRIPT, // 47, MATHEMATICAL_OPERATORS
\r
323 COMMON_SCRIPT, // 48, MISCELLANEOUS_TECHNICAL
\r
324 COMMON_SCRIPT, // 49, CONTROL_PICTURES
\r
325 COMMON_SCRIPT, // 50, OPTICAL_CHARACTER_RECOGNITION
\r
326 COMMON_SCRIPT, // 51, ENCLOSED_ALPHANUMERICS
\r
327 COMMON_SCRIPT, // 52, BOX_DRAWING
\r
328 COMMON_SCRIPT, // 53, BLOCK_ELEMENTS
\r
329 COMMON_SCRIPT, // 54, GEOMETRIC_SHAPES
\r
330 COMMON_SCRIPT, // 55, MISCELLANEOUS_SYMBOLS
\r
331 COMMON_SCRIPT, // 56, DINGBATS
\r
332 COMMON_SCRIPT, // 57, BRAILLE_PATTERNS
\r
333 HAN_SCRIPT, // 58, CJK_RADICALS_SUPPLEMENT
\r
334 HAN_SCRIPT, // 59, KANGXI_RADICALS
\r
335 HAN_SCRIPT, // 60, IDEOGRAPHIC_DESCRIPTION_CHARACTERS
\r
336 COMMON_SCRIPT, // 61, CJK_SYMBOLS_AND_PUNCTUATION
\r
337 HIRAGANA_SCRIPT, // 62, HIRAGANA
\r
338 KATAKANA_SCRIPT, // 63, KATAKANA
\r
339 BOPOMOFO_SCRIPT, // 64, BOPOMOFO
\r
340 JAMO_SCRIPT, // 65, HANGUL_COMPATIBILITY_JAMO
\r
341 HAN_SCRIPT, // 66, KANBUN
\r
342 BOPOMOFO_SCRIPT, // 67, BOPOMOFO_EXTENDED
\r
343 COMMON_SCRIPT, // 68, ENCLOSED_CJK_LETTERS_AND_MONTHS
\r
344 COMMON_SCRIPT, // 69, CJK_COMPATIBILITY
\r
345 HAN_SCRIPT, // 70, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
\r
346 HAN_SCRIPT, // 71, CJK_UNIFIED_IDEOGRAPHS
\r
347 YI_SCRIPT, // 72, YI_SYLLABLES
\r
348 YI_SCRIPT, // 73, YI_RADICALS
\r
349 HANGUL_SCRIPT, // 74, HANGUL_SYLLABLES
\r
350 COMMON_SCRIPT, // 75, HIGH_SURROGATES
\r
351 COMMON_SCRIPT, // 76, HIGH_PRIVATE_USE_SURROGATES
\r
352 COMMON_SCRIPT, // 77, LOW_SURROGATES
\r
353 COMMON_SCRIPT, // 78, PRIVATE_USE
\r
354 HAN_SCRIPT, // 79, CJK_COMPATIBILITY_IDEOGRAPHS
\r
355 COMMON_SCRIPT, // 80, ALPHABETIC_PRESENTATION_FORMS
\r
356 ARABIC_SCRIPT, // 81, ARABIC_PRESENTATION_FORMS_A
\r
357 COMMON_SCRIPT, // 82, COMBINING_HALF_MARKS
\r
358 COMMON_SCRIPT, // 83, CJK_COMPATIBILITY_FORMS
\r
359 COMMON_SCRIPT, // 84, SMALL_FORM_VARIANTS
\r
360 ARABIC_SCRIPT, // 85, ARABIC_PRESENTATION_FORMS_B
\r
361 COMMON_SCRIPT, // 86, SPECIALS
\r
362 COMMON_SCRIPT, // 87, HALFWIDTH_AND_FULLWIDTH_FORMS
\r
363 COMMON_SCRIPT, // 88, SPECIALS
\r
366 // could be further reduced to a byte array, but I didn't bother.
\r
367 static final int[][] split = {
\r
368 {0x0250, 4, 5}, // -1
\r
369 {0x02B0, 5, 6}, // -2
\r
370 {0x0370, 7, 8}, // -3
\r
371 {0x0530, 0, 10}, // -4
\r
372 {0x0590, 10, 11}, // -5
\r
373 {0x0750, 13, 0}, // -6
\r
374 {0x07C0, 14, 0}, // -7
\r
375 {0x10A0, 28, 29}, // -8
\r
376 {0x13A0, 0, 32}, // -9
\r
377 {0x16A0, 34, 35}, // -10
\r
378 {0x18B0, 37, 0}, // -11
\r
379 {0x2070, 40, 41}, // -12
\r
380 {0x20A0, 41, -31}, // -13
\r
381 {0x2150, 44, 45}, // -14
\r
382 {0x2190, 45, 46}, // -15
\r
383 {0x2440, 49, -32}, // -16
\r
384 {0x25A0, 53, 54}, // -17
\r
385 {0x27C0, 56, 0}, // -18
\r
386 {0x2FE0, 59, -33}, // -19
\r
387 {0x3040, 61, 62}, // -20
\r
388 {0x30A0, 62, 63}, // -21
\r
389 {0x3130, 64, 65}, // -22
\r
390 {0x3190, 65, -34}, // -23
\r
391 {0x4DB6, 70, 0}, // -24
\r
392 {0xA490, 72, -35}, // -25
\r
393 {0xD7A4, 74, 0}, // -26
\r
394 {0xFB50, 80, 81}, // -27
\r
395 {0xFE20, 0, -36}, // -28
\r
396 {0xFEFF, 85, 86}, // -29
\r
397 {0xFFF0, 87, -37}, // -30
\r
398 {0x20D0, 42, 43}, // -31
\r
399 {0x2460, 50, 51}, // -32
\r
400 {0x2FF0, 0, 60}, // -33
\r
401 {0x31A0, 66, -38}, // -34
\r
402 {0xA4D0, 73, 0}, //-35
\r
403 {0xFE30, 82, -39}, //-36
\r
404 {0xFFFE, 88, 0}, //-37
\r
405 {0x31C0, 67, 0}, // -38
\r
406 {0xFE50, 83, -40}, //-39
\r
407 {0xFE70, 84, 85} // -40
\r
410 static final byte[] charToBlock = {
\r
411 1, 2, 3, 4, -1, -2, -3, 8, 9, 9, -4, -5, 12, 12, -6, -7,
\r
412 0, 0, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 27,
\r
413 28, -8, 30, 30, 31, 31, 31, -9, 33, 33, 33, 33, 33, -10, 0, 36,
\r
414 37, -11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 38, 39, 39,
\r
415 -12, -13, -14, -15, 47, 47, 48, 48, -16, 51, 52, -17, 55, 55, 56, -18,
\r
416 57, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 59, -19,
\r
417 -20, -21, -22, -23, 68, 68, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70,
\r
418 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
\r
419 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70,
\r
420 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, -24, 71, 71, 71, 71,
\r
421 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
422 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
423 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
424 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
425 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
426 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
427 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
428 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
429 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
430 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71, 71,
\r
431 72, 72, 72, 72, 72, 72, 72, 72, 72, -25, 0, 0, 0, 0, 0, 0,
\r
432 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 74, 74, 74, 74, 74, 74,
\r
433 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
\r
434 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
\r
435 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
\r
436 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74,
\r
437 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, -26,
\r
438 75, 75, 75, 75, 75, 75, 75, 76, 77, 77, 77, 77, 77, 77, 77, 77,
\r
439 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
\r
440 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
\r
441 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78, 78,
\r
442 78, 78, 79, 79, 79, 79, -27, 81, 81, 81, 81, 81, -28, -29, 87, -30
\r