2 *******************************************************************************
\r
3 * Copyright (C) 2000-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.translit;
\r
9 import java.io.BufferedWriter;
\r
10 import java.io.ByteArrayOutputStream;
\r
11 import java.io.File;
\r
12 import java.io.FileNotFoundException;
\r
13 import java.io.FileOutputStream;
\r
14 import java.io.IOException;
\r
15 import java.io.OutputStreamWriter;
\r
16 import java.io.PrintWriter;
\r
17 import java.io.UnsupportedEncodingException;
\r
18 import java.util.MissingResourceException;
\r
20 import com.ibm.icu.dev.test.TestFmwk;
\r
21 import com.ibm.icu.impl.Utility;
\r
22 import com.ibm.icu.lang.UCharacter;
\r
23 import com.ibm.icu.lang.UProperty;
\r
24 import com.ibm.icu.text.Normalizer;
\r
25 import com.ibm.icu.text.Transliterator;
\r
26 import com.ibm.icu.text.UTF16;
\r
27 import com.ibm.icu.text.UnicodeSet;
\r
28 import com.ibm.icu.text.UnicodeSetIterator;
\r
29 import com.ibm.icu.util.LocaleData;
\r
30 import com.ibm.icu.util.ULocale;
\r
34 * @summary Round trip test of Transliterator
\r
36 public class RoundTripTest extends TestFmwk {
\r
38 static final boolean EXTRA_TESTS = true;
\r
39 static final boolean PRINT_RULES = true;
\r
41 public static void main(String[] args) throws Exception {
\r
42 new RoundTripTest().run(args);
\r
45 public void TestSingle() throws IOException, ParseException {
\r
46 Transliterator t = Transliterator.getInstance("Latin-Greek");
\r
47 String s = t.transliterate("\u0101\u0069");
\r
52 Note: Unicode 3.2 added new Hiragana/Katakana characters:
\r
54 3095..3096 ; 3.2 # [2] HIRAGANA LETTER SMALL KA..HIRAGANA LETTER SMALL KE
\r
55 309F..30A0 ; 3.2 # [2] HIRAGANA DIGRAPH YORI..KATAKANA-HIRAGANA DOUBLE HYPHEN
\r
56 30FF ; 3.2 # KATAKANA DIGRAPH KOTO
\r
57 31F0..31FF ; 3.2 # [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO
\r
59 Unicode 5.2 added another Hiragana character:
\r
60 1F200 ; 5.2 # SQUARE HIRAGANA HOKA
\r
62 We will not add them to the rules until they are more supported (e.g. in fonts on Windows)
\r
63 A bug has been filed to remind us to do this: #1979.
\r
66 static String KATAKANA = "[[[:katakana:][\u30A1-\u30FA\u30FC]]-[\u30FF\u31F0-\u31FF]]";
\r
67 static String HIRAGANA = "[[[:hiragana:][\u3040-\u3094]]-[\u3095-\u3096\u309F-\u30A0\\U0001F200-\\U0001F2FF]]";
\r
68 static String LENGTH = "[\u30FC]";
\r
69 static String HALFWIDTH_KATAKANA = "[\uFF65-\uFF9D]";
\r
70 static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
\r
71 static String HIRAGANA_ITERATION = "[\u309D\u309E]";
\r
73 //------------------------------------------------------------------
\r
74 // AbbreviatedUnicodeSetIterator
\r
75 //------------------------------------------------------------------
\r
77 static class AbbreviatedUnicodeSetIterator extends UnicodeSetIterator {
\r
79 private boolean abbreviated;
\r
80 private int perRange;
\r
82 public AbbreviatedUnicodeSetIterator() {
\r
84 abbreviated = false;
\r
87 public void reset(UnicodeSet newSet) {
\r
88 reset(newSet, false);
\r
91 public void reset(UnicodeSet newSet, boolean abb) {
\r
92 reset(newSet, abb, 100);
\r
95 public void reset(UnicodeSet newSet, boolean abb, int density) {
\r
96 super.reset(newSet);
\r
98 perRange = newSet.getRangeCount();
\r
99 if (perRange != 0) {
\r
100 perRange = density / perRange;
\r
104 protected void loadRange(int myRange) {
\r
105 super.loadRange(myRange);
\r
106 if (abbreviated && (endElement > nextElement + perRange)) {
\r
107 endElement = nextElement + perRange;
\r
112 //--------------------------------------------------------------------
\r
114 public void showElapsed(long start, String name) {
\r
115 double dur = (System.currentTimeMillis() - start) / 1000.0;
\r
116 logln(name + " took " + dur + " seconds");
\r
119 public void TestKana() throws IOException {
\r
120 long start = System.currentTimeMillis();
\r
121 new Test("Katakana-Hiragana")
\r
122 .test(KATAKANA, "[" + HIRAGANA + LENGTH + "]", "[" + HALFWIDTH_KATAKANA + LENGTH + "]", this, new Legal());
\r
123 showElapsed(start, "TestKana");
\r
126 public void TestHiragana() throws IOException {
\r
127 long start = System.currentTimeMillis();
\r
128 new Test("Latin-Hiragana")
\r
129 .test("[a-zA-Z]", HIRAGANA, HIRAGANA_ITERATION, this, new Legal());
\r
130 showElapsed(start, "TestHiragana");
\r
133 public void TestKatakana() throws IOException {
\r
134 long start = System.currentTimeMillis();
\r
135 new Test("Latin-Katakana")
\r
136 .test("[a-zA-Z]", KATAKANA, "[" + KATAKANA_ITERATION + HALFWIDTH_KATAKANA + "]", this, new Legal());
\r
137 showElapsed(start, "TestKatakana");
\r
140 public void TestJamo() throws IOException {
\r
141 long start = System.currentTimeMillis();
\r
142 new Test("Latin-Jamo")
\r
143 .test("[a-zA-Z]", "[\u1100-\u1112 \u1161-\u1175 \u11A8-\u11C2]", "", this, new LegalJamo());
\r
144 showElapsed(start, "TestJamo");
\r
148 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
\r
149 LCount = 19, VCount = 21, TCount = 28,
\r
150 NCount = VCount * TCount, // 588
\r
151 SCount = LCount * NCount, // 11172
\r
152 LLimit = LBase + LCount, // 1113
\r
153 VLimit = VBase + VCount, // 1176
\r
154 TLimit = TBase + TCount, // 11C3
\r
155 SLimit = SBase + SCount; // D7A4
\r
158 public void TestHangul() throws IOException {
\r
159 long start = System.currentTimeMillis();
\r
160 Test t = new Test("Latin-Hangul", 5);
\r
161 boolean TEST_ALL = "true".equalsIgnoreCase(getProperty("HangulRoundTripAll"));
\r
162 if (TEST_ALL && getInclusion() == 10) {
\r
163 t.setPairLimit(Integer.MAX_VALUE); // only go to the limit if we have TEST_ALL and getInclusion
\r
165 t.test("[a-zA-Z]", "[\uAC00-\uD7A4]", "", this, new Legal());
\r
166 showElapsed(start, "TestHangul");
\r
170 * This is a shorter version of the test for doubles, that allows us to skip lots of cases, but
\r
171 * does check the ones that should cause problems (if any do).
\r
173 public void TestHangul2() {
\r
174 Transliterator lh = Transliterator.getInstance("Latin-Hangul");
\r
175 Transliterator hl = lh.getInverse();
\r
176 final UnicodeSet representativeHangul = getRepresentativeHangul();
\r
177 for (UnicodeSetIterator it = new UnicodeSetIterator(representativeHangul); it.next();) {
\r
178 assertRoundTripTransform("Transform", it.getString(), lh, hl);
\r
182 private void assertRoundTripTransform(String message, String source, Transliterator lh, Transliterator hl) {
\r
183 String to = hl.transform(source);
\r
184 String back = lh.transform(to);
\r
185 if (!source.equals(back)) {
\r
186 String to2 = hl.transform(source.replaceAll("(.)", "$1 ").trim());
\r
187 String to3 = hl.transform(back.replaceAll("(.)", "$1 ").trim());
\r
188 assertEquals(message + " " + source + " [" + to + "/"+ to2 + "/"+ to3 + "]", source, back);
\r
192 public static UnicodeSet getRepresentativeHangul() {
\r
193 UnicodeSet extraSamples = new UnicodeSet("[\uCE20{\uAD6C\uB514}{\uAD73\uC774}{\uBB34\uB837}{\uBB3C\uC5FF}{\uC544\uAE4C}{\uC544\uB530}{\uC544\uBE60}{\uC544\uC2F8}{\uC544\uC9DC}{\uC544\uCC28}{\uC545\uC0AC}{\uC545\uC2F8}{\uC546\uCE74}{\uC548\uAC00}{\uC548\uC790}{\uC548\uC9DC}{\uC548\uD558}{\uC54C\uAC00}{\uC54C\uB530}{\uC54C\uB9C8}{\uC54C\uBC14}{\uC54C\uBE60}{\uC54C\uC0AC}{\uC54C\uC2F8}{\uC54C\uD0C0}{\uC54C\uD30C}{\uC54C\uD558}{\uC555\uC0AC}{\uC555\uC2F8}{\uC558\uC0AC}{\uC5C5\uC12F\uC501}{\uC5C6\uC5C8\uC2B5}]");
\r
194 UnicodeSet sourceSet = new UnicodeSet();
\r
195 addRepresentativeHangul(sourceSet, 2, false);
\r
196 addRepresentativeHangul(sourceSet, 3, false);
\r
197 addRepresentativeHangul(sourceSet, 2, true);
\r
198 addRepresentativeHangul(sourceSet, 3, true);
\r
199 // add the boundary cases; we want an example of each case of V + L and one example of each case of T+L
\r
201 UnicodeSet more = getRepresentativeBoundaryHangul();
\r
202 sourceSet.addAll(more);
\r
203 sourceSet.addAll(extraSamples);
\r
207 private static UnicodeSet getRepresentativeBoundaryHangul() {
\r
208 UnicodeSet resultToAddTo = new UnicodeSet();
\r
209 // U+1100 HANGUL CHOSEONG KIYEOK
\r
210 // U+1161 HANGUL JUNGSEONG A
\r
211 UnicodeSet L = new UnicodeSet("[:hst=L:]");
\r
212 UnicodeSet V = new UnicodeSet("[:hst=V:]");
\r
213 UnicodeSet T = new UnicodeSet("[:hst=T:]");
\r
215 String prefixLV = "\u1100\u1161";
\r
216 String prefixL = "\u1100";
\r
217 String suffixV = "\u1161";
\r
218 String nullL = "\u110B"; // HANGUL CHOSEONG IEUNG
\r
220 UnicodeSet L0 = new UnicodeSet("[\u1100\u110B]");
\r
222 // do all combinations of L0 + V + nullL + V
\r
224 for (UnicodeSetIterator iL0 = new UnicodeSetIterator(L0); iL0.next();) {
\r
225 for (UnicodeSetIterator iV = new UnicodeSetIterator(V); iV.next();) {
\r
226 for (UnicodeSetIterator iV2 = new UnicodeSetIterator(V); iV2.next();) {
\r
227 String sample = iL0.getString() + iV.getString() + nullL + iV2.getString();
\r
228 String trial = Normalizer.compose(sample, false);
\r
229 if (trial.length() == 2) {
\r
230 resultToAddTo.add(trial);
\r
236 for (UnicodeSetIterator iL = new UnicodeSetIterator(L); iL.next();) {
\r
237 // do all combinations of "g" + V + L + "a"
\r
238 final String suffix = iL.getString() + suffixV;
\r
239 for (UnicodeSetIterator iV = new UnicodeSetIterator(V); iV.next();) {
\r
240 String sample = prefixL + iV.getString() + suffix;
\r
241 String trial = Normalizer.compose(sample, false);
\r
242 if (trial.length() == 2) {
\r
243 resultToAddTo.add(trial);
\r
246 // do all combinations of "ga" + T + L + "a"
\r
247 for (UnicodeSetIterator iT = new UnicodeSetIterator(T); iT.next();) {
\r
248 String sample = prefixLV + iT.getString() + suffix;
\r
249 String trial = Normalizer.compose(sample, false);
\r
250 if (trial.length() == 2) {
\r
251 resultToAddTo.add(trial);
\r
255 return resultToAddTo;
\r
258 private static void addRepresentativeHangul(UnicodeSet resultToAddTo, int leng, boolean noFirstConsonant) {
\r
259 UnicodeSet notYetSeen = new UnicodeSet();
\r
260 for (char c = '\uAC00'; c < '\uD7AF'; ++c) {
\r
261 String charStr = String.valueOf(c);
\r
262 String decomp = Normalizer.decompose(charStr, false);
\r
263 if (decomp.length() != leng) {
\r
264 continue; // only take one length at a time
\r
266 if (decomp.startsWith("\u110B ") != noFirstConsonant) {
\r
269 if (!notYetSeen.containsAll(decomp)) {
\r
270 resultToAddTo.add(c);
\r
271 notYetSeen.addAll(decomp);
\r
277 public void TestHan() throws UnsupportedEncodingException, FileNotFoundException {
\r
279 UnicodeSet exemplars = LocaleData.getExemplarSet(new ULocale("zh"),0);
\r
280 // create string with all chars
\r
281 StringBuffer b = new StringBuffer();
\r
282 for (UnicodeSetIterator it = new UnicodeSetIterator(exemplars); it.next();) {
\r
283 UTF16.append(b,it.codepoint);
\r
285 String source = b.toString();
\r
286 // transform with Han translit
\r
287 Transliterator han = Transliterator.getInstance("Han-Latin");
\r
288 String target = han.transliterate(source);
\r
289 // now verify that there are no Han characters left
\r
290 UnicodeSet allHan = new UnicodeSet("[:han:]");
\r
291 assertFalse("No Han must be left after Han-Latin transliteration",allHan.containsSome(target));
\r
292 // check the pinyin translit
\r
293 Transliterator pn = Transliterator.getInstance("Latin-NumericPinyin");
\r
294 String target2 = pn.transliterate(target);
\r
295 // verify that there are no marks
\r
296 Transliterator nfc = Transliterator.getInstance("nfc");
\r
297 String nfced = nfc.transliterate(target2);
\r
298 UnicodeSet allMarks = new UnicodeSet("[:mark:]");
\r
299 assertFalse("NumericPinyin must contain no marks", allMarks.containsSome(nfced));
\r
300 // verify roundtrip
\r
301 Transliterator np = pn.getInverse();
\r
302 String target3 = np.transliterate(target);
\r
303 boolean roundtripOK = target3.equals(target);
\r
304 assertTrue("NumericPinyin must roundtrip", roundtripOK);
\r
305 if (!roundtripOK) {
\r
306 String filename = "numeric-pinyin.log.txt";
\r
307 PrintWriter out = new PrintWriter(
\r
308 new BufferedWriter(
\r
309 new OutputStreamWriter(
\r
310 new FileOutputStream(filename), "UTF8"), 4*1024));
\r
311 errln("Creating log file " + new File(filename).getAbsoluteFile());
\r
312 out.println("Pinyin: " + target);
\r
313 out.println("Pinyin-Numeric-Pinyin: " + target2);
\r
316 }catch(MissingResourceException ex){
\r
317 warnln("Could not load the locale data for fetching the exemplar characters.");
\r
321 public void TestSingle() {
\r
322 Transliterator t = Transliterator.getInstance("Latin-Greek");
\r
323 t.transliterate("\u0061\u0101\u0069");
\r
326 String getGreekSet() {
\r
328 if (skipIfBeforeICU(4,5,0)) {
\r
329 // We temporarily filter against Unicode 4.1, but we only do this
\r
330 // before version 3.5.
\r
331 logln("TestGreek needs to be updated to remove delete the section marked [:Age=4.0:] filter");
\r
333 errln("TestGreek needs to be updated to remove delete the [:Age=4.0:] filter ");
\r
336 // isICU28() ? "[[\u003B\u00B7[:Greek:]-[\u03D7-\u03EF]]&[:Age=3.2:]]" :
\r
337 "[\u003B\u00B7[[:Greek:]&[:Letter:]]-[" +
\r
338 "\u1D26-\u1D2A" + // L& [5] GREEK LETTER SMALL CAPITAL GAMMA..GREEK LETTER SMALL CAPITAL PSI
\r
339 "\u1D5D-\u1D61" + // Lm [5] MODIFIER LETTER SMALL BETA..MODIFIER LETTER SMALL CHI
\r
340 "\u1D66-\u1D6A" + // L& [5] GREEK SUBSCRIPT SMALL LETTER BETA..GREEK SUBSCRIPT SMALL LETTER CHI
\r
341 "\u03D7-\u03EF" + // \N{GREEK KAI SYMBOL}..\N{COPTIC SMALL LETTER DEI}
\r
342 "] & [:Age=4.0:]]";
\r
345 public void TestGreek() throws IOException {
\r
346 long start = System.currentTimeMillis();
\r
347 new Test("Latin-Greek", 50)
\r
348 .test("[a-zA-Z]", getGreekSet(),
\r
349 "[\u00B5\u037A\u03D0-\u03F5\u03F9]", /* roundtrip exclusions */
\r
350 this, new LegalGreek(true));
\r
351 showElapsed(start, "TestGreek");
\r
354 public void TestGreekUNGEGN() throws IOException {
\r
355 long start = System.currentTimeMillis();
\r
356 new Test("Latin-Greek/UNGEGN")
\r
357 .test("[a-zA-Z]", getGreekSet(),
\r
358 "[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */
\r
359 this, new LegalGreek(false));
\r
360 showElapsed(start, "TestGreekUNGEGN");
\r
363 public void Testel() throws IOException {
\r
364 long start = System.currentTimeMillis();
\r
365 new Test("Latin-el")
\r
366 .test("[a-zA-Z]", getGreekSet(),
\r
367 "[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */
\r
368 this, new LegalGreek(false));
\r
369 showElapsed(start, "Testel");
\r
372 public void TestCyrillic() throws IOException {
\r
373 long start = System.currentTimeMillis();
\r
374 new Test("Latin-Cyrillic")
\r
375 .test("[a-zA-Z\u0110\u0111\u02BA\u02B9]", "[\u0400-\u045F]", null, this, new Legal());
\r
376 showElapsed(start, "TestCyrillic");
\r
379 static final String ARABIC = "[\u06A9\u060C\u061B\u061F\u0621\u0627-\u063A\u0641-\u0655\u0660-\u066C\u067E\u0686\u0698\u06A4\u06AD\u06AF\u06CB-\u06CC\u06F0-\u06F9]";
\r
381 public void TestArabic() throws IOException {
\r
382 long start = System.currentTimeMillis();
\r
383 new Test("Latin-Arabic")
\r
384 .test("[a-zA-Z\u02BE\u02BF]", ARABIC, "[a-zA-Z\u02BE\u02BF\u207F]", null, this, new Legal()); //
\r
385 showElapsed(start, "TestArabic");
\r
388 public void TestHebrew() throws IOException {
\r
390 if (skipIfBeforeICU(4,5,0)) {
\r
391 // We temporarily filter against Unicode 4.1, but we only do this
\r
392 // before version 3.5.
\r
393 logln("TestHebrew needs to be updated to remove delete the section marked [:Age=4.0:] filter");
\r
395 errln("TestHebrew needs to be updated to remove delete the [:Age=4.0:] filter ");
\r
397 long start = System.currentTimeMillis();
\r
398 new Test("Latin-Hebrew")
\r
399 .test("[a-zA-Z\u02BC\u02BB]", "[[[:hebrew:]-[\u05BD\uFB00-\uFBFF]]& [:Age=4.0:]]", "[\u05F0\u05F1\u05F2]", this, new LegalHebrew());
\r
400 showElapsed(start, "TestHebrew");
\r
403 public void TestThai() throws IOException {
\r
404 long start = System.currentTimeMillis();
\r
405 if(skipIfBeforeICU(4,5,0)){
\r
406 new Test("Latin-Thai")
\r
407 .test("[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02CC]",
\r
408 "[\u0E01-\u0E3A\u0E40-\u0E5B]",
\r
409 "[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02B9\u02CC]",
\r
410 "[\u0E4F]", this, new LegalThai());
\r
412 new Test("Latin-Thai")
\r
413 .test("[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02CC]",
\r
414 "[\u0E01-\u0E3A\u0E40-\u0E5B]",
\r
415 "[a-zA-Z\u0142\u1ECD\u00E6\u0131\u0268\u02B9\u02CC]",
\r
416 null, this, new LegalThai());
\r
419 showElapsed(start, "TestThai");
\r
422 //----------------------------------
\r
423 // Inter-Indic Tests
\r
424 //----------------------------------
\r
425 public static class LegalIndic extends Legal{
\r
426 UnicodeSet vowelSignSet = new UnicodeSet();
\r
428 public LegalIndic(){
\r
429 vowelSignSet.addAll(new UnicodeSet("[\u0901\u0902\u0903\u0904\u093e-\u094c\u0962\u0963]")); /* Devanagari */
\r
430 vowelSignSet.addAll(new UnicodeSet("[\u0981\u0982\u0983\u09be-\u09cc\u09e2\u09e3\u09D7]")); /* Bengali */
\r
431 vowelSignSet.addAll(new UnicodeSet("[\u0a01\u0a02\u0a03\u0a3e-\u0a4c\u0a62\u0a63\u0a70\u0a71]")); /* Gurmukhi */
\r
432 vowelSignSet.addAll(new UnicodeSet("[\u0a81\u0a82\u0a83\u0abe-\u0acc\u0ae2\u0ae3]")); /* Gujarati */
\r
433 vowelSignSet.addAll(new UnicodeSet("[\u0b01\u0b02\u0b03\u0b3e-\u0b4c\u0b62\u0b63\u0b56\u0b57]")); /* Oriya */
\r
434 vowelSignSet.addAll(new UnicodeSet("[\u0b81\u0b82\u0b83\u0bbe-\u0bcc\u0be2\u0be3\u0bd7]")); /* Tamil */
\r
435 vowelSignSet.addAll(new UnicodeSet("[\u0c01\u0c02\u0c03\u0c3e-\u0c4c\u0c62\u0c63\u0c55\u0c56]")); /* Telugu */
\r
436 vowelSignSet.addAll(new UnicodeSet("[\u0c81\u0c82\u0c83\u0cbe-\u0ccc\u0ce2\u0ce3\u0cd5\u0cd6]")); /* Kannada */
\r
437 vowelSignSet.addAll(new UnicodeSet("[\u0d01\u0d02\u0d03\u0d3e-\u0d4c\u0d62\u0d63\u0d57]")); /* Malayalam */
\r
440 String avagraha = "\u093d\u09bd\u0abd\u0b3d\u0cbd";
\r
441 String nukta = "\u093c\u09bc\u0a3c\u0abc\u0b3c\u0cbc";
\r
442 String virama = "\u094d\u09cd\u0a4d\u0acd\u0b4d\u0bcd\u0c4d\u0ccd\u0d4d";
\r
443 String sanskritStressSigns = "\u0951\u0952\u0953\u0954\u097d";
\r
444 String chandrabindu = "\u0901\u0981\u0A81\u0b01\u0c01";
\r
445 public boolean is(String sourceString){
\r
446 int cp=sourceString.charAt(0);
\r
448 // A vowel sign cannot be the first char
\r
449 if(vowelSignSet.contains(cp)){
\r
451 }else if(avagraha.indexOf(cp)!=-1){
\r
453 }else if(virama.indexOf(cp)!=-1){
\r
455 }else if(nukta.indexOf(cp)!=-1){
\r
457 }else if(sanskritStressSigns.indexOf(cp)!=-1){
\r
459 }else if((chandrabindu.indexOf(cp)!=-1) &&
\r
460 (sourceString.length() >1 &&
\r
461 vowelSignSet.contains(sourceString.charAt(1)))){
\r
467 static String latinForIndic = "[['.0-9A-Za-z~\u00C0-\u00C5\u00C7-\u00CF\u00D1-\u00D6\u00D9-\u00DD"+
\r
468 "\u00E0-\u00E5\u00E7-\u00EF\u00F1-\u00F6\u00F9-\u00FD\u00FF-\u010F"+
\r
469 "\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148"+
\r
470 "\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0"+
\r
471 "\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u01FB"+
\r
472 "\u0200-\u021B\u021E-\u021F\u0226-\u0233\u0294\u0303-\u0304\u0306\u0314-\u0315"+
\r
473 "\u0325\u040E\u0419\u0439\u045E\u04C1-\u04C2\u04D0-\u04D1\u04D6-\u04D7"+
\r
474 "\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1F01\u1F03\u1F05"+
\r
475 "\u1F07\u1F09\u1F0B\u1F0D\u1F0F\u1F11\u1F13\u1F15\u1F19\u1F1B\u1F1D\u1F21"+
\r
476 "\u1F23\u1F25\u1F27\u1F29\u1F2B\u1F2D\u1F2F\u1F31\u1F33\u1F35\u1F37\u1F39"+
\r
477 "\u1F3B\u1F3D\u1F3F\u1F41\u1F43\u1F45\u1F49\u1F4B\u1F4D\u1F51\u1F53\u1F55"+
\r
478 "\u1F57\u1F59\u1F5B\u1F5D\u1F5F\u1F61\u1F63\u1F65\u1F67\u1F69\u1F6B\u1F6D"+
\r
479 "\u1F6F\u1F81\u1F83\u1F85\u1F87\u1F89\u1F8B\u1F8D\u1F8F\u1F91\u1F93\u1F95"+
\r
480 "\u1F97\u1F99\u1F9B\u1F9D\u1F9F\u1FA1\u1FA3\u1FA5\u1FA7\u1FA9\u1FAB\u1FAD"+
\r
481 "\u1FAF-\u1FB1\u1FB8-\u1FB9\u1FD0-\u1FD1\u1FD8-\u1FD9\u1FE0-\u1FE1\u1FE5"+
\r
482 "\u1FE8-\u1FE9\u1FEC\u212A-\u212B\uE04D\uE064]"+
\r
483 "-[\uE000-\uE080 \u01E2\u01E3]& [[:latin:][:mark:]]]";
\r
485 public void TestDevanagariLatin() throws IOException {
\r
486 long start = System.currentTimeMillis();
\r
487 if (skipIfBeforeICU(4,5,0)) {
\r
488 logln("Warning: TestDevanagariLatin needs to be updated to remove delete the section marked [:Age=4.1:] filter");
\r
490 // We temporarily filter against Unicode 4.1, but we only do this
\r
491 // before version 3.4.
\r
492 errln("FAIL: TestDevanagariLatin needs to be updated to remove delete the [:Age=4.1:] filter ");
\r
495 new Test("Latin-DEVANAGARI", 50)
\r
496 .test(latinForIndic, "[[[:Devanagari:][\u094d][\u0964\u0965]]&[:Age=4.1:]]", "[\u0965\u0904]", this, new LegalIndic());
\r
497 showElapsed(start, "TestDevanagariLatin");
\r
500 private static final String [][] interIndicArray= new String[][]{
\r
501 new String [] { "BENGALI-DEVANAGARI",
\r
502 "[:BENGALI:]", "[:Devanagari:]",
\r
503 "[\u0904\u0951-\u0954\u0943-\u0949\u094a\u0962\u0963\u090D\u090e\u0911\u0912\u0929\u0933\u0934\u0935\u0950\u0958\u0959\u095a\u095b\u095e\u097d]", /*roundtrip exclusions*/
\r
505 new String [] { "DEVANAGARI-BENGALI",
\r
506 "[:Devanagari:]", "[:BENGALI:]",
\r
507 "[\u09D7\u090D\u090e\u0911\u0912\u0929\u0933\u0934\u0935\u0950\u0958\u0959\u095a\u095b\u095e\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
510 new String [] { "GURMUKHI-DEVANAGARI",
\r
511 "[:GURMUKHI:]", "[:Devanagari:]",
\r
512 "[\u0904\u0902\u0936\u0933\u0951-\u0954\u0902\u0903\u0943-\u0949\u094a\u0962\u0963\u090B\u090C\u090D\u090e\u0911\u0912\u0934\u0937\u093D\u0950\u0960\u0961\u097d]", /*roundtrip exclusions*/
\r
514 new String [] { "DEVANAGARI-GURMUKHI",
\r
515 "[:Devanagari:]", "[:GURMUKHI:]",
\r
516 "[\u0A02\u0946\u0A5C\u0951-\u0954\u0A70\u0A71\u090B\u090C\u090D\u090e\u0911\u0912\u0934\u0937\u093D\u0950\u0960\u0961\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/
\r
519 new String [] { "GUJARATI-DEVANAGARI",
\r
520 "[:GUJARATI:]", "[:Devanagari:]",
\r
521 "[\u0904\u0946\u094A\u0962\u0963\u0951-\u0954\u0961\u090c\u090e\u0912\u097d]", /*roundtrip exclusions*/
\r
523 new String [] { "DEVANAGARI-GUJARATI",
\r
524 "[:Devanagari:]", "[:GUJARATI:]",
\r
525 "[\u0951-\u0954\u0961\u090c\u090e\u0912]", /*roundtrip exclusions*/
\r
528 new String [] { "ORIYA-DEVANAGARI",
\r
529 "[:ORIYA:]", "[:Devanagari:]",
\r
530 "[\u0904\u0912\u0911\u090D\u090e\u0931\u0943-\u094a\u0962\u0963\u0951-\u0954\u0950\u097d]", /*roundtrip exclusions*/
\r
532 new String [] { "DEVANAGARI-ORIYA",
\r
533 "[:Devanagari:]", "[:ORIYA:]",
\r
534 "[\u0b5f\u0b56\u0b57\u0b70\u0b71\u0950\u090D\u090e\u0912\u0911\u0931]", /*roundtrip exclusions*/
\r
537 new String [] { "Tamil-DEVANAGARI",
\r
538 "[:tamil:]", "[:Devanagari:]",
\r
539 "[\u0901\u0904\u093c\u0943-\u094a\u0951-\u0954\u0962\u0963\u090B\u090C\u090D\u0911\u0916\u0917\u0918\u091B\u091D\u0920\u0921\u0922\u0925\u0926\u0927\u092B\u092C\u092D\u0936\u093d\u0950[\u0958-\u0961]\u097d]", /*roundtrip exclusions*/
\r
541 new String [] { "DEVANAGARI-Tamil",
\r
542 "[:Devanagari:]", "[:tamil:]",
\r
543 "[\u0bd7\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
546 new String [] { "Telugu-DEVANAGARI",
\r
547 "[:telugu:]", "[:Devanagari:]",
\r
548 "[\u0904\u093c\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/
\r
550 new String [] { "DEVANAGARI-TELUGU",
\r
551 "[:Devanagari:]", "[:TELUGU:]",
\r
552 "[\u0c55\u0c56\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
\r
555 new String [] { "KANNADA-DEVANAGARI",
\r
556 "[:KANNADA:]", "[:Devanagari:]",
\r
557 "[\u0901\u0904\u0946\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/
\r
559 new String [] { "DEVANAGARI-KANNADA",
\r
560 "[:Devanagari:]", "[:KANNADA:]",
\r
561 "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cde\u0cd5\u0cd6\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
\r
564 new String [] { "MALAYALAM-DEVANAGARI",
\r
565 "[:MALAYALAM:]", "[:Devanagari:]",
\r
566 "[\u0901\u0904\u094a\u094b\u094c\u093c\u0950\u0944\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]\u097d]", /*roundtrip exclusions*/
\r
568 new String [] { "DEVANAGARI-MALAYALAM",
\r
569 "[:Devanagari:]", "[:MALAYALAM:]",
\r
570 "[\u0d4c\u0d57\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
\r
573 new String [] { "GURMUKHI-BENGALI",
\r
574 "[:GURMUKHI:]", "[:BENGALI:]",
\r
575 "[\u0982\u09b6\u09e2\u09e3\u09c3\u09c4\u09d7\u098B\u098C\u09B7\u09E0\u09E1\u09F0\u09F1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
577 new String [] { "BENGALI-GURMUKHI",
\r
578 "[:BENGALI:]", "[:GURMUKHI:]",
\r
579 "[\u0A02\u0a5c\u0a47\u0a70\u0a71\u0A33\u0A35\u0A59\u0A5A\u0A5B\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
\r
582 new String [] { "GUJARATI-BENGALI",
\r
583 "[:GUJARATI:]", "[:BENGALI:]",
\r
584 "[\u09d7\u09e2\u09e3\u098c\u09e1\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
586 new String [] { "BENGALI-GUJARATI",
\r
587 "[:BENGALI:]", "[:GUJARATI:]",
\r
588 "[\u0A82\u0a83\u0Ac9\u0Ac5\u0ac7\u0A8D\u0A91\u0AB3\u0AB5\u0ABD\u0AD0]", /*roundtrip exclusions*/
\r
591 new String [] { "ORIYA-BENGALI",
\r
592 "[:ORIYA:]", "[:BENGALI:]",
\r
593 "[\u09c4\u09e2\u09e3\u09f0\u09f1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
595 new String [] { "BENGALI-ORIYA",
\r
596 "[:BENGALI:]", "[:ORIYA:]",
\r
597 "[\u0b35\u0b71\u0b5f\u0b56\u0b33\u0b3d]", /*roundtrip exclusions*/
\r
600 new String [] { "Tamil-BENGALI",
\r
601 "[:tamil:]", "[:BENGALI:]",
\r
602 "[\u0981\u09bc\u09c3\u09c4\u09e2\u09e3\u09f0\u09f1\u098B\u098C\u0996\u0997\u0998\u099B\u099D\u09A0\u09A1\u09A2\u09A5\u09A6\u09A7\u09AB\u09AC\u09AD\u09B6\u09DC\u09DD\u09DF\u09E0\u09E1\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
604 new String [] { "BENGALI-Tamil",
\r
605 "[:BENGALI:]", "[:tamil:]",
\r
606 "[\u0bc6\u0bc7\u0bca\u0B8E\u0B92\u0BA9\u0BB1\u0BB3\u0BB4\u0BB5\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
609 new String [] { "Telugu-BENGALI",
\r
610 "[:telugu:]", "[:BENGALI:]",
\r
611 "[\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
613 new String [] { "BENGALI-TELUGU",
\r
614 "[:BENGALI:]", "[:TELUGU:]",
\r
615 "[\u0c55\u0c56\u0c47\u0c46\u0c4a\u0C0E\u0C12\u0C31\u0C33\u0C35]", /*roundtrip exclusions*/
\r
618 new String [] { "KANNADA-BENGALI",
\r
619 "[:KANNADA:]", "[:BENGALI:]",
\r
620 "[\u0981\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
622 new String [] { "BENGALI-KANNADA",
\r
623 "[:BENGALI:]", "[:KANNADA:]",
\r
624 "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc6\u0cca\u0cd5\u0cd6\u0cc7\u0C8E\u0C92\u0CB1\u0cb3\u0cb5\u0cde]", /*roundtrip exclusions*/
\r
627 new String [] { "MALAYALAM-BENGALI",
\r
628 "[:MALAYALAM:]", "[:BENGALI:]",
\r
629 "[\u0981\u09e2\u09e3\u09bc\u09c4\u09f0\u09f1\u09dc\u09dd\u09df\u09f2-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
631 new String [] { "BENGALI-MALAYALAM",
\r
632 "[:BENGALI:]", "[:MALAYALAM:]",
\r
633 "[\u0d46\u0d4a\u0d47\u0d31-\u0d35\u0d0e\u0d12]", /*roundtrip exclusions*/
\r
636 new String [] { "GUJARATI-GURMUKHI",
\r
637 "[:GUJARATI:]", "[:GURMUKHI:]",
\r
638 "[\u0A02\u0ab3\u0ab6\u0A70\u0a71\u0a82\u0a83\u0ac3\u0ac4\u0ac5\u0ac9\u0a5c\u0a72\u0a73\u0a74\u0a8b\u0a8d\u0a91\u0abd]", /*roundtrip exclusions*/
\r
640 new String [] { "GURMUKHI-GUJARATI",
\r
641 "[:GURMUKHI:]", "[:GUJARATI:]",
\r
642 "[\u0a5c\u0A70\u0a71\u0a72\u0a73\u0a74\u0a82\u0a83\u0a8b\u0a8c\u0a8d\u0a91\u0ab3\u0ab6\u0ab7\u0abd\u0ac3\u0ac4\u0ac5\u0ac9\u0ad0\u0ae0\u0ae1]", /*roundtrip exclusions*/
\r
645 new String [] { "ORIYA-GURMUKHI",
\r
646 "[:ORIYA:]", "[:GURMUKHI:]",
\r
647 "[\u0A02\u0a5c\u0a21\u0a47\u0a71\u0b02\u0b03\u0b33\u0b36\u0b43\u0b56\u0b57\u0B0B\u0B0C\u0B37\u0B3D\u0B5F\u0B60\u0B61\u0a35\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/
\r
649 new String [] { "GURMUKHI-ORIYA",
\r
650 "[:GURMUKHI:]", "[:ORIYA:]",
\r
651 "[\u0a71\u0b02\u0b03\u0b33\u0b36\u0b43\u0b56\u0b57\u0B0B\u0B0C\u0B37\u0B3D\u0B5F\u0B60\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/
\r
654 new String [] { "TAMIL-GURMUKHI",
\r
655 "[:TAMIL:]", "[:GURMUKHI:]",
\r
656 "[\u0A01\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0a47\u0A16\u0A17\u0A18\u0A1B\u0A1D\u0A20\u0A21\u0A22\u0A25\u0A26\u0A27\u0A2B\u0A2C\u0A2D\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
\r
658 new String [] { "GURMUKHI-TAMIL",
\r
659 "[:GURMUKHI:]", "[:TAMIL:]",
\r
660 "[\u0b82\u0bc6\u0bca\u0bd7\u0bb7\u0bb3\u0b83\u0B8E\u0B92\u0BA9\u0BB1\u0BB4\u0bb6\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
663 new String [] { "TELUGU-GURMUKHI",
\r
664 "[:TELUGU:]", "[:GURMUKHI:]",
\r
665 "[\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
\r
667 new String [] { "GURMUKHI-TELUGU",
\r
668 "[:GURMUKHI:]", "[:TELUGU:]",
\r
669 "[\u0c02\u0c03\u0c33\u0c36\u0c44\u0c43\u0c46\u0c4a\u0c56\u0c55\u0C0B\u0C0C\u0C0E\u0C12\u0C31\u0C37\u0C60\u0C61]", /*roundtrip exclusions*/
\r
671 new String [] { "KANNADA-GURMUKHI",
\r
672 "[:KANNADA:]", "[:GURMUKHI:]",
\r
673 "[\u0A01\u0A02\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
\r
675 new String [] { "GURMUKHI-KANNADA",
\r
676 "[:GURMUKHI:]", "[:KANNADA:]",
\r
677 "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0c82\u0c83\u0cb3\u0cb6\u0cc4\u0cc3\u0cc6\u0cca\u0cd5\u0cd6\u0C8B\u0C8C\u0C8E\u0C92\u0CB1\u0CB7\u0cbd\u0CE0\u0CE1\u0cde]", /*roundtrip exclusions*/
\r
680 new String [] { "MALAYALAM-GURMUKHI",
\r
681 "[:MALAYALAM:]", "[:GURMUKHI:]",
\r
682 "[\u0A01\u0A02\u0a4b\u0a4c\u0a33\u0a36\u0a3c\u0a70\u0a71\u0A59\u0A5A\u0A5B\u0A5C\u0A5E\u0A72\u0A73\u0A74]", /*roundtrip exclusions*/
\r
684 new String [] { "GURMUKHI-MALAYALAM",
\r
685 "[:GURMUKHI:]", "[:MALAYALAM:]",
\r
686 "[\u0d02\u0d03\u0d33\u0d36\u0d43\u0d46\u0d4a\u0d4c\u0d57\u0D0B\u0D0C\u0D0E\u0D12\u0D31\u0D34\u0D37\u0D60\u0D61]", /*roundtrip exclusions*/
\r
689 new String [] { "GUJARATI-ORIYA",
\r
690 "[:GUJARATI:]", "[:ORIYA:]",
\r
691 "[\u0b56\u0b57\u0B0C\u0B5F\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/
\r
693 new String [] { "ORIYA-GUJARATI",
\r
694 "[:ORIYA:]", "[:GUJARATI:]",
\r
695 "[\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8D\u0A91\u0AB5\u0Ad0]", /*roundtrip exclusions*/
\r
698 new String [] { "TAMIL-GUJARATI",
\r
699 "[:TAMIL:]", "[:GUJARATI:]",
\r
700 "[\u0A81\u0a8c\u0abc\u0ac3\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8B\u0A8D\u0A91\u0A96\u0A97\u0A98\u0A9B\u0A9D\u0AA0\u0AA1\u0AA2\u0AA5\u0AA6\u0AA7\u0AAB\u0AAC\u0AAD\u0AB6\u0ABD\u0AD0\u0AE0\u0AE1]", /*roundtrip exclusions*/
\r
702 new String [] { "GUJARATI-TAMIL",
\r
703 "[:GUJARATI:]", "[:TAMIL:]",
\r
704 "[\u0Bc6\u0Bca\u0Bd7\u0B8E\u0B92\u0BA9\u0BB1\u0BB4\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
707 new String [] { "TELUGU-GUJARATI",
\r
708 "[:TELUGU:]", "[:GUJARATI:]",
\r
709 "[\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
\r
711 new String [] { "GUJARATI-TELUGU",
\r
712 "[:GUJARATI:]", "[:TELUGU:]",
\r
713 "[\u0c46\u0c4a\u0c55\u0c56\u0C0C\u0C0E\u0C12\u0C31\u0C61]", /*roundtrip exclusions*/
\r
716 new String [] { "KANNADA-GUJARATI",
\r
717 "[:KANNADA:]", "[:GUJARATI:]",
\r
718 "[\u0A81\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
\r
720 new String [] { "GUJARATI-KANNADA",
\r
721 "[:GUJARATI:]", "[:KANNADA:]",
\r
722 "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc6\u0cca\u0cd5\u0cd6\u0C8C\u0C8E\u0C92\u0CB1\u0CDE\u0CE1]", /*roundtrip exclusions*/
\r
725 new String [] { "MALAYALAM-GUJARATI",
\r
726 "[:MALAYALAM:]", "[:GUJARATI:]",
\r
727 "[\u0A81\u0ac4\u0acb\u0acc\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
\r
729 new String [] { "GUJARATI-MALAYALAM",
\r
730 "[:GUJARATI:]", "[:MALAYALAM:]",
\r
731 "[\u0d46\u0d4a\u0d4c\u0d55\u0d57\u0D0C\u0D0E\u0D12\u0D31\u0D34\u0D61]", /*roundtrip exclusions*/
\r
734 new String [] { "TAMIL-ORIYA",
\r
735 "[:TAMIL:]", "[:ORIYA:]",
\r
736 "[\u0B01\u0b3c\u0b43\u0b56\u0B0B\u0B0C\u0B16\u0B17\u0B18\u0B1B\u0B1D\u0B20\u0B21\u0B22\u0B25\u0B26\u0B27\u0B2B\u0B2C\u0B2D\u0B36\u0B3D\u0B5C\u0B5D\u0B5F\u0B60\u0B61\u0b70\u0b71]", /*roundtrip exclusions*/
\r
738 new String [] { "ORIYA-TAMIL",
\r
739 "[:ORIYA:]", "[:TAMIL:]",
\r
740 "[\u0bc6\u0bca\u0bc7\u0B8E\u0B92\u0BA9\u0BB1\u0BB4\u0BB5\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
743 new String [] { "TELUGU-ORIYA",
\r
744 "[:TELUGU:]", "[:ORIYA:]",
\r
745 "[\u0b3c\u0b57\u0b56\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/
\r
747 new String [] { "ORIYA-TELUGU",
\r
748 "[:ORIYA:]", "[:TELUGU:]",
\r
749 "[\u0c44\u0c46\u0c4a\u0c55\u0c47\u0C0E\u0C12\u0C31\u0C35]", /*roundtrip exclusions*/
\r
752 new String [] { "KANNADA-ORIYA",
\r
753 "[:KANNADA:]", "[:ORIYA:]",
\r
754 "[\u0B01\u0b3c\u0b57\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/
\r
756 new String [] { "ORIYA-KANNADA",
\r
757 "[:ORIYA:]", "[:KANNADA:]",
\r
758 "[{\u0cb0\u0cbc}{\u0cb3\u0cbc}\u0cc4\u0cc6\u0cca\u0cd5\u0cc7\u0C8E\u0C92\u0CB1\u0CB5\u0CDE]", /*roundtrip exclusions*/
\r
761 new String [] { "MALAYALAM-ORIYA",
\r
762 "[:MALAYALAM:]", "[:ORIYA:]",
\r
763 "[\u0B01\u0b3c\u0b56\u0B3D\u0B5C\u0B5D\u0B5F\u0b70\u0b71]", /*roundtrip exclusions*/
\r
765 new String [] { "ORIYA-MALAYALAM",
\r
766 "[:ORIYA:]", "[:MALAYALAM:]",
\r
767 "[\u0D47\u0D46\u0D4a\u0D0E\u0D12\u0D31\u0D34\u0D35]", /*roundtrip exclusions*/
\r
770 new String [] { "TELUGU-TAMIL",
\r
771 "[:TELUGU:]", "[:TAMIL:]",
\r
772 "[\u0bd7\u0ba9\u0bb4\u0BF0\u0BF1\u0BF2\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
774 new String [] { "TAMIL-TELUGU",
\r
775 "[:TAMIL:]", "[:TELUGU:]",
\r
776 "[\u0C01\u0c43\u0c44\u0c46\u0c47\u0c55\u0c56\u0c66\u0C0B\u0C0C\u0C16\u0C17\u0C18\u0C1B\u0C1D\u0C20\u0C21\u0C22\u0C25\u0C26\u0C27\u0C2B\u0C2C\u0C2D\u0C36\u0C60\u0C61]", /*roundtrip exclusions*/
\r
779 new String [] { "KANNADA-TAMIL",
\r
780 "[:KANNADA:]", "[:TAMIL:]",
\r
781 "[\u0bd7\u0bc6\u0ba9\u0bb4\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
783 new String [] { "TAMIL-KANNADA",
\r
784 "[:TAMIL:]", "[:KANNADA:]",
\r
785 "[\u0cc3\u0cc4\u0cc6\u0cc7\u0cd5\u0cd6\u0C8B\u0C8C\u0C96\u0C97\u0C98\u0C9B\u0C9D\u0CA0\u0CA1\u0CA2\u0CA5\u0CA6\u0CA7\u0CAB\u0CAC\u0CAD\u0CB6\u0cbc\u0cbd\u0CDE\u0CE0\u0CE1]", /*roundtrip exclusions*/
\r
788 new String [] { "MALAYALAM-TAMIL",
\r
789 "[:MALAYALAM:]", "[:TAMIL:]",
\r
790 "[\u0ba9\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
792 new String [] { "TAMIL-MALAYALAM",
\r
793 "[:TAMIL:]", "[:MALAYALAM:]",
\r
794 "[\u0d43\u0d12\u0D0B\u0D0C\u0D16\u0D17\u0D18\u0D1B\u0D1D\u0D20\u0D21\u0D22\u0D25\u0D26\u0D27\u0D2B\u0D2C\u0D2D\u0D36\u0D60\u0D61]", /*roundtrip exclusions*/
\r
797 new String [] { "KANNADA-TELUGU",
\r
798 "[:KANNADA:]", "[:TELUGU:]",
\r
799 "[\u0C01\u0c3f\u0c46\u0c48\u0c4a]", /*roundtrip exclusions*/
\r
801 new String [] { "TELUGU-KANNADA",
\r
802 "[:TELUGU:]", "[:KANNADA:]",
\r
803 "[\u0cc8\u0cd5\u0cd6\u0CDE\u0cbc\u0cbd]", /*roundtrip exclusions*/
\r
806 new String [] { "MALAYALAM-TELUGU",
\r
807 "[:MALAYALAM:]", "[:TELUGU:]",
\r
808 "[\u0C01\u0c44\u0c4a\u0c4c\u0c4b\u0c55\u0c56]", /*roundtrip exclusions*/
\r
810 new String [] { "TELUGU-MALAYALAM",
\r
811 "[:TELUGU:]", "[:MALAYALAM:]",
\r
812 "[\u0d4c\u0d57\u0D34]", /*roundtrip exclusions*/
\r
815 new String [] { "MALAYALAM-KANNADA",
\r
816 "[:MALAYALAM:]", "[:KANNADA:]",
\r
817 "[\u0cbc\u0cbd\u0cc4\u0cc6\u0cca\u0ccc\u0ccb\u0cd5\u0cd6\u0cDe]", /*roundtrip exclusions*/
\r
819 new String [] { "Latin-Bengali",
\r
820 latinForIndic, "[[:Bengali:][\u0964\u0965]]",
\r
821 "[\u0965\u09f0-\u09fa\u09ce]", /*roundtrip exclusions*/
\r
823 new String [] { "Latin-Gurmukhi",
\r
824 latinForIndic, "[[:Gurmukhi:][\u0964\u0965]]",
\r
825 "[\u0a01\u0a02\u0965\u0a72\u0a73\u0a74]", /*roundtrip exclusions*/
\r
827 new String [] { "Latin-Gujarati",
\r
828 latinForIndic, "[[:Gujarati:][\u0964\u0965]]",
\r
829 "[\u0965]", /*roundtrip exclusions*/
\r
831 new String [] { "Latin-Oriya",
\r
832 latinForIndic, "[[:Oriya:][\u0964\u0965]]",
\r
833 "[\u0965\u0b70]", /*roundtrip exclusions*/
\r
835 new String [] { "Latin-Tamil",
\r
836 latinForIndic, "[:Tamil:]",
\r
837 "[\u0BF0\u0BF1\u0BF2]", /*roundtrip exclusions*/
\r
839 new String [] { "Latin-Telugu",
\r
840 latinForIndic, "[:Telugu:]",
\r
841 null, /*roundtrip exclusions*/
\r
843 new String [] { "Latin-Kannada",
\r
844 latinForIndic, "[:Kannada:]",
\r
845 null, /*roundtrip exclusions*/
\r
847 new String [] { "Latin-Malayalam",
\r
848 latinForIndic, "[:Malayalam:]",
\r
849 null, /*roundtrip exclusions*/
\r
853 public void TestInterIndic() throws Exception{
\r
854 long start = System.currentTimeMillis();
\r
855 int num = interIndicArray.length;
\r
857 logln("Testing only 5 of "+ interIndicArray.length+" Skipping rest (use -e for exhaustive)");
\r
860 if (skipIfBeforeICU(4,5,0)) {
\r
861 logln("Warning: TestInterIndic needs to be updated to remove delete the section marked [:Age=4.1:] filter");
\r
863 // We temporarily filter against Unicode 4.1, but we only do this
\r
864 // before version 3.4.
\r
865 errln("FAIL: TestInterIndic needs to be updated to remove delete the [:Age=4.1:] filter ");
\r
868 for(int i=0; i<num;i++){
\r
869 logln("Testing " + interIndicArray[i][0] + " at index " + i );
\r
870 /*TODO: uncomment the line below when the transliterator is fixed
\r
871 new Test(interIndicArray[i][0], 50)
\r
872 .test(interIndicArray[i][1],
\r
873 interIndicArray[i][2],
\r
874 interIndicArray[i][3],
\r
875 this, new LegalIndic());
\r
877 /* comment lines below when transliterator is fixed */
\r
879 new Test(interIndicArray[i][0], 50)
\r
880 .test("["+interIndicArray[i][1]+" &[:Age=4.1:]]",
\r
881 "["+interIndicArray[i][2]+" &[:Age=4.1:]]",
\r
882 interIndicArray[i][3],
\r
883 this, new LegalIndic());
\r
886 showElapsed(start, "TestInterIndic");
\r
893 public static class Legal {
\r
894 public boolean is(String sourceString) {return true;}
\r
897 public static class LegalJamo extends Legal {
\r
898 // any initial must be followed by a medial (or initial)
\r
899 // any medial must follow an initial (or medial)
\r
900 // any final must follow a medial (or final)
\r
902 public boolean is(String sourceString) {
\r
905 String decomp = Normalizer.normalize(sourceString, Normalizer.NFD);
\r
906 for (int i = 0; i < decomp.length(); ++i) { // don't worry about surrogates
\r
907 switch (getType(decomp.charAt(i))) {
\r
909 t = getType(decomp.charAt(i+1));
\r
910 if (t != 0 && t != 1) return false;
\r
913 t = getType(decomp.charAt(i-1));
\r
914 if (t != 0 && t != 1) return false;
\r
917 t = getType(decomp.charAt(i-1));
\r
918 if (t != 1 && t != 2) return false;
\r
923 } catch (StringIndexOutOfBoundsException e) {
\r
928 public int getType(char c) {
\r
929 if ('\u1100' <= c && c <= '\u1112') return 0;
\r
930 else if ('\u1161' <= c && c <= '\u1175') return 1;
\r
931 else if ('\u11A8' <= c && c <= '\u11C2') return 2;
\r
932 return -1; // other
\r
936 //static BreakIterator thaiBreak = BreakIterator.getWordInstance(new Locale("th", "TH"));
\r
937 // anything is legal except word ending with Logical-order-exception
\r
938 public static class LegalThai extends Legal {
\r
939 public boolean is(String sourceString) {
\r
940 if (sourceString.length() == 0) return true;
\r
941 char ch = sourceString.charAt(sourceString.length() - 1); // don't worry about surrogates.
\r
942 if (UCharacter.hasBinaryProperty(ch, UProperty.LOGICAL_ORDER_EXCEPTION)) return false;
\r
945 // disallow anything with a wordbreak between
\r
947 if (UTF16.countCodePoint(sourceString) <= 1) return true;
\r
948 thaiBreak.setText(sourceString);
\r
949 for (int pos = thaiBreak.first(); pos != BreakIterator.DONE; pos = thaiBreak.next()) {
\r
950 if (pos > 0 && pos < sourceString.length()) {
\r
951 System.out.println("Skipping " + Utility.escape(sourceString));
\r
960 // anything is legal except that a Final letter can't be followed by a letter, and a NonFinal letter must be followed by one
\r
961 public static class LegalHebrew extends Legal {
\r
962 static UnicodeSet FINAL = new UnicodeSet("[\u05DA\u05DD\u05DF\u05E3\u05E5]");
\r
963 static UnicodeSet NON_FINAL = new UnicodeSet("[\u05DB\u05DE\u05E0\u05E4\u05E6]");
\r
964 static UnicodeSet LETTER = new UnicodeSet("[:letter:]");
\r
965 public boolean is(String sourceString) {
\r
966 if (sourceString.length() == 0) return true;
\r
967 // don't worry about surrogates.
\r
968 for (int i = 0; i < sourceString.length(); ++i) {
\r
969 char ch = sourceString.charAt(i);
\r
970 char next = i+1 == sourceString.length() ? '\u0000' : sourceString.charAt(i);
\r
971 if (FINAL.contains(ch)) {
\r
972 if (LETTER.contains(next)) return false;
\r
973 } else if (NON_FINAL.contains(ch)) {
\r
974 if (!LETTER.contains(next)) return false;
\r
982 public static class LegalGreek extends Legal {
\r
986 public LegalGreek(boolean full) {
\r
990 static final char IOTA_SUBSCRIPT = '\u0345';
\r
991 static final UnicodeSet breathing = new UnicodeSet("[\\u0313\\u0314']");
\r
992 static final UnicodeSet validSecondVowel = new UnicodeSet("[\\u03C5\\u03B9\\u03A5\\u0399]");
\r
994 public static boolean isVowel(char c) {
\r
995 return "\u03B1\u03B5\u03B7\u03B9\u03BF\u03C5\u03C9\u0391\u0395\u0397\u0399\u039F\u03A5\u03A9".indexOf(c) >= 0;
\r
998 public static boolean isRho(char c) {
\r
999 return "\u03C1\u03A1".indexOf(c) >= 0;
\r
1002 public boolean is(String sourceString) {
\r
1004 String decomp = Normalizer.normalize(sourceString, Normalizer.NFD);
\r
1006 // modern is simpler: don't care about anything but a grave
\r
1008 //if (sourceString.equals("\u039C\u03C0")) return false;
\r
1009 for (int i = 0; i < decomp.length(); ++i) {
\r
1010 char c = decomp.charAt(i);
\r
1011 // exclude all the accents
\r
1012 if (c == '\u0313' || c == '\u0314' || c == '\u0300' || c == '\u0302'
\r
1013 || c == '\u0342' || c == '\u0345'
\r
1019 // Legal full Greek has breathing marks IFF there is a vowel or RHO at the start
\r
1020 // IF it has them, it has exactly one.
\r
1021 // IF it starts with a RHO, then the breathing mark must come before the second letter.
\r
1022 // IF it starts with a vowel, then it must before the third letter.
\r
1023 // it will only come after the second if of the format [vowel] [no iota subscript!] [upsilon or iota]
\r
1024 // Since there are no surrogates in greek, don't worry about them
\r
1026 boolean firstIsVowel = false;
\r
1027 boolean firstIsRho = false;
\r
1028 boolean noLetterYet = true;
\r
1029 int breathingCount = 0;
\r
1030 int letterCount = 0;
\r
1031 //int breathingPosition = -1;
\r
1033 for (int i = 0; i < decomp.length(); ++i) {
\r
1034 char c = decomp.charAt(i);
\r
1035 if (UCharacter.isLetter(c)) {
\r
1037 if (firstIsVowel && !validSecondVowel.contains(c) && breathingCount == 0) return false;
\r
1038 if (noLetterYet) {
\r
1039 noLetterYet = false;
\r
1040 firstIsVowel = isVowel(c);
\r
1041 firstIsRho = isRho(c);
\r
1043 if (firstIsRho && letterCount == 2 && breathingCount == 0) return false;
\r
1045 if (c == IOTA_SUBSCRIPT && firstIsVowel && breathingCount == 0) return false;
\r
1046 if (breathing.contains(c)) {
\r
1047 // breathingPosition = i;
\r
1052 if (firstIsVowel || firstIsRho) return breathingCount == 1;
\r
1053 return breathingCount == 0;
\r
1054 } catch (Throwable t) {
\r
1055 System.out.println(t.getClass().getName() + " " + t.getMessage());
\r
1061 static class Test {
\r
1065 private String transliteratorID;
\r
1066 private int errorLimit = 500;
\r
1067 private int errorCount = 0;
\r
1068 private long pairLimit = 1000000; // make default be 1M.
\r
1069 private int density = 100;
\r
1070 UnicodeSet sourceRange;
\r
1071 UnicodeSet targetRange;
\r
1072 UnicodeSet toSource;
\r
1073 UnicodeSet toTarget;
\r
1074 UnicodeSet roundtripExclusions;
\r
1076 RoundTripTest log;
\r
1077 Legal legalSource;
\r
1078 UnicodeSet badCharacters;
\r
1081 * create a test for the given script transliterator.
\r
1083 Test(String transliteratorID) {
\r
1084 this(transliteratorID, 100);
\r
1087 Test(String transliteratorID, int dens) {
\r
1088 this.transliteratorID = transliteratorID;
\r
1089 this.density = dens;
\r
1092 public void setErrorLimit(int limit) {
\r
1093 errorLimit = limit;
\r
1096 public void setPairLimit(int limit) {
\r
1097 pairLimit = limit;
\r
1100 // Added to do better equality check.
\r
1102 public static boolean isSame(String a, String b) {
\r
1103 if (a.equals(b)) return true;
\r
1104 if (a.equalsIgnoreCase(b) && isCamel(a)) return true;
\r
1105 a = Normalizer.normalize(a, Normalizer.NFD);
\r
1106 b = Normalizer.normalize(b, Normalizer.NFD);
\r
1107 if (a.equals(b)) return true;
\r
1108 if (a.equalsIgnoreCase(b) && isCamel(a)) return true;
\r
1113 public boolean includesSome(UnicodeSet set, String a) {
\r
1115 for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) {
\r
1116 cp = UTF16.charAt(a, i);
\r
1117 if (set.contains(cp)) return true;
\r
1123 public static boolean isCamel(String a) {
\r
1124 //System.out.println("CamelTest");
\r
1125 // see if string is of the form aB; e.g. lower, then upper or title
\r
1127 boolean haveLower = false;
\r
1128 for (int i = 0; i < a.length(); i += UTF16.getCharCount(cp)) {
\r
1129 cp = UTF16.charAt(a, i);
\r
1130 int t = UCharacter.getType(cp);
\r
1131 //System.out.println("\t" + t + " " + Integer.toString(cp,16) + " " + UCharacter.getName(cp));
\r
1133 case Character.UPPERCASE_LETTER:
\r
1134 if (haveLower) return true;
\r
1136 case Character.TITLECASE_LETTER:
\r
1137 if (haveLower) return true;
\r
1138 // drop through, since second letter is lower.
\r
1139 case Character.LOWERCASE_LETTER:
\r
1144 //System.out.println("FALSE");
\r
1148 static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
\r
1149 static final UnicodeSet neverOk = new UnicodeSet("[:Other:]");
\r
1151 public void test(String srcRange, String trgtRange,
\r
1152 String rdtripExclusions, RoundTripTest logger, Legal legalSrc)
\r
1153 throws java.io.IOException {
\r
1154 test(srcRange, trgtRange, srcRange, rdtripExclusions, logger, legalSrc);
\r
1159 * that everything in sourceRange maps to targetRange,
\r
1160 * that everything in targetRange maps to backtoSourceRange
\r
1161 * that everything roundtrips from target -> source -> target, except roundtripExceptions
\r
1163 public void test(String srcRange, String trgtRange, String backtoSourceRange,
\r
1164 String rdtripExclusions, RoundTripTest logger, Legal legalSrc)
\r
1165 throws java.io.IOException {
\r
1167 legalSource = legalSrc;
\r
1168 sourceRange = new UnicodeSet(srcRange);
\r
1169 sourceRange.removeAll(neverOk);
\r
1171 targetRange = new UnicodeSet(trgtRange);
\r
1172 targetRange.removeAll(neverOk);
\r
1174 toSource = new UnicodeSet(backtoSourceRange);
\r
1175 toSource.addAll(okAnyway);
\r
1177 toTarget = new UnicodeSet(trgtRange);
\r
1178 toTarget.addAll(okAnyway);
\r
1180 if (rdtripExclusions != null && rdtripExclusions.length() > 0) {
\r
1181 roundtripExclusions = new UnicodeSet(rdtripExclusions);
\r
1183 roundtripExclusions = new UnicodeSet(); // empty
\r
1188 log.logln(Utility.escape("Source: " + sourceRange));
\r
1189 log.logln(Utility.escape("Target: " + targetRange));
\r
1190 log.logln(Utility.escape("Exclude: " + roundtripExclusions));
\r
1191 if (log.isQuick()) log.logln("Abbreviated Test");
\r
1193 badCharacters = new UnicodeSet("[:other:]");
\r
1195 // make a UTF-8 output file we can read with a browser
\r
1197 // note: check that every transliterator transliterates the null string correctly!
\r
1199 // {dlf} reorganize so can run test in protected security environment
\r
1200 // String logFileName = "test_" + transliteratorID.replace('/', '_') + ".html";
\r
1202 // File lf = new File(logFileName);
\r
1203 // log.logln("Creating log file " + lf.getAbsoluteFile());
\r
1205 // out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
\r
1206 // new FileOutputStream(logFileName), "UTF8"), 4*1024));
\r
1208 ByteArrayOutputStream bast = new ByteArrayOutputStream();
\r
1209 out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
\r
1210 bast, "UTF8"), 4*1024));
\r
1211 //out.write('\uFFEF'); // BOM
\r
1212 out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
\r
1213 out.println("<HTML><HEAD>");
\r
1214 out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
\r
1215 out.println("<BODY bgcolor='#FFFFFF' style='font-family: Arial Unicode MS'>");
\r
1219 } catch (TestTruncated e) {
\r
1220 out.println(e.getMessage());
\r
1222 out.println("</BODY></HTML>");
\r
1225 if (errorCount > 0) {
\r
1227 File translitErrorDirectory = new File("translitErrorLogs");
\r
1228 if (!translitErrorDirectory.exists()) {
\r
1229 translitErrorDirectory.mkdir();
\r
1231 String logFileName = "translitErrorLogs/test_" + transliteratorID.replace('/', '_') + ".html";
\r
1232 File lf = new File(logFileName);
\r
1233 logger.logln("Creating log file " + lf.getAbsoluteFile());
\r
1234 FileOutputStream fos = new FileOutputStream(lf);
\r
1235 fos.write(bast.toByteArray());
\r
1237 logger.errln(transliteratorID + " errors: "
\r
1238 + errorCount + (errorCount > errorLimit ? " (at least!)" : "")
\r
1239 + ", see " + lf.getAbsoluteFile());
\r
1241 catch (SecurityException e) {
\r
1242 logger.errln(transliteratorID + " errors: "
\r
1243 + errorCount + (errorCount > errorLimit ? " (at least!)" : "")
\r
1244 + ", no log provided due to protected test domain");
\r
1247 logger.logln(transliteratorID + " ok");
\r
1248 // new File(logFileName).delete();
\r
1252 // ok if at least one is not equal
\r
1253 public boolean checkIrrelevants(Transliterator t, String irrelevants) {
\r
1254 for (int i = 0; i < irrelevants.length(); ++i) {
\r
1255 char c = irrelevants.charAt(i);
\r
1256 String cs = UTF16.valueOf(c);
\r
1257 String targ = t.transliterate(cs);
\r
1258 if (cs.equals(targ)) return true;
\r
1263 AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
\r
1264 AbbreviatedUnicodeSetIterator usi2 = new AbbreviatedUnicodeSetIterator();
\r
1266 Transliterator sourceToTarget;
\r
1267 Transliterator targetToSource;
\r
1269 public void test2() {
\r
1271 sourceToTarget = Transliterator.getInstance(transliteratorID);
\r
1272 targetToSource = sourceToTarget.getInverse();
\r
1274 log.logln("Checking that at least one irrevant characters is not NFC'ed");
\r
1275 out.println("<h3>Checking that at least one irrevant characters is not NFC'ed</h3>");
\r
1277 String irrelevants = "\u2000\u2001\u2126\u212A\u212B\u2329"; // string is from NFC_NO in the UCD
\r
1279 if (!checkIrrelevants(sourceToTarget, irrelevants)) {
\r
1280 logFails("" + getSourceTarget(transliteratorID) + ", Must not NFC everything");
\r
1282 if (!checkIrrelevants(targetToSource, irrelevants)) {
\r
1283 logFails("" + getTargetSource(transliteratorID) + ", irrelevants");
\r
1286 if (EXTRA_TESTS) {
\r
1287 log.logln("Checking that toRules works");
\r
1288 String rules = "";
\r
1289 Transliterator sourceToTarget2;
\r
1290 Transliterator targetToSource2;
\r
1292 rules = sourceToTarget.toRules(false);
\r
1293 sourceToTarget2 = Transliterator.createFromRules("s2t2", rules, Transliterator.FORWARD);
\r
1294 if (PRINT_RULES) {
\r
1295 out.println("<h3>Forward Rules:</h3><p>");
\r
1296 out.println(TestUtility.replace(rules, "\n", "\u200E<br>\n\u200E"));
\r
1297 out.println("</p>");
\r
1299 rules = targetToSource.toRules(false);
\r
1300 targetToSource2 = Transliterator.createFromRules("t2s2", rules, Transliterator.FORWARD);
\r
1301 if (PRINT_RULES) {
\r
1302 out.println("<h3>Backward Rules:</h3><p>");
\r
1303 out.println(TestUtility.replace(rules, "\n", "\u200E<br>\n\u200E"));
\r
1304 out.println("</p>");
\r
1306 } catch (RuntimeException e) {
\r
1307 out.println("<h3>Broken Rules:</h3><p>");
\r
1308 out.println(TestUtility.replace(rules, "\n", "<br>\n"));
\r
1309 out.println("</p>");
\r
1314 out.println("<h3>Roundtrip Exclusions: " + new UnicodeSet(roundtripExclusions) + "</h3>");
\r
1317 checkSourceTargetSource(sourceToTarget2);
\r
1319 checkTargetSourceTarget(targetToSource2);
\r
1322 UnicodeSet failSourceTarg = new UnicodeSet();
\r
1325 checkSourceTargetSingles(failSourceTarg);
\r
1327 boolean quickRt = checkSourceTargetDoubles(failSourceTarg);
\r
1329 UnicodeSet failTargSource = new UnicodeSet();
\r
1330 UnicodeSet failRound = new UnicodeSet();
\r
1332 checkTargetSourceSingles(failTargSource, failRound);
\r
1333 checkTargetSourceDoubles(quickRt, failTargSource, failRound);
\r
1336 private void checkSourceTargetSource(Transliterator sourceToTarget2) {
\r
1337 log.logln("Checking that source -> target -> source");
\r
1338 out.println("<h3>Checking that source -> target -> source</h3>");
\r
1340 usi.reset(sourceRange);
\r
1341 while (usi.next()) {
\r
1342 int c = usi.codepoint;
\r
1344 String cs = UTF16.valueOf(c);
\r
1345 String targ = sourceToTarget.transliterate(cs);
\r
1346 String targ2 = sourceToTarget2.transliterate(cs);
\r
1347 if (!targ.equals(targ2)) {
\r
1348 logToRulesFails("" + getSourceTarget(transliteratorID) + ", toRules", cs, targ, targ2);
\r
1353 private void checkTargetSourceTarget(Transliterator targetToSource2) {
\r
1354 log.logln("Checking that target -> source -> target");
\r
1355 out.println("<h3>Checking that target -> source -> target</h3>");
\r
1356 usi.reset(targetRange);
\r
1357 while (usi.next()) {
\r
1358 int c = usi.codepoint;
\r
1360 String cs = UTF16.valueOf(c);
\r
1361 String targ = targetToSource.transliterate(cs);
\r
1362 String targ2 = targetToSource2.transliterate(cs);
\r
1363 if (!targ.equals(targ2)) {
\r
1364 logToRulesFails("" + getTargetSource(transliteratorID) + ", toRules", cs, targ, targ2);
\r
1369 private void checkSourceTargetSingles(UnicodeSet failSourceTarg) {
\r
1370 log.logln("Checking that source characters convert to target - Singles");
\r
1371 out.println("<h3>Checking that source characters convert to target - Singles</h3>");
\r
1375 for (char c = 0; c < 0xFFFF; ++c) {
\r
1376 if (!sourceRange.contains(c)) continue;
\r
1378 usi.reset(sourceRange);
\r
1379 while (usi.next()) {
\r
1380 int c = usi.codepoint;
\r
1382 String cs = UTF16.valueOf(c);
\r
1383 String targ = sourceToTarget.transliterate(cs);
\r
1384 if (!toTarget.containsAll(targ)
\r
1385 || badCharacters.containsSome(targ)) {
\r
1386 String targD = Normalizer.normalize(targ, Normalizer.NFD);
\r
1387 if (!toTarget.containsAll(targD)
\r
1388 || badCharacters.containsSome(targD)) {
\r
1389 logWrongScript("" + getSourceTarget(transliteratorID) + "", cs, targ, toTarget, badCharacters);
\r
1390 failSourceTarg.add(c);
\r
1395 String cs2 = Normalizer.normalize(cs, Normalizer.NFD);
\r
1396 String targ2 = sourceToTarget.transliterate(cs2);
\r
1397 if (!targ.equals(targ2)) {
\r
1398 logNotCanonical("" + getSourceTarget(transliteratorID) + "", cs, targ, cs2, targ2);
\r
1403 private boolean checkSourceTargetDoubles(UnicodeSet failSourceTarg) {
\r
1404 log.logln("Checking that source characters convert to target - Doubles");
\r
1405 out.println("<h3>Checking that source characters convert to target - Doubles</h3>");
\r
1409 for (char c = 0; c < 0xFFFF; ++c) {
\r
1410 if (TestUtility.isUnassigned(c) ||
\r
1411 !sourceRange.contains(c)) continue;
\r
1412 if (failSourceTarg.get(c)) continue;
\r
1416 UnicodeSet sourceRangeMinusFailures = new UnicodeSet(sourceRange);
\r
1417 sourceRangeMinusFailures.removeAll(failSourceTarg);
\r
1419 boolean quickRt = log.getInclusion() < 10;
\r
1421 usi.reset(sourceRangeMinusFailures, quickRt, density);
\r
1423 while (usi.next()) {
\r
1424 int c = usi.codepoint;
\r
1427 for (char d = 0; d < 0xFFFF; ++d) {
\r
1428 if (TestUtility.isUnassigned(d) ||
\r
1429 !sourceRange.contains(d)) continue;
\r
1430 if (failSourceTarg.get(d)) continue;
\r
1432 log.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));
\r
1433 usi2.reset(sourceRangeMinusFailures, quickRt, density);
\r
1435 while (usi2.next()) {
\r
1436 int d = usi2.codepoint;
\r
1439 String cs = UTF16.valueOf(c) + UTF16.valueOf(d);
\r
1440 String targ = sourceToTarget.transliterate(cs);
\r
1441 if (!toTarget.containsAll(targ)
\r
1442 || badCharacters.containsSome(targ)) {
\r
1443 String targD = Normalizer.normalize(targ, Normalizer.NFD);
\r
1444 if (!toTarget.containsAll(targD)
\r
1445 || badCharacters.containsSome(targD)) {
\r
1446 logWrongScript("" + getSourceTarget(transliteratorID) + "", cs, targ, toTarget, badCharacters);
\r
1450 String cs2 = Normalizer.normalize(cs, Normalizer.NFD);
\r
1451 String targ2 = sourceToTarget.transliterate(cs2);
\r
1452 if (!targ.equals(targ2)) {
\r
1453 logNotCanonical("" + getSourceTarget(transliteratorID) + "", cs, targ, cs2, targ2);
\r
1460 void checkTargetSourceSingles(UnicodeSet failTargSource, UnicodeSet failRound) {
\r
1461 log.logln("Checking that target characters convert to source and back - Singles");
\r
1462 out.println("<h3>Checking that target characters convert to source and back - Singles</h3>");
\r
1465 /*for (char c = 0; c < 0xFFFF; ++c) {
\r
1466 if (TestUtility.isUnassigned(c) ||
\r
1467 !targetRange.contains(c)) continue;
\r
1470 usi.reset(targetRange);
\r
1471 while (usi.next()) {
\r
1474 if(usi.codepoint == UnicodeSetIterator.IS_STRING){
\r
1476 c = UTF16.charAt(cs,0);
\r
1478 c = usi.codepoint;
\r
1479 cs =UTF16.valueOf(c);
\r
1482 String targ = targetToSource.transliterate(cs);
\r
1483 String reverse = sourceToTarget.transliterate(targ);
\r
1485 if (!toSource.containsAll(targ)
\r
1486 || badCharacters.containsSome(targ)) {
\r
1487 String targD = Normalizer.normalize(targ, Normalizer.NFD);
\r
1488 if (!toSource.containsAll(targD)
\r
1489 || badCharacters.containsSome(targD)) {
\r
1490 /*UnicodeSet temp = */new UnicodeSet().addAll(targD);
\r
1491 logWrongScript("" + getTargetSource(transliteratorID) + "", cs, targ, toSource, badCharacters);
\r
1492 failTargSource.add(cs);
\r
1496 if (!isSame(cs, reverse) && !roundtripExclusions.contains(c)
\r
1497 && !roundtripExclusions.contains(cs)) {
\r
1498 logRoundTripFailure(cs,targetToSource.getID(), targ,sourceToTarget.getID(), reverse);
\r
1502 String targ2 = Normalizer.normalize(targ, Normalizer.NFD);
\r
1503 String reverse2 = sourceToTarget.transliterate(targ2);
\r
1504 if (!reverse.equals(reverse2)) {
\r
1505 logNotCanonical("" + getTargetSource(transliteratorID) + "", targ, reverse, targ2, reverse2);
\r
1511 private void checkTargetSourceDoubles(boolean quickRt, UnicodeSet failTargSource,
\r
1512 UnicodeSet failRound) {
\r
1513 log.logln("Checking that target characters convert to source and back - Doubles");
\r
1514 out.println("<h3>Checking that target characters convert to source and back - Doubles</h3>");
\r
1517 UnicodeSet targetRangeMinusFailures = new UnicodeSet(targetRange);
\r
1518 targetRangeMinusFailures.removeAll(failTargSource);
\r
1519 targetRangeMinusFailures.removeAll(failRound);
\r
1521 //char[] buf = new char[4]; // maximum we can have with 2 code points
\r
1523 for (char c = 0; c < 0xFFFF; ++c) {
\r
1524 if (TestUtility.isUnassigned(c) ||
\r
1525 !targetRange.contains(c)) continue;
\r
1528 usi.reset(targetRangeMinusFailures, quickRt, density);
\r
1530 while (usi.next()) {
\r
1531 int c = usi.codepoint;
\r
1533 //log.log(TestUtility.hex(c));
\r
1536 for (char d = 0; d < 0xFFFF; ++d) {
\r
1537 if (TestUtility.isUnassigned(d) ||
\r
1538 !targetRange.contains(d)) continue;
\r
1540 log.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));
\r
1541 usi2.reset(targetRangeMinusFailures, quickRt, density);
\r
1543 while (usi2.next()) {
\r
1545 int d = usi2.codepoint;
\r
1548 if (++count > pairLimit) {
\r
1549 throw new TestTruncated("Test truncated at " + pairLimit);
\r
1552 String cs = UTF16.valueOf(c) + UTF16.valueOf(d);
\r
1553 String targ = targetToSource.transliterate(cs);
\r
1554 String reverse = sourceToTarget.transliterate(targ);
\r
1556 if (!toSource.containsAll(targ) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/
\r
1557 || badCharacters.containsSome(targ)) {
\r
1558 String targD = Normalizer.normalize(targ, Normalizer.NFD);
\r
1559 if (!toSource.containsAll(targD) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/
\r
1560 || badCharacters.containsSome(targD)) {
\r
1561 logWrongScript("" + getTargetSource(transliteratorID) + "", cs, targ, toSource, badCharacters);
\r
1565 if (!isSame(cs, reverse) /*&& !failRound.contains(c) && !failRound.contains(d)*/
\r
1566 && !roundtripExclusions.contains(c)
\r
1567 && !roundtripExclusions.contains(d)
\r
1568 && !roundtripExclusions.contains(cs)) {
\r
1569 logRoundTripFailure(cs,targetToSource.getID(), targ,sourceToTarget.getID(), reverse);
\r
1572 String targ2 = Normalizer.normalize(targ, Normalizer.NFD);
\r
1573 String reverse2 = sourceToTarget.transliterate(targ2);
\r
1574 if (!reverse.equals(reverse2)) {
\r
1575 logNotCanonical("" + getTargetSource(transliteratorID) + "", targ, reverse, targ2, reverse2);
\r
1583 * @param transliteratorID2
\r
1586 private String getTargetSource(String transliteratorID2) {
\r
1587 return "Target-Source [" + transliteratorID2 + "]";
\r
1591 * @param transliteratorID2
\r
1594 private String getSourceTarget(String transliteratorID2) {
\r
1595 return "Source-Target [" + transliteratorID2 + "]";
\r
1598 final String info(String s) {
\r
1599 StringBuffer result = new StringBuffer();
\r
1600 result.append("\u200E").append(s).append("\u200E (").append(TestUtility.hex(s)).append("/");
\r
1601 if (false) { // append age, as a check
\r
1603 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
\r
1604 cp = UTF16.charAt(s, i);
\r
1605 if (i > 0) result.append(", ");
\r
1606 result.append(UCharacter.getAge(cp));
\r
1609 result.append(")");
\r
1610 return result.toString();
\r
1613 final void logWrongScript(String label, String from, String to,
\r
1614 UnicodeSet shouldContainAll, UnicodeSet shouldNotContainAny) {
\r
1615 if (++errorCount > errorLimit) {
\r
1616 throw new TestTruncated("Test truncated; too many failures");
\r
1618 String toD = Normalizer.normalize(to, Normalizer.NFD);
\r
1619 UnicodeSet temp = new UnicodeSet().addAll(toD);
\r
1620 UnicodeSet bad = new UnicodeSet(shouldNotContainAny).retainAll(temp)
\r
1621 .addAll(new UnicodeSet(temp).removeAll(shouldContainAll));
\r
1623 out.println("<br>Fail " + label + ": " +
\r
1624 info(from) + " => " + info(to) + " " + bad
\r
1628 final void logNotCanonical(String label, String from, String to, String fromCan, String toCan) {
\r
1629 if (++errorCount > errorLimit) {
\r
1630 throw new TestTruncated("Test truncated; too many failures");
\r
1632 out.println("<br>Fail (can.equiv) " + label + ": " +
\r
1633 info(from) + " => " + info(to) +
\r
1635 info(fromCan) + " => " + info(toCan) + ")"
\r
1639 final void logFails(String label) {
\r
1640 if (++errorCount > errorLimit) {
\r
1641 throw new TestTruncated("Test truncated; too many failures");
\r
1643 out.println("<br>Fail (can.equiv)" + label);
\r
1646 final void logToRulesFails(String label, String from, String to, String toCan) {
\r
1647 if (++errorCount > errorLimit) {
\r
1648 throw new TestTruncated("Test truncated; too many failures");
\r
1650 out.println("<br>Fail " + label + ": " +
\r
1651 info(from) + " => " + info(to) + ", " + info(toCan)
\r
1655 final void logRoundTripFailure(String from,String toID, String to,String backID, String back) {
\r
1656 if (!legalSource.is(from)) return; // skip illegals
\r
1658 if (++errorCount > errorLimit) {
\r
1659 throw new TestTruncated("Test truncated; too many failures");
\r
1661 out.println("<br>Fail Roundtrip: " +
\r
1662 info(from) + " "+toID+" => " + info(to) + " " + backID+" => " + info(back)
\r
1667 * Characters to filter for source-target mapping completeness
\r
1668 * Typically is base alphabet, minus extended characters
\r
1669 * Default is ASCII letters for Latin
\r
1672 public boolean isSource(char c) {
\r
1673 if (!sourceRange.contains(c)) return false;
\r
1679 * Characters to check for target back to source mapping.
\r
1680 * Typically the same as the target script, plus punctuation
\r
1683 public boolean isReceivingSource(char c) {
\r
1684 if (!targetRange.contains(c)) return false;
\r
1689 * Characters to filter for target-source mapping
\r
1690 * Typically is base alphabet, minus extended characters
\r
1693 public boolean isTarget(char c) {
\r
1694 byte script = TestUtility.getScript(c);
\r
1695 if (script != targetScript) return false;
\r
1696 if (!TestUtility.isLetter(c)) return false;
\r
1697 if (targetRange != null && !targetRange.contains(c)) return false;
\r
1703 * Characters to check for target-source mapping
\r
1704 * Typically the same as the source script, plus punctuation
\r
1707 public boolean isReceivingTarget(char c) {
\r
1708 byte script = TestUtility.getScript(c);
\r
1709 return (script == targetScript || script == TestUtility.COMMON_SCRIPT);
\r
1712 final boolean isSource(String s) {
\r
1713 for (int i = 0; i < s.length(); ++i) {
\r
1714 if (!isSource(s.charAt(i))) return false;
\r
1719 final boolean isTarget(String s) {
\r
1720 for (int i = 0; i < s.length(); ++i) {
\r
1721 if (!isTarget(s.charAt(i))) return false;
\r
1726 final boolean isReceivingSource(String s) {
\r
1727 for (int i = 0; i < s.length(); ++i) {
\r
1728 if (!isReceivingSource(s.charAt(i))) return false;
\r
1733 final boolean isReceivingTarget(String s) {
\r
1734 for (int i = 0; i < s.length(); ++i) {
\r
1735 if (!isReceivingTarget(s.charAt(i))) return false;
\r
1741 static class TestTruncated extends RuntimeException {
\r
1743 * For serialization
\r
1745 private static final long serialVersionUID = 3361828190488168323L;
\r
1747 TestTruncated(String msg) {
\r
1753 // static class TestHangul extends Test {
\r
1754 // TestHangul () {
\r
1755 // super("Jamo-Hangul", TestUtility.JAMO_SCRIPT, TestUtility.HANGUL_SCRIPT);
\r
1758 // public boolean isSource(char c) {
\r
1759 // if (0x1113 <= c && c <= 0x1160) return false;
\r
1760 // if (0x1176 <= c && c <= 0x11F9) return false;
\r
1761 // if (0x3131 <= c && c <= 0x318E) return false;
\r
1762 // return super.isSource(c);
\r