/* ******************************************************************************* * Copyright (C) 2002-2010, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ /** * Port From: ICU4C v2.1 : collate/CollationRegressionTest * Source File: $ICU4CRoot/source/test/intltest/regcoll.cpp **/ package com.ibm.icu.dev.test.collator; import java.io.BufferedReader; import java.io.IOException; import java.util.Arrays; import java.util.Comparator; import java.util.Locale; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.TestUtil; import com.ibm.icu.text.CollationElementIterator; import com.ibm.icu.text.CollationKey; import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; public class CollationThaiTest extends TestFmwk { final int MAX_FAILURES_TO_SHOW = -1; public static void main(String[] args) throws Exception { new CollationThaiTest().run(args); } /** * Odd corner conditions taken from "How to Sort Thai Without Rewriting Sort", * by Doug Cooper, http://seasrc.th.net/paper/thaisort.zip */ public void TestCornerCases() { String TESTS[] = { // Shorter words precede longer "\u0e01", "<", "\u0e01\u0e01", // Tone marks are considered after letters (i.e. are primary ignorable) "\u0e01\u0e32", "<", "\u0e01\u0e49\u0e32", // ditto for other over-marks "\u0e01\u0e32", "<", "\u0e01\u0e32\u0e4c", // commonly used mark-in-context order. // In effect, marks are sorted after each syllable. "\u0e01\u0e32\u0e01\u0e49\u0e32", "<", "\u0e01\u0e48\u0e32\u0e01\u0e49\u0e32", // Hyphens and other punctuation follow whitespace but come before letters "\u0e01\u0e32", "<", "\u0e01\u0e32-", "\u0e01\u0e32-", "<", "\u0e01\u0e32\u0e01\u0e32", // Doubler follows an indentical word without the doubler "\u0e01\u0e32", "<", "\u0e01\u0e32\u0e46", "\u0e01\u0e32\u0e46", "<", "\u0e01\u0e32\u0e01\u0e32", // \u0e45 after either \u0e24 or \u0e26 is treated as a single // combining character, similar to "c < ch" in traditional spanish. // TODO: beef up this case "\u0e24\u0e29\u0e35", "<", "\u0e24\u0e45\u0e29\u0e35", "\u0e26\u0e29\u0e35", "<", "\u0e26\u0e45\u0e29\u0e35", // Vowels reorder, should compare \u0e2d and \u0e34 "\u0e40\u0e01\u0e2d", "<", "\u0e40\u0e01\u0e34", // Tones are compared after the rest of the word (e.g. primary ignorable) "\u0e01\u0e32\u0e01\u0e48\u0e32", "<", "\u0e01\u0e49\u0e32\u0e01\u0e32", // Periods are ignored entirely "\u0e01.\u0e01.", "<", "\u0e01\u0e32", }; RuleBasedCollator coll = null; try { coll = getThaiCollator(); } catch (Exception e) { warnln("could not construct Thai collator"); return; } compareArray(coll, TESTS); } void compareArray(RuleBasedCollator c, String[] tests) { for (int i = 0; i < tests.length; i += 3) { int expect = 0; if (tests[i+1].equals("<")) { expect = -1; } else if (tests[i+1].equals(">")) { expect = 1; } else if (tests[i+1].equals("=")) { expect = 0; } else { // expect = Integer.decode(tests[i+1]).intValue(); errln("Error: unknown operator " + tests[i+1]); return; } String s1 = tests[i]; String s2 = tests[i+2]; CollationTest.doTest(this, c, s1, s2, expect); } } int sign(int i ) { if (i < 0) return -1; if (i > 0) return 1; return 0; } /** * Read the external dictionary file, which is already in proper * sorted order, and confirm that the collator compares each line as * preceding the following line. */ public void TestDictionary() { RuleBasedCollator coll = null; try { coll = getThaiCollator(); } catch (Exception e) { warnln("could not construct Thai collator"); return; } // Read in a dictionary of Thai words BufferedReader in = null; String fileName = "riwords.txt"; try { in = TestUtil.getDataReader(fileName, "UTF-8"); } catch (SecurityException e) { warnln("Security exception encountered reading test data file."); return; } catch (Exception e) { try { if (in != null) { in.close(); } } catch (IOException ioe) {} errln("Error: could not open test file: " + fileName + ". Aborting test."); return; } // // Loop through each word in the dictionary and compare it to the previous // word. They should be in sorted order. // String lastWord = ""; int line = 0; int failed = 0; int wordCount = 0; try { String word = in.readLine(); while (word != null) { line++; // Skip comments and blank lines if (word.length() == 0 || word.charAt(0) == 0x23) { word = in.readLine(); continue; } // Show the first 8 words being compared, so we can see what's happening ++wordCount; if (wordCount <= 8) { logln("Word " + wordCount + ": " + word); } if (lastWord.length() > 0) { CollationTest.doTest(this, coll, lastWord, word, -1); int result = coll.compare(lastWord, word); if (result >= 0) { failed++; if (MAX_FAILURES_TO_SHOW < 0 || failed <= MAX_FAILURES_TO_SHOW) { String msg = "--------------------------------------------\n" + line + " compare(" + lastWord + ", " + word + ") returned " + result + ", expected -1\n"; CollationKey k1, k2; try { k1 = coll.getCollationKey(lastWord); k2 = coll.getCollationKey(word); } catch (Exception e) { errln("Fail: getCollationKey returned "); return; } msg += "key1: " + prettify(k1) + "\n" + "key2: " + prettify(k2); errln(msg); } } } lastWord = word; word = in.readLine(); } } catch (IOException e) { errln("IOException " + e.getMessage()); } if (failed != 0) { if (failed > MAX_FAILURES_TO_SHOW) { errln("Too many failures; only the first " + MAX_FAILURES_TO_SHOW + " failures were shown"); } errln("Summary: " + failed + " of " + (line - 1) + " comparisons failed"); } logln("Words checked: " + wordCount); } public void TestInvalidThai() { String tests[] = { "\u0E44\u0E01\u0E44\u0E01", "\u0E44\u0E01\u0E01\u0E44", "\u0E01\u0E44\u0E01\u0E44", "\u0E01\u0E01\u0E44\u0E44", "\u0E44\u0E44\u0E01\u0E01", "\u0E01\u0E44\u0E44\u0E01", }; RuleBasedCollator collator; StrCmp comparator; try { collator = getThaiCollator(); comparator = new StrCmp(); } catch (Exception e) { warnln("could not construct Thai collator"); return; } Arrays.sort(tests, comparator); for (int i = 0; i < tests.length; i ++) { for (int j = i + 1; j < tests.length; j ++) { if (collator.compare(tests[i], tests[j]) > 0) { // inconsistency ordering found! errln("Inconsistent ordering between strings " + i + " and " + j); } } CollationElementIterator iterator = collator.getCollationElementIterator(tests[i]); CollationTest.backAndForth(this, iterator); } } public void TestReordering() { String tests[] = { "\u0E41c\u0301", "=", "\u0E41\u0107", // composition "\u0E41\uD835\uDFCE", "<", "\u0E41\uD835\uDFCF", // supplementaries "\u0E41\uD834\uDD5F", "=", "\u0E41\uD834\uDD58\uD834\uDD65", // supplementary composition decomps to supplementary "\u0E41\uD87E\uDC02", "=", "\u0E41\u4E41", // supplementary composition decomps to BMP "\u0E41\u0301", "=", "\u0E41\u0301", // unsafe (just checking backwards iteration) "\u0E41\u0301\u0316", "=", "\u0E41\u0316\u0301", "abc\u0E41c\u0301", "=", "abc\u0E41\u0107", // composition "abc\u0E41\uD834\uDC00", "<", "abc\u0E41\uD834\uDC01", // supplementaries "abc\u0E41\uD834\uDD5F", "=", "abc\u0E41\uD834\uDD58\uD834\uDD65", // supplementary composition decomps to supplementary "abc\u0E41\uD87E\uDC02", "=", "abc\u0E41\u4E41", // supplementary composition decomps to BMP "abc\u0E41\u0301", "=", "abc\u0E41\u0301", // unsafe (just checking backwards iteration) "abc\u0E41\u0301\u0316", "=", "abc\u0E41\u0316\u0301", "\u0E41c\u0301abc", "=", "\u0E41\u0107abc", // composition "\u0E41\uD834\uDC00abc", "<", "\u0E41\uD834\uDC01abc", // supplementaries "\u0E41\uD834\uDD5Fabc", "=", "\u0E41\uD834\uDD58\uD834\uDD65abc", // supplementary composition decomps to supplementary "\u0E41\uD87E\uDC02abc", "=", "\u0E41\u4E41abc", // supplementary composition decomps to BMP "\u0E41\u0301abc", "=", "\u0E41\u0301abc", // unsafe (just checking backwards iteration) "\u0E41\u0301\u0316abc", "=", "\u0E41\u0316\u0301abc", "abc\u0E41c\u0301abc", "=", "abc\u0E41\u0107abc", // composition "abc\u0E41\uD834\uDC00abc", "<", "abc\u0E41\uD834\uDC01abc", // supplementaries "abc\u0E41\uD834\uDD5Fabc", "=", "abc\u0E41\uD834\uDD58\uD834\uDD65abc", // supplementary composition decomps to supplementary "abc\u0E41\uD87E\uDC02abc", "=", "abc\u0E41\u4E41abc", // supplementary composition decomps to BMP "abc\u0E41\u0301abc", "=", "abc\u0E41\u0301abc", // unsafe (just checking backwards iteration) "abc\u0E41\u0301\u0316abc", "=", "abc\u0E41\u0316\u0301abc", }; RuleBasedCollator collator; try { collator = (RuleBasedCollator)getThaiCollator(); } catch (Exception e) { warnln("could not construct Thai collator"); return; } compareArray(collator, tests); String rule = "& c < ab"; String testcontraction[] = { "\u0E41ab", ">", "\u0E41c"}; try { collator = new RuleBasedCollator(rule); } catch (Exception e) { errln("Error: could not construct collator with rule " + rule); return; } compareArray(collator, testcontraction); } String prettify(CollationKey sourceKey) { int i; byte[] bytes= sourceKey.toByteArray(); String target = "["; for (i = 0; i < bytes.length; i++) { target += Integer.toHexString(bytes[i]); target += " "; } target += "]"; return target; } // private inner class ------------------------------------------------- private static final class StrCmp implements Comparator { public int compare(String string1, String string2) { return collator.compare(string1, string2); } StrCmp() throws Exception { collator = getThaiCollator(); } Collator collator; } // private data members ------------------------------------------------ private static RuleBasedCollator m_collator_; // private methods ----------------------------------------------------- private static RuleBasedCollator getThaiCollator() throws Exception { if (m_collator_ == null) { m_collator_ = (RuleBasedCollator)Collator.getInstance( new Locale("th", "TH", "")); } return m_collator_; } }