3 *******************************************************************************
4 * Copyright (C) 2005-2009, International Business Machines Corporation and *
5 * others. All Rights Reserved. *
6 *******************************************************************************
8 package com.ibm.icu.dev.test.charsetdet;
10 import java.io.ByteArrayInputStream;
11 import java.io.Reader;
13 import com.ibm.icu.dev.test.TestFmwk;
14 import com.ibm.icu.text.CharsetDetector;
15 import com.ibm.icu.text.CharsetMatch;
17 //#if defined(FOUNDATION10) || defined(J2SE13)
19 import java.io.InputStream;
20 import java.io.UnsupportedEncodingException;
22 import javax.xml.parsers.DocumentBuilder;
23 import javax.xml.parsers.DocumentBuilderFactory;
25 import org.w3c.dom.Document;
26 import org.w3c.dom.Element;
27 import org.w3c.dom.NamedNodeMap;
28 import org.w3c.dom.Node;
29 import org.w3c.dom.NodeList;
31 import com.ibm.icu.charset.CharsetProviderICU;
32 import java.nio.charset.CharsetEncoder;
33 import java.nio.CharBuffer;
// Tests for com.ibm.icu.text.CharsetDetector and CharsetMatch, built on the
// TestFmwk base class.
40 public class TestCharsetDetector extends TestFmwk
// No-argument constructor.
46 public TestCharsetDetector()
// Command-line entry point; creates the test instance.
// NOTE(review): the call that actually runs the tests is not visible here --
// presumably a TestFmwk run method; confirm.
50 public static void main(String[] args) {
53 TestCharsetDetector test = new TestCharsetDetector();
// Assertion helper taking the boolean result of a check.
// The failure message is built by throwing an Exception locally and reading its
// stack trace: element [0] is CheckAssert itself, so element [1] is the caller,
// which lets the message name the file and line of the failing check.
// NOTE(review): the guard on exp and the call that reports msg are not visible
// here -- presumably only a false exp leads to a report; confirm.
62 private void CheckAssert(boolean exp) {
66 throw new Exception();
// The "//#if" / "//##" lines look like preprocessor markers selecting a simpler
// message (no StackTraceElement) for FOUNDATION10 / J2SE13 builds -- confirm.
69 //#if defined(FOUNDATION10) || defined(J2SE13)
70 //## msg = "Test failure " + e.getMessage() ;
72 StackTraceElement failPoint = e.getStackTrace()[1];
73 msg = "Test failure in file " + failPoint.getFileName() +
74 " at line " + failPoint.getLineNumber();
// Drains the given Reader into a StringBuffer, reading up to 1024 chars at a time.
// Any exception raised while reading is reported through errln.
// NOTE(review): the return statements are not visible here -- presumably the
// accumulated text on success; confirm what is returned on failure.
82 private String stringFromReader(Reader reader)
84 StringBuffer sb = new StringBuffer();
85 char[] buffer = new char[1024];
// Despite its name, bytesRead is the number of chars returned by Reader.read();
// a negative value (end of stream) terminates the loop.
89 while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) {
90 sb.append(buffer, 0, bytesRead);
94 } catch (Exception e) {
95 errln("stringFromReader() failed: " + e.toString());
// Checks that a CharsetDetector can be constructed and that
// getAllDetectableCharsets() returns a non-empty array containing no empty names.
100 public void TestConstruction() {
102 CharsetDetector det = new CharsetDetector();
104 errln("Could not construct a charset detector");
106 String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
107 CheckAssert(charsetNames.length != 0);
108 for (i=0; i<charsetNames.length; i++) {
109 CheckAssert(charsetNames[i].equals("") == false);
110 // System.out.println("\"" + charsetNames[i] + "\"");
// Exercises the detector's input filter on ISO-8859-1 bytes of a short French
// sentence surrounded by English words wrapped in angle brackets.
// With the filter enabled the detected language is expected to be "fr";
// with it disabled the same bytes are expected to be detected as "en".
114 public void TestInputFilter() throws Exception
116 String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
117 byte[] bytes = s.getBytes("ISO-8859-1");
118 CharsetDetector det = new CharsetDetector();
121 det.enableInputFilter(true);
122 if (!det.inputFilterEnabled()){
123 errln("input filter should be enabled");
129 if (! m.getLanguage().equals("fr")) {
130 errln("input filter did not strip markup!");
133 det.enableInputFilter(false);
137 if (! m.getLanguage().equals("en")) {
138 errln("unfiltered input did not detect as English!");
// Round-trips a mostly-ASCII string containing five Greek capitals
// (U+0391..U+0395) through UTF-8: the bytes are decoded again with
// getString() and getReader(), both given "UTF-8" as the declared encoding,
// and each result must equal the original string.
142 public void TestUTF8() throws Exception {
144 String s = "This is a string with some non-ascii characters that will " +
145 "be converted to UTF-8, then shoved through the detection process. " +
146 "\u0391\u0392\u0393\u0394\u0395" +
147 "Sure would be nice if our source could contain Unicode directly!";
148 byte [] bytes = s.getBytes("UTF-8");
149 CharsetDetector det = new CharsetDetector();
153 retrievedS = det.getString(bytes, "UTF-8");
154 CheckAssert(s.equals(retrievedS));
156 reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8");
157 CheckAssert(s.equals(stringFromReader(reader)));
158 det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
// Encodes the same Arabic sample with the "UnicodeBig" and "UnicodeLittle"
// converters and expects the detector to name the results UTF-16BE and
// UTF-16LE respectively. Afterwards the confidence (expected to be 100) and
// the match type of the last match are read for coverage.
161 public void TestUTF16() throws Exception
// NOTE(review): the literal below starts with "u0623" without a leading
// backslash, so it contributes the five ASCII characters u, 0, 6, 2, 3 rather
// than U+0623 -- confirm whether that is intended.
164 "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
165 "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
167 byte[] beBytes = source.getBytes("UnicodeBig");
168 byte[] leBytes = source.getBytes("UnicodeLittle");
169 CharsetDetector det = new CharsetDetector();
172 det.setText(beBytes);
175 if (! m.getName().equals("UTF-16BE")) {
176 errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
179 det.setText(leBytes);
182 if (! m.getName().equals("UTF-16LE")) {
183 errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
186 // Jitterbug 4451, for coverage
187 int confidence = m.getConfidence();
188 if(confidence != 100){
189 errln("Did not get the expected confidence level " + confidence);
191 int matchType = m.getMatchType();
193 errln("Did not get the expected matchType level " + matchType);
// Checks that English text is attributed to the right Latin-1 family charset:
// a sample containing curly quotes (U+201C / U+201D), encoded as windows-1252,
// must be detected as "windows-1252", while a plain sample encoded as
// ISO-8859-1 must be detected as "ISO-8859-1".
197 public void TestC1Bytes() throws Exception
200 "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
203 "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";
205 byte[] bISO = sISO.getBytes("ISO-8859-1");
206 byte[] bWindows = sWindows.getBytes("windows-1252");
208 CharsetDetector det = new CharsetDetector();
211 det.setText(bWindows);
// Compare charset names by value. The previous != test compared object
// references and only worked if getName() happened to return the very same
// String instance as the literal.
214 if (! m.getName().equals("windows-1252")) {
215 errln("Text with C1 bytes not correctly detected as windows-1252.");
222 if (! m.getName().equals("ISO-8859-1")) {
223 errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
// Runs the detector over a table of very short byte arrays and logs the name
// of each match.
227 public void TestShortInput() {
228 // Test that detection with very short byte strings does not crash and burn.
229 // The shortest input that should produce positive detection result is two bytes,
231 // TODO: Detector confidence levels need to be refined for very short input.
232 // Too high now, for some charsets that happen to be compatible with a few bytes of input.
233 byte [][] shortBytes = new byte [][]
237 {(byte)'A', (byte)'B'},
238 {(byte)'A', (byte)'B', (byte)'C'},
239 {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
242 CharsetDetector det = new CharsetDetector();
244 for (int i=0; i<shortBytes.length; i++) {
245 det.setText(shortBytes[i]);
247 logln("i=" + i + " -> " + m.getName());
// Feeds the detector inputs that end in a partial or complete ISO-2022 escape
// sequence, or in a lone 0xA1 byte, with "ISO-2022-JP" as the declared encoding.
// Each detection result is compared by name with the entry at the same index
// in testResults; a null entry means that no match is expected for that input.
251 public void TestBufferOverflow()
253 byte testStrings[][] = {
254 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b}, /* A partial ISO-2022 shift state at the end */
255 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24}, /* A partial ISO-2022 shift state at the end */
256 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28}, /* A partial ISO-2022 shift state at the end */
257 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end with a bad one at the start */
258 {(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end */
259 {(byte) 0xa1}, /* Could be a single byte shift-jis at the end */
260 {(byte) 0x74, (byte) 0x68, (byte) 0xa1}, /* Could be a single byte shift-jis at the end */
261 {(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1} /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
264 String testResults[] = {
275 CharsetDetector det = new CharsetDetector();
278 det.setDeclaredEncoding("ISO-2022-JP");
280 for (int idx = 0; idx < testStrings.length; idx += 1) {
281 det.setText(testStrings[idx]);
282 match = det.detect();
// No match: an error unless the expectation for this index is null.
285 if (testResults[idx] != null) {
286 errln("Unexpectedly got no results at index " + idx);
289 logln("Got no result as expected at index " + idx);
// A match was returned: its name must equal the (non-null) expectation.
294 if (testResults[idx] == null || ! testResults[idx].equals(match.getName())) {
295 errln("Unexpectedly got " + match.getName() + " instead of " + testResults[idx] +
296 " at index " + idx + " with confidence " + match.getConfidence());
302 //#if defined(FOUNDATION10) || defined(J2SE13)
// Data-driven detection test. Loads CharsetDetectionTests.xml as a class
// resource, parses it with a DOM DocumentBuilder (comments ignored) and, for
// every <test-case> element, gathers the text of its child nodes and passes it
// to checkEncoding() once per space-separated entry of the "encodings"
// attribute, together with the case's "id" attribute.
// Any exception raised along the way is reported through errln.
// NOTE(review): no close() of the resource stream is visible here -- confirm
// the stream is released.
304 public void TestDetection()
307 // Open and read the test data file.
309 //InputStreamReader isr = null;
312 InputStream is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
314 errln("Could not open test data file CharsetDetectionTests.xml");
318 //isr = new InputStreamReader(is, "UTF-8");
320 // Set up an xml parser.
321 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
323 factory.setIgnoringComments(true);
325 DocumentBuilder builder = factory.newDocumentBuilder();
327 // Parse the xml content from the test case file.
328 Document doc = builder.parse(is, null);
329 Element root = doc.getDocumentElement();
331 NodeList testCases = root.getElementsByTagName("test-case");
333 // Process each test case
334 for (int n = 0; n < testCases.getLength(); n += 1) {
335 Node testCase = testCases.item(n);
336 NamedNodeMap attrs = testCase.getAttributes();
337 NodeList testData = testCase.getChildNodes();
338 StringBuffer testText = new StringBuffer();
339 String id = attrs.getNamedItem("id").getNodeValue();
340 String encodings = attrs.getNamedItem("encodings").getNodeValue();
342 // Collect the test case text.
343 for (int t = 0; t < testData.getLength(); t += 1) {
344 Node textNode = testData.item(t);
346 testText.append(textNode.getNodeValue());
349 // Process test text with each encoding / language pair.
350 String testString = testText.toString();
351 String[] encodingList = encodings.split(" ");
352 for (int e = 0; e < encodingList.length; e += 1) {
353 checkEncoding(testString, encodingList[e], id);
357 } catch (Exception e) {
358 errln("exception while processing test cases: " + e.toString());
// Runs detection on a detector whose input has already been set and verifies
// the outcome against the expectations for one test case:
//   - the match name must equal encoding;
//   - the match language must equal language, where null on either side only
//     matches null on the other;
//   - the text obtained from getString() and from getReader() must equal
//     testString.
// id labels the test case in every failure message.
// NOTE(review): the statement following the UTF-32 check is not visible here --
// presumably the decoding checks are skipped for UTF-32; confirm.
362 private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
364 CharsetMatch m = det.detect();
367 if (! m.getName().equals(encoding)) {
368 errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
372 String charsetMatchLanguage = m.getLanguage();
// equals() is invoked on language, which is known to be non-null in this
// clause. Invoking it on charsetMatchLanguage threw a NullPointerException
// whenever the detector reported no language, so the mismatch was never
// reported as a language detection failure.
373 if ((language != null && !language.equals(charsetMatchLanguage))
374 || (language == null && charsetMatchLanguage != null)
375 || (language != null && charsetMatchLanguage == null))
377 errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
380 if (encoding.startsWith("UTF-32")) {
384 decoded = m.getString();
386 if (! testString.equals(decoded)) {
387 errln(id + ", " + encoding + ": getString() didn't return the original string!");
390 decoded = stringFromReader(m.getReader());
392 if (! testString.equals(decoded)) {
393 errln(id + ", " + encoding + ": getReader() didn't yield the original string!");
// Runs one encoding entry of a test case. The entry is split on "/"; the
// parts feed enc and lang (a second part is only present when the split yields
// more than one element). testString is converted to bytes with the named
// converter and the result is verified through checkMatch(), including once
// with the bytes supplied as a ByteArrayInputStream.
// Converters that cannot encode from Unicode, or that the runtime does not
// support, are tolerated as described in the comments below; any other
// exception is reported through errln together with the test id and enc.
397 private void checkEncoding(String testString, String encoding, String id)
399 String enc = null, lang = null;
400 String[] split = encoding.split("/");
404 if (split.length > 1) {
409 CharsetDetector det = new CharsetDetector();
412 //if (enc.startsWith("UTF-32")) {
413 // UTF32 utf32 = UTF32.getInstance(enc);
415 // bytes = utf32.toBytes(testString);
421 bytes = testString.getBytes(from);
422 } catch (UnsupportedOperationException uoe) {
423 // In some runtimes, the ISO-2022-CN converter
424 // only converts *to* Unicode - we have to use
425 // x-ISO-2022-CN-GB to convert *from* Unicode.
426 if (from.equals("ISO-2022-CN")) {
427 from = "x-ISO-2022-CN-GB";
431 // Ignore any other converters that can't
432 // convert from Unicode.
434 } catch (UnsupportedEncodingException uee) {
435 // Ignore any encodings that this runtime
445 checkMatch(det, testString, enc, lang, id);
447 det.setText(new ByteArrayInputStream(bytes));
448 checkMatch(det, testString, enc, lang, id);
449 } catch (Exception e) {
450 errln(id + ": " + e.toString() + "enc=" + enc);
// Detection test for an Arabic sample. The text is checked three ways:
// through _test1256() (expected match name "windows-1256"), through
// _testIBM420_ar_rtl() (expected "IBM420_rtl") and through
// _testIBM420_ar_ltr() (expected "IBM420_ltr"). The two IBM420 helpers share
// an encoder obtained from CharsetProviderICU.
455 public void TestArabic() throws Exception {
456 String s = "\u0648\u0636\u0639\u062A \u0648\u0646\u0641\u0630\u062A \u0628\u0631\u0627" +
457 "\u0645\u062C \u062A\u0623\u0645\u064A\u0646 \u0639\u062F\u064A\u062F\u0629 \u0641\u064A " +
458 "\u0645\u0624\u0633\u0633\u0629 \u0627\u0644\u062A\u0623\u0645\u064A\u0646 \u0627\u0644" +
459 "\u0648\u0637\u0646\u064A, \u0645\u0639 \u0645\u0644\u0627\u0626\u0645\u062A\u0647\u0627 " +
460 "\u062F\u0627\u0626\u0645\u0627 \u0644\u0644\u0627\u062D\u062A\u064A\u0627\u062C" +
461 "\u0627\u062A \u0627\u0644\u0645\u062A\u063A\u064A\u0631\u0629 \u0644\u0644\u0645\u062C" +
462 "\u062A\u0645\u0639 \u0648\u0644\u0644\u062F\u0648\u0644\u0629. \u062A\u0648\u0633\u0639" +
463 "\u062A \u0648\u062A\u0637\u0648\u0631\u062A \u0627\u0644\u0645\u0624\u0633\u0633\u0629 " +
464 "\u0628\u0647\u062F\u0641 \u0636\u0645\u0627\u0646 \u0634\u0628\u0643\u0629 \u0623\u0645" +
465 "\u0627\u0646 \u0644\u0633\u0643\u0627\u0646 \u062F\u0648\u0644\u0629 \u0627\u0633\u0631" +
466 "\u0627\u0626\u064A\u0644 \u0628\u0648\u062C\u0647 \u0627\u0644\u0645\u062E\u0627\u0637" +
467 "\u0631 \u0627\u0644\u0627\u0642\u062A\u0635\u0627\u062F\u064A\u0629 \u0648\u0627\u0644" +
468 "\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";
470 CharsetMatch m = _test1256(s);
471 String charsetMatch = m.getName();
472 CheckAssert(charsetMatch.equals("windows-1256"));
474 /* Create an encoder to get the bytes.
475 * Using String.getBytes("IBM420") can produce inconsistent results
476 * between different versions of the JDK.
478 CharsetEncoder encoder = new CharsetProviderICU().charsetForName("IBM420").newEncoder();
480 m = _testIBM420_ar_rtl(s, encoder);
481 charsetMatch = m.getName();
482 CheckAssert(charsetMatch.equals("IBM420_rtl"));
484 m = _testIBM420_ar_ltr(s, encoder);
485 charsetMatch = m.getName();
486 CheckAssert(charsetMatch.equals("IBM420_ltr"));
// Encodes s with the supplied encoder, hands the resulting bytes to a fresh
// CharsetDetector and runs detection on them.
// NOTE(review): ByteBuffer.array() exposes the whole backing array, which can
// be longer than the encoded content -- confirm that trailing bytes cannot
// influence detection.
490 private CharsetMatch _testIBM420_ar_rtl(String s, CharsetEncoder encoder) throws Exception {
491 CharsetDetector det = new CharsetDetector();
492 det.setText(encoder.encode(CharBuffer.wrap(s)).array());
493 CharsetMatch m = det.detect();
// Left-to-right variant of the IBM420 check: the characters of s are reversed
// before being encoded with the supplied encoder, and detection is run on the
// resulting bytes with a fresh CharsetDetector.
// NOTE(review): ByteBuffer.array() exposes the whole backing array, which can
// be longer than the encoded content -- confirm that trailing bytes cannot
// influence detection.
497 private CharsetMatch _testIBM420_ar_ltr(String s, CharsetEncoder encoder) throws Exception {
499 * transformation of input string to CP420 left to right requires reversing the string
502 StringBuffer ltrStrBuf = new StringBuffer(s);
503 ltrStrBuf = ltrStrBuf.reverse();
505 CharsetDetector det = new CharsetDetector();
506 det.setText(encoder.encode(CharBuffer.wrap(ltrStrBuf.toString())).array());
507 CharsetMatch m = det.detect();
// Converts s to bytes with the "windows-1256" converter and runs detection
// with a fresh CharsetDetector.
// NOTE(review): the call that hands the bytes to the detector is not visible
// here -- presumably det.setText(bytes); confirm.
511 private CharsetMatch _test1256(String s) throws Exception {
513 byte [] bytes = s.getBytes("windows-1256");
514 CharsetDetector det = new CharsetDetector();
516 CharsetMatch m = det.detect();
// Detection test for a Hebrew sample. The text is checked three ways:
// through _test1255() (expected match name "ISO-8859-8"), through
// _testIBM424_he_rtl() (expected "IBM424_rtl") and through
// _testIBM424_he_ltr() (expected "IBM424_ltr").
521 public void TestHebrew() throws Exception {
522 String s = "\u05D4\u05E4\u05E8\u05E7\u05DC\u05D9\u05D8 \u05D4\u05E6\u05D1\u05D0\u05D9 \u05D4" +
523 "\u05E8\u05D0\u05E9\u05D9, \u05EA\u05EA \u05D0\u05DC\u05D5\u05E3 \u05D0\u05D1\u05D9" +
524 "\u05D7\u05D9 \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8, \u05D4\u05D5\u05E8" +
525 "\u05D4 \u05E2\u05DC \u05E4\u05EA\u05D9\u05D7\u05EA \u05D7\u05E7\u05D9\u05E8\u05EA " +
526 "\u05DE\u05E6\"\u05D7 \u05D1\u05E2\u05E7\u05D1\u05D5\u05EA \u05E2\u05D3\u05D5\u05D9" +
527 "\u05D5\u05EA \u05D7\u05D9\u05D9\u05DC\u05D9 \u05E6\u05D4\"\u05DC \u05DE\u05DE\u05D1" +
528 "\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4 \u05D1+ " +
529 "\u05E8\u05E6\u05D5\u05E2\u05EA \u05E2\u05D6\u05D4. \u05DC\u05D3\u05D1\u05E8\u05D9 " +
530 "\u05D4\u05E4\u05E6\"\u05E8, \u05DE\u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA \u05E2" +
531 "\u05D5\u05DC\u05D4 \u05EA\u05DE\u05D5\u05E0\u05D4 \u05E9\u05DC \"\u05D4\u05EA\u05E0" +
532 "\u05D4\u05D2\u05D5\u05EA \u05E4\u05E1\u05D5\u05DC\u05D4 \u05DC\u05DB\u05D0\u05D5\u05E8" +
533 "\u05D4 \u05E9\u05DC \u05D7\u05D9\u05D9\u05DC\u05D9\u05DD \u05D1\u05DE\u05D4\u05DC\u05DA" +
534 " \u05DE\u05D1\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4\"." +
535 " \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8 \u05E7\u05D9\u05D1\u05DC \u05D0\u05EA" +
536 " \u05D4\u05D7\u05DC\u05D8\u05EA\u05D5 \u05DC\u05D0\u05D7\u05E8 \u05E9\u05E2\u05D9\u05D9" +
537 "\u05DF \u05D1\u05EA\u05DE\u05DC\u05D9\u05DC \u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA";
539 CharsetMatch m = _test1255(s);
540 String charsetMatch = m.getName();
541 CheckAssert(charsetMatch.equals("ISO-8859-8"));
543 m = _testIBM424_he_rtl(s);
544 charsetMatch = m.getName();
545 CheckAssert(charsetMatch.equals("IBM424_rtl"));
547 m = _testIBM424_he_ltr(s);
548 charsetMatch = m.getName();
549 CheckAssert(charsetMatch.equals("IBM424_ltr"));
// Converts s to bytes with the "ISO-8859-8" converter and runs detection with
// a fresh CharsetDetector.
// NOTE(review): the method name says 1255, but the bytes are produced with
// ISO-8859-8 -- confirm which charset is meant.
// NOTE(review): the call that hands the bytes to the detector is not visible
// here -- presumably det.setText(bytes); confirm.
552 private CharsetMatch _test1255(String s) throws Exception {
553 byte [] bytes = s.getBytes("ISO-8859-8");
554 CharsetDetector det = new CharsetDetector();
556 CharsetMatch m = det.detect();
// Converts s to bytes with the "IBM424" converter and runs detection with a
// fresh CharsetDetector.
// NOTE(review): the call that hands the bytes to the detector is not visible
// here -- presumably det.setText(bytes); confirm.
560 private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
561 byte [] bytes = s.getBytes("IBM424");
562 CharsetDetector det = new CharsetDetector();
564 CharsetMatch m = det.detect();
// Left-to-right variant of the IBM424 check: the characters of s are reversed,
// converted to bytes with the "IBM424" converter, and detection is run with a
// fresh CharsetDetector.
// NOTE(review): the description below mentions CP420 although this method
// encodes with IBM424 -- likely copied from the Arabic helper; confirm.
// NOTE(review): the call that hands the bytes to the detector is not visible
// here -- presumably det.setText(bytes); confirm.
568 private CharsetMatch _testIBM424_he_ltr(String s) throws Exception {
570 * transformation of input string to CP420 left to right requires reversing the string
573 StringBuffer ltrStrBuf = new StringBuffer(s);
574 ltrStrBuf = ltrStrBuf.reverse();
575 byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
577 CharsetDetector det = new CharsetDetector();
579 CharsetMatch m = det.detect();