3 *******************************************************************************
\r
4 * Copyright (C) 2005-2009, International Business Machines Corporation and *
\r
5 * others. All Rights Reserved. *
\r
6 *******************************************************************************
\r
8 package com.ibm.icu.dev.test.charsetdet;
\r
10 import java.io.ByteArrayInputStream;
\r
11 import java.io.Reader;
\r
13 import com.ibm.icu.dev.test.TestFmwk;
\r
14 import com.ibm.icu.text.CharsetDetector;
\r
15 import com.ibm.icu.text.CharsetMatch;
\r
17 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
19 import java.io.InputStream;
\r
20 import java.io.UnsupportedEncodingException;
\r
22 import javax.xml.parsers.DocumentBuilder;
\r
23 import javax.xml.parsers.DocumentBuilderFactory;
\r
25 import org.w3c.dom.Document;
\r
26 import org.w3c.dom.Element;
\r
27 import org.w3c.dom.NamedNodeMap;
\r
28 import org.w3c.dom.Node;
\r
29 import org.w3c.dom.NodeList;
\r
31 import com.ibm.icu.charset.CharsetProviderICU;
\r
32 import java.nio.charset.CharsetEncoder;
\r
33 import java.nio.CharBuffer;
\r
40 public class TestCharsetDetector extends TestFmwk
\r
46 public TestCharsetDetector()
\r
50 public static void main(String[] args) {
\r
53 TestCharsetDetector test = new TestCharsetDetector();
\r
58 e.printStackTrace();
\r
62 private void CheckAssert(boolean exp) {
\r
66 throw new Exception();
\r
68 catch (Exception e) {
\r
69 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
70 //## msg = "Test failure " + e.getMessage() ;
\r
72 StackTraceElement failPoint = e.getStackTrace()[1];
\r
73 msg = "Test failure in file " + failPoint.getFileName() +
\r
74 " at line " + failPoint.getLineNumber();
\r
82 private String stringFromReader(Reader reader)
\r
84 StringBuffer sb = new StringBuffer();
\r
85 char[] buffer = new char[1024];
\r
89 while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) {
\r
90 sb.append(buffer, 0, bytesRead);
\r
93 return sb.toString();
\r
94 } catch (Exception e) {
\r
95 errln("stringFromReader() failed: " + e.toString());
\r
100 public void TestConstruction() {
\r
102 CharsetDetector det = new CharsetDetector();
\r
104 errln("Could not construct a charset detector");
\r
106 String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
\r
107 CheckAssert(charsetNames.length != 0);
\r
108 for (i=0; i<charsetNames.length; i++) {
\r
109 CheckAssert(charsetNames[i].equals("") == false);
\r
110 // System.out.println("\"" + charsetNames[i] + "\"");
\r
114 public void TestInputFilter() throws Exception
\r
116 String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
\r
117 byte[] bytes = s.getBytes("ISO-8859-1");
\r
118 CharsetDetector det = new CharsetDetector();
\r
121 det.enableInputFilter(true);
\r
122 if (!det.inputFilterEnabled()){
\r
123 errln("input filter should be enabled");
\r
126 det.setText(bytes);
\r
129 if (! m.getLanguage().equals("fr")) {
\r
130 errln("input filter did not strip markup!");
\r
133 det.enableInputFilter(false);
\r
134 det.setText(bytes);
\r
137 if (! m.getLanguage().equals("en")) {
\r
138 errln("unfiltered input did not detect as English!");
\r
142 public void TestUTF8() throws Exception {
\r
144 String s = "This is a string with some non-ascii characters that will " +
\r
145 "be converted to UTF-8, then shoved through the detection process. " +
\r
146 "\u0391\u0392\u0393\u0394\u0395" +
\r
147 "Sure would be nice if our source could contain Unicode directly!";
\r
148 byte [] bytes = s.getBytes("UTF-8");
\r
149 CharsetDetector det = new CharsetDetector();
\r
153 retrievedS = det.getString(bytes, "UTF-8");
\r
154 CheckAssert(s.equals(retrievedS));
\r
156 reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8");
\r
157 CheckAssert(s.equals(stringFromReader(reader)));
\r
158 det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
\r
161 public void TestUTF16() throws Exception
\r
164 "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " + // first escape restored: was the literal text "u0623"
\r
165 "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
\r
167 byte[] beBytes = source.getBytes("UnicodeBig");
\r
168 byte[] leBytes = source.getBytes("UnicodeLittle");
\r
169 CharsetDetector det = new CharsetDetector();
\r
172 det.setText(beBytes);
\r
175 if (! m.getName().equals("UTF-16BE")) {
\r
176 errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
\r
179 det.setText(leBytes);
\r
182 if (! m.getName().equals("UTF-16LE")) {
\r
183 errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
\r
186 // Jitterbug 4451, for coverage
\r
187 int confidence = m.getConfidence();
\r
188 if(confidence != 100){
\r
189 errln("Did not get the expected confidence level " + confidence);
\r
191 int matchType = m.getMatchType();
\r
192 if(matchType != 0){
\r
193 errln("Did not get the expected matchType level " + matchType);
\r
197 public void TestC1Bytes() throws Exception
\r
200 "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
\r
203 "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";
\r
205 byte[] bISO = sISO.getBytes("ISO-8859-1");
\r
206 byte[] bWindows = sWindows.getBytes("windows-1252");
\r
208 CharsetDetector det = new CharsetDetector();
\r
211 det.setText(bWindows);
\r
214 if (!"windows-1252".equals(m.getName())) { // compare contents; != only tests reference identity
\r
215 errln("Text with C1 bytes not correctly detected as windows-1252.");
\r
222 if (!"ISO-8859-1".equals(m.getName())) { // compare contents; != only tests reference identity
\r
223 errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
\r
227 public void TestShortInput() {
\r
228 // Test that detection with very short byte strings does not crash and burn.
\r
229 // The shortest input that should produce a positive detection result is two bytes,
\r
231 // TODO: Detector confidence levels need to be refined for very short input.
\r
232 // Too high now, for some charsets that happen to be compatible with a few bytes of input.
\r
233 byte [][] shortBytes = new byte [][]
\r
237 {(byte)'A', (byte)'B'},
\r
238 {(byte)'A', (byte)'B', (byte)'C'},
\r
239 {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
\r
242 CharsetDetector det = new CharsetDetector();
\r
244 for (int i=0; i<shortBytes.length; i++) {
\r
245 det.setText(shortBytes[i]);
\r
247 logln("i=" + i + " -> " + m.getName());
\r
251 public void TestBufferOverflow()
\r
253 byte testStrings[][] = {
\r
254 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b}, /* A partial ISO-2022 shift state at the end */
\r
255 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24}, /* A partial ISO-2022 shift state at the end */
\r
256 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28}, /* A partial ISO-2022 shift state at the end */
\r
257 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end with a bad one at the start */
\r
258 {(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end */
\r
259 {(byte) 0xa1}, /* Could be a single byte shift-jis at the end */
\r
260 {(byte) 0x74, (byte) 0x68, (byte) 0xa1}, /* Could be a single byte shift-jis at the end */
\r
261 {(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1} /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
\r
264 String testResults[] = {
\r
275 CharsetDetector det = new CharsetDetector();
\r
276 CharsetMatch match;
\r
278 det.setDeclaredEncoding("ISO-2022-JP");
\r
280 for (int idx = 0; idx < testStrings.length; idx += 1) {
\r
281 det.setText(testStrings[idx]);
\r
282 match = det.detect();
\r
284 if (match == null) {
\r
285 if (testResults[idx] != null) {
\r
286 errln("Unexpectedly got no results at index " + idx);
\r
289 logln("Got no result as expected at index " + idx);
\r
294 if (testResults[idx] == null || ! testResults[idx].equals(match.getName())) {
\r
295 errln("Unexpectedly got " + match.getName() + " instead of " + testResults[idx] +
\r
296 " at index " + idx + " with confidence " + match.getConfidence());
\r
302 //#if defined(FOUNDATION10) || defined(J2SE13)
\r
304 public void TestDetection()
\r
307 // Open and read the test data file.
\r
309 //InputStreamReader isr = null;
\r
312 InputStream is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
\r
314 errln("Could not open test data file CharsetDetectionTests.xml");
\r
318 //isr = new InputStreamReader(is, "UTF-8");
\r
320 // Set up an xml parser.
\r
321 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
\r
323 factory.setIgnoringComments(true);
\r
325 DocumentBuilder builder = factory.newDocumentBuilder();
\r
327 // Parse the xml content from the test case file.
\r
328 Document doc = builder.parse(is, null);
\r
329 Element root = doc.getDocumentElement();
\r
331 NodeList testCases = root.getElementsByTagName("test-case");
\r
333 // Process each test case
\r
334 for (int n = 0; n < testCases.getLength(); n += 1) {
\r
335 Node testCase = testCases.item(n);
\r
336 NamedNodeMap attrs = testCase.getAttributes();
\r
337 NodeList testData = testCase.getChildNodes();
\r
338 StringBuffer testText = new StringBuffer();
\r
339 String id = attrs.getNamedItem("id").getNodeValue();
\r
340 String encodings = attrs.getNamedItem("encodings").getNodeValue();
\r
342 // Collect the test case text.
\r
343 for (int t = 0; t < testData.getLength(); t += 1) {
\r
344 Node textNode = testData.item(t);
\r
346 testText.append(textNode.getNodeValue());
\r
349 // Process test text with each encoding / language pair.
\r
350 String testString = testText.toString();
\r
351 String[] encodingList = encodings.split(" ");
\r
352 for (int e = 0; e < encodingList.length; e += 1) {
\r
353 checkEncoding(testString, encodingList[e], id);
\r
357 } catch (Exception e) {
\r
358 errln("exception while processing test cases: " + e.toString());
\r
362 private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
\r
364 CharsetMatch m = det.detect();
\r
367 if (! m.getName().equals(encoding)) {
\r
368 errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
\r
372 String charsetMatchLanguage = m.getLanguage();
\r
373 if ((language != null && !language.equals(charsetMatchLanguage)) // equals() is called on the non-null side so a null detected language reports a failure instead of throwing
\r
374 || (language == null && charsetMatchLanguage != null)
\r
375 || (language != null && charsetMatchLanguage == null))
\r
377 errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
\r
380 if (encoding.startsWith("UTF-32")) {
\r
384 decoded = m.getString();
\r
386 if (! testString.equals(decoded)) {
\r
387 errln(id + ", " + encoding + ": getString() didn't return the original string!");
\r
390 decoded = stringFromReader(m.getReader());
\r
392 if (! testString.equals(decoded)) {
\r
393 errln(id + ", " + encoding + ": getReader() didn't yield the original string!");
\r
397 private void checkEncoding(String testString, String encoding, String id)
\r
399 String enc = null, lang = null;
\r
400 String[] split = encoding.split("/");
\r
404 if (split.length > 1) {
\r
409 CharsetDetector det = new CharsetDetector();
\r
412 //if (enc.startsWith("UTF-32")) {
\r
413 // UTF32 utf32 = UTF32.getInstance(enc);
\r
415 // bytes = utf32.toBytes(testString);
\r
421 bytes = testString.getBytes(from);
\r
422 } catch (UnsupportedOperationException uoe) {
\r
423 // In some runtimes, the ISO-2022-CN converter
\r
424 // only converts *to* Unicode - we have to use
\r
425 // x-ISO-2022-CN-GB to convert *from* Unicode.
\r
426 if (from.equals("ISO-2022-CN")) {
\r
427 from = "x-ISO-2022-CN-GB";
\r
431 // Ignore any other converters that can't
\r
432 // convert from Unicode.
\r
434 } catch (UnsupportedEncodingException uee) {
\r
435 // Ignore any encodings that this runtime
\r
436 // doesn't support.
\r
444 det.setText(bytes);
\r
445 checkMatch(det, testString, enc, lang, id);
\r
447 det.setText(new ByteArrayInputStream(bytes));
\r
448 checkMatch(det, testString, enc, lang, id);
\r
449 } catch (Exception e) {
\r
450 errln(id + ": " + e.toString() + " enc=" + enc); // space keeps the exception text and the encoding name apart
\r
451 e.printStackTrace();
\r
455 public void TestArabic() throws Exception {
\r
456 String s = "\u0648\u0636\u0639\u062A \u0648\u0646\u0641\u0630\u062A \u0628\u0631\u0627" +
\r
457 "\u0645\u062C \u062A\u0623\u0645\u064A\u0646 \u0639\u062F\u064A\u062F\u0629 \u0641\u064A " +
\r
458 "\u0645\u0624\u0633\u0633\u0629 \u0627\u0644\u062A\u0623\u0645\u064A\u0646 \u0627\u0644" +
\r
459 "\u0648\u0637\u0646\u064A, \u0645\u0639 \u0645\u0644\u0627\u0626\u0645\u062A\u0647\u0627 " +
\r
460 "\u062F\u0627\u0626\u0645\u0627 \u0644\u0644\u0627\u062D\u062A\u064A\u0627\u062C" +
\r
461 "\u0627\u062A \u0627\u0644\u0645\u062A\u063A\u064A\u0631\u0629 \u0644\u0644\u0645\u062C" +
\r
462 "\u062A\u0645\u0639 \u0648\u0644\u0644\u062F\u0648\u0644\u0629. \u062A\u0648\u0633\u0639" +
\r
463 "\u062A \u0648\u062A\u0637\u0648\u0631\u062A \u0627\u0644\u0645\u0624\u0633\u0633\u0629 " +
\r
464 "\u0628\u0647\u062F\u0641 \u0636\u0645\u0627\u0646 \u0634\u0628\u0643\u0629 \u0623\u0645" +
\r
465 "\u0627\u0646 \u0644\u0633\u0643\u0627\u0646 \u062F\u0648\u0644\u0629 \u0627\u0633\u0631" +
\r
466 "\u0627\u0626\u064A\u0644 \u0628\u0648\u062C\u0647 \u0627\u0644\u0645\u062E\u0627\u0637" +
\r
467 "\u0631 \u0627\u0644\u0627\u0642\u062A\u0635\u0627\u062F\u064A\u0629 \u0648\u0627\u0644" +
\r
468 "\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";
\r
470 CharsetMatch m = _test1256(s);
\r
471 String charsetMatch = m.getName();
\r
472 CheckAssert(charsetMatch.equals("windows-1256"));
\r
474 /* Create an encoder to get the bytes.
\r
475 * Using String.getBytes("IBM420") can produce inconsistent results
\r
476 * between different versions of the JDK.
\r
478 CharsetEncoder encoder = new CharsetProviderICU().charsetForName("IBM420").newEncoder();
\r
480 m = _testIBM420_ar_rtl(s, encoder);
\r
481 charsetMatch = m.getName();
\r
482 CheckAssert(charsetMatch.equals("IBM420_rtl"));
\r
484 m = _testIBM420_ar_ltr(s, encoder);
\r
485 charsetMatch = m.getName();
\r
486 CheckAssert(charsetMatch.equals("IBM420_ltr"));
\r
490 private CharsetMatch _testIBM420_ar_rtl(String s, CharsetEncoder encoder) throws Exception {
\r
491 CharsetDetector det = new CharsetDetector();
\r
492 det.setText(encoder.encode(CharBuffer.wrap(s)).array());
\r
493 CharsetMatch m = det.detect();
\r
497 private CharsetMatch _testIBM420_ar_ltr(String s, CharsetEncoder encoder) throws Exception {
\r
499 * transformation of input string to CP420 left to right requires reversing the string
\r
502 StringBuffer ltrStrBuf = new StringBuffer(s);
\r
503 ltrStrBuf = ltrStrBuf.reverse();
\r
505 CharsetDetector det = new CharsetDetector();
\r
506 det.setText(encoder.encode(CharBuffer.wrap(ltrStrBuf.toString())).array());
\r
507 CharsetMatch m = det.detect();
\r
511 private CharsetMatch _test1256(String s) throws Exception {
\r
513 byte [] bytes = s.getBytes("windows-1256");
\r
514 CharsetDetector det = new CharsetDetector();
\r
515 det.setText(bytes);
\r
516 CharsetMatch m = det.detect();
\r
521 public void TestHebrew() throws Exception {
\r
522 String s = "\u05D4\u05E4\u05E8\u05E7\u05DC\u05D9\u05D8 \u05D4\u05E6\u05D1\u05D0\u05D9 \u05D4" +
\r
523 "\u05E8\u05D0\u05E9\u05D9, \u05EA\u05EA \u05D0\u05DC\u05D5\u05E3 \u05D0\u05D1\u05D9" +
\r
524 "\u05D7\u05D9 \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8, \u05D4\u05D5\u05E8" +
\r
525 "\u05D4 \u05E2\u05DC \u05E4\u05EA\u05D9\u05D7\u05EA \u05D7\u05E7\u05D9\u05E8\u05EA " +
\r
526 "\u05DE\u05E6\"\u05D7 \u05D1\u05E2\u05E7\u05D1\u05D5\u05EA \u05E2\u05D3\u05D5\u05D9" +
\r
527 "\u05D5\u05EA \u05D7\u05D9\u05D9\u05DC\u05D9 \u05E6\u05D4\"\u05DC \u05DE\u05DE\u05D1" +
\r
528 "\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4 \u05D1+ " +
\r
529 "\u05E8\u05E6\u05D5\u05E2\u05EA \u05E2\u05D6\u05D4. \u05DC\u05D3\u05D1\u05E8\u05D9 " +
\r
530 "\u05D4\u05E4\u05E6\"\u05E8, \u05DE\u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA \u05E2" +
\r
531 "\u05D5\u05DC\u05D4 \u05EA\u05DE\u05D5\u05E0\u05D4 \u05E9\u05DC \"\u05D4\u05EA\u05E0" +
\r
532 "\u05D4\u05D2\u05D5\u05EA \u05E4\u05E1\u05D5\u05DC\u05D4 \u05DC\u05DB\u05D0\u05D5\u05E8" +
\r
533 "\u05D4 \u05E9\u05DC \u05D7\u05D9\u05D9\u05DC\u05D9\u05DD \u05D1\u05DE\u05D4\u05DC\u05DA" +
\r
534 " \u05DE\u05D1\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4\"." +
\r
535 " \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8 \u05E7\u05D9\u05D1\u05DC \u05D0\u05EA" +
\r
536 " \u05D4\u05D7\u05DC\u05D8\u05EA\u05D5 \u05DC\u05D0\u05D7\u05E8 \u05E9\u05E2\u05D9\u05D9" +
\r
537 "\u05DF \u05D1\u05EA\u05DE\u05DC\u05D9\u05DC \u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA";
\r
539 CharsetMatch m = _test1255(s);
\r
540 String charsetMatch = m.getName();
\r
541 CheckAssert(charsetMatch.equals("ISO-8859-8"));
\r
543 m = _testIBM424_he_rtl(s);
\r
544 charsetMatch = m.getName();
\r
545 CheckAssert(charsetMatch.equals("IBM424_rtl"));
\r
547 m = _testIBM424_he_ltr(s);
\r
548 charsetMatch = m.getName();
\r
549 CheckAssert(charsetMatch.equals("IBM424_ltr"));
\r
552 private CharsetMatch _test1255(String s) throws Exception {
\r
553 byte [] bytes = s.getBytes("ISO-8859-8");
\r
554 CharsetDetector det = new CharsetDetector();
\r
555 det.setText(bytes);
\r
556 CharsetMatch m = det.detect();
\r
560 private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
\r
561 byte [] bytes = s.getBytes("IBM424");
\r
562 CharsetDetector det = new CharsetDetector();
\r
563 det.setText(bytes);
\r
564 CharsetMatch m = det.detect();
\r
568 private CharsetMatch _testIBM424_he_ltr(String s) throws Exception {
\r
570 * transformation of input string to CP424 left to right requires reversing the string
\r
573 StringBuffer ltrStrBuf = new StringBuffer(s);
\r
574 ltrStrBuf = ltrStrBuf.reverse();
\r
575 byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
\r
577 CharsetDetector det = new CharsetDetector();
\r
578 det.setText(bytes);
\r
579 CharsetMatch m = det.detect();
\r