2 *******************************************************************************
3 * Copyright (C) 2005-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.dev.test.charsetdet;
9 import java.io.ByteArrayInputStream;
10 import java.io.InputStream;
11 import java.io.Reader;
12 import java.io.UnsupportedEncodingException;
14 import javax.xml.parsers.DocumentBuilder;
15 import javax.xml.parsers.DocumentBuilderFactory;
17 import org.w3c.dom.Document;
18 import org.w3c.dom.Element;
19 import org.w3c.dom.NamedNodeMap;
20 import org.w3c.dom.Node;
21 import org.w3c.dom.NodeList;
23 import com.ibm.icu.dev.test.TestFmwk;
24 import com.ibm.icu.text.CharsetDetector;
25 import com.ibm.icu.text.CharsetMatch;
31 public class TestCharsetDetector extends TestFmwk
37 public TestCharsetDetector()
41 public static void main(String[] args) {
44 TestCharsetDetector test = new TestCharsetDetector();
53 private void CheckAssert(boolean exp) {
57 throw new Exception();
60 StackTraceElement failPoint = e.getStackTrace()[1];
61 msg = "Test failure in file " + failPoint.getFileName() +
62 " at line " + failPoint.getLineNumber();
69 private String stringFromReader(Reader reader)
71 StringBuffer sb = new StringBuffer();
72 char[] buffer = new char[1024];
76 while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) {
77 sb.append(buffer, 0, bytesRead);
81 } catch (Exception e) {
82 errln("stringFromReader() failed: " + e.toString());
87 public void TestConstruction() {
89 CharsetDetector det = new CharsetDetector();
91 errln("Could not construct a charset detector");
93 String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
94 CheckAssert(charsetNames.length != 0);
95 for (i=0; i<charsetNames.length; i++) {
96 CheckAssert(charsetNames[i].equals("") == false);
97 // System.out.println("\"" + charsetNames[i] + "\"");
101 public void TestInputFilter() throws Exception
103 String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
104 byte[] bytes = s.getBytes("ISO-8859-1");
105 CharsetDetector det = new CharsetDetector();
108 det.enableInputFilter(true);
109 if (!det.inputFilterEnabled()){
110 errln("input filter should be enabled");
116 if (! m.getLanguage().equals("fr")) {
117 errln("input filter did not strip markup!");
120 det.enableInputFilter(false);
124 if (! m.getLanguage().equals("en")) {
125 errln("unfiltered input did not detect as English!");
129 public void TestUTF8() throws Exception {
131 String s = "This is a string with some non-ascii characters that will " +
132 "be converted to UTF-8, then shoved through the detection process. " +
133 "\u0391\u0392\u0393\u0394\u0395" +
134 "Sure would be nice if our source could contain Unicode directly!";
135 byte [] bytes = s.getBytes("UTF-8");
136 CharsetDetector det = new CharsetDetector();
140 retrievedS = det.getString(bytes, "UTF-8");
141 CheckAssert(s.equals(retrievedS));
143 reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8");
144 CheckAssert(s.equals(stringFromReader(reader)));
145 det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
148 public void TestUTF16() throws Exception
151 "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
152 "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
154 byte[] beBytes = source.getBytes("UnicodeBig");
155 byte[] leBytes = source.getBytes("UnicodeLittle");
156 CharsetDetector det = new CharsetDetector();
159 det.setText(beBytes);
162 if (! m.getName().equals("UTF-16BE")) {
163 errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
166 det.setText(leBytes);
169 if (! m.getName().equals("UTF-16LE")) {
170 errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
173 // Jitterbug 4451, for coverage
174 int confidence = m.getConfidence();
175 if(confidence != 100){
176 errln("Did not get the expected confidence level " + confidence);
178 int matchType = m.getMatchType();
180 errln("Did not get the expected matchType level " + matchType);
184 public void TestC1Bytes() throws Exception
187 "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
190 "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";
192 byte[] bISO = sISO.getBytes("ISO-8859-1");
193 byte[] bWindows = sWindows.getBytes("windows-1252");
195 CharsetDetector det = new CharsetDetector();
198 det.setText(bWindows);
201 if (m.getName() != "windows-1252") {
202 errln("Text with C1 bytes not correctly detected as windows-1252.");
209 if (m.getName() != "ISO-8859-1") {
210 errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
214 public void TestShortInput() {
215 // Test that detection with very short byte strings does not crash and burn.
216 // The shortest input that should produce positive detection result is two bytes,
218 // TODO: Detector confidence levels needs to be refined for very short input.
219 // Too high now, for some charsets that happen to be compatible with a few bytes of input.
220 byte [][] shortBytes = new byte [][]
224 {(byte)'A', (byte)'B'},
225 {(byte)'A', (byte)'B', (byte)'C'},
226 {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
229 CharsetDetector det = new CharsetDetector();
231 for (int i=0; i<shortBytes.length; i++) {
232 det.setText(shortBytes[i]);
234 logln("i=" + i + " -> " + m.getName());
238 public void TestBufferOverflow()
240 byte testStrings[][] = {
241 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b}, /* A partial ISO-2022 shift state at the end */
242 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24}, /* A partial ISO-2022 shift state at the end */
243 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28}, /* A partial ISO-2022 shift state at the end */
244 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end with a bad one at the start */
245 {(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end */
246 {(byte) 0xa1}, /* Could be a single byte shift-jis at the end */
247 {(byte) 0x74, (byte) 0x68, (byte) 0xa1}, /* Could be a single byte shift-jis at the end */
248 {(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1} /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
251 String testResults[] = {
262 CharsetDetector det = new CharsetDetector();
265 det.setDeclaredEncoding("ISO-2022-JP");
267 for (int idx = 0; idx < testStrings.length; idx += 1) {
268 det.setText(testStrings[idx]);
269 match = det.detect();
272 if (testResults[idx] != null) {
273 errln("Unexpectedly got no results at index " + idx);
276 logln("Got no result as expected at index " + idx);
281 if (testResults[idx] == null || ! testResults[idx].equals(match.getName())) {
282 errln("Unexpectedly got " + match.getName() + " instead of " + testResults[idx] +
283 " at index " + idx + " with confidence " + match.getConfidence());
289 public void TestDetection()
292 // Open and read the test data file.
294 //InputStreamReader isr = null;
297 InputStream is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
299 errln("Could not open test data file CharsetDetectionTests.xml");
303 //isr = new InputStreamReader(is, "UTF-8");
305 // Set up an xml parser.
306 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
308 factory.setIgnoringComments(true);
310 DocumentBuilder builder = factory.newDocumentBuilder();
312 // Parse the xml content from the test case file.
313 Document doc = builder.parse(is, null);
314 Element root = doc.getDocumentElement();
316 NodeList testCases = root.getElementsByTagName("test-case");
318 // Process each test case
319 for (int n = 0; n < testCases.getLength(); n += 1) {
320 Node testCase = testCases.item(n);
321 NamedNodeMap attrs = testCase.getAttributes();
322 NodeList testData = testCase.getChildNodes();
323 StringBuffer testText = new StringBuffer();
324 String id = attrs.getNamedItem("id").getNodeValue();
325 String encodings = attrs.getNamedItem("encodings").getNodeValue();
327 // Collect the test case text.
328 for (int t = 0; t < testData.getLength(); t += 1) {
329 Node textNode = testData.item(t);
331 testText.append(textNode.getNodeValue());
334 // Process test text with each encoding / language pair.
335 String testString = testText.toString();
336 String[] encodingList = encodings.split(" ");
337 for (int e = 0; e < encodingList.length; e += 1) {
338 checkEncoding(testString, encodingList[e], id);
342 } catch (Exception e) {
343 errln("exception while processing test cases: " + e.toString());
347 private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
349 CharsetMatch m = det.detect();
352 if (! m.getName().equals(encoding)) {
353 errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
357 String charsetMatchLanguage = m.getLanguage();
358 if ((language != null && !charsetMatchLanguage.equals(language))
359 || (language == null && charsetMatchLanguage != null)
360 || (language != null && charsetMatchLanguage == null))
362 errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
365 if (encoding.startsWith("UTF-32")) {
369 decoded = m.getString();
371 if (! testString.equals(decoded)) {
372 errln(id + ", " + encoding + ": getString() didn't return the original string!");
375 decoded = stringFromReader(m.getReader());
377 if (! testString.equals(decoded)) {
378 errln(id + ", " + encoding + ": getReader() didn't yield the original string!");
382 private void checkEncoding(String testString, String encoding, String id)
384 String enc = null, lang = null;
385 String[] split = encoding.split("/");
389 if (split.length > 1) {
394 CharsetDetector det = new CharsetDetector();
397 //if (enc.startsWith("UTF-32")) {
398 // UTF32 utf32 = UTF32.getInstance(enc);
400 // bytes = utf32.toBytes(testString);
406 bytes = testString.getBytes(from);
407 } catch (UnsupportedOperationException uoe) {
408 // In some runtimes, the ISO-2022-CN converter
409 // only converts *to* Unicode - we have to use
410 // x-ISO-2022-CN-GB to convert *from* Unicode.
411 if (from.equals("ISO-2022-CN")) {
412 from = "x-ISO-2022-CN-GB";
416 // Ignore any other converters that can't
417 // convert from Unicode.
419 } catch (UnsupportedEncodingException uee) {
420 // Ignore any encodings that this runtime
430 checkMatch(det, testString, enc, lang, id);
432 det.setText(new ByteArrayInputStream(bytes));
433 checkMatch(det, testString, enc, lang, id);
434 } catch (Exception e) {
435 errln(id + ": " + e.toString() + "enc=" + enc);
440 public void TestJapanese() throws Exception {
441 String s = "\u3000\u3001\u3002\u3003\u3005\u3006\u3007\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011\u3012\u3013\u3014" +
442 "\u3015\u301C\u3041\u3042\u3043\u3044\u3045\u3046\u3047\u3048\u3049\u304A\u304B\u304C\u304D\u304E\u304F\u3050\u3051\u3052" +
443 "\u3053\u3054\u3055\u3056\u3057\u3058\u3059\u305A\u305B\u305C\u305D\u305E\u305F\u3060\u3061\u3062\u3063\u3064\u3065\u3066" +
444 "\u3067\u3068\u3069\u306A\u306B\u306C\u306D\u306E\u306F\u3070\u3071\u3072\u3073\u3074\u3075\u3076\u3077\u3078\u3079\u307A" +
445 "\u307B\u307C\u307D\u307E\u307F\u3080\u3081\u3082\u3083\u3084\u3085\u3086\u3087\u3088\u3089\u308A\u308B\u308C\u308D\u308E" +
446 "\u308F\u3090\u3091\u3092\u3093\u309B\u309C\u309D\u309E\u30A1\u30A2\u30A3\u30A4\u30A5\u30A6\u30A7\u30A8\u30A9\u30AA\u30AB" +
447 "\u30AC\u30AD\u30AE\u30AF\u30B0\u30B1\u30B2\u30B3\u30B4\u30B5\u30B6\u30B7\u30B8\u30B9\u30BA\u30BB\u30BC\u30BD\u30BE\u30BF" +
448 "\u30C0\u30C1\u30C2\u30C3\u30C4\u30C5\u30C6\u30C7\u30C8\u30C9\u30CA\u30CB\u30CC\u30CD\u30CE\u30CF\u30D0\u30D1\u30D2\u30D3" +
449 "\u30D4\u30D5\u30D6\u30D7\u30D8\u30D9\u30DA\u30DB\u30DC\u30DD\u30DE\u30DF\u30E0\u30E1\u30E2\u30E3\u30E4\u30E5\u30E6\u30E7" +
450 "\u30E8\u30E9\u30EA\u30EB\u30EC\u30ED\u30EE\u30EF\u30F0\u30F1\u30F2\u30F3\u30F4\u30F5\u30F6\u30FB\u30FC\u30FD\u30FE\u4E00" +
451 "\u4E01\u4E02\u4E03\u4E04\u4E05\u4E07\u4E08\u4E09\u4E0A\u4E0B\u4E0C\u4E0D\u4E0E\u4E10\u4E11\u4E12\u4E14\u4E15\u4E16\u4E17" +
452 "\u4E18\u4E19\u4E1E\u4E1F\u4E21\u4E23\u4E24\u4E26\u4E28\u4E2A\u4E2B\u4E2D\u4E2E\u4E2F\u4E30\u4E31\u4E32\u4E35\u4E36\u4E38" +
453 "\u4E39\u4E3B\u4E3C\u4E3F\u4E40\u4E41\u4E42\u4E43\u4E44\u4E45\u4E47\u4E4B\u4E4D\u4E4E\u4E4F\u4E51\u4E55\u4E56\u4E57\u4E58" +
454 "\u4E59\u4E5A\u4E5C\u4E5D\u4E5E\u4E5F\u4E62\u4E63\u4E68\u4E69\u4E71\u4E73\u4E74\u4E75\u4E79\u4E7E\u4E7F\u4E80\u4E82\u4E85" +
455 "\u4E86\u4E88\u4E89\u4E8A\u4E8B\u4E8C";
457 CharsetDetector det = new CharsetDetector();
462 bytes = s.getBytes("EUC-JP");
465 charsetMatch = m.getName();
466 CheckAssert(charsetMatch.equals("EUC-JP"));
468 // Tests "public String getLanguage()"
469 CheckAssert(m.getLanguage().equals("ja"));
473 public void TestArabic() throws Exception {
474 String s = "\u0648\u0636\u0639\u062A \u0648\u0646\u0641\u0630\u062A \u0628\u0631\u0627" +
475 "\u0645\u062C \u062A\u0623\u0645\u064A\u0646 \u0639\u062F\u064A\u062F\u0629 \u0641\u064A " +
476 "\u0645\u0624\u0633\u0633\u0629 \u0627\u0644\u062A\u0623\u0645\u064A\u0646 \u0627\u0644" +
477 "\u0648\u0637\u0646\u064A, \u0645\u0639 \u0645\u0644\u0627\u0626\u0645\u062A\u0647\u0627 " +
478 "\u062F\u0627\u0626\u0645\u0627 \u0644\u0644\u0627\u062D\u062A\u064A\u0627\u062C" +
479 "\u0627\u062A \u0627\u0644\u0645\u062A\u063A\u064A\u0631\u0629 \u0644\u0644\u0645\u062C" +
480 "\u062A\u0645\u0639 \u0648\u0644\u0644\u062F\u0648\u0644\u0629. \u062A\u0648\u0633\u0639" +
481 "\u062A \u0648\u062A\u0637\u0648\u0631\u062A \u0627\u0644\u0645\u0624\u0633\u0633\u0629 " +
482 "\u0628\u0647\u062F\u0641 \u0636\u0645\u0627\u0646 \u0634\u0628\u0643\u0629 \u0623\u0645" +
483 "\u0627\u0646 \u0644\u0633\u0643\u0627\u0646 \u062F\u0648\u0644\u0629 \u0627\u0633\u0631" +
484 "\u0627\u0626\u064A\u0644 \u0628\u0648\u062C\u0647 \u0627\u0644\u0645\u062E\u0627\u0637" +
485 "\u0631 \u0627\u0644\u0627\u0642\u062A\u0635\u0627\u062F\u064A\u0629 \u0648\u0627\u0644" +
486 "\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";
488 CharsetDetector det = new CharsetDetector();
493 bytes = s.getBytes("windows-1256");
496 charsetMatch = m.getName();
497 CheckAssert(charsetMatch.equals("windows-1256"));
499 // Tests "public String getLanguage()"
500 CheckAssert(m.getLanguage().endsWith("ar"));
504 // We cannot rely on IBM420 converter in Sun Java
506 bytes = s.getBytes("IBM420");
509 (byte)0xCF, (byte)0x8D, (byte)0x9A, (byte)0x63, (byte)0x40, (byte)0xCF, (byte)0xBD, (byte)0xAB,
510 (byte)0x74, (byte)0x63, (byte)0x40, (byte)0x58, (byte)0x75, (byte)0x56, (byte)0xBB, (byte)0x67,
511 (byte)0x40, (byte)0x63, (byte)0x49, (byte)0xBB, (byte)0xDC, (byte)0xBD, (byte)0x40, (byte)0x9A,
512 (byte)0x73, (byte)0xDC, (byte)0x73, (byte)0x62, (byte)0x40, (byte)0xAB, (byte)0xDC, (byte)0x40,
513 (byte)0xBB, (byte)0x52, (byte)0x77, (byte)0x77, (byte)0x62, (byte)0x40, (byte)0x56, (byte)0xB1,
514 (byte)0x63, (byte)0x49, (byte)0xBB, (byte)0xDC, (byte)0xBD, (byte)0x40, (byte)0x56, (byte)0xB1,
515 (byte)0xCF, (byte)0x8F, (byte)0xBD, (byte)0xDC, (byte)0x6B, (byte)0x40, (byte)0xBB, (byte)0x9A,
516 (byte)0x40, (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x55, (byte)0xBB, (byte)0x63, (byte)0xBF,
517 (byte)0x56, (byte)0x40, (byte)0x73, (byte)0x56, (byte)0x55, (byte)0xBB, (byte)0x56, (byte)0x40,
518 (byte)0xB1, (byte)0xB1, (byte)0x56, (byte)0x69, (byte)0x63, (byte)0xDC, (byte)0x56, (byte)0x67,
519 (byte)0x56, (byte)0x63, (byte)0x40, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x63, (byte)0x9E,
520 (byte)0xDC, (byte)0x75, (byte)0x62, (byte)0x40, (byte)0xB1, (byte)0xB1, (byte)0xBB, (byte)0x67,
521 (byte)0x63, (byte)0xBB, (byte)0x9A, (byte)0x40, (byte)0xCF, (byte)0xB1, (byte)0xB1, (byte)0x73,
522 (byte)0xCF, (byte)0xB1, (byte)0x62, (byte)0x4B, (byte)0x40, (byte)0x63, (byte)0xCF, (byte)0x77,
523 (byte)0x9A, (byte)0x63, (byte)0x40, (byte)0xCF, (byte)0x63, (byte)0x8F, (byte)0xCF, (byte)0x75,
524 (byte)0x63, (byte)0x40, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x52, (byte)0x77, (byte)0x77,
525 (byte)0x62, (byte)0x40, (byte)0x58, (byte)0xBF, (byte)0x73, (byte)0xAB, (byte)0x40, (byte)0x8D,
526 (byte)0xBB, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0x80, (byte)0x58, (byte)0xAF, (byte)0x62,
527 (byte)0x40, (byte)0x49, (byte)0xBB, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0xB1, (byte)0x77,
528 (byte)0xAF, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0x73, (byte)0xCF, (byte)0xB1, (byte)0x62,
529 (byte)0x40, (byte)0x56, (byte)0x77, (byte)0x75, (byte)0x56, (byte)0x55, (byte)0xDC, (byte)0xB1,
530 (byte)0x40, (byte)0x58, (byte)0xCF, (byte)0x67, (byte)0xBF, (byte)0x40, (byte)0x56, (byte)0xB1,
531 (byte)0xBB, (byte)0x71, (byte)0x56, (byte)0x8F, (byte)0x75, (byte)0x40, (byte)0x56, (byte)0xB1,
532 (byte)0x56, (byte)0xAD, (byte)0x63, (byte)0x8B, (byte)0x56, (byte)0x73, (byte)0xDC, (byte)0x62,
533 (byte)0x40, (byte)0xCF, (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0x67, (byte)0x63, (byte)0xBB,
534 (byte)0x56, (byte)0x9A, (byte)0xDC, (byte)0x62, (byte)0x4B,
538 charsetMatch = m.getName();
539 CheckAssert(charsetMatch.equals("IBM420_rtl"));
541 // Tests "public String getLanguage()"
542 CheckAssert(m.getLanguage().endsWith("ar"));
546 // We cannot rely on IBM420 converter in Sun Java
548 StringBuffer ltrStrBuf = new StringBuffer(s);
549 ltrStrBuf = ltrStrBuf.reverse();
550 bytes = ltrStrBuf.toString().getBytes("IBM420");
553 (byte)0x4B, (byte)0x62, (byte)0xDC, (byte)0x9A, (byte)0x56, (byte)0xBB, (byte)0x63, (byte)0x67,
554 (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0xCF, (byte)0x40, (byte)0x62, (byte)0xDC, (byte)0x73,
555 (byte)0x56, (byte)0x8B, (byte)0x63, (byte)0xAD, (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0x40,
556 (byte)0x75, (byte)0x8F, (byte)0x56, (byte)0x71, (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x40,
557 (byte)0xBF, (byte)0x67, (byte)0xCF, (byte)0x58, (byte)0x40, (byte)0xB1, (byte)0xDC, (byte)0x55,
558 (byte)0x56, (byte)0x75, (byte)0x77, (byte)0x56, (byte)0x40, (byte)0x62, (byte)0xB1, (byte)0xCF,
559 (byte)0x73, (byte)0x40, (byte)0xBD, (byte)0x56, (byte)0xAF, (byte)0x77, (byte)0xB1, (byte)0x40,
560 (byte)0xBD, (byte)0x56, (byte)0xBB, (byte)0x49, (byte)0x40, (byte)0x62, (byte)0xAF, (byte)0x58,
561 (byte)0x80, (byte)0x40, (byte)0xBD, (byte)0x56, (byte)0xBB, (byte)0x8D, (byte)0x40, (byte)0xAB,
562 (byte)0x73, (byte)0xBF, (byte)0x58, (byte)0x40, (byte)0x62, (byte)0x77, (byte)0x77, (byte)0x52,
563 (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x40, (byte)0x63, (byte)0x75, (byte)0xCF, (byte)0x8F,
564 (byte)0x63, (byte)0xCF, (byte)0x40, (byte)0x63, (byte)0x9A, (byte)0x77, (byte)0xCF, (byte)0x63,
565 (byte)0x40, (byte)0x4B, (byte)0x62, (byte)0xB1, (byte)0xCF, (byte)0x73, (byte)0xB1, (byte)0xB1,
566 (byte)0xCF, (byte)0x40, (byte)0x9A, (byte)0xBB, (byte)0x63, (byte)0x67, (byte)0xBB, (byte)0xB1,
567 (byte)0xB1, (byte)0x40, (byte)0x62, (byte)0x75, (byte)0xDC, (byte)0x9E, (byte)0x63, (byte)0xBB,
568 (byte)0xB1, (byte)0x56, (byte)0x40, (byte)0x63, (byte)0x56, (byte)0x67, (byte)0x56, (byte)0xDC,
569 (byte)0x63, (byte)0x69, (byte)0x56, (byte)0xB1, (byte)0xB1, (byte)0x40, (byte)0x56, (byte)0xBB,
570 (byte)0x55, (byte)0x56, (byte)0x73, (byte)0x40, (byte)0x56, (byte)0xBF, (byte)0x63, (byte)0xBB,
571 (byte)0x55, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x40, (byte)0x9A, (byte)0xBB, (byte)0x40,
572 (byte)0x6B, (byte)0xDC, (byte)0xBD, (byte)0x8F, (byte)0xCF, (byte)0xB1, (byte)0x56, (byte)0x40,
573 (byte)0xBD, (byte)0xDC, (byte)0xBB, (byte)0x49, (byte)0x63, (byte)0xB1, (byte)0x56, (byte)0x40,
574 (byte)0x62, (byte)0x77, (byte)0x77, (byte)0x52, (byte)0xBB, (byte)0x40, (byte)0xDC, (byte)0xAB,
575 (byte)0x40, (byte)0x62, (byte)0x73, (byte)0xDC, (byte)0x73, (byte)0x9A, (byte)0x40, (byte)0xBD,
576 (byte)0xDC, (byte)0xBB, (byte)0x49, (byte)0x63, (byte)0x40, (byte)0x67, (byte)0xBB, (byte)0x56,
577 (byte)0x75, (byte)0x58, (byte)0x40, (byte)0x63, (byte)0x74, (byte)0xAB, (byte)0xBD, (byte)0xCF,
578 (byte)0x40, (byte)0x63, (byte)0x9A, (byte)0x8D, (byte)0xCF,
583 charsetMatch = m.getName();
584 CheckAssert(charsetMatch.equals("IBM420_ltr"));
588 public void TestHebrew() throws Exception {
589 String s = "\u05D4\u05E4\u05E8\u05E7\u05DC\u05D9\u05D8 \u05D4\u05E6\u05D1\u05D0\u05D9 \u05D4" +
590 "\u05E8\u05D0\u05E9\u05D9, \u05EA\u05EA \u05D0\u05DC\u05D5\u05E3 \u05D0\u05D1\u05D9" +
591 "\u05D7\u05D9 \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8, \u05D4\u05D5\u05E8" +
592 "\u05D4 \u05E2\u05DC \u05E4\u05EA\u05D9\u05D7\u05EA \u05D7\u05E7\u05D9\u05E8\u05EA " +
593 "\u05DE\u05E6\"\u05D7 \u05D1\u05E2\u05E7\u05D1\u05D5\u05EA \u05E2\u05D3\u05D5\u05D9" +
594 "\u05D5\u05EA \u05D7\u05D9\u05D9\u05DC\u05D9 \u05E6\u05D4\"\u05DC \u05DE\u05DE\u05D1" +
595 "\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4 \u05D1+ " +
596 "\u05E8\u05E6\u05D5\u05E2\u05EA \u05E2\u05D6\u05D4. \u05DC\u05D3\u05D1\u05E8\u05D9 " +
597 "\u05D4\u05E4\u05E6\"\u05E8, \u05DE\u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA \u05E2" +
598 "\u05D5\u05DC\u05D4 \u05EA\u05DE\u05D5\u05E0\u05D4 \u05E9\u05DC \"\u05D4\u05EA\u05E0" +
599 "\u05D4\u05D2\u05D5\u05EA \u05E4\u05E1\u05D5\u05DC\u05D4 \u05DC\u05DB\u05D0\u05D5\u05E8" +
600 "\u05D4 \u05E9\u05DC \u05D7\u05D9\u05D9\u05DC\u05D9\u05DD \u05D1\u05DE\u05D4\u05DC\u05DA" +
601 " \u05DE\u05D1\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4\"." +
602 " \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8 \u05E7\u05D9\u05D1\u05DC \u05D0\u05EA" +
603 " \u05D4\u05D7\u05DC\u05D8\u05EA\u05D5 \u05DC\u05D0\u05D7\u05E8 \u05E9\u05E2\u05D9\u05D9" +
604 "\u05DF \u05D1\u05EA\u05DE\u05DC\u05D9\u05DC \u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA";
606 CharsetMatch m = _test1255(s);
607 String charsetMatch = m.getName();
608 CheckAssert(charsetMatch.equals("ISO-8859-8"));
609 CheckAssert(m.getLanguage().equals("he"));
611 m = _test1255_reverse(s);
612 charsetMatch = m.getName();
613 CheckAssert(charsetMatch.equals("ISO-8859-8"));
614 CheckAssert(m.getLanguage().equals("he"));
616 m = _testIBM424_he_rtl(s);
617 charsetMatch = m.getName();
618 CheckAssert(charsetMatch.equals("IBM424_rtl"));
619 CheckAssert(m.getLanguage().equals("he"));
621 m = _testIBM424_he_ltr(s);
622 charsetMatch = m.getName();
623 CheckAssert(charsetMatch.equals("IBM424_ltr"));
624 CheckAssert(m.getLanguage().equals("he"));
627 private CharsetMatch _test1255(String s) throws Exception {
628 byte [] bytes = s.getBytes("ISO-8859-8");
629 CharsetDetector det = new CharsetDetector();
631 CharsetMatch m = det.detect();
635 private CharsetMatch _test1255_reverse(String s) throws Exception {
636 StringBuffer reverseStrBuf = new StringBuffer(s);
637 reverseStrBuf = reverseStrBuf.reverse();
638 byte [] bytes = reverseStrBuf.toString().getBytes("ISO-8859-8");
640 CharsetDetector det = new CharsetDetector();
642 CharsetMatch m = det.detect();
646 private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
647 byte [] bytes = s.getBytes("IBM424");
648 CharsetDetector det = new CharsetDetector();
650 CharsetMatch m = det.detect();
654 private CharsetMatch _testIBM424_he_ltr(String s) throws Exception {
656 * transformation of input string to CP420 left to right requires reversing the string
659 StringBuffer ltrStrBuf = new StringBuffer(s);
660 ltrStrBuf = ltrStrBuf.reverse();
661 byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
663 CharsetDetector det = new CharsetDetector();
665 CharsetMatch m = det.detect();
670 * Test the method int match(CharsetDetector det) in CharsetRecog_UTF_16_LE
672 public void TestCharsetRecog_UTF_16_LE_Match() {
673 byte[] in = { Byte.MIN_VALUE, Byte.MIN_VALUE, Byte.MIN_VALUE, Byte.MIN_VALUE };
674 CharsetDetector cd = new CharsetDetector();
675 // Tests when if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) is true inside the
676 // match(CharsetDetector) method of CharsetRecog_UTF_16_LE
679 } catch (Exception e) {
680 errln("CharsetRecog_UTF_16_LE.match(CharsetDetector) was not suppose to return an exception.");