jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java

   1 //##header J2SE15
   2 /**
   3  *******************************************************************************
   4  * Copyright (C) 2005-2009, International Business Machines Corporation and    *
   5  * others. All Rights Reserved.                                                *
   6  *******************************************************************************
   7  */
   8 package com.ibm.icu.dev.test.charsetdet;
   9
  10 import java.io.ByteArrayInputStream;
  11 import java.io.Reader;
  12
  13 import com.ibm.icu.dev.test.TestFmwk;
  14 import com.ibm.icu.text.CharsetDetector;
  15 import com.ibm.icu.text.CharsetMatch;
  16
  17 //#if defined(FOUNDATION10) || defined(J2SE13)
  18 //#else
  19 import java.io.InputStream;
  20 import java.io.UnsupportedEncodingException;
  21
  22 import javax.xml.parsers.DocumentBuilder;
  23 import javax.xml.parsers.DocumentBuilderFactory;
  24
  25 import org.w3c.dom.Document;
  26 import org.w3c.dom.Element;
  27 import org.w3c.dom.NamedNodeMap;
  28 import org.w3c.dom.Node;
  29 import org.w3c.dom.NodeList;
  30
  31 import com.ibm.icu.charset.CharsetProviderICU;
  32 import java.nio.charset.CharsetEncoder;
  33 import java.nio.CharBuffer;
  34 //#endif
  35
  36
  37 /**
  38  * @author andy
  39  */
  40 public class TestCharsetDetector extends TestFmwk
  41 {
  42
  43     /**
  44      * Constructor
  45      */
  46     public TestCharsetDetector()
  47     {
  48     }
  49
  50     public static void main(String[] args) {
  51         try
  52         {
  53             TestCharsetDetector test = new TestCharsetDetector();
  54             test.run(args);
  55         }
  56         catch (Exception e)
  57         {
  58             e.printStackTrace();
  59         }
  60     }
  61
  62     private void CheckAssert(boolean exp) {
  63         if (exp == false) {
  64             String msg;
  65             try {
  66                 throw new Exception();
  67             }
  68             catch (Exception e) {
  69 //#if defined(FOUNDATION10) || defined(J2SE13)
  70 //##           msg = "Test failure  " + e.getMessage() ;
  71 //#else
  72                 StackTraceElement failPoint = e.getStackTrace()[1];
  73                 msg = "Test failure in file " + failPoint.getFileName() +
  74                              " at line " + failPoint.getLineNumber();
  75 //#endif
  76             }
  77             errln(msg);
  78         }
  79
  80     }
  81
  82     private String stringFromReader(Reader reader)
  83     {
  84         StringBuffer sb = new StringBuffer();
  85         char[] buffer   = new char[1024];
  86         int bytesRead   = 0;
  87
  88         try {
  89             while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) {
  90                 sb.append(buffer, 0, bytesRead);
  91             }
  92
  93             return sb.toString();
  94         } catch (Exception e) {
  95             errln("stringFromReader() failed: " + e.toString());
  96             return null;
  97         }
  98     }
  99
 100     public void TestConstruction() {
 101         int i;
 102         CharsetDetector  det = new CharsetDetector();
 103         if(det==null){
 104             errln("Could not construct a charset detector");
 105         }
 106         String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
 107         CheckAssert(charsetNames.length != 0);
 108         for (i=0; i<charsetNames.length; i++) {
 109             CheckAssert(charsetNames[i].equals("") == false);
 110             // System.out.println("\"" + charsetNames[i] + "\"");
 111         }
 112      }
 113
 114     public void TestInputFilter() throws Exception
 115     {
 116         String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
 117         byte[] bytes = s.getBytes("ISO-8859-1");
 118         CharsetDetector det = new CharsetDetector();
 119         CharsetMatch m;
 120
 121         det.enableInputFilter(true);
 122         if (!det.inputFilterEnabled()){
 123             errln("input filter should be enabled");
 124         }
 125
 126         det.setText(bytes);
 127         m = det.detect();
 128
 129         if (! m.getLanguage().equals("fr")) {
 130             errln("input filter did not strip markup!");
 131         }
 132
 133         det.enableInputFilter(false);
 134         det.setText(bytes);
 135         m = det.detect();
 136
 137         if (! m.getLanguage().equals("en")) {
 138             errln("unfiltered input did not detect as English!");
 139         }
 140     }
 141
 142     public void TestUTF8() throws Exception {
 143
 144         String  s = "This is a string with some non-ascii characters that will " +
 145                     "be converted to UTF-8, then shoved through the detection process.  " +
 146                     "\u0391\u0392\u0393\u0394\u0395" +
 147                     "Sure would be nice if our source could contain Unicode directly!";
 148         byte [] bytes = s.getBytes("UTF-8");
 149         CharsetDetector det = new CharsetDetector();
 150         String retrievedS;
 151         Reader reader;
 152
 153         retrievedS = det.getString(bytes, "UTF-8");
 154         CheckAssert(s.equals(retrievedS));
 155
 156         reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8");
 157         CheckAssert(s.equals(stringFromReader(reader)));
 158         det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
 159     }
 160
 161     public void TestUTF16() throws Exception
 162     {
 163         String source =
 164                 "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
 165                 "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
 166
 167         byte[] beBytes = source.getBytes("UnicodeBig");
 168         byte[] leBytes = source.getBytes("UnicodeLittle");
 169         CharsetDetector det = new CharsetDetector();
 170         CharsetMatch m;
 171
 172         det.setText(beBytes);
 173         m = det.detect();
 174
 175         if (! m.getName().equals("UTF-16BE")) {
 176             errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
 177         }
 178
 179         det.setText(leBytes);
 180         m = det.detect();
 181
 182         if (! m.getName().equals("UTF-16LE")) {
 183             errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
 184         }
 185
 186         // Jitterbug 4451, for coverage
 187         int confidence = m.getConfidence();
 188         if(confidence != 100){
 189             errln("Did not get the expected confidence level " + confidence);
 190         }
 191         int matchType = m.getMatchType();
 192         if(matchType != 0){
 193             errln("Did not get the expected matchType level " + matchType);
 194         }
 195     }
 196
 197     public void TestC1Bytes() throws Exception
 198     {
 199         String sISO =
 200             "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
 201
 202         String sWindows =
 203             "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";
 204
 205         byte[] bISO     = sISO.getBytes("ISO-8859-1");
 206         byte[] bWindows = sWindows.getBytes("windows-1252");
 207
 208         CharsetDetector det = new CharsetDetector();
 209         CharsetMatch m;
 210
 211         det.setText(bWindows);
 212         m = det.detect();
 213
 214         if (m.getName() != "windows-1252") {
 215             errln("Text with C1 bytes not correctly detected as windows-1252.");
 216             return;
 217         }
 218
 219         det.setText(bISO);
 220         m = det.detect();
 221
 222         if (m.getName() != "ISO-8859-1") {
 223             errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
 224         }
 225     }
 226
 227     public void TestShortInput() {
 228         // Test that detection with very short byte strings does not crash and burn.
 229         // The shortest input that should produce positive detection result is two bytes,
 230         //   a UTF-16 BOM.
 231         // TODO:  Detector confidence levels needs to be refined for very short input.
 232         //        Too high now, for some charsets that happen to be compatible with a few bytes of input.
 233         byte [][]  shortBytes = new byte [][]
 234             {
 235                 {},
 236                 {(byte)0x0a},
 237                 {(byte)'A', (byte)'B'},
 238                 {(byte)'A', (byte)'B', (byte)'C'},
 239                 {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
 240             };
 241
 242         CharsetDetector det = new CharsetDetector();
 243         CharsetMatch m;
 244         for (int i=0; i<shortBytes.length; i++) {
 245             det.setText(shortBytes[i]);
 246             m = det.detect();
 247             logln("i=" + i + " -> " + m.getName());
 248         }
 249     }
 250
 251     public void TestBufferOverflow()
 252     {
 253         byte testStrings[][] = {
 254             {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b}, /* A partial ISO-2022 shift state at the end */
 255             {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24}, /* A partial ISO-2022 shift state at the end */
 256             {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28}, /* A partial ISO-2022 shift state at the end */
 257             {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end with a bad one at the start */
 258             {(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end */
 259             {(byte) 0xa1}, /* Could be a single byte shift-jis at the end */
 260             {(byte) 0x74, (byte) 0x68, (byte) 0xa1}, /* Could be a single byte shift-jis at the end */
 261             {(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1} /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
 262         };
 263
 264         String testResults[] = {
 265             "windows-1252",
 266             "windows-1252",
 267             "windows-1252",
 268             "windows-1252",
 269             "ISO-2022-JP",
 270             null,
 271             null,
 272             "ISO-8859-1"
 273         };
 274
 275         CharsetDetector det = new CharsetDetector();
 276         CharsetMatch match;
 277
 278         det.setDeclaredEncoding("ISO-2022-JP");
 279
 280         for (int idx = 0; idx < testStrings.length; idx += 1) {
 281             det.setText(testStrings[idx]);
 282             match = det.detect();
 283
 284             if (match == null) {
 285                 if (testResults[idx] != null) {
 286                     errln("Unexpectedly got no results at index " + idx);
 287                 }
 288                 else {
 289                     logln("Got no result as expected at index " + idx);
 290                 }
 291                 continue;
 292             }
 293
 294             if (testResults[idx] == null || ! testResults[idx].equals(match.getName())) {
 295                 errln("Unexpectedly got " + match.getName() + " instead of " + testResults[idx] +
 296                       " at index " + idx + " with confidence " + match.getConfidence());
 297                 return;
 298             }
 299         }
 300     }
 301
 302 //#if defined(FOUNDATION10) || defined(J2SE13)
 303 //#else
 304     public void TestDetection()
 305     {
 306         //
 307         //  Open and read the test data file.
 308         //
 309         //InputStreamReader isr = null;
 310
 311         try {
 312             InputStream is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
 313             if (is == null) {
 314                 errln("Could not open test data file CharsetDetectionTests.xml");
 315                 return;
 316             }
 317
 318             //isr = new InputStreamReader(is, "UTF-8");
 319
 320             // Set up an xml parser.
 321             DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
 322
 323             factory.setIgnoringComments(true);
 324
 325             DocumentBuilder builder = factory.newDocumentBuilder();
 326
 327             // Parse the xml content from the test case file.
 328             Document doc = builder.parse(is, null);
 329             Element root = doc.getDocumentElement();
 330
 331             NodeList testCases = root.getElementsByTagName("test-case");
 332
 333             // Process each test case
 334             for (int n = 0; n < testCases.getLength(); n += 1) {
 335                 Node testCase = testCases.item(n);
 336                 NamedNodeMap attrs = testCase.getAttributes();
 337                 NodeList testData  = testCase.getChildNodes();
 338                 StringBuffer testText = new StringBuffer();
 339                 String id = attrs.getNamedItem("id").getNodeValue();
 340                 String encodings = attrs.getNamedItem("encodings").getNodeValue();
 341
 342                 // Collect the test case text.
 343                 for (int t = 0; t < testData.getLength(); t += 1) {
 344                     Node textNode = testData.item(t);
 345
 346                     testText.append(textNode.getNodeValue());
 347                 }
 348
 349                 // Process test text with each encoding / language pair.
 350                 String testString = testText.toString();
 351                 String[] encodingList = encodings.split(" ");
 352                 for (int e = 0; e < encodingList.length; e += 1) {
 353                     checkEncoding(testString, encodingList[e], id);
 354                 }
 355             }
 356
 357         } catch (Exception e) {
 358             errln("exception while processing test cases: " + e.toString());
 359         }
 360     }
 361
 362     private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
 363     {
 364         CharsetMatch m = det.detect();
 365         String decoded;
 366
 367         if (! m.getName().equals(encoding)) {
 368             errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
 369             return;
 370         }
 371
 372         String charsetMatchLanguage = m.getLanguage();
 373         if ((language != null && !charsetMatchLanguage.equals(language))
 374             || (language == null && charsetMatchLanguage != null)
 375             || (language != null && charsetMatchLanguage == null))
 376         {
 377             errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
 378         }
 379
 380         if (encoding.startsWith("UTF-32")) {
 381             return;
 382         }
 383
 384         decoded = m.getString();
 385
 386         if (! testString.equals(decoded)) {
 387             errln(id + ", " + encoding + ": getString() didn't return the original string!");
 388         }
 389
 390         decoded = stringFromReader(m.getReader());
 391
 392         if (! testString.equals(decoded)) {
 393             errln(id + ", " + encoding + ": getReader() didn't yield the original string!");
 394         }
 395     }
 396
 397     private void checkEncoding(String testString, String encoding, String id)
 398     {
 399         String enc = null, lang = null;
 400         String[] split = encoding.split("/");
 401
 402         enc = split[0];
 403
 404         if (split.length > 1) {
 405             lang = split[1];
 406         }
 407
 408         try {
 409             CharsetDetector det = new CharsetDetector();
 410             byte[] bytes;
 411
 412             //if (enc.startsWith("UTF-32")) {
 413             //    UTF32 utf32 = UTF32.getInstance(enc);
 414
 415             //    bytes = utf32.toBytes(testString);
 416             //} else {
 417                 String from = enc;
 418
 419                 while (true) {
 420                     try {
 421                         bytes = testString.getBytes(from);
 422                     } catch (UnsupportedOperationException uoe) {
 423                          // In some runtimes, the ISO-2022-CN converter
 424                          // only converts *to* Unicode - we have to use
 425                          // x-ISO-2022-CN-GB to convert *from* Unicode.
 426                         if (from.equals("ISO-2022-CN")) {
 427                             from = "x-ISO-2022-CN-GB";
 428                             continue;
 429                         }
 430
 431                         // Ignore any other converters that can't
 432                         // convert from Unicode.
 433                         return;
 434                     } catch (UnsupportedEncodingException uee) {
 435                         // Ignore any encodings that this runtime
 436                         // doesn't support.
 437                         return;
 438                     }
 439
 440                     break;
 441                 }
 442             //}
 443
 444             det.setText(bytes);
 445             checkMatch(det, testString, enc, lang, id);
 446
 447             det.setText(new ByteArrayInputStream(bytes));
 448             checkMatch(det, testString, enc, lang, id);
 449          } catch (Exception e) {
 450             errln(id + ": " + e.toString() + "enc=" + enc);
 451             e.printStackTrace();
 452         }
 453     }
 454
 455     public void TestArabic() throws Exception {
 456         String  s = "\u0648\u0636\u0639\u062A \u0648\u0646\u0641\u0630\u062A \u0628\u0631\u0627" +
 457         "\u0645\u062C \u062A\u0623\u0645\u064A\u0646 \u0639\u062F\u064A\u062F\u0629 \u0641\u064A " +
 458         "\u0645\u0624\u0633\u0633\u0629 \u0627\u0644\u062A\u0623\u0645\u064A\u0646 \u0627\u0644"  +
 459         "\u0648\u0637\u0646\u064A, \u0645\u0639 \u0645\u0644\u0627\u0626\u0645\u062A\u0647\u0627 " +
 460         "\u062F\u0627\u0626\u0645\u0627 \u0644\u0644\u0627\u062D\u062A\u064A\u0627\u062C" +
 461         "\u0627\u062A \u0627\u0644\u0645\u062A\u063A\u064A\u0631\u0629 \u0644\u0644\u0645\u062C" +
 462         "\u062A\u0645\u0639 \u0648\u0644\u0644\u062F\u0648\u0644\u0629. \u062A\u0648\u0633\u0639" +
 463         "\u062A \u0648\u062A\u0637\u0648\u0631\u062A \u0627\u0644\u0645\u0624\u0633\u0633\u0629 " +
 464         "\u0628\u0647\u062F\u0641 \u0636\u0645\u0627\u0646 \u0634\u0628\u0643\u0629 \u0623\u0645" +
 465         "\u0627\u0646 \u0644\u0633\u0643\u0627\u0646 \u062F\u0648\u0644\u0629 \u0627\u0633\u0631" +
 466         "\u0627\u0626\u064A\u0644 \u0628\u0648\u062C\u0647 \u0627\u0644\u0645\u062E\u0627\u0637" +
 467         "\u0631 \u0627\u0644\u0627\u0642\u062A\u0635\u0627\u062F\u064A\u0629 \u0648\u0627\u0644" +
 468         "\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";
 469
 470         CharsetMatch m = _test1256(s);
 471         String charsetMatch = m.getName();
 472         CheckAssert(charsetMatch.equals("windows-1256"));
 473
 474         /* Create an encoder to get the bytes.
 475          * Using String.getBytes("IBM420") can produce inconsistent results
 476          * between different versions of the JDK.
 477          */
 478         CharsetEncoder encoder = new CharsetProviderICU().charsetForName("IBM420").newEncoder();
 479
 480         m = _testIBM420_ar_rtl(s, encoder);
 481         charsetMatch = m.getName();
 482         CheckAssert(charsetMatch.equals("IBM420_rtl"));
 483
 484          m = _testIBM420_ar_ltr(s, encoder);
 485         charsetMatch = m.getName();
 486         CheckAssert(charsetMatch.equals("IBM420_ltr"));
 487
 488     }
 489
 490     private CharsetMatch _testIBM420_ar_rtl(String s, CharsetEncoder encoder) throws Exception {
 491         CharsetDetector det = new CharsetDetector();
 492         det.setText(encoder.encode(CharBuffer.wrap(s)).array());
 493         CharsetMatch m = det.detect();
 494         return m;
 495     }
 496
 497     private CharsetMatch _testIBM420_ar_ltr(String s, CharsetEncoder encoder) throws Exception {
 498         /**
 499          * transformation of input string to CP420 left to right requires reversing the string
 500          */
 501
 502         StringBuffer ltrStrBuf = new StringBuffer(s);
 503         ltrStrBuf = ltrStrBuf.reverse();
 504
 505         CharsetDetector det = new CharsetDetector();
 506         det.setText(encoder.encode(CharBuffer.wrap(ltrStrBuf.toString())).array());
 507         CharsetMatch m = det.detect();
 508         return m;
 509     }
 510
 511     private CharsetMatch _test1256(String s) throws Exception {
 512
 513         byte [] bytes = s.getBytes("windows-1256");
 514         CharsetDetector det = new CharsetDetector();
 515         det.setText(bytes);
 516         CharsetMatch m = det.detect();
 517         return m;
 518     }
 519 //#endif
 520
 521     public void TestHebrew() throws Exception {
 522         String  s =  "\u05D4\u05E4\u05E8\u05E7\u05DC\u05D9\u05D8 \u05D4\u05E6\u05D1\u05D0\u05D9 \u05D4" +
 523             "\u05E8\u05D0\u05E9\u05D9, \u05EA\u05EA \u05D0\u05DC\u05D5\u05E3 \u05D0\u05D1\u05D9" +
 524             "\u05D7\u05D9 \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8, \u05D4\u05D5\u05E8" +
 525             "\u05D4 \u05E2\u05DC \u05E4\u05EA\u05D9\u05D7\u05EA \u05D7\u05E7\u05D9\u05E8\u05EA " +
 526             "\u05DE\u05E6\"\u05D7 \u05D1\u05E2\u05E7\u05D1\u05D5\u05EA \u05E2\u05D3\u05D5\u05D9" +
 527             "\u05D5\u05EA \u05D7\u05D9\u05D9\u05DC\u05D9 \u05E6\u05D4\"\u05DC \u05DE\u05DE\u05D1" +
 528             "\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4 \u05D1+ " +
 529             "\u05E8\u05E6\u05D5\u05E2\u05EA \u05E2\u05D6\u05D4. \u05DC\u05D3\u05D1\u05E8\u05D9 " +
 530             "\u05D4\u05E4\u05E6\"\u05E8, \u05DE\u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA \u05E2" +
 531             "\u05D5\u05DC\u05D4 \u05EA\u05DE\u05D5\u05E0\u05D4 \u05E9\u05DC \"\u05D4\u05EA\u05E0" +
 532             "\u05D4\u05D2\u05D5\u05EA \u05E4\u05E1\u05D5\u05DC\u05D4 \u05DC\u05DB\u05D0\u05D5\u05E8" +
 533             "\u05D4 \u05E9\u05DC \u05D7\u05D9\u05D9\u05DC\u05D9\u05DD \u05D1\u05DE\u05D4\u05DC\u05DA" +
 534             " \u05DE\u05D1\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4\"." +
 535             " \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8 \u05E7\u05D9\u05D1\u05DC \u05D0\u05EA" +
 536             " \u05D4\u05D7\u05DC\u05D8\u05EA\u05D5 \u05DC\u05D0\u05D7\u05E8 \u05E9\u05E2\u05D9\u05D9" +
 537             "\u05DF \u05D1\u05EA\u05DE\u05DC\u05D9\u05DC \u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA";
 538
 539         CharsetMatch m = _test1255(s);
 540         String charsetMatch = m.getName();
 541         CheckAssert(charsetMatch.equals("ISO-8859-8"));
 542
 543         m = _testIBM424_he_rtl(s);
 544         charsetMatch = m.getName();
 545         CheckAssert(charsetMatch.equals("IBM424_rtl"));
 546
 547         m = _testIBM424_he_ltr(s);
 548         charsetMatch = m.getName();
 549         CheckAssert(charsetMatch.equals("IBM424_ltr"));
 550     }
 551
 552     private CharsetMatch _test1255(String s) throws Exception {
 553         byte [] bytes = s.getBytes("ISO-8859-8");
 554         CharsetDetector det = new CharsetDetector();
 555         det.setText(bytes);
 556         CharsetMatch m = det.detect();
 557         return m;
 558     }
 559
 560     private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
 561         byte [] bytes = s.getBytes("IBM424");
 562         CharsetDetector det = new CharsetDetector();
 563         det.setText(bytes);
 564         CharsetMatch m = det.detect();
 565         return m;
 566     }
 567
 568     private CharsetMatch _testIBM424_he_ltr(String s) throws Exception {
 569         /**
 570          * transformation of input string to CP420 left to right requires reversing the string
 571          */
 572
 573         StringBuffer ltrStrBuf = new StringBuffer(s);
 574         ltrStrBuf = ltrStrBuf.reverse();
 575         byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
 576
 577         CharsetDetector det = new CharsetDetector();
 578         det.setText(bytes);
 579         CharsetMatch m = det.detect();
 580         return m;
 581     }
 582 }