2 *******************************************************************************
\r
3 * Copyright (C) 2005-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.dev.test.charsetdet;
\r
9 import java.io.ByteArrayInputStream;
\r
10 import java.io.InputStream;
\r
11 import java.io.Reader;
\r
12 import java.io.UnsupportedEncodingException;
\r
14 import javax.xml.parsers.DocumentBuilder;
\r
15 import javax.xml.parsers.DocumentBuilderFactory;
\r
17 import org.w3c.dom.Document;
\r
18 import org.w3c.dom.Element;
\r
19 import org.w3c.dom.NamedNodeMap;
\r
20 import org.w3c.dom.Node;
\r
21 import org.w3c.dom.NodeList;
\r
23 import com.ibm.icu.dev.test.TestFmwk;
\r
24 import com.ibm.icu.text.CharsetDetector;
\r
25 import com.ibm.icu.text.CharsetMatch;
\r
31 public class TestCharsetDetector extends TestFmwk
\r
37 public TestCharsetDetector()
\r
41 public static void main(String[] args) {
\r
44 TestCharsetDetector test = new TestCharsetDetector();
\r
49 e.printStackTrace();
\r
53 private void CheckAssert(boolean exp) { // builds a failure message naming the caller's file and line; NOTE(review): the guard on exp and the reporting of msg are not visible in this listing
\r
57 throw new Exception(); // thrown only so that the catch below can inspect the current call stack
\r
59 catch (Exception e) {
\r
60 StackTraceElement failPoint = e.getStackTrace()[1]; // frame 0 is CheckAssert itself (where the exception was created); frame 1 is its caller
\r
61 msg = "Test failure in file " + failPoint.getFileName() +
\r
62 " at line " + failPoint.getLineNumber();
\r
69 private String stringFromReader(Reader reader) // reads everything the reader yields and returns it as one String
\r
71 StringBuffer sb = new StringBuffer();
\r
72 char[] buffer = new char[1024];
\r
76 while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) { // read() returns -1 at end of stream; despite its name, bytesRead is a count of chars
\r
77 sb.append(buffer, 0, bytesRead); // append only the portion of the buffer filled by this read
\r
80 return sb.toString();
\r
81 } catch (Exception e) {
\r
82 errln("stringFromReader() failed: " + e.toString()); // any failure while reading is reported through errln
\r
87 public void TestConstruction() { // checks that a detector can be created and that the list of detectable charsets is usable
\r
89 CharsetDetector det = new CharsetDetector();
\r
91 errln("Could not construct a charset detector");
\r
93 String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
\r
94 CheckAssert(charsetNames.length != 0); // at least one detectable charset must be reported
\r
95 for (i=0; i<charsetNames.length; i++) {
\r
96 CheckAssert(charsetNames[i].equals("") == false); // no reported charset name may be the empty string
\r
97 // System.out.println("\"" + charsetNames[i] + "\"");
\r
101 public void TestInputFilter() throws Exception // detects the same bytes with the input filter on and off and expects different languages
\r
103 String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>"; // English words only inside angle brackets, French sentence outside them
\r
104 byte[] bytes = s.getBytes("ISO-8859-1");
\r
105 CharsetDetector det = new CharsetDetector();
\r
108 det.enableInputFilter(true);
\r
109 if (!det.inputFilterEnabled()){ // the getter must reflect the setting just made
\r
110 errln("input filter should be enabled");
\r
113 det.setText(bytes);
\r
116 if (! m.getLanguage().equals("fr")) { // filter on: the text outside the tags is expected to decide the language
\r
117 errln("input filter did not strip markup!");
\r
120 det.enableInputFilter(false);
\r
121 det.setText(bytes);
\r
124 if (! m.getLanguage().equals("en")) { // filter off: the English tag names are expected to dominate
\r
125 errln("unfiltered input did not detect as English!");
\r
129 public void TestUTF8() throws Exception { // round-trips a mixed ASCII/Greek string through UTF-8 bytes and the detector's decoding helpers
\r
131 String s = "This is a string with some non-ascii characters that will " +
\r
132 "be converted to UTF-8, then shoved through the detection process. " +
\r
133 "\u0391\u0392\u0393\u0394\u0395" +
\r
134 "Sure would be nice if our source could contain Unicode directly!";
\r
135 byte [] bytes = s.getBytes("UTF-8");
\r
136 CharsetDetector det = new CharsetDetector();
\r
140 retrievedS = det.getString(bytes, "UTF-8"); // decode from a byte array, passing UTF-8 as the second argument
\r
141 CheckAssert(s.equals(retrievedS)); // decoded text must equal the original string
\r
143 reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8"); // same check, but through the stream/Reader path
\r
144 CheckAssert(s.equals(stringFromReader(reader)));
\r
145 det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
\r
148 public void TestUTF16() throws Exception // encodes one Arabic sample as big- and little-endian UTF-16 and checks the detected charset name
\r
151 "\u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " + // fixed: the first escape had lost its backslash ("u0623"), so the sample began with five ASCII characters instead of U+0623
\r
152 "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
\r
154 byte[] beBytes = source.getBytes("UnicodeBig"); // JDK encoding name: UTF-16 big-endian, written with a byte-order mark
\r
155 byte[] leBytes = source.getBytes("UnicodeLittle"); // JDK encoding name: UTF-16 little-endian, written with a byte-order mark
\r
156 CharsetDetector det = new CharsetDetector();
\r
159 det.setText(beBytes);
\r
162 if (! m.getName().equals("UTF-16BE")) {
\r
163 errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
\r
166 det.setText(leBytes);
\r
169 if (! m.getName().equals("UTF-16LE")) {
\r
170 errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
\r
173 // Jitterbug 4451, for coverage
\r
174 int confidence = m.getConfidence();
\r
175 if(confidence != 100){ // the test expects a confidence of exactly 100 for this match
\r
176 errln("Did not get the expected confidence level " + confidence);
\r
178 int matchType = m.getMatchType();
\r
179 if(matchType != 0){ // the test expects getMatchType() to return 0
\r
180 errln("Did not get the expected matchType level " + matchType);
\r
184 public void TestC1Bytes() throws Exception // checks that C1-range bytes (0x80-0x9F) steer detection to windows-1252 rather than ISO-8859-1
\r
187 "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
\r
190 "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes."; // the curly quotes U+201C/U+201D encode to 0x93/0x94 in windows-1252
\r
192 byte[] bISO = sISO.getBytes("ISO-8859-1");
\r
193 byte[] bWindows = sWindows.getBytes("windows-1252");
\r
195 CharsetDetector det = new CharsetDetector();
\r
198 det.setText(bWindows);
\r
201 if (! m.getName().equals("windows-1252")) { // fixed: compare by value; != only tested reference identity and relied on string interning
\r
202 errln("Text with C1 bytes not correctly detected as windows-1252.");
\r
209 if (! m.getName().equals("ISO-8859-1")) { // fixed: compare by value, matching the other name checks in this file
\r
210 errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
\r
214 public void TestShortInput() {
\r
215 // Test that detection with very short byte strings does not crash and burn.
\r
216 // The shortest input that should produce positive detection result is two bytes,
\r
218 // TODO: Detector confidence levels need to be refined for very short input.
\r
219 // Too high now, for some charsets that happen to be compatible with a few bytes of input.
\r
220 byte [][] shortBytes = new byte [][]
\r
224 {(byte)'A', (byte)'B'},
\r
225 {(byte)'A', (byte)'B', (byte)'C'},
\r
226 {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
\r
229 CharsetDetector det = new CharsetDetector();
\r
231 for (int i=0; i<shortBytes.length; i++) {
\r
232 det.setText(shortBytes[i]);
\r
234 logln("i=" + i + " -> " + m.getName());
\r
238 public void TestBufferOverflow()
\r
240 byte testStrings[][] = {
\r
241 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b}, /* A partial ISO-2022 shift state at the end */
\r
242 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24}, /* A partial ISO-2022 shift state at the end */
\r
243 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28}, /* A partial ISO-2022 shift state at the end */
\r
244 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end with a bad one at the start */
\r
245 {(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end */
\r
246 {(byte) 0xa1}, /* Could be a single byte shift-jis at the end */
\r
247 {(byte) 0x74, (byte) 0x68, (byte) 0xa1}, /* Could be a single byte shift-jis at the end */
\r
248 {(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1} /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
\r
251 String testResults[] = {
\r
262 CharsetDetector det = new CharsetDetector();
\r
263 CharsetMatch match;
\r
265 det.setDeclaredEncoding("ISO-2022-JP");
\r
267 for (int idx = 0; idx < testStrings.length; idx += 1) {
\r
268 det.setText(testStrings[idx]);
\r
269 match = det.detect();
\r
271 if (match == null) {
\r
272 if (testResults[idx] != null) {
\r
273 errln("Unexpectedly got no results at index " + idx);
\r
276 logln("Got no result as expected at index " + idx);
\r
281 if (testResults[idx] == null || ! testResults[idx].equals(match.getName())) {
\r
282 errln("Unexpectedly got " + match.getName() + " instead of " + testResults[idx] +
\r
283 " at index " + idx + " with confidence " + match.getConfidence());
\r
289 public void TestDetection()
\r
292 // Open and read the test data file.
\r
294 //InputStreamReader isr = null;
\r
297 InputStream is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
\r
299 errln("Could not open test data file CharsetDetectionTests.xml");
\r
303 //isr = new InputStreamReader(is, "UTF-8");
\r
305 // Set up an xml parser.
\r
306 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
\r
308 factory.setIgnoringComments(true);
\r
310 DocumentBuilder builder = factory.newDocumentBuilder();
\r
312 // Parse the xml content from the test case file.
\r
313 Document doc = builder.parse(is, null);
\r
314 Element root = doc.getDocumentElement();
\r
316 NodeList testCases = root.getElementsByTagName("test-case");
\r
318 // Process each test case
\r
319 for (int n = 0; n < testCases.getLength(); n += 1) {
\r
320 Node testCase = testCases.item(n);
\r
321 NamedNodeMap attrs = testCase.getAttributes();
\r
322 NodeList testData = testCase.getChildNodes();
\r
323 StringBuffer testText = new StringBuffer();
\r
324 String id = attrs.getNamedItem("id").getNodeValue();
\r
325 String encodings = attrs.getNamedItem("encodings").getNodeValue();
\r
327 // Collect the test case text.
\r
328 for (int t = 0; t < testData.getLength(); t += 1) {
\r
329 Node textNode = testData.item(t);
\r
331 testText.append(textNode.getNodeValue());
\r
334 // Process test text with each encoding / language pair.
\r
335 String testString = testText.toString();
\r
336 String[] encodingList = encodings.split(" ");
\r
337 for (int e = 0; e < encodingList.length; e += 1) {
\r
338 checkEncoding(testString, encodingList[e], id);
\r
342 } catch (Exception e) {
\r
343 errln("exception while processing test cases: " + e.toString());
\r
347 private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception // compares the detector's best match with the expected encoding/language and checks that the decoded text round-trips
\r
349 CharsetMatch m = det.detect();
\r
352 if (! m.getName().equals(encoding)) {
\r
353 errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
\r
357 String charsetMatchLanguage = m.getLanguage();
\r
358 if ((language == null && charsetMatchLanguage != null) // fixed: the null-mismatch clauses now come first so equals() is never called on a null detected language
\r
359 || (language != null && charsetMatchLanguage == null)
\r
360 || (language != null && !charsetMatchLanguage.equals(language))) // reached only when both values are non-null
\r
362 errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
\r
365 if (encoding.startsWith("UTF-32")) { // NOTE(review): the body of this branch is not visible in this listing
\r
369 decoded = m.getString(); // decode the detected text via CharsetMatch.getString()
\r
371 if (! testString.equals(decoded)) {
\r
372 errln(id + ", " + encoding + ": getString() didn't return the original string!");
\r
375 decoded = stringFromReader(m.getReader()); // same round-trip check through the Reader path
\r
377 if (! testString.equals(decoded)) {
\r
378 errln(id + ", " + encoding + ": getReader() didn't yield the original string!");
\r
382 private void checkEncoding(String testString, String encoding, String id)
\r
384 String enc = null, lang = null;
\r
385 String[] split = encoding.split("/");
\r
389 if (split.length > 1) {
\r
394 CharsetDetector det = new CharsetDetector();
\r
397 //if (enc.startsWith("UTF-32")) {
\r
398 // UTF32 utf32 = UTF32.getInstance(enc);
\r
400 // bytes = utf32.toBytes(testString);
\r
406 bytes = testString.getBytes(from);
\r
407 } catch (UnsupportedOperationException uoe) {
\r
408 // In some runtimes, the ISO-2022-CN converter
\r
409 // only converts *to* Unicode - we have to use
\r
410 // x-ISO-2022-CN-GB to convert *from* Unicode.
\r
411 if (from.equals("ISO-2022-CN")) {
\r
412 from = "x-ISO-2022-CN-GB";
\r
416 // Ignore any other converters that can't
\r
417 // convert from Unicode.
\r
419 } catch (UnsupportedEncodingException uee) {
\r
420 // Ignore any encodings that this runtime
\r
421 // doesn't support.
\r
429 det.setText(bytes);
\r
430 checkMatch(det, testString, enc, lang, id);
\r
432 det.setText(new ByteArrayInputStream(bytes));
\r
433 checkMatch(det, testString, enc, lang, id);
\r
434 } catch (Exception e) {
\r
435 errln(id + ": " + e.toString() + "enc=" + enc);
\r
436 e.printStackTrace();
\r
440 public void TestArabic() throws Exception {
\r
441 String s = "\u0648\u0636\u0639\u062A \u0648\u0646\u0641\u0630\u062A \u0628\u0631\u0627" +
\r
442 "\u0645\u062C \u062A\u0623\u0645\u064A\u0646 \u0639\u062F\u064A\u062F\u0629 \u0641\u064A " +
\r
443 "\u0645\u0624\u0633\u0633\u0629 \u0627\u0644\u062A\u0623\u0645\u064A\u0646 \u0627\u0644" +
\r
444 "\u0648\u0637\u0646\u064A, \u0645\u0639 \u0645\u0644\u0627\u0626\u0645\u062A\u0647\u0627 " +
\r
445 "\u062F\u0627\u0626\u0645\u0627 \u0644\u0644\u0627\u062D\u062A\u064A\u0627\u062C" +
\r
446 "\u0627\u062A \u0627\u0644\u0645\u062A\u063A\u064A\u0631\u0629 \u0644\u0644\u0645\u062C" +
\r
447 "\u062A\u0645\u0639 \u0648\u0644\u0644\u062F\u0648\u0644\u0629. \u062A\u0648\u0633\u0639" +
\r
448 "\u062A \u0648\u062A\u0637\u0648\u0631\u062A \u0627\u0644\u0645\u0624\u0633\u0633\u0629 " +
\r
449 "\u0628\u0647\u062F\u0641 \u0636\u0645\u0627\u0646 \u0634\u0628\u0643\u0629 \u0623\u0645" +
\r
450 "\u0627\u0646 \u0644\u0633\u0643\u0627\u0646 \u062F\u0648\u0644\u0629 \u0627\u0633\u0631" +
\r
451 "\u0627\u0626\u064A\u0644 \u0628\u0648\u062C\u0647 \u0627\u0644\u0645\u062E\u0627\u0637" +
\r
452 "\u0631 \u0627\u0644\u0627\u0642\u062A\u0635\u0627\u062F\u064A\u0629 \u0648\u0627\u0644" +
\r
453 "\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";
\r
455 CharsetDetector det = new CharsetDetector();
\r
457 String charsetMatch;
\r
460 bytes = s.getBytes("windows-1256");
\r
461 det.setText(bytes);
\r
463 charsetMatch = m.getName();
\r
464 CheckAssert(charsetMatch.equals("windows-1256"));
\r
466 // Tests "public String getLanguage()"
\r
467 CheckAssert(m.getLanguage().endsWith("ar"));
\r
471 // We cannot rely on IBM420 converter in Sun Java
\r
473 bytes = s.getBytes("IBM420");
\r
475 bytes = new byte[] {
\r
476 (byte)0xCF, (byte)0x8D, (byte)0x9A, (byte)0x63, (byte)0x40, (byte)0xCF, (byte)0xBD, (byte)0xAB,
\r
477 (byte)0x74, (byte)0x63, (byte)0x40, (byte)0x58, (byte)0x75, (byte)0x56, (byte)0xBB, (byte)0x67,
\r
478 (byte)0x40, (byte)0x63, (byte)0x49, (byte)0xBB, (byte)0xDC, (byte)0xBD, (byte)0x40, (byte)0x9A,
\r
479 (byte)0x73, (byte)0xDC, (byte)0x73, (byte)0x62, (byte)0x40, (byte)0xAB, (byte)0xDC, (byte)0x40,
\r
480 (byte)0xBB, (byte)0x52, (byte)0x77, (byte)0x77, (byte)0x62, (byte)0x40, (byte)0x56, (byte)0xB1,
\r
481 (byte)0x63, (byte)0x49, (byte)0xBB, (byte)0xDC, (byte)0xBD, (byte)0x40, (byte)0x56, (byte)0xB1,
\r
482 (byte)0xCF, (byte)0x8F, (byte)0xBD, (byte)0xDC, (byte)0x6B, (byte)0x40, (byte)0xBB, (byte)0x9A,
\r
483 (byte)0x40, (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x55, (byte)0xBB, (byte)0x63, (byte)0xBF,
\r
484 (byte)0x56, (byte)0x40, (byte)0x73, (byte)0x56, (byte)0x55, (byte)0xBB, (byte)0x56, (byte)0x40,
\r
485 (byte)0xB1, (byte)0xB1, (byte)0x56, (byte)0x69, (byte)0x63, (byte)0xDC, (byte)0x56, (byte)0x67,
\r
486 (byte)0x56, (byte)0x63, (byte)0x40, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x63, (byte)0x9E,
\r
487 (byte)0xDC, (byte)0x75, (byte)0x62, (byte)0x40, (byte)0xB1, (byte)0xB1, (byte)0xBB, (byte)0x67,
\r
488 (byte)0x63, (byte)0xBB, (byte)0x9A, (byte)0x40, (byte)0xCF, (byte)0xB1, (byte)0xB1, (byte)0x73,
\r
489 (byte)0xCF, (byte)0xB1, (byte)0x62, (byte)0x4B, (byte)0x40, (byte)0x63, (byte)0xCF, (byte)0x77,
\r
490 (byte)0x9A, (byte)0x63, (byte)0x40, (byte)0xCF, (byte)0x63, (byte)0x8F, (byte)0xCF, (byte)0x75,
\r
491 (byte)0x63, (byte)0x40, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x52, (byte)0x77, (byte)0x77,
\r
492 (byte)0x62, (byte)0x40, (byte)0x58, (byte)0xBF, (byte)0x73, (byte)0xAB, (byte)0x40, (byte)0x8D,
\r
493 (byte)0xBB, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0x80, (byte)0x58, (byte)0xAF, (byte)0x62,
\r
494 (byte)0x40, (byte)0x49, (byte)0xBB, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0xB1, (byte)0x77,
\r
495 (byte)0xAF, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0x73, (byte)0xCF, (byte)0xB1, (byte)0x62,
\r
496 (byte)0x40, (byte)0x56, (byte)0x77, (byte)0x75, (byte)0x56, (byte)0x55, (byte)0xDC, (byte)0xB1,
\r
497 (byte)0x40, (byte)0x58, (byte)0xCF, (byte)0x67, (byte)0xBF, (byte)0x40, (byte)0x56, (byte)0xB1,
\r
498 (byte)0xBB, (byte)0x71, (byte)0x56, (byte)0x8F, (byte)0x75, (byte)0x40, (byte)0x56, (byte)0xB1,
\r
499 (byte)0x56, (byte)0xAD, (byte)0x63, (byte)0x8B, (byte)0x56, (byte)0x73, (byte)0xDC, (byte)0x62,
\r
500 (byte)0x40, (byte)0xCF, (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0x67, (byte)0x63, (byte)0xBB,
\r
501 (byte)0x56, (byte)0x9A, (byte)0xDC, (byte)0x62, (byte)0x4B,
\r
503 det.setText(bytes);
\r
505 charsetMatch = m.getName();
\r
506 CheckAssert(charsetMatch.equals("IBM420_rtl"));
\r
508 // Tests "public String getLanguage()"
\r
509 CheckAssert(m.getLanguage().endsWith("ar"));
\r
513 // We cannot rely on IBM420 converter in Sun Java
\r
515 StringBuffer ltrStrBuf = new StringBuffer(s);
\r
516 ltrStrBuf = ltrStrBuf.reverse();
\r
517 bytes = ltrStrBuf.toString().getBytes("IBM420");
\r
519 bytes = new byte[] {
\r
520 (byte)0x4B, (byte)0x62, (byte)0xDC, (byte)0x9A, (byte)0x56, (byte)0xBB, (byte)0x63, (byte)0x67,
\r
521 (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0xCF, (byte)0x40, (byte)0x62, (byte)0xDC, (byte)0x73,
\r
522 (byte)0x56, (byte)0x8B, (byte)0x63, (byte)0xAD, (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0x40,
\r
523 (byte)0x75, (byte)0x8F, (byte)0x56, (byte)0x71, (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x40,
\r
524 (byte)0xBF, (byte)0x67, (byte)0xCF, (byte)0x58, (byte)0x40, (byte)0xB1, (byte)0xDC, (byte)0x55,
\r
525 (byte)0x56, (byte)0x75, (byte)0x77, (byte)0x56, (byte)0x40, (byte)0x62, (byte)0xB1, (byte)0xCF,
\r
526 (byte)0x73, (byte)0x40, (byte)0xBD, (byte)0x56, (byte)0xAF, (byte)0x77, (byte)0xB1, (byte)0x40,
\r
527 (byte)0xBD, (byte)0x56, (byte)0xBB, (byte)0x49, (byte)0x40, (byte)0x62, (byte)0xAF, (byte)0x58,
\r
528 (byte)0x80, (byte)0x40, (byte)0xBD, (byte)0x56, (byte)0xBB, (byte)0x8D, (byte)0x40, (byte)0xAB,
\r
529 (byte)0x73, (byte)0xBF, (byte)0x58, (byte)0x40, (byte)0x62, (byte)0x77, (byte)0x77, (byte)0x52,
\r
530 (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x40, (byte)0x63, (byte)0x75, (byte)0xCF, (byte)0x8F,
\r
531 (byte)0x63, (byte)0xCF, (byte)0x40, (byte)0x63, (byte)0x9A, (byte)0x77, (byte)0xCF, (byte)0x63,
\r
532 (byte)0x40, (byte)0x4B, (byte)0x62, (byte)0xB1, (byte)0xCF, (byte)0x73, (byte)0xB1, (byte)0xB1,
\r
533 (byte)0xCF, (byte)0x40, (byte)0x9A, (byte)0xBB, (byte)0x63, (byte)0x67, (byte)0xBB, (byte)0xB1,
\r
534 (byte)0xB1, (byte)0x40, (byte)0x62, (byte)0x75, (byte)0xDC, (byte)0x9E, (byte)0x63, (byte)0xBB,
\r
535 (byte)0xB1, (byte)0x56, (byte)0x40, (byte)0x63, (byte)0x56, (byte)0x67, (byte)0x56, (byte)0xDC,
\r
536 (byte)0x63, (byte)0x69, (byte)0x56, (byte)0xB1, (byte)0xB1, (byte)0x40, (byte)0x56, (byte)0xBB,
\r
537 (byte)0x55, (byte)0x56, (byte)0x73, (byte)0x40, (byte)0x56, (byte)0xBF, (byte)0x63, (byte)0xBB,
\r
538 (byte)0x55, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x40, (byte)0x9A, (byte)0xBB, (byte)0x40,
\r
539 (byte)0x6B, (byte)0xDC, (byte)0xBD, (byte)0x8F, (byte)0xCF, (byte)0xB1, (byte)0x56, (byte)0x40,
\r
540 (byte)0xBD, (byte)0xDC, (byte)0xBB, (byte)0x49, (byte)0x63, (byte)0xB1, (byte)0x56, (byte)0x40,
\r
541 (byte)0x62, (byte)0x77, (byte)0x77, (byte)0x52, (byte)0xBB, (byte)0x40, (byte)0xDC, (byte)0xAB,
\r
542 (byte)0x40, (byte)0x62, (byte)0x73, (byte)0xDC, (byte)0x73, (byte)0x9A, (byte)0x40, (byte)0xBD,
\r
543 (byte)0xDC, (byte)0xBB, (byte)0x49, (byte)0x63, (byte)0x40, (byte)0x67, (byte)0xBB, (byte)0x56,
\r
544 (byte)0x75, (byte)0x58, (byte)0x40, (byte)0x63, (byte)0x74, (byte)0xAB, (byte)0xBD, (byte)0xCF,
\r
545 (byte)0x40, (byte)0x63, (byte)0x9A, (byte)0x8D, (byte)0xCF,
\r
548 det.setText(bytes);
\r
550 charsetMatch = m.getName();
\r
551 CheckAssert(charsetMatch.equals("IBM420_ltr"));
\r
555 public void TestHebrew() throws Exception {
\r
556 String s = "\u05D4\u05E4\u05E8\u05E7\u05DC\u05D9\u05D8 \u05D4\u05E6\u05D1\u05D0\u05D9 \u05D4" +
\r
557 "\u05E8\u05D0\u05E9\u05D9, \u05EA\u05EA \u05D0\u05DC\u05D5\u05E3 \u05D0\u05D1\u05D9" +
\r
558 "\u05D7\u05D9 \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8, \u05D4\u05D5\u05E8" +
\r
559 "\u05D4 \u05E2\u05DC \u05E4\u05EA\u05D9\u05D7\u05EA \u05D7\u05E7\u05D9\u05E8\u05EA " +
\r
560 "\u05DE\u05E6\"\u05D7 \u05D1\u05E2\u05E7\u05D1\u05D5\u05EA \u05E2\u05D3\u05D5\u05D9" +
\r
561 "\u05D5\u05EA \u05D7\u05D9\u05D9\u05DC\u05D9 \u05E6\u05D4\"\u05DC \u05DE\u05DE\u05D1" +
\r
562 "\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4 \u05D1+ " +
\r
563 "\u05E8\u05E6\u05D5\u05E2\u05EA \u05E2\u05D6\u05D4. \u05DC\u05D3\u05D1\u05E8\u05D9 " +
\r
564 "\u05D4\u05E4\u05E6\"\u05E8, \u05DE\u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA \u05E2" +
\r
565 "\u05D5\u05DC\u05D4 \u05EA\u05DE\u05D5\u05E0\u05D4 \u05E9\u05DC \"\u05D4\u05EA\u05E0" +
\r
566 "\u05D4\u05D2\u05D5\u05EA \u05E4\u05E1\u05D5\u05DC\u05D4 \u05DC\u05DB\u05D0\u05D5\u05E8" +
\r
567 "\u05D4 \u05E9\u05DC \u05D7\u05D9\u05D9\u05DC\u05D9\u05DD \u05D1\u05DE\u05D4\u05DC\u05DA" +
\r
568 " \u05DE\u05D1\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4\"." +
\r
569 " \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8 \u05E7\u05D9\u05D1\u05DC \u05D0\u05EA" +
\r
570 " \u05D4\u05D7\u05DC\u05D8\u05EA\u05D5 \u05DC\u05D0\u05D7\u05E8 \u05E9\u05E2\u05D9\u05D9" +
\r
571 "\u05DF \u05D1\u05EA\u05DE\u05DC\u05D9\u05DC \u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA";
\r
573 CharsetMatch m = _test1255(s);
\r
574 String charsetMatch = m.getName();
\r
575 CheckAssert(charsetMatch.equals("ISO-8859-8"));
\r
576 CheckAssert(m.getLanguage().equals("he"));
\r
578 m = _testIBM424_he_rtl(s);
\r
579 charsetMatch = m.getName();
\r
580 CheckAssert(charsetMatch.equals("IBM424_rtl"));
\r
581 CheckAssert(m.getLanguage().equals("he"));
\r
583 m = _testIBM424_he_ltr(s);
\r
584 charsetMatch = m.getName();
\r
585 CheckAssert(charsetMatch.equals("IBM424_ltr"));
\r
586 CheckAssert(m.getLanguage().equals("he"));
\r
589 private CharsetMatch _test1255(String s) throws Exception { // NOTE(review): the name suggests windows-1255, but the text is encoded as ISO-8859-8 and TestHebrew asserts "ISO-8859-8" on the result
\r
590 byte [] bytes = s.getBytes("ISO-8859-8");
\r
591 CharsetDetector det = new CharsetDetector();
\r
592 det.setText(bytes);
\r
593 CharsetMatch m = det.detect(); // single best match for the encoded bytes
\r
597 private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
\r
598 byte [] bytes = s.getBytes("IBM424");
\r
599 CharsetDetector det = new CharsetDetector();
\r
600 det.setText(bytes);
\r
601 CharsetMatch m = det.detect();
\r
605 private CharsetMatch _testIBM424_he_ltr(String s) throws Exception {
\r
607 * transformation of input string to IBM424 left to right requires reversing the string
\r
610 StringBuffer ltrStrBuf = new StringBuffer(s);
\r
611 ltrStrBuf = ltrStrBuf.reverse();
\r
612 byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
\r
614 CharsetDetector det = new CharsetDetector();
\r
615 det.setText(bytes);
\r
616 CharsetMatch m = det.detect();
\r
621 * Test the method int match(CharsetDetector det) in CharsetRecog_UTF_16_LE
\r
623 public void TestCharsetRecog_UTF_16_LE_Match() { // coverage test: the catch below reports any exception as a test error
\r
624 byte[] in = { Byte.MIN_VALUE, Byte.MIN_VALUE, Byte.MIN_VALUE, Byte.MIN_VALUE }; // four bytes, each -128 (0x80)
\r
625 CharsetDetector cd = new CharsetDetector();
\r
626 // Tests when if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) is true inside the
\r
627 // match(CharsetDetector) method of CharsetRecog_UTF_16_LE. NOTE(review): input[2] and input[3] here are 0x80, not 0x00 - confirm which branch this input actually exercises.
\r
630 } catch (Exception e) {
\r
631 errln("CharsetRecog_UTF_16_LE.match(CharsetDetector) was not suppose to return an exception.");
\r