2 *******************************************************************************
3 * Copyright (C) 2005-2013, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.dev.test.charsetdet;
9 import java.io.ByteArrayInputStream;
10 import java.io.ByteArrayOutputStream;
11 import java.io.InputStream;
12 import java.io.Reader;
13 import java.io.UnsupportedEncodingException;
14 import java.util.HashSet;
16 import javax.xml.parsers.DocumentBuilder;
17 import javax.xml.parsers.DocumentBuilderFactory;
19 import org.w3c.dom.Document;
20 import org.w3c.dom.Element;
21 import org.w3c.dom.NamedNodeMap;
22 import org.w3c.dom.Node;
23 import org.w3c.dom.NodeList;
25 import com.ibm.icu.dev.test.TestFmwk;
26 import com.ibm.icu.text.CharsetDetector;
27 import com.ibm.icu.text.CharsetMatch;
33 public class TestCharsetDetector extends TestFmwk
39 public TestCharsetDetector()
43 public static void main(String[] args) {
46 TestCharsetDetector test = new TestCharsetDetector();
55 private void CheckAssert(boolean exp) {
59 throw new Exception();
62 StackTraceElement failPoint = e.getStackTrace()[1];
63 msg = "Test failure in file " + failPoint.getFileName() +
64 " at line " + failPoint.getLineNumber();
71 private String stringFromReader(Reader reader)
73 StringBuffer sb = new StringBuffer();
74 char[] buffer = new char[1024];
78 while ((bytesRead = reader.read(buffer, 0, 1024)) >= 0) {
79 sb.append(buffer, 0, bytesRead);
83 } catch (Exception e) {
84 errln("stringFromReader() failed: " + e.toString());
89 public void TestConstruction() {
91 CharsetDetector det = new CharsetDetector();
93 errln("Could not construct a charset detector");
95 String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
96 CheckAssert(charsetNames.length != 0);
97 for (i=0; i<charsetNames.length; i++) {
98 CheckAssert(charsetNames[i].equals("") == false);
99 // System.out.println("\"" + charsetNames[i] + "\"");
102 final String[] defDisabled = {
103 "IBM420_rtl", "IBM420_ltr",
104 "IBM424_rtl", "IBM424_ltr"
106 String[] activeCharsetNames = det.getDetectableCharsets();
107 for (String cs : activeCharsetNames) {
108 // the charset must be included in all list
109 boolean found = false;
110 for (String cs0 : charsetNames) {
111 if (cs0.equals(cs)) {
117 errln(cs + " is not included in the all charset list." );
120 // some charsets are disabled by default
122 for (String cs1 : defDisabled) {
123 if (cs1.equals(cs)) {
129 errln(cs + " should not be included in the default charset list.");
134 public void TestInputFilter() throws Exception
136 String s = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\u00E8s petit peu de Fran\u00E7ais. <to> <confuse> <the> <detector>";
137 byte[] bytes = s.getBytes("ISO-8859-1");
138 CharsetDetector det = new CharsetDetector();
141 det.enableInputFilter(true);
142 if (!det.inputFilterEnabled()){
143 errln("input filter should be enabled");
149 if (! m.getLanguage().equals("fr")) {
150 errln("input filter did not strip markup!");
153 det.enableInputFilter(false);
157 if (! m.getLanguage().equals("en")) {
158 errln("unfiltered input did not detect as English!");
162 public void TestUTF8() throws Exception {
164 String s = "This is a string with some non-ascii characters that will " +
165 "be converted to UTF-8, then shoved through the detection process. " +
166 "\u0391\u0392\u0393\u0394\u0395" +
167 "Sure would be nice if our source could contain Unicode directly!";
168 byte [] bytes = s.getBytes("UTF-8");
169 CharsetDetector det = new CharsetDetector();
173 retrievedS = det.getString(bytes, "UTF-8");
174 CheckAssert(s.equals(retrievedS));
176 reader = det.getReader(new ByteArrayInputStream(bytes), "UTF-8");
177 CheckAssert(s.equals(stringFromReader(reader)));
178 det.setDeclaredEncoding("UTF-8"); // Jitterbug 4451, for coverage
181 public void TestUTF16() throws Exception
184 "u0623\u0648\u0631\u0648\u0628\u0627, \u0628\u0631\u0645\u062c\u064a\u0627\u062a " +
185 "\u0627\u0644\u062d\u0627\u0633\u0648\u0628 \u002b\u0020\u0627\u0646\u062a\u0631\u0646\u064a\u062a";
187 byte[] beBytes = source.getBytes("UnicodeBig");
188 byte[] leBytes = source.getBytes("UnicodeLittle");
189 CharsetDetector det = new CharsetDetector();
192 det.setText(beBytes);
195 if (! m.getName().equals("UTF-16BE")) {
196 errln("Encoding detection failure: expected UTF-16BE, got " + m.getName());
199 det.setText(leBytes);
202 if (! m.getName().equals("UTF-16LE")) {
203 errln("Encoding detection failure: expected UTF-16LE, got " + m.getName());
206 // Jitterbug 4451, for coverage
207 int confidence = m.getConfidence();
208 if(confidence != 100){
209 errln("Did not get the expected confidence level " + confidence);
213 public void TestC1Bytes() throws Exception
216 "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
219 "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \u201CC1\u201D bytes.";
221 byte[] bISO = sISO.getBytes("ISO-8859-1");
222 byte[] bWindows = sWindows.getBytes("windows-1252");
224 CharsetDetector det = new CharsetDetector();
227 det.setText(bWindows);
230 if (!m.getName().equals("windows-1252")) {
231 errln("Text with C1 bytes not correctly detected as windows-1252.");
238 if (!m.getName().equals("ISO-8859-1")) {
239 errln("Text without C1 bytes not correctly detected as ISO-8859-1.");
243 public void TestShortInput() {
244 // Test that detection with very short byte strings does not crash and burn.
245 // The shortest input that should produce positive detection result is two bytes,
247 // TODO: Detector confidence levels needs to be refined for very short input.
248 // Too high now, for some charsets that happen to be compatible with a few bytes of input.
249 byte [][] shortBytes = new byte [][]
253 {(byte)'A', (byte)'B'},
254 {(byte)'A', (byte)'B', (byte)'C'},
255 {(byte)'A', (byte)'B', (byte)'C', (byte)'D'}
258 CharsetDetector det = new CharsetDetector();
260 for (int i=0; i<shortBytes.length; i++) {
261 det.setText(shortBytes[i]);
263 logln("i=" + i + " -> " + m.getName());
267 public void TestBufferOverflow()
269 byte testStrings[][] = {
270 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b}, /* A partial ISO-2022 shift state at the end */
271 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24}, /* A partial ISO-2022 shift state at the end */
272 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28}, /* A partial ISO-2022 shift state at the end */
273 {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end with a bad one at the start */
274 {(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end */
275 {(byte) 0xa1}, /* Could be a single byte shift-jis at the end */
276 {(byte) 0x74, (byte) 0x68, (byte) 0xa1}, /* Could be a single byte shift-jis at the end */
277 {(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1} /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
280 String testResults[] = {
291 CharsetDetector det = new CharsetDetector();
294 det.setDeclaredEncoding("ISO-2022-JP");
296 for (int idx = 0; idx < testStrings.length; idx += 1) {
297 det.setText(testStrings[idx]);
298 match = det.detect();
301 if (testResults[idx] != null) {
302 errln("Unexpectedly got no results at index " + idx);
305 logln("Got no result as expected at index " + idx);
310 if (testResults[idx] == null || ! testResults[idx].equals(match.getName())) {
311 errln("Unexpectedly got " + match.getName() + " instead of " + testResults[idx] +
312 " at index " + idx + " with confidence " + match.getConfidence());
318 public void TestDetection()
321 // Open and read the test data file.
323 //InputStreamReader isr = null;
326 InputStream is = TestCharsetDetector.class.getResourceAsStream("CharsetDetectionTests.xml");
328 errln("Could not open test data file CharsetDetectionTests.xml");
332 //isr = new InputStreamReader(is, "UTF-8");
334 // Set up an xml parser.
335 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
337 factory.setIgnoringComments(true);
339 DocumentBuilder builder = factory.newDocumentBuilder();
341 // Parse the xml content from the test case file.
342 Document doc = builder.parse(is, null);
343 Element root = doc.getDocumentElement();
345 NodeList testCases = root.getElementsByTagName("test-case");
347 // Process each test case
348 for (int n = 0; n < testCases.getLength(); n += 1) {
349 Node testCase = testCases.item(n);
350 NamedNodeMap attrs = testCase.getAttributes();
351 NodeList testData = testCase.getChildNodes();
352 StringBuffer testText = new StringBuffer();
353 String id = attrs.getNamedItem("id").getNodeValue();
354 String encodings = attrs.getNamedItem("encodings").getNodeValue();
356 // Collect the test case text.
357 for (int t = 0; t < testData.getLength(); t += 1) {
358 Node textNode = testData.item(t);
360 testText.append(textNode.getNodeValue());
363 // Process test text with each encoding / language pair.
364 String testString = testText.toString();
365 String[] encodingList = encodings.split(" ");
366 for (int e = 0; e < encodingList.length; e += 1) {
367 checkEncoding(testString, encodingList[e], id);
371 } catch (Exception e) {
372 errln("exception while processing test cases: " + e.toString());
376 private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
378 CharsetMatch m = det.detect();
381 if (! m.getName().equals(encoding)) {
382 errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
386 String charsetMatchLanguage = m.getLanguage();
387 if ((language != null && !charsetMatchLanguage.equals(language))
388 || (language == null && charsetMatchLanguage != null)
389 || (language != null && charsetMatchLanguage == null))
391 errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
394 if (encoding.startsWith("UTF-32")) {
398 decoded = m.getString();
400 if (! testString.equals(decoded)) {
401 errln(id + ", " + encoding + ": getString() didn't return the original string!");
404 decoded = stringFromReader(m.getReader());
406 if (! testString.equals(decoded)) {
407 errln(id + ", " + encoding + ": getReader() didn't yield the original string!");
411 private void checkEncoding(String testString, String encoding, String id)
413 String enc = null, lang = null;
414 String[] split = encoding.split("/");
418 if (split.length > 1) {
423 CharsetDetector det = new CharsetDetector();
426 //if (enc.startsWith("UTF-32")) {
427 // UTF32 utf32 = UTF32.getInstance(enc);
429 // bytes = utf32.toBytes(testString);
435 bytes = testString.getBytes(from);
436 } catch (UnsupportedOperationException uoe) {
437 // In some runtimes, the ISO-2022-CN converter
438 // only converts *to* Unicode - we have to use
439 // x-ISO-2022-CN-GB to convert *from* Unicode.
440 if (from.equals("ISO-2022-CN")) {
441 from = "x-ISO-2022-CN-GB";
445 // Ignore any other converters that can't
446 // convert from Unicode.
448 } catch (UnsupportedEncodingException uee) {
449 // Ignore any encodings that this runtime
459 checkMatch(det, testString, enc, lang, id);
461 det.setText(new ByteArrayInputStream(bytes));
462 checkMatch(det, testString, enc, lang, id);
463 } catch (Exception e) {
464 errln(id + ": " + e.toString() + "enc=" + enc);
469 public void TestJapanese() throws Exception {
470 String s = "\u3000\u3001\u3002\u3003\u3005\u3006\u3007\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011\u3012\u3013\u3014" +
471 "\u3015\u301C\u3041\u3042\u3043\u3044\u3045\u3046\u3047\u3048\u3049\u304A\u304B\u304C\u304D\u304E\u304F\u3050\u3051\u3052" +
472 "\u3053\u3054\u3055\u3056\u3057\u3058\u3059\u305A\u305B\u305C\u305D\u305E\u305F\u3060\u3061\u3062\u3063\u3064\u3065\u3066" +
473 "\u3067\u3068\u3069\u306A\u306B\u306C\u306D\u306E\u306F\u3070\u3071\u3072\u3073\u3074\u3075\u3076\u3077\u3078\u3079\u307A" +
474 "\u307B\u307C\u307D\u307E\u307F\u3080\u3081\u3082\u3083\u3084\u3085\u3086\u3087\u3088\u3089\u308A\u308B\u308C\u308D\u308E" +
475 "\u308F\u3090\u3091\u3092\u3093\u309B\u309C\u309D\u309E\u30A1\u30A2\u30A3\u30A4\u30A5\u30A6\u30A7\u30A8\u30A9\u30AA\u30AB" +
476 "\u30AC\u30AD\u30AE\u30AF\u30B0\u30B1\u30B2\u30B3\u30B4\u30B5\u30B6\u30B7\u30B8\u30B9\u30BA\u30BB\u30BC\u30BD\u30BE\u30BF" +
477 "\u30C0\u30C1\u30C2\u30C3\u30C4\u30C5\u30C6\u30C7\u30C8\u30C9\u30CA\u30CB\u30CC\u30CD\u30CE\u30CF\u30D0\u30D1\u30D2\u30D3" +
478 "\u30D4\u30D5\u30D6\u30D7\u30D8\u30D9\u30DA\u30DB\u30DC\u30DD\u30DE\u30DF\u30E0\u30E1\u30E2\u30E3\u30E4\u30E5\u30E6\u30E7" +
479 "\u30E8\u30E9\u30EA\u30EB\u30EC\u30ED\u30EE\u30EF\u30F0\u30F1\u30F2\u30F3\u30F4\u30F5\u30F6\u30FB\u30FC\u30FD\u30FE\u4E00" +
480 "\u4E01\u4E02\u4E03\u4E04\u4E05\u4E07\u4E08\u4E09\u4E0A\u4E0B\u4E0C\u4E0D\u4E0E\u4E10\u4E11\u4E12\u4E14\u4E15\u4E16\u4E17" +
481 "\u4E18\u4E19\u4E1E\u4E1F\u4E21\u4E23\u4E24\u4E26\u4E28\u4E2A\u4E2B\u4E2D\u4E2E\u4E2F\u4E30\u4E31\u4E32\u4E35\u4E36\u4E38" +
482 "\u4E39\u4E3B\u4E3C\u4E3F\u4E40\u4E41\u4E42\u4E43\u4E44\u4E45\u4E47\u4E4B\u4E4D\u4E4E\u4E4F\u4E51\u4E55\u4E56\u4E57\u4E58" +
483 "\u4E59\u4E5A\u4E5C\u4E5D\u4E5E\u4E5F\u4E62\u4E63\u4E68\u4E69\u4E71\u4E73\u4E74\u4E75\u4E79\u4E7E\u4E7F\u4E80\u4E82\u4E85" +
484 "\u4E86\u4E88\u4E89\u4E8A\u4E8B\u4E8C";
486 CharsetDetector det = new CharsetDetector();
491 bytes = s.getBytes("EUC-JP");
494 charsetMatch = m.getName();
495 CheckAssert(charsetMatch.equals("EUC-JP"));
497 // Tests "public String getLanguage()"
498 CheckAssert(m.getLanguage().equals("ja"));
502 public void TestArabic() throws Exception {
503 String s = "\u0648\u0636\u0639\u062A \u0648\u0646\u0641\u0630\u062A \u0628\u0631\u0627" +
504 "\u0645\u062C \u062A\u0623\u0645\u064A\u0646 \u0639\u062F\u064A\u062F\u0629 \u0641\u064A " +
505 "\u0645\u0624\u0633\u0633\u0629 \u0627\u0644\u062A\u0623\u0645\u064A\u0646 \u0627\u0644" +
506 "\u0648\u0637\u0646\u064A, \u0645\u0639 \u0645\u0644\u0627\u0626\u0645\u062A\u0647\u0627 " +
507 "\u062F\u0627\u0626\u0645\u0627 \u0644\u0644\u0627\u062D\u062A\u064A\u0627\u062C" +
508 "\u0627\u062A \u0627\u0644\u0645\u062A\u063A\u064A\u0631\u0629 \u0644\u0644\u0645\u062C" +
509 "\u062A\u0645\u0639 \u0648\u0644\u0644\u062F\u0648\u0644\u0629. \u062A\u0648\u0633\u0639" +
510 "\u062A \u0648\u062A\u0637\u0648\u0631\u062A \u0627\u0644\u0645\u0624\u0633\u0633\u0629 " +
511 "\u0628\u0647\u062F\u0641 \u0636\u0645\u0627\u0646 \u0634\u0628\u0643\u0629 \u0623\u0645" +
512 "\u0627\u0646 \u0644\u0633\u0643\u0627\u0646 \u062F\u0648\u0644\u0629 \u0627\u0633\u0631" +
513 "\u0627\u0626\u064A\u0644 \u0628\u0648\u062C\u0647 \u0627\u0644\u0645\u062E\u0627\u0637" +
514 "\u0631 \u0627\u0644\u0627\u0642\u062A\u0635\u0627\u062F\u064A\u0629 \u0648\u0627\u0644" +
515 "\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";
517 CharsetDetector det = new CharsetDetector();
518 det.setDetectableCharset("IBM424_rtl", true);
519 det.setDetectableCharset("IBM424_ltr", true);
520 det.setDetectableCharset("IBM420_rtl", true);
521 det.setDetectableCharset("IBM420_ltr", true);
526 bytes = s.getBytes("windows-1256");
529 charsetMatch = m.getName();
530 CheckAssert(charsetMatch.equals("windows-1256"));
532 // Tests "public String getLanguage()"
533 CheckAssert(m.getLanguage().endsWith("ar"));
537 // We cannot rely on IBM420 converter in Sun Java
539 bytes = s.getBytes("IBM420");
542 (byte)0xCF, (byte)0x8D, (byte)0x9A, (byte)0x63, (byte)0x40, (byte)0xCF, (byte)0xBD, (byte)0xAB,
543 (byte)0x74, (byte)0x63, (byte)0x40, (byte)0x58, (byte)0x75, (byte)0x56, (byte)0xBB, (byte)0x67,
544 (byte)0x40, (byte)0x63, (byte)0x49, (byte)0xBB, (byte)0xDC, (byte)0xBD, (byte)0x40, (byte)0x9A,
545 (byte)0x73, (byte)0xDC, (byte)0x73, (byte)0x62, (byte)0x40, (byte)0xAB, (byte)0xDC, (byte)0x40,
546 (byte)0xBB, (byte)0x52, (byte)0x77, (byte)0x77, (byte)0x62, (byte)0x40, (byte)0x56, (byte)0xB1,
547 (byte)0x63, (byte)0x49, (byte)0xBB, (byte)0xDC, (byte)0xBD, (byte)0x40, (byte)0x56, (byte)0xB1,
548 (byte)0xCF, (byte)0x8F, (byte)0xBD, (byte)0xDC, (byte)0x6B, (byte)0x40, (byte)0xBB, (byte)0x9A,
549 (byte)0x40, (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x55, (byte)0xBB, (byte)0x63, (byte)0xBF,
550 (byte)0x56, (byte)0x40, (byte)0x73, (byte)0x56, (byte)0x55, (byte)0xBB, (byte)0x56, (byte)0x40,
551 (byte)0xB1, (byte)0xB1, (byte)0x56, (byte)0x69, (byte)0x63, (byte)0xDC, (byte)0x56, (byte)0x67,
552 (byte)0x56, (byte)0x63, (byte)0x40, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x63, (byte)0x9E,
553 (byte)0xDC, (byte)0x75, (byte)0x62, (byte)0x40, (byte)0xB1, (byte)0xB1, (byte)0xBB, (byte)0x67,
554 (byte)0x63, (byte)0xBB, (byte)0x9A, (byte)0x40, (byte)0xCF, (byte)0xB1, (byte)0xB1, (byte)0x73,
555 (byte)0xCF, (byte)0xB1, (byte)0x62, (byte)0x4B, (byte)0x40, (byte)0x63, (byte)0xCF, (byte)0x77,
556 (byte)0x9A, (byte)0x63, (byte)0x40, (byte)0xCF, (byte)0x63, (byte)0x8F, (byte)0xCF, (byte)0x75,
557 (byte)0x63, (byte)0x40, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x52, (byte)0x77, (byte)0x77,
558 (byte)0x62, (byte)0x40, (byte)0x58, (byte)0xBF, (byte)0x73, (byte)0xAB, (byte)0x40, (byte)0x8D,
559 (byte)0xBB, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0x80, (byte)0x58, (byte)0xAF, (byte)0x62,
560 (byte)0x40, (byte)0x49, (byte)0xBB, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0xB1, (byte)0x77,
561 (byte)0xAF, (byte)0x56, (byte)0xBD, (byte)0x40, (byte)0x73, (byte)0xCF, (byte)0xB1, (byte)0x62,
562 (byte)0x40, (byte)0x56, (byte)0x77, (byte)0x75, (byte)0x56, (byte)0x55, (byte)0xDC, (byte)0xB1,
563 (byte)0x40, (byte)0x58, (byte)0xCF, (byte)0x67, (byte)0xBF, (byte)0x40, (byte)0x56, (byte)0xB1,
564 (byte)0xBB, (byte)0x71, (byte)0x56, (byte)0x8F, (byte)0x75, (byte)0x40, (byte)0x56, (byte)0xB1,
565 (byte)0x56, (byte)0xAD, (byte)0x63, (byte)0x8B, (byte)0x56, (byte)0x73, (byte)0xDC, (byte)0x62,
566 (byte)0x40, (byte)0xCF, (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0x67, (byte)0x63, (byte)0xBB,
567 (byte)0x56, (byte)0x9A, (byte)0xDC, (byte)0x62, (byte)0x4B,
571 charsetMatch = m.getName();
572 CheckAssert(charsetMatch.equals("IBM420_rtl"));
574 // Tests "public String getLanguage()"
575 CheckAssert(m.getLanguage().endsWith("ar"));
579 // We cannot rely on IBM420 converter in Sun Java
581 StringBuffer ltrStrBuf = new StringBuffer(s);
582 ltrStrBuf = ltrStrBuf.reverse();
583 bytes = ltrStrBuf.toString().getBytes("IBM420");
586 (byte)0x4B, (byte)0x62, (byte)0xDC, (byte)0x9A, (byte)0x56, (byte)0xBB, (byte)0x63, (byte)0x67,
587 (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0xCF, (byte)0x40, (byte)0x62, (byte)0xDC, (byte)0x73,
588 (byte)0x56, (byte)0x8B, (byte)0x63, (byte)0xAD, (byte)0x56, (byte)0xB1, (byte)0x56, (byte)0x40,
589 (byte)0x75, (byte)0x8F, (byte)0x56, (byte)0x71, (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x40,
590 (byte)0xBF, (byte)0x67, (byte)0xCF, (byte)0x58, (byte)0x40, (byte)0xB1, (byte)0xDC, (byte)0x55,
591 (byte)0x56, (byte)0x75, (byte)0x77, (byte)0x56, (byte)0x40, (byte)0x62, (byte)0xB1, (byte)0xCF,
592 (byte)0x73, (byte)0x40, (byte)0xBD, (byte)0x56, (byte)0xAF, (byte)0x77, (byte)0xB1, (byte)0x40,
593 (byte)0xBD, (byte)0x56, (byte)0xBB, (byte)0x49, (byte)0x40, (byte)0x62, (byte)0xAF, (byte)0x58,
594 (byte)0x80, (byte)0x40, (byte)0xBD, (byte)0x56, (byte)0xBB, (byte)0x8D, (byte)0x40, (byte)0xAB,
595 (byte)0x73, (byte)0xBF, (byte)0x58, (byte)0x40, (byte)0x62, (byte)0x77, (byte)0x77, (byte)0x52,
596 (byte)0xBB, (byte)0xB1, (byte)0x56, (byte)0x40, (byte)0x63, (byte)0x75, (byte)0xCF, (byte)0x8F,
597 (byte)0x63, (byte)0xCF, (byte)0x40, (byte)0x63, (byte)0x9A, (byte)0x77, (byte)0xCF, (byte)0x63,
598 (byte)0x40, (byte)0x4B, (byte)0x62, (byte)0xB1, (byte)0xCF, (byte)0x73, (byte)0xB1, (byte)0xB1,
599 (byte)0xCF, (byte)0x40, (byte)0x9A, (byte)0xBB, (byte)0x63, (byte)0x67, (byte)0xBB, (byte)0xB1,
600 (byte)0xB1, (byte)0x40, (byte)0x62, (byte)0x75, (byte)0xDC, (byte)0x9E, (byte)0x63, (byte)0xBB,
601 (byte)0xB1, (byte)0x56, (byte)0x40, (byte)0x63, (byte)0x56, (byte)0x67, (byte)0x56, (byte)0xDC,
602 (byte)0x63, (byte)0x69, (byte)0x56, (byte)0xB1, (byte)0xB1, (byte)0x40, (byte)0x56, (byte)0xBB,
603 (byte)0x55, (byte)0x56, (byte)0x73, (byte)0x40, (byte)0x56, (byte)0xBF, (byte)0x63, (byte)0xBB,
604 (byte)0x55, (byte)0x56, (byte)0xB1, (byte)0xBB, (byte)0x40, (byte)0x9A, (byte)0xBB, (byte)0x40,
605 (byte)0x6B, (byte)0xDC, (byte)0xBD, (byte)0x8F, (byte)0xCF, (byte)0xB1, (byte)0x56, (byte)0x40,
606 (byte)0xBD, (byte)0xDC, (byte)0xBB, (byte)0x49, (byte)0x63, (byte)0xB1, (byte)0x56, (byte)0x40,
607 (byte)0x62, (byte)0x77, (byte)0x77, (byte)0x52, (byte)0xBB, (byte)0x40, (byte)0xDC, (byte)0xAB,
608 (byte)0x40, (byte)0x62, (byte)0x73, (byte)0xDC, (byte)0x73, (byte)0x9A, (byte)0x40, (byte)0xBD,
609 (byte)0xDC, (byte)0xBB, (byte)0x49, (byte)0x63, (byte)0x40, (byte)0x67, (byte)0xBB, (byte)0x56,
610 (byte)0x75, (byte)0x58, (byte)0x40, (byte)0x63, (byte)0x74, (byte)0xAB, (byte)0xBD, (byte)0xCF,
611 (byte)0x40, (byte)0x63, (byte)0x9A, (byte)0x8D, (byte)0xCF,
616 charsetMatch = m.getName();
617 CheckAssert(charsetMatch.equals("IBM420_ltr"));
621 public void TestHebrew() throws Exception {
622 String s = "\u05D4\u05E4\u05E8\u05E7\u05DC\u05D9\u05D8 \u05D4\u05E6\u05D1\u05D0\u05D9 \u05D4" +
623 "\u05E8\u05D0\u05E9\u05D9, \u05EA\u05EA \u05D0\u05DC\u05D5\u05E3 \u05D0\u05D1\u05D9" +
624 "\u05D7\u05D9 \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8, \u05D4\u05D5\u05E8" +
625 "\u05D4 \u05E2\u05DC \u05E4\u05EA\u05D9\u05D7\u05EA \u05D7\u05E7\u05D9\u05E8\u05EA " +
626 "\u05DE\u05E6\"\u05D7 \u05D1\u05E2\u05E7\u05D1\u05D5\u05EA \u05E2\u05D3\u05D5\u05D9" +
627 "\u05D5\u05EA \u05D7\u05D9\u05D9\u05DC\u05D9 \u05E6\u05D4\"\u05DC \u05DE\u05DE\u05D1" +
628 "\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4 \u05D1+ " +
629 "\u05E8\u05E6\u05D5\u05E2\u05EA \u05E2\u05D6\u05D4. \u05DC\u05D3\u05D1\u05E8\u05D9 " +
630 "\u05D4\u05E4\u05E6\"\u05E8, \u05DE\u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA \u05E2" +
631 "\u05D5\u05DC\u05D4 \u05EA\u05DE\u05D5\u05E0\u05D4 \u05E9\u05DC \"\u05D4\u05EA\u05E0" +
632 "\u05D4\u05D2\u05D5\u05EA \u05E4\u05E1\u05D5\u05DC\u05D4 \u05DC\u05DB\u05D0\u05D5\u05E8" +
633 "\u05D4 \u05E9\u05DC \u05D7\u05D9\u05D9\u05DC\u05D9\u05DD \u05D1\u05DE\u05D4\u05DC\u05DA" +
634 " \u05DE\u05D1\u05E6\u05E2 \u05E2\u05D5\u05E4\u05E8\u05EA \u05D9\u05E6\u05D5\u05E7\u05D4\"." +
635 " \u05DE\u05E0\u05D3\u05DC\u05D1\u05DC\u05D9\u05D8 \u05E7\u05D9\u05D1\u05DC \u05D0\u05EA" +
636 " \u05D4\u05D7\u05DC\u05D8\u05EA\u05D5 \u05DC\u05D0\u05D7\u05E8 \u05E9\u05E2\u05D9\u05D9" +
637 "\u05DF \u05D1\u05EA\u05DE\u05DC\u05D9\u05DC \u05D4\u05E2\u05D3\u05D5\u05D9\u05D5\u05EA";
639 CharsetMatch m = _test1255(s);
640 String charsetMatch = m.getName();
641 CheckAssert(charsetMatch.equals("ISO-8859-8-I"));
642 CheckAssert(m.getLanguage().equals("he"));
644 m = _test1255_reverse(s);
645 charsetMatch = m.getName();
646 CheckAssert(charsetMatch.equals("ISO-8859-8"));
647 CheckAssert(m.getLanguage().equals("he"));
649 m = _testIBM424_he_rtl(s);
650 charsetMatch = m.getName();
651 CheckAssert(charsetMatch.equals("IBM424_rtl"));
652 CheckAssert(m.getLanguage().equals("he"));
655 } catch (Exception ex) {
656 errln("Error getting string for charsetMatch: " + charsetMatch);
659 m = _testIBM424_he_ltr(s);
660 charsetMatch = m.getName();
661 CheckAssert(charsetMatch.equals("IBM424_ltr"));
662 CheckAssert(m.getLanguage().equals("he"));
665 } catch (Exception ex) {
666 errln("Error getting string for charsetMatch: " + charsetMatch);
670 private CharsetMatch _test1255(String s) throws Exception {
671 byte [] bytes = s.getBytes("ISO-8859-8");
672 CharsetDetector det = new CharsetDetector();
674 CharsetMatch m = det.detect();
678 private CharsetMatch _test1255_reverse(String s) throws Exception {
679 StringBuffer reverseStrBuf = new StringBuffer(s);
680 reverseStrBuf = reverseStrBuf.reverse();
681 byte [] bytes = reverseStrBuf.toString().getBytes("ISO-8859-8");
683 CharsetDetector det = new CharsetDetector();
685 CharsetMatch m = det.detect();
689 private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
690 byte [] bytes = s.getBytes("IBM424");
691 CharsetDetector det = new CharsetDetector();
692 det.setDetectableCharset("IBM424_rtl", true);
693 det.setDetectableCharset("IBM424_ltr", true);
694 det.setDetectableCharset("IBM420_rtl", true);
695 det.setDetectableCharset("IBM420_ltr", true);
697 CharsetMatch m = det.detect();
701 private CharsetMatch _testIBM424_he_ltr(String s) throws Exception {
703 * transformation of input string to CP420 left to right requires reversing the string
706 StringBuffer ltrStrBuf = new StringBuffer(s);
707 ltrStrBuf = ltrStrBuf.reverse();
708 byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
710 CharsetDetector det = new CharsetDetector();
711 det.setDetectableCharset("IBM424_rtl", true);
712 det.setDetectableCharset("IBM424_ltr", true);
713 det.setDetectableCharset("IBM420_rtl", true);
714 det.setDetectableCharset("IBM420_ltr", true);
716 CharsetMatch m = det.detect();
721 * Test the method int match(CharsetDetector det) in CharsetRecog_UTF_16_LE
723 public void TestCharsetRecog_UTF_16_LE_Match() {
724 byte[] in = { Byte.MIN_VALUE, Byte.MIN_VALUE, Byte.MIN_VALUE, Byte.MIN_VALUE };
725 CharsetDetector cd = new CharsetDetector();
726 // Tests when if (input.length>=4 && input[2] == 0x00 && input[3] == 0x00) is true inside the
727 // match(CharsetDetector) method of CharsetRecog_UTF_16_LE
730 } catch (Exception e) {
731 errln("CharsetRecog_UTF_16_LE.match(CharsetDetector) was not suppose to return an exception.");
736 // Bug #8309, test case submitted with original bug report.
738 public void TestFreshDetectorEachTime() throws Exception
740 CharsetDetector detector1 = new CharsetDetector();
741 byte[] data1 = createData1();
742 detector1.setText(data1);
743 CharsetMatch match1 = detector1.detect();
744 assertEquals("Expected GB18030", "GB18030", match1.getName());
746 CharsetDetector detector2 = new CharsetDetector();
747 byte[] data2 = createData2();
748 detector2.setText(data2);
749 CharsetMatch match2 = detector2.detect();
750 // It is actually GB18030 but the sample size is way too small to be reliable.
751 assertEquals("Expected ISO-8859-1, even though that isn't strictly correct", "ISO-8859-1", match2.getName());
754 public void TestReusingDetector() throws Exception
756 CharsetDetector detector = new CharsetDetector();
758 byte[] data1 = createData1();
759 detector.setText(data1);
760 CharsetMatch match1 = detector.detect();
761 assertEquals("Expected GB18030", "GB18030", match1.getName());
762 byte[] data2 = createData2();
763 detector.setText(data2);
764 CharsetMatch match2 = detector.detect();
765 assertEquals("Expected ISO-8859-1, even though that isn't strictly correct", "ISO-8859-1", match2.getName());
766 // calling detect() one more time without changing the input data
767 CharsetMatch match2a = detector.detect();
768 assertEquals("[second]Expected ISO-8859-1, even though that isn't strictly correct", "ISO-8859-1", match2a.getName());
771 private static byte[] createData1()
773 return bytesFromString("3B 3B 3B 20 2D 2A 2D 20 4D 6F 64 65 3A 20 4C 49 53 50 3B 20 53 79 6E 74 61 78 " +
774 "3A 20 43 6F 6D 6D 6F 6E 2D 6C 69 73 70 3B 20 50 61 63 6B 61 67 65 3A 20 41 4C " +
775 "45 4D 42 49 43 20 3B 20 42 61 73 65 3A 20 31 30 20 2D 2A 2D 0D 0A 0D 0A 28 69 " +
776 "6E 2D 70 61 63 6B 61 67 65 20 22 41 4C 45 4D 42 49 43 22 29 0D 0A 0D 0A 28 6E " +
777 "6F 74 69 63 65 20 22 43 6F 70 79 72 69 67 68 74 20 74 68 65 20 4D 49 54 52 45 " +
778 "20 43 6F 72 70 6F 72 61 74 69 6F 6E 20 31 39 39 37 2D 31 39 39 38 2E 20 20 41 " +
779 "6C 6C 20 72 69 67 68 74 73 20 72 65 73 65 72 76 65 64 2E 22 29 0D 0A 0D 0A 3B " +
780 "3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B " +
781 "3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B " +
782 "3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 0D 0A 3B 3B 3B 20 20 " +
783 "20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 " +
784 "20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 " +
785 "20 20 20 20 20 20 20 20 20 20 20 20 3B 3B 3B 0D 0A 3B 3B 3B 20 20 20 43 68 69 " +
786 "6E 65 73 65 20 73 75 66 66 69 78 2C 20 70 72 65 66 69 78 20 61 6E 64 20 61 66 " +
787 "66 69 78 20 70 72 65 64 69 63 61 74 65 73 2E 09 09 20 20 20 20 20 3B 3B 3B 0D " +
788 "0A 3B 3B 3B 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 " +
789 "20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 " +
790 "20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 3B 3B 3B 0D 0A 3B 3B 3B " +
791 "3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B " +
792 "3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B " +
793 "3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 3B 0D 0A 0D 0A 28 64 65 66 76 " +
794 "61 72 20 2A 7A 68 2D 6E 61 6D 65 2D 66 69 6E 61 6C 2D 63 68 61 72 73 2A 0D 0A " +
795 "20 20 20 20 27 28 3B 3B 20 63 6F 6D 6D 6F 6E 20 4A 61 70 61 6E 65 73 65 20 50 " +
796 "65 72 73 6E 61 6D 65 20 65 6E 64 69 6E 67 0D 0A 20 20 20 20 20 20 22 CC AB C0 " +
797 "C9 22 0D 0A 20 20 20 20 20 20 3B 3B 20 4D 65 79 65 72 3F 0D 0A 20 20 20 20 20 " +
798 "20 3B 3B 20 C3 B7 C8 D5 0D 0A 20 20 20 20 20 20 22 B0 A2 22 20 22 B0 A3 22 20 " +
799 "22 B0 B2 22 20 22 B0 BA 22 20 22 B0 C2 22 20 22 B0 CD 22 0D 0A 20 20 20 20 20 " +
800 "20 22 B0 D7 22 20 22 B0 DD 22 20 22 B0 E0 22 20 22 B0 EE 22 20 22 B1 A4 22 20 " +
801 "22 B1 AB 22 0D 0A 20 20 20 20 20 20 22 B1 B4 22 20 22 B1 BE 22 20 22 B1 C8 22 " +
802 "20 22 B1 CF 22 20 22 B1 F6 22 20 22 B2 A8 22 0D 0A 20 20 20 20 20 20 22 B2 A9 " +
803 "22 20 22 B2 AA 22 20 22 B2 AE 22 20 22 B2 B7 22 20 22 B2 BC 22 20 22 B2 C9 22 " +
804 "0D 0A 20 20 20 20 20 20 22 B2 DF 22 20 22 B2 E9 22 20 22 B2 EC 22 20 22 B2 EE " +
805 "22 20 22 B3 B9 22 20 22 B4 C4 22 0D 0A 20 20 20 20 20 20 22 B4 EF 22 20 22 B4 " +
806 "F3 22 20 22 B4 F7 22 20 22 B4 FA 22 20 22 B5 A4 22 20 22 B5 B1 22 0D 0A 20 20 " +
807 "20 20 20 20 22 B5 C0 22 20 22 B5 C2 22 20 22 B5 C3 22 20 22 B5 C7 22 20 22 B5 " +
808 "CF 22 20 22 B5 D0 22 0D 0A 20 20 20 20 20 20 22 B5 D9 22 20 22 B5 DA 22 20 22 " +
809 "B5 D8 22 20 22 B6 A1 22 20 22 B6 AB 22 20 22 B6 BC 22 0D 0A 20 20 20 20 20 20 " +
810 "22 B6 C5 22 20 22 B6 D6 22 20 22 B6 D8 22 20 22 B6 D9 22 20 22 B6 E0 22 20 22 " +
811 "B6 F2 22 0D 0A 20 20 20 20 20 20 22 B6 F7 22 20 22 B6 FA 22 20 22 B6 FB 22 20 " +
812 "22 B7 A8 22 20 22 B7 B2 22 20 22 B7 B6 22 0D 0A 20 20 20 20 20 20 22 B7 BD 22 " +
813 "20 22 B7 C6 22 20 22 B7 D1 22 20 22 B7 D2 22 20 22 B7 E1 22 20 22 B7 EB 22 0D " +
814 "0A 20 20 20 20 20 20 22 B7 F0 22 20 22 B7 F2 22 20 22 B8 A3 22 20 22 B8 A5 22 " +
815 "20 22 B8 BB 22 20 22 B8 C7 22 0D 0A 20 20 20 20 20 20 22 B8 D4 22 20 22 B8 DF " +
816 "22 20 22 B8 E7 22 20 22 B8 EA 22 20 22 B8 F1 22 20 22 B8 F7 22 0D 0A 20 20 20 " +
817 "20 20 20 22 B8 F9 22 20 22 B9 B1 22 20 22 B9 C5 22 20 22 B9 E7 22 20 22 B9 FB " +
818 "22 20 22 B9 FE 22 0D 0A 20 20 20 20 20 20 22 BA A3 22 20 22 BA B2 22 20 22 BA " +
819 "BA 22 20 22 BA C0 22 20 22 BA D5 22 20 22 BA DA 22 0D 0A 20 20 20 20 20 20 22 " +
820 "BA E0 22 20 22 BA E9 22 20 22 BB AA 22 20 22 BB DD 22 20 22 BB F4 22 20 22 BB " +
821 "F9 22 0D 0A 20 20 20 20 20 20 22 BC AA 22 20 22 BC BE 22 20 22 BC CE 22 20 22 " +
822 "BC D3 22 20 22 BC D6 22 20 22 BC F2 22 0D 0A 20 20 20 20 20 20 22 BD AA 22 20 " +
823 "22 BD DC 22 20 22 BD F0 22 20 22 BD F1 22 20 22 BD F2 22 20 22 BF A1 22 0D 0A " +
824 "20 20 20 20 20 20 22 BF A8 22 20 22 BF AD 22 20 22 BF B2 22 20 22 BF B5 22 20 " +
825 "22 BF BC 22 20 22 BF C6 22 0D 0A 20 20 20 20 20 20 22 BF CB 22 20 22 BF CF 22 " +
826 "20 22 BF E2 22 20 22 BF E4 22 20 22 C0 A4 22 20 22 C0 A5 22 0D 0A 20 20 20 20 " +
827 "20 20 22 C0 AD 22 20 22 C0 B0 22 20 22 C0 B3 22 20 22 C0 B5 22 20 22 C0 BC 22 " +
828 "20 22 C0 CA 22 0D 0A 20 20 20 20 20 20 22 C0 CD 22 20 22 C0 D5 22 20 22 C0 D6 " +
829 "22 20 22 C0 D7 22 20 22 C0 E8 22 20 22 C0 EE 22 0D 0A 20 20 20 20 20 20 22 C0 " +
830 "EF 22 20 22 C0 F1 22 20 22 C0 F2 22 20 22 C0 F6 22 20 22 C0 FB 22 20 22 C1 AB " +
831 "22 0D 0A 20 20 20 20 20 20 22 C1 AE 22 20 22 C1 BC 22 20 22 C1 D0 22 20 22 C1 " +
832 "D2 22 20 22 C1 D5 22 20 22 C1 D6 22 0D 0A 20 20 20 20 20 20 22 C1 F5 22 20 22 " +
833 "C1 F8 22 20 22 C1 FA 22 20 22 C2 A1 22 20 22 C2 AC 22 20 22 C2 B3 22 0D 0A 20 " +
834 "20 20 20 20 20 22 C2 D4 22 20 22 C2 D7 22 20 22 C2 DE 22 20 22 C2 E5 22 20 22 " +
835 "C2 EA 22 20 22 C2 ED 22 0D 0A 20 20 20 20 20 20 22 C2 F5 22 20 22 C2 FC 22 20 " +
836 "22 C3 A2 22 20 22 C3 B7 22 20 22 C3 C5 22 20 22 C3 C9 22 0D 0A 20 20 20 20 20 " +
837 "20 22 C3 D7 22 20 22 C3 DC 22 20 22 C3 F7 22 20 22 C4 A6 22 20 22 C4 AA 22 20 " +
838 "22 C4 AC 22 0D 0A 20 20 20 20 20 20 22 C4 B7 22 20 22 C4 B8 22 20 22 C4 C3 22 " +
839 "20 22 C4 C7 22 20 22 C4 C8 22 20 22 C4 C9 22 0D 0A 20 20 20 20 20 20 22 C4 CE " +
840 "22 20 22 C4 CF 22 20 22 C4 DA 22 20 22 C4 DD 22 20 22 C4 E1 22 20 22 C4 F9 22 " +
841 "0D 0A 20 20 20 20 20 20 22 C4 FE 22 20 22 C5 A6 22 20 22 C5 A9 22 20 22 C5 AC " +
842 "22 20 22 C5 B5 22 20 22 C5 B7 22 0D 0A 20 20 20 20 20 20 22 C5 C1 22 20 22 C5 " +
843 "CB 22 20 22 C5 D3 22 20 22 C5 E5 22 20 22 C5 ED 22 20 22 C5 EE 22 0D 0A 20 20 " +
844 "20 20 20 20 22 C6 A4 22 20 22 C6 BD 22 20 22 C6 D5 22 20 22 C6 E6 22 20 22 C6 " +
845 "EB 22 20 22 C6 F5 22 0D 0A 20 20 20 20 20 20 22 C7 A1 22 20 22 C7 C7 22 20 22 " +
846 "C7 D0 22 20 22 C7 D5 22 20 22 C7 D9 22 20 22 C7 E5 22 0D 0A 20 20 20 20 20 20 " +
847 "22 C7 ED 22 20 22 C7 F0 22 20 22 C7 F5 22 20 22 C8 AA 22 20 22 C8 C3 22 20 22 " +
848 "C8 C8 22 0D 0A 20 20 20 20 20 20 22 C8 E5 22 20 22 C8 F6 22 20 22 C8 F8 22 20 " +
849 "22 C8 FB 22 20 22 C8 FC 22 20 22 C9 A3 22 0D 0A 20 20 20 20 20 20 22 C9 AA 22 " +
850 "20 22 C9 AD 22 20 22 C9 AF 22 20 22 C9 B3 22 20 22 C9 BA 22 0D 0A 20 20 20 20 " +
851 "20 20 3B 3B 20 22 C9 CF 22 0D 0A 20 20 20 20 20 20 22 C9 D0 22 20 22 C9 DC 22 " +
852 "20 22 C9 E1 22 20 22 C9 EA 22 20 22 C9 FA 22 20 22 CA A9 22 0D 0A 20 20 20 20 " +
853 "20 20 22 CA AB 22 20 22 CA AF 22 20 22 CA B2 22 20 22 CA BF 22 20 22 CA D2 22 " +
854 "20 22 CB B9 22 0D 0A 20 20 20 20 20 20 22 CB BC 22 20 22 CB BF 22 20 22 CB C9 " +
855 "22 20 22 CB D5 22 20 22 CB D8 22 20 22 CB E7 22 0D 0A 20 20 20 20 20 20 22 CB " +
856 "F7 22 20 22 CB FE 22 20 22 CC A9 22 20 22 CC AB 22 20 22 CC B9 22 20 22 CC C0 " +
857 "22 0D 0A 20 20 20 20 20 20 22 CC C6 22 20 22 CC D5 22 20 22 CC D8 22 20 22 CC " +
858 "E1 22 20 22 CD A1 22 20 22 CD A2 22 0D 0A 20 20 20 20 20 20 22 CD A8 22 20 22 " +
859 "CD B8 22 20 22 CD BC 22 20 22 CD D0 22 20 22 CD D1 22 20 22 CD DE 22 0D 0A 20 " +
860 "20 20 20 20 20 22 CD DF 22 20 22 CD F2 22 20 22 CD FA 22 20 22 CD FE 22 20 22 " +
861 "CE A2 22 20 22 CE A4 22 0D 0A 20 20 20 20 20 20 22 CE AC 22 20 22 CE C0 22 20 " +
862 "22 CE C2 22 20 22 CE C4 22 20 22 CE CC 22 20 22 CE D6 22 0D 0A 20 20 20 20 20 " +
863 "20 22 CE DA 22 20 22 CE F7 22 20 22 CE FD 22 20 22 CF A3 22 20 22 CF BC 22 20 " +
864 "22 CF C4 22 0D 0A 20 20 20 20 20 20 22 CF E3 22 0D 0A 20 20 20 20 20 20 3B 3B " +
865 "20 70 72 65 70 6F 73 69 74 69 6F 6E 0D 0A 20 20 20 20 20 20 3B 3B 20 22 CF F2 " +
866 "22 0D 0A 20 20 20 20 20 20 22 D0 A4 22 20 22 D0 AA 22 20 22 D0 BB 22 0D 0A 20 " +
867 "20 20 20 20 20 22 D0 C0 22 20 22 D0 C1 22 20 22 D0 CB 22 20 22 D0 D0 22 20 22 " +
868 "D0 D2 22 20 22 D0 DB 22 0D 0A 20 20 20 20 20 20 22 D0 DD 22 20 22 D1 B7 22 20 " +
869 "22 D1 C0 22 20 22 D1 C7 22 20 22 D1 D3 22 20 22 D1 EF 22 0D 0A 20 20 20 20 20 " +
870 "20 22 D2 AE 22 20 22 D2 B6 22 20 22 D2 C1 22 20 22 D2 F2 22 20 22 D3 A2 22 20 " +
871 "22 D3 C8 22 0D 0A 20 20 20 20 20 20 22 D4 BC 22 20 22 D4 D7 22 20 22 D4 DE 22 " +
872 "20 22 D4 E7 22 20 22 D4 F3 22 20 22 D4 F8 22 0D 0A 20 20 20 20 20 20 22 D4 FA " +
873 "22 20 22 D5 B2 22 20 22 D5 C2 22 20 22 D6 A5 22 20 22 D6 BA 22 20 22 D6 CE 22 " +
874 "0D 0A 20 20 20 20 20 20 22 D6 E9 22 20 22 D7 C8 22 20 22 D7 CC 22 20 22 D7 DA " +
875 "22 20 22 D7 F4 22 20 22 E1 AF 22 0D 0A 20 20 20 20 20 20 22 E6 AB 22 20 22 E6 " +
876 "DA 22 20 22 E7 D1 22 20 22 E7 EA 22 20 22 E7 F7 22 20 22 E8 A7 22 0D 0A 20 20 " +
877 "20 20 20 20 22 E9 AA 22 20 22 EB F8 22 20 22 F7 EB 22 0D 0A 20 20 20 20 20 20 " +
878 "29 29 0D 0A 0D 0A 28 64 65 66 76 61 72 20 2A 7A 68 2D 6E 61 6D 65 2D 69 6E 69 " +
879 "74 69 61 6C 2D 63 68 61 72 73 2A 0D 0A 20 20 20 20 27 28 22 B0 A2 22 20 22 B0 " +
880 "A3 22 20 22 B0 AC 22 20 22 B0 AE 22 20 22 B0 B2 22 20 22 B0 BA 22 20 22 B0 C2 " +
881 "22 0D 0A 20 20 20 20 20 20 22 B0 C4 22 20 22 B0 CD 22 20 22 B0 D7 22 20 22 B0 " +
882 "DD 22 20 22 B0 E0 22 20 22 B0 EE 22 0D 0A 20 20 20 20 20 20 22 B1 A3 22 20 22 " +
883 "B1 AB 22 20 22 B1 B4 22 20 22 B1 BE 22 20 22 B1 C8 22 20 22 B1 CB 22 0D 0A 20 " +
884 "20 20 20 20 20 22 B1 CF 22 20 22 B1 F6 22 20 22 B2 A8 22 20 22 B2 A9 22 20 22 " +
885 "B2 AE 22 20 22 B2 BC 22 0D 0A 20 20 20 20 20 20 22 B2 CC 22 20 22 B2 E9 22 20 " +
886 "22 B2 F1 22 20 22 B2 FD 22 20 22 B3 B9 22 20 22 B3 FE 22 0D 0A 20 20 20 20 20 " +
887 "20 22 B4 C4 22 20 22 B4 EF 22 20 22 B4 F7 22 20 22 B4 FA 22 20 22 B5 A4 22 20 " +
888 "22 B5 B1 22 0D 0A 20 20 20 20 20 20 22 B5 C0 22 20 22 B5 C2 22 20 22 B5 C7 22 " +
889 "20 22 B5 CB 22 20 22 B5 CF 22 20 22 B5 D2 22 0D 0A 20 20 20 20 20 20 22 B5 D9 " +
890 "22 20 22 B6 A1 22 20 22 B6 C5 22 20 22 B6 CB 22 20 22 B6 D8 22 20 22 B6 D9 22 " +
891 "0D 0A 20 20 20 20 20 20 22 B6 E0 22 20 22 B6 F2 22 20 22 B6 F7 22 20 22 B7 A8 " +
892 "22 20 22 B7 B6 22 20 22 B7 BD 22 0D 0A 20 20 20 20 20 20 22 B7 C6 22 20 22 B7 " +
893 "D1 22 20 22 B7 D2 22 20 22 B7 EB 22 20 22 B7 F2 22 20 22 B7 F0 22 0D 0A 20 20 " +
894 "20 20 20 20 22 B7 FC 22 20 22 B8 A3 22 20 22 B8 A5 22 20 22 B8 BB 22 20 22 B8 " +
895 "C7 22 20 22 B8 CA 22 0D 0A 20 20 20 20 20 20 22 B8 D4 22 20 22 B8 DF 22 20 22 " +
896 "B8 E7 22 20 22 B8 EA 22 20 22 B8 F1 22 20 22 B8 F9 22 0D 0A 20 20 20 20 20 20 " +
897 "22 B9 B1 22 20 22 B9 C5 22 20 22 B9 CF 22 20 22 B9 E7 22 20 22 B9 FA 22 20 22 " +
898 "B9 FE 22 0D 0A 20 20 20 20 20 20 22 BA A3 22 20 22 BA BA 22 20 22 BA BC 22 20 " +
899 "22 BA C0 22 20 22 BA D5 22 20 22 BA DA 22 0D 0A 20 20 20 20 20 20 22 BA E0 22 " +
900 "20 22 BA F4 22 20 22 BA FA 22 20 22 BB AA 22 20 22 BB B3 22 20 22 BB DD 22 0D " +
901 "0A 20 20 20 20 20 20 22 BB F4 22 20 22 BB F9 22 20 22 BC AA 22 20 22 BC B8 22 " +
902 "20 22 BC D3 22 20 22 BC D6 22 0D 0A 20 20 20 20 20 20 22 BC F2 22 20 22 BC FB " +
903 "22 20 22 BD AD 22 20 22 BD DC 22 20 22 BD F0 22 20 22 BD F2 22 0D 0A 20 20 20 " +
904 "20 20 20 22 BE AE 22 20 22 BF A8 22 20 22 BF AD 22 20 22 BF B2 22 20 22 BF B5 " +
905 "22 20 22 BF BC 22 0D 0A 20 20 20 20 20 20 22 BF C2 22 20 22 BF C6 22 20 22 BF " +
906 "CB 22 20 22 BF CF 22 20 22 BF D7 22 20 22 BF E2 22 0D 0A 20 20 20 20 20 20 22 " +
907 "BF E4 22 20 22 BF EF 22 20 22 BF FC 22 20 22 BF FD 22 20 22 C0 A5 22 20 22 C0 " +
908 "AD 22 0D 0A 20 20 20 20 20 20 22 C0 B0 22 20 22 C0 B3 22 20 22 C0 B5 22 20 22 " +
909 "C0 BC 22 20 22 C0 CA 22 20 22 C0 CD 22 0D 0A 20 20 20 20 20 20 22 C0 D5 22 20 " +
910 "22 C0 D7 22 20 22 C0 E8 22 20 22 C0 ED 22 20 22 C0 EE 22 20 22 C0 EF 22 0D 0A " +
911 "20 20 20 20 20 20 22 C0 F2 22 20 22 C0 F6 22 20 22 C0 FB 22 20 22 C1 D0 22 20 " +
912 "22 C1 D5 22 20 22 C1 D6 22 0D 0A 20 20 20 20 20 20 22 C1 F5 22 20 22 C1 F8 22 " +
913 "20 22 C1 FA 22 20 22 C2 A1 22 20 22 C2 AC 22 20 22 C2 B3 22 0D 0A 20 20 20 20 " +
914 "20 20 22 C2 B6 22 20 22 C2 B7 22 20 22 C2 C0 22 20 22 C2 D7 22 20 22 C2 DE 22 " +
915 "20 22 C2 E5 22 0D 0A 20 20 20 20 20 20 22 C2 EA 22 20 22 C2 ED 22 20 22 C2 F3 " +
916 "22 20 22 C2 F5 22 20 22 C2 FC 22 20 22 C3 A2 22 0D 0A 20 20 20 20 20 20 22 C3 " +
917 "AB 22 20 22 C3 B7 22 20 22 C3 C5 22 20 22 C3 C9 22 20 22 C3 CF 22 20 22 C3 D7 " +
918 "22 0D 0A 20 20 20 20 20 20 22 C3 F7 22 20 22 C4 A6 22 20 22 C4 AA 22 20 22 C4 " +
919 "AB 22 20 22 C4 AC 22 20 22 C4 B7 22 0D 0A 20 20 20 20 20 20 22 C4 C2 22 20 22 " +
920 "C4 C9 22 20 22 C4 CE 22 20 22 C4 CF 22 20 22 C4 DA 22 20 22 C4 DD 22 0D 0A 20 " +
921 "20 20 20 20 20 22 C4 E1 22 20 22 C4 FE 22 20 22 C5 A3 22 20 22 C5 A6 22 20 22 " +
922 "C5 A9 22 20 22 C5 AC 22 0D 0A 20 20 20 20 20 20 22 C5 B5 22 20 22 C5 B7 22 20 " +
923 "22 C5 C1 22 20 22 C5 C9 22 20 22 C5 CB 22 20 22 C5 D3 22 0D 0A 20 20 20 20 20 " +
924 "20 22 C5 E0 22 20 22 C5 E5 22 20 22 C5 ED 22 20 22 C6 A4 22 20 22 C6 BD 22 20 " +
925 "22 C6 D3 22 0D 0A 20 20 20 20 20 20 22 C6 D5 22 20 22 C6 CF 22 20 22 C6 E6 22 " +
926 "20 22 C6 EB 22 20 22 C7 A1 22 20 22 C7 AE 22 0D 0A 20 20 20 20 20 20 22 C7 C5 " +
927 "22 20 22 C7 C7 22 20 22 C7 D0 22 20 22 C7 D5 22 20 22 C7 ED 22 20 22 C7 F0 22 " +
928 "0D 0A 20 20 20 20 20 20 22 C7 F1 22 20 22 C8 C8 22 20 22 C8 D9 22 20 22 C8 E5 " +
929 "22 20 22 C8 F0 22 20 22 C8 F6 22 0D 0A 20 20 20 20 20 20 22 C8 F8 22 20 22 C8 " +
930 "FB 22 20 22 C8 FC 22 20 22 C9 A3 22 20 22 C9 AA 22 20 22 C9 AD 22 0D 0A 20 20 " +
931 "20 20 20 20 22 C9 AF 22 20 22 C9 B3 22 20 22 C9 DC 22 20 22 C9 E1 22 20 22 C9 " +
932 "EA 22 20 22 CA A5 22 0D 0A 20 20 20 20 20 20 22 CA A9 22 20 22 CA B7 22 20 22 " +
933 "CA E6 22 20 22 CB B7 22 20 22 CB B9 22 20 22 CB BC 22 0D 0A 20 20 20 20 20 20 " +
934 "22 CB C9 22 20 22 CB D5 22 20 22 CB F7 22 20 22 CB F9 22 20 22 CB FE 22 20 22 " +
935 "CC A9 22 0D 0A 20 20 20 20 20 20 22 CC B9 22 20 22 CC C0 22 20 22 CC C6 22 20 " +
936 "22 CC D5 22 20 22 CC D8 22 20 22 CC FA 22 0D 0A 20 20 20 20 20 20 22 CD A2 22 " +
937 "20 22 CD A8 22 20 22 CD BC 22 20 22 CD CF 22 20 22 CD D0 22 20 22 CD DF 22 0D " +
938 "0A 20 20 20 20 20 20 22 CD F2 22 20 22 CD FA 22 20 22 CD FE 22 20 22 CE A4 22 " +
939 "20 22 CE AC 22 20 22 CE BA 22 0D 0A 20 20 20 20 20 20 22 CE C2 22 20 22 CE C4 " +
940 "22 20 22 CE CC 22 20 22 CE D6 22 20 22 CE DA 22 20 22 CE E4 22 0D 0A 20 20 20 " +
941 "20 20 20 22 CE E9 22 20 22 CE F7 22 20 22 CE FD 22 20 22 CF A3 22 20 22 CF A4 " +
942 "22 20 22 CF AF 22 0D 0A 20 20 20 20 20 20 22 CF C4 22 20 22 CF E3 22 20 22 D0 " +
943 "A4 22 20 22 D0 AA 22 20 22 D0 BB 22 20 22 D0 C0 22 0D 0A 20 20 20 20 20 20 22 " +
944 "D0 C1 22 20 22 D0 DD 22 20 22 D0 DE 22 20 22 D0 ED 22 20 22 D0 F0 22 20 22 D1 " +
945 "A6 22 0D 0A 20 20 20 20 20 20 22 D1 A9 22 20 22 D1 C5 22 20 22 D1 C7 22 20 22 " +
946 "D1 D3 22 20 22 D1 EF 22 20 22 D2 AB 22 0D 0A 20 20 20 20 20 20 22 D2 AE 22 20 " +
947 "22 D2 B6 22 20 22 D2 C1 22 20 22 D2 D7 22 20 22 D3 A1 22 20 22 D3 A2 22 0D 0A " +
948 "20 20 20 20 20 20 22 D3 C8 22 20 22 D3 DA 22 20 22 D3 EA 22 20 22 D4 BC 22 20 " +
949 "22 D4 DE 22 20 22 D4 F3 22 0D 0A 20 20 20 20 20 20 22 D4 F8 22 20 22 D5 A7 22 " +
950 "20 22 D4 FA 22 20 22 D5 B2 22 20 22 D5 C5 22 20 22 D5 E4 22 0D 0A 20 20 20 20 " +
951 "20 20 22 D6 A5 22 20 22 D6 EC 22 20 22 D7 C8 22 20 22 D7 DA 22 20 22 D7 F4 22 " +
952 "20 22 E7 EA 22 0D 0A 20 20 20 20 20 20 22 E8 A7 22 20 22 EB F8 22 20 22 F7 EC " +
953 "22 0D 0A 20 20 20 20 20 20 29 29 0D 0A 0D 0A 28 64 65 66 76 61 72 20 2A 7A 68 " +
954 "2D 6E 61 6D 65 2D 6D 65 64 69 61 6C 2D 63 68 61 72 73 2A 0D 0A 20 20 20 20 27 " +
955 "28 22 A1 A4 22 20 22 A3 AE 22 20 22 B0 A2 22 20 22 B0 A3 22 20 22 B0 AC 22 20 " +
956 "22 B0 AE 22 0D 0A 20 20 20 20 20 20 22 B0 B2 22 20 22 B0 BA 22 20 22 B0 C2 22 " +
957 "20 22 B0 C4 22 20 22 B0 CD 22 20 22 B0 D7 22 0D 0A 20 20 20 20 20 20 22 B0 DD " +
958 "22 20 22 B0 E0 22 20 22 B0 EE 22 20 22 B1 AB 22 20 22 B1 B1 22 20 22 B1 B4 22 " +
959 "0D 0A 20 20 20 20 20 20 22 B1 B6 22 20 22 B1 BE 22 20 22 B1 C8 22 20 22 B1 CB " +
960 "22 20 22 B1 D9 22 20 22 B1 F0 22 0D 0A 20 20 20 20 20 20 22 B1 F6 22 20 22 B2 " +
961 "A8 22 20 22 B2 AA 22 20 22 B2 A9 22 20 22 B2 AE 22 20 22 B2 B7 22 0D 0A 20 20 " +
962 "20 20 20 20 22 B2 BC 22 20 22 B2 DF 22 20 22 B2 E9 22 20 22 B2 F1 22 20 22 B3 " +
963 "B9 22 20 22 B3 C2 22 0D 0A 20 20 20 20 20 20 22 B4 C4 22 20 22 B4 CE 22 20 22 " +
964 "B4 EF 22 20 22 B4 F3 22 20 22 B4 F7 22 20 22 B4 FA 22 0D 0A 20 20 20 20 20 20 " +
965 "22 B5 A4 22 20 22 B5 B1 22 20 22 B5 C0 22 20 22 B5 C2 22 20 22 B5 C3 22 20 22 " +
966 "B5 C7 22 0D 0A 20 20 20 20 20 20 22 B5 CB 22 20 22 B5 CF 22 20 22 B5 D2 22 20 " +
967 "22 B5 D8 22 20 22 B5 DA 22 20 22 B5 D9 22 0D 0A 20 20 20 20 20 20 22 B6 A1 22 " +
968 "20 22 B6 AB 22 20 22 B6 C5 22 20 22 B6 D9 22 20 22 B6 E0 22 20 22 B6 F2 22 0D " +
969 "0A 20 20 20 20 20 20 22 B6 ED 22 20 22 B6 F7 22 20 22 B6 FB 22 20 22 B7 A8 22 " +
970 "20 22 B7 B6 22 20 22 B7 BD 22 0D 0A 20 20 20 20 20 20 22 B7 C6 22 20 22 B7 C7 " +
971 "22 20 22 B7 D1 22 20 22 B7 D2 22 20 22 B7 E1 22 20 22 B7 EB 22 0D 0A 20 20 20 " +
972 "20 20 20 22 B7 F0 22 20 22 B7 F2 22 20 22 B8 A3 22 20 22 B8 A5 22 20 22 B8 B5 " +
973 "22 20 22 B8 BB 22 0D 0A 20 20 20 20 20 20 22 B8 C7 22 20 22 B8 CA 22 20 22 B8 " +
974 "DF 22 20 22 B8 E7 22 20 22 B8 EA 22 20 22 B8 EF 22 0D 0A 20 20 20 20 20 20 22 " +
975 "B8 F0 22 20 22 B8 F1 22 20 22 B8 F7 22 20 22 B8 F9 22 20 22 B8 FC 22 20 22 B9 " +
976 "B1 22 0D 0A 20 20 20 20 20 20 22 B9 C3 22 20 22 B9 C5 22 20 22 B9 CF 22 20 22 " +
977 "B9 E7 22 20 22 B9 FA 22 20 22 B9 FE 22 0D 0A 20 20 20 20 20 20 22 BA A3 22 20 " +
978 "22 BA B1 22 20 22 BA B2 22 20 22 BA BA 22 20 22 BA C0 22 20 22 BA C1 22 0D 0A " +
979 "20 20 20 20 20 20 22 BA D5 22 20 22 BA DA 22 20 22 BA E0 22 20 22 BA E3 22 20 " +
980 "22 BA E9 22 20 22 BA EE 22 0D 0A 20 20 20 20 20 20 22 BA FA 22 20 22 BB AA 22 " +
981 "20 22 BB B3 22 20 22 BB DD 22 20 22 BB F4 22 20 22 BB F9 22 0D 0A 20 20 20 20 " +
982 "20 20 22 BC AA 22 20 22 BC BE 22 20 22 BC CE 22 20 22 BC D3 22 20 22 BC D6 22 " +
983 "20 22 BD DC 22 0D 0A 20 20 20 20 20 20 22 BD F0 22 20 22 BD F2 22 20 22 BF A8 " +
984 "22 20 22 BF AA 22 20 22 BF AD 22 20 22 BF B2 22 0D 0A 20 20 20 20 20 20 22 BF " +
985 "B5 22 20 22 BF BC 22 20 22 BF C2 22 20 22 BF C6 22 20 22 BF CB 22 20 22 BF A6 " +
986 "22 0D 0A 20 20 20 20 20 20 22 BF CF 22 20 22 BF D7 22 20 22 BF DB 22 20 22 BF " +
987 "E2 22 20 22 BF E4 22 20 22 BF FC 22 0D 0A 20 20 20 20 20 20 22 C0 A5 22 20 22 " +
988 "C0 AA 22 20 22 C0 AD 22 20 22 C0 B0 22 20 22 C0 B3 22 20 22 C0 B4 22 0D 0A 20 " +
989 "20 20 20 20 20 22 C0 B5 22 20 22 C0 BC 22 20 22 C0 CA 22 20 22 C0 CD 22 20 22 " +
990 "C0 D5 22 20 22 C0 D7 22 0D 0A 20 20 20 20 20 20 22 C0 D9 22 20 22 C0 E8 22 20 " +
991 "22 C0 ED 22 20 22 C0 EE 22 20 22 C0 EF 22 20 22 C0 F1 22 0D 0A 20 20 20 20 20 " +
992 "20 22 C0 F2 22 20 22 C0 F6 22 20 22 C0 FA 22 20 22 C0 FB 22 20 22 C1 A2 22 20 " +
993 "22 C1 AC 22 0D 0A 20 20 20 20 20 20 22 C1 AB 22 20 22 C1 AA 22 20 22 C1 AE 22 " +
994 "20 22 C1 BF 22 20 22 C1 D0 22 20 22 C1 D2 22 0D 0A 20 20 20 20 20 20 22 C1 D5 " +
995 "22 20 22 C1 D6 22 20 22 C1 F4 22 20 22 C1 F8 22 20 22 C1 FA 22 20 22 C2 A1 22 " +
996 "0D 0A 20 20 20 20 20 20 22 C2 AC 22 20 22 C2 B3 22 20 22 C2 B6 22 20 22 C2 B7 " +
997 "22 20 22 C2 C0 22 20 22 C2 D4 22 0D 0A 20 20 20 20 20 20 22 C2 D7 22 20 22 C2 " +
998 "D8 22 20 22 C2 DC 22 20 22 C2 DE 22 20 22 C2 E5 22 20 22 C2 EA 22 0D 0A 20 20 " +
999 "20 20 20 20 22 C2 ED 22 20 22 C2 F3 22 20 22 C2 F5 22 20 22 C2 FA 22 20 22 C2 " +
1000 "FC 22 20 22 C3 A2 22 0D 0A 20 20 20 20 20 20 22 C3 A9 22 20 22 C3 B7 22 20 22 " +
1001 "C3 C0 22 20 22 C3 C5 22 20 22 C3 C9 22 20 22 C3 D7 22 0D 0A 20 20 20 20 20 20 " +
1002 "22 C3 DC 22 20 22 C3 F4 22 20 22 C3 F7 22 20 22 C4 A6 22 20 22 C4 AA 22 20 22 " +
1003 "C4 AC 22 0D 0A 20 20 20 20 20 20 22 C4 AD 22 20 22 C4 B7 22 20 22 C4 BE 22 20 " +
1004 "22 C4 C2 22 20 22 C4 C3 22 20 22 C4 C7 22 0D 0A 20 20 20 20 20 20 22 C4 C8 22 " +
1005 "20 22 C4 C9 22 20 22 C4 CE 22 20 22 C4 CF 22 20 22 C4 DA 22 20 22 C4 DD 22 0D " +
1006 "0A 20 20 20 20 20 20 22 C4 E1 22 20 22 C4 EA 22 20 22 C4 EE 22 20 22 C4 F9 22 " +
1007 "20 22 C4 FE 22 20 22 C5 A6 22 0D 0A 20 20 20 20 20 20 22 C5 A9 22 20 22 C5 AC " +
1008 "22 20 22 C5 B5 22 20 22 C5 B7 22 20 22 C5 C1 22 20 22 C5 C9 22 0D 0A 20 20 20 " +
1009 "20 20 20 22 C5 CB 22 20 22 C5 D3 22 20 22 C5 E0 22 20 22 C5 E5 22 20 22 C5 ED " +
1010 "22 20 22 C5 EE 22 0D 0A 20 20 20 20 20 20 22 C6 A4 22 20 22 C6 BD 22 20 22 C6 " +
1011 "C3 22 20 22 C6 D5 22 20 22 C6 E1 22 20 22 C6 E6 22 0D 0A 20 20 20 20 20 20 22 " +
1012 "C6 EB 22 20 22 C6 E4 22 20 22 C6 F5 22 20 22 C7 A1 22 20 22 C7 AE 22 20 22 C7 " +
1013 "BF 22 0D 0A 20 20 20 20 20 20 22 C7 C7 22 20 22 C7 D0 22 20 22 C7 D5 22 20 22 " +
1014 "C7 D9 22 20 22 C7 ED 22 20 22 C7 F0 22 0D 0A 20 20 20 20 20 20 22 C7 F3 22 20 " +
1015 "22 C8 B4 22 20 22 C8 C8 22 20 22 C8 E5 22 20 22 C8 F4 22 20 22 C8 F6 22 0D 0A " +
1016 "20 20 20 20 20 20 22 C8 F8 22 20 22 C8 FB 22 20 22 C8 FC 22 20 22 C9 A3 22 20 " +
1017 "22 C9 AA 22 20 22 C9 AD 22 0D 0A 20 20 20 20 20 20 22 C9 AF 22 20 22 C9 B3 22 " +
1018 "20 22 C9 BA 22 20 22 C9 BD 22 20 22 C9 DC 22 20 22 C9 E1 22 0D 0A 20 20 20 20 " +
1019 "20 20 22 C9 EA 22 20 22 CA A2 22 20 22 CA A9 22 20 22 CA B2 22 20 22 CA B7 22 " +
1020 "20 22 CA BF 22 0D 0A 20 20 20 20 20 20 22 CA E6 22 20 22 CB B9 22 20 22 CB BC " +
1021 "22 20 22 CB BE 22 20 22 CB BF 22 20 22 CB C9 22 0D 0A 20 20 20 20 20 20 22 CB " +
1022 "D5 22 20 22 CB F7 22 20 22 CB FE 22 20 22 CC A9 22 20 22 CC AB 22 20 22 CC B9 " +
1023 "22 0D 0A 20 20 20 20 20 20 22 CC C0 22 20 22 CC C6 22 20 22 CC D5 22 20 22 CC " +
1024 "D1 22 20 22 CC D8 22 20 22 CC E8 22 0D 0A 20 20 20 20 20 20 22 CC FA 22 20 22 " +
1025 "CD A1 22 20 22 CD A2 22 20 22 CD A8 22 20 22 CD BC 22 20 22 CD BD 22 0D 0A 20 " +
1026 "20 20 20 20 20 22 CD C2 22 20 22 CD CB 22 20 22 CD D0 22 20 22 CD DF 22 20 22 " +
1027 "CD F2 22 20 22 CD FA 22 0D 0A 20 20 20 20 20 20 22 CD FE 22 20 22 CE A4 22 20 " +
1028 "22 CE AC 22 20 22 CE B0 22 20 22 CE BA 22 20 22 CE C0 22 0D 0A 20 20 20 20 20 " +
1029 "20 22 CE C2 22 20 22 CE C4 22 20 22 CE CC 22 20 22 CE D6 22 20 22 CE DA 22 20 " +
1030 "22 CE E4 22 0D 0A 20 20 20 20 20 20 22 CE E9 22 20 22 CE EE 22 20 22 CE F7 22 " +
1031 "20 22 CE FD 22 20 22 CF A3 22 20 22 CF AF 22 0D 0A 20 20 20 20 20 20 22 CF C4 " +
1032 "22 20 22 CF E3 22 20 22 CF FE 22 20 22 D0 A4 22 20 22 D0 AA 22 20 22 D0 BB 22 " +
1033 "0D 0A 20 20 20 20 20 20 22 D0 C0 22 20 22 D0 C1 22 20 22 D0 C2 22 20 22 D0 CB " +
1034 "22 20 22 D0 D2 22 20 22 D0 DD 22 0D 0A 20 20 20 20 20 20 22 D0 DE 22 20 22 D0 " +
1035 "F5 22 20 22 D1 A9 22 20 22 D1 C7 22 20 22 E6 AB 22 20 22 D1 EF 22 0D 0A 20 20 " +
1036 "20 20 20 20 22 D1 F4 22 20 22 D2 AE 22 20 22 D2 B6 22 20 22 D2 C0 22 20 22 D2 " +
1037 "C1 22 20 22 D2 D4 22 0D 0A 20 20 20 20 20 20 22 D2 D7 22 20 22 D2 F2 22 20 22 " +
1038 "D3 A2 22 20 22 D3 C0 22 20 22 D3 C8 22 20 22 D3 D0 22 0D 0A 20 20 20 20 20 20 " +
1039 "22 D3 D1 22 20 22 D3 D7 22 20 22 D3 F4 22 20 22 D4 BC 22 20 22 D4 DE 22 20 22 " +
1040 "D4 F3 22 0D 0A 20 20 20 20 20 20 22 D4 F6 22 20 22 D4 F8 22 20 22 D4 FA 22 20 " +
1041 "22 D5 B2 22 20 22 D5 E4 22 20 22 D5 FD 22 0D 0A 20 20 20 20 20 20 22 D6 CE 22 " +
1042 "20 22 D6 DC 22 20 22 D6 EC 22 20 22 D7 BF 22 20 22 D7 C8 22 20 22 D7 CC 22 0D " +
1043 "0A 20 20 20 20 20 20 22 D7 D3 22 20 22 D7 E6 22 20 22 D7 F4 22 20 22 DA F7 22 " +
1044 "20 22 E1 AF 22 20 22 E7 D1 22 0D 0A 20 20 20 20 20 20 22 E7 EA 22 20 22 E8 A7 " +
1045 "22 20 22 E9 A6 22 20 22 EB F8 22 20 22 EC B3 22 20 22 F1 E3 22 0D 0A 20 20 20 " +
1046 "20 20 20 22 F7 EB 22 20 22 F7 EC 22 0D 0A 20 20 20 20 20 20 29 29 0D 0A 0D 0A " +
1047 "0D 0A 28 64 65 66 75 6E 20 7A 68 2D 6E 61 6D 65 2D 73 75 66 66 69 78 3F 20 28 " +
1048 "73 74 72 69 6E 67 29 0D 0A 20 20 28 6D 65 6D 62 65 72 20 73 74 72 69 6E 67 20 " +
1049 "2A 7A 68 2D 6E 61 6D 65 2D 66 69 6E 61 6C 2D 63 68 61 72 73 2A 20 3A 74 65 73 " +
1050 "74 20 23 27 6D 61 74 63 68 69 6E 67 2D 73 75 66 66 69 78 3F 29 29 0D 0A 0D 0A " +
1051 "28 64 65 66 75 6E 20 7A 68 2D 6E 61 6D 65 2D 70 72 65 66 69 78 3F 20 28 73 74 " +
1052 "72 69 6E 67 29 0D 0A 20 20 28 6D 65 6D 62 65 72 20 73 74 72 69 6E 67 20 2A 7A " +
1053 "68 2D 6E 61 6D 65 2D 69 6E 69 74 69 61 6C 2D 63 68 61 72 73 2A 20 3A 74 65 73 " +
1054 "74 20 23 27 6D 61 74 63 68 69 6E 67 2D 70 72 65 66 69 78 3F 29 29 0D 0A 0D 0A " +
1055 "28 64 65 66 75 6E 20 7A 68 2D 6E 61 6D 65 2D 69 6E 66 69 78 3F 20 28 73 74 72 " +
1056 "69 6E 67 29 0D 0A 20 20 28 6D 65 6D 62 65 72 20 73 74 72 69 6E 67 20 2A 7A 68 " +
1057 "2D 6E 61 6D 65 2D 6D 65 64 69 61 6C 2D 63 68 61 72 73 2A 20 3A 74 65 73 74 20 " +
1058 "23 27 6D 61 74 63 68 69 6E 67 2D 73 75 62 73 74 72 69 6E 67 3F 29 29 0D 0A 09 " +
1062 private static byte[] createData2()
1064 return bytesFromString("0A D0 A1 CA B1 20 3B 3B 20 48 6F 75 72 28 73 29 0A D0 C7 C6 DA 20 3B 3B 20 57 " +
1065 "65 65 6B 28 73 29 0A B5 B1 B5 D8 20 CA B1 BC E4 20 3B 3B 20 6C 6F 63 61 6C 20 " +
1071 * Creates a byte array by decoding the hex string passed in.
1073 * @param data hex string data.
1074 * @return binary data in a byte array.
1076 private static byte[] bytesFromString(String data)
1078 ByteArrayOutputStream out = new ByteArrayOutputStream();
1080 while (index < data.length())
1082 char[] number = new char[2];
1083 number[0] = data.charAt(index++);
1084 if (Character.isLetterOrDigit(number[0]))
1086 number[1] = data.charAt(index++);
1087 if (!Character.isLetterOrDigit(number[1]))
1089 throw new IllegalArgumentException("Invalid input: " + data);
1091 out.write(Integer.parseInt(new String(number), 16));
1095 return out.toByteArray();
1098 // End of Bug #8309 Test Case
1102 public void TestBug9267() {
1103 // Test a long input of Lam Alef characters for CharsetRecog_IBM420_ar.
1104 // Bug 9267 was an array out of bounds problem in the unshaping code for these.
1105 byte [] input = new byte [7700];
1107 for (i=0; i<input.length; i++) {
1108 input[i] = (byte)0xb2;
1110 CharsetDetector det = new CharsetDetector();
1115 public void TestBug6954 () throws Exception {
1116 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
1117 // similar Windows and non-Windows SBCS encodings. State was kept in the shared
1118 // Charset Recognizer objects, and could be overwritten.
1119 String sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
1120 String sWindows = "This is another small sample of some English text. Just enough to be sure that it detects correctly."
1121 + "It also includes some \u201CC1\u201D bytes.";
1123 byte[] bISO = sISO.getBytes("ISO-8859-1");
1124 byte[] bWindows = sWindows.getBytes("windows-1252");
1126 // First do a plain vanilla detect of 1252 text
1128 CharsetDetector csd1 = new CharsetDetector();
1129 csd1.setText(bWindows);
1130 CharsetMatch match1 = csd1.detect();
1131 String name1 = match1.getName();
1132 assertEquals("Initial detection of charset", "windows-1252", name1);
1134 // Next, using a completely separate detector, detect some 8859-1 text
1136 CharsetDetector csd2 = new CharsetDetector();
1138 CharsetMatch match2 = csd2.detect();
1139 String name2 = match2.getName();
1140 assertEquals("Initial use of second detector", "ISO-8859-1", name2);
1142 // Recheck the 1252 results from the first detector, which should not have been
1143 // altered by the use of a different detector.
1145 name1 = match1.getName();
1146 assertEquals("Wrong charset name after running a second charset detector", "windows-1252", name1);
1149 public void TestBug6889() {
1150 // Verify that CharsetDetector.detectAll() does not return the same encoding multiple times.
1152 "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
1155 textBytes = text.getBytes("ISO-8859-1");
1157 catch (Exception e) {
1158 fail("Unexpected exception " + e.toString());
1162 CharsetDetector det = new CharsetDetector();
1163 det.setText(textBytes);
1164 CharsetMatch matches[] = det.detectAll();
1166 HashSet<String> detectedEncodings = new HashSet<String>();
1167 for (CharsetMatch m: matches) {
1168 assertTrue("Charset " + m.getName() + " encountered before",
1169 detectedEncodings.add(m.getName()));
1173 public void TestMultithreaded() {
1174 String s = "This is some random plain text to run charset detection on.";
1175 final byte [] bytes;
1177 bytes = s.getBytes("ISO-8859-1");
1179 catch (Exception e) {
1180 fail("Unexpected exception " + e.toString());
1184 class WorkerThread extends Thread {
1185 WorkerThread(int num) {
1190 // System.out.println("Thread " + n + " is running.");
1191 CharsetDetector det = new CharsetDetector();
1193 for (int i=0; i<10000; i++) {
1194 CharsetMatch matches[] = det.detectAll();
1195 for (CharsetMatch m: matches) {
1196 assertNotNull("Failure in thread " + n, m);
1199 // System.out.println("Thread " + n + " is finished.");
1203 Thread threads[] = new Thread[10];
1204 for (int i=0; i<10; i++) {
1205 threads[i] = new WorkerThread(i);
1208 for (Thread thread: threads) {
1211 } catch(Exception e) {
1212 fail("Unexpected exception " + e.toString());