2 *******************************************************************************
3 * Copyright (C) 2002-2012, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 *******************************************************************************
10 package com.ibm.icu.dev.test.charset;
12 import java.nio.ByteBuffer;
13 import java.nio.CharBuffer;
14 import java.nio.charset.Charset;
15 import java.nio.charset.CharsetDecoder;
16 import java.nio.charset.CharsetEncoder;
17 import java.nio.charset.CoderResult;
18 import java.nio.charset.CodingErrorAction;
19 import java.util.Iterator;
21 import com.ibm.icu.charset.CharsetCallback;
22 import com.ibm.icu.charset.CharsetDecoderICU;
23 import com.ibm.icu.charset.CharsetEncoderICU;
24 import com.ibm.icu.charset.CharsetICU;
25 import com.ibm.icu.charset.CharsetProviderICU;
26 import com.ibm.icu.dev.test.ModuleTest;
27 import com.ibm.icu.dev.test.TestDataModule.DataMap;
28 import com.ibm.icu.impl.ICUResourceBundle;
29 import com.ibm.icu.text.UnicodeSet;
32 * This maps to convtest.c which tests the test file for data-driven conversion tests.
35 public class TestConversion extends ModuleTest {
37 * This maps to the C struct of conversion case in convtest.h that stores the
38 * data for a conversion test
41 private class ConversionCase {
42 int caseNr; // testcase index
43 String option = null; // callback options
44 CodingErrorAction cbErrorAction = null; // callback action type
45 CharBuffer toUnicodeResult = null;
46 ByteBuffer fromUnicodeResult = null;
48 // data retrieved from a test case conversion.txt
49 String charset; // charset
50 String unicode; // unicode string
51 ByteBuffer bytes; // byte
52 int[] offsets; // offsets
53 boolean finalFlush; // flush
54 boolean fallbacks; // fallback
55 String outErrorCode; // errorCode
56 String cbopt; // callback
58 // TestGetUnicodeSet variables
63 // CharsetCallback encoder and decoder
64 CharsetCallback.Decoder cbDecoder = null;
65 CharsetCallback.Encoder cbEncoder = null;
67 String caseNrAsString() {
68 return "[" + caseNr + "]";
72 /* In the data-driven conversion test, converters that are not available in
73 * ICU4J are marked with the following leading symbol.
75 private static final char UNSUPPORTED_CHARSET_SYMBOL = '+';
77 // public methods --------------------------------------------------------
79 public static void main(String[] args) throws Exception {
80 new TestConversion().run(args);
83 public TestConversion() {
84 super("com/ibm/icu/dev/data/testdata/", "conversion");
88 * This method maps to the convtest.cpp runIndexedTest() method to run each
91 public void processModules() {
93 int testFromUnicode = 0;
94 int testToUnicode = 0;
95 String testName = t.getName().toString();
97 // Iterate through and get each of the test case to process
98 for (Iterator iter = t.getDataIterator(); iter.hasNext();) {
99 DataMap testcase = (DataMap) iter.next();
101 if (testName.equalsIgnoreCase("toUnicode")) {
102 TestToUnicode(testcase, testToUnicode);
105 } else if (testName.equalsIgnoreCase("fromUnicode")) {
106 TestFromUnicode(testcase, testFromUnicode);
108 } else if (testName.equalsIgnoreCase("getUnicodeSet")) {
109 TestGetUnicodeSet(testcase);
111 warnln("Could not load the test cases for conversion");
115 } catch (Exception e) {
121 // private methods -------------------------------------------------------
124 // fromUnicode test worker functions ---------------------------------------
125 private void TestFromUnicode(DataMap testcase, int caseNr) {
127 ConversionCase cc = new ConversionCase();
130 // retrieve test case data
132 cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString();
133 cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString();
134 cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary();
135 cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector();
136 cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0;
137 cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0;
138 cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString();
139 cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString();
141 } catch (Exception e) {
142 errln("Skipping test:");
143 errln("error parsing conversion/toUnicode test case " + cc.caseNr);
148 * Skip the following data driven converter tests.
149 * These tests were added to the data driven conversion test in ICU
150 * to test direct-from-UTF-8 m:n Unicode:charset conversion.
151 * This feature is not in ICU4J.
154 String [] testsToSkip = {
157 for (int i = 0; i < testsToSkip.length; i++) {
158 if (cc.charset.equals(testsToSkip[i])) {
160 logln("Skipping: " + cc.charset);
161 logln("...............................................");
166 // ----for debugging only
168 logln("TestFromUnicode[" + caseNr + "] " + cc.charset + " ");
169 logln("Unicode: " + cc.unicode);
170 logln("Bytes: " + printbytes(cc.bytes, cc.bytes.limit()));
171 ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes());
172 logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")");
173 logln("...............................................");
175 // process the retrieved test data case
176 if (cc.offsets.length == 0) {
178 } else if (cc.offsets.length != cc.bytes.limit()) {
// Print the byte count that the surrounding condition compares against,
// not the ByteBuffer object itself.
errln("fromUnicode[" + cc.caseNr + "] bytes[" + cc.bytes.limit()
        + "] and offsets[" + cc.offsets.length
        + "] must have the same length");
185 // check the callback replacement value
186 if (cc.cbopt.length() > 0) {
188 switch ((cc.cbopt).charAt(0)) {
190 cc.cbErrorAction = CodingErrorAction.REPLACE;
193 cc.cbErrorAction = CodingErrorAction.IGNORE;
196 cc.cbErrorAction = CodingErrorAction.REPORT;
199 cc.cbErrorAction = CodingErrorAction.REPLACE;
200 cc.cbEncoder = CharsetCallback.FROM_U_CALLBACK_ESCAPE;
203 cc.cbErrorAction = null;
207 // check for any options for the callback value --
208 cc.option = cc.cbErrorAction == null ? cc.cbopt : cc.cbopt
210 if (cc.option == null) {
218 private void FromUnicodeCase(ConversionCase cc) {
220 // create charset encoder for conversion test
221 CharsetProviderICU provider = new CharsetProviderICU();
222 CharsetEncoder encoder = null;
223 Charset charset = null;
225 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
226 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
227 ? (Charset) provider.charsetForName(cc.charset.substring(1),
228 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
229 : (Charset) provider.charsetForName(cc.charset);
230 encoder = (CharsetEncoder) charset.newEncoder();
231 encoder.onMalformedInput(CodingErrorAction.REPLACE);
232 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
233 if (encoder instanceof CharsetEncoderICU) {
234 ((CharsetEncoderICU)encoder).setFallbackUsed(cc.fallbacks);
235 if (((CharsetEncoderICU)encoder).isFallbackUsed() != cc.fallbacks) {
236 errln("Fallback could not be set for " + cc.charset);
240 } catch (Exception e) {
241 if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) {
242 logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time");
244 errln(cc.charset + " was not found");
249 // set the callback for the encoder
250 if (cc.cbErrorAction != null) {
251 if (cc.cbEncoder != null) {
252 ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.malformedForLength(1), cc.cbEncoder, cc.option);
253 ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.unmappableForLength(1), cc.cbEncoder, cc.option);
255 encoder.onUnmappableCharacter(cc.cbErrorAction);
256 encoder.onMalformedInput(cc.cbErrorAction);
259 // if action has an option, put in the option for the case
260 if (cc.option.equals("i")) {
261 encoder.onMalformedInput(CodingErrorAction.REPORT);
264 // if callback action is replace,
265 // and there is a subchar
266 // replace the decoder's default replacement value
267 // if substring, skip test due to current api not supporting
269 if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) {
270 if (cc.cbopt.length() > 1) {
271 if (cc.cbopt.length() > 1 && cc.cbopt.charAt(1) == '=') {
272 logln("Skipping test due to limitation in Java API - substitution string not supported");
275 // // read NUL-separated subchar first, if any
276 // copy the subchar from Latin-1 characters
277 // start after the NUL
278 if (cc.cbopt.charAt(1) == 0x00) {
279 cc.cbopt = cc.cbopt.substring(2);
282 encoder.replaceWith(toByteArray(cc.cbopt));
283 } catch (Exception e) {
284 logln("Skipping test due to limitation in Java API - substitution character sequence size error");
293 // do charset encoding from unicode
295 // testing by steps using charset.encoder(in,out,flush)
298 String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked
299 { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } };
304 for (i = 0; i < steps.length && ok; ++i) {
305 step = Integer.parseInt(steps[i][0]);
307 logln("Testing step:[" + step + "]");
309 resultLength = stepFromUnicode(cc, encoder, step);
310 ok = checkFromUnicode(cc, resultLength);
311 } catch (Exception ex) {
312 errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]");
313 ex.printStackTrace(System.out);
318 // testing by whole buffer using out = charset.encoder(in)
319 while (ok && cc.finalFlush) {
320 logln("Testing java API charset.encoder(in):");
321 cc.fromUnicodeResult = null;
322 ByteBuffer out = null;
325 out = encoder.encode(CharBuffer.wrap(cc.unicode.toCharArray()));
326 out.position(out.limit());
327 if (out.limit() != out.capacity() || cc.finalFlush) {
328 int pos = out.position();
329 byte[] temp = out.array();
330 out = ByteBuffer.allocate(temp.length * 4);
333 CoderResult cr = encoder.flush(out);
334 if (cr.isOverflow()) {
335 logln("Overflow error with flushing encoder");
338 cc.fromUnicodeResult = out;
340 ok = checkFromUnicode(cc, out.limit());
344 } catch (Exception e) {
345 //check the error code to see if it matches cc.errorCode
346 logln("Encoder returned an error code");
347 logln("ErrorCode expected is: " + cc.outErrorCode);
348 logln("Error Result is: " + e.toString());
354 private int stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step) {
356 errln("Negative step size, test internal error.");
360 int sourceLen = cc.unicode.length();
361 int targetLen = cc.bytes.capacity() + 20; // for BOM, and to let failures produce excess output
362 CharBuffer source = CharBuffer.wrap(cc.unicode.toCharArray());
363 ByteBuffer target = ByteBuffer.allocate(targetLen);
364 cc.fromUnicodeResult = null;
367 int currentSourceLimit;
368 int currentTargetLimit;
370 currentSourceLimit = Math.min(step, sourceLen);
371 currentTargetLimit = Math.min(step, targetLen);
373 currentSourceLimit = sourceLen;
374 currentTargetLimit = targetLen;
377 CoderResult cr = null;
380 source.limit(currentSourceLimit);
381 target.limit(currentTargetLimit);
383 cr = encoder.encode(source, target, currentSourceLimit == sourceLen);
385 if (cr.isUnderflow()) {
386 if (currentSourceLimit == sourceLen) {
387 if (target.position() == cc.bytes.limit()) {
388 // target contains the correct number of bytes
391 // Do a final flush for cleanup, then break out
392 // Encode loop, exits with cr==underflow in normal operation.
393 //target.limit(targetLen);
394 target.limit(targetLen);
395 cr = encoder.flush(target);
396 if (cr.isUnderflow()) {
398 } else if (cr.isOverflow()) {
399 errln(cc.caseNrAsString() + " Flush is producing excessive output");
401 errln(cc.caseNrAsString() + " Flush operation failed. CoderResult = \""
402 + cr.toString() + "\"");
406 currentSourceLimit = Math.min(currentSourceLimit + step, sourceLen);
407 } else if (cr.isOverflow()) {
408 if (currentTargetLimit == targetLen) {
409 errln(cc.caseNrAsString() + " encode() is producing excessive output");
412 currentTargetLimit = Math.min(currentTargetLimit + step, targetLen);
414 // check the error code to see if it matches cc.errorCode
415 logln("Encoder returned an error code");
416 logln("ErrorCode expected is: " + cc.outErrorCode);
417 logln("Error Result is: " + cr.toString());
423 cc.fromUnicodeResult = target;
424 return target.position();
427 private boolean checkFromUnicode(ConversionCase cc, int resultLength) {
428 return checkResultsFromUnicode(cc, cc.bytes, cc.fromUnicodeResult);
431 // toUnicode test worker functions ----------------------------------------- ***
433 private void TestToUnicode(DataMap testcase, int caseNr) {
434 // create Conversion case to store the test case data
435 ConversionCase cc = new ConversionCase();
438 // retrieve test case data
440 cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString();
441 cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary();
442 cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString();
443 cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector();
444 cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0;
445 cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0;
446 cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString();
447 cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString();
449 } catch (Exception e) {
450 errln("Skipping test: error parsing conversion/toUnicode test case " + cc.caseNr);
454 // ----for debugging only
456 logln("TestToUnicode[" + caseNr + "] " + cc.charset + " ");
457 logln("Unicode: " + hex(cc.unicode));
458 logln("Bytes: " + printbytes(cc.bytes, cc.bytes.limit()));
459 ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes());
460 logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")");
461 logln("...............................................");
463 // process the retrieved test data case
464 if (cc.offsets.length == 0) {
466 } else if (cc.offsets.length != cc.unicode.length()) {
467 errln("Skipping test: toUnicode[" + cc.caseNr + "] unicode["
468 + cc.unicode.length() + "] and offsets["
469 + cc.offsets.length + "] must have the same length");
472 // check for the callback replacement value for unmappable
473 // characters or malformed errors
474 if (cc.cbopt.length() > 0) {
475 switch ((cc.cbopt).charAt(0)) {
476 case '?': // CALLBACK_SUBSTITUTE
477 cc.cbErrorAction = CodingErrorAction.REPLACE;
479 case '0': // CALLBACK_SKIP
480 cc.cbErrorAction = CodingErrorAction.IGNORE;
482 case '.': // CALLBACK_STOP
483 cc.cbErrorAction = CodingErrorAction.REPORT;
485 case '&': // CALLBACK_ESCAPE
486 cc.cbErrorAction = CodingErrorAction.REPORT;
487 cc.cbDecoder = CharsetCallback.TO_U_CALLBACK_ESCAPE;
490 cc.cbErrorAction = null;
494 // check for any options for the callback value
495 cc.option = cc.cbErrorAction == null ? null : cc.cbopt.substring(1);
496 if (cc.option == null) {
504 private void ToUnicodeCase(ConversionCase cc) {
506 // create converter for charset and decoder for each test case
507 CharsetProviderICU provider = new CharsetProviderICU();
508 CharsetDecoder decoder = null;
509 Charset charset = null;
512 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
513 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
514 ? (Charset) provider.charsetForName(cc.charset.substring(1),
515 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
516 : (Charset) provider.charsetForName(cc.charset);
517 decoder = (CharsetDecoder) charset.newDecoder();
518 decoder.onMalformedInput(CodingErrorAction.REPLACE);
519 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
521 } catch (Exception e) {
522 // TODO implement loading of test data.
523 if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) {
524 logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time");
526 errln(cc.charset + " was not found");
531 // set the callback for the decoder
532 if (cc.cbErrorAction != null) {
533 if (cc.cbDecoder != null) {
534 ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.malformedForLength(1), cc.cbDecoder, cc.option);
535 ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.unmappableForLength(1), cc.cbDecoder, cc.option);
537 decoder.onMalformedInput(cc.cbErrorAction);
538 decoder.onUnmappableCharacter(cc.cbErrorAction);
541 // set the options (if any: SKIP_STOP_ON_ILLEGAL) for callback
542 if (cc.option.equals("i")) {
543 decoder.onMalformedInput(CodingErrorAction.REPORT);
546 // if callback action is replace, and there is a subchar
547 // replace the decoder's default replacement value
548 // if substring, skip test due to current api not supporting
549 // substring replacement
550 if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) {
551 if (cc.cbopt.length() > 1) {
552 if (cc.cbopt.charAt(1) == '=') {
553 logln("Skipping test due to limitation in Java API - substitution string not supported");
556 // // read NUL-separated subchar first, if any
557 // copy the subchar from Latin-1 characters
558 // start after the NUL
559 if (cc.cbopt.charAt(1) == 0x00) {
560 cc.cbopt = cc.cbopt.substring(2);
563 decoder.replaceWith(cc.cbopt);
564 } catch (Exception e) {
565 logln("Skipping test due to limitation in Java API - substitution character sequence size error");
573 // Check the step to unicode
577 String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked
578 { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } };
579 /* TODO: currently not supported test steps, getNext API is not supported for now
581 { "-2", "toU(bulk)+getNext" },
582 { "-3", "getNext+toU(bulk)" },
583 { "-4", "toU(1)+getNext" },
584 { "-5", "getNext+toU(1)" },
585 { "-12", "toU(5)+getNext" },
586 { "-13", "getNext+toU(5)" }};*/
590 // testing by steps using the CoderResult cr = charset.decoder(in,out,flush) api
591 for (int i = 0; i < steps.length && ok; ++i) {
592 step = Integer.parseInt(steps[i][0]);
594 if (step < 0 && !cc.finalFlush) {
597 logln("Testing step:[" + step + "]");
600 resultLength = stepToUnicode(cc, decoder, step);
601 ok = checkToUnicode(cc, resultLength);
602 } catch (Exception ex) {
603 errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]");
604 ex.printStackTrace(System.out);
609 //testing the java's out = charset.decoder(in) api
610 while (ok && cc.finalFlush) {
611 logln("Testing java charset.decoder(in):");
612 cc.toUnicodeResult = null;
613 CharBuffer out = null;
617 out = decoder.decode(cc.bytes);
618 out.position(out.limit());
619 if (out.limit() < cc.unicode.length()) {
620 int pos = out.position();
621 char[] temp = out.array();
622 out = CharBuffer.allocate(cc.bytes.limit());
625 CoderResult cr = decoder.flush(out);
626 if (cr.isOverflow()) {
627 logln("Overflow error with flushing decodering");
631 cc.toUnicodeResult = out;
633 ok = checkToUnicode(cc, out.limit());
637 } catch (Exception e) {
638 //check the error code to see if it matches cc.errorCode
639 logln("Decoder returned an error code");
640 logln("ErrorCode expected is: " + cc.outErrorCode);
641 logln("Error Result is: " + e.toString());
652 private int stepToUnicode(ConversionCase cc, CharsetDecoder decoder,
658 boolean flush = false;
661 sourceLen = cc.bytes.limit();
663 target = CharBuffer.allocate(cc.unicode.length() + 4);
665 cc.toUnicodeResult = null;
676 source.limit((iStep <= sourceLen) ? iStep : sourceLen);
677 target.limit((oStep <= target.capacity()) ? oStep : target
679 flush = (cc.finalFlush && source.limit() == sourceLen);
683 source.limit(sourceLen);
684 target.limit(target.capacity());
685 flush = cc.finalFlush;
688 CoderResult cr = null;
689 if (source.hasRemaining()) {
691 cr = decoder.decode(source, target, flush);
692 // check pointers and errors
693 if (cr.isOverflow()) {
694 // the partial target is filled, set a new limit,
695 oStep = (target.position() + step);
696 target.limit((oStep < target.capacity()) ? oStep
697 : target.capacity());
698 if (target.limit() > target.capacity()) {
699 //target has reached its limit, an error occurred or test case has an error code
701 logln("UnExpected error: Target Buffer is larger than capacity");
705 } else if (cr.isError()) {
706 //check the error code to see if it matches cc.errorCode
707 logln("Decoder returned an error code");
708 logln("ErrorCode expected is: " + cc.outErrorCode);
709 logln("Error Result is: " + cr.toString());
714 if (source.limit() == sourceLen) {
716 cr = decoder.decode(source, target, true);
718 //due to limitation of the API we need to check for target limit for expected
719 if (target.position() != cc.unicode.length()) {
720 if (target.limit() != cc.unicode.length()) {
721 target.limit(cc.unicode.length());
723 cr = decoder.flush(target);
725 errln("Flush operation failed");
737 //--------------------------------------------------------------------------
740 * step==-1: call only ucnv_getNextUChar()
741 * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar()
742 * if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input,
743 * else give it at most (-step-2)/2 bytes
748 if ((step & 1) != 0 /* odd: -1, -3, -5, ... */) {
750 target.limit(target.position() < target.capacity() ? target
751 .position() + 1 : target.capacity());
753 // decode behavior is return to output target 1 character
754 CoderResult cr = null;
756 //similar to getNextUChar() , input is the whole string, while outputs only 1 character
757 source.limit(sourceLen);
758 while (target.position() != target.limit()
759 && source.hasRemaining()) {
760 cr = decoder.decode(source, target,
761 source.limit() == sourceLen);
763 if (cr.isOverflow()) {
765 if (target.limit() >= target.capacity()) {
766 // target has reached its limit, an error occurred
767 logln("UnExpected error: Target Buffer is larger than capacity");
770 //1 character has been consumed
771 target.limit(target.position() + 1);
774 } else if (cr.isError()) {
775 logln("Decoder returned an error code");
776 logln("ErrorCode expected is: " + cc.outErrorCode);
777 logln("Error Result is: " + cr.toString());
779 cc.toUnicodeResult = target;
780 return target.position();
784 // one character has been consumed
785 if (target.limit() == target.position()) {
786 target.limit(target.position() + 1);
792 if (source.position() == sourceLen) {
794 // due to limitation of the API we need to check
795 // for target limit for expected
796 cr = decoder.decode(source, target, true);
797 if (target.position() != cc.unicode.length()) {
799 target.limit(cc.unicode.length());
800 cr = decoder.flush(target);
802 errln("Flush operation failed");
807 // alternate between -n-1 and -n but leave -1 alone
811 } else {/* step is even */
812 // allow only one UChar output
814 target.limit(target.position() < target.capacity() ? target
815 .position() + 1 : target.capacity());
817 source.limit(sourceLen);
819 source.limit(source.position() + (-step - 2) / 2);
820 if (source.limit() > sourceLen) {
821 source.limit(sourceLen);
824 CoderResult cr = decoder.decode(source, target, source
825 .limit() == sourceLen);
826 // check pointers and errors
827 if (cr.isOverflow()) {
828 // one character has been consumed
829 if (target.limit() >= target.capacity()) {
830 // target has reached its limit, an error occurred
831 logln("Unexpected error: Target Buffer is larger than capacity");
834 } else if (cr.isError()) {
835 logln("Decoder returned an error code");
836 logln("ErrorCode expected is: " + cc.outErrorCode);
837 logln("Error Result is: " + cr.toString());
846 //--------------------------------------------------------------------------
848 cc.toUnicodeResult = target;
849 return target.position();
854 private boolean checkToUnicode(ConversionCase cc, int resultLength) {
855 return checkResultsToUnicode(cc, cc.unicode, cc.toUnicodeResult);
859 private void TestGetUnicodeSet(DataMap testcase) {
861 * charset - will be opened, and ucnv_getUnicodeSet() called on it //
862 * map - set of code points and strings that must be in the returned set //
863 * mapnot - set of code points and strings that must *not* be in the //
864 * returned set // which - numeric UConverterUnicodeSet value Headers {
865 * "charset", "map", "mapnot", "which" }
869 // retrieve test case data
870 ConversionCase cc = new ConversionCase();
871 CharsetProviderICU provider = new CharsetProviderICU();
875 UnicodeSet mapset = new UnicodeSet();
876 UnicodeSet mapnotset = new UnicodeSet();
877 UnicodeSet unicodeset = new UnicodeSet();
878 String ellipsis = "0x2e";
879 cc.charset = ((ICUResourceBundle) testcase.getObject("charset"))
881 cc.map = ((ICUResourceBundle) testcase.getObject("map")).getString();
882 cc.mapnot = ((ICUResourceBundle) testcase.getObject("mapnot"))
886 cc.which = ((ICUResourceBundle) testcase.getObject("which")).getInt(); // only checking for ROUNDTRIP_SET
888 // ----for debugging only
890 logln("TestGetUnicodeSet[" + cc.charset + "] ");
891 logln("...............................................");
894 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata
895 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*')
896 ? (CharsetICU) provider.charsetForName(cc.charset.substring(1),
897 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader())
898 : (CharsetICU) provider.charsetForName(cc.charset);
900 //checking for converter that are not supported at this point
// Compare the charset name by value. The previous == checks tested
// reference identity, so they only matched when name() happened to
// return the very same String object as the literal.
if (java.util.Arrays.asList(
        "BOCU-1", "SCSU", "lmbcs1", "lmbcs2", "lmbcs3", "lmbcs4", "lmbcs5",
        "lmbcs6", "lmbcs8", "lmbcs11", "lmbcs16", "lmbcs17", "lmbcs18",
        "lmbcs19").contains(charset.name())) {
907 logln("Converter not supported at this point :" +charset.displayName());
912 logln("Fallback set not supported at this point for converter : "+charset.displayName());
923 mapset.applyPattern(cc.map,false);
924 mapnotset.applyPattern(cc.mapnot,false);
926 charset.getUnicodeSet(unicodeset, cc.which);
927 UnicodeSet diffset = new UnicodeSet();
929 //are there items that must be in unicodeset but are not?
930 (diffset = mapset).removeAll(unicodeset);
931 if(!diffset.isEmpty()){
932 StringBuffer s = new StringBuffer(diffset.toPattern(true));
934 s.replace(0, 0x7fffffff, ellipsis);
936 errln("error in missing items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString());
939 //are the items that must not be in unicodeset but are?
940 (diffset=mapnotset).retainAll(unicodeset);
941 if(!diffset.isEmpty()){
942 StringBuffer s = new StringBuffer(diffset.toPattern(true));
944 s.replace(0, 0x7fffffff, ellipsis);
946 errln("contains unexpected items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString());
948 } catch (Exception e) {
949 errln("getUnicodeSet returned an error code");
950 errln("ErrorCode expected is: " + cc.outErrorCode);
951 errln("Error Result is: " + e.toString());
957 * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the
958 * start of the stream for example U+FEFF (the Unicode BOM/signature
959 * character) that can be ignored.
961 * Detects Unicode signature byte sequences at the start of the byte stream
962 * and returns number of bytes of the BOM of the indicated Unicode charset.
963 * 0 is returned when no Unicode signature is recognized.
967 private String detectUnicodeSignature(ByteBuffer source) {
968 int signatureLength = 0; // number of bytes of the signature
969 final int SIG_MAX_LEN = 5;
970 String sigUniCharset = null; // states what unicode charset is the BOM
974 * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we
975 * don't misdetect something
977 byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5,
980 while (i < source.limit() && i < SIG_MAX_LEN) {
981 start[i] = source.get(i);
985 if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) {
987 sigUniCharset = "UTF-16BE";
988 source.position(signatureLength);
989 return sigUniCharset;
990 } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) {
991 if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) {
993 sigUniCharset = "UTF-32LE";
994 source.position(signatureLength);
995 return sigUniCharset;
998 sigUniCharset = "UTF-16LE";
999 source.position(signatureLength);
1000 return sigUniCharset;
1002 } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB
1003 && start[2] == (byte) 0xBF) {
1004 signatureLength = 3;
1005 sigUniCharset = "UTF-8";
1006 source.position(signatureLength);
1007 return sigUniCharset;
1008 } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00
1009 && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) {
1010 signatureLength = 4;
1011 sigUniCharset = "UTF-32BE";
1012 source.position(signatureLength);
1013 return sigUniCharset;
1014 } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE
1015 && start[2] == (byte) 0xFF) {
1016 signatureLength = 3;
1017 sigUniCharset = "SCSU";
1018 source.position(signatureLength);
1019 return sigUniCharset;
1020 } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE
1021 && start[2] == (byte) 0x28) {
1022 signatureLength = 3;
1023 sigUniCharset = "BOCU-1";
1024 source.position(signatureLength);
1025 return sigUniCharset;
1026 } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F
1027 && start[2] == (byte) 0x76) {
1029 if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) {
1030 signatureLength = 5;
1031 sigUniCharset = "UTF-7";
1032 source.position(signatureLength);
1033 return sigUniCharset;
1034 } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39
1035 || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) {
1036 signatureLength = 4;
1037 sigUniCharset = "UTF-7";
1038 source.position(signatureLength);
1039 return sigUniCharset;
1041 } else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73
1042 && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) {
1043 signatureLength = 4;
1044 sigUniCharset = "UTF-EBCDIC";
1045 source.position(signatureLength);
1046 return sigUniCharset;
1049 /* no known Unicode signature byte sequence recognized */
1053 String printbytes(ByteBuffer buf, int pos) {
1054 int cur = buf.position();
1055 String res = " (" + pos + ")==[";
1056 for (int i = 0; i < pos; i++) {
1057 res += "(" + i + ")" + hex(buf.get(i) & 0xff).substring(2) + " ";
1063 String printchars(CharBuffer buf, int pos) {
1064 int cur = buf.position();
1065 String res = " (" + pos + ")==[";
1066 for (int i = 0; i < pos; i++) {
1067 res += "(" + i + ")" + hex(buf.get(i)) + " ";
1073 private boolean checkResultsFromUnicode(ConversionCase cc, ByteBuffer expected,
1074 ByteBuffer output) {
1078 output.limit(output.position());
1081 // remove any BOM signature before checking
1082 if (!cc.charset.contains("UnicodeLittle") && !cc.charset.contains("UnicodeBig")) {
1083 detectUnicodeSignature(output); // sets the position to after the BOM
1084 output = output.slice(); // removes anything before the current position
1087 if (output.limit() != expected.limit()) {
1088 errln("Test failed: output length does not match expected for charset: " + cc.charset
1089 + " [" + cc.caseNr + "]");
1092 while (output.hasRemaining()) {
1093 if (output.get() != expected.get()) {
1094 errln("Test failed: output does not match expected for charset: " + cc.charset
1095 + " [" + cc.caseNr + "]");
1103 logln("[" + cc.caseNr + "]:" + cc.charset);
1104 logln("Input: " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length()));
1105 logln("Output: " + printbytes(output, output.limit()));
1106 logln("Expected: " + printbytes(expected, expected.limit()));
1110 errln("[" + cc.caseNr + "]:" + cc.charset);
1111 errln("Input: " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length()));
1112 errln("Output: " + printbytes(output, output.limit()));
1113 errln("Expected: " + printbytes(expected, expected.limit()));
1119 private boolean checkResultsToUnicode(ConversionCase cc, String expected, CharBuffer output) {
1122 output.limit(output.position());
1125 // test to see if the conversion matches actual results
1126 if (output.limit() != expected.length()) {
1127 errln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]");
1130 for (int i = 0; i < expected.length(); i++) {
1131 if (output.get(i) != expected.charAt(i)) {
1132 errln("Test failed: output does not match expected for charset: " + cc.charset
1133 + " [" + cc.caseNr + "]");
1141 logln("[" + cc.caseNr + "]:" + cc.charset);
1142 logln("Input: " + printbytes(cc.bytes, cc.bytes.limit()));
1143 logln("Output: " + printchars(output, output.limit()));
1144 logln("Expected: " + printchars(CharBuffer.wrap(expected), expected.length()));
1147 errln("[" + cc.caseNr + "]:" + cc.charset);
1148 errln("Input: " + printbytes(cc.bytes, cc.bytes.limit()));
1149 errln("Output: " + printchars(output, output.limit()));
1150 errln("Expected: " + printchars(CharBuffer.wrap(expected), expected.length()));
1156 private byte[] toByteArray(String str) {
1157 byte[] ret = new byte[str.length()];
1158 for (int i = 0; i < ret.length; i++) {
1159 char ch = str.charAt(i);
1163 throw new IllegalArgumentException(" byte value out of range: " + ch);