2 *******************************************************************************
\r
3 * Copyright (C) 2006-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 *******************************************************************************
\r
10 package com.ibm.icu.charset;
\r
12 import java.nio.ByteBuffer;
\r
13 import java.nio.CharBuffer;
\r
14 import java.nio.IntBuffer;
\r
15 import java.nio.charset.CharsetDecoder;
\r
16 import java.nio.charset.CoderResult;
\r
17 import java.nio.charset.CodingErrorAction;
\r
19 import com.ibm.icu.impl.Assert;
\r
22 * An abstract class that provides framework methods of decoding operations for concrete
\r
24 * In the future this class will contain API that will implement converter semantics of ICU4C.
\r
27 public abstract class CharsetDecoderICU extends CharsetDecoder{
\r
29 int toUnicodeStatus;
\r
30 byte[] toUBytesArray = new byte[128];
\r
31 int toUBytesBegin = 0;
\r
33 char[] charErrorBufferArray = new char[128];
\r
34 int charErrorBufferLength;
\r
35 int charErrorBufferBegin;
\r
36 char[] invalidCharBuffer = new char[128];
\r
37 int invalidCharLength;
\r
40 * Maximum number of indexed bytes
\r
42 * @deprecated This API is ICU internal only.
\r
44 protected static final int EXT_MAX_BYTES = 0x1f;
\r
46 /* store previous UChars/chars to continue partial matches */
\r
47 byte[] preToUArray = new byte[EXT_MAX_BYTES];
\r
49 int preToULength; /* negative: replay */
\r
50 int preToUFirstLength; /* length of first character */
\r
53 Object toUContext = null;
\r
54 private CharsetCallback.Decoder onUnmappableCharacter = CharsetCallback.TO_U_CALLBACK_STOP;
\r
55 private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;
\r
56 CharsetCallback.Decoder toCharErrorBehaviour = new CharsetCallback.Decoder() {
\r
57 public CoderResult call(CharsetDecoderICU decoder, Object context, ByteBuffer source,
\r
58 CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr) {
\r
59 if (cr.isUnmappable()) {
\r
60 return onUnmappableCharacter.call(decoder, context, source, target, offsets, buffer,
\r
62 } else /* if (cr.isMalformed()) */ {
\r
63 return onMalformedInput.call(decoder, context, source, target, offsets, buffer,
\r
66 // return CharsetCallback.TO_U_CALLBACK_STOP.call(decoder, context, source, target, offsets, buffer, length, cr);
\r
70 // exist to keep implOnMalformedInput and implOnUnmappableInput from being too recursive
\r
71 private boolean malformedInputCalled = false;
\r
72 private boolean unmappableCharacterCalled = false;
\r
75 * Construct a CharsetDecoderICU based on the information provided from a CharsetICU object.
\r
77 * @param cs The CharsetICU object containing information about the charset to decode.
\r
79 CharsetDecoderICU(CharsetICU cs) {
\r
80 super(cs, (1/cs.maxCharsPerByte), cs.maxCharsPerByte);
\r
84 * Is this Decoder allowed to use fallbacks? A fallback mapping is a mapping
\r
85 * that will convert a byte sequence to a Unicode codepoint sequence, but
\r
86 * the encoded Unicode codepoint sequence will round trip convert to a different
\r
87 * byte sequence. In ICU, this can be called a reverse fallback.
\r
90 final boolean isFallbackUsed() {
\r
95 * Fallback is currently always used by icu4j decoders.
\r
97 static final boolean isToUUseFallback() {
\r
98 return isToUUseFallback(true);
\r
102 * Fallback is currently always used by icu4j decoders.
\r
104 static final boolean isToUUseFallback(boolean iUseFallback) {
\r
109 * Sets the action to be taken if an illegal sequence is encountered
\r
111 * @param newAction action to be taken
\r
112 * @exception IllegalArgumentException
\r
115 protected final void implOnMalformedInput(CodingErrorAction newAction) {
\r
116 // don't run infinitely
\r
117 if (malformedInputCalled)
\r
120 // if we get a replace, do not let the nio replace
\r
121 if (newAction == CodingErrorAction.REPLACE) {
\r
122 malformedInputCalled = true;
\r
123 super.onMalformedInput(CodingErrorAction.IGNORE);
\r
124 malformedInputCalled = false;
\r
127 onMalformedInput = getCallback(newAction);
\r
131 * Sets the action to be taken if an unmappable character is encountered
\r
133 * @param newAction action to be taken
\r
134 * @exception IllegalArgumentException
\r
137 protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
\r
138 // don't run infinitely
\r
139 if (unmappableCharacterCalled)
\r
142 // if we get a replace, do not let the nio replace
\r
143 if (newAction == CodingErrorAction.REPLACE) {
\r
144 unmappableCharacterCalled = true;
\r
145 super.onUnmappableCharacter(CodingErrorAction.IGNORE);
\r
146 unmappableCharacterCalled = false;
\r
149 onUnmappableCharacter = getCallback(newAction);
\r
153 * Sets the callback decoder method and context to be used if an illegal sequence is encountered.
\r
154 * You would normally call this twice to set both the malformed and unmappable errors. In this case,
\r
155 * newContext should remain the same since using a different newContext each time will negate the last
\r
157 * @param err CoderResult
\r
158 * @param newCallback CharsetCallback.Decoder
\r
159 * @param newContext Object
\r
162 public final void setToUCallback(CoderResult err, CharsetCallback.Decoder newCallback, Object newContext) {
\r
163 if (err.isMalformed()) {
\r
164 onMalformedInput = newCallback;
\r
165 } else if (err.isUnmappable()) {
\r
166 onUnmappableCharacter = newCallback;
\r
168 /* Error: Only malformed and unmappable are handled. */
\r
171 if (toUContext == null || !toUContext.equals(newContext)) {
\r
172 toUContext = newContext;
\r
176 private static CharsetCallback.Decoder getCallback(CodingErrorAction action){
\r
177 if(action==CodingErrorAction.REPLACE){
\r
178 return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
\r
179 }else if(action==CodingErrorAction.IGNORE){
\r
180 return CharsetCallback.TO_U_CALLBACK_SKIP;
\r
181 }else /* if(action==CodingErrorAction.REPORT) */ {
\r
182 return CharsetCallback.TO_U_CALLBACK_STOP;
\r
185 private final ByteBuffer EMPTY = ByteBuffer.allocate(0);
\r
187 * Flushes any characters saved in the converter's internal buffer and
\r
188 * resets the converter.
\r
189 * @param out the output character buffer
\r
190 * @return result of the flushing action, which completes the decoding of all input.
\r
191 * Returns CoderResult.UNDERFLOW if the action succeeds.
\r
194 protected final CoderResult implFlush(CharBuffer out) {
\r
195 return decode(EMPTY, out, null, true);
\r
199 * Resets the to Unicode mode of converter
\r
202 protected void implReset() {
\r
203 toUnicodeStatus = 0 ;
\r
205 charErrorBufferLength = 0;
\r
206 charErrorBufferBegin = 0;
\r
208 /* store previous UChars/chars to continue partial matches */
\r
210 preToULength = 0; /* negative: replay */
\r
211 preToUFirstLength = 0;
\r
217 * Decodes one or more bytes. The default behaviour of the converter
\r
218 * is stop and report if an error in input stream is encountered.
\r
219 * To set different behaviour use @see CharsetDecoder.onMalformedInput()
\r
220 * This method allows a buffer by buffer conversion of a data stream.
\r
221 * The state of the conversion is saved between calls to convert.
\r
222 * Among other things, this means multibyte input sequences can be
\r
223 * split between calls. If a call to convert results in an Error, the
\r
224 * conversion may be continued by calling convert again with suitably
\r
225 * modified parameters. All conversions should be finished with a call to
\r
226 * the flush method.
\r
227 * @param in buffer to decode
\r
228 * @param out buffer to populate with decoded result
\r
229 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
\r
230 * action succeeds or more input is needed for completing the decoding action.
\r
233 protected CoderResult decodeLoop(ByteBuffer in,CharBuffer out){
\r
234 if(in.remaining() < toUCountPending()){
\r
235 return CoderResult.UNDERFLOW;
\r
237 // if (!in.hasRemaining()) {
\r
239 // return CoderResult.UNDERFLOW;
\r
242 in.position(in.position() + toUCountPending());
\r
244 /* do the conversion */
\r
245 CoderResult ret = decode(in, out, null, false);
\r
247 // ok was there input held in the previous invocation of decodeLoop
\r
248 // that resulted in output in this invocation?
\r
249 in.position(in.position() - toUCountPending());
\r
255 * Implements the ICU semantic for decode operation
\r
256 * @param in The input byte buffer
\r
257 * @param out The output character buffer
\r
258 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
\r
259 * action succeeds or more input is needed for completing the decoding action.
\r
261 abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets, boolean flush);
\r
264 * Implements the ICU semantic for decode operation
\r
265 * @param source The input byte buffer
\r
266 * @param target The output character buffer
\r
268 * @param flush true if, and only if, the invoker can provide no
\r
269 * additional input bytes beyond those in the given buffer.
\r
270 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
\r
271 * action succeeds or more input is needed for completing the decoding action.
\r
273 final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
275 /* check parameters */
\r
276 if (target == null || source == null) {
\r
277 throw new IllegalArgumentException();
\r
281 * Make sure that the buffer sizes do not exceed the number range for
\r
282 * int32_t because some functions use the size (in units or bytes)
\r
283 * rather than comparing pointers, and because offsets are int32_t values.
\r
285 * size_t is guaranteed to be unsigned and large enough for the job.
\r
287 * Return with an error instead of adjusting the limits because we would
\r
288 * not be able to maintain the semantics that either the source must be
\r
289 * consumed or the target filled (unless an error occurs).
\r
290 * An adjustment would be sourceLimit=t+0x7fffffff; for example.
\r
294 ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) ||
\r
295 ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t)
\r
297 *err=U_ILLEGAL_ARGUMENT_ERROR;
\r
302 /* flush the target overflow buffer */
\r
303 if (charErrorBufferLength > 0) {
\r
306 if (!target.hasRemaining()) {
\r
307 /* the overflow buffer contains too much, keep the rest */
\r
311 charErrorBufferArray[j++] = charErrorBufferArray[i++];
\r
312 } while (i < charErrorBufferLength);
\r
314 charErrorBufferLength = (byte) j;
\r
315 return CoderResult.OVERFLOW;
\r
318 /* copy the overflow contents to the target */
\r
319 target.put(charErrorBufferArray[i++]);
\r
320 if (offsets != null) {
\r
321 offsets.put(-1); /* no source index available for old output */
\r
323 } while (i < charErrorBufferLength);
\r
325 /* the overflow buffer is completely copied to the target */
\r
326 charErrorBufferLength = 0;
\r
329 if (!flush && !source.hasRemaining() && preToULength >= 0) {
\r
330 /* the overflow buffer is emptied and there is no new input: we are done */
\r
331 return CoderResult.UNDERFLOW;
\r
335 * Do not simply return with a buffer overflow error if
\r
336 * !flush && t==targetLimit
\r
337 * because it is possible that the source will not generate any output.
\r
338 * For example, the skip callback may be called;
\r
339 * it does not output anything.
\r
342 return toUnicodeWithCallback(source, target, offsets, flush);
\r
345 /* Currently, we are not using offsets in ICU4J. */
\r
346 /* private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) {
\r
350 if(sourceIndex>=0) {
\r
352 * adjust each offset by adding the previous sourceIndex
\r
353 * minus the length of the input sequence that caused an
\r
356 /* delta=sourceIndex-errorInputLength;
\r
359 * set each offset to -1 because this conversion function
\r
360 * does not handle offsets
\r
364 limit=offsets.position()+length;
\r
366 /* most common case, nothing to do */
\r
367 /* } else if(delta>0) {
\r
368 /* add the delta to each offset (but not if the offset is <0) */
\r
369 /* while(offsets.position()<limit) {
\r
370 offset=offsets.get(offsets.position());
\r
372 offsets.put(offset+delta);
\r
374 //FIXME: ++offsets;
\r
376 } else /* delta<0 */ /* {
\r
378 * set each offset to -1 because this conversion function
\r
379 * does not handle offsets
\r
380 * or the error input sequence started in a previous buffer
\r
382 /* while(offsets.position()<limit) {
\r
387 final CoderResult toUnicodeWithCallback(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
\r
390 int errorInputLength;
\r
391 boolean converterSawEndOfInput, calledCallback;
\r
392 //int t=target.position();
\r
393 int s=source.position();
\r
394 /* variables for m:n conversion */
\r
395 ByteBuffer replayArray = ByteBuffer.allocate(EXT_MAX_BYTES);
\r
396 int replayArrayIndex = 0;
\r
398 ByteBuffer realSource=null;
\r
399 boolean realFlush=false;
\r
400 int realSourceIndex=0;
\r
403 CoderResult cr = CoderResult.UNDERFLOW;
\r
405 /* get the converter implementation function */
\r
408 if(preToULength>=0) {
\r
412 * Previous m:n conversion stored source units from a partial match
\r
413 * and failed to consume all of them.
\r
414 * We need to "replay" them from a temporary buffer and convert them first.
\r
418 realSourceIndex=sourceIndex;
\r
419 //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
\r
420 replayArray.put(preToUArray,0, -preToULength);
\r
421 source=replayArray;
\r
422 source.position(0);
\r
423 source.limit(replayArrayIndex-preToULength);
\r
430 * loop for conversion and error handling
\r
436 * handle end of input
\r
437 * handle errors/call callback
\r
444 cr = decodeLoop(source, target, offsets, flush);
\r
447 * set a flag for whether the converter
\r
448 * successfully processed the end of the input
\r
450 * need not check cnv->preToULength==0 because a replay (<0) will cause
\r
451 * s<sourceLimit before converterSawEndOfInput is checked
\r
453 converterSawEndOfInput= (cr.isUnderflow() && flush && source.remaining()==0 && toULength == 0);
\r
455 /* no callback called yet for this iteration */
\r
456 calledCallback=false;
\r
458 /* no sourceIndex adjustment for conversion, only for callback output */
\r
459 errorInputLength=0;
\r
462 * loop for offsets and error handling
\r
464 * iterates at most 3 times:
\r
465 * 1. to clean up after the conversion function
\r
466 * 2. after the callback
\r
467 * 3. after the callback again if there was truncated input
\r
470 /* update offsets if we write any */
\r
471 /* Currently offsets are not being used in ICU4J */
\r
472 /* if(offsets!=null) {
\r
474 int length=(target.position()-t);
\r
476 updateOffsets(offsets, length, sourceIndex, errorInputLength);
\r
480 * if a converter handles offsets and updates the offsets
\r
481 * pointer at the end, then pArgs->offset should not change
\r
483 * however, some converters do not handle offsets at all
\r
484 * (sourceIndex<0) or may not update the offsets pointer
\r
486 //TODO: pArgs->offsets=offsets+=length;
\r
489 if(sourceIndex>=0) {
\r
490 sourceIndex+=(source.position()-s);
\r
495 if(preToULength<0) {
\r
497 * switch the source to new replay units (cannot occur while replaying)
\r
498 * after offset handling and before end-of-input and callback handling
\r
500 if(realSource==null)
\r
504 realSourceIndex=sourceIndex;
\r
506 //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
\r
507 replayArray.put(preToUArray,0, -preToULength);
\r
509 replayArray.position(0);
\r
511 source=replayArray;
\r
512 source.limit(replayArrayIndex-preToULength);
\r
514 if((sourceIndex+=preToULength)<0) {
\r
520 /* see implementation note before _fromUnicodeWithCallback() */
\r
521 //agljport:todo U_ASSERT(realSource==NULL);
\r
522 Assert.assrt(realSource==null);
\r
526 /* update pointers */
\r
527 s=source.position();
\r
528 //t=target.position();
\r
530 if(cr.isUnderflow()) {
\r
531 if(s<source.limit())
\r
534 * continue with the conversion loop while there is still input left
\r
535 * (continue converting by breaking out of only the inner loop)
\r
538 } else if(realSource!=null) {
\r
539 /* switch back from replaying to the real source and continue */
\r
540 source = realSource;
\r
542 sourceIndex=realSourceIndex;
\r
545 } else if(flush && toULength>0) {
\r
547 * the entire input stream is consumed
\r
548 * and there is a partial, truncated input sequence left
\r
551 /* inject an error and continue with callback handling */
\r
552 cr = CoderResult.malformedForLength(toULength);
\r
553 calledCallback=false; /* new error condition */
\r
555 /* input consumed */
\r
558 * return to the conversion loop once more if the flush
\r
559 * flag is set and the conversion function has not
\r
560 * successfully processed the end of the input yet
\r
562 * (continue converting by breaking out of only the inner loop)
\r
564 if(!converterSawEndOfInput) {
\r
568 /* reset the converter without calling the callback function */
\r
572 /* done successfully */
\r
577 /* U_FAILURE(*err) */
\r
580 if( calledCallback || cr.isOverflow() ||
\r
581 (cr.isMalformed() && cr.isUnmappable())
\r
584 * the callback did not or cannot resolve the error:
\r
585 * set output pointers and return
\r
587 * the check for buffer overflow is redundant but it is
\r
588 * a high-runner case and hopefully documents the intent
\r
591 * if we were replaying, then the replay buffer must be
\r
592 * copied back into the UConverter
\r
593 * and the real arguments must be restored
\r
595 if(realSource!=null) {
\r
597 Assert.assrt(preToULength==0);
\r
598 length = source.limit() - source.position();
\r
600 //UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length);
\r
601 source.get(preToUArray, preToUBegin, length);
\r
602 preToULength=(byte)-length;
\r
612 /* copy toUBytes[] to invalidCharBuffer[] */
\r
613 errorInputLength=invalidCharLength=toULength;
\r
614 if(errorInputLength>0) {
\r
615 copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength);
\r
618 /* set the converter state to deal with the next character */
\r
621 /* call the callback function */
\r
622 cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr);
\r
624 * loop back to the offset handling
\r
626 * this flag will indicate after offset handling
\r
627 * that a callback was called;
\r
628 * if the callback did not resolve the error, then we return
\r
630 calledCallback=true;
\r
636 * Returns the number of chars held in the converter's internal state
\r
637 * because more input is needed for completing the conversion. This function is
\r
638 * useful for mapping semantics of ICU's converter interface to those of iconv,
\r
639 * and this information is not needed for normal conversion.
\r
640 * @return The number of chars in the state. -1 if an error is encountered.
\r
642 /*public*/ int toUCountPending() {
\r
643 if(preToULength > 0){
\r
644 return preToULength ;
\r
645 } else if(preToULength < 0){
\r
646 return -preToULength;
\r
647 } else if(toULength > 0){
\r
655 private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
\r
656 for(int i=srcOffset; i<length; i++){
\r
657 dst[dstOffset++]=(char)(src[srcOffset++] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
661 * ONLY used by ToU callback functions.
\r
662 * This function will write out the specified characters to the target
\r
663 * character buffer.
\r
664 * @return A CoderResult object that contains the error result when an error occurs.
\r
666 static final CoderResult toUWriteUChars( CharsetDecoderICU cnv,
\r
667 char[] ucharsArray, int ucharsBegin, int length,
\r
668 CharBuffer target, IntBuffer offsets, int sourceIndex) {
\r
670 CoderResult cr = CoderResult.UNDERFLOW;
\r
673 if(offsets==null) {
\r
674 while(length>0 && target.hasRemaining()) {
\r
675 target.put(ucharsArray[ucharsBegin++]);
\r
680 /* output with offsets */
\r
681 while(length>0 && target.hasRemaining()) {
\r
682 target.put(ucharsArray[ucharsBegin++]);
\r
683 offsets.put(sourceIndex);
\r
687 /* write overflow */
\r
689 cnv.charErrorBufferLength= 0;
\r
690 cr = CoderResult.OVERFLOW;
\r
692 cnv.charErrorBufferArray[cnv.charErrorBufferLength++]=ucharsArray[ucharsBegin++];
\r
693 } while(--length>0);
\r
698 * This function will write out the Unicode substitution character to the
\r
699 * target character buffer.
\r
700 * Sub classes to override this method if required
\r
705 * @return A CoderResult object that contains the error result when an error occurs.
\r
707 /* Note: Currently, this method is not being used because the callback method calls toUWriteUChars with
\r
708 * the substitution characters. Will leave in here for the time being. To be removed later. (4.0)
\r
710 /*CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
\r
711 ByteBuffer source, CharBuffer target,
\r
712 IntBuffer offsets){
\r
713 String sub = decoder.replacement();
\r
714 CharsetICU cs = (CharsetICU) decoder.charset();
\r
715 if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) {
\r
716 char[] subArr = new char[] { 0x1a };
\r
717 return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0, sub
\r
718 .length(), target, offsets, source.position());
\r
720 return CharsetDecoderICU.toUWriteUChars(decoder, sub.toCharArray(),
\r
721 0, sub.length(), target, offsets, source.position());
\r