2 *******************************************************************************
\r
3 * Copyright (C) 2006-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 *******************************************************************************
\r
10 package com.ibm.icu.charset;
\r
12 import java.nio.ByteBuffer;
\r
13 import java.nio.CharBuffer;
\r
14 import java.nio.IntBuffer;
\r
15 import java.nio.charset.CharsetDecoder;
\r
16 import java.nio.charset.CoderResult;
\r
17 import java.nio.charset.CodingErrorAction;
\r
19 import com.ibm.icu.impl.Assert;
\r
22 * An abstract class that provides framework methods of decoding operations for concrete
\r
24 * In the future this class will contain API that will implement converter semantics of ICU4C.
\r
27 public abstract class CharsetDecoderICU extends CharsetDecoder{
\r
29 int toUnicodeStatus;
\r
30 byte[] toUBytesArray = new byte[128];
\r
31 int toUBytesBegin = 0;
\r
33 char[] charErrorBufferArray = new char[128];
\r
34 int charErrorBufferLength;
\r
35 int charErrorBufferBegin;
\r
36 char[] invalidCharBuffer = new char[128];
\r
37 int invalidCharLength;
\r
39 /* maximum number of indexed bytes */
\r
40 private static final int EXT_MAX_BYTES = 0x1f;
\r
42 /* store previous UChars/chars to continue partial matches */
\r
43 byte[] preToUArray = new byte[EXT_MAX_BYTES];
\r
45 int preToULength; /* negative: replay */
\r
46 int preToUFirstLength; /* length of first character */
\r
49 Object toUContext = null;
\r
50 private CharsetCallback.Decoder onUnmappableCharacter = CharsetCallback.TO_U_CALLBACK_STOP;
\r
51 private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;
\r
52 CharsetCallback.Decoder toCharErrorBehaviour = new CharsetCallback.Decoder() {
\r
53 public CoderResult call(CharsetDecoderICU decoder, Object context, ByteBuffer source,
\r
54 CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr) {
\r
55 if (cr.isUnmappable()) {
\r
56 return onUnmappableCharacter.call(decoder, context, source, target, offsets, buffer,
\r
58 } else /* if (cr.isMalformed()) */ {
\r
59 return onMalformedInput.call(decoder, context, source, target, offsets, buffer,
\r
62 // return CharsetCallback.TO_U_CALLBACK_STOP.call(decoder, context, source, target, offsets, buffer, length, cr);
\r
66 // exist to keep implOnMalformedInput and implOnUnmappableInput from being too recursive
\r
67 private boolean malformedInputCalled = false;
\r
68 private boolean unmappableCharacterCalled = false;
\r
71 * Construct a CharsetDecoderICU based on the information provided from a CharsetICU object.
\r
73 * @param cs The CharsetICU object containing information about the charset to decode.
\r
CharsetDecoderICU(CharsetICU cs) {
    // Average chars-per-byte handed to CharsetDecoder is the reciprocal of cs.maxCharsPerByte;
    // the maximum chars-per-byte is cs.maxCharsPerByte itself.
    super(cs, 1.0f / (float) cs.maxCharsPerByte, cs.maxCharsPerByte);
}
\r
80 * Is this Decoder allowed to use fallbacks? A fallback mapping is a mapping
\r
81 * that will convert a byte sequence to a Unicode codepoint sequence, but
\r
82 * the encoded Unicode codepoint sequence will round trip convert to a different
\r
83 * byte sequence. In ICU, this can be called a reverse fallback.
\r
86 final boolean isFallbackUsed() {
\r
91 * Fallback is currently always used by icu4j decoders.
\r
93 static final boolean isToUUseFallback() {
\r
94 return isToUUseFallback(true);
\r
98 * Fallback is currently always used by icu4j decoders.
\r
100 static final boolean isToUUseFallback(boolean iUseFallback) {
\r
105 * Sets the action to be taken if an illegal sequence is encountered
\r
107 * @param newAction action to be taken
\r
108 * @exception IllegalArgumentException
\r
111 protected final void implOnMalformedInput(CodingErrorAction newAction) {
\r
112 // don't run infinitely
\r
113 if (malformedInputCalled)
\r
116 // if we get a replace, do not let the nio replace
\r
117 if (newAction == CodingErrorAction.REPLACE) {
\r
118 malformedInputCalled = true;
\r
119 super.onMalformedInput(CodingErrorAction.IGNORE);
\r
120 malformedInputCalled = false;
\r
123 onMalformedInput = getCallback(newAction);
\r
127 * Sets the action to be taken if an unmappable character is encountered
\r
129 * @param newAction action to be taken
\r
130 * @exception IllegalArgumentException
\r
133 protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
\r
134 // don't run infinitely
\r
135 if (unmappableCharacterCalled)
\r
138 // if we get a replace, do not let the nio replace
\r
139 if (newAction == CodingErrorAction.REPLACE) {
\r
140 unmappableCharacterCalled = true;
\r
141 super.onUnmappableCharacter(CodingErrorAction.IGNORE);
\r
142 unmappableCharacterCalled = false;
\r
145 onUnmappableCharacter = getCallback(newAction);
\r
149 * Sets the callback decoder method and context to be used if an illegal sequence is encountered.
\r
150 * You would normally call this twice to set both the malformed and unmappable errors. In this case,
\r
151 * newContext should remain the same since using a different newContext each time will negate the last
\r
153 * @param err CoderResult
\r
154 * @param newCallback CharsetCallback.Decoder
\r
155 * @param newContext Object
\r
158 public final void setToUCallback(CoderResult err, CharsetCallback.Decoder newCallback, Object newContext) {
\r
159 if (err.isMalformed()) {
\r
160 onMalformedInput = newCallback;
\r
161 } else if (err.isUnmappable()) {
\r
162 onUnmappableCharacter = newCallback;
\r
164 /* Error: Only malformed and unmappable are handled. */
\r
167 if (toUContext == null || !toUContext.equals(newContext)) {
\r
168 toUContext = newContext;
\r
172 private static CharsetCallback.Decoder getCallback(CodingErrorAction action){
\r
173 if(action==CodingErrorAction.REPLACE){
\r
174 return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
\r
175 }else if(action==CodingErrorAction.IGNORE){
\r
176 return CharsetCallback.TO_U_CALLBACK_SKIP;
\r
177 }else /* if(action==CodingErrorAction.REPORT) */ {
\r
178 return CharsetCallback.TO_U_CALLBACK_STOP;
\r
181 private final ByteBuffer EMPTY = ByteBuffer.allocate(0);
\r
183 * Flushes any characters saved in the converter's internal buffer and
\r
184 * resets the converter.
\r
185 * @param out buffer to receive any remaining decoded characters
\r
186 * @return result of flushing action and completes the decoding all input.
\r
187 * Returns CoderResult.UNDERFLOW if the action succeeds.
\r
190 protected final CoderResult implFlush(CharBuffer out) {
\r
191 return decode(EMPTY, out, null, true);
\r
195 * Resets the to Unicode mode of converter
\r
198 protected void implReset() {
\r
199 toUnicodeStatus = 0 ;
\r
201 charErrorBufferLength = 0;
\r
202 charErrorBufferBegin = 0;
\r
204 /* store previous UChars/chars to continue partial matches */
\r
206 preToULength = 0; /* negative: replay */
\r
207 preToUFirstLength = 0;
\r
213 * Decodes one or more bytes. The default behaviour of the converter
\r
214 * is stop and report if an error in input stream is encountered.
\r
215 * To set different behaviour use @see CharsetDecoder.onMalformedInput()
\r
216 * This method allows a buffer by buffer conversion of a data stream.
\r
217 * The state of the conversion is saved between calls to convert.
\r
218 * Among other things, this means multibyte input sequences can be
\r
219 * split between calls. If a call to convert results in an Error, the
\r
220 * conversion may be continued by calling convert again with suitably
\r
221 * modified parameters. All conversions should be finished with a call to
\r
222 * the flush method.
\r
223 * @param in buffer to decode
\r
224 * @param out buffer to populate with decoded result
\r
225 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
\r
226 * action succeeds or more input is needed for completing the decoding action.
\r
229 protected CoderResult decodeLoop(ByteBuffer in,CharBuffer out){
\r
230 if(in.remaining() < toUCountPending()){
\r
231 return CoderResult.UNDERFLOW;
\r
233 // if (!in.hasRemaining()) {
\r
235 // return CoderResult.UNDERFLOW;
\r
238 in.position(in.position() + toUCountPending());
\r
240 /* do the conversion */
\r
241 CoderResult ret = decode(in, out, null, false);
\r
243 // ok was there input held in the previous invocation of decodeLoop
\r
244 // that resulted in output in this invocation?
\r
245 in.position(in.position() - toUCountPending());
\r
251 * Implements the ICU semantic for decode operation
\r
252 * @param in The input byte buffer
\r
253 * @param out The output character buffer
\r
254 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
\r
255 * action succeeds or more input is needed for completing the decoding action.
\r
257 abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets, boolean flush);
\r
260 * Implements the ICU semantic for decode operation
\r
261 * @param source The input byte buffer
\r
262 * @param target The output character buffer
\r
264 * @param flush true if, and only if, the invoker can provide no
\r
265 * additional input bytes beyond those in the given buffer.
\r
266 * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
\r
267 * action succeeds or more input is needed for completing the decoding action.
\r
269 final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
\r
271 /* check parameters */
\r
272 if (target == null || source == null) {
\r
273 throw new IllegalArgumentException();
\r
277 * Make sure that the buffer sizes do not exceed the number range for
\r
278 * int32_t because some functions use the size (in units or bytes)
\r
279 * rather than comparing pointers, and because offsets are int32_t values.
\r
281 * size_t is guaranteed to be unsigned and large enough for the job.
\r
283 * Return with an error instead of adjusting the limits because we would
\r
284 * not be able to maintain the semantics that either the source must be
\r
285 * consumed or the target filled (unless an error occurs).
\r
286 * An adjustment would be sourceLimit=t+0x7fffffff; for example.
\r
290 ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) ||
\r
291 ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t)
\r
293 *err=U_ILLEGAL_ARGUMENT_ERROR;
\r
298 /* flush the target overflow buffer */
\r
299 if (charErrorBufferLength > 0) {
\r
302 if (!target.hasRemaining()) {
\r
303 /* the overflow buffer contains too much, keep the rest */
\r
307 charErrorBufferArray[j++] = charErrorBufferArray[i++];
\r
308 } while (i < charErrorBufferLength);
\r
310 charErrorBufferLength = (byte) j;
\r
311 return CoderResult.OVERFLOW;
\r
314 /* copy the overflow contents to the target */
\r
315 target.put(charErrorBufferArray[i++]);
\r
316 if (offsets != null) {
\r
317 offsets.put(-1); /* no source index available for old output */
\r
319 } while (i < charErrorBufferLength);
\r
321 /* the overflow buffer is completely copied to the target */
\r
322 charErrorBufferLength = 0;
\r
325 if (!flush && !source.hasRemaining() && preToULength >= 0) {
\r
326 /* the overflow buffer is emptied and there is no new input: we are done */
\r
327 return CoderResult.UNDERFLOW;
\r
331 * Do not simply return with a buffer overflow error if
\r
332 * !flush && t==targetLimit
\r
333 * because it is possible that the source will not generate any output.
\r
334 * For example, the skip callback may be called;
\r
335 * it does not output anything.
\r
338 return toUnicodeWithCallback(source, target, offsets, flush);
\r
341 /* Currently, we are not using offsets in ICU4J. */
\r
342 /* private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) {
\r
346 if(sourceIndex>=0) {
\r
348 * adjust each offset by adding the previous sourceIndex
\r
349 * minus the length of the input sequence that caused an
\r
352 /* delta=sourceIndex-errorInputLength;
\r
355 * set each offset to -1 because this conversion function
\r
356 * does not handle offsets
\r
360 limit=offsets.position()+length;
\r
362 /* most common case, nothing to do */
\r
363 /* } else if(delta>0) {
\r
364 /* add the delta to each offset (but not if the offset is <0) */
\r
365 /* while(offsets.position()<limit) {
\r
366 offset=offsets.get(offsets.position());
\r
368 offsets.put(offset+delta);
\r
370 //FIXME: ++offsets;
\r
372 } else /* delta<0 */ /* {
\r
374 * set each offset to -1 because this conversion function
\r
375 * does not handle offsets
\r
376 * or the error input sequence started in a previous buffer
\r
378 /* while(offsets.position()<limit) {
\r
383 final CoderResult toUnicodeWithCallback(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){
\r
386 int errorInputLength;
\r
387 boolean converterSawEndOfInput, calledCallback;
\r
388 //int t=target.position();
\r
389 int s=source.position();
\r
390 /* variables for m:n conversion */
\r
391 ByteBuffer replayArray = ByteBuffer.allocate(EXT_MAX_BYTES);
\r
392 int replayArrayIndex = 0;
\r
394 ByteBuffer realSource=null;
\r
395 boolean realFlush=false;
\r
396 int realSourceIndex=0;
\r
399 CoderResult cr = CoderResult.UNDERFLOW;
\r
401 /* get the converter implementation function */
\r
404 if(preToULength>=0) {
\r
408 * Previous m:n conversion stored source units from a partial match
\r
409 * and failed to consume all of them.
\r
410 * We need to "replay" them from a temporary buffer and convert them first.
\r
414 realSourceIndex=sourceIndex;
\r
415 //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
\r
416 replayArray.put(preToUArray,0, -preToULength);
\r
417 source=replayArray;
\r
418 source.position(0);
\r
419 source.limit(replayArrayIndex-preToULength);
\r
426 * loop for conversion and error handling
\r
432 * handle end of input
\r
433 * handle errors/call callback
\r
440 cr = decodeLoop(source, target, offsets, flush);
\r
443 * set a flag for whether the converter
\r
444 * successfully processed the end of the input
\r
446 * need not check cnv->preToULength==0 because a replay (<0) will cause
\r
447 * s<sourceLimit before converterSawEndOfInput is checked
\r
449 converterSawEndOfInput= (cr.isUnderflow() && flush && source.remaining()==0 && toULength == 0);
\r
451 /* no callback called yet for this iteration */
\r
452 calledCallback=false;
\r
454 /* no sourceIndex adjustment for conversion, only for callback output */
\r
455 errorInputLength=0;
\r
458 * loop for offsets and error handling
\r
460 * iterates at most 3 times:
\r
461 * 1. to clean up after the conversion function
\r
462 * 2. after the callback
\r
463 * 3. after the callback again if there was truncated input
\r
466 /* update offsets if we write any */
\r
467 /* Currently offsets are not being used in ICU4J */
\r
468 /* if(offsets!=null) {
\r
470 int length=(target.position()-t);
\r
472 updateOffsets(offsets, length, sourceIndex, errorInputLength);
\r
476 * if a converter handles offsets and updates the offsets
\r
477 * pointer at the end, then pArgs->offset should not change
\r
479 * however, some converters do not handle offsets at all
\r
480 * (sourceIndex<0) or may not update the offsets pointer
\r
482 //TODO: pArgs->offsets=offsets+=length;
\r
485 if(sourceIndex>=0) {
\r
486 sourceIndex+=(source.position()-s);
\r
491 if(preToULength<0) {
\r
493 * switch the source to new replay units (cannot occur while replaying)
\r
494 * after offset handling and before end-of-input and callback handling
\r
496 if(realSource==null)
\r
500 realSourceIndex=sourceIndex;
\r
502 //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
\r
503 replayArray.put(preToUArray,0, -preToULength);
\r
505 replayArray.position(0);
\r
507 source=replayArray;
\r
508 source.limit(replayArrayIndex-preToULength);
\r
510 if((sourceIndex+=preToULength)<0) {
\r
516 /* see implementation note before _fromUnicodeWithCallback() */
\r
517 //agljport:todo U_ASSERT(realSource==NULL);
\r
518 Assert.assrt(realSource==null);
\r
522 /* update pointers */
\r
523 s=source.position();
\r
524 //t=target.position();
\r
526 if(cr.isUnderflow()) {
\r
527 if(s<source.limit())
\r
530 * continue with the conversion loop while there is still input left
\r
531 * (continue converting by breaking out of only the inner loop)
\r
534 } else if(realSource!=null) {
\r
535 /* switch back from replaying to the real source and continue */
\r
536 source = realSource;
\r
538 sourceIndex=realSourceIndex;
\r
541 } else if(flush && toULength>0) {
\r
543 * the entire input stream is consumed
\r
544 * and there is a partial, truncated input sequence left
\r
547 /* inject an error and continue with callback handling */
\r
548 cr = CoderResult.malformedForLength(toULength);
\r
549 calledCallback=false; /* new error condition */
\r
551 /* input consumed */
\r
554 * return to the conversion loop once more if the flush
\r
555 * flag is set and the conversion function has not
\r
556 * successfully processed the end of the input yet
\r
558 * (continue converting by breaking out of only the inner loop)
\r
560 if(!converterSawEndOfInput) {
\r
564 /* reset the converter without calling the callback function */
\r
568 /* done successfully */
\r
573 /* U_FAILURE(*err) */
\r
576 if( calledCallback || cr.isOverflow() ||
\r
577 (cr.isMalformed() && cr.isUnmappable())
\r
580 * the callback did not or cannot resolve the error:
\r
581 * set output pointers and return
\r
583 * the check for buffer overflow is redundant but it is
\r
584 * a high-runner case and hopefully documents the intent
\r
587 * if we were replaying, then the replay buffer must be
\r
588 * copied back into the UConverter
\r
589 * and the real arguments must be restored
\r
591 if(realSource!=null) {
\r
593 Assert.assrt(preToULength==0);
\r
594 length=(int)(source.limit()-source.position());
\r
596 //UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length);
\r
597 source.get(preToUArray, preToUBegin, length);
\r
598 preToULength=(byte)-length;
\r
608 /* copy toUBytes[] to invalidCharBuffer[] */
\r
609 errorInputLength=invalidCharLength=toULength;
\r
610 if(errorInputLength>0) {
\r
611 copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength);
\r
614 /* set the converter state to deal with the next character */
\r
617 /* call the callback function */
\r
618 cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr);
\r
620 * loop back to the offset handling
\r
622 * this flag will indicate after offset handling
\r
623 * that a callback was called;
\r
624 * if the callback did not resolve the error, then we return
\r
626 calledCallback=true;
\r
632 * Returns the number of chars held in the converter's internal state
\r
633 * because more input is needed for completing the conversion. This function is
\r
634 * useful for mapping semantics of ICU's converter interface to those of iconv,
\r
635 * and this information is not needed for normal conversion.
\r
636 * @return The number of chars in the state. -1 if an error is encountered.
\r
638 /*public*/ int toUCountPending() {
\r
639 if(preToULength > 0){
\r
640 return preToULength ;
\r
641 } else if(preToULength < 0){
\r
642 return -preToULength;
\r
643 } else if(toULength > 0){
\r
651 private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
\r
652 for(int i=srcOffset; i<length; i++){
\r
653 dst[dstOffset++]=(char)(src[srcOffset++] & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
657 * ONLY used by ToU callback functions.
\r
658 * This function will write out the specified characters to the target
\r
659 * character buffer.
\r
660 * @return A CoderResult object that contains the error result when an error occurs.
\r
662 static final CoderResult toUWriteUChars( CharsetDecoderICU cnv,
\r
663 char[] ucharsArray, int ucharsBegin, int length,
\r
664 CharBuffer target, IntBuffer offsets, int sourceIndex) {
\r
666 CoderResult cr = CoderResult.UNDERFLOW;
\r
669 if(offsets==null) {
\r
670 while(length>0 && target.hasRemaining()) {
\r
671 target.put(ucharsArray[ucharsBegin++]);
\r
676 /* output with offsets */
\r
677 while(length>0 && target.hasRemaining()) {
\r
678 target.put(ucharsArray[ucharsBegin++]);
\r
679 offsets.put(sourceIndex);
\r
683 /* write overflow */
\r
685 cnv.charErrorBufferLength= 0;
\r
686 cr = CoderResult.OVERFLOW;
\r
688 cnv.charErrorBufferArray[cnv.charErrorBufferLength++]=ucharsArray[ucharsBegin++];
\r
689 } while(--length>0);
\r
694 * This function will write out the Unicode substitution character to the
\r
695 * target character buffer.
\r
696 * Sub classes to override this method if required
\r
701 * @return A CoderResult object that contains the error result when an error occurs.
\r
703 /* Note: Currently, this method is not being used because the callback method calls toUWriteUChars with
\r
704 * the substitution characters. Will leave in here for the time being. To be removed later. (4.0)
\r
706 /*CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
\r
707 ByteBuffer source, CharBuffer target,
\r
708 IntBuffer offsets){
\r
709 String sub = decoder.replacement();
\r
710 CharsetICU cs = (CharsetICU) decoder.charset();
\r
711 if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) {
\r
712 char[] subArr = new char[] { 0x1a };
\r
713 return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0, sub
\r
714 .length(), target, offsets, source.position());
\r
716 return CharsetDecoderICU.toUWriteUChars(decoder, sub.toCharArray(),
\r
717 0, sub.length(), target, offsets, source.position());
\r