2 *******************************************************************************
3 * Copyright (C) 2007-2011, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.charset;
9 import java.nio.ByteBuffer;
10 import java.nio.CharBuffer;
11 import java.nio.IntBuffer;
12 import java.nio.charset.CharsetDecoder;
13 import java.nio.charset.CharsetEncoder;
14 import java.nio.charset.CoderResult;
16 import com.ibm.icu.text.UnicodeSet;
22 class CharsetUTF7 extends CharsetICU {
23 private final static String IMAP_NAME="IMAP-mailbox-name";
24 private boolean useIMAP;
25 protected byte[] fromUSubstitution=new byte[]{0x3F};
27 public CharsetUTF7(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
28 super(icuCanonicalName, javaCanonicalName, aliases);
29 maxBytesPerChar=4; /* max 3 bytes per code unit from UTF-7 (base64) */
35 if (icuCanonicalName.equals(IMAP_NAME)) {
40 //private static boolean inSetD(char c) {
42 // (char)(c - 97) < 26 || (char)(c - 65) < 26 || /* letters */
43 // (char)(c - 48) < 10 || /* digits */
44 // (char)(c - 39) < 3 || /* ' () */
45 // (char)(c - 44) < 4 || /* ,-./ */
46 // (c==58) || (c==63) /* :? */
50 //private static boolean inSetO(char c) {
52 // (char)(c - 33) < 6 || /* !"#$%& */
53 // (char)(c - 59) < 4 || /* ;<=> */
54 // (char)(c - 93) < 4 || /* ]^_` */
55 // (char)(c - 123) < 3 || /* {|} */
56 // (c==42) || (c==64) || (c==91) /* *@[ */
/* Tests c against the three control characters CR (13), LF (10) and TAB (9). */
60 private static boolean isCRLFTAB(char c) {
62 (c==13) || (c==10) || (c==9)
66 //private static boolean isCRLFSPTAB(char c) {
68 // (c==32) || (c==13) || (c==10) || (c==9)
72 private static final byte PLUS=43; /* '+' (0x2b): starts Unicode mode in plain UTF-7 */
73 private static final byte MINUS=45; /* '-' (0x2d): terminates a base64 sequence */
74 private static final byte BACKSLASH=92; /* backslash (0x5c): excluded by the non-IMAP legality test */
75 //private static final byte TILDE=126;
76 private static final byte AMPERSAND=0x26; /* '&': starts Unicode mode for IMAP */
77 private static final byte COMMA=0x2c; /* ',': returned by TO_BASE64_IMAP when n is not below 63 */
78 private static final byte SLASH=0x2f; /* '/': mapped to -1 (not base64) by FROM_BASE64_IMAP */
80 // legal byte values: all US-ASCII graphic characters 0x20..0x7e
// Two range tests are visible below:
//   - 0x20..0x7e inclusive;
//   - 0x20..0x7d (the unsigned (c - 32) < 94 trick) excluding backslash, plus CR, LF and TAB.
// NOTE(review): the lines that choose between them are not visible here; presumably
// useIMAP selects the first test and plain UTF-7 the second -- confirm against the full file.
81 private static boolean isLegal(char c, boolean useIMAP) {
84 (0x20 <= c) && (c <= 0x7e)
88 ((char)(c - 32) < 94 && (c != BACKSLASH)) || isCRLFTAB(c)
93 // directly encode all of printable ASCII 0x20..0x7e except '&' 0x26
// Implemented as the IMAP legality test (isLegal with useIMAP=true) combined with a check
// that c is not AMPERSAND, which is the IMAP shift character and so cannot be written as is.
94 private static boolean inSetDIMAP(char c) {
96 (isLegal(c, true) && c != AMPERSAND)
/*
 * Maps a base64 digit value to its output byte for the IMAP variant:
 * values below 63 are looked up in TO_BASE_64, any other value yields COMMA (',').
 */
100 private static byte TO_BASE64_IMAP(int n) {
101 return (n < 63 ? TO_BASE_64[n] : COMMA);
/*
 * Maps an input character to its base64 digit value for the IMAP variant:
 * COMMA (',') gives 63, SLASH ('/') gives -1 (not a base64 digit here), and every
 * other character is looked up in FROM_BASE_64.
 * NOTE(review): c is used directly as an index into FROM_BASE_64, so callers
 * presumably guarantee it is within the table's range -- confirm.
 */
104 private static byte FROM_BASE64_IMAP(char c) {
105 return (c==COMMA ? 63 : c==SLASH ? -1 : FROM_BASE_64[c]);
108 /* encode directly sets D and O and CR LF SP TAB */
/* Indexed by US-ASCII code (the encoder checks c<=127 first); 1 = write the character as is. */
109 private static final byte ENCODE_DIRECTLY_MAXIMUM[] =
111 /*0 1 2 3 4 5 6 7 8 9 a b c d e f*/
112 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0x00-0x0f: only TAB, LF, CR */
113 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1f: none */
115 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, /* 0x20-0x2f: all except '+' (0x2b) */
116 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3f: all */
118 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4f: all */
119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, /* 0x50-0x5f: all except backslash (0x5c) */
121 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6f: all */
122 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 /* 0x70-0x7f: all except '~' (0x7e) and DEL (0x7f) */
125 /* encode directly set D and CR LF SP TAB but not set O */
/* Indexed by US-ASCII code (the encoder checks c<=127 first); 1 = write the character as is. */
126 private static final byte ENCODE_DIRECTLY_RESTRICTED[] =
128 /*0 1 2 3 4 5 6 7 8 9 a b c d e f*/
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0x00-0x0f: only TAB, LF, CR */
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10-0x1f: none */
132 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, /* 0x20-0x2f: SP ' ( ) , - . / */
133 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 0x30-0x3f: digits, ':' and '?' */
135 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4f: A-O (not '@') */
136 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x50-0x5f: P-Z only */
138 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6f: a-o (not '`') */
139 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 /* 0x70-0x7f: p-z only */
/*
 * Encoding table: base64 digit value -> US-ASCII byte.
 * The rows visible here cover values 0..61; the entries for the remaining values
 * are not visible in this excerpt.
 */
142 private static final byte TO_BASE_64[] =
145 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, /* 0..12: 'A'-'M' */
146 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, /* 13..25: 'N'-'Z' */
148 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, /* 26..38: 'a'-'m' */
149 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, /* 39..51: 'n'-'z' */
151 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, /* 52..61: '0'-'9' */
/*
 * Decoding table indexed by US-ASCII byte value (0x00..0x7f):
 *   0..63  the base64 digit value of the character,
 *   -1     a legal character that is not a base64 digit,
 *   -2     the minus sign,
 *   -3     an illegal character.
 */
156 private static final byte FROM_BASE_64[] =
158 /* C0 controls, -1 for legal ones (CR LF TAB), -3 for illegal ones */
159 -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
160 -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
161 /* general punctuation with + and / and a special value (-2) for - */
162 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
164 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, /* 0x30-0x3f: digits map to 52..61 */
166 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* 0x40-0x4f: 'A'-'O' map to 0..14 */
167 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1, /* 0x50-0x5f: 'P'-'Z' map to 15..25; backslash is illegal */
169 -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, /* 0x60-0x6f: 'a'-'o' map to 26..40 */
170 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3 /* 0x70-0x7f: 'p'-'z' map to 41..51; 0x7e and 0x7f are illegal */
173 class CharsetDecoderUTF7 extends CharsetDecoderICU {
174 public CharsetDecoderUTF7(CharsetICU cs) {
/*
 * Resets the decoder state word: the top four bits of toUnicodeStatus are kept,
 * bit 24 is set (decodeLoop reads that bit back as inDirectMode) and all other
 * bits, including the base64 counter in bits 16..23, are cleared.
 */
179 protected void implReset() {
181 toUnicodeStatus=(toUnicodeStatus & 0xf0000000) | 0x1000000;
184 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
185 CoderResult cr=CoderResult.UNDERFLOW;
191 int sourceIndex, nextSourceIndex;
198 int sourceArrayIndex=source.position();
200 //get the state machine state
202 int status=toUnicodeStatus;
203 inDirectMode=(byte)((status >> 24) & 1);
204 base64Counter=(byte)(status >> 16);
208 /* sourceIndex=-1 if the current character began in the previous buffer */
209 sourceIndex=byteIndex==0 ? 0 : -1;
212 directMode: while (true) {
213 if (inDirectMode==1) {
215 * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
216 * with their US-ASCII byte values.
217 * Backslash and Tilde and most control characters are not allowed in UTF-7.
218 * A plus sign starts Unicode (or "escape") Mode.
219 * An ampersand starts Unicode Mode for IMAP.
221 * In Direct Mode, only the sourceIndex is used.
224 length=source.remaining();
225 //targetCapacity=target.remaining();
226 //Commented out because length of source may be larger than target when it comes to bytes
227 /*if (useIMAP && length > targetCapacity) {
228 length=targetCapacity;
231 b=(char)(source.get());
233 if (!isLegal(b, useIMAP)) {
234 toUBytesArray[0]=(byte)b;
236 cr=CoderResult.malformedForLength(sourceArrayIndex);
238 } else if ((!useIMAP && b!=PLUS) || (useIMAP && b!=AMPERSAND)) {
239 // write directly encoded character
240 if (target.hasRemaining()) { // Check to make sure that there is room in target.
242 if (offsets!= null) {
243 offsets.put(sourceIndex++);
245 } else { // Get out and set the CoderResult.
246 charErrorBufferArray[charErrorBufferLength++] = b;
247 cr = CoderResult.OVERFLOW;
250 } else { /* PLUS or (AMPERSAND in IMAP)*/
251 /* switch to Unicode mode */
252 nextSourceIndex=++sourceIndex;
261 if (source.hasRemaining() && target.position() >= target.limit()) {
263 cr=CoderResult.OVERFLOW;
266 } else { /* Unicode Mode*/
268 * In Unicode Mode, UTF-16BE is base64-encoded.
269 * The base64 sequence ends with any character that is not in the base64 alphabet.
270 * A terminating minus sign is consumed.
272 * In Unicode Mode, the sourceIndex has the index to the start of the current
273 * base64 bytes, while nextSourceIndex is precisely parallel to source,
274 * keeping the index to the following byte.
276 while(source.hasRemaining()) {
277 if (target.hasRemaining()) {
278 b=(char)source.get();
280 toUBytesArray[byteIndex++]=(byte)b;
281 base64Value = -3; /* initialize as illegal */
282 if ((!useIMAP && (b>=126 || (base64Value=FROM_BASE_64[b])==-3 || base64Value==-1)) || (useIMAP && b>0x7e)) {
284 * base64Value==-1 for any legal character except base64 and minus sign, or
285 * base64Value==-3 for illegal characters:
286 * 1. In either case, leave Unicode mode.
287 * 2.1. If we ended with an incomplete UChar or none after the +, then
288 * generate an error for the preceding erroneous sequence and deal with
289 * the current (possibly illegal) character next time through.
290 * 2.2. Else the current char comes after a complete UChar, which was already
291 * pushed to the output buf, so:
292 * 2.2.1. If the current char is legal, just save it for processing next time.
293 * It may be for example, a plus which we need to deal with in direct mode.
294 * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
298 if(base64Counter==-1) {
299 /* illegal: + immediately followed by something other than base64 or minus sign */
300 /* include the plus sign in the reported sequence, but not the subsequent char */
301 source.position(source.position() -1);
302 toUBytesArray[0]=PLUS;
304 cr=CoderResult.malformedForLength(sourceArrayIndex);
307 /* bits are illegally left over, a UChar is incomplete */
308 /* don't include current char (legal or illegal) in error seq */
309 source.position(source.position() -1);
311 cr=CoderResult.malformedForLength(sourceArrayIndex);
314 /* previous UChar was complete */
315 if(base64Value==-3) {
316 /* current character is illegal, deal with it here */
317 cr=CoderResult.malformedForLength(sourceArrayIndex);
320 /* un-read the current character in case it is a plus sign */
321 source.position(source.position() -1);
322 sourceIndex=nextSourceIndex-1;
326 } else if ((!useIMAP && (base64Value=FROM_BASE_64[b])>=0) || (useIMAP && (base64Value=FROM_BASE64_IMAP(b))>=0)) {
327 /* collect base64 bytes */
328 switch (base64Counter) {
329 case -1: /* -1 is immediately after the + */
331 bits=(char)base64Value;
338 bits=(char)((bits<<6) | base64Value);
342 c=(char)((bits<<4) | (base64Value>>2));
343 if (useIMAP && isLegal(c, useIMAP)) {
346 cr=CoderResult.malformedForLength(sourceArrayIndex);
351 if (offsets != null) {
352 offsets.put(sourceIndex);
353 sourceIndex=nextSourceIndex - 1;
355 toUBytesArray[0]=(byte)b; /* keep this byte in case an error occurs */
357 bits=(char)(base64Value&3);
361 c=(char)((bits<<2) | (base64Value>>4));
362 if(useIMAP && isLegal(c, useIMAP)) {
365 cr=CoderResult.malformedForLength(sourceArrayIndex);
370 if (offsets != null) {
371 offsets.put(sourceIndex);
372 sourceIndex=nextSourceIndex - 1;
374 toUBytesArray[0]=(byte)b; /* keep this byte in case an error occurs */
376 bits=(char)(base64Value&15);
380 c=(char)((bits<<6) | base64Value);
381 if (useIMAP && isLegal(c, useIMAP)) {
384 cr=CoderResult.malformedForLength(sourceArrayIndex);
389 if (offsets != null) {
390 offsets.put(sourceIndex);
391 sourceIndex=nextSourceIndex;
398 /* will never occur */
401 } else if (!useIMAP || (useIMAP && base64Value==-2)) {
402 /* minus sign terminates the base64 sequence */
404 if (base64Counter==-1) {
405 /* +- i.e. a minus immediately following a plus */
406 target.put(useIMAP ? (char)AMPERSAND : (char)PLUS);
407 if (offsets != null) {
408 offsets.put(sourceIndex - 1);
411 /* absorb the minus and leave the Unicode Mode */
412 if (bits!=0 || (useIMAP && base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
413 /*bits are illegally left over, a unicode character is incomplete */
414 cr=CoderResult.malformedForLength(sourceArrayIndex);
418 sourceIndex=nextSourceIndex;
420 } else if (useIMAP) {
421 if (base64Counter==-1) {
422 // illegal: & immediately followed by something other than base64 or minus sign
423 // include the ampersand in the reported sequence
425 toUBytesArray[0]=AMPERSAND;
426 toUBytesArray[1]=(byte)b;
429 /* base64Value==-3 for illegal characters */
432 cr=CoderResult.malformedForLength(sourceArrayIndex);
437 cr=CoderResult.OVERFLOW;
443 }//end of direct mode label
445 if (!cr.isError() && inDirectMode==0 && flush && byteIndex==0 && !source.hasRemaining()) {
446 if (base64Counter==-1) {
447 /* & at the very end of the input */
448 /* make the ampersand the reported sequence */
449 toUBytesArray[0]=AMPERSAND;
452 /* else if (base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
454 cr=CoderResult.malformedForLength(sourceIndex);
458 if (!cr.isError() && flush && !source.hasRemaining() && bits ==0) {
460 * if we are in Unicode Mode, then the byteIndex might not be 0,
461 * but that is ok if bits == 0
462 * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
463 * (not true for IMAP-mailbox-name where we must end in direct mode)
465 if (!cr.isOverflow()) {
470 /* set the converter state */
471 toUnicodeStatus=(inDirectMode<<24 | (((short)base64Counter & UConverterConstants.UNSIGNED_BYTE_MASK)<<16) | (int)bits);
478 class CharsetEncoderUTF7 extends CharsetEncoderICU {
479 public CharsetEncoderUTF7(CharsetICU cs) {
480 super(cs, fromUSubstitution);
/*
 * Resets the encoder state word: the top four bits of fromUnicodeStatus are kept,
 * bit 24 is set (encodeLoop reads that bit back as inDirectMode) and all other
 * bits, including the base64 counter and the pending bits, are cleared.
 */
484 protected void implReset() {
486 fromUnicodeStatus=(fromUnicodeStatus & 0xf0000000) | 0x1000000;
489 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
490 CoderResult cr=CoderResult.UNDERFLOW;
492 byte encodeDirectly[];
495 int length, targetCapacity, sourceIndex;
501 /* get the state machine state */
503 status=fromUnicodeStatus;
504 encodeDirectly=(((long)status) < 0x10000000) ? ENCODE_DIRECTLY_MAXIMUM : ENCODE_DIRECTLY_RESTRICTED;
505 inDirectMode=(byte)((status >> 24) & 1);
506 base64Counter=(byte)(status >> 16);
507 bits=(char)((byte)status);
509 /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
512 directMode: while(true) {
513 if(inDirectMode==1) {
514 length=source.remaining();
515 targetCapacity=target.remaining();
516 if(length > targetCapacity) {
517 length=targetCapacity;
521 /* UTF7: currently always encode CR LF SP TAB directly */
522 /* IMAP: encode 0x20..0x7e except '&' directly */
523 if ((!useIMAP && c<=127 && encodeDirectly[c]==1) || (useIMAP && inSetDIMAP(c))) {
524 /* encode directly */
526 if (offsets != null) {
527 offsets.put(sourceIndex++);
529 } else if ((!useIMAP && c==PLUS) || (useIMAP && c==AMPERSAND)) {
530 /* IMAP: output &- for & */
531 /* UTF-7: output +- for + */
532 target.put(useIMAP ? AMPERSAND : PLUS);
533 if (target.hasRemaining()) {
535 if (offsets != null) {
536 offsets.put(sourceIndex);
537 offsets.put(sourceIndex++);
539 /* realign length and targetCapacity */
542 if (offsets != null) {
543 offsets.put(sourceIndex++);
545 errorBuffer[0]=MINUS;
547 cr=CoderResult.OVERFLOW;
551 /* un-read this character and switch to unicode mode */
552 source.position(source.position() - 1);
553 target.put(useIMAP ? AMPERSAND : PLUS);
554 if (offsets != null) {
555 offsets.put(sourceIndex);
563 if (source.hasRemaining() && !target.hasRemaining()) {
565 cr=CoderResult.OVERFLOW;
570 while (source.hasRemaining()) {
571 if (target.hasRemaining()) {
573 if ((!useIMAP && c<=127 && encodeDirectly[c]==1) || (useIMAP && isLegal(c, useIMAP))) {
574 /* encode directly */
577 /* trick: back out this character to make this easier */
578 source.position(source.position() - 1);
580 /* terminate the base64 sequence */
581 if (base64Counter!=0) {
582 /* write remaining bits for the previous character */
583 target.put(useIMAP ? TO_BASE64_IMAP(bits) : TO_BASE_64[bits]);
585 offsets.put(sourceIndex-1);
588 if (FROM_BASE_64[c]!=-1 || useIMAP) {
589 /* need to terminate with a minus */
590 if (target.hasRemaining()) {
593 offsets.put(sourceIndex-1);
596 errorBuffer[0]=MINUS;
598 cr=CoderResult.OVERFLOW;
605 * base64 this character:
606 * Output 2 or 3 base64 bytes for the remaining bits of the previous character
607 * and the bits of this character, each implicitly in UTF-16BE.
609 * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
610 * character to the next. The actual 2 or 4 bits are shifted to the left edge
611 * of the 6-bit field 5..0 to make the termination of the base64 sequence easier.
613 switch (base64Counter) {
616 target.put(useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b]);
617 if (target.hasRemaining()) {
618 b=(char)((c>>4)&0x3f);
619 target.put(useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b]);
621 offsets.put(sourceIndex);
622 offsets.put(sourceIndex++);
626 offsets.put(sourceIndex++);
628 b=(char)((c>>4)&0x3f);
629 errorBuffer[0]=useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b];
631 cr=CoderResult.OVERFLOW;
633 bits=(char)((c&15)<<2);
637 b=(char)(bits|(c>>14));
638 target.put(useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b]);
639 if (target.hasRemaining()) {
640 b=(char)((c>>8)&0x3f);
641 target.put(useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b]);
642 if (target.hasRemaining()) {
643 b=(char)((c>>2)&0x3f);
644 target.put(useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b]);
646 offsets.put(sourceIndex);
647 offsets.put(sourceIndex);
648 offsets.put(sourceIndex++);
652 offsets.put(sourceIndex);
653 offsets.put(sourceIndex++);
655 b=(char)((c>>2)&0x3f);
656 errorBuffer[0]=useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b];
658 cr=CoderResult.OVERFLOW;
662 offsets.put(sourceIndex++);
664 b=(char)((c>>8)&0x3f);
665 errorBuffer[0]=useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b];
666 b=(char)((c>>2)&0x3f);
667 errorBuffer[1]=useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b];
669 cr=CoderResult.OVERFLOW;
671 bits=(char)((c&3)<<4);
675 b=(char)(bits|(c>>12));
676 target.put(useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b]);
677 if (target.hasRemaining()) {
678 b=(char)((c>>6)&0x3f);
679 target.put(useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b]);
680 if (target.hasRemaining()) {
682 target.put(useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b]);
684 offsets.put(sourceIndex);
685 offsets.put(sourceIndex);
686 offsets.put(sourceIndex++);
690 offsets.put(sourceIndex);
691 offsets.put(sourceIndex++);
694 errorBuffer[0]=useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b];
696 cr=CoderResult.OVERFLOW;
700 offsets.put(sourceIndex++);
702 b=(char)((c>>6)&0x3f);
703 errorBuffer[0]=useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b];
705 errorBuffer[1]=useIMAP ? TO_BASE64_IMAP(b) : TO_BASE_64[b];
707 cr=CoderResult.OVERFLOW;
713 /* will never occur */
719 cr=CoderResult.OVERFLOW;
725 } //end of directMode label
727 if (flush && !source.hasRemaining()) {
728 /* flush remaining bits to the target */
729 if (inDirectMode==0) {
730 if (base64Counter!=0) {
731 if (target.hasRemaining()) {
732 target.put(useIMAP ? TO_BASE64_IMAP(bits) : TO_BASE_64[bits]);
734 offsets.put(sourceIndex - 1);
737 errorBuffer[errorBufferLength++]=useIMAP ? TO_BASE64_IMAP(bits) : TO_BASE_64[bits];
738 cr=CoderResult.OVERFLOW;
742 /* need to terminate with a minus */
743 if (target.hasRemaining()) {
746 offsets.put(sourceIndex - 1);
749 errorBuffer[errorBufferLength++]=MINUS;
750 cr=CoderResult.OVERFLOW;
753 /*reset the state for the next conversion */
754 fromUnicodeStatus=((status&0xf0000000) | 0x1000000); /* keep version, inDirectMode=TRUE */
756 /* set the converter state back */
757 fromUnicodeStatus=((status&0xf0000000) | (inDirectMode<<24) | (((short)base64Counter & UConverterConstants.UNSIGNED_BYTE_MASK)<<16) | ((int)bits));
/* Creates a new CharsetDecoderUTF7 bound to this charset. */
764 public CharsetDecoder newDecoder() {
765 return new CharsetDecoderUTF7(this);
/* Creates a new CharsetEncoderUTF7 bound to this charset. */
768 public CharsetEncoder newEncoder() {
769 return new CharsetEncoderUTF7(this);
/*
 * Fills setFillIn by delegating to getCompleteUnicodeSet.
 * The which argument is not used in the visible line.
 */
772 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
773 getCompleteUnicodeSet(setFillIn);