jars/icu4j-4_4_2-src/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF32.java

   1 /**\r
   2  *******************************************************************************\r
   3  * Copyright (C) 2006-2008, International Business Machines Corporation and    *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 package com.ibm.icu.charset;\r
   8 \r
   9 import java.nio.ByteBuffer;\r
  10 import java.nio.CharBuffer;\r
  11 import java.nio.IntBuffer;\r
  12 import java.nio.charset.CharsetDecoder;\r
  13 import java.nio.charset.CharsetEncoder;\r
  14 import java.nio.charset.CoderResult;\r
  15 \r
  16 import com.ibm.icu.text.UTF16;\r
  17 import com.ibm.icu.text.UnicodeSet;\r
  18 \r
  19 /**\r
  20  * @author Niti Hantaweepant\r
  21  */\r
  22 class CharsetUTF32 extends CharsetICU {\r
  23 \r
  24     private static final int SIGNATURE_LENGTH = 4;\r
  25     private static final byte[] fromUSubstitution_BE = { (byte) 0, (byte) 0, (byte) 0xff, (byte) 0xfd };\r
  26     private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff, (byte) 0, (byte) 0 };\r
  27     private static final byte[] BOM_BE = { 0, 0, (byte) 0xfe, (byte) 0xff };\r
  28     private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe, 0, 0 };\r
  29     private static final int ENDIAN_XOR_BE = 0;\r
  30     private static final int ENDIAN_XOR_LE = 3;\r
  31     private static final int NEED_TO_WRITE_BOM = 1;\r
  32 \r
  33     private boolean isEndianSpecified;\r
  34     private boolean isBigEndian;\r
  35     private int endianXOR;\r
  36     private byte[] bom;\r
  37     private byte[] fromUSubstitution;\r
  38 \r
  39     public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) {\r
  40         super(icuCanonicalName, javaCanonicalName, aliases);\r
  41 \r
  42         this.isEndianSpecified = (this instanceof CharsetUTF32BE || this instanceof CharsetUTF32LE);\r
  43         this.isBigEndian = !(this instanceof CharsetUTF32LE);\r
  44 \r
  45         if (isBigEndian) {\r
  46             this.bom = BOM_BE;\r
  47             this.fromUSubstitution = fromUSubstitution_BE;\r
  48             this.endianXOR = ENDIAN_XOR_BE;\r
  49         } else {\r
  50             this.bom = BOM_LE;\r
  51             this.fromUSubstitution = fromUSubstitution_LE;\r
  52             this.endianXOR = ENDIAN_XOR_LE;\r
  53         }\r
  54 \r
  55         maxBytesPerChar = 4;\r
  56         minBytesPerChar = 4;\r
  57         maxCharsPerByte = 1;\r
  58     }\r
  59 \r
  60     class CharsetDecoderUTF32 extends CharsetDecoderICU {\r
  61 \r
  62         private boolean isBOMReadYet;\r
  63         private int actualEndianXOR;\r
  64         private byte[] actualBOM;\r
  65 \r
  66         public CharsetDecoderUTF32(CharsetICU cs) {\r
  67             super(cs);\r
  68         }\r
  69 \r
  70         protected void implReset() {\r
  71             super.implReset();\r
  72             isBOMReadYet = false;\r
  73             actualBOM = null;\r
  74         }\r
  75 \r
  76         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {\r
  77             /*\r
  78              * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual\r
  79              * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that\r
  80              * are in the current buffer.\r
  81              */\r
  82             if (!isBOMReadYet) {\r
  83                 while (true) {\r
  84                     if (!source.hasRemaining())\r
  85                         return CoderResult.UNDERFLOW;\r
  86 \r
  87                     toUBytesArray[toULength++] = source.get();\r
  88 \r
  89                     if (toULength == 1) {\r
  90                         // on the first byte, we haven't decided whether or not it's bigEndian yet\r
  91                         if ((!isEndianSpecified || isBigEndian)\r
  92                                 && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {\r
  93                             actualBOM = BOM_BE;\r
  94                             actualEndianXOR = ENDIAN_XOR_BE;\r
  95                         } else if ((!isEndianSpecified || !isBigEndian)\r
  96                                 && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {\r
  97                             actualBOM = BOM_LE;\r
  98                             actualEndianXOR = ENDIAN_XOR_LE;\r
  99                         } else {\r
 100                             // we do not have a BOM (and we have toULength==1 bytes)\r
 101                             actualBOM = null;\r
 102                             actualEndianXOR = endianXOR;\r
 103                             break;\r
 104                         }\r
 105                     } else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {\r
 106                         // we do not have a BOM (and we have toULength bytes)\r
 107                         actualBOM = null;\r
 108                         actualEndianXOR = endianXOR;\r
 109                         break;\r
 110                     } else if (toULength == SIGNATURE_LENGTH) {\r
 111                         // we found a BOM! at last!\r
 112                         // too bad we have to get ignore it now (like it was unwanted or something)\r
 113                         toULength = 0;\r
 114                         break;\r
 115                     }\r
 116                 }\r
 117 \r
 118                 isBOMReadYet = true;\r
 119             }\r
 120 \r
 121             // now that we no longer need to look for a BOM, let's do some work\r
 122             int char32;\r
 123 \r
 124             while (true) {\r
 125                 while (toULength < 4) {\r
 126                     if (!source.hasRemaining())\r
 127                         return CoderResult.UNDERFLOW;\r
 128                     toUBytesArray[toULength++] = source.get();\r
 129                 }\r
 130 \r
 131                 if (!target.hasRemaining())\r
 132                     return CoderResult.OVERFLOW;\r
 133 \r
 134                 char32 = 0;\r
 135                 for (int i = 0; i < 4; i++)\r
 136                     char32 = (char32 << 8)\r
 137                             | (toUBytesArray[i ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK);\r
 138 \r
 139                 if (0 <= char32 && char32 <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(char32)) {\r
 140                     toULength = 0;\r
 141                     if (char32 <= UConverterConstants.MAXIMUM_UCS2) {\r
 142                         /* fits in 16 bits */\r
 143                         target.put((char) char32);\r
 144                     } else {\r
 145                         /* write out the surrogates */\r
 146                         target.put(UTF16.getLeadSurrogate(char32));\r
 147                         char32 = UTF16.getTrailSurrogate(char32);\r
 148                         if (target.hasRemaining()) {\r
 149                             target.put((char) char32);\r
 150                         } else {\r
 151                             /* Put in overflow buffer (not handled here) */\r
 152                             charErrorBufferArray[0] = (char) char32;\r
 153                             charErrorBufferLength = 1;\r
 154                             return CoderResult.OVERFLOW;\r
 155                         }\r
 156                     }\r
 157                 } else {\r
 158                     return CoderResult.malformedForLength(toULength);\r
 159                 }\r
 160             }\r
 161         }\r
 162     }\r
 163 \r
 164     class CharsetEncoderUTF32 extends CharsetEncoderICU {\r
 165         private final byte[] temp = new byte[4];\r
 166 \r
 167         public CharsetEncoderUTF32(CharsetICU cs) {\r
 168             super(cs, fromUSubstitution);\r
 169             fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;\r
 170         }\r
 171 \r
 172         protected void implReset() {\r
 173             super.implReset();\r
 174             fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;\r
 175         }\r
 176 \r
 177         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {\r
 178             CoderResult cr;\r
 179 \r
 180             /* write the BOM if necessary */\r
 181             if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {\r
 182                 if (!target.hasRemaining())\r
 183                     return CoderResult.OVERFLOW;\r
 184 \r
 185                 fromUnicodeStatus = 0;\r
 186                 cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);\r
 187                 if (cr.isOverflow())\r
 188                     return cr;\r
 189             }\r
 190 \r
 191             if (fromUChar32 != 0) {\r
 192                 if (!target.hasRemaining())\r
 193                     return CoderResult.OVERFLOW;\r
 194 \r
 195                 // a note: fromUChar32 will either be 0 or a lead surrogate\r
 196                 cr = encodeChar(source, target, offsets, (char) fromUChar32);\r
 197                 if (cr != null)\r
 198                     return cr;\r
 199             }\r
 200 \r
 201             while (true) {\r
 202                 if (!source.hasRemaining())\r
 203                     return CoderResult.UNDERFLOW;\r
 204                 if (!target.hasRemaining())\r
 205                     return CoderResult.OVERFLOW;\r
 206 \r
 207                 cr = encodeChar(source, target, offsets, source.get());\r
 208                 if (cr != null)\r
 209                     return cr;\r
 210             }\r
 211         }\r
 212 \r
 213         private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {\r
 214             int sourceIndex = source.position() - 1;\r
 215             CoderResult cr;\r
 216             int char32;\r
 217 \r
 218             if (UTF16.isSurrogate(ch)) {\r
 219                 cr = handleSurrogates(source, ch);\r
 220                 if (cr != null)\r
 221                     return cr;\r
 222 \r
 223                 char32 = fromUChar32;\r
 224                 fromUChar32 = 0;\r
 225             } else {\r
 226                 char32 = ch;\r
 227             }\r
 228 \r
 229             /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */\r
 230             // temp[0 ^ endianXOR] = (byte) (char32 >>> 24); // (always 0)\r
 231             temp[1 ^ endianXOR] = (byte) (char32 >>> 16); // same as (byte)((char32 >>> 16) & 0x1f)\r
 232             temp[2 ^ endianXOR] = (byte) (char32 >>> 8);\r
 233             temp[3 ^ endianXOR] = (byte) (char32);\r
 234             cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);\r
 235             return (cr.isUnderflow() ? null : cr);\r
 236         }\r
 237     }\r
 238 \r
 239     public CharsetDecoder newDecoder() {\r
 240         return new CharsetDecoderUTF32(this);\r
 241     }\r
 242 \r
 243     public CharsetEncoder newEncoder() {\r
 244         return new CharsetEncoderUTF32(this);\r
 245     }\r
 246     \r
 247     \r
 248     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){\r
 249         getNonSurrogateUnicodeSet(setFillIn);                    \r
 250     }\r
 251 }\r