jars/icu4j-4_4_2-src/main/classes/charset/src/com/ibm/icu/charset/CharsetHZ.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 2008-2009, International Business Machines Corporation and         *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 package com.ibm.icu.charset;\r
   8 \r
   9 import java.nio.ByteBuffer;\r
  10 import java.nio.CharBuffer;\r
  11 import java.nio.IntBuffer;\r
  12 import java.nio.charset.CharsetDecoder;\r
  13 import java.nio.charset.CharsetEncoder;\r
  14 import java.nio.charset.CoderResult;\r
  15 \r
  16 import com.ibm.icu.text.UTF16;\r
  17 import com.ibm.icu.text.UnicodeSet;\r
  18 \r
  19 class CharsetHZ extends CharsetICU {\r
  20 \r
  21     private static final int UCNV_TILDE = 0x7E; /* ~ */\r
  22     private static final int UCNV_OPEN_BRACE = 0x7B; /* { */\r
  23     private static final int UCNV_CLOSE_BRACE = 0x7D; /* } */\r
  24     private static final byte[] SB_ESCAPE = new byte[] { 0x7E, 0x7D };\r
  25     private static final byte[] DB_ESCAPE = new byte[] { 0x7E, 0x7B };\r
  26     private static final byte[] TILDE_ESCAPE = new byte[] { 0x7E, 0x7E };\r
  27     private static final byte[] fromUSubstitution = new byte[] { (byte) 0x1A };\r
  28 \r
  29     private CharsetMBCS gbCharset;\r
  30     private boolean isEmptySegment;\r
  31 \r
  32     public CharsetHZ(String icuCanonicalName, String canonicalName, String[] aliases) {\r
  33         super(icuCanonicalName, canonicalName, aliases);\r
  34         gbCharset = (CharsetMBCS) new CharsetProviderICU().charsetForName("GBK");\r
  35 \r
  36         maxBytesPerChar = 4;\r
  37         minBytesPerChar = 1;\r
  38         maxCharsPerByte = 1;\r
  39         \r
  40         isEmptySegment = false;\r
  41     }\r
  42 \r
  43     class CharsetDecoderHZ extends CharsetDecoderICU {\r
  44         CharsetMBCS.CharsetDecoderMBCS gbDecoder;\r
  45         boolean isStateDBCS = false;\r
  46 \r
  47         public CharsetDecoderHZ(CharsetICU cs) {\r
  48             super(cs);\r
  49             gbDecoder = (CharsetMBCS.CharsetDecoderMBCS) gbCharset.newDecoder();\r
  50         }\r
  51 \r
  52         protected void implReset() {\r
  53             super.implReset();\r
  54             gbDecoder.implReset();\r
  55 \r
  56             isStateDBCS = false;\r
  57             isEmptySegment = false;\r
  58         }\r
  59 \r
  60         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {\r
  61             CoderResult err = CoderResult.UNDERFLOW;\r
  62             byte[] tempBuf = new byte[2];\r
  63             int targetUniChar = 0;\r
  64             int mySourceChar = 0;\r
  65 \r
  66             if (!source.hasRemaining())\r
  67                 return CoderResult.UNDERFLOW;\r
  68             else if (!target.hasRemaining())\r
  69                 return CoderResult.OVERFLOW;\r
  70 \r
  71             while (source.hasRemaining()) {\r
  72 \r
  73                 if (target.hasRemaining()) {\r
  74 \r
  75                     // get the byte as unsigned\r
  76                     mySourceChar = source.get() & 0xff;\r
  77 \r
  78                     if (mode == UCNV_TILDE) {\r
  79                         /* second byte after ~ */\r
  80                         mode = 0;\r
  81                         switch (mySourceChar) {\r
  82                         case 0x0A:\r
  83                             /* no output for ~\n (line-continuation marker) */\r
  84                             continue;\r
  85                         case UCNV_TILDE:\r
  86                             if (offsets != null) {\r
  87                                 offsets.put(source.position() - 2);\r
  88                             }\r
  89                             target.put((char) mySourceChar);\r
  90                             continue;\r
  91                         case UCNV_OPEN_BRACE:\r
  92                         case UCNV_CLOSE_BRACE:\r
  93                             isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);\r
  94                             if (isEmptySegment) {\r
  95                                 isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */\r
  96                                 this.toUBytesArray[0] = UCNV_TILDE;\r
  97                                 this.toUBytesArray[1] = (byte)mySourceChar;\r
  98                                 this.toULength = 2;\r
  99                                 return CoderResult.malformedForLength(1);\r
 100                             }\r
 101                             isEmptySegment = true;\r
 102                             continue;\r
 103                         default:\r
 104                             /*\r
 105                              * if the first byte is equal to TILDE and the trail byte is not a valid byte then it is an\r
 106                              * error condition\r
 107                              */\r
 108                             /*\r
 109                              * Ticket 5691: consistent illegal sequences:\r
 110                              * - We include at least the first byte in the illegal sequence.\r
 111                              * - If any of the non-initial bytes could be the start of a character,\r
 112                              *   we stop the illegal sequence before the first one of those.\r
 113                              */\r
 114                             isEmptySegment = false; /* different error here, reset this to avoid spurious furture error */\r
 115                             err = CoderResult.malformedForLength(1);\r
 116                             toUBytesArray[0] = UCNV_TILDE;\r
 117                             if (isStateDBCS ? (0x21 <= mySourceChar && mySourceChar <= 0x7e) : mySourceChar <= 0x7f) {\r
 118                                 /* The current byte could be the start of a character: Back it out. */\r
 119                                 toULength = 1;\r
 120                                 source.position(source.position() - 1);\r
 121                             } else {\r
 122                                 /* Include the current byte in the illegal sequence. */\r
 123                                 toUBytesArray[1] = (byte)mySourceChar;\r
 124                                 toULength = 2;\r
 125                             }\r
 126                             return err;\r
 127                         }\r
 128                     } else if (isStateDBCS) {\r
 129                         if (toUnicodeStatus == 0) {\r
 130                             /* lead byte */\r
 131                             if (mySourceChar == UCNV_TILDE) {\r
 132                                 mode = UCNV_TILDE;\r
 133                             } else {\r
 134                                 /*\r
 135                                  * add another bit to distinguish a 0 byte from not having seen a lead byte\r
 136                                  */\r
 137                                 toUnicodeStatus = mySourceChar | 0x100;\r
 138                                 isEmptySegment = false; /* the segment has something, either valid or will produce a different error, so reset this */ \r
 139                             }\r
 140                             continue;\r
 141                         } else {\r
 142                             /* trail byte */\r
 143                             boolean leadIsOk, trailIsOk;\r
 144                             int leadByte = toUnicodeStatus & 0xff;\r
 145                             targetUniChar = 0xffff;\r
 146                             /*\r
 147                              * Ticket 5691: consistent illegal sequence\r
 148                              * - We include at least the first byte in the illegal sequence.\r
 149                              * - If any of the non-initial bytes could be the start of a character,\r
 150                              *   we stop the illegal sequence before the first one of those\r
 151                              * \r
 152                              * In HZ DBCS, if the second byte is in the 21..7e range,\r
 153                              * we report ony the first byte as the illegal sequence.\r
 154                              * Otherwise we convert of report the pair of bytes.\r
 155                              */\r
 156                             leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (leadByte - 0x21)) <= (0x7d - 0x21);\r
 157                             trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);\r
 158                             if (leadIsOk && trailIsOk) {\r
 159                                 tempBuf[0] = (byte)(leadByte + 0x80);\r
 160                                 tempBuf[1] = (byte)(mySourceChar + 0x80);\r
 161                                 targetUniChar = gbDecoder.simpleGetNextUChar(ByteBuffer.wrap(tempBuf), super.isFallbackUsed());\r
 162                                 mySourceChar = (leadByte << 8) | mySourceChar;\r
 163                             } else if (trailIsOk) {\r
 164                                 /* report a single illegal byte and continue with the following DBCS starter byte */\r
 165                                 source.position(source.position() - 1);\r
 166                                 mySourceChar = leadByte;\r
 167                             } else {\r
 168                                 /* report a pair of illegal bytes if the second byte is not a DBCS starter */\r
 169                                 /* add another bit so that the code below writes 2 bytes in case of error */\r
 170                                 mySourceChar = 0x10000 | (leadByte << 8) | mySourceChar;\r
 171                             }\r
 172                             toUnicodeStatus = 0x00;\r
 173                         }\r
 174                     } else {\r
 175                         if (mySourceChar == UCNV_TILDE) {\r
 176                             mode = UCNV_TILDE;\r
 177                             continue;\r
 178                         } else if (mySourceChar <= 0x7f) {\r
 179                             targetUniChar = mySourceChar; /* ASCII */\r
 180                             isEmptySegment = false; /* the segment has something valid */\r
 181                         } else {\r
 182                             targetUniChar = 0xffff;\r
 183                             isEmptySegment = false; /* different error here, reset this to avoid spurious future error */\r
 184                         }\r
 185                     }\r
 186 \r
 187                     if (targetUniChar < 0xfffe) {\r
 188                         if (offsets != null) {\r
 189                             offsets.put(source.position() - 1 - (isStateDBCS ? 1 : 0));\r
 190                         }\r
 191 \r
 192                         target.put((char) targetUniChar);\r
 193                     } else /* targetUniChar >= 0xfffe */{\r
 194                         if (mySourceChar > 0xff) {\r
 195                             toUBytesArray[toUBytesBegin + 0] = (byte) (mySourceChar >> 8);\r
 196                             toUBytesArray[toUBytesBegin + 1] = (byte) mySourceChar;\r
 197                             toULength = 2;\r
 198                         } else {\r
 199                             toUBytesArray[toUBytesBegin + 0] = (byte) mySourceChar;\r
 200                             toULength = 1;\r
 201                         }\r
 202                         if (targetUniChar == 0xfffe) {\r
 203                             return CoderResult.unmappableForLength(toULength);\r
 204                         } else {\r
 205                             return CoderResult.malformedForLength(toULength);\r
 206                         }\r
 207                     }\r
 208                 } else {\r
 209                     return CoderResult.OVERFLOW;\r
 210                 }\r
 211             }\r
 212 \r
 213             return err;\r
 214         }\r
 215     }\r
 216 \r
 217     class CharsetEncoderHZ extends CharsetEncoderICU {\r
 218         CharsetMBCS.CharsetEncoderMBCS gbEncoder;\r
 219         boolean isEscapeAppended = false;\r
 220         boolean isTargetUCharDBCS = false;\r
 221 \r
 222         public CharsetEncoderHZ(CharsetICU cs) {\r
 223             super(cs, fromUSubstitution);\r
 224             gbEncoder = (CharsetMBCS.CharsetEncoderMBCS) gbCharset.newEncoder();\r
 225         }\r
 226 \r
 227         protected void implReset() {\r
 228             super.implReset();\r
 229             gbEncoder.implReset();\r
 230 \r
 231             isEscapeAppended = false;\r
 232             isTargetUCharDBCS = false;\r
 233         }\r
 234 \r
 235         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {\r
 236             int length = 0;\r
 237             int[] targetUniChar = new int[] { 0 };\r
 238             int mySourceChar = 0;\r
 239             boolean oldIsTargetUCharDBCS = isTargetUCharDBCS;\r
 240 \r
 241             if (!source.hasRemaining())\r
 242                 return CoderResult.UNDERFLOW;\r
 243             else if (!target.hasRemaining())\r
 244                 return CoderResult.OVERFLOW;\r
 245 \r
 246             if (fromUChar32 != 0 && target.hasRemaining()) {\r
 247                 CoderResult cr = handleSurrogates(source, (char) fromUChar32);\r
 248                 return (cr != null) ? cr : CoderResult.unmappableForLength(2);\r
 249             }\r
 250             /* writing the char to the output stream */\r
 251             while (source.hasRemaining()) {\r
 252                 targetUniChar[0] = MISSING_CHAR_MARKER;\r
 253                 if (target.hasRemaining()) {\r
 254 \r
 255                     mySourceChar = source.get();\r
 256 \r
 257                     oldIsTargetUCharDBCS = isTargetUCharDBCS;\r
 258                     if (mySourceChar == UCNV_TILDE) {\r
 259                         /*\r
 260                          * concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);\r
 261                          */\r
 262                         concatEscape(source, target, offsets, TILDE_ESCAPE);\r
 263                         continue;\r
 264                     } else if (mySourceChar <= 0x7f) {\r
 265                         length = 1;\r
 266                         targetUniChar[0] = mySourceChar;\r
 267                     } else {\r
 268                         length = gbEncoder.fromUChar32(mySourceChar, targetUniChar, super.isFallbackUsed());\r
 269 \r
 270                         /*\r
 271                          * we can only use lead bytes 21..7D and trail bytes 21..7E\r
 272                          */\r
 273                         if (length == 2 && 0xa1a1 <= targetUniChar[0] && targetUniChar[0] <= 0xfdfe\r
 274                                 && 0xa1 <= (targetUniChar[0] & 0xff) && (targetUniChar[0] & 0xff) <= 0xfe) {\r
 275                             targetUniChar[0] -= 0x8080;\r
 276                         } else {\r
 277                             targetUniChar[0] = MISSING_CHAR_MARKER;\r
 278                         }\r
 279                     }\r
 280                     if (targetUniChar[0] != MISSING_CHAR_MARKER) {\r
 281                         isTargetUCharDBCS = (targetUniChar[0] > 0x00FF);\r
 282                         if (oldIsTargetUCharDBCS != isTargetUCharDBCS || !isEscapeAppended) {\r
 283                             /* Shifting from a double byte to single byte mode */\r
 284                             if (!isTargetUCharDBCS) {\r
 285                                 concatEscape(source, target, offsets, SB_ESCAPE);\r
 286                                 isEscapeAppended = true;\r
 287                             } else { /*\r
 288                                          * Shifting from a single byte to double byte mode\r
 289                                          */\r
 290                                 concatEscape(source, target, offsets, DB_ESCAPE);\r
 291                                 isEscapeAppended = true;\r
 292 \r
 293                             }\r
 294                         }\r
 295 \r
 296                         if (isTargetUCharDBCS) {\r
 297                             if (target.hasRemaining()) {\r
 298                                 target.put((byte) (targetUniChar[0] >> 8));\r
 299                                 if (offsets != null) {\r
 300                                     offsets.put(source.position() - 1);\r
 301                                 }\r
 302                                 if (target.hasRemaining()) {\r
 303                                     target.put((byte) targetUniChar[0]);\r
 304                                     if (offsets != null) {\r
 305                                         offsets.put(source.position() - 1);\r
 306                                     }\r
 307                                 } else {\r
 308                                     errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];\r
 309                                     // *err = U_BUFFER_OVERFLOW_ERROR;\r
 310                                 }\r
 311                             } else {\r
 312                                 errorBuffer[errorBufferLength++] = (byte) (targetUniChar[0] >> 8);\r
 313                                 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];\r
 314                                 // *err = U_BUFFER_OVERFLOW_ERROR;\r
 315                             }\r
 316 \r
 317                         } else {\r
 318                             if (target.hasRemaining()) {\r
 319                                 target.put((byte) targetUniChar[0]);\r
 320                                 if (offsets != null) {\r
 321                                     offsets.put(source.position() - 1);\r
 322                                 }\r
 323 \r
 324                             } else {\r
 325                                 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];\r
 326                                 // *err = U_BUFFER_OVERFLOW_ERROR;\r
 327                             }\r
 328                         }\r
 329 \r
 330                     } else {\r
 331                         /* oops.. the code point is unassigned */\r
 332                         /* Handle surrogates */\r
 333                         /* check if the char is a First surrogate */\r
 334 \r
 335                         if (UTF16.isSurrogate((char) mySourceChar)) {\r
 336                             // use that handy handleSurrogates method everyone's been talking about!\r
 337                             CoderResult cr = handleSurrogates(source, (char) mySourceChar);\r
 338                             return (cr != null) ? cr : CoderResult.unmappableForLength(2);\r
 339                         } else {\r
 340                             /* callback(unassigned) for a BMP code point */\r
 341                             // *err = U_INVALID_CHAR_FOUND;\r
 342                             fromUChar32 = mySourceChar;\r
 343                             return CoderResult.unmappableForLength(1);\r
 344                         }\r
 345                     }\r
 346                 } else {\r
 347                     // *err = U_BUFFER_OVERFLOW_ERROR;\r
 348                     return CoderResult.OVERFLOW;\r
 349                 }\r
 350             }\r
 351 \r
 352             return CoderResult.UNDERFLOW;\r
 353         }\r
 354 \r
 355         private CoderResult concatEscape(CharBuffer source, ByteBuffer target, IntBuffer offsets, byte[] strToAppend) {\r
 356             CoderResult cr = null;\r
 357             for (int i=0; i<strToAppend.length; i++) {\r
 358                 byte b = strToAppend[i];\r
 359                 if (target.hasRemaining()) {\r
 360                     target.put(b);\r
 361                     if (offsets != null)\r
 362                         offsets.put(source.position() - 1);\r
 363                 } else {\r
 364                     errorBuffer[errorBufferLength++] = b;\r
 365                     cr = CoderResult.OVERFLOW;\r
 366                 }\r
 367             }\r
 368             return cr;\r
 369         }\r
 370     }\r
 371 \r
 372     public CharsetDecoder newDecoder() {\r
 373         return new CharsetDecoderHZ(this);\r
 374     }\r
 375 \r
 376     public CharsetEncoder newEncoder() {\r
 377         return new CharsetEncoderHZ(this);\r
 378     }\r
 379     \r
 380     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){\r
 381         setFillIn.add(0,0x7f);\r
 382        // CharsetMBCS mbcshz = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");\r
 383         gbCharset.MBCSGetFilteredUnicodeSetForUnicode(gbCharset.sharedData, setFillIn, which, CharsetMBCS.UCNV_SET_FILTER_HZ);\r
 384     }\r
 385 }\r