]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/classes/charset/src/com/ibm/icu/charset/CharsetHZ.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / classes / charset / src / com / ibm / icu / charset / CharsetHZ.java
1 /*\r
2  *******************************************************************************\r
3  * Copyright (C) 2008-2009, International Business Machines Corporation and         *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7 package com.ibm.icu.charset;\r
8 \r
9 import java.nio.ByteBuffer;\r
10 import java.nio.CharBuffer;\r
11 import java.nio.IntBuffer;\r
12 import java.nio.charset.CharsetDecoder;\r
13 import java.nio.charset.CharsetEncoder;\r
14 import java.nio.charset.CoderResult;\r
15 \r
16 import com.ibm.icu.text.UTF16;\r
17 import com.ibm.icu.text.UnicodeSet;\r
18 \r
19 class CharsetHZ extends CharsetICU {\r
20 \r
21     private static final int UCNV_TILDE = 0x7E; /* ~ */\r
22     private static final int UCNV_OPEN_BRACE = 0x7B; /* { */\r
23     private static final int UCNV_CLOSE_BRACE = 0x7D; /* } */\r
24     private static final byte[] SB_ESCAPE = new byte[] { 0x7E, 0x7D };\r
25     private static final byte[] DB_ESCAPE = new byte[] { 0x7E, 0x7B };\r
26     private static final byte[] TILDE_ESCAPE = new byte[] { 0x7E, 0x7E };\r
27     private static final byte[] fromUSubstitution = new byte[] { (byte) 0x1A };\r
28 \r
29     private CharsetMBCS gbCharset;\r
30     private boolean isEmptySegment;\r
31 \r
32     public CharsetHZ(String icuCanonicalName, String canonicalName, String[] aliases) {\r
33         super(icuCanonicalName, canonicalName, aliases);\r
34         gbCharset = (CharsetMBCS) new CharsetProviderICU().charsetForName("GBK");\r
35 \r
36         maxBytesPerChar = 4;\r
37         minBytesPerChar = 1;\r
38         maxCharsPerByte = 1;\r
39         \r
40         isEmptySegment = false;\r
41     }\r
42 \r
43     class CharsetDecoderHZ extends CharsetDecoderICU {\r
44         CharsetMBCS.CharsetDecoderMBCS gbDecoder;\r
45         boolean isStateDBCS = false;\r
46 \r
47         public CharsetDecoderHZ(CharsetICU cs) {\r
48             super(cs);\r
49             gbDecoder = (CharsetMBCS.CharsetDecoderMBCS) gbCharset.newDecoder();\r
50         }\r
51 \r
52         protected void implReset() {\r
53             super.implReset();\r
54             gbDecoder.implReset();\r
55 \r
56             isStateDBCS = false;\r
57             isEmptySegment = false;\r
58         }\r
59 \r
60         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {\r
61             CoderResult err = CoderResult.UNDERFLOW;\r
62             byte[] tempBuf = new byte[2];\r
63             int targetUniChar = 0;\r
64             int mySourceChar = 0;\r
65 \r
66             if (!source.hasRemaining())\r
67                 return CoderResult.UNDERFLOW;\r
68             else if (!target.hasRemaining())\r
69                 return CoderResult.OVERFLOW;\r
70 \r
71             while (source.hasRemaining()) {\r
72 \r
73                 if (target.hasRemaining()) {\r
74 \r
75                     // get the byte as unsigned\r
76                     mySourceChar = source.get() & 0xff;\r
77 \r
78                     if (mode == UCNV_TILDE) {\r
79                         /* second byte after ~ */\r
80                         mode = 0;\r
81                         switch (mySourceChar) {\r
82                         case 0x0A:\r
83                             /* no output for ~\n (line-continuation marker) */\r
84                             continue;\r
85                         case UCNV_TILDE:\r
86                             if (offsets != null) {\r
87                                 offsets.put(source.position() - 2);\r
88                             }\r
89                             target.put((char) mySourceChar);\r
90                             continue;\r
91                         case UCNV_OPEN_BRACE:\r
92                         case UCNV_CLOSE_BRACE:\r
93                             isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);\r
94                             if (isEmptySegment) {\r
95                                 isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */\r
96                                 this.toUBytesArray[0] = UCNV_TILDE;\r
97                                 this.toUBytesArray[1] = (byte)mySourceChar;\r
98                                 this.toULength = 2;\r
99                                 return CoderResult.malformedForLength(1);\r
100                             }\r
101                             isEmptySegment = true;\r
102                             continue;\r
103                         default:\r
104                             /*\r
105                              * if the first byte is equal to TILDE and the trail byte is not a valid byte then it is an\r
106                              * error condition\r
107                              */\r
108                             /*\r
109                              * Ticket 5691: consistent illegal sequences:\r
110                              * - We include at least the first byte in the illegal sequence.\r
111                              * - If any of the non-initial bytes could be the start of a character,\r
112                              *   we stop the illegal sequence before the first one of those.\r
113                              */\r
114                             isEmptySegment = false; /* different error here, reset this to avoid spurious furture error */\r
115                             err = CoderResult.malformedForLength(1);\r
116                             toUBytesArray[0] = UCNV_TILDE;\r
117                             if (isStateDBCS ? (0x21 <= mySourceChar && mySourceChar <= 0x7e) : mySourceChar <= 0x7f) {\r
118                                 /* The current byte could be the start of a character: Back it out. */\r
119                                 toULength = 1;\r
120                                 source.position(source.position() - 1);\r
121                             } else {\r
122                                 /* Include the current byte in the illegal sequence. */\r
123                                 toUBytesArray[1] = (byte)mySourceChar;\r
124                                 toULength = 2;\r
125                             }\r
126                             return err;\r
127                         }\r
128                     } else if (isStateDBCS) {\r
129                         if (toUnicodeStatus == 0) {\r
130                             /* lead byte */\r
131                             if (mySourceChar == UCNV_TILDE) {\r
132                                 mode = UCNV_TILDE;\r
133                             } else {\r
134                                 /*\r
135                                  * add another bit to distinguish a 0 byte from not having seen a lead byte\r
136                                  */\r
137                                 toUnicodeStatus = mySourceChar | 0x100;\r
138                                 isEmptySegment = false; /* the segment has something, either valid or will produce a different error, so reset this */ \r
139                             }\r
140                             continue;\r
141                         } else {\r
142                             /* trail byte */\r
143                             boolean leadIsOk, trailIsOk;\r
144                             int leadByte = toUnicodeStatus & 0xff;\r
145                             targetUniChar = 0xffff;\r
146                             /*\r
147                              * Ticket 5691: consistent illegal sequence\r
148                              * - We include at least the first byte in the illegal sequence.\r
149                              * - If any of the non-initial bytes could be the start of a character,\r
150                              *   we stop the illegal sequence before the first one of those\r
151                              * \r
152                              * In HZ DBCS, if the second byte is in the 21..7e range,\r
153                              * we report ony the first byte as the illegal sequence.\r
154                              * Otherwise we convert of report the pair of bytes.\r
155                              */\r
156                             leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (leadByte - 0x21)) <= (0x7d - 0x21);\r
157                             trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);\r
158                             if (leadIsOk && trailIsOk) {\r
159                                 tempBuf[0] = (byte)(leadByte + 0x80);\r
160                                 tempBuf[1] = (byte)(mySourceChar + 0x80);\r
161                                 targetUniChar = gbDecoder.simpleGetNextUChar(ByteBuffer.wrap(tempBuf), super.isFallbackUsed());\r
162                                 mySourceChar = (leadByte << 8) | mySourceChar;\r
163                             } else if (trailIsOk) {\r
164                                 /* report a single illegal byte and continue with the following DBCS starter byte */\r
165                                 source.position(source.position() - 1);\r
166                                 mySourceChar = leadByte;\r
167                             } else {\r
168                                 /* report a pair of illegal bytes if the second byte is not a DBCS starter */\r
169                                 /* add another bit so that the code below writes 2 bytes in case of error */\r
170                                 mySourceChar = 0x10000 | (leadByte << 8) | mySourceChar;\r
171                             }\r
172                             toUnicodeStatus = 0x00;\r
173                         }\r
174                     } else {\r
175                         if (mySourceChar == UCNV_TILDE) {\r
176                             mode = UCNV_TILDE;\r
177                             continue;\r
178                         } else if (mySourceChar <= 0x7f) {\r
179                             targetUniChar = mySourceChar; /* ASCII */\r
180                             isEmptySegment = false; /* the segment has something valid */\r
181                         } else {\r
182                             targetUniChar = 0xffff;\r
183                             isEmptySegment = false; /* different error here, reset this to avoid spurious future error */\r
184                         }\r
185                     }\r
186 \r
187                     if (targetUniChar < 0xfffe) {\r
188                         if (offsets != null) {\r
189                             offsets.put(source.position() - 1 - (isStateDBCS ? 1 : 0));\r
190                         }\r
191 \r
192                         target.put((char) targetUniChar);\r
193                     } else /* targetUniChar >= 0xfffe */{\r
194                         if (mySourceChar > 0xff) {\r
195                             toUBytesArray[toUBytesBegin + 0] = (byte) (mySourceChar >> 8);\r
196                             toUBytesArray[toUBytesBegin + 1] = (byte) mySourceChar;\r
197                             toULength = 2;\r
198                         } else {\r
199                             toUBytesArray[toUBytesBegin + 0] = (byte) mySourceChar;\r
200                             toULength = 1;\r
201                         }\r
202                         if (targetUniChar == 0xfffe) {\r
203                             return CoderResult.unmappableForLength(toULength);\r
204                         } else {\r
205                             return CoderResult.malformedForLength(toULength);\r
206                         }\r
207                     }\r
208                 } else {\r
209                     return CoderResult.OVERFLOW;\r
210                 }\r
211             }\r
212 \r
213             return err;\r
214         }\r
215     }\r
216 \r
217     class CharsetEncoderHZ extends CharsetEncoderICU {\r
218         CharsetMBCS.CharsetEncoderMBCS gbEncoder;\r
219         boolean isEscapeAppended = false;\r
220         boolean isTargetUCharDBCS = false;\r
221 \r
222         public CharsetEncoderHZ(CharsetICU cs) {\r
223             super(cs, fromUSubstitution);\r
224             gbEncoder = (CharsetMBCS.CharsetEncoderMBCS) gbCharset.newEncoder();\r
225         }\r
226 \r
227         protected void implReset() {\r
228             super.implReset();\r
229             gbEncoder.implReset();\r
230 \r
231             isEscapeAppended = false;\r
232             isTargetUCharDBCS = false;\r
233         }\r
234 \r
235         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {\r
236             int length = 0;\r
237             int[] targetUniChar = new int[] { 0 };\r
238             int mySourceChar = 0;\r
239             boolean oldIsTargetUCharDBCS = isTargetUCharDBCS;\r
240 \r
241             if (!source.hasRemaining())\r
242                 return CoderResult.UNDERFLOW;\r
243             else if (!target.hasRemaining())\r
244                 return CoderResult.OVERFLOW;\r
245 \r
246             if (fromUChar32 != 0 && target.hasRemaining()) {\r
247                 CoderResult cr = handleSurrogates(source, (char) fromUChar32);\r
248                 return (cr != null) ? cr : CoderResult.unmappableForLength(2);\r
249             }\r
250             /* writing the char to the output stream */\r
251             while (source.hasRemaining()) {\r
252                 targetUniChar[0] = MISSING_CHAR_MARKER;\r
253                 if (target.hasRemaining()) {\r
254 \r
255                     mySourceChar = source.get();\r
256 \r
257                     oldIsTargetUCharDBCS = isTargetUCharDBCS;\r
258                     if (mySourceChar == UCNV_TILDE) {\r
259                         /*\r
260                          * concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);\r
261                          */\r
262                         concatEscape(source, target, offsets, TILDE_ESCAPE);\r
263                         continue;\r
264                     } else if (mySourceChar <= 0x7f) {\r
265                         length = 1;\r
266                         targetUniChar[0] = mySourceChar;\r
267                     } else {\r
268                         length = gbEncoder.fromUChar32(mySourceChar, targetUniChar, super.isFallbackUsed());\r
269 \r
270                         /*\r
271                          * we can only use lead bytes 21..7D and trail bytes 21..7E\r
272                          */\r
273                         if (length == 2 && 0xa1a1 <= targetUniChar[0] && targetUniChar[0] <= 0xfdfe\r
274                                 && 0xa1 <= (targetUniChar[0] & 0xff) && (targetUniChar[0] & 0xff) <= 0xfe) {\r
275                             targetUniChar[0] -= 0x8080;\r
276                         } else {\r
277                             targetUniChar[0] = MISSING_CHAR_MARKER;\r
278                         }\r
279                     }\r
280                     if (targetUniChar[0] != MISSING_CHAR_MARKER) {\r
281                         isTargetUCharDBCS = (targetUniChar[0] > 0x00FF);\r
282                         if (oldIsTargetUCharDBCS != isTargetUCharDBCS || !isEscapeAppended) {\r
283                             /* Shifting from a double byte to single byte mode */\r
284                             if (!isTargetUCharDBCS) {\r
285                                 concatEscape(source, target, offsets, SB_ESCAPE);\r
286                                 isEscapeAppended = true;\r
287                             } else { /*\r
288                                          * Shifting from a single byte to double byte mode\r
289                                          */\r
290                                 concatEscape(source, target, offsets, DB_ESCAPE);\r
291                                 isEscapeAppended = true;\r
292 \r
293                             }\r
294                         }\r
295 \r
296                         if (isTargetUCharDBCS) {\r
297                             if (target.hasRemaining()) {\r
298                                 target.put((byte) (targetUniChar[0] >> 8));\r
299                                 if (offsets != null) {\r
300                                     offsets.put(source.position() - 1);\r
301                                 }\r
302                                 if (target.hasRemaining()) {\r
303                                     target.put((byte) targetUniChar[0]);\r
304                                     if (offsets != null) {\r
305                                         offsets.put(source.position() - 1);\r
306                                     }\r
307                                 } else {\r
308                                     errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];\r
309                                     // *err = U_BUFFER_OVERFLOW_ERROR;\r
310                                 }\r
311                             } else {\r
312                                 errorBuffer[errorBufferLength++] = (byte) (targetUniChar[0] >> 8);\r
313                                 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];\r
314                                 // *err = U_BUFFER_OVERFLOW_ERROR;\r
315                             }\r
316 \r
317                         } else {\r
318                             if (target.hasRemaining()) {\r
319                                 target.put((byte) targetUniChar[0]);\r
320                                 if (offsets != null) {\r
321                                     offsets.put(source.position() - 1);\r
322                                 }\r
323 \r
324                             } else {\r
325                                 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];\r
326                                 // *err = U_BUFFER_OVERFLOW_ERROR;\r
327                             }\r
328                         }\r
329 \r
330                     } else {\r
331                         /* oops.. the code point is unassigned */\r
332                         /* Handle surrogates */\r
333                         /* check if the char is a First surrogate */\r
334 \r
335                         if (UTF16.isSurrogate((char) mySourceChar)) {\r
336                             // use that handy handleSurrogates method everyone's been talking about!\r
337                             CoderResult cr = handleSurrogates(source, (char) mySourceChar);\r
338                             return (cr != null) ? cr : CoderResult.unmappableForLength(2);\r
339                         } else {\r
340                             /* callback(unassigned) for a BMP code point */\r
341                             // *err = U_INVALID_CHAR_FOUND;\r
342                             fromUChar32 = mySourceChar;\r
343                             return CoderResult.unmappableForLength(1);\r
344                         }\r
345                     }\r
346                 } else {\r
347                     // *err = U_BUFFER_OVERFLOW_ERROR;\r
348                     return CoderResult.OVERFLOW;\r
349                 }\r
350             }\r
351 \r
352             return CoderResult.UNDERFLOW;\r
353         }\r
354 \r
355         private CoderResult concatEscape(CharBuffer source, ByteBuffer target, IntBuffer offsets, byte[] strToAppend) {\r
356             CoderResult cr = null;\r
357             for (int i=0; i<strToAppend.length; i++) {\r
358                 byte b = strToAppend[i];\r
359                 if (target.hasRemaining()) {\r
360                     target.put(b);\r
361                     if (offsets != null)\r
362                         offsets.put(source.position() - 1);\r
363                 } else {\r
364                     errorBuffer[errorBufferLength++] = b;\r
365                     cr = CoderResult.OVERFLOW;\r
366                 }\r
367             }\r
368             return cr;\r
369         }\r
370     }\r
371 \r
372     public CharsetDecoder newDecoder() {\r
373         return new CharsetDecoderHZ(this);\r
374     }\r
375 \r
376     public CharsetEncoder newEncoder() {\r
377         return new CharsetEncoderHZ(this);\r
378     }\r
379     \r
380     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){\r
381         setFillIn.add(0,0x7f);\r
382        // CharsetMBCS mbcshz = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");\r
383         gbCharset.MBCSGetFilteredUnicodeSetForUnicode(gbCharset.sharedData, setFillIn, which, CharsetMBCS.UCNV_SET_FILTER_HZ);\r
384     }\r
385 }\r