2 *******************************************************************************
\r
3 * Copyright (C) 2008-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.charset;
\r
9 import java.nio.ByteBuffer;
\r
10 import java.nio.CharBuffer;
\r
11 import java.nio.IntBuffer;
\r
12 import java.nio.charset.CharsetDecoder;
\r
13 import java.nio.charset.CharsetEncoder;
\r
14 import java.nio.charset.CoderResult;
\r
16 import com.ibm.icu.lang.UCharacter;
\r
17 import com.ibm.icu.text.UTF16;
\r
18 import com.ibm.icu.text.UnicodeSet;
\r
24 class CharsetSCSU extends CharsetICU{
\r
25 /* SCSU definitions --------------------------------------------------------- */
\r
27 /* SCSU command byte values */
\r
29 private static final short SQ0=0x01; /* Quote from window pair 0 */
\r
30 private static final short SQ7=0x08; /* Quote from window pair 7 */
\r
31 private static final short SDX=0x0B; /* Define a window as extended */
\r
32 //private static final short Srs=0x0C; /* reserved */
\r
33 private static final short SQU=0x0E; /* Quote a single Unicode character */
\r
34 private static final short SCU=0x0F; /* Change to Unicode mode */
\r
35 private static final short SC0=0x10; /* Select window 0 */
\r
36 private static final short SC7=0x17; /* Select window 7 */
\r
37 private static final short SD0=0x18; /* Define and select window 0 */
\r
38 //private static final short SD7=0x1F; /* Define and select window 7 */
\r
40 private static final short UC0=0xE0; /* Select window 0 */
\r
41 private static final short UC7=0xE7; /* Select window 7 */
\r
42 private static final short UD0=0xE8; /* Define and select window 0 */
\r
43 private static final short UD7=0xEF; /* Define and select window 7 */
\r
44 private static final short UQU=0xF0; /* Quote a single Unicode character */
\r
45 private static final short UDX=0xF1; /* Define a Window as extended */
\r
46 private static final short Urs=0xF2; /* reserved */
\r
51 * Unicode code points from 3400 to E000 are not addressable by
\r
52 * dynamic window, since in these areas no short run alphabets are
\r
53 * found. Therefore add gapOffset to all values from gapThreshold.
\r
55 private static final int gapThreshold=0x68;
\r
56 private static final int gapOffset = 0xAC00 ;
\r
57 /* values between reservedStart and fixedThreshold are reserved */
\r
58 private static final int reservedStart=0xA8;
\r
59 /* use table of predefined fixed offsets for values from fixedThreshold */
\r
60 private static final int fixedThreshold=0xF9;
\r
63 protected byte[] fromUSubstitution = new byte[]{(byte)0x0E,(byte)0xFF, (byte)0xFD};
\r
65 /* constant offsets for the 8 static windows */
\r
66 private static final int staticOffsets[]={
\r
67 0x0000, /* ASCII for quoted tags */
\r
68 0x0080, /* Latin - 1 Supplement (for access to punctuation) */
\r
69 0x0100, /* Latin Extended-A */
\r
70 0x0300, /* Combining Diacritical Marks */
\r
71 0x2000, /* General Punctuation */
\r
72 0x2080, /* Currency Symbols */
\r
73 0x2100, /* Letterlike Symbols and Number Forms */
\r
74 0x3000 /* CJK Symbols and punctuation */
\r
77 /* initial offsets for the 8 dynamic (sliding) windows */
\r
78 private static final int initialDynamicOffsets[]={
\r
79 0x0080, /* Latin-1 */
\r
80 0x00C0, /* Latin Extended A */
\r
81 0x0400, /* Cyrillic */
\r
82 0x0600, /* Arabic */
\r
83 0x0900, /* Devanagari */
\r
84 0x3040, /* Hiragana */
\r
85 0x30A0, /* Katakana */
\r
86 0xFF00 /* Fullwidth ASCII */
\r
89 /* Table of fixed predefined Offsets */
\r
90 private static final int fixedOffsets[]={
\r
91 /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
\r
92 /* 0xFA */ 0x0250, /* IPA extensions */
\r
93 /* 0xFB */ 0x0370, /* Greek */
\r
94 /* 0xFC */ 0x0530, /* Armenian */
\r
95 /* 0xFD */ 0x3040, /* Hiragana */
\r
96 /* 0xFE */ 0x30A0, /* Katakana */
\r
97 /* 0xFF */ 0xFF60 /* Halfwidth Katakana */
\r
102 private static final int readCommand=0;
\r
103 private static final int quotePairOne=1;
\r
104 private static final int quotePairTwo=2;
\r
105 private static final int quoteOne=3;
\r
106 private static final int definePairOne=4;
\r
107 private static final int definePairTwo=5;
\r
108 private static final int defineOne=6;
\r
111 @SuppressWarnings("unused")
\r
112 private final class SCSUData{
\r
113 /* dynamic window offsets, initialized to default values from initialDynamicOffsets */
\r
114 int toUDynamicOffsets[] = new int[8] ;
\r
115 int fromUDynamicOffsets[] = new int[8] ;
\r
117 /* state machine state - toUnicode */
\r
118 boolean toUIsSingleByteMode;
\r
120 byte toUQuoteWindow, toUDynamicWindow;
\r
122 short toUPadding[];
\r
124 /* state machine state - fromUnicode */
\r
125 boolean fromUIsSingleByteMode;
\r
126 byte fromUDynamicWindow;
\r
129 * windowUse[] keeps track of the use of the dynamic windows:
\r
130 * At nextWindowUseIndex there is the least recently used window,
\r
131 * and the following windows (in a wrapping manner) are more and more
\r
133 * At nextWindowUseIndex-1 there is the most recently used window.
\r
136 byte nextWindowUseIndex;
\r
137 byte windowUse[] = new byte[8];
\r
144 for(int i=0;i<8;i++){
\r
145 this.toUDynamicOffsets[i] = initialDynamicOffsets[i];
\r
147 this.toUIsSingleByteMode = true;
\r
148 this.toUState = readCommand;
\r
149 this.toUQuoteWindow = 0;
\r
150 this.toUDynamicWindow = 0;
\r
151 this.toUByteOne = 0;
\r
152 this.fromUIsSingleByteMode = true;
\r
153 this.fromUDynamicWindow = 0;
\r
154 for(int i=0;i<8;i++){
\r
155 this.fromUDynamicOffsets[i] = initialDynamicOffsets[i];
\r
157 this.nextWindowUseIndex = 0;
\r
158 switch(this.locale){
\r
159 /* Not being used right now because "SCSU,locale=ja" does not work in ICU4J. */
\r
161 for(int i=0;i<8;i++){
\r
162 this.windowUse[i] = initialWindowUse_ja[i];
\r
166 for(int i=0;i<8;i++){
\r
167 this.windowUse[i] = initialWindowUse[i];
\r
174 static final byte initialWindowUse[]={ 7, 0, 3, 2, 4, 5, 6, 1 };
\r
175 /* Not being used right now because "SCSU,locale=ja" does not work in ICU4J. */
\r
176 // static final byte initialWindowUse_ja[]={ 3, 2, 4, 1, 0, 7, 5, 6 };
\r
179 //private static final int lGeneric = 0;
\r
180 /* Not being used right now because "SCSU,locale=ja" does not work in ICU4J. */
\r
181 // private static final int l_ja = 1;
\r
184 private SCSUData extraInfo = null;
\r
186 public CharsetSCSU(String icuCanonicalName, String javaCanonicalName, String[] aliases){
\r
187 super(icuCanonicalName, javaCanonicalName, aliases);
\r
188 maxBytesPerChar = 3;
\r
189 minBytesPerChar = 1;
\r
190 maxCharsPerByte = 1;
\r
191 extraInfo = new SCSUData();
\r
194 class CharsetDecoderSCSU extends CharsetDecoderICU {
\r
195 /* label values for supporting behavior similar to goto in C */
\r
196 private static final int FastSingle=0;
\r
197 private static final int SingleByteMode=1;
\r
198 private static final int EndLoop=2;
\r
201 private static final int ByteMode = 0;
\r
202 private static final int UnicodeMode =1;
\r
204 public CharsetDecoderSCSU(CharsetICU cs) {
\r
209 //private SCSUData data ;
\r
210 protected void implReset(){
\r
213 extraInfo.initialize();
\r
218 //Get the state machine state
\r
219 private boolean isSingleByteMode ;
\r
220 private short state ;
\r
221 private byte quoteWindow ;
\r
222 private byte dynamicWindow ;
\r
223 private short byteOne;
\r
226 //sourceIndex=-1 if the current character began in the previous buffer
\r
227 private int sourceIndex ;
\r
228 private int nextSourceIndex ;
\r
232 private boolean LabelLoop;// used to break the while loop
\r
234 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets,
\r
238 //Get the state machine state
\r
239 isSingleByteMode = data.toUIsSingleByteMode;
\r
240 state = data.toUState;
\r
241 quoteWindow = data.toUQuoteWindow;
\r
242 dynamicWindow = data.toUDynamicWindow;
\r
243 byteOne = data.toUByteOne;
\r
247 //sourceIndex=-1 if the current character began in the previous buffer
\r
248 sourceIndex = data.toUState == readCommand ? 0: -1 ;
\r
249 nextSourceIndex = 0;
\r
251 cr = CoderResult.UNDERFLOW;
\r
254 if(isSingleByteMode){
\r
257 /*fast path for single-byte mode*/
\r
258 labelType = fastSingle(source, target, offsets, ByteMode);
\r
260 case SingleByteMode:
\r
261 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
\r
262 labelType = singleByteMode(source, target, offsets, ByteMode);
\r
265 endLoop(source, target, offsets);
\r
271 /*fast path for single-byte mode*/
\r
272 labelType = fastSingle(source, target, offsets, UnicodeMode);
\r
274 case SingleByteMode:
\r
275 /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
\r
276 labelType = singleByteMode(source, target, offsets, UnicodeMode);
\r
279 endLoop(source, target, offsets);
\r
282 //LabelLoop = false;
\r
288 private int fastSingle(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType){
\r
290 if(modeType==ByteMode){
\r
292 if(state==readCommand){
\r
293 while(source.hasRemaining() && target.hasRemaining() && (b=(short)(source.get(source.position()) & UConverterConstants.UNSIGNED_BYTE_MASK)) >= 0x20){
\r
294 source.position(source.position()+1);
\r
297 /*Write US graphic character or DEL*/
\r
298 target.put((char)b);
\r
299 if(offsets != null){
\r
300 offsets.put(sourceIndex);
\r
303 /*Write from dynamic window*/
\r
304 int c = data.toUDynamicOffsets[dynamicWindow] + (b&0x7f);
\r
306 target.put((char)c);
\r
307 if(offsets != null){
\r
308 offsets.put(sourceIndex);
\r
311 /*Output surrogate pair */
\r
312 target.put((char)(0xd7c0 + (c>>10)));
\r
313 if(target.hasRemaining()){
\r
314 target.put((char)(0xdc00 | (c&0x3ff)));
\r
315 if(offsets != null){
\r
316 offsets.put(sourceIndex);
\r
317 offsets.put(sourceIndex);
\r
320 /* target overflow */
\r
321 if(offsets != null){
\r
322 offsets.put(sourceIndex);
\r
324 charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff));
\r
325 charErrorBufferLength = 1;
\r
327 cr = CoderResult.OVERFLOW;
\r
332 sourceIndex = nextSourceIndex;
\r
334 // label = SingleByteMode;
\r
336 }else if(modeType==UnicodeMode){
\r
337 /* fast path for unicode mode */
\r
338 if(state == readCommand){
\r
339 while((source.position()+1)<source.limit() && target.hasRemaining() && (((b=source.get(source.position()))-UC0)&UConverterConstants.UNSIGNED_BYTE_MASK)>(Urs-UC0)){
\r
340 target.put((char)((b<<8)|(source.get(source.position()+1)&UConverterConstants.UNSIGNED_BYTE_MASK)));
\r
341 if(offsets != null){
\r
342 offsets.put(sourceIndex);
\r
344 sourceIndex = nextSourceIndex;
\r
345 nextSourceIndex+=2;
\r
346 source.position(source.position()+2);
\r
350 label = SingleByteMode;
\r
354 private int singleByteMode(ByteBuffer source, CharBuffer target, IntBuffer offsets, int modeType){
\r
355 int label = SingleByteMode;
\r
356 if(modeType == ByteMode){
\r
357 while(source.hasRemaining()){
\r
358 if(!target.hasRemaining()){
\r
359 cr = CoderResult.OVERFLOW;
\r
363 b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
367 /*redundant conditions are commented out */
\r
368 if(((1L<<b)&0x2601)!=0){
\r
369 target.put((char)b);
\r
370 if(offsets != null){
\r
371 offsets.put(sourceIndex);
\r
373 sourceIndex = nextSourceIndex;
\r
374 label = FastSingle;
\r
376 }else if(SC0 <= b){
\r
378 dynamicWindow = (byte)(b-SC0);
\r
379 sourceIndex = nextSourceIndex;
\r
380 label = FastSingle;
\r
382 }else /* if(SD0<=b && b<=SQ7)*/{
\r
383 dynamicWindow = (byte)(b - SD0);
\r
386 }else if(/* SQ0<=b &&*/b <= SQ7){
\r
387 quoteWindow = (byte)(b - SQ0);
\r
390 state = definePairOne;
\r
392 state = quotePairOne;
\r
394 sourceIndex = nextSourceIndex;
\r
395 isSingleByteMode = false;
\r
396 label = FastSingle;
\r
399 /*callback (illegal)*/
\r
400 cr = CoderResult.malformedForLength(1);
\r
401 toUBytesArray[0] = (byte)b;
\r
407 /* Store the first byte of a multibyte sequence in toUByte[] */
\r
408 toUBytesArray[0] = (byte)b;
\r
413 toUBytesArray[1] = (byte)b;
\r
415 state = quotePairTwo;
\r
418 target.put((char)((byteOne<< 8) | b));
\r
419 if(offsets != null){
\r
420 offsets.put(sourceIndex);
\r
422 sourceIndex = nextSourceIndex;
\r
423 state = readCommand;
\r
424 label = FastSingle;
\r
428 /* all static offsets are in the BMP */
\r
429 target.put((char)(staticOffsets[quoteWindow] + b));
\r
430 if(offsets != null){
\r
431 offsets.put(sourceIndex);
\r
434 /*write from dynamic window */
\r
435 int c = data.toUDynamicOffsets[quoteWindow] + (b&0x7f);
\r
437 target.put((char)c);
\r
438 if(offsets != null){
\r
439 offsets.put(sourceIndex);
\r
442 /* output surrogate pair */
\r
443 target.put((char)(0xd7c0+(c>>10)));
\r
444 if(target.hasRemaining()){
\r
445 target.put((char)(0xdc00 | (c&0x3ff)));
\r
446 if(offsets != null){
\r
447 offsets.put(sourceIndex);
\r
448 offsets.put(sourceIndex);
\r
451 /* target overflow */
\r
452 if(offsets != null){
\r
453 offsets.put(sourceIndex);
\r
455 charErrorBufferArray[0] = (char)(0xdc00 | (c&0x3ff));
\r
456 charErrorBufferLength = 1;
\r
458 cr = CoderResult.OVERFLOW;
\r
464 sourceIndex = nextSourceIndex;
\r
465 state = readCommand;
\r
466 label = FastSingle;
\r
468 case definePairOne:
\r
469 dynamicWindow = (byte)((b>>5)&7);
\r
470 byteOne = (byte)(b&0x1f);
\r
471 toUBytesArray[1] = (byte)b;
\r
473 state = definePairTwo;
\r
475 case definePairTwo:
\r
476 data.toUDynamicOffsets[dynamicWindow] = 0x10000 + (byteOne<<15L | b<<7L);
\r
477 sourceIndex = nextSourceIndex;
\r
478 state = readCommand;
\r
479 label = FastSingle;
\r
483 /*callback (illegal)*/
\r
484 toUBytesArray[1] = (byte)b;
\r
488 }else if(b<gapThreshold){
\r
489 data.toUDynamicOffsets[dynamicWindow] = b<<7L;
\r
490 }else if(((b - gapThreshold)&UConverterConstants.UNSIGNED_BYTE_MASK)<(reservedStart - gapThreshold)){
\r
491 data.toUDynamicOffsets[dynamicWindow] = (b<<7L) + gapOffset;
\r
492 }else if(b>=fixedThreshold){
\r
493 data.toUDynamicOffsets[dynamicWindow] = fixedOffsets[b-fixedThreshold];
\r
495 /*callback (illegal)*/
\r
496 toUBytesArray[1] = (byte)b;
\r
501 sourceIndex = nextSourceIndex;
\r
502 state = readCommand;
\r
503 label = FastSingle;
\r
508 }else if(modeType==UnicodeMode){
\r
509 while(source.hasRemaining()){
\r
510 if(!target.hasRemaining()){
\r
511 cr = CoderResult.OVERFLOW;
\r
515 b = (short)(source.get() & UConverterConstants.UNSIGNED_BYTE_MASK);
\r
519 if((short)((b -UC0)&UConverterConstants.UNSIGNED_BYTE_MASK)>(Urs - UC0)){
\r
521 toUBytesArray[0] = (byte)b;
\r
523 state = quotePairTwo;
\r
524 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UC7){
\r
525 dynamicWindow = (byte)(b - UC0);
\r
526 sourceIndex = nextSourceIndex;
\r
527 isSingleByteMode = true;
\r
528 label = FastSingle;
\r
530 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) <= UD7){
\r
531 dynamicWindow = (byte)(b - UD0);
\r
532 isSingleByteMode = true;
\r
533 toUBytesArray[0] = (byte)b;
\r
536 label = SingleByteMode;
\r
538 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UDX){
\r
539 isSingleByteMode = true;
\r
540 toUBytesArray[0] = (byte)b;
\r
542 state = definePairOne;
\r
543 label = SingleByteMode;
\r
545 }else if((b&UConverterConstants.UNSIGNED_BYTE_MASK) == UQU){
\r
546 toUBytesArray[0] = (byte)b;
\r
548 state = quotePairOne;
\r
550 /* callback (illegal)*/
\r
551 cr = CoderResult.malformedForLength(1);
\r
552 toUBytesArray[0] = (byte)b;
\r
560 toUBytesArray[1] = (byte)b;
\r
562 state = quotePairTwo;
\r
565 target.put((char)((byteOne<<8) | b));
\r
566 if(offsets != null){
\r
567 offsets.put(sourceIndex);
\r
569 sourceIndex = nextSourceIndex;
\r
570 state = readCommand;
\r
571 label = FastSingle;
\r
580 private void endLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets){
\r
581 if(cr==CoderResult.OVERFLOW){
\r
582 state = readCommand;
\r
583 }else if(state == readCommand){
\r
586 data.toUIsSingleByteMode = isSingleByteMode;
\r
587 data.toUState = state;
\r
588 data.toUQuoteWindow = quoteWindow;
\r
589 data.toUDynamicWindow = dynamicWindow;
\r
590 data.toUByteOne = byteOne;
\r
595 class CharsetEncoderSCSU extends CharsetEncoderICU{
\r
596 public CharsetEncoderSCSU(CharsetICU cs) {
\r
597 super(cs, fromUSubstitution);
\r
601 //private SCSUData data;
\r
602 protected void implReset() {
\r
604 extraInfo.initialize();
\r
607 /* label values for supporting behavior similar to goto in C */
\r
608 private static final int Loop=0;
\r
609 private static final int GetTrailUnicode=1;
\r
610 private static final int OutputBytes=2;
\r
611 private static final int EndLoop =3;
\r
614 private int length;
\r
616 ///variables of compression heuristics
\r
617 private int offset;
\r
618 private char lead, trail;
\r
620 private byte window;
\r
622 //Get the state machine state
\r
623 private boolean isSingleByteMode;
\r
624 private byte dynamicWindow ;
\r
625 private int currentOffset;
\r
630 //sourceIndex=-1 if the current character began in the previous buffer
\r
631 private int sourceIndex ;
\r
632 private int nextSourceIndex;
\r
633 private int targetCapacity;
\r
635 private boolean LabelLoop;//used to break the while loop
\r
636 private boolean AfterGetTrail;// its value is set to true in order to ignore the code before getTrailSingle:
\r
637 private boolean AfterGetTrailUnicode;// its value is set to true in order to ignore the code before getTrailUnicode:
\r
641 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
\r
643 cr = CoderResult.UNDERFLOW;
\r
645 //Get the state machine state
\r
646 isSingleByteMode = data.fromUIsSingleByteMode;
\r
647 dynamicWindow = data.fromUDynamicWindow;
\r
648 currentOffset = data.fromUDynamicOffsets[dynamicWindow];
\r
651 sourceIndex = c== 0 ? 0: -1 ;
\r
652 nextSourceIndex = 0;
\r
655 targetCapacity = target.limit()-target.position();
\r
657 //sourceIndex=-1 if the current character began in the previous buffer
\r
658 sourceIndex = c== 0 ? 0: -1 ;
\r
659 nextSourceIndex = 0;
\r
661 int labelType = Loop; // set to Loop so that the code starts from loop:
\r
663 AfterGetTrail = false;
\r
664 AfterGetTrailUnicode = false;
\r
669 labelType = loop(source, target, offsets);
\r
671 case GetTrailUnicode:
\r
672 labelType = getTrailUnicode(source, target, offsets);
\r
675 labelType = outputBytes(source, target, offsets);
\r
678 endLoop(source, target, offsets);
\r
685 private byte getWindow(int[] offsets){
\r
688 if(((c-offsets[i]) & UConverterConstants.UNSIGNED_INT_MASK) <= 0x7f){
\r
695 private boolean isInOffsetWindowOrDirect(int offsetValue, int a){
\r
696 return (a & UConverterConstants.UNSIGNED_INT_MASK)<=(offsetValue & UConverterConstants.UNSIGNED_INT_MASK)+0x7f &
\r
697 ((a & UConverterConstants.UNSIGNED_INT_MASK)>=(offsetValue & UConverterConstants.UNSIGNED_INT_MASK) ||
\r
698 ((a & UConverterConstants.UNSIGNED_INT_MASK)<=0x7f && ((a & UConverterConstants.UNSIGNED_INT_MASK)>=0x20
\r
699 || ((1L<<(a & UConverterConstants.UNSIGNED_INT_MASK))&0x2601)!=0)));
\r
702 private byte getNextDynamicWindow(){
\r
703 byte windowValue = data.windowUse[data.nextWindowUseIndex];
\r
704 if(++data.nextWindowUseIndex==8){
\r
705 data.nextWindowUseIndex=0;
\r
707 return windowValue;
\r
710 private void useDynamicWindow(byte windowValue){
\r
711 /*first find the index of the window*/
\r
713 i = data.nextWindowUseIndex;
\r
718 }while(data.windowUse[i]!=windowValue);
\r
720 /*now copy each window[i+1] to [i]*/
\r
725 while(j!=data.nextWindowUseIndex){
\r
726 data.windowUse[i] = data.windowUse[j];
\r
733 /*finally, set the window into the most recently used index*/
\r
734 data.windowUse[i]= windowValue;
\r
738 private int getDynamicOffset(){
\r
741 if(((c-fixedOffsets[i])&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){
\r
742 offset = fixedOffsets[i];
\r
746 if((c&UConverterConstants.UNSIGNED_INT_MASK)<0x80){
\r
747 /*No dynamic window for US-ASCII*/
\r
749 }else if((c&UConverterConstants.UNSIGNED_INT_MASK)<0x3400 || ((c-0x10000)&UConverterConstants.UNSIGNED_INT_MASK)<(0x14000-0x10000) ||
\r
750 ((c-0x1d000)&UConverterConstants.UNSIGNED_INT_MASK)<=(0x1ffff-0x1d000)){
\r
751 /*This character is in the code range for a "small", i.e, reasonably windowable, script*/
\r
752 offset = c&0x7fffff80;
\r
754 }else if(0xe000<=(c&UConverterConstants.UNSIGNED_INT_MASK) && (c&UConverterConstants.UNSIGNED_INT_MASK)!=0xfeff && (c&UConverterConstants.UNSIGNED_INT_MASK) < 0xfff0){
\r
755 /*for these characters we need to take the gapOffset into account*/
\r
756 offset=(c)&0x7fffff80;
\r
757 return ((c-gapOffset)>>7);
\r
763 private int loop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
765 if(isSingleByteMode){
\r
766 if(c!=0 && targetCapacity>0 && !AfterGetTrail){
\r
767 label = getTrail(source, target, offsets);
\r
770 /*state machine for single byte mode*/
\r
771 while(AfterGetTrail || source.hasRemaining()){
\r
772 if(targetCapacity<=0 && !AfterGetTrail){
\r
774 cr = CoderResult.OVERFLOW;
\r
778 if(!AfterGetTrail){
\r
783 if(((c -0x20)&UConverterConstants.UNSIGNED_INT_MASK)<=0x5f && !AfterGetTrail){
\r
784 /*pass US-ASCII graphic character through*/
\r
785 target.put((byte)c);
\r
787 offsets.put(sourceIndex);
\r
790 }else if((c & UConverterConstants.UNSIGNED_INT_MASK)<0x20 && !AfterGetTrail){
\r
791 if(((1L<<(c & UConverterConstants.UNSIGNED_INT_MASK))&0x2601)!=0){
\r
793 target.put((byte)c);
\r
795 offsets.put(sourceIndex);
\r
799 /*quote c0 control character*/
\r
802 label = OutputBytes;
\r
805 } else if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f && !AfterGetTrail){
\r
806 /*use the current dynamic window*/
\r
807 target.put((byte)(delta|0x80));
\r
809 offsets.put(sourceIndex);
\r
812 } else if(AfterGetTrail || UTF16.isSurrogate((char)c)){
\r
813 if(!AfterGetTrail){
\r
814 if(UTF16.isLeadSurrogate((char)c)){
\r
815 label = getTrail(source, target, offsets);
\r
816 if(label==EndLoop){
\r
820 /*this is an unmatched trail code unit (2nd Surrogate)*/
\r
821 /*callback(illegal)*/
\r
822 cr = CoderResult.malformedForLength(1);
\r
830 AfterGetTrail = false;
\r
833 /*Compress supplementary character U+10000...U+10ffff */
\r
834 if(((delta=(c-currentOffset))&UConverterConstants.UNSIGNED_INT_MASK)<=0x7f){
\r
835 /*use the current dynamic window*/
\r
836 target.put((byte)(delta|0x80));
\r
838 offsets.put(sourceIndex);
\r
841 } else if((window=getWindow(data.fromUDynamicOffsets))>=0){
\r
842 /*there is a dynamic window that contains this character, change to it*/
\r
843 dynamicWindow = window;
\r
844 currentOffset = data.fromUDynamicOffsets[dynamicWindow];
\r
845 useDynamicWindow(dynamicWindow);
\r
846 c = ((SC0+dynamicWindow)<<8 | (c-currentOffset)|0x80);
\r
848 label = OutputBytes;
\r
850 } else if((code=getDynamicOffset())>=0){
\r
851 /*might check if there are come character in this window to come */
\r
852 /*define an extended window with this character*/
\r
854 dynamicWindow=getNextDynamicWindow();
\r
855 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset;
\r
856 useDynamicWindow(dynamicWindow);
\r
857 c = ((SDX<<24) | (dynamicWindow<<21)|
\r
858 (code<<8)| (c- currentOffset) |0x80);
\r
859 // c = (((SDX)<<25) | (dynamicWindow<<21)|
\r
860 // (code<<8)| (c- currentOffset) |0x80 );
\r
862 label = OutputBytes;
\r
865 /*change to unicode mode and output this (lead, trail) pair*/
\r
866 isSingleByteMode = false;
\r
867 target.put((byte)SCU);
\r
869 offsets.put(sourceIndex);
\r
872 c = (lead<<16)|trail;
\r
874 label = OutputBytes;
\r
877 } else if((c&UConverterConstants.UNSIGNED_INT_MASK)<0xa0){
\r
878 /*quote C1 control character*/
\r
879 c = (c&0x7f) | (SQ0+1)<<8; /*SQ0+1 == SQ1*/
\r
881 label = OutputBytes;
\r
883 } else if((c&UConverterConstants.UNSIGNED_INT_MASK)==0xfeff || (c&UConverterConstants.UNSIGNED_INT_MASK)>= 0xfff0){
\r
884 /*quote signature character = byte order mark and specials*/
\r
887 label = OutputBytes;
\r
890 /*compress all other BMP characters*/
\r
891 if((window=getWindow(data.fromUDynamicOffsets))>=0){
\r
892 /*there is a window defined that contains this character - switch to it or quote from it*/
\r
893 if(source.position()>=source.limit() || isInOffsetWindowOrDirect(data.fromUDynamicOffsets[window], source.get(source.position()))){
\r
894 /*change to dynamic window*/
\r
895 dynamicWindow = window;
\r
896 currentOffset = data.fromUDynamicOffsets[dynamicWindow];
\r
897 useDynamicWindow(dynamicWindow);
\r
898 c = ((SC0+window)<<8) | (c- currentOffset) | 0x80;
\r
900 label = OutputBytes;
\r
903 /*quote from dynamic window*/
\r
904 c = ((SQ0+window)<<8) | (c - data.fromUDynamicOffsets[window]) |
\r
907 label = OutputBytes;
\r
910 } else if((window = getWindow(staticOffsets))>=0){
\r
911 /*quote from static window*/
\r
912 c = ((SQ0+window)<<8) | (c - staticOffsets[window]);
\r
914 label = OutputBytes;
\r
916 }else if((code=getDynamicOffset())>=0){
\r
917 /*define a dynamic window with this character*/
\r
918 dynamicWindow = getNextDynamicWindow();
\r
919 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset;
\r
920 useDynamicWindow(dynamicWindow);
\r
921 c = ((SD0+dynamicWindow)<<16) | (code<<8)|
\r
922 (c - currentOffset) | 0x80;
\r
924 label = OutputBytes;
\r
926 } else if(((int)((c-0x3400)&UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400) && (source.position()>=source.limit() ||
\r
927 ((int)((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK))< (0xd800 - 0x3400))){
\r
930 * this character is not compressible (a BMP ideograph or similar)
\r
931 * switch to Unicode mode if this is the last character in the block
\r
932 * or there is at least one more ideograph following immediately
\r
934 isSingleByteMode = false;
\r
937 label = OutputBytes;
\r
943 label = OutputBytes;
\r
947 /*normal end of conversion : prepare for new character */
\r
949 sourceIndex = nextSourceIndex;
\r
952 if(c!=0 && targetCapacity>0 && !AfterGetTrailUnicode){
\r
953 label = GetTrailUnicode;
\r
957 /*state machine for Unicode*/
\r
958 /*unicodeByteMode*/
\r
959 while(AfterGetTrailUnicode || source.hasRemaining()){
\r
960 if(targetCapacity<=0 && !AfterGetTrailUnicode){
\r
962 cr = CoderResult.OVERFLOW;
\r
966 if(!AfterGetTrailUnicode){
\r
971 if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400) && !AfterGetTrailUnicode){
\r
972 /*not compressible, write character directly */
\r
973 if(targetCapacity>=2){
\r
974 target.put((byte)(c>>8));
\r
975 target.put((byte)c);
\r
977 offsets.put(sourceIndex);
\r
978 offsets.put(sourceIndex);
\r
983 label = OutputBytes;
\r
986 } else if((((c-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300*/&& !AfterGetTrailUnicode){
\r
987 /*compress BMP character if the following one is not an uncompressible ideograph*/
\r
988 if(!(source.hasRemaining() && (((source.get(source.position())-0x3400)& UConverterConstants.UNSIGNED_INT_MASK))<(0xd800-0x3400))){
\r
989 if(((((c-0x30)&UConverterConstants.UNSIGNED_INT_MASK))<10 || (((c-0x61)&UConverterConstants.UNSIGNED_INT_MASK))<26
\r
990 || (((c-0x41)&UConverterConstants.UNSIGNED_INT_MASK))<26)){
\r
991 /*ASCII digit or letter*/
\r
992 isSingleByteMode = true;
\r
993 c |=((UC0+dynamicWindow)<<8)|c;
\r
995 label = OutputBytes;
\r
997 } else if((window=getWindow(data.fromUDynamicOffsets))>=0){
\r
998 /*there is a dynamic window that contains this character, change to it*/
\r
999 isSingleByteMode = true;
\r
1000 dynamicWindow = window;
\r
1001 currentOffset = data.fromUDynamicOffsets[dynamicWindow];
\r
1002 useDynamicWindow(dynamicWindow);
\r
1003 c = ((UC0+dynamicWindow)<<8) | (c- currentOffset) | 0x80;
\r
1005 label = OutputBytes;
\r
1007 } else if((code=getDynamicOffset())>=0){
\r
1008 /*define a dynamic window with this character*/
\r
1009 isSingleByteMode = true;
\r
1010 dynamicWindow = getNextDynamicWindow();
\r
1011 currentOffset = data.fromUDynamicOffsets[dynamicWindow]=offset;
\r
1012 useDynamicWindow(dynamicWindow);
\r
1013 c = ((UD0+dynamicWindow)<<16) | (code<<8)
\r
1014 |(c - currentOffset) | 0x80;
\r
1016 label = OutputBytes;
\r
1021 /*don't know how to compress this character, just write it directly*/
\r
1023 label = OutputBytes;
\r
1025 } else if(c<0xe000 && !AfterGetTrailUnicode){
\r
1026 label = GetTrailUnicode;
\r
1028 } else if (!AfterGetTrailUnicode){
\r
1029 /*quote to avoid SCSU tags*/
\r
1032 label = OutputBytes;
\r
1036 if(AfterGetTrailUnicode){
\r
1037 AfterGetTrailUnicode = false;
\r
1039 /*normal end of conversion, prepare for a new character*/
\r
1041 sourceIndex = nextSourceIndex;
\r
1048 private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
1051 if(source.hasRemaining()){
\r
1052 /*test the following code unit*/
\r
1053 trail = source.get(source.position());
\r
1054 if(UTF16.isTrailSurrogate(trail)){
\r
1055 source.position(source.position()+1);
\r
1056 ++nextSourceIndex;
\r
1057 c = UCharacter.getCodePoint((char)c, trail);
\r
1060 /*this is unmatched lead code unit (1st Surrogate)*/
\r
1061 /*callback(illegal)*/
\r
1062 cr = CoderResult.malformedForLength(1);
\r
1069 AfterGetTrail = true;
\r
1073 private int getTrailUnicode(CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
1074 int label = EndLoop;
\r
1075 AfterGetTrailUnicode = true;
\r
1076 /*c is surrogate*/
\r
1077 if(UTF16.isLeadSurrogate((char)c)){
\r
1078 // getTrailUnicode:
\r
1080 if(source.hasRemaining()){
\r
1081 /*test the following code unit*/
\r
1082 trail = source.get(source.position());
\r
1083 if(UTF16.isTrailSurrogate(trail)){
\r
1085 ++nextSourceIndex;
\r
1086 c = UCharacter.getCodePoint((char)c, trail);
\r
1087 /*convert this surrogate code point*/
\r
1088 /*exit this condition tree*/
\r
1090 /*this is unmatched lead code unit(1st surrogate)*/
\r
1091 /*callback(illegal)*/
\r
1092 cr = CoderResult.malformedForLength(1);
\r
1102 /*this is an unmatched trail code point (2nd surrogate)*/
\r
1103 /*callback (illegal)*/
\r
1104 cr = CoderResult.malformedForLength(1);
\r
1109 /*compress supplementary character*/
\r
1110 if((window=getWindow(data.fromUDynamicOffsets))>=0 &&
\r
1111 !(source.hasRemaining() && ((source.get(source.position())-0x3400)&UConverterConstants.UNSIGNED_INT_MASK) <
\r
1112 (0xd800 - 0x3400))){
\r
1114 * there is a dynamic window that contains this character and the following
\r
1115 * character is not uncompressible,
\r
1116 * change to the window
\r
1118 isSingleByteMode = true;
\r
1119 dynamicWindow = window;
\r
1120 currentOffset = data.fromUDynamicOffsets[dynamicWindow];
\r
1121 useDynamicWindow(dynamicWindow);
\r
1122 c = ((UC0+dynamicWindow)<<8 | (c-currentOffset) | 0x80);
\r
1124 label = OutputBytes;
\r
1126 } else if(source.hasRemaining() && lead == source.get(source.position()) && (code=getDynamicOffset())>=0){
\r
1127 /*two supplementary characters in (probably) the same window - define an extended one*/
\r
1128 isSingleByteMode = true;
\r
1129 dynamicWindow = getNextDynamicWindow();
\r
1130 currentOffset = data.fromUDynamicOffsets[dynamicWindow] = offset;
\r
1131 useDynamicWindow(dynamicWindow);
\r
1132 c = (UDX<<24) | (dynamicWindow<<21) |(code<<8) |(c - currentOffset) | 0x80;
\r
1134 label = OutputBytes;
\r
1137 /*don't know how to compress this character, just write it directly*/
\r
1138 c = (lead<<16)|trail;
\r
1140 label = OutputBytes;
\r
1146 private void endLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
1147 /*set the converter state back to UConverter*/
\r
1148 data.fromUIsSingleByteMode = isSingleByteMode;
\r
1149 data.fromUDynamicWindow = dynamicWindow;
\r
1151 LabelLoop = false;
\r
1154 @SuppressWarnings("fallthrough")
\r
1155 private int outputBytes(CharBuffer source, ByteBuffer target, IntBuffer offsets){
\r
1157 //int targetCapacity = target.limit()-target.position();
\r
1158 /*write the output character byte from c and length*/
\r
1159 /*from the first if in the loop we know that targetCapacity>0*/
\r
1160 if(length<=targetCapacity){
\r
1162 /*each branch falls through the next one*/
\r
1164 target.put((byte)(c>>24));
\r
1165 if(offsets!=null){
\r
1166 offsets.put(sourceIndex);
\r
1169 target.put((byte)(c>>16));
\r
1170 if(offsets!=null){
\r
1171 offsets.put(sourceIndex);
\r
1174 target.put((byte)(c>>8));
\r
1175 if(offsets!=null){
\r
1176 offsets.put(sourceIndex);
\r
1179 target.put((byte)c);
\r
1180 if(offsets!=null){
\r
1181 offsets.put(sourceIndex);
\r
1184 /*will never occur*/
\r
1187 targetCapacity-=length;
\r
1189 /*normal end of conversion: prepare for a new character*/
\r
1191 sourceIndex = nextSourceIndex;
\r
1195 ByteBuffer p = ByteBuffer.wrap(errorBuffer);
\r
1197 * We actually do this backwards here:
\r
1198 * In order to save an intermediate variable, we output
\r
1199 * first to the overflow buffer what does not fit into the
\r
1202 /* we know that 0<=targetCapacity<length<=4 */
\r
1203 /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
\r
1204 length -= targetCapacity;
\r
1206 /*each branch falls through the next one*/
\r
1208 p.put((byte)(c>>24));
\r
1210 p.put((byte)(c>>16));
\r
1212 p.put((byte)(c>>8));
\r
1216 /*will never occur*/
\r
1219 errorBufferLength = length;
\r
1221 /*now output what fits into the regular target*/
\r
1222 c>>=8*length; //length was reduced by targetCapacity
\r
1223 switch(targetCapacity){
\r
1224 /*each branch falls through the next one*/
\r
1226 target.put((byte)(c>>16));
\r
1227 if(offsets!=null){
\r
1228 offsets.put(sourceIndex);
\r
1231 target.put((byte)(c>>8));
\r
1232 if(offsets!=null){
\r
1233 offsets.put(sourceIndex);
\r
1236 target.put((byte)c);
\r
1237 if(offsets!=null){
\r
1238 offsets.put(sourceIndex);
\r
1244 /*target overflow*/
\r
1245 targetCapacity = 0;
\r
1246 cr = CoderResult.OVERFLOW;
\r
1255 public CharsetDecoder newDecoder() {
\r
1256 return new CharsetDecoderSCSU(this);
\r
1259 public CharsetEncoder newEncoder() {
\r
1260 return new CharsetEncoderSCSU(this);
\r
1263 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
\r
1264 CharsetICU.getCompleteUnicodeSet(setFillIn);
\r