/* ******************************************************************************* * Copyright (C) 2003-2009, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.impl; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.StringPrepParseException; import com.ibm.icu.text.UTF16; /** * Ported code from ICU punycode.c * @author ram */ /* Package Private class */ public final class Punycode { /* Punycode parameters for Bootstring */ private static final int BASE = 36; private static final int TMIN = 1; private static final int TMAX = 26; private static final int SKEW = 38; private static final int DAMP = 700; private static final int INITIAL_BIAS = 72; private static final int INITIAL_N = 0x80; /* "Basic" Unicode/ASCII code points */ private static final int HYPHEN = 0x2d; private static final int DELIMITER = HYPHEN; private static final int ZERO = 0x30; //private static final int NINE = 0x39; private static final int SMALL_A = 0x61; private static final int SMALL_Z = 0x7a; private static final int CAPITAL_A = 0x41; private static final int CAPITAL_Z = 0x5a; private static final int MAX_CP_COUNT = 200; //private static final int UINT_MAGIC = 0x80000000; //private static final long ULONG_MAGIC = 0x8000000000000000L; private static int adaptBias(int delta, int length, boolean firstTime){ if(firstTime){ delta /=DAMP; }else{ delta /= 2; } delta += delta/length; int count=0; for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { delta/=(BASE-TMIN); } return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); } /** * basicToDigit[] contains the numeric value of a basic code * point (for use in representing integers) in the range 0 to * BASE-1, or -1 if b is does not represent a value. */ static final int[] basicToDigit= new int[]{ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; ///CLOVER:OFF private static char asciiCaseMap(char b, boolean uppercase) { if(uppercase) { if(SMALL_A<=b && b<=SMALL_Z) { b-=(SMALL_A-CAPITAL_A); } } else { if(CAPITAL_A<=b && b<=CAPITAL_Z) { b+=(SMALL_A-CAPITAL_A); } } return b; } ///CLOVER:ON /** * digitToBasic() returns the basic code point whose value * (when used for representing integers) is d, which must be in the * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is * nonzero, in which case the uppercase form is used. */ private static char digitToBasic(int digit, boolean uppercase) { /* 0..25 map to ASCII a..z or A..Z */ /* 26..35 map to ASCII 0..9 */ if(digit<26) { if(uppercase) { return (char)(CAPITAL_A+digit); } else { return (char)(SMALL_A+digit); } } else { return (char)((ZERO-26)+digit); } } /** * Converts Unicode to Punycode. * The input string must not contain single, unpaired surrogates. * The output will be represented as an array of ASCII code points. * * @param src The source of the String Buffer passed. * @param caseFlags The boolean array of case flags. * @return An array of ASCII code points. */ public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws StringPrepParseException{ int[] cpBuffer = new int[MAX_CP_COUNT]; int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; char c, c2; int srcLength = src.length(); int destCapacity = MAX_CP_COUNT; char[] dest = new char[destCapacity]; StringBuffer result = new StringBuffer(); /* * Handle the basic code points and * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): */ srcCPCount=destLength=0; for(j=0; j0) { if(destLength state to , but guard against overflow: */ if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { throw new IllegalStateException("Internal program error"); } delta+=(m-n)*(handledCPCount+1); n=m; /* Encode a sequence of same code points n */ for(j=0; jTMAX) { t=TMAX; } */ t=k-bias; if(t=(bias+TMAX)) { t=TMAX; } if(q= CAPITAL_Z); } ///CLOVER:ON private static boolean isSurrogate(int ch){ return (((ch)&0xfffff800)==0xd800); } /** * Converts Punycode to Unicode. * The Unicode string will be at most as long as the Punycode string. * * @param src The source of the string buffer being passed. * @param caseFlags The array of boolean case flags. * @return StringBuffer string. */ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) throws StringPrepParseException{ int srcLength = src.length(); StringBuffer result = new StringBuffer(); int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, destCPCount, firstSupplementaryIndex, cpLength; char b; int destCapacity = MAX_CP_COUNT; char[] dest = new char[destCapacity]; /* * Handle the basic code points: * Let basicLength be the number of input code points * before the last delimiter, or 0 if there is none, * then copy the first basicLength code points to the output. * * The two following loops iterate backward. */ for(j=srcLength; j>0;) { if(src.charAt(--j)==DELIMITER) { break; } } destLength=basicLength=destCPCount=j; while(j>0) { b=src.charAt(--j); if(!isBasic(b)) { throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); } if(j0 ? basicLength+1 : 0; in=srcLength) { throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } digit=basicToDigit[src.charAt(in++) & 0xFF]; if(digit<0) { throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); } if(digit>(0x7fffffff-i)/w) { /* integer overflow */ throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } i+=digit*w; t=k-bias; if(t=(bias+TMAX)) { t=TMAX; } if(digit0x7fffffff/(BASE-t)) { /* integer overflow */ throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } w*=BASE-t; } /* * Modification from sample code: * Increments destCPCount here, * where needed instead of in for() loop tail. */ ++destCPCount; bias=adaptBias(i-oldi, destCPCount, (oldi==0)); /* * i was supposed to wrap around from (incremented) destCPCount to 0, * incrementing n each time, so we'll fix that now: */ if(i/destCPCount>(0x7fffffff-n)) { /* integer overflow */ throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } n+=i/destCPCount; i%=destCPCount; /* not needed for Punycode: */ /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ if(n>0x10ffff || isSurrogate(n)) { /* Unicode code point overflow */ throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } /* Insert n at position i of the output: */ cpLength=UTF16.getCharCount(n); if((destLength+cpLength)1) { firstSupplementaryIndex=codeUnitIndex; } else { ++firstSupplementaryIndex; } } else { codeUnitIndex=firstSupplementaryIndex; codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex); } /* use the UChar index codeUnitIndex instead of the code point index i */ if(codeUnitIndex