2 *******************************************************************************
\r
3 * Copyright (C) 2003-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.impl;
\r
9 import com.ibm.icu.lang.UCharacter;
\r
10 import com.ibm.icu.text.StringPrepParseException;
\r
11 import com.ibm.icu.text.UTF16;
\r
14 * Ported code from ICU punycode.c
\r
18 /* Package Private class */
\r
19 public final class Punycode {
\r
21 /* Punycode parameters for Bootstring */
\r
22 private static final int BASE = 36;
\r
23 private static final int TMIN = 1;
\r
24 private static final int TMAX = 26;
\r
25 private static final int SKEW = 38;
\r
26 private static final int DAMP = 700;
\r
27 private static final int INITIAL_BIAS = 72;
\r
28 private static final int INITIAL_N = 0x80;
\r
30 /* "Basic" Unicode/ASCII code points */
\r
31 private static final int HYPHEN = 0x2d;
\r
32 private static final int DELIMITER = HYPHEN;
\r
34 private static final int ZERO = 0x30;
\r
35 //private static final int NINE = 0x39;
\r
37 private static final int SMALL_A = 0x61;
\r
38 private static final int SMALL_Z = 0x7a;
\r
40 private static final int CAPITAL_A = 0x41;
\r
41 private static final int CAPITAL_Z = 0x5a;
\r
42 private static final int MAX_CP_COUNT = 200;
\r
43 //private static final int UINT_MAGIC = 0x80000000;
\r
44 //private static final long ULONG_MAGIC = 0x8000000000000000L;
\r
46 private static int adaptBias(int delta, int length, boolean firstTime){
\r
52 delta += delta/length;
\r
55 for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
\r
59 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
\r
63 * basicToDigit[] contains the numeric value of a basic code
\r
64 * point (for use in representing integers) in the range 0 to
\r
65 * BASE-1, or -1 if b is does not represent a value.
\r
67 static final int[] basicToDigit= new int[]{
\r
68 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
69 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
71 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
72 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
\r
74 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
\r
75 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
\r
77 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
\r
78 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
\r
80 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
81 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
83 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
84 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
86 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
87 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
89 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
\r
90 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
\r
94 private static char asciiCaseMap(char b, boolean uppercase) {
\r
96 if(SMALL_A<=b && b<=SMALL_Z) {
\r
97 b-=(SMALL_A-CAPITAL_A);
\r
100 if(CAPITAL_A<=b && b<=CAPITAL_Z) {
\r
101 b+=(SMALL_A-CAPITAL_A);
\r
108 * digitToBasic() returns the basic code point whose value
\r
109 * (when used for representing integers) is d, which must be in the
\r
110 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
\r
111 * nonzero, in which case the uppercase form is used.
\r
113 private static char digitToBasic(int digit, boolean uppercase) {
\r
114 /* 0..25 map to ASCII a..z or A..Z */
\r
115 /* 26..35 map to ASCII 0..9 */
\r
118 return (char)(CAPITAL_A+digit);
\r
120 return (char)(SMALL_A+digit);
\r
123 return (char)((ZERO-26)+digit);
\r
127 * Converts Unicode to Punycode.
\r
128 * The input string must not contain single, unpaired surrogates.
\r
129 * The output will be represented as an array of ASCII code points.
\r
131 * @param src The source of the String Buffer passed.
\r
132 * @param caseFlags The boolean array of case flags.
\r
133 * @return An array of ASCII code points.
\r
135 public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws StringPrepParseException{
\r
137 int[] cpBuffer = new int[MAX_CP_COUNT];
\r
138 int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
\r
140 int srcLength = src.length();
\r
141 int destCapacity = MAX_CP_COUNT;
\r
142 char[] dest = new char[destCapacity];
\r
143 StringBuffer result = new StringBuffer();
\r
145 * Handle the basic code points and
\r
146 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
\r
148 srcCPCount=destLength=0;
\r
150 for(j=0; j<srcLength; ++j) {
\r
151 if(srcCPCount==MAX_CP_COUNT) {
\r
152 /* too many input code points */
\r
153 throw new IndexOutOfBoundsException();
\r
157 if(destLength<destCapacity) {
\r
158 cpBuffer[srcCPCount++]=0;
\r
161 asciiCaseMap(c, caseFlags[j]) :
\r
166 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
\r
167 if(!UTF16.isSurrogate(c)) {
\r
169 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
\r
172 n|=UCharacter.getCodePoint(c, c2);
\r
174 /* error: unmatched surrogate */
\r
175 throw new StringPrepParseException("Illegal char found",StringPrepParseException.ILLEGAL_CHAR_FOUND);
\r
177 cpBuffer[srcCPCount++]=n;
\r
181 /* Finish the basic string - if it is not empty - with a delimiter. */
\r
182 basicLength=destLength;
\r
183 if(basicLength>0) {
\r
184 if(destLength<destCapacity) {
\r
185 dest[destLength]=DELIMITER;
\r
191 * handledCPCount is the number of code points that have been handled
\r
192 * basicLength is the number of basic code points
\r
193 * destLength is the number of chars that have been output
\r
196 /* Initialize the state: */
\r
201 /* Main encoding loop: */
\r
202 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
\r
204 * All non-basic code points < n have been handled already.
\r
205 * Find the next larger one:
\r
207 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
\r
208 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
\r
215 * Increase delta enough to advance the decoder's
\r
216 * <n,i> state to <m,0>, but guard against overflow:
\r
218 if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
\r
219 throw new IllegalStateException("Internal program error");
\r
221 delta+=(m-n)*(handledCPCount+1);
\r
224 /* Encode a sequence of same code points n */
\r
225 for(j=0; j<srcCPCount; ++j) {
\r
226 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
\r
230 /* Represent delta as a generalized variable-length integer: */
\r
231 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
\r
233 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
\r
238 } else if(t>TMAX) {
\r
246 } else if(k>=(bias+TMAX)) {
\r
254 if(destLength<destCapacity) {
\r
255 dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
\r
260 if(destLength<destCapacity) {
\r
261 dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
\r
263 bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
\r
273 return result.append(dest, 0, destLength);
\r
276 private static boolean isBasic(int ch){
\r
277 return (ch < INITIAL_N);
\r
280 private static boolean isBasicUpperCase(int ch){
\r
281 return( CAPITAL_A<=ch && ch >= CAPITAL_Z);
\r
284 private static boolean isSurrogate(int ch){
\r
285 return (((ch)&0xfffff800)==0xd800);
\r
288 * Converts Punycode to Unicode.
\r
289 * The Unicode string will be at most as long as the Punycode string.
\r
291 * @param src The source of the string buffer being passed.
\r
292 * @param caseFlags The array of boolean case flags.
\r
293 * @return StringBuffer string.
\r
295 public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
\r
296 throws StringPrepParseException{
\r
297 int srcLength = src.length();
\r
298 StringBuffer result = new StringBuffer();
\r
299 int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
\r
300 destCPCount, firstSupplementaryIndex, cpLength;
\r
302 int destCapacity = MAX_CP_COUNT;
\r
303 char[] dest = new char[destCapacity];
\r
306 * Handle the basic code points:
\r
307 * Let basicLength be the number of input code points
\r
308 * before the last delimiter, or 0 if there is none,
\r
309 * then copy the first basicLength code points to the output.
\r
311 * The two following loops iterate backward.
\r
313 for(j=srcLength; j>0;) {
\r
314 if(src.charAt(--j)==DELIMITER) {
\r
318 destLength=basicLength=destCPCount=j;
\r
323 throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND);
\r
326 if(j<destCapacity) {
\r
329 if(caseFlags!=null) {
\r
330 caseFlags[j]=isBasicUpperCase(b);
\r
335 /* Initialize the state: */
\r
339 firstSupplementaryIndex=1000000000;
\r
342 * Main decoding loop:
\r
343 * Start just after the last delimiter if any
\r
344 * basic code points were copied; start at the beginning otherwise.
\r
346 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
\r
348 * in is the index of the next character to be consumed, and
\r
349 * destCPCount is the number of code points in the output array.
\r
351 * Decode a generalized variable-length integer into delta,
\r
352 * which gets added to i. The overflow checking is easier
\r
353 * if we increase i as we go, then subtract off its starting
\r
354 * value at the end to obtain delta.
\r
356 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
\r
357 if(in>=srcLength) {
\r
358 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
\r
361 digit=basicToDigit[src.charAt(in++) & 0xFF];
\r
363 throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
\r
365 if(digit>(0x7fffffff-i)/w) {
\r
366 /* integer overflow */
\r
367 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
\r
374 } else if(k>=(bias+TMAX)) {
\r
381 if(w>0x7fffffff/(BASE-t)) {
\r
382 /* integer overflow */
\r
383 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
\r
389 * Modification from sample code:
\r
390 * Increments destCPCount here,
\r
391 * where needed instead of in for() loop tail.
\r
394 bias=adaptBias(i-oldi, destCPCount, (oldi==0));
\r
397 * i was supposed to wrap around from (incremented) destCPCount to 0,
\r
398 * incrementing n each time, so we'll fix that now:
\r
400 if(i/destCPCount>(0x7fffffff-n)) {
\r
401 /* integer overflow */
\r
402 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
\r
407 /* not needed for Punycode: */
\r
408 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
\r
410 if(n>0x10ffff || isSurrogate(n)) {
\r
411 /* Unicode code point overflow */
\r
412 throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
\r
415 /* Insert n at position i of the output: */
\r
416 cpLength=UTF16.getCharCount(n);
\r
417 if((destLength+cpLength)<destCapacity) {
\r
421 * Handle indexes when supplementary code points are present.
\r
423 * In almost all cases, there will be only BMP code points before i
\r
424 * and even in the entire string.
\r
425 * This is handled with the same efficiency as with UTF-32.
\r
427 * Only the rare cases with supplementary code points are handled
\r
428 * more slowly - but not too bad since this is an insertion anyway.
\r
430 if(i<=firstSupplementaryIndex) {
\r
433 firstSupplementaryIndex=codeUnitIndex;
\r
435 ++firstSupplementaryIndex;
\r
438 codeUnitIndex=firstSupplementaryIndex;
\r
439 codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
\r
442 /* use the UChar index codeUnitIndex instead of the code point index i */
\r
443 if(codeUnitIndex<destLength) {
\r
444 System.arraycopy(dest, codeUnitIndex,
\r
445 dest, codeUnitIndex+cpLength,
\r
446 (destLength-codeUnitIndex));
\r
447 if(caseFlags!=null) {
\r
448 System.arraycopy(caseFlags, codeUnitIndex,
\r
449 caseFlags, codeUnitIndex+cpLength,
\r
450 destLength-codeUnitIndex);
\r
454 /* BMP, insert one code unit */
\r
455 dest[codeUnitIndex]=(char)n;
\r
457 /* supplementary character, insert two code units */
\r
458 dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
\r
459 dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
\r
461 if(caseFlags!=null) {
\r
462 /* Case of last character determines uppercase flag: */
\r
463 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
\r
465 caseFlags[codeUnitIndex+1]=false;
\r
469 destLength+=cpLength;
\r
472 result.append(dest, 0, destLength);
\r