2 *******************************************************************************
3 * Copyright (C) 2003-2010, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
7 package com.ibm.icu.impl;
9 import com.ibm.icu.text.IDNA;
10 import com.ibm.icu.text.StringPrep;
11 import com.ibm.icu.text.StringPrepParseException;
12 import com.ibm.icu.text.UCharacterIterator;
15 * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
16 * while extending that class to support IDNA2008/UTS #46 as well.
17 * @author Ram Viswanadha
19 public final class IDNA2003 {
20 /* IDNA ACE Prefix is "xn--" */
21 private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
22 //private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length;
24 private static final int MAX_LABEL_LENGTH = 63;
25 private static final int HYPHEN = 0x002D;
26 private static final int CAPITAL_A = 0x0041;
27 private static final int CAPITAL_Z = 0x005A;
28 private static final int LOWER_CASE_DELTA = 0x0020;
29 private static final int FULL_STOP = 0x002E;
30 private static final int MAX_DOMAIN_NAME_LENGTH = 255;
32 // The NamePrep profile object
// Obtained once for the RFC3491_NAMEPREP profile; convertToASCII and
// convertToUnicode both call namePrep.prepare(...) on their input.
33 private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
/**
 * Tests whether the given label text begins with the ACE prefix.
 * The comparison ignores ASCII case: each source char is passed through
 * toASCIILower before being matched against ACE_PREFIX.
 *
 * @param src the label text to inspect
 * @return the local startsWithPrefix flag, which starts out true and is
 *         cleared as soon as a char differs from the prefix
 */
35 private static boolean startsWithPrefix(StringBuffer src){
36 boolean startsWithPrefix = true;
// a source shorter than the prefix cannot start with it
38 if(src.length() < ACE_PREFIX.length){
// compare the leading chars against the prefix, position by position
41 for(int i=0; i<ACE_PREFIX.length;i++){
42 if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
43 startsWithPrefix = false;
46 return startsWithPrefix;
/**
 * Lower-cases a single ASCII capital letter.
 *
 * @param ch the char to convert
 * @return for a char in the range CAPITAL_A..CAPITAL_Z, that char shifted
 *         up by LOWER_CASE_DELTA
 */
49 private static char toASCIILower(char ch){
// only the 26 ASCII capitals are shifted
50 if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
// the sum is an int, hence the cast back to char
51 return (char)(ch + LOWER_CASE_DELTA);
/**
 * Builds a new StringBuffer that holds every char of src after it has been
 * passed through toASCIILower(char). The input sequence is only read.
 *
 * @param src the text to lower-case
 */
56 private static StringBuffer toASCIILower(CharSequence src){
57 StringBuffer dest = new StringBuffer();
// char-by-char copy; no surrogate handling is attempted here
58 for(int i=0; i<src.length();i++){
59 dest.append(toASCIILower(src.charAt(i)));
/**
 * Compares two buffers position by position, ignoring ASCII case.
 * Callers in this class treat a result of zero as "equal".
 *
 * @param s1 first text to compare
 * @param s2 second text to compare
 */
64 private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
// the loop header has no exit test; termination is decided inside the body
67 for(int i =0;/* no condition */;i++) {
68 /* If we reach the ends of both strings then they match */
69 if(i == s1.length()) {
76 /* Case-insensitive comparison */
// difference of the two ASCII-lowered chars; zero when they agree
78 rc=toASCIILower(c1)-toASCIILower(c2);
/**
 * Scans src forward for the next label separator, as judged by
 * isLabelSeparator.
 * Callers pass src.length as limit and compare the result with src.length
 * to detect that no further separator exists.
 *
 * @param src   the chars to scan
 * @param start index of the first char to examine
 * @param limit index at which scanning stops (exclusive)
 */
86 private static int getSeparatorIndex(char[] src,int start, int limit){
// the start parameter doubles as the scan cursor
87 for(; start<limit;start++){
88 if(isLabelSeparator(src[start])){
92 // we have not found the separator just return length
/**
 * Looks ahead in the iterator for the next label separator without
 * changing the iterator's position: the index read on entry is restored
 * both when a separator is found and after the loop ends without one.
 *
 * @param iter the iterator to scan from its current index
 * @return when a separator is found, the value of iter.getIndex() taken
 *         right after the separator was read
 */
97 private static int getSeparatorIndex(UCharacterIterator iter){
// remember where the caller left the iterator
98 int currentIndex = iter.getIndex();
99 int separatorIndex = 0;
// consume chars until the iterator reports DONE
101 while((ch=iter.next())!= UCharacterIterator.DONE){
102 if(isLabelSeparator(ch)){
103 separatorIndex = iter.getIndex();
// rewind before returning so the caller sees an untouched iterator
104 iter.setIndex(currentIndex);
105 return separatorIndex;
// no separator was found: rewind here as well
109 iter.setIndex(currentIndex);
110 // we have not found the separator just return the length
/**
 * Tests whether a code point belongs to the letter/digit/hyphen set that
 * is spelled out in the bracket expression inside the body.
 *
 * @param ch the code point to test
 */
116 private static boolean isLDHChar(int ch){
121 //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
// the three ranges below are the digits 0-9, the capitals A-Z and the small letters a-z
123 (0x0030 <= ch && ch <= 0x0039) ||
124 (0x0041 <= ch && ch <= 0x005A) ||
125 (0x0061 <= ch && ch <= 0x007A)
133 * Ascertain if the given code point is a label separator as
134 * defined by the IDNA RFC
136 * @param ch The code point to be ascertained
137 * @return true if the char is a label separator
140 private static boolean isLabelSeparator(int ch){
// Both getSeparatorIndex overloads rely on this test to decide where a label ends.
/**
 * Converts the text of one label to its ASCII form, following the numbered
 * steps noted in the body: the label is prepared with the NamePrep profile
 * when it contains non-ASCII, optionally checked against the STD3 ASCII
 * rules, Punycode-encoded with the ACE prefix prepended, and finally
 * checked against MAX_LABEL_LENGTH.
 *
 * @param src     iterator over the label text
 * @param options option bits; IDNA.USE_STD3_RULES is tested here and the
 *                whole value is handed on to namePrep.prepare
 * @return the converted label
 * @throws StringPrepParseException with ZERO_LENGTH_LABEL for an empty
 *         label, STD3_ASCII_RULES_ERROR for an STD3 violation,
 *         ACE_PREFIX_ERROR from the ACE prefix check, or
 *         LABEL_TOO_LONG_ERROR when the result exceeds MAX_LABEL_LENGTH
 */
152 public static StringBuffer convertToASCII(UCharacterIterator src, int options)
153 throws StringPrepParseException{
155 boolean[] caseFlags = null;
157 // the source contains all ascii codepoints
158 boolean srcIsASCII = true;
159 // assume the source contains all LDH codepoints
160 boolean srcIsLDH = true;
// STD3 checking is opt-in through the IDNA.USE_STD3_RULES option bit
163 boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
// first pass over the raw source
166 while((ch = src.next())!= UCharacterIterator.DONE){
173 StringBuffer processOut = null;
174 // step 2 is performed only if the source contains non ASCII
177 processOut = namePrep.prepare(src, options);
// otherwise the source text is copied as it is
179 processOut = new StringBuffer(src.getText());
181 int poLen = processOut.length();
// NOTE(review): "lable" in the message below is a typo; it is runtime text and is left unchanged here
184 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
186 StringBuffer dest = new StringBuffer();
188 // reset the variable to verify if output of prepare is ASCII or not
// second pass, this time over the prepared text
192 for(int j=0;j<poLen;j++ ){
193 ch=processOut.charAt(j);
196 }else if(isLDHChar(ch)==false){
197 // here we do not assemble surrogates
198 // since we know that LDH code points
199 // are in the ASCII range only
205 if(useSTD3ASCIIRules == true){
// three violations are recognised: a non-LDH char, a leading hyphen, a trailing hyphen
207 if( srcIsLDH == false /* source contains some non-LDH characters */
208 || processOut.charAt(0) == HYPHEN
209 || processOut.charAt(processOut.length()-1) == HYPHEN){
211 /* populate the parseError struct */
// reported offset is one before failPos whenever failPos is positive
213 throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
214 StringPrepParseException.STD3_ASCII_RULES_ERROR,
215 processOut.toString(),
216 (failPos>0) ? (failPos-1) : failPos);
// leading hyphen: the error is reported at offset 0
217 }else if(processOut.charAt(0) == HYPHEN){
218 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
219 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
// remaining throw: the error is reported at the last offset of the prepared text
222 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
223 StringPrepParseException.STD3_ASCII_RULES_ERROR,
224 processOut.toString(),
225 (poLen>0) ? poLen-1 : poLen);
233 // step 5 : verify the sequence does not begin with ACE prefix
234 if(!startsWithPrefix(processOut)){
236 //step 6: encode the sequence with punycode
// one flag slot per char of the prepared text, handed to the encoder
237 caseFlags = new boolean[poLen];
239 StringBuilder punyout = Punycode.encode(processOut,caseFlags);
241 // convert all codepoints to lower case ASCII
242 StringBuffer lowerOut = toASCIILower(punyout);
244 //Step 7: prepend the ACE prefix
245 dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
246 //Step 7 (continued): append the lower-cased Punycode output after the prefix
247 dest.append(lowerOut);
// NOTE(review): if this throw is the else-branch of the startsWithPrefix check above,
// the message wording is inverted - the input DOES start with the ACE prefix here. Confirm.
250 throw new StringPrepParseException("The input does not start with the ACE Prefix.",
251 StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
// final guard on the size of the produced label
254 if(dest.length() > MAX_LABEL_LENGTH){
255 throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
256 StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
/**
 * Converts a whole name to ASCII label by label: the input is cut at
 * label separators, each label is converted with convertToASCII, and the
 * converted labels are joined with FULL_STOP.
 *
 * @param src     the name to convert
 * @param options option bits passed unchanged to convertToASCII
 * @return the converted name
 * @throws StringPrepParseException when a label fails conversion, or with
 *         DOMAIN_NAME_TOO_LONG_ERROR when the accumulated result is longer
 *         than MAX_DOMAIN_NAME_LENGTH
 */
261 public static StringBuffer convertIDNToASCII(String src,int options)
262 throws StringPrepParseException{
264 char[] srcArr = src.toCharArray();
265 StringBuffer result = new StringBuffer();
// locate the end of the current label
269 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
270 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
271 //make sure this is not a root label separator.
// an empty label at the very end of the input is not converted
272 if(!(label.length()==0 && sepIndex==srcArr.length)){
273 UCharacterIterator iter = UCharacterIterator.getInstance(label);
274 result.append(convertToASCII(iter,options));
// reaching the end of the array means there are no further labels
276 if(sepIndex==srcArr.length){
280 // increment the sepIndex to skip past the separator
282 oldSepIndex = sepIndex;
// whatever separator was matched in the input, FULL_STOP is what gets written
283 result.append((char)FULL_STOP);
285 if(result.length() > MAX_DOMAIN_NAME_LENGTH){
286 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
/**
 * Converts the text of one label from its ACE form back to Unicode,
 * following the numbered steps noted in the body. Several paths hand back
 * a copy of the original source text instead of a decoded value: a
 * NamePrep failure does so explicitly, and the final statement of the
 * method does so as the fallback.
 *
 * @param src     iterator over the label text
 * @param options option bits passed on to namePrep.prepare and to
 *                convertToASCII
 * @return the decoded label, or a copy of the source text on the
 *         fallback paths
 * @throws StringPrepParseException declared by the signature
 */
291 public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
292 throws StringPrepParseException{
294 boolean[] caseFlags = null;
296 // the source contains all ascii codepoints
297 boolean srcIsASCII = true;
298 // assume the source contains all LDH codepoints
299 //boolean srcIsLDH = true;
302 //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
// remembered so the iterator can be rewound before NamePrep runs
306 int saveIndex = src.getIndex();
307 // step 1: find out if all the codepoints in src are ASCII
308 while((ch=src.next())!= UCharacterIterator.DONE){
311 }/*else if((srcIsLDH = isLDHChar(ch))==false){
312 failPos = src.getIndex();
315 StringBuffer processOut;
317 if(srcIsASCII == false){
319 // step 2: process the string
// rewind first: the scan in step 1 moved the iterator
320 src.setIndex(saveIndex);
321 processOut = namePrep.prepare(src,options);
322 } catch (StringPrepParseException ex) {
// NamePrep failure is not propagated; the original text is handed back
323 return new StringBuffer(src.getText());
327 //just point to source
// (a new StringBuffer holding the source text is created here)
328 processOut = new StringBuffer(src.getText());
331 // The RFC states that
333 // ToUnicode never fails. If any step fails, then the original input
334 // is returned immediately in that step.
337 //step 3: verify ACE Prefix
338 if(startsWithPrefix(processOut)){
// stays null unless the Punycode decoding below succeeds
339 StringBuffer decodeOut = null;
341 //step 4: Remove the ACE Prefix
342 String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
344 //step 5: Decode using punycode
346 decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags));
347 } catch (StringPrepParseException e) {
351 //step 6:Apply toASCII
352 if (decodeOut != null) {
// round trip: re-encode the decoded text so it can be checked against the input
353 StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options);
// a non-zero result means the round trip did not reproduce the prepared input
356 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
357 // throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
358 // StringPrepParseException.VERIFICATION_ERROR);
363 //step 8: return output of step 5
364 if (decodeOut != null) {
370 // // verify that STD3 ASCII rules are satisfied
371 // if(useSTD3ASCIIRules == true){
372 // if( srcIsLDH == false /* source contains some non-LDH characters */
373 // || processOut.charAt(0) == HYPHEN
374 // || processOut.charAt(processOut.length()-1) == HYPHEN){
376 // if(srcIsLDH==false){
377 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
378 // StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
379 // (failPos>0) ? (failPos-1) : failPos);
380 // }else if(processOut.charAt(0) == HYPHEN){
381 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
382 // StringPrepParseException.STD3_ASCII_RULES_ERROR,
383 // processOut.toString(),0);
386 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
387 // StringPrepParseException.STD3_ASCII_RULES_ERROR,
388 // processOut.toString(),
389 // processOut.length());
394 // // just return the source
395 // return new StringBuffer(src.getText());
// fallback: hand back a copy of the original source text
398 return new StringBuffer(src.getText());
/**
 * Converts a whole name to Unicode label by label: the input is cut at
 * label separators, each label is converted with convertToUnicode, and
 * the separator chars found in the input are copied to the output as
 * they are.
 *
 * @param src     the name to convert
 * @param options option bits passed unchanged to convertToUnicode
 * @return the converted name
 * @throws StringPrepParseException with ZERO_LENGTH_LABEL for an empty
 *         label that is not at the end of the input, or with
 *         DOMAIN_NAME_TOO_LONG_ERROR when the accumulated result is longer
 *         than MAX_DOMAIN_NAME_LENGTH
 */
401 public static StringBuffer convertIDNToUnicode(String src, int options)
402 throws StringPrepParseException{
404 char[] srcArr = src.toCharArray();
405 StringBuffer result = new StringBuffer();
// locate the end of the current label
409 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
410 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
// an empty label is tolerated only at the very end of the input
411 if(label.length()==0 && sepIndex!=srcArr.length ){
412 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
414 UCharacterIterator iter = UCharacterIterator.getInstance(label);
415 result.append(convertToUnicode(iter,options));
// reaching the end of the array means there are no further labels
416 if(sepIndex==srcArr.length){
419 // Unlike the ToASCII operation we don't normalize the label separators
420 result.append(srcArr[sepIndex]);
421 // increment the sepIndex to skip past the separator
423 oldSepIndex =sepIndex;
425 if(result.length() > MAX_DOMAIN_NAME_LENGTH){
426 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
/**
 * Compares two names by converting both with convertIDNToASCII and then
 * comparing the converted forms with compareCaseInsensitiveASCII.
 *
 * @param s1      first name
 * @param s2      second name
 * @param options option bits applied to both conversions
 * @return the result of compareCaseInsensitiveASCII on the converted names
 * @throws StringPrepParseException when either name fails conversion
 */
431 public static int compare(String s1, String s2, int options) throws StringPrepParseException{
// s1 is converted first, so a failure in s1 is reported before s2 is looked at
432 StringBuffer s1Out = convertIDNToASCII(s1, options);
433 StringBuffer s2Out = convertIDNToASCII(s2, options);
434 return compareCaseInsensitiveASCII(s1Out,s2Out);