jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2003-2010, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 */
   7 package com.ibm.icu.impl;
   8
   9 import com.ibm.icu.text.IDNA;
  10 import com.ibm.icu.text.StringPrep;
  11 import com.ibm.icu.text.StringPrepParseException;
  12 import com.ibm.icu.text.UCharacterIterator;
  13
  14 /**
  15  * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
  16  * while extending that class to support IDNA2008/UTS #46 as well.
  17  * @author Ram Viswanadha
  18  */
  19 public final class IDNA2003 {
  20     /* IDNA ACE Prefix is "xn--" */
  21     private static char[] ACE_PREFIX                = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
  22     //private static final int ACE_PREFIX_LENGTH      = ACE_PREFIX.length;
  23
  24     private static final int MAX_LABEL_LENGTH       = 63;
  25     private static final int HYPHEN                 = 0x002D;
  26     private static final int CAPITAL_A              = 0x0041;
  27     private static final int CAPITAL_Z              = 0x005A;
  28     private static final int LOWER_CASE_DELTA       = 0x0020;
  29     private static final int FULL_STOP              = 0x002E;
  30     private static final int MAX_DOMAIN_NAME_LENGTH = 255;
  31
  32     // The NamePrep profile object
  33     private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
  34
  35     private static boolean startsWithPrefix(StringBuffer src){
  36         boolean startsWithPrefix = true;
  37
  38         if(src.length() < ACE_PREFIX.length){
  39             return false;
  40         }
  41         for(int i=0; i<ACE_PREFIX.length;i++){
  42             if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
  43                 startsWithPrefix = false;
  44             }
  45         }
  46         return startsWithPrefix;
  47     }
  48
  49     private static char toASCIILower(char ch){
  50         if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
  51             return (char)(ch + LOWER_CASE_DELTA);
  52         }
  53         return ch;
  54     }
  55
  56     private static StringBuffer toASCIILower(CharSequence src){
  57         StringBuffer dest = new StringBuffer();
  58         for(int i=0; i<src.length();i++){
  59             dest.append(toASCIILower(src.charAt(i)));
  60         }
  61         return dest;
  62     }
  63
  64     private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
  65         char c1,c2;
  66         int rc;
  67         for(int i =0;/* no condition */;i++) {
  68             /* If we reach the ends of both strings then they match */
  69             if(i == s1.length()) {
  70                 return 0;
  71             }
  72
  73             c1 = s1.charAt(i);
  74             c2 = s2.charAt(i);
  75
  76             /* Case-insensitive comparison */
  77             if(c1!=c2) {
  78                 rc=toASCIILower(c1)-toASCIILower(c2);
  79                 if(rc!=0) {
  80                     return rc;
  81                 }
  82             }
  83         }
  84     }
  85
  86     private static int getSeparatorIndex(char[] src,int start, int limit){
  87         for(; start<limit;start++){
  88             if(isLabelSeparator(src[start])){
  89                 return start;
  90             }
  91         }
  92         // we have not found the separator just return length
  93         return start;
  94     }
  95
  96     /*
  97     private static int getSeparatorIndex(UCharacterIterator iter){
  98         int currentIndex = iter.getIndex();
  99         int separatorIndex = 0;
 100         int ch;
 101         while((ch=iter.next())!= UCharacterIterator.DONE){
 102             if(isLabelSeparator(ch)){
 103                 separatorIndex = iter.getIndex();
 104                 iter.setIndex(currentIndex);
 105                 return separatorIndex;
 106             }
 107         }
 108         // reset index
 109         iter.setIndex(currentIndex);
 110         // we have not found the separator just return the length
 111
 112     }
 113     */
 114
 115
 116     private static boolean isLDHChar(int ch){
 117         // high runner case
 118         if(ch>0x007A){
 119             return false;
 120         }
 121         //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
 122         if( (ch==0x002D) ||
 123             (0x0030 <= ch && ch <= 0x0039) ||
 124             (0x0041 <= ch && ch <= 0x005A) ||
 125             (0x0061 <= ch && ch <= 0x007A)
 126           ){
 127             return true;
 128         }
 129         return false;
 130     }
 131
 132     /**
 133      * Ascertain if the given code point is a label separator as
 134      * defined by the IDNA RFC
 135      *
 136      * @param ch The code point to be ascertained
 137      * @return true if the char is a label separator
 138      * @stable ICU 2.8
 139      */
 140     private static boolean isLabelSeparator(int ch){
 141         switch(ch){
 142             case 0x002e:
 143             case 0x3002:
 144             case 0xFF0E:
 145             case 0xFF61:
 146                 return true;
 147             default:
 148                 return false;
 149         }
 150     }
 151
 152     public static StringBuffer convertToASCII(UCharacterIterator src, int options)
 153             throws StringPrepParseException{
 154
 155         boolean[] caseFlags = null;
 156
 157         // the source contains all ascii codepoints
 158         boolean srcIsASCII  = true;
 159         // assume the source contains all LDH codepoints
 160         boolean srcIsLDH = true;
 161
 162         //get the options
 163         boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
 164         int ch;
 165         // step 1
 166         while((ch = src.next())!= UCharacterIterator.DONE){
 167             if(ch> 0x7f){
 168                 srcIsASCII = false;
 169             }
 170         }
 171         int failPos = -1;
 172         src.setToStart();
 173         StringBuffer processOut = null;
 174         // step 2 is performed only if the source contains non ASCII
 175         if(!srcIsASCII){
 176             // step 2
 177             processOut = namePrep.prepare(src, options);
 178         }else{
 179             processOut = new StringBuffer(src.getText());
 180         }
 181         int poLen = processOut.length();
 182
 183         if(poLen==0){
 184             throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
 185         }
 186         StringBuffer dest = new StringBuffer();
 187
 188         // reset the variable to verify if output of prepare is ASCII or not
 189         srcIsASCII = true;
 190
 191         // step 3 & 4
 192         for(int j=0;j<poLen;j++ ){
 193             ch=processOut.charAt(j);
 194             if(ch > 0x7F){
 195                 srcIsASCII = false;
 196             }else if(isLDHChar(ch)==false){
 197                 // here we do not assemble surrogates
 198                 // since we know that LDH code points
 199                 // are in the ASCII range only
 200                 srcIsLDH = false;
 201                 failPos = j;
 202             }
 203         }
 204
 205         if(useSTD3ASCIIRules == true){
 206             // verify 3a and 3b
 207             if( srcIsLDH == false /* source contains some non-LDH characters */
 208                 || processOut.charAt(0) ==  HYPHEN
 209                 || processOut.charAt(processOut.length()-1) == HYPHEN){
 210
 211                 /* populate the parseError struct */
 212                 if(srcIsLDH==false){
 213                      throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
 214                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
 215                                               processOut.toString(),
 216                                              (failPos>0) ? (failPos-1) : failPos);
 217                 }else if(processOut.charAt(0) == HYPHEN){
 218                     throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
 219                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
 220
 221                 }else{
 222                      throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
 223                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
 224                                               processOut.toString(),
 225                                               (poLen>0) ? poLen-1 : poLen);
 226
 227                 }
 228             }
 229         }
 230         if(srcIsASCII){
 231             dest =  processOut;
 232         }else{
 233             // step 5 : verify the sequence does not begin with ACE prefix
 234             if(!startsWithPrefix(processOut)){
 235
 236                 //step 6: encode the sequence with punycode
 237                 caseFlags = new boolean[poLen];
 238
 239                 StringBuilder punyout = Punycode.encode(processOut,caseFlags);
 240
 241                 // convert all codepoints to lower case ASCII
 242                 StringBuffer lowerOut = toASCIILower(punyout);
 243
 244                 //Step 7: prepend the ACE prefix
 245                 dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
 246                 //Step 6: copy the contents in b2 into dest
 247                 dest.append(lowerOut);
 248             }else{
 249
 250                 throw new StringPrepParseException("The input does not start with the ACE Prefix.",
 251                                          StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
 252             }
 253         }
 254         if(dest.length() > MAX_LABEL_LENGTH){
 255             throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
 256                                      StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
 257         }
 258         return dest;
 259     }
 260
 261     public static StringBuffer convertIDNToASCII(String src,int options)
 262             throws StringPrepParseException{
 263
 264         char[] srcArr = src.toCharArray();
 265         StringBuffer result = new StringBuffer();
 266         int sepIndex=0;
 267         int oldSepIndex=0;
 268         for(;;){
 269             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
 270             String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
 271             //make sure this is not a root label separator.
 272             if(!(label.length()==0 && sepIndex==srcArr.length)){
 273                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
 274                 result.append(convertToASCII(iter,options));
 275             }
 276             if(sepIndex==srcArr.length){
 277                 break;
 278             }
 279
 280             // increment the sepIndex to skip past the separator
 281             sepIndex++;
 282             oldSepIndex = sepIndex;
 283             result.append((char)FULL_STOP);
 284         }
 285         if(result.length() > MAX_DOMAIN_NAME_LENGTH){
 286             throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
 287         }
 288         return result;
 289     }
 290
 291     public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
 292             throws StringPrepParseException{
 293
 294         boolean[] caseFlags = null;
 295
 296         // the source contains all ascii codepoints
 297         boolean srcIsASCII  = true;
 298         // assume the source contains all LDH codepoints
 299         //boolean srcIsLDH = true;
 300
 301         //get the options
 302         //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
 303
 304         //int failPos = -1;
 305         int ch;
 306         int saveIndex = src.getIndex();
 307         // step 1: find out if all the codepoints in src are ASCII
 308         while((ch=src.next())!= UCharacterIterator.DONE){
 309             if(ch>0x7F){
 310                 srcIsASCII = false;
 311             }/*else if((srcIsLDH = isLDHChar(ch))==false){
 312                 failPos = src.getIndex();
 313             }*/
 314         }
 315         StringBuffer processOut;
 316
 317         if(srcIsASCII == false){
 318             try {
 319                 // step 2: process the string
 320                 src.setIndex(saveIndex);
 321                 processOut = namePrep.prepare(src,options);
 322             } catch (StringPrepParseException ex) {
 323                 return new StringBuffer(src.getText());
 324             }
 325
 326         }else{
 327             //just point to source
 328             processOut = new StringBuffer(src.getText());
 329         }
 330         // TODO:
 331         // The RFC states that
 332         // <quote>
 333         // ToUnicode never fails. If any step fails, then the original input
 334         // is returned immediately in that step.
 335         // </quote>
 336
 337         //step 3: verify ACE Prefix
 338         if(startsWithPrefix(processOut)){
 339             StringBuffer decodeOut = null;
 340
 341             //step 4: Remove the ACE Prefix
 342             String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
 343
 344             //step 5: Decode using punycode
 345             try {
 346                 decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags));
 347             } catch (StringPrepParseException e) {
 348                 decodeOut = null;
 349             }
 350
 351             //step 6:Apply toASCII
 352             if (decodeOut != null) {
 353                 StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options);
 354
 355                 //step 7: verify
 356                 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
 357 //                    throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
 358 //                                             StringPrepParseException.VERIFICATION_ERROR);
 359                     decodeOut = null;
 360                 }
 361             }
 362
 363             //step 8: return output of step 5
 364              if (decodeOut != null) {
 365                  return decodeOut;
 366              }
 367         }
 368
 369 //        }else{
 370 //            // verify that STD3 ASCII rules are satisfied
 371 //            if(useSTD3ASCIIRules == true){
 372 //                if( srcIsLDH == false /* source contains some non-LDH characters */
 373 //                    || processOut.charAt(0) ==  HYPHEN
 374 //                    || processOut.charAt(processOut.length()-1) == HYPHEN){
 375 //
 376 //                    if(srcIsLDH==false){
 377 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
 378 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
 379 //                                                 (failPos>0) ? (failPos-1) : failPos);
 380 //                    }else if(processOut.charAt(0) == HYPHEN){
 381 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
 382 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
 383 //                                                 processOut.toString(),0);
 384 //
 385 //                    }else{
 386 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
 387 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
 388 //                                                 processOut.toString(),
 389 //                                                 processOut.length());
 390 //
 391 //                    }
 392 //                }
 393 //            }
 394 //            // just return the source
 395 //            return new StringBuffer(src.getText());
 396 //        }
 397
 398         return new StringBuffer(src.getText());
 399     }
 400
 401     public static StringBuffer convertIDNToUnicode(String src, int options)
 402             throws StringPrepParseException{
 403
 404         char[] srcArr = src.toCharArray();
 405         StringBuffer result = new StringBuffer();
 406         int sepIndex=0;
 407         int oldSepIndex=0;
 408         for(;;){
 409             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
 410             String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
 411             if(label.length()==0 && sepIndex!=srcArr.length ){
 412                 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
 413             }
 414             UCharacterIterator iter = UCharacterIterator.getInstance(label);
 415             result.append(convertToUnicode(iter,options));
 416             if(sepIndex==srcArr.length){
 417                 break;
 418             }
 419             // Unlike the ToASCII operation we don't normalize the label separators
 420             result.append(srcArr[sepIndex]);
 421             // increment the sepIndex to skip past the separator
 422             sepIndex++;
 423             oldSepIndex =sepIndex;
 424         }
 425         if(result.length() > MAX_DOMAIN_NAME_LENGTH){
 426             throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
 427         }
 428         return result;
 429     }
 430
 431     public static int compare(String s1, String s2, int options) throws StringPrepParseException{
 432         StringBuffer s1Out = convertIDNToASCII(s1, options);
 433         StringBuffer s2Out = convertIDNToASCII(s2, options);
 434         return compareCaseInsensitiveASCII(s1Out,s2Out);
 435     }
 436 }