gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java
Added flags.
[Dictionary.git] / jars / icu4j-52_1 / main / classes / core / src / com / ibm / icu / impl / IDNA2003.java
1 /*
2 *******************************************************************************
3 * Copyright (C) 2003-2010, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 */
7 package com.ibm.icu.impl;
8
9 import com.ibm.icu.text.IDNA;
10 import com.ibm.icu.text.StringPrep;
11 import com.ibm.icu.text.StringPrepParseException;
12 import com.ibm.icu.text.UCharacterIterator;
13
14 /**
15  * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
16  * while extending that class to support IDNA2008/UTS #46 as well.
17  * @author Ram Viswanadha
18  */
19 public final class IDNA2003 {
20     /* IDNA ACE Prefix is "xn--" */
21     private static char[] ACE_PREFIX                = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
22     //private static final int ACE_PREFIX_LENGTH      = ACE_PREFIX.length;
23
24     private static final int MAX_LABEL_LENGTH       = 63;
25     private static final int HYPHEN                 = 0x002D;
26     private static final int CAPITAL_A              = 0x0041;
27     private static final int CAPITAL_Z              = 0x005A;
28     private static final int LOWER_CASE_DELTA       = 0x0020;
29     private static final int FULL_STOP              = 0x002E;
30     private static final int MAX_DOMAIN_NAME_LENGTH = 255;
31
32     // The NamePrep profile object
33     private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
34     
35     private static boolean startsWithPrefix(StringBuffer src){
36         boolean startsWithPrefix = true;
37
38         if(src.length() < ACE_PREFIX.length){
39             return false;
40         }
41         for(int i=0; i<ACE_PREFIX.length;i++){
42             if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
43                 startsWithPrefix = false;
44             }
45         }
46         return startsWithPrefix;
47     }
48
49     private static char toASCIILower(char ch){
50         if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
51             return (char)(ch + LOWER_CASE_DELTA);
52         }
53         return ch;
54     }
55
56     private static StringBuffer toASCIILower(CharSequence src){
57         StringBuffer dest = new StringBuffer();
58         for(int i=0; i<src.length();i++){
59             dest.append(toASCIILower(src.charAt(i)));
60         }
61         return dest;
62     }
63
64     private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
65         char c1,c2;
66         int rc;
67         for(int i =0;/* no condition */;i++) {
68             /* If we reach the ends of both strings then they match */
69             if(i == s1.length()) {
70                 return 0;
71             }
72
73             c1 = s1.charAt(i);
74             c2 = s2.charAt(i);
75         
76             /* Case-insensitive comparison */
77             if(c1!=c2) {
78                 rc=toASCIILower(c1)-toASCIILower(c2);
79                 if(rc!=0) {
80                     return rc;
81                 }
82             }
83         }
84     }
85    
86     private static int getSeparatorIndex(char[] src,int start, int limit){
87         for(; start<limit;start++){
88             if(isLabelSeparator(src[start])){
89                 return start;
90             }
91         }
92         // we have not found the separator just return length
93         return start;
94     }
95     
96     /*
97     private static int getSeparatorIndex(UCharacterIterator iter){
98         int currentIndex = iter.getIndex();
99         int separatorIndex = 0;
100         int ch;
101         while((ch=iter.next())!= UCharacterIterator.DONE){
102             if(isLabelSeparator(ch)){
103                 separatorIndex = iter.getIndex();
104                 iter.setIndex(currentIndex);
105                 return separatorIndex;
106             }
107         }
108         // reset index
109         iter.setIndex(currentIndex);
110         // we have not found the separator just return the length
111        
112     }
113     */
114     
115
116     private static boolean isLDHChar(int ch){
117         // high runner case
118         if(ch>0x007A){
119             return false;
120         }
121         //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
122         if( (ch==0x002D) || 
123             (0x0030 <= ch && ch <= 0x0039) ||
124             (0x0041 <= ch && ch <= 0x005A) ||
125             (0x0061 <= ch && ch <= 0x007A)
126           ){
127             return true;
128         }
129         return false;
130     }
131     
132     /**
133      * Ascertain if the given code point is a label separator as 
134      * defined by the IDNA RFC
135      * 
136      * @param ch The code point to be ascertained
137      * @return true if the char is a label separator
138      * @stable ICU 2.8
139      */
140     private static boolean isLabelSeparator(int ch){
141         switch(ch){
142             case 0x002e:
143             case 0x3002:
144             case 0xFF0E:
145             case 0xFF61:
146                 return true;
147             default:
148                 return false;           
149         }
150     }
151
152     public static StringBuffer convertToASCII(UCharacterIterator src, int options)
153             throws StringPrepParseException{
154         
155         boolean[] caseFlags = null;
156     
157         // the source contains all ascii codepoints
158         boolean srcIsASCII  = true;
159         // assume the source contains all LDH codepoints
160         boolean srcIsLDH = true; 
161
162         //get the options
163         boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
164         int ch;
165         // step 1
166         while((ch = src.next())!= UCharacterIterator.DONE){
167             if(ch> 0x7f){
168                 srcIsASCII = false;
169             }
170         }
171         int failPos = -1;
172         src.setToStart();
173         StringBuffer processOut = null;
174         // step 2 is performed only if the source contains non ASCII
175         if(!srcIsASCII){
176             // step 2
177             processOut = namePrep.prepare(src, options);
178         }else{
179             processOut = new StringBuffer(src.getText());
180         }
181         int poLen = processOut.length();
182         
183         if(poLen==0){
184             throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
185         }
186         StringBuffer dest = new StringBuffer();
187         
188         // reset the variable to verify if output of prepare is ASCII or not
189         srcIsASCII = true;
190         
191         // step 3 & 4
192         for(int j=0;j<poLen;j++ ){
193             ch=processOut.charAt(j);
194             if(ch > 0x7F){
195                 srcIsASCII = false;
196             }else if(isLDHChar(ch)==false){
197                 // here we do not assemble surrogates
198                 // since we know that LDH code points
199                 // are in the ASCII range only
200                 srcIsLDH = false;
201                 failPos = j;
202             }
203         }
204     
205         if(useSTD3ASCIIRules == true){
206             // verify 3a and 3b
207             if( srcIsLDH == false /* source contains some non-LDH characters */
208                 || processOut.charAt(0) ==  HYPHEN 
209                 || processOut.charAt(processOut.length()-1) == HYPHEN){
210
211                 /* populate the parseError struct */
212                 if(srcIsLDH==false){
213                      throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
214                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
215                                               processOut.toString(),
216                                              (failPos>0) ? (failPos-1) : failPos);
217                 }else if(processOut.charAt(0) == HYPHEN){
218                     throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
219                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
220      
221                 }else{
222                      throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
223                                               StringPrepParseException.STD3_ASCII_RULES_ERROR,
224                                               processOut.toString(),
225                                               (poLen>0) ? poLen-1 : poLen);
226
227                 }
228             }
229         }
230         if(srcIsASCII){
231             dest =  processOut;
232         }else{
233             // step 5 : verify the sequence does not begin with ACE prefix
234             if(!startsWithPrefix(processOut)){
235
236                 //step 6: encode the sequence with punycode
237                 caseFlags = new boolean[poLen];
238
239                 StringBuilder punyout = Punycode.encode(processOut,caseFlags);
240
241                 // convert all codepoints to lower case ASCII
242                 StringBuffer lowerOut = toASCIILower(punyout);
243
244                 //Step 7: prepend the ACE prefix
245                 dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
246                 //Step 6: copy the contents in b2 into dest
247                 dest.append(lowerOut);
248             }else{
249
250                 throw new StringPrepParseException("The input does not start with the ACE Prefix.",
251                                          StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
252             }
253         }
254         if(dest.length() > MAX_LABEL_LENGTH){
255             throw new StringPrepParseException("The labels in the input are too long. Length > 63.", 
256                                      StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
257         }
258         return dest;
259     }
260
261     public static StringBuffer convertIDNToASCII(String src,int options)
262             throws StringPrepParseException{
263
264         char[] srcArr = src.toCharArray();
265         StringBuffer result = new StringBuffer();
266         int sepIndex=0;
267         int oldSepIndex=0;
268         for(;;){
269             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
270             String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
271             //make sure this is not a root label separator.
272             if(!(label.length()==0 && sepIndex==srcArr.length)){
273                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
274                 result.append(convertToASCII(iter,options));
275             }
276             if(sepIndex==srcArr.length){
277                 break;
278             }
279             
280             // increment the sepIndex to skip past the separator
281             sepIndex++;
282             oldSepIndex = sepIndex;
283             result.append((char)FULL_STOP);
284         }
285         if(result.length() > MAX_DOMAIN_NAME_LENGTH){
286             throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
287         }
288         return result;
289     }
290
291     public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
292             throws StringPrepParseException{
293         
294         boolean[] caseFlags = null;
295                 
296         // the source contains all ascii codepoints
297         boolean srcIsASCII  = true;
298         // assume the source contains all LDH codepoints
299         //boolean srcIsLDH = true; 
300         
301         //get the options
302         //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
303         
304         //int failPos = -1;
305         int ch;
306         int saveIndex = src.getIndex();
307         // step 1: find out if all the codepoints in src are ASCII  
308         while((ch=src.next())!= UCharacterIterator.DONE){
309             if(ch>0x7F){
310                 srcIsASCII = false;
311             }/*else if((srcIsLDH = isLDHChar(ch))==false){
312                 failPos = src.getIndex();
313             }*/
314         }
315         StringBuffer processOut;
316         
317         if(srcIsASCII == false){
318             try {
319                 // step 2: process the string
320                 src.setIndex(saveIndex);
321                 processOut = namePrep.prepare(src,options);
322             } catch (StringPrepParseException ex) {
323                 return new StringBuffer(src.getText());
324             }
325
326         }else{
327             //just point to source
328             processOut = new StringBuffer(src.getText());
329         }
330         // TODO:
331         // The RFC states that 
332         // <quote>
333         // ToUnicode never fails. If any step fails, then the original input
334         // is returned immediately in that step.
335         // </quote>
336         
337         //step 3: verify ACE Prefix
338         if(startsWithPrefix(processOut)){
339             StringBuffer decodeOut = null;
340
341             //step 4: Remove the ACE Prefix
342             String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
343
344             //step 5: Decode using punycode
345             try {
346                 decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags));
347             } catch (StringPrepParseException e) {
348                 decodeOut = null;
349             }
350
351             //step 6:Apply toASCII
352             if (decodeOut != null) {
353                 StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options);
354     
355                 //step 7: verify
356                 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
357 //                    throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
358 //                                             StringPrepParseException.VERIFICATION_ERROR); 
359                     decodeOut = null;
360                 }
361             }
362
363             //step 8: return output of step 5
364              if (decodeOut != null) {
365                  return decodeOut;
366              }
367         }
368             
369 //        }else{
370 //            // verify that STD3 ASCII rules are satisfied
371 //            if(useSTD3ASCIIRules == true){
372 //                if( srcIsLDH == false /* source contains some non-LDH characters */
373 //                    || processOut.charAt(0) ==  HYPHEN 
374 //                    || processOut.charAt(processOut.length()-1) == HYPHEN){
375 //    
376 //                    if(srcIsLDH==false){
377 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
378 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
379 //                                                 (failPos>0) ? (failPos-1) : failPos);
380 //                    }else if(processOut.charAt(0) == HYPHEN){
381 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
382 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
383 //                                                 processOut.toString(),0);
384 //         
385 //                    }else{
386 //                        throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
387 //                                                 StringPrepParseException.STD3_ASCII_RULES_ERROR,
388 //                                                 processOut.toString(),
389 //                                                 processOut.length());
390 //    
391 //                    }
392 //                }
393 //            }
394 //            // just return the source
395 //            return new StringBuffer(src.getText());
396 //        }  
397         
398         return new StringBuffer(src.getText());
399     }
400
401     public static StringBuffer convertIDNToUnicode(String src, int options)
402             throws StringPrepParseException{
403         
404         char[] srcArr = src.toCharArray();
405         StringBuffer result = new StringBuffer();
406         int sepIndex=0;
407         int oldSepIndex=0;
408         for(;;){
409             sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
410             String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
411             if(label.length()==0 && sepIndex!=srcArr.length ){
412                 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
413             }
414             UCharacterIterator iter = UCharacterIterator.getInstance(label);
415             result.append(convertToUnicode(iter,options));
416             if(sepIndex==srcArr.length){
417                 break;
418             }
419             // Unlike the ToASCII operation we don't normalize the label separators
420             result.append(srcArr[sepIndex]);
421             // increment the sepIndex to skip past the separator
422             sepIndex++;
423             oldSepIndex =sepIndex;
424         }
425         if(result.length() > MAX_DOMAIN_NAME_LENGTH){
426             throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
427         }
428         return result;
429     }
430
431     public static int compare(String s1, String s2, int options) throws StringPrepParseException{
432         StringBuffer s1Out = convertIDNToASCII(s1, options);
433         StringBuffer s2Out = convertIDNToASCII(s2, options);
434         return compareCaseInsensitiveASCII(s1Out,s2Out);
435     }
436 }