/*
 *******************************************************************************
 * Copyright (C) 2003-2009, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
\r
8 package com.ibm.icu.text;
\r
10 import com.ibm.icu.impl.Punycode;
\r
/**
 * IDNA API implements the IDNA protocol as defined in the <a href="http://www.ietf.org/rfc/rfc3490.txt">IDNA RFC</a>.
 * The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels
 * containing non-ASCII code points are required to be processed by the
 * ToASCII operation before passing them to resolver libraries. Domain names
 * that are obtained from resolver libraries are required to be processed by the
 * ToUnicode operation before displaying the domain name to the user.
 * IDNA requires that implementations process input strings with
 * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a>,
 * which is a profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>,
 * and then with <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a>.
 * Implementations of IDNA MUST fully implement Nameprep and Punycode;
 * neither Nameprep nor Punycode is optional.
 * The input and output of ToASCII and ToUnicode operations are Unicode
 * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
 * multiple times to an input string will yield the same result as applying the operation
 * once:
 * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
 * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
 *
 * @author Ram Viswanadha
 */
\r
36 public final class IDNA {
\r
38 /* IDNA ACE Prefix is "xn--" */
\r
39 private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
\r
40 //private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length;
\r
42 private static final int MAX_LABEL_LENGTH = 63;
\r
43 private static final int HYPHEN = 0x002D;
\r
44 private static final int CAPITAL_A = 0x0041;
\r
45 private static final int CAPITAL_Z = 0x005A;
\r
46 private static final int LOWER_CASE_DELTA = 0x0020;
\r
47 private static final int FULL_STOP = 0x002E;
\r
48 private static final int MAX_DOMAIN_NAME_LENGTH = 255;
\r
50 * Option to prohibit processing of unassigned codepoints in the input and
\r
51 * do not check if the input conforms to STD-3 ASCII rules.
\r
53 * @see #convertToASCII #convertToUnicode
\r
56 public static final int DEFAULT = 0x0000;
\r
58 * Option to allow processing of unassigned codepoints in the input
\r
60 * @see #convertToASCII #convertToUnicode
\r
63 public static final int ALLOW_UNASSIGNED = 0x0001;
\r
65 * Option to check if input conforms to STD-3 ASCII rules
\r
67 * @see #convertToASCII #convertToUnicode
\r
70 public static final int USE_STD3_RULES = 0x0002;
\r
// static final singleton object that is initialized
// at class initialization time, hence guaranteed to
// be initialized and thread safe
private static final IDNA singleton = new IDNA();

// The NamePrep profile object; assigned once by the constructor.
private final StringPrep namePrep;

/* private constructor to prevent construction of the object */
private IDNA(){
    namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
}
\r
85 private static boolean startsWithPrefix(StringBuffer src){
\r
86 boolean startsWithPrefix = true;
\r
88 if(src.length() < ACE_PREFIX.length){
\r
91 for(int i=0; i<ACE_PREFIX.length;i++){
\r
92 if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
\r
93 startsWithPrefix = false;
\r
96 return startsWithPrefix;
\r
99 private static char toASCIILower(char ch){
\r
100 if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
\r
101 return (char)(ch + LOWER_CASE_DELTA);
\r
106 private static StringBuffer toASCIILower(StringBuffer src){
\r
107 StringBuffer dest = new StringBuffer();
\r
108 for(int i=0; i<src.length();i++){
\r
109 dest.append(toASCIILower(src.charAt(i)));
\r
114 private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
\r
117 for(int i =0;/* no condition */;i++) {
\r
118 /* If we reach the ends of both strings then they match */
\r
119 if(i == s1.length()) {
\r
126 /* Case-insensitive comparison */
\r
128 rc=toASCIILower(c1)-toASCIILower(c2);
\r
136 private static int getSeparatorIndex(char[] src,int start, int limit){
\r
137 for(; start<limit;start++){
\r
138 if(isLabelSeparator(src[start])){
\r
142 // we have not found the separator just return length
\r
147 private static int getSeparatorIndex(UCharacterIterator iter){
\r
148 int currentIndex = iter.getIndex();
\r
149 int separatorIndex = 0;
\r
151 while((ch=iter.next())!= UCharacterIterator.DONE){
\r
152 if(isLabelSeparator(ch)){
\r
153 separatorIndex = iter.getIndex();
\r
154 iter.setIndex(currentIndex);
\r
155 return separatorIndex;
\r
159 iter.setIndex(currentIndex);
\r
160 // we have not found the separator just return the length
\r
166 private static boolean isLDHChar(int ch){
\r
167 // high runner case
\r
171 //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
\r
172 if( (ch==0x002D) ||
\r
173 (0x0030 <= ch && ch <= 0x0039) ||
\r
174 (0x0041 <= ch && ch <= 0x005A) ||
\r
175 (0x0061 <= ch && ch <= 0x007A)
\r
183 * Ascertain if the given code point is a label separator as
\r
184 * defined by the IDNA RFC
\r
186 * @param ch The code point to be ascertained
\r
187 * @return true if the char is a label separator
\r
190 private static boolean isLabelSeparator(int ch){
\r
/**
 * This function implements the ToASCII operation as defined in the IDNA RFC.
 * This operation is done on <b>single labels</b> before sending it to something that expects
 * ASCII names. A label is an individual part of a domain name. Labels are usually
 * separated by dots; e.g. "www.example.com" is composed of 3 labels:
 * "www", "example", and "com".
 *
 * @param src       The input string to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
227 public static StringBuffer convertToASCII(String src, int options)
\r
228 throws StringPrepParseException{
\r
229 UCharacterIterator iter = UCharacterIterator.getInstance(src);
\r
230 return convertToASCII(iter,options);
\r
/**
 * This function implements the ToASCII operation as defined in the IDNA RFC.
 * This operation is done on <b>single labels</b> before sending it to something that expects
 * ASCII names. A label is an individual part of a domain name. Labels are usually
 * separated by dots; e.g. "www.example.com" is composed of 3 labels:
 * "www", "example", and "com".
 *
 * @param src       The input string as StringBuffer to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
257 public static StringBuffer convertToASCII(StringBuffer src, int options)
\r
258 throws StringPrepParseException{
\r
259 UCharacterIterator iter = UCharacterIterator.getInstance(src);
\r
260 return convertToASCII(iter,options);
\r
/**
 * This function implements the ToASCII operation as defined in the IDNA RFC.
 * This operation is done on <b>single labels</b> before sending it to something that expects
 * ASCII names. A label is an individual part of a domain name. Labels are usually
 * separated by dots; e.g. "www.example.com" is composed of 3 labels:
 * "www", "example", and "com".
 *
 * @param src       The input string as UCharacterIterator to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
287 public static StringBuffer convertToASCII(UCharacterIterator src, int options)
\r
288 throws StringPrepParseException{
\r
290 boolean[] caseFlags = null;
\r
292 // the source contains all ascii codepoints
\r
293 boolean srcIsASCII = true;
\r
294 // assume the source contains all LDH codepoints
\r
295 boolean srcIsLDH = true;
\r
298 boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
\r
301 while((ch = src.next())!= UCharacterIterator.DONE){
\r
303 srcIsASCII = false;
\r
308 StringBuffer processOut = null;
\r
309 // step 2 is performed only if the source contains non ASCII
\r
312 processOut = singleton.namePrep.prepare(src, options);
\r
314 processOut = new StringBuffer(src.getText());
\r
316 int poLen = processOut.length();
\r
319 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
\r
321 StringBuffer dest = new StringBuffer();
\r
323 // reset the variable to verify if output of prepare is ASCII or not
\r
327 for(int j=0;j<poLen;j++ ){
\r
328 ch=processOut.charAt(j);
\r
330 srcIsASCII = false;
\r
331 }else if(isLDHChar(ch)==false){
\r
332 // here we do not assemble surrogates
\r
333 // since we know that LDH code points
\r
334 // are in the ASCII range only
\r
340 if(useSTD3ASCIIRules == true){
\r
341 // verify 3a and 3b
\r
342 if( srcIsLDH == false /* source contains some non-LDH characters */
\r
343 || processOut.charAt(0) == HYPHEN
\r
344 || processOut.charAt(processOut.length()-1) == HYPHEN){
\r
346 /* populate the parseError struct */
\r
347 if(srcIsLDH==false){
\r
348 throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
\r
349 StringPrepParseException.STD3_ASCII_RULES_ERROR,
\r
350 processOut.toString(),
\r
351 (failPos>0) ? (failPos-1) : failPos);
\r
352 }else if(processOut.charAt(0) == HYPHEN){
\r
353 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
354 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
\r
357 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
358 StringPrepParseException.STD3_ASCII_RULES_ERROR,
\r
359 processOut.toString(),
\r
360 (poLen>0) ? poLen-1 : poLen);
\r
368 // step 5 : verify the sequence does not begin with ACE prefix
\r
369 if(!startsWithPrefix(processOut)){
\r
371 //step 6: encode the sequence with punycode
\r
372 caseFlags = new boolean[poLen];
\r
374 StringBuffer punyout = Punycode.encode(processOut,caseFlags);
\r
376 // convert all codepoints to lower case ASCII
\r
377 StringBuffer lowerOut = toASCIILower(punyout);
\r
379 //Step 7: prepend the ACE prefix
\r
380 dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
\r
381 //Step 6: copy the contents in b2 into dest
\r
382 dest.append(lowerOut);
\r
385 throw new StringPrepParseException("The input does not start with the ACE Prefix.",
\r
386 StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
\r
389 if(dest.length() > MAX_LABEL_LENGTH){
\r
390 throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
\r
391 StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
\r
/**
 * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
 * This operation is done on complete domain names, e.g. "www.example.com".
 * It is important to note that this operation can fail. If it fails, then the input
 * domain name cannot be used as an Internationalized Domain Name and the application
 * should have methods defined to deal with the failure.
 *
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
 * and then convert. This function does not offer that level of granularity. The options once
 * set will apply to all labels in the domain name.
 *
 * @param src       The input string as UCharacterIterator to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
425 public static StringBuffer convertIDNToASCII(UCharacterIterator src, int options)
\r
426 throws StringPrepParseException{
\r
427 return convertIDNToASCII(src.getText(), options);
\r
/**
 * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
 * This operation is done on complete domain names, e.g. "www.example.com".
 * It is important to note that this operation can fail. If it fails, then the input
 * domain name cannot be used as an Internationalized Domain Name and the application
 * should have methods defined to deal with the failure.
 *
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
 * and then convert. This function does not offer that level of granularity. The options once
 * set will apply to all labels in the domain name.
 *
 * @param src       The input string as a StringBuffer to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
459 public static StringBuffer convertIDNToASCII(StringBuffer src, int options)
\r
460 throws StringPrepParseException{
\r
461 return convertIDNToASCII(src.toString(), options);
\r
/**
 * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
 * This operation is done on complete domain names, e.g. "www.example.com".
 * It is important to note that this operation can fail. If it fails, then the input
 * domain name cannot be used as an Internationalized Domain Name and the application
 * should have methods defined to deal with the failure.
 *
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
 * and then convert. This function does not offer that level of granularity. The options once
 * set will apply to all labels in the domain name.
 *
 * @param src       The input string to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
493 public static StringBuffer convertIDNToASCII(String src,int options)
\r
494 throws StringPrepParseException{
\r
496 char[] srcArr = src.toCharArray();
\r
497 StringBuffer result = new StringBuffer();
\r
501 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
\r
502 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
\r
503 //make sure this is not a root label separator.
\r
504 if(!(label.length()==0 && sepIndex==srcArr.length)){
\r
505 UCharacterIterator iter = UCharacterIterator.getInstance(label);
\r
506 result.append(convertToASCII(iter,options));
\r
508 if(sepIndex==srcArr.length){
\r
512 // increment the sepIndex to skip past the separator
\r
514 oldSepIndex = sepIndex;
\r
515 result.append((char)FULL_STOP);
\r
517 if(result.length() > MAX_DOMAIN_NAME_LENGTH){
\r
518 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
\r
/**
 * This function implements the ToUnicode operation as defined in the IDNA RFC.
 * This operation is done on <b>single labels</b> before sending it to something that expects
 * Unicode names. A label is an individual part of a domain name. Labels are usually
 * separated by dots; e.g. "www.example.com" is composed of 3 labels:
 * "www", "example", and "com".
 *
 * @param src       The input string to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
548 public static StringBuffer convertToUnicode(String src, int options)
\r
549 throws StringPrepParseException{
\r
550 UCharacterIterator iter = UCharacterIterator.getInstance(src);
\r
551 return convertToUnicode(iter,options);
\r
/**
 * This function implements the ToUnicode operation as defined in the IDNA RFC.
 * This operation is done on <b>single labels</b> before sending it to something that expects
 * Unicode names. A label is an individual part of a domain name. Labels are usually
 * separated by dots; e.g. "www.example.com" is composed of 3 labels:
 * "www", "example", and "com".
 *
 * @param src       The input string as StringBuffer to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
578 public static StringBuffer convertToUnicode(StringBuffer src, int options)
\r
579 throws StringPrepParseException{
\r
580 UCharacterIterator iter = UCharacterIterator.getInstance(src);
\r
581 return convertToUnicode(iter,options);
\r
/**
 * Function that implements the ToUnicode operation as defined in the IDNA RFC.
 * This operation is done on <b>single labels</b> before sending it to something that expects
 * Unicode names. A label is an individual part of a domain name. Labels are usually
 * separated by dots; e.g. "www.example.com" is composed of 3 labels:
 * "www", "example", and "com".
 *
 * @param src       The input string as UCharacterIterator to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
608 public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
\r
609 throws StringPrepParseException{
\r
611 boolean[] caseFlags = null;
\r
613 // the source contains all ascii codepoints
\r
614 boolean srcIsASCII = true;
\r
615 // assume the source contains all LDH codepoints
\r
616 //boolean srcIsLDH = true;
\r
619 //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
\r
621 //int failPos = -1;
\r
623 int saveIndex = src.getIndex();
\r
624 // step 1: find out if all the codepoints in src are ASCII
\r
625 while((ch=src.next())!= UCharacterIterator.DONE){
\r
627 srcIsASCII = false;
\r
628 }/*else if((srcIsLDH = isLDHChar(ch))==false){
\r
629 failPos = src.getIndex();
\r
632 StringBuffer processOut;
\r
634 if(srcIsASCII == false){
\r
636 // step 2: process the string
\r
637 src.setIndex(saveIndex);
\r
638 processOut = singleton.namePrep.prepare(src,options);
\r
639 } catch (StringPrepParseException ex) {
\r
640 return new StringBuffer(src.getText());
\r
644 //just point to source
\r
645 processOut = new StringBuffer(src.getText());
\r
648 // The RFC states that
\r
650 // ToUnicode never fails. If any step fails, then the original input
\r
651 // is returned immediately in that step.
\r
654 //step 3: verify ACE Prefix
\r
655 if(startsWithPrefix(processOut)){
\r
656 StringBuffer decodeOut = null;
\r
658 //step 4: Remove the ACE Prefix
\r
659 String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
\r
661 //step 5: Decode using punycode
\r
663 decodeOut = Punycode.decode(new StringBuffer(temp),caseFlags);
\r
664 } catch (StringPrepParseException e) {
\r
668 //step 6:Apply toASCII
\r
669 if (decodeOut != null) {
\r
670 StringBuffer toASCIIOut = convertToASCII(decodeOut, options);
\r
673 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
\r
674 // throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
\r
675 // StringPrepParseException.VERIFICATION_ERROR);
\r
680 //step 8: return output of step 5
\r
681 if (decodeOut != null) {
\r
687 // // verify that STD3 ASCII rules are satisfied
\r
688 // if(useSTD3ASCIIRules == true){
\r
689 // if( srcIsLDH == false /* source contains some non-LDH characters */
\r
690 // || processOut.charAt(0) == HYPHEN
\r
691 // || processOut.charAt(processOut.length()-1) == HYPHEN){
\r
693 // if(srcIsLDH==false){
\r
694 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
695 // StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
\r
696 // (failPos>0) ? (failPos-1) : failPos);
\r
697 // }else if(processOut.charAt(0) == HYPHEN){
\r
698 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
699 // StringPrepParseException.STD3_ASCII_RULES_ERROR,
\r
700 // processOut.toString(),0);
\r
703 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
704 // StringPrepParseException.STD3_ASCII_RULES_ERROR,
\r
705 // processOut.toString(),
\r
706 // processOut.length());
\r
711 // // just return the source
\r
712 // return new StringBuffer(src.getText());
\r
715 return new StringBuffer(src.getText());
\r
/**
 * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
 * This operation is done on complete domain names, e.g. "www.example.com".
 *
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
 * and then convert. This function does not offer that level of granularity. The options once
 * set will apply to all labels in the domain name.
 *
 * @param src       The input string as UCharacterIterator to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
744 public static StringBuffer convertIDNToUnicode(UCharacterIterator src, int options)
\r
745 throws StringPrepParseException{
\r
746 return convertIDNToUnicode(src.getText(), options);
\r
/**
 * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
 * This operation is done on complete domain names, e.g. "www.example.com".
 *
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
 * and then convert. This function does not offer that level of granularity. The options once
 * set will apply to all labels in the domain name.
 *
 * @param src       The input string as StringBuffer to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
775 public static StringBuffer convertIDNToUnicode(StringBuffer src, int options)
\r
776 throws StringPrepParseException{
\r
777 return convertIDNToUnicode(src.toString(), options);
\r
/**
 * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
 * This operation is done on complete domain names, e.g. "www.example.com".
 *
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
 * and then convert. This function does not offer that level of granularity. The options once
 * set will apply to all labels in the domain name.
 *
 * @param src       The input string to be processed
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return StringBuffer the converted String
 * @throws StringPrepParseException When an error occurs for parsing a string.
 */
\r
806 public static StringBuffer convertIDNToUnicode(String src, int options)
\r
807 throws StringPrepParseException{
\r
809 char[] srcArr = src.toCharArray();
\r
810 StringBuffer result = new StringBuffer();
\r
814 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
\r
815 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
\r
816 if(label.length()==0 && sepIndex!=srcArr.length ){
\r
817 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
\r
819 UCharacterIterator iter = UCharacterIterator.getInstance(label);
\r
820 result.append(convertToUnicode(iter,options));
\r
821 if(sepIndex==srcArr.length){
\r
824 // Unlike the ToASCII operation we don't normalize the label separators
\r
825 result.append(srcArr[sepIndex]);
\r
826 // increment the sepIndex to skip past the separator
\r
828 oldSepIndex =sepIndex;
\r
830 if(result.length() > MAX_DOMAIN_NAME_LENGTH){
\r
831 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
\r
/**
 * Compare two IDN strings for equivalence.
 * This function splits the domain names into labels and compares them.
 * According to IDN RFC, whenever two labels are compared, they are
 * considered equal if and only if their ASCII forms (obtained by
 * applying toASCII) match using a case-insensitive ASCII comparison.
 * Two domain names are considered a match if and only if all labels
 * match regardless of whether label separators match.
 *
 * @param s1        First IDN string as StringBuffer
 * @param s2        Second IDN string as StringBuffer
 * @param options   A bit set of options:
 *  - IDNA.DEFAULT              Use default options, i.e., do not process unassigned code points
 *                              and do not use STD3 ASCII rules.
 *                              If unassigned code points are found the operation fails with
 *                              StringPrepParseException.
 *
 *  - IDNA.ALLOW_UNASSIGNED     Unassigned values can be converted to ASCII for query operations.
 *                              If this option is set, unassigned code points in the input
 *                              are treated as normal Unicode code points.
 *
 *  - IDNA.USE_STD3_RULES       Use STD3 ASCII rules for host name syntax restrictions.
 *                              If this option is set and the input does not satisfy STD3 rules,
 *                              the operation will fail with StringPrepParseException.
 * @return 0 if the strings are equal, &gt; 0 if s1 &gt; s2 and &lt; 0 if s1 &lt; s2
 * @throws StringPrepParseException When an error occurs for parsing a string.
 * @throws IllegalArgumentException If either source buffer is null.
 */
\r
864 public static int compare(StringBuffer s1, StringBuffer s2, int options)
\r
865 throws StringPrepParseException{
\r
866 if(s1==null || s2 == null){
\r
867 throw new IllegalArgumentException("One of the source buffers is null");
\r
869 StringBuffer s1Out = convertIDNToASCII(s1.toString(),options);
\r
870 StringBuffer s2Out = convertIDNToASCII(s2.toString(), options);
\r
871 return compareCaseInsensitiveASCII(s1Out,s2Out);
\r
875 * Compare two IDN strings for equivalence.
\r
876 * This function splits the domain names into labels and compares them.
\r
877 * According to IDN RFC, whenever two labels are compared, they are
\r
878 * considered equal if and only if their ASCII forms (obtained by
\r
879 * applying toASCII) match using an case-insensitive ASCII comparison.
\r
880 * Two domain names are considered a match if and only if all labels
\r
881 * match regardless of whether label separators match.
\r
883 * @param s1 First IDN string
\r
884 * @param s2 Second IDN string
\r
885 * @param options A bit set of options:
\r
886 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
887 * and do not use STD3 ASCII rules
\r
888 * If unassigned code points are found the operation fails with
\r
891 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
892 * If this option is set, the unassigned code points are in the input
\r
893 * are treated as normal Unicode code points.
\r
895 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
896 * If this option is set and the input does not satisfy STD3 rules,
\r
897 * the operation will fail with ParseException
\r
898 * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
\r
902 public static int compare(String s1, String s2, int options)
\r
903 throws StringPrepParseException{
\r
904 if(s1==null || s2 == null){
\r
905 throw new IllegalArgumentException("One of the source buffers is null");
\r
907 StringBuffer s1Out = convertIDNToASCII(s1, options);
\r
908 StringBuffer s2Out = convertIDNToASCII(s2, options);
\r
909 return compareCaseInsensitiveASCII(s1Out,s2Out);
\r
912 * Compare two IDN strings for equivalence.
\r
913 * This function splits the domain names into labels and compares them.
\r
914 * According to IDN RFC, whenever two labels are compared, they are
\r
915 * considered equal if and only if their ASCII forms (obtained by
\r
916 * applying toASCII) match using an case-insensitive ASCII comparison.
\r
917 * Two domain names are considered a match if and only if all labels
\r
918 * match regardless of whether label separators match.
\r
920 * @param s1 First IDN string as UCharacterIterator
\r
921 * @param s2 Second IDN string as UCharacterIterator
\r
922 * @param options A bit set of options:
\r
923 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
924 * and do not use STD3 ASCII rules
\r
925 * If unassigned code points are found the operation fails with
\r
928 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
929 * If this option is set, the unassigned code points are in the input
\r
930 * are treated as normal Unicode code points.
\r
932 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
933 * If this option is set and the input does not satisfy STD3 rules,
\r
934 * the operation will fail with ParseException
\r
935 * @return 0 if the strings are equal, > 0 if i1 > i2 and < 0 if i1 < i2
\r
939 public static int compare(UCharacterIterator s1, UCharacterIterator s2, int options)
\r
940 throws StringPrepParseException{
\r
941 if(s1==null || s2 == null){
\r
942 throw new IllegalArgumentException("One of the source buffers is null");
\r
944 StringBuffer s1Out = convertIDNToASCII(s1.getText(), options);
\r
945 StringBuffer s2Out = convertIDNToASCII(s2.getText(), options);
\r
946 return compareCaseInsensitiveASCII(s1Out,s2Out);
\r