2 *******************************************************************************
\r
3 * Copyright (C) 2003-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.text;
\r
10 import java.text.ParseException;
\r
12 import com.ibm.icu.impl.Punycode;
\r
16 * IDNA API implements the IDNA protocol as defined in the <a href="http://www.ietf.org/rfc/rfc3490.txt">IDNA RFC</a>.
\r
17 * The draft defines 2 operations: ToASCII and ToUnicode. Domain labels
\r
18 * containing non-ASCII code points are required to be processed by
\r
19 * ToASCII operation before passing it to resolver libraries. Domain names
\r
20 * that are obtained from resolver libraries are required to be processed by
\r
21 * ToUnicode operation before displaying the domain name to the user.
\r
22 * IDNA requires that implementations process input strings with
\r
23 * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a>,
\r
24 * which is a profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a> ,
\r
25 * and then with <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a>.
\r
26 * Implementations of IDNA MUST fully implement Nameprep and Punycode;
\r
27 * neither Nameprep nor Punycode are optional.
\r
28 * The input and output of ToASCII and ToUnicode operations are Unicode
\r
29 * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
\r
30 * multiple times to an input string will yield the same result as applying the operation
\r
32 * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
\r
33 * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
\r
35 * @author Ram Viswanadha
\r
38 public final class IDNA {
\r
40 /* IDNA ACE Prefix is "xn--" */
\r
41 private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
\r
42 //private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length;
\r
44 private static final int MAX_LABEL_LENGTH = 63;
\r
45 private static final int HYPHEN = 0x002D;
\r
46 private static final int CAPITAL_A = 0x0041;
\r
47 private static final int CAPITAL_Z = 0x005A;
\r
48 private static final int LOWER_CASE_DELTA = 0x0020;
\r
49 private static final int FULL_STOP = 0x002E;
\r
50 private static final int MAX_DOMAIN_NAME_LENGTH = 255;
\r
52 * Option to prohibit processing of unassigned codepoints in the input and
\r
53 * to skip checking whether the input conforms to STD-3 ASCII rules.
\r
55 * @see #convertToASCII #convertToUnicode
\r
58 public static final int DEFAULT = 0x0000;
\r
60 * Option to allow processing of unassigned codepoints in the input
\r
62 * @see #convertToASCII #convertToUnicode
\r
65 public static final int ALLOW_UNASSIGNED = 0x0001;
\r
67 * Option to check if input conforms to STD-3 ASCII rules
\r
69 * @see #convertToASCII #convertToUnicode
\r
72 public static final int USE_STD3_RULES = 0x0002;
\r
74 // static final singleton object that is initialized
\r
75 // at class initialization time, hence guaranteed to
\r
76 // be initialized and thread safe
\r
77 private static final IDNA singleton = new IDNA();
\r
79 // The NamePrep profile object
\r
80 private StringPrep namePrep;
\r
82 /* private constructor to prevent construction of the object */
\r
84 namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
\r
87 private static boolean startsWithPrefix(StringBuffer src){
\r
88 boolean startsWithPrefix = true;
\r
90 if(src.length() < ACE_PREFIX.length){
\r
93 for(int i=0; i<ACE_PREFIX.length;i++){
\r
94 if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
\r
95 startsWithPrefix = false;
\r
98 return startsWithPrefix;
\r
101 private static char toASCIILower(char ch){
\r
102 if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
\r
103 return (char)(ch + LOWER_CASE_DELTA);
\r
108 private static StringBuffer toASCIILower(StringBuffer src){
\r
109 StringBuffer dest = new StringBuffer();
\r
110 for(int i=0; i<src.length();i++){
\r
111 dest.append(toASCIILower(src.charAt(i)));
\r
116 private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
\r
119 for(int i =0;/* no condition */;i++) {
\r
120 /* If we reach the ends of both strings then they match */
\r
121 if(i == s1.length()) {
\r
128 /* Case-insensitive comparison */
\r
130 rc=toASCIILower(c1)-toASCIILower(c2);
\r
138 private static int getSeparatorIndex(char[] src,int start, int limit){
\r
139 for(; start<limit;start++){
\r
140 if(isLabelSeparator(src[start])){
\r
144 // we have not found the separator just return length
\r
149 private static int getSeparatorIndex(UCharacterIterator iter){
\r
150 int currentIndex = iter.getIndex();
\r
151 int separatorIndex = 0;
\r
153 while((ch=iter.next())!= UCharacterIterator.DONE){
\r
154 if(isLabelSeparator(ch)){
\r
155 separatorIndex = iter.getIndex();
\r
156 iter.setIndex(currentIndex);
\r
157 return separatorIndex;
\r
161 iter.setIndex(currentIndex);
\r
162 // we have not found the separator just return the length
\r
168 private static boolean isLDHChar(int ch){
\r
169 // high runner case
\r
173 //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
\r
174 if( (ch==0x002D) ||
\r
175 (0x0030 <= ch && ch <= 0x0039) ||
\r
176 (0x0041 <= ch && ch <= 0x005A) ||
\r
177 (0x0061 <= ch && ch <= 0x007A)
\r
185 * Ascertain if the given code point is a label separator as
\r
186 * defined by the IDNA RFC
\r
188 * @param ch The code point to be ascertained
\r
189 * @return true if the char is a label separator
\r
192 private static boolean isLabelSeparator(int ch){
\r
205 * This function implements the ToASCII operation as defined in the IDNA RFC.
\r
206 * This operation is done on <b>single labels</b> before sending it to something that expects
\r
207 * ASCII names. A label is an individual part of a domain name. Labels are usually
\r
208 * separated by dots; e.g., "www.example.com" is composed of 3 labels
\r
209 * "www","example", and "com".
\r
211 * @param src The input string to be processed
\r
212 * @param options A bit set of options:
\r
213 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
214 * and do not use STD3 ASCII rules
\r
215 * If unassigned code points are found the operation fails with
\r
216 * StringPrepParseException.
\r
218 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
219 * If this option is set, the unassigned code points in the input
\r
220 * are treated as normal Unicode code points.
\r
222 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
223 * If this option is set and the input does not satisfy STD3 rules,
\r
224 * the operation will fail with ParseException
\r
225 * @return StringBuffer the converted String
\r
226 * @throws StringPrepParseException
\r
229 public static StringBuffer convertToASCII(String src, int options)
\r
230 throws StringPrepParseException{
\r
231 UCharacterIterator iter = UCharacterIterator.getInstance(src);
\r
232 return convertToASCII(iter,options);
\r
236 * This function implements the ToASCII operation as defined in the IDNA RFC.
\r
237 * This operation is done on <b>single labels</b> before sending it to something that expects
\r
238 * ASCII names. A label is an individual part of a domain name. Labels are usually
\r
239 * separated by dots; e.g., "www.example.com" is composed of 3 labels
\r
240 * "www","example", and "com".
\r
242 * @param src The input string as StringBuffer to be processed
\r
243 * @param options A bit set of options:
\r
244 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
245 * and do not use STD3 ASCII rules
\r
246 * If unassigned code points are found the operation fails with
\r
249 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
250 * If this option is set, the unassigned code points in the input
\r
251 * are treated as normal Unicode code points.
\r
253 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
254 * If this option is set and the input does not satisfy STD3 rules,
\r
255 * the operation will fail with ParseException
\r
256 * @return StringBuffer the converted String
\r
257 * @throws StringPrepParseException
\r
260 public static StringBuffer convertToASCII(StringBuffer src, int options)
\r
261 throws StringPrepParseException{
\r
262 UCharacterIterator iter = UCharacterIterator.getInstance(src);
\r
263 return convertToASCII(iter,options);
\r
267 * This function implements the ToASCII operation as defined in the IDNA RFC.
\r
268 * This operation is done on <b>single labels</b> before sending it to something that expects
\r
269 * ASCII names. A label is an individual part of a domain name. Labels are usually
\r
270 * separated by dots; e.g., "www.example.com" is composed of 3 labels
\r
271 * "www","example", and "com".
\r
273 * @param src The input string as UCharacterIterator to be processed
\r
274 * @param options A bit set of options:
\r
275 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
276 * and do not use STD3 ASCII rules
\r
277 * If unassigned code points are found the operation fails with
\r
280 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
281 * If this option is set, the unassigned code points in the input
\r
282 * are treated as normal Unicode code points.
\r
284 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
285 * If this option is set and the input does not satisfy STD3 rules,
\r
286 * the operation will fail with ParseException
\r
287 * @return StringBuffer the converted String
\r
288 * @throws StringPrepParseException
\r
291 public static StringBuffer convertToASCII(UCharacterIterator src, int options)
\r
292 throws StringPrepParseException{
\r
294 boolean[] caseFlags = null;
\r
296 // the source contains all ascii codepoints
\r
297 boolean srcIsASCII = true;
\r
298 // assume the source contains all LDH codepoints
\r
299 boolean srcIsLDH = true;
\r
302 boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
\r
305 while((ch = src.next())!= UCharacterIterator.DONE){
\r
307 srcIsASCII = false;
\r
312 StringBuffer processOut = null;
\r
313 // step 2 is performed only if the source contains non ASCII
\r
316 processOut = singleton.namePrep.prepare(src, options);
\r
318 processOut = new StringBuffer(src.getText());
\r
320 int poLen = processOut.length();
\r
323 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
\r
325 StringBuffer dest = new StringBuffer();
\r
327 // reset the variable to verify if output of prepare is ASCII or not
\r
331 for(int j=0;j<poLen;j++ ){
\r
332 ch=processOut.charAt(j);
\r
334 srcIsASCII = false;
\r
335 }else if(isLDHChar(ch)==false){
\r
336 // here we do not assemble surrogates
\r
337 // since we know that LDH code points
\r
338 // are in the ASCII range only
\r
344 if(useSTD3ASCIIRules == true){
\r
345 // verify 3a and 3b
\r
346 if( srcIsLDH == false /* source contains some non-LDH characters */
\r
347 || processOut.charAt(0) == HYPHEN
\r
348 || processOut.charAt(processOut.length()-1) == HYPHEN){
\r
350 /* populate the parseError struct */
\r
351 if(srcIsLDH==false){
\r
352 throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
\r
353 StringPrepParseException.STD3_ASCII_RULES_ERROR,
\r
354 processOut.toString(),
\r
355 (failPos>0) ? (failPos-1) : failPos);
\r
356 }else if(processOut.charAt(0) == HYPHEN){
\r
357 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
358 StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
\r
361 throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
362 StringPrepParseException.STD3_ASCII_RULES_ERROR,
\r
363 processOut.toString(),
\r
364 (poLen>0) ? poLen-1 : poLen);
\r
372 // step 5 : verify the sequence does not begin with ACE prefix
\r
373 if(!startsWithPrefix(processOut)){
\r
375 //step 6: encode the sequence with punycode
\r
376 caseFlags = new boolean[poLen];
\r
378 StringBuffer punyout = Punycode.encode(processOut,caseFlags);
\r
380 // convert all codepoints to lower case ASCII
\r
381 StringBuffer lowerOut = toASCIILower(punyout);
\r
383 //Step 7: prepend the ACE prefix
\r
384 dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
\r
385 //Step 7 (continued): append the lower-cased Punycode output to dest
\r
386 dest.append(lowerOut);
\r
389 throw new StringPrepParseException("The input does not start with the ACE Prefix.",
\r
390 StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
\r
393 if(dest.length() > MAX_LABEL_LENGTH){
\r
394 throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
\r
395 StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
\r
401 * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
\r
402 * This operation is done on complete domain names, e.g: "www.example.com".
\r
403 * It is important to note that this operation can fail. If it fails, then the input
\r
404 * domain name cannot be used as an Internationalized Domain Name and the application
\r
405 * should have methods defined to deal with the failure.
\r
407 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
\r
408 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
\r
409 * and then convert. This function does not offer that level of granularity. The options once
\r
410 * set will apply to all labels in the domain name
\r
412 * @param src The input string as UCharacterIterator to be processed
\r
413 * @param options A bit set of options:
\r
414 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
415 * and do not use STD3 ASCII rules
\r
416 * If unassigned code points are found the operation fails with
\r
419 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
420 * If this option is set, the unassigned code points in the input
\r
421 * are treated as normal Unicode code points.
\r
423 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
424 * If this option is set and the input does not satisfy STD3 rules,
\r
425 * the operation will fail with ParseException
\r
426 * @return StringBuffer the converted String
\r
427 * @throws StringPrepParseException
\r
430 public static StringBuffer convertIDNToASCII(UCharacterIterator src, int options)
\r
431 throws StringPrepParseException{
\r
432 return convertIDNToASCII(src.getText(), options);
\r
436 * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
\r
437 * This operation is done on complete domain names, e.g: "www.example.com".
\r
438 * It is important to note that this operation can fail. If it fails, then the input
\r
439 * domain name cannot be used as an Internationalized Domain Name and the application
\r
440 * should have methods defined to deal with the failure.
\r
442 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
\r
443 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
\r
444 * and then convert. This function does not offer that level of granularity. The options once
\r
445 * set will apply to all labels in the domain name
\r
447 * @param src The input string as a StringBuffer to be processed
\r
448 * @param options A bit set of options:
\r
449 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
450 * and do not use STD3 ASCII rules
\r
451 * If unassigned code points are found the operation fails with
\r
454 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
455 * If this option is set, the unassigned code points in the input
\r
456 * are treated as normal Unicode code points.
\r
458 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
459 * If this option is set and the input does not satisfy STD3 rules,
\r
460 * the operation will fail with ParseException
\r
461 * @return StringBuffer the converted String
\r
462 * @throws StringPrepParseException
\r
465 public static StringBuffer convertIDNToASCII(StringBuffer src, int options)
\r
466 throws StringPrepParseException{
\r
467 return convertIDNToASCII(src.toString(), options);
\r
471 * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
\r
472 * This operation is done on complete domain names, e.g: "www.example.com".
\r
473 * It is important to note that this operation can fail. If it fails, then the input
\r
474 * domain name cannot be used as an Internationalized Domain Name and the application
\r
475 * should have methods defined to deal with the failure.
\r
477 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
\r
478 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
\r
479 * and then convert. This function does not offer that level of granularity. The options once
\r
480 * set will apply to all labels in the domain name
\r
482 * @param src The input string to be processed
\r
483 * @param options A bit set of options:
\r
484 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
485 * and do not use STD3 ASCII rules
\r
486 * If unassigned code points are found the operation fails with
\r
489 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
490 * If this option is set, the unassigned code points in the input
\r
491 * are treated as normal Unicode code points.
\r
493 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
494 * If this option is set and the input does not satisfy STD3 rules,
\r
495 * the operation will fail with ParseException
\r
496 * @return StringBuffer the converted String
\r
497 * @throws StringPrepParseException
\r
500 public static StringBuffer convertIDNToASCII(String src,int options)
\r
501 throws StringPrepParseException{
\r
503 char[] srcArr = src.toCharArray();
\r
504 StringBuffer result = new StringBuffer();
\r
508 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
\r
509 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
\r
510 //make sure this is not a root label separator.
\r
511 if(!(label.length()==0 && sepIndex==srcArr.length)){
\r
512 UCharacterIterator iter = UCharacterIterator.getInstance(label);
\r
513 result.append(convertToASCII(iter,options));
\r
515 if(sepIndex==srcArr.length){
\r
519 // increment the sepIndex to skip past the separator
\r
521 oldSepIndex = sepIndex;
\r
522 result.append((char)FULL_STOP);
\r
524 if(result.length() > MAX_DOMAIN_NAME_LENGTH){
\r
525 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
\r
532 * This function implements the ToUnicode operation as defined in the IDNA RFC.
\r
533 * This operation is done on <b>single labels</b> before sending it to something that expects
\r
534 * Unicode names. A label is an individual part of a domain name. Labels are usually
\r
535 * separated by dots; e.g., "www.example.com" is composed of 3 labels
\r
536 * "www","example", and "com".
\r
538 * @param src The input string to be processed
\r
539 * @param options A bit set of options:
\r
540 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
541 * and do not use STD3 ASCII rules
\r
542 * If unassigned code points are found the operation fails with
\r
545 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
546 * If this option is set, the unassigned code points in the input
\r
547 * are treated as normal Unicode code points.
\r
549 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
550 * If this option is set and the input does not satisfy STD3 rules,
\r
551 * the operation will fail with ParseException
\r
552 * @return StringBuffer the converted String
\r
553 * @throws StringPrepParseException
\r
556 public static StringBuffer convertToUnicode(String src, int options)
\r
557 throws StringPrepParseException{
\r
558 UCharacterIterator iter = UCharacterIterator.getInstance(src);
\r
559 return convertToUnicode(iter,options);
\r
563 * This function implements the ToUnicode operation as defined in the IDNA RFC.
\r
564 * This operation is done on <b>single labels</b> before sending it to something that expects
\r
565 * Unicode names. A label is an individual part of a domain name. Labels are usually
\r
566 * separated by dots; e.g., "www.example.com" is composed of 3 labels
\r
567 * "www","example", and "com".
\r
569 * @param src The input string as StringBuffer to be processed
\r
570 * @param options A bit set of options:
\r
571 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
572 * and do not use STD3 ASCII rules
\r
573 * If unassigned code points are found the operation fails with
\r
576 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
577 * If this option is set, the unassigned code points in the input
\r
578 * are treated as normal Unicode code points.
\r
580 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
581 * If this option is set and the input does not satisfy STD3 rules,
\r
582 * the operation will fail with ParseException
\r
583 * @return StringBuffer the converted String
\r
584 * @throws StringPrepParseException
\r
587 public static StringBuffer convertToUnicode(StringBuffer src, int options)
\r
588 throws StringPrepParseException{
\r
589 UCharacterIterator iter = UCharacterIterator.getInstance(src);
\r
590 return convertToUnicode(iter,options);
\r
594 * Function that implements the ToUnicode operation as defined in the IDNA RFC.
\r
595 * This operation is done on <b>single labels</b> before sending it to something that expects
\r
596 * Unicode names. A label is an individual part of a domain name. Labels are usually
\r
597 * separated by dots; e.g., "www.example.com" is composed of 3 labels
\r
598 * "www","example", and "com".
\r
600 * @param src The input string as UCharacterIterator to be processed
\r
601 * @param options A bit set of options:
\r
602 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
603 * and do not use STD3 ASCII rules
\r
604 * If unassigned code points are found the operation fails with
\r
607 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
608 * If this option is set, the unassigned code points in the input
\r
609 * are treated as normal Unicode code points.
\r
611 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
612 * If this option is set and the input does not satisfy STD3 rules,
\r
613 * the operation will fail with ParseException
\r
614 * @return StringBuffer the converted String
\r
615 * @throws StringPrepParseException
\r
618 public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
\r
619 throws StringPrepParseException{
\r
621 boolean[] caseFlags = null;
\r
623 // the source contains all ascii codepoints
\r
624 boolean srcIsASCII = true;
\r
625 // assume the source contains all LDH codepoints
\r
626 //boolean srcIsLDH = true;
\r
629 //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
\r
631 //int failPos = -1;
\r
633 int saveIndex = src.getIndex();
\r
634 // step 1: find out if all the codepoints in src are ASCII
\r
635 while((ch=src.next())!= UCharacterIterator.DONE){
\r
637 srcIsASCII = false;
\r
638 }/*else if((srcIsLDH = isLDHChar(ch))==false){
\r
639 failPos = src.getIndex();
\r
642 StringBuffer processOut;
\r
644 if(srcIsASCII == false){
\r
646 // step 2: process the string
\r
647 src.setIndex(saveIndex);
\r
648 processOut = singleton.namePrep.prepare(src,options);
\r
649 } catch (StringPrepParseException ex) {
\r
650 return new StringBuffer(src.getText());
\r
654 //just point to source
\r
655 processOut = new StringBuffer(src.getText());
\r
658 // The RFC states that
\r
660 // ToUnicode never fails. If any step fails, then the original input
\r
661 // is returned immediately in that step.
\r
664 //step 3: verify ACE Prefix
\r
665 if(startsWithPrefix(processOut)){
\r
666 StringBuffer decodeOut = null;
\r
668 //step 4: Remove the ACE Prefix
\r
669 String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
\r
671 //step 5: Decode using punycode
\r
673 decodeOut = Punycode.decode(new StringBuffer(temp),caseFlags);
\r
674 } catch (StringPrepParseException e) {
\r
678 //step 6:Apply toASCII
\r
679 if (decodeOut != null) {
\r
680 StringBuffer toASCIIOut = convertToASCII(decodeOut, options);
\r
683 if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
\r
684 // throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
\r
685 // StringPrepParseException.VERIFICATION_ERROR);
\r
690 //step 8: return output of step 5
\r
691 if (decodeOut != null) {
\r
697 // // verify that STD3 ASCII rules are satisfied
\r
698 // if(useSTD3ASCIIRules == true){
\r
699 // if( srcIsLDH == false /* source contains some non-LDH characters */
\r
700 // || processOut.charAt(0) == HYPHEN
\r
701 // || processOut.charAt(processOut.length()-1) == HYPHEN){
\r
703 // if(srcIsLDH==false){
\r
704 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
705 // StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
\r
706 // (failPos>0) ? (failPos-1) : failPos);
\r
707 // }else if(processOut.charAt(0) == HYPHEN){
\r
708 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
709 // StringPrepParseException.STD3_ASCII_RULES_ERROR,
\r
710 // processOut.toString(),0);
\r
713 // throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
\r
714 // StringPrepParseException.STD3_ASCII_RULES_ERROR,
\r
715 // processOut.toString(),
\r
716 // processOut.length());
\r
721 // // just return the source
\r
722 // return new StringBuffer(src.getText());
\r
725 return new StringBuffer(src.getText());
\r
729 * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
\r
730 * This operation is done on complete domain names, e.g: "www.example.com".
\r
732 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
\r
733 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
\r
734 * and then convert. This function does not offer that level of granularity. The options once
\r
735 * set will apply to all labels in the domain name
\r
737 * @param src The input string as UCharacterIterator to be processed
\r
738 * @param options A bit set of options:
\r
739 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
740 * and do not use STD3 ASCII rules
\r
741 * If unassigned code points are found the operation fails with
\r
744 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
745 * If this option is set, the unassigned code points in the input
\r
746 * are treated as normal Unicode code points.
\r
748 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
749 * If this option is set and the input does not satisfy STD3 rules,
\r
750 * the operation will fail with ParseException
\r
751 * @return StringBuffer the converted String
\r
752 * @throws StringPrepParseException
\r
755 public static StringBuffer convertIDNToUnicode(UCharacterIterator src, int options)
\r
756 throws StringPrepParseException{
\r
757 return convertIDNToUnicode(src.getText(), options);
\r
761 * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
\r
762 * This operation is done on complete domain names, e.g: "www.example.com".
\r
764 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
\r
765 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
\r
766 * and then convert. This function does not offer that level of granularity. The options once
\r
767 * set will apply to all labels in the domain name
\r
769 * @param src The input string as StringBuffer to be processed
\r
770 * @param options A bit set of options:
\r
771 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
772 * and do not use STD3 ASCII rules
\r
773 * If unassigned code points are found the operation fails with
\r
776 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
777 * If this option is set, the unassigned code points in the input
\r
778 * are treated as normal Unicode code points.
\r
780 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
781 * If this option is set and the input does not satisfy STD3 rules,
\r
782 * the operation will fail with ParseException
\r
783 * @return StringBuffer the converted String
\r
784 * @throws StringPrepParseException
\r
787 public static StringBuffer convertIDNToUnicode(StringBuffer src, int options)
\r
788 throws StringPrepParseException{
\r
789 return convertIDNToUnicode(src.toString(), options);
\r
793 * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
\r
794 * This operation is done on complete domain names, e.g: "www.example.com".
\r
796 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
\r
797 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
\r
798 * and then convert. This function does not offer that level of granularity. The options once
\r
799 * set will apply to all labels in the domain name
\r
801 * @param src The input string to be processed
\r
802 * @param options A bit set of options:
\r
803 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
804 * and do not use STD3 ASCII rules
\r
805 * If unassigned code points are found the operation fails with
\r
808 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
809 * If this option is set, the unassigned code points in the input
\r
810 * are treated as normal Unicode code points.
\r
812 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
813 * If this option is set and the input does not satisfy STD3 rules,
\r
814 * the operation will fail with ParseException
\r
815 * @return StringBuffer the converted String
\r
816 * @throws StringPrepParseException
\r
819 public static StringBuffer convertIDNToUnicode(String src, int options)
\r
820 throws StringPrepParseException{
\r
822 char[] srcArr = src.toCharArray();
\r
823 StringBuffer result = new StringBuffer();
\r
827 sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
\r
828 String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
\r
829 if(label.length()==0 && sepIndex!=srcArr.length ){
\r
830 throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
\r
832 UCharacterIterator iter = UCharacterIterator.getInstance(label);
\r
833 result.append(convertToUnicode(iter,options));
\r
834 if(sepIndex==srcArr.length){
\r
837 // Unlike the ToASCII operation we don't normalize the label separators
\r
838 result.append(srcArr[sepIndex]);
\r
839 // increment the sepIndex to skip past the separator
\r
841 oldSepIndex =sepIndex;
\r
843 if(result.length() > MAX_DOMAIN_NAME_LENGTH){
\r
844 throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
\r
850 * Compare two IDN strings for equivalence.
\r
851 * This function splits the domain names into labels and compares them.
\r
852 * According to IDN RFC, whenever two labels are compared, they are
\r
853 * considered equal if and only if their ASCII forms (obtained by
\r
854 * applying toASCII) match using an case-insensitive ASCII comparison.
\r
855 * Two domain names are considered a match if and only if all labels
\r
856 * match regardless of whether label separators match.
\r
858 * @param s1 First IDN string as StringBuffer
\r
859 * @param s2 Second IDN string as StringBuffer
\r
860 * @param options A bit set of options:
\r
861 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
862 * and do not use STD3 ASCII rules
\r
863 * If unassigned code points are found the operation fails with
\r
866 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
867 * If this option is set, the unassigned code points are in the input
\r
868 * are treated as normal Unicode code points.
\r
870 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
871 * If this option is set and the input does not satisfy STD3 rules,
\r
872 * the operation will fail with ParseException
\r
873 * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
\r
874 * @throws ParseException
\r
878 public static int compare(StringBuffer s1, StringBuffer s2, int options)
\r
879 throws StringPrepParseException{
\r
880 if(s1==null || s2 == null){
\r
881 throw new IllegalArgumentException("One of the source buffers is null");
\r
883 StringBuffer s1Out = convertIDNToASCII(s1.toString(),options);
\r
884 StringBuffer s2Out = convertIDNToASCII(s2.toString(), options);
\r
885 return compareCaseInsensitiveASCII(s1Out,s2Out);
\r
889 * Compare two IDN strings for equivalence.
\r
890 * This function splits the domain names into labels and compares them.
\r
891 * According to IDN RFC, whenever two labels are compared, they are
\r
892 * considered equal if and only if their ASCII forms (obtained by
\r
893 * applying toASCII) match using an case-insensitive ASCII comparison.
\r
894 * Two domain names are considered a match if and only if all labels
\r
895 * match regardless of whether label separators match.
\r
897 * @param s1 First IDN string
\r
898 * @param s2 Second IDN string
\r
899 * @param options A bit set of options:
\r
900 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
901 * and do not use STD3 ASCII rules
\r
902 * If unassigned code points are found the operation fails with
\r
905 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
906 * If this option is set, the unassigned code points are in the input
\r
907 * are treated as normal Unicode code points.
\r
909 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
910 * If this option is set and the input does not satisfy STD3 rules,
\r
911 * the operation will fail with ParseException
\r
912 * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
\r
913 * @throws ParseException
\r
917 public static int compare(String s1, String s2, int options)
\r
918 throws StringPrepParseException{
\r
919 if(s1==null || s2 == null){
\r
920 throw new IllegalArgumentException("One of the source buffers is null");
\r
922 StringBuffer s1Out = convertIDNToASCII(s1, options);
\r
923 StringBuffer s2Out = convertIDNToASCII(s2, options);
\r
924 return compareCaseInsensitiveASCII(s1Out,s2Out);
\r
927 * Compare two IDN strings for equivalence.
\r
928 * This function splits the domain names into labels and compares them.
\r
929 * According to IDN RFC, whenever two labels are compared, they are
\r
930 * considered equal if and only if their ASCII forms (obtained by
\r
931 * applying toASCII) match using an case-insensitive ASCII comparison.
\r
932 * Two domain names are considered a match if and only if all labels
\r
933 * match regardless of whether label separators match.
\r
935 * @param s1 First IDN string as UCharacterIterator
\r
936 * @param s2 Second IDN string as UCharacterIterator
\r
937 * @param options A bit set of options:
\r
938 * - IDNA.DEFAULT Use default options, i.e., do not process unassigned code points
\r
939 * and do not use STD3 ASCII rules
\r
940 * If unassigned code points are found the operation fails with
\r
943 * - IDNA.ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
\r
944 * If this option is set, the unassigned code points are in the input
\r
945 * are treated as normal Unicode code points.
\r
947 * - IDNA.USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
\r
948 * If this option is set and the input does not satisfy STD3 rules,
\r
949 * the operation will fail with ParseException
\r
950 * @return 0 if the strings are equal, > 0 if i1 > i2 and < 0 if i1 < i2
\r
951 * @throws ParseException
\r
955 public static int compare(UCharacterIterator s1, UCharacterIterator s2, int options)
\r
956 throws StringPrepParseException{
\r
957 if(s1==null || s2 == null){
\r
958 throw new IllegalArgumentException("One of the source buffers is null");
\r
960 StringBuffer s1Out = convertIDNToASCII(s1.getText(), options);
\r
961 StringBuffer s2Out = convertIDNToASCII(s2.getText(), options);
\r
962 return compareCaseInsensitiveASCII(s1Out,s2Out);
\r