]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
Upgrade ICU4J.
[Dictionary.git] / jars / icu4j-52_1 / main / tests / core / src / com / ibm / icu / dev / test / normalizer / UTS46Test.java
1 /*
2 *******************************************************************************
3 * Copyright (C) 2010-2011, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 */
7 package com.ibm.icu.dev.test.normalizer;
8
9 import java.util.Collections;
10 import java.util.EnumSet;
11 import java.util.Map;
12 import java.util.Set;
13 import java.util.TreeMap;
14
15 import com.ibm.icu.dev.test.TestFmwk;
16 import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
17 import com.ibm.icu.text.IDNA;
18
19 /**
20  * UTS #46 (IDNA2008) test.
21  * @author Markus Scherer
22  * @since 2010jul10
23  */
24 public class UTS46Test extends TestFmwk {
25     public static void main(String[] args) throws Exception {
26         new UTS46Test().run(args);
27     }
28     public UTS46Test() {
29         int commonOptions=
30             IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|
31             IDNA.CHECK_CONTEXTJ|IDNA.CHECK_CONTEXTO;
32         trans=IDNA.getUTS46Instance(commonOptions);
33         nontrans=IDNA.getUTS46Instance(commonOptions|
34                                        IDNA.NONTRANSITIONAL_TO_ASCII|IDNA.NONTRANSITIONAL_TO_UNICODE);
35     }
36
37     public void TestAPI() {
38         StringBuilder result=new StringBuilder();
39         IDNA.Info info=new IDNA.Info();
40         String input="www.eXample.cOm";
41         String expected="www.example.com";
42         trans.nameToASCII(input, result, info);
43         if(info.hasErrors() || !UTF16Plus.equal(result, expected)) {
44             errln(String.format("T.nameToASCII(www.example.com) info.errors=%s result matches=%b",
45                                 info.getErrors(), UTF16Plus.equal(result, expected)));
46         }
47         input="xn--bcher.de-65a";
48         expected="xn--bcher\uFFFDde-65a";
49         nontrans.labelToASCII(input, result, info);
50         if( !info.getErrors().equals(EnumSet.of(IDNA.Error.LABEL_HAS_DOT, IDNA.Error.INVALID_ACE_LABEL)) ||
51             !UTF16Plus.equal(result, expected)
52         ) {
53             errln(String.format("N.labelToASCII(label-with-dot) failed with errors %s",
54                                 info.getErrors()));
55         }
56         // Java API tests that are not parallel to C++ tests
57         // because the C++ specifics (error codes etc.) do not apply here.
58         String resultString=trans.nameToUnicode("fA\u00DF.de", result, info).toString();
59         if(info.hasErrors() || !resultString.equals("fass.de")) {
60             errln(String.format("T.nameToUnicode(fA\u00DF.de) info.errors=%s result matches=%b",
61                                 info.getErrors(), resultString.equals("fass.de")));
62         }
63         try {
64             nontrans.labelToUnicode(result, result, info);
65             errln("N.labelToUnicode(result, result) did not throw an Exception");
66         } catch(Exception e) {
67             // as expected (should be an IllegalArgumentException, or an ICU version of it)
68         }
69     }
70
71     public void TestNotSTD3() {
72         IDNA not3=IDNA.getUTS46Instance(IDNA.CHECK_BIDI);
73         String input="\u0000A_2+2=4\n.e\u00DFen.net";
74         StringBuilder result=new StringBuilder();
75         IDNA.Info info=new IDNA.Info();
76         if( !not3.nameToUnicode(input, result, info).toString().equals("\u0000a_2+2=4\n.essen.net") ||
77             info.hasErrors()
78         ) {
79             errln(String.format("notSTD3.nameToUnicode(non-LDH ASCII) unexpected errors %s string %s",
80                                 info.getErrors(), prettify(result.toString())));
81         }
82         // A space (BiDi class WS) is not allowed in a BiDi domain name.
83         input="a z.xn--4db.edu";
84         not3.nameToASCII(input, result, info);
85         if(!UTF16Plus.equal(result, input) || !info.getErrors().equals(EnumSet.of(IDNA.Error.BIDI))) {
86             errln("notSTD3.nameToASCII(ASCII-with-space.alef.edu) failed");
87         }
88         // Characters that are canonically equivalent to sequences with non-LDH ASCII.
89         input="a\u2260b\u226Ec\u226Fd";
90         not3.nameToUnicode(input, result, info);
91         if(!UTF16Plus.equal(result, input) || info.hasErrors()) {
92             errln(String.format("notSTD3.nameToUnicode(equiv to non-LDH ASCII) unexpected errors %s string %s",
93                                 info.getErrors().toString(), prettify(result.toString())));
94         }
95     }
96
97     private static final Map<String, IDNA.Error> errorNamesToErrors;
98     static {
99         errorNamesToErrors=new TreeMap<String, IDNA.Error>();
100         errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL);
101         errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG);
102         errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG);
103         errorNamesToErrors.put("UIDNA_ERROR_LEADING_HYPHEN", IDNA.Error.LEADING_HYPHEN);
104         errorNamesToErrors.put("UIDNA_ERROR_TRAILING_HYPHEN", IDNA.Error.TRAILING_HYPHEN);
105         errorNamesToErrors.put("UIDNA_ERROR_HYPHEN_3_4", IDNA.Error.HYPHEN_3_4);
106         errorNamesToErrors.put("UIDNA_ERROR_LEADING_COMBINING_MARK", IDNA.Error.LEADING_COMBINING_MARK);
107         errorNamesToErrors.put("UIDNA_ERROR_DISALLOWED", IDNA.Error.DISALLOWED);
108         errorNamesToErrors.put("UIDNA_ERROR_PUNYCODE", IDNA.Error.PUNYCODE);
109         errorNamesToErrors.put("UIDNA_ERROR_LABEL_HAS_DOT", IDNA.Error.LABEL_HAS_DOT);
110         errorNamesToErrors.put("UIDNA_ERROR_INVALID_ACE_LABEL", IDNA.Error.INVALID_ACE_LABEL);
111         errorNamesToErrors.put("UIDNA_ERROR_BIDI", IDNA.Error.BIDI);
112         errorNamesToErrors.put("UIDNA_ERROR_CONTEXTJ", IDNA.Error.CONTEXTJ);
113         errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_PUNCTUATION", IDNA.Error.CONTEXTO_PUNCTUATION);
114         errorNamesToErrors.put("UIDNA_ERROR_CONTEXTO_DIGITS", IDNA.Error.CONTEXTO_DIGITS);
115     }
116
117     private static final class TestCase {
118         private TestCase() {
119             errors=EnumSet.noneOf(IDNA.Error.class);
120         }
121         private void set(String[] data) {
122             s=data[0];
123             o=data[1];
124             u=data[2];
125             errors.clear();
126             if(data[3].length()!=0) {
127                 for(String e: data[3].split("\\|")) {
128                     errors.add(errorNamesToErrors.get(e));
129                 }
130             }
131         }
132         // Input string and options string (Nontransitional/Transitional/Both).
133         private String s, o;
134         // Expected Unicode result string.
135         private String u;
136         private EnumSet<IDNA.Error> errors;
137     };
138
139     private static final String testCases[][]={
140         { "www.eXample.cOm", "B",  // all ASCII
141           "www.example.com", "" },
142         { "B\u00FCcher.de", "B",  // u-umlaut
143           "b\u00FCcher.de", "" },
144         { "\u00D6BB", "B",  // O-umlaut
145           "\u00F6bb", "" },
146         { "fa\u00DF.de", "N",  // sharp s
147           "fa\u00DF.de", "" },
148         { "fa\u00DF.de", "T",  // sharp s
149           "fass.de", "" },
150         { "XN--fA-hia.dE", "B",  // sharp s in Punycode
151           "fa\u00DF.de", "" },
152         { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "N",  // Greek with final sigma
153           "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "" },
154         { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "T",  // Greek with final sigma
155           "\u03B2\u03CC\u03BB\u03BF\u03C3.com", "" },
156         { "xn--nxasmm1c", "B",  // Greek with final sigma in Punycode
157           "\u03B2\u03CC\u03BB\u03BF\u03C2", "" },
158         { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "N",  // "Sri" in "Sri Lanka" has a ZWJ
159           "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" },
160         { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "T",  // "Sri" in "Sri Lanka" has a ZWJ
161           "www.\u0DC1\u0DCA\u0DBB\u0DD3.com", "" },
162         { "www.xn--10cl1a0b660p.com", "B",  // "Sri" in Punycode
163           "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" },
164         { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "N",  // ZWNJ
165           "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "" },
166         { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "T",  // ZWNJ
167           "\u0646\u0627\u0645\u0647\u0627\u06CC", "" },
168         { "xn--mgba3gch31f060k.com", "B",  // ZWNJ in Punycode
169           "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC.com", "" },
170         { "a.b\uFF0Ec\u3002d\uFF61", "B",
171           "a.b.c.d.", "" },
172         { "U\u0308.xn--tda", "B",  // U+umlaut.u-umlaut
173           "\u00FC.\u00FC", "" },
174         { "xn--u-ccb", "B",  // u+umlaut in Punycode
175           "xn--u-ccb\uFFFD", "UIDNA_ERROR_INVALID_ACE_LABEL" },
176         { "a\u2488com", "B",  // contains 1-dot
177           "a\uFFFDcom", "UIDNA_ERROR_DISALLOWED" },
178         { "xn--a-ecp.ru", "B",  // contains 1-dot in Punycode
179           "xn--a-ecp\uFFFD.ru", "UIDNA_ERROR_INVALID_ACE_LABEL" },
180         { "xn--0.pt", "B",  // invalid Punycode
181           "xn--0\uFFFD.pt", "UIDNA_ERROR_PUNYCODE" },
182         { "xn--a.pt", "B",  // U+0080
183           "xn--a\uFFFD.pt", "UIDNA_ERROR_INVALID_ACE_LABEL" },
184         { "xn--a-\u00C4.pt", "B",  // invalid Punycode
185           "xn--a-\u00E4.pt", "UIDNA_ERROR_PUNYCODE" },
186         { "\u65E5\u672C\u8A9E\u3002\uFF2A\uFF30", "B",  // Japanese with fullwidth ".jp"
187           "\u65E5\u672C\u8A9E.jp", "" },
188         { "\u2615", "B", "\u2615", "" },  // Unicode 4.0 HOT BEVERAGE
189         // some characters are disallowed because they are canonically equivalent
190         // to sequences with non-LDH ASCII
191         { "a\u2260b\u226Ec\u226Fd", "B",
192           "a\uFFFDb\uFFFDc\uFFFDd", "UIDNA_ERROR_DISALLOWED" },
193         // many deviation characters, test the special mapping code
194         { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
195           "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
196           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
197           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
198           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "N",
199           "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
200           "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
201           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
202           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
203           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz",
204           "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_CONTEXTJ" },
205         { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
206           "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
207           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
208           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
209           "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "T",
210           "1.assbcssssssssd"+
211           "\u03C3\u03C3sssssssssssssssse"+
212           "ssssssssssssssssssssx"+
213           "ssssssssssssssssssssy"+
214           "sssssssssssssss\u015Dssz", "UIDNA_ERROR_LABEL_TOO_LONG" },
215         // "xn--bss" with deviation characters
216         { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "N",
217           "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "UIDNA_ERROR_CONTEXTJ" },
218         { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "T",
219           "\u5919", "" },
220         // "xn--bssffl" written as:
221         // 02E3 MODIFIER LETTER SMALL X
222         // 034F COMBINING GRAPHEME JOINER (ignored)
223         // 2115 DOUBLE-STRUCK CAPITAL N
224         // 200B ZERO WIDTH SPACE (ignored)
225         // FE63 SMALL HYPHEN-MINUS
226         // 00AD SOFT HYPHEN (ignored)
227         // FF0D FULLWIDTH HYPHEN-MINUS
228         // 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored)
229         // 212C SCRIPT CAPITAL B
230         // FE00 VARIATION SELECTOR-1 (ignored)
231         // 017F LATIN SMALL LETTER LONG S
232         // 2064 INVISIBLE PLUS (ignored)
233         // 1D530 MATHEMATICAL FRAKTUR SMALL S
234         // E01EF VARIATION SELECTOR-256 (ignored)
235         // FB04 LATIN SMALL LIGATURE FFL
236         { "\u02E3\u034F\u2115\u200B\uFE63\u00AD\uFF0D\u180C"+
237           "\u212C\uFE00\u017F\u2064"+"\uD835\uDD30\uDB40\uDDEF"/*1D530 E01EF*/+"\uFB04", "B",
238           "\u5921\u591E\u591C\u5919", "" },
239         { "123456789012345678901234567890123456789012345678901234567890123."+
240           "123456789012345678901234567890123456789012345678901234567890123."+
241           "123456789012345678901234567890123456789012345678901234567890123."+
242           "1234567890123456789012345678901234567890123456789012345678901", "B",
243           "123456789012345678901234567890123456789012345678901234567890123."+
244           "123456789012345678901234567890123456789012345678901234567890123."+
245           "123456789012345678901234567890123456789012345678901234567890123."+
246           "1234567890123456789012345678901234567890123456789012345678901", "" },
247         { "123456789012345678901234567890123456789012345678901234567890123."+
248           "123456789012345678901234567890123456789012345678901234567890123."+
249           "123456789012345678901234567890123456789012345678901234567890123."+
250           "1234567890123456789012345678901234567890123456789012345678901.", "B",
251           "123456789012345678901234567890123456789012345678901234567890123."+
252           "123456789012345678901234567890123456789012345678901234567890123."+
253           "123456789012345678901234567890123456789012345678901234567890123."+
254           "1234567890123456789012345678901234567890123456789012345678901.", "" },
255         // Domain name >256 characters, forces slow path in UTF-8 processing.
256         { "123456789012345678901234567890123456789012345678901234567890123."+
257           "123456789012345678901234567890123456789012345678901234567890123."+
258           "123456789012345678901234567890123456789012345678901234567890123."+
259           "123456789012345678901234567890123456789012345678901234567890123."+
260           "12345678901234567890123456789012345678901234567890123456789012", "B",
261           "123456789012345678901234567890123456789012345678901234567890123."+
262           "123456789012345678901234567890123456789012345678901234567890123."+
263           "123456789012345678901234567890123456789012345678901234567890123."+
264           "123456789012345678901234567890123456789012345678901234567890123."+
265           "12345678901234567890123456789012345678901234567890123456789012",
266           "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
267         { "123456789012345678901234567890123456789012345678901234567890123."+
268           "123456789012345678901234567890123456789012345678901234567890123."+
269           "123456789012345678901234567890123456789012345678901234567890123."+
270           "123456789012345678901234567890123456789012345678901234567890123."+
271           "1234567890123456789012345678901234567890123456789\u05D0", "B",
272           "123456789012345678901234567890123456789012345678901234567890123."+
273           "123456789012345678901234567890123456789012345678901234567890123."+
274           "123456789012345678901234567890123456789012345678901234567890123."+
275           "123456789012345678901234567890123456789012345678901234567890123."+
276           "1234567890123456789012345678901234567890123456789\u05D0",
277           "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG|UIDNA_ERROR_BIDI" },
278         { "123456789012345678901234567890123456789012345678901234567890123."+
279           "1234567890123456789012345678901234567890123456789012345678901234."+
280           "123456789012345678901234567890123456789012345678901234567890123."+
281           "123456789012345678901234567890123456789012345678901234567890", "B",
282           "123456789012345678901234567890123456789012345678901234567890123."+
283           "1234567890123456789012345678901234567890123456789012345678901234."+
284           "123456789012345678901234567890123456789012345678901234567890123."+
285           "123456789012345678901234567890123456789012345678901234567890",
286           "UIDNA_ERROR_LABEL_TOO_LONG" },
287         { "123456789012345678901234567890123456789012345678901234567890123."+
288           "1234567890123456789012345678901234567890123456789012345678901234."+
289           "123456789012345678901234567890123456789012345678901234567890123."+
290           "123456789012345678901234567890123456789012345678901234567890.", "B",
291           "123456789012345678901234567890123456789012345678901234567890123."+
292           "1234567890123456789012345678901234567890123456789012345678901234."+
293           "123456789012345678901234567890123456789012345678901234567890123."+
294           "123456789012345678901234567890123456789012345678901234567890.",
295           "UIDNA_ERROR_LABEL_TOO_LONG" },
296         { "123456789012345678901234567890123456789012345678901234567890123."+
297           "1234567890123456789012345678901234567890123456789012345678901234."+
298           "123456789012345678901234567890123456789012345678901234567890123."+
299           "1234567890123456789012345678901234567890123456789012345678901", "B",
300           "123456789012345678901234567890123456789012345678901234567890123."+
301           "1234567890123456789012345678901234567890123456789012345678901234."+
302           "123456789012345678901234567890123456789012345678901234567890123."+
303           "1234567890123456789012345678901234567890123456789012345678901",
304           "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
305         // label length 63: xn--1234567890123456789012345678901234567890123456789012345-9te
306         { "\u00E41234567890123456789012345678901234567890123456789012345", "B",
307           "\u00E41234567890123456789012345678901234567890123456789012345", "" },
308         { "1234567890\u00E41234567890123456789012345678901234567890123456", "B",
309           "1234567890\u00E41234567890123456789012345678901234567890123456", "UIDNA_ERROR_LABEL_TOO_LONG" },
310         { "123456789012345678901234567890123456789012345678901234567890123."+
311           "1234567890\u00E4123456789012345678901234567890123456789012345."+
312           "123456789012345678901234567890123456789012345678901234567890123."+
313           "1234567890123456789012345678901234567890123456789012345678901", "B",
314           "123456789012345678901234567890123456789012345678901234567890123."+
315           "1234567890\u00E4123456789012345678901234567890123456789012345."+
316           "123456789012345678901234567890123456789012345678901234567890123."+
317           "1234567890123456789012345678901234567890123456789012345678901", "" },
318         { "123456789012345678901234567890123456789012345678901234567890123."+
319           "1234567890\u00E4123456789012345678901234567890123456789012345."+
320           "123456789012345678901234567890123456789012345678901234567890123."+
321           "1234567890123456789012345678901234567890123456789012345678901.", "B",
322           "123456789012345678901234567890123456789012345678901234567890123."+
323           "1234567890\u00E4123456789012345678901234567890123456789012345."+
324           "123456789012345678901234567890123456789012345678901234567890123."+
325           "1234567890123456789012345678901234567890123456789012345678901.", "" },
326         { "123456789012345678901234567890123456789012345678901234567890123."+
327           "1234567890\u00E4123456789012345678901234567890123456789012345."+
328           "123456789012345678901234567890123456789012345678901234567890123."+
329           "12345678901234567890123456789012345678901234567890123456789012", "B",
330           "123456789012345678901234567890123456789012345678901234567890123."+
331           "1234567890\u00E4123456789012345678901234567890123456789012345."+
332           "123456789012345678901234567890123456789012345678901234567890123."+
333           "12345678901234567890123456789012345678901234567890123456789012",
334           "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
335         { "123456789012345678901234567890123456789012345678901234567890123."+
336           "1234567890\u00E41234567890123456789012345678901234567890123456."+
337           "123456789012345678901234567890123456789012345678901234567890123."+
338           "123456789012345678901234567890123456789012345678901234567890", "B",
339           "123456789012345678901234567890123456789012345678901234567890123."+
340           "1234567890\u00E41234567890123456789012345678901234567890123456."+
341           "123456789012345678901234567890123456789012345678901234567890123."+
342           "123456789012345678901234567890123456789012345678901234567890",
343           "UIDNA_ERROR_LABEL_TOO_LONG" },
344         { "123456789012345678901234567890123456789012345678901234567890123."+
345           "1234567890\u00E41234567890123456789012345678901234567890123456."+
346           "123456789012345678901234567890123456789012345678901234567890123."+
347           "123456789012345678901234567890123456789012345678901234567890.", "B",
348           "123456789012345678901234567890123456789012345678901234567890123."+
349           "1234567890\u00E41234567890123456789012345678901234567890123456."+
350           "123456789012345678901234567890123456789012345678901234567890123."+
351           "123456789012345678901234567890123456789012345678901234567890.",
352           "UIDNA_ERROR_LABEL_TOO_LONG" },
353         { "123456789012345678901234567890123456789012345678901234567890123."+
354           "1234567890\u00E41234567890123456789012345678901234567890123456."+
355           "123456789012345678901234567890123456789012345678901234567890123."+
356           "1234567890123456789012345678901234567890123456789012345678901", "B",
357           "123456789012345678901234567890123456789012345678901234567890123."+
358           "1234567890\u00E41234567890123456789012345678901234567890123456."+
359           "123456789012345678901234567890123456789012345678901234567890123."+
360           "1234567890123456789012345678901234567890123456789012345678901",
361           "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
362         // hyphen errors and empty-label errors
363         // "xn---q----jra"=="-q--a-umlaut-"
364         { "a.b..-q--a-.e", "B", "a.b..-q--a-.e",
365           "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
366           "UIDNA_ERROR_HYPHEN_3_4" },
367         { "a.b..-q--\u00E4-.e", "B", "a.b..-q--\u00E4-.e",
368           "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
369           "UIDNA_ERROR_HYPHEN_3_4" },
370         { "a.b..xn---q----jra.e", "B", "a.b..-q--\u00E4-.e",
371           "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
372           "UIDNA_ERROR_HYPHEN_3_4" },
373         { "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
374         { "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
375         { "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
376         { "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
377         { "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
378         { "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
379         { "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
380         { "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
381         { "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
382         { "\u00E4.bc--de.f", "B", "\u00E4.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
383         { "a.b.\u0308c.d", "B", "a.b.\uFFFDc.d", "UIDNA_ERROR_LEADING_COMBINING_MARK" },
384         { "a.b.xn--c-bcb.d", "B",
385           "a.b.xn--c-bcb\uFFFD.d", "UIDNA_ERROR_LEADING_COMBINING_MARK|UIDNA_ERROR_INVALID_ACE_LABEL" },
386         // BiDi
387         { "A0", "B", "a0", "" },
388         { "0A", "B", "0a", "" },  // all-LTR is ok to start with a digit (EN)
389         { "0A.\u05D0", "B",  // ASCII label does not start with L/R/AL
390           "0a.\u05D0", "UIDNA_ERROR_BIDI" },
391         { "c.xn--0-eha.xn--4db", "B",  // 2nd label does not start with L/R/AL
392           "c.0\u00FC.\u05D0", "UIDNA_ERROR_BIDI" },
393         { "b-.\u05D0", "B",  // label does not end with L/EN
394           "b-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" },
395         { "d.xn----dha.xn--4db", "B",  // 2nd label does not end with L/EN
396           "d.\u00FC-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" },
397         { "a\u05D0", "B", "a\u05D0", "UIDNA_ERROR_BIDI" },  // first dir != last dir
398         { "\u05D0\u05C7", "B", "\u05D0\u05C7", "" },
399         { "\u05D09\u05C7", "B", "\u05D09\u05C7", "" },
400         { "\u05D0a\u05C7", "B", "\u05D0a\u05C7", "UIDNA_ERROR_BIDI" },  // first dir != last dir
401         { "\u05D0\u05EA", "B", "\u05D0\u05EA", "" },
402         { "\u05D0\u05F3\u05EA", "B", "\u05D0\u05F3\u05EA", "" },
403         { "a\u05D0Tz", "B", "a\u05D0tz", "UIDNA_ERROR_BIDI" },  // mixed dir
404         { "\u05D0T\u05EA", "B", "\u05D0t\u05EA", "UIDNA_ERROR_BIDI" },  // mixed dir
405         { "\u05D07\u05EA", "B", "\u05D07\u05EA", "" },
406         { "\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", "" },  // Arabic 7 in the middle
407         { "a7\u0667z", "B", "a7\u0667z", "UIDNA_ERROR_BIDI" },  // AN digit in LTR
408         { "\u05D07\u0667\u05EA", "B",  // mixed EN/AN digits in RTL
409           "\u05D07\u0667\u05EA", "UIDNA_ERROR_BIDI" },
410         // ZWJ
411         { "\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", "" },  // Virama+ZWJ
412         { "\u0BB9\u200D", "N", "\u0BB9\u200D", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
413         { "\u200D", "N", "\u200D", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
414         // ZWNJ
415         { "\u0BB9\u0BCD\u200C", "N", "\u0BB9\u0BCD\u200C", "" },  // Virama+ZWNJ
416         { "\u0BB9\u200C", "N", "\u0BB9\u200C", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
417         { "\u200C", "N", "\u200C", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
418         { "\u0644\u0670\u200C\u06ED\u06EF", "N",  // Joining types D T ZWNJ T R
419           "\u0644\u0670\u200C\u06ED\u06EF", "" },
420         { "\u0644\u0670\u200C\u06EF", "N",  // D T ZWNJ R
421           "\u0644\u0670\u200C\u06EF", "" },
422         { "\u0644\u200C\u06ED\u06EF", "N",  // D ZWNJ T R
423           "\u0644\u200C\u06ED\u06EF", "" },
424         { "\u0644\u200C\u06EF", "N",  // D ZWNJ R
425           "\u0644\u200C\u06EF", "" },
426         { "\u0644\u0670\u200C\u06ED", "N",  // D T ZWNJ T
427           "\u0644\u0670\u200C\u06ED", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" },
428         { "\u06EF\u200C\u06EF", "N",  // R ZWNJ R
429           "\u06EF\u200C\u06EF", "UIDNA_ERROR_CONTEXTJ" },
430         { "\u0644\u200C", "N",  // D ZWNJ
431           "\u0644\u200C", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" },
432         { "\u0660\u0661", "B",  // Arabic-Indic Digits alone
433           "\u0660\u0661", "UIDNA_ERROR_BIDI" },
434         { "\u06F0\u06F1", "B",  // Extended Arabic-Indic Digits alone
435           "\u06F0\u06F1", "" },
436         { "\u0660\u06F1", "B",  // Mixed Arabic-Indic Digits
437           "\u0660\u06F1", "UIDNA_ERROR_CONTEXTO_DIGITS|UIDNA_ERROR_BIDI" },
438         // All of the CONTEXTO "Would otherwise have been DISALLOWED" characters
439         // in their correct contexts,
440         // then each in incorrect context.
441         { "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "B",
442           "l\u00B7l\u4E00\u0375\u03B1\u05D0\u05F3\u05F4\u30FB", "UIDNA_ERROR_BIDI" },
443         { "l\u00B7", "B",
444           "l\u00B7", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
445         { "\u00B7l", "B",
446           "\u00B7l", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
447         { "\u0375", "B",
448           "\u0375", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
449         { "\u03B1\u05F3", "B",
450           "\u03B1\u05F3", "UIDNA_ERROR_CONTEXTO_PUNCTUATION|UIDNA_ERROR_BIDI" },
451         { "\u05F4", "B",
452           "\u05F4", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
453         { "l\u30FB", "B",
454           "l\u30FB", "UIDNA_ERROR_CONTEXTO_PUNCTUATION" },
455         // { "", "B",
456         //   "", "" },
457     };
458
459     public void TestSomeCases() {
460         StringBuilder aT=new StringBuilder(), uT=new StringBuilder();
461         StringBuilder aN=new StringBuilder(), uN=new StringBuilder();
462         IDNA.Info aTInfo=new IDNA.Info(), uTInfo=new IDNA.Info();
463         IDNA.Info aNInfo=new IDNA.Info(), uNInfo=new IDNA.Info();
464
465         StringBuilder aTuN=new StringBuilder(), uTaN=new StringBuilder();
466         StringBuilder aNuN=new StringBuilder(), uNaN=new StringBuilder();
467         IDNA.Info aTuNInfo=new IDNA.Info(), uTaNInfo=new IDNA.Info();
468         IDNA.Info aNuNInfo=new IDNA.Info(), uNaNInfo=new IDNA.Info();
469
470         StringBuilder aTL=new StringBuilder(), uTL=new StringBuilder();
471         StringBuilder aNL=new StringBuilder(), uNL=new StringBuilder();
472         IDNA.Info aTLInfo=new IDNA.Info(), uTLInfo=new IDNA.Info();
473         IDNA.Info aNLInfo=new IDNA.Info(), uNLInfo=new IDNA.Info();
474
475         EnumSet<IDNA.Error> uniErrors=EnumSet.noneOf(IDNA.Error.class);
476
477         TestCase testCase=new TestCase();
478         int i;
479         for(i=0; i<testCases.length; ++i) {
480             testCase.set(testCases[i]);
481             String input=testCase.s;
482             String expected=testCase.u;
483             // ToASCII/ToUnicode, transitional/nontransitional
484             try {
485                 trans.nameToASCII(input, aT, aTInfo);
486                 trans.nameToUnicode(input, uT, uTInfo);
487                 nontrans.nameToASCII(input, aN, aNInfo);
488                 nontrans.nameToUnicode(input, uN, uNInfo);
489             } catch(Exception e) {
490                 errln(String.format("first-level processing [%d/%s] %s - %s",
491                                     i, testCase.o, testCase.s, e));
492                 continue;
493             }
494             // ToUnicode does not set length errors.
495             uniErrors.clear();
496             uniErrors.addAll(testCase.errors);
497             uniErrors.removeAll(lengthErrors);
498             char mode=testCase.o.charAt(0);
499             if(mode=='B' || mode=='N') {
500                 if(!sameErrors(uNInfo, uniErrors)) {
501                     errln(String.format("N.nameToUnicode([%d] %s) unexpected errors %s",
502                                         i, testCase.s, uNInfo.getErrors()));
503                     continue;
504                 }
505                 if(!UTF16Plus.equal(uN, expected)) {
506                     errln(String.format("N.nameToUnicode([%d] %s) unexpected string %s",
507                                         i, testCase.s, prettify(uN.toString())));
508                     continue;
509                 }
510                 if(!sameErrors(aNInfo, testCase.errors)) {
511                     errln(String.format("N.nameToASCII([%d] %s) unexpected errors %s",
512                                         i, testCase.s, aNInfo.getErrors()));
513                     continue;
514                 }
515             }
516             if(mode=='B' || mode=='T') {
517                 if(!sameErrors(uTInfo, uniErrors)) {
518                     errln(String.format("T.nameToUnicode([%d] %s) unexpected errors %s",
519                                         i, testCase.s, uTInfo.getErrors()));
520                     continue;
521                 }
522                 if(!UTF16Plus.equal(uT, expected)) {
523                     errln(String.format("T.nameToUnicode([%d] %s) unexpected string %s",
524                                         i, testCase.s, prettify(uT.toString())));
525                     continue;
526                 }
527                 if(!sameErrors(aTInfo, testCase.errors)) {
528                     errln(String.format("T.nameToASCII([%d] %s) unexpected errors %s",
529                                         i, testCase.s, aTInfo.getErrors()));
530                     continue;
531                 }
532             }
533             // ToASCII is all-ASCII if no severe errors
534             if(!hasCertainErrors(aNInfo, severeErrors) && !isASCII(aN)) {
535                 errln(String.format("N.nameToASCII([%d] %s) (errors %s) result is not ASCII %s",
536                                     i, testCase.s, aNInfo.getErrors(), prettify(aN.toString())));
537                 continue;
538             }
539             if(!hasCertainErrors(aTInfo, severeErrors) && !isASCII(aT)) {
540                 errln(String.format("T.nameToASCII([%d] %s) (errors %s) result is not ASCII %s",
541                                     i, testCase.s, aTInfo.getErrors(), prettify(aT.toString())));
542                 continue;
543             }
544             if(isVerbose()) {
545                 char m= mode=='B' ? mode : 'N';
546                 logln(String.format("%c.nameToASCII([%d] %s) (errors %s) result string: %s",
547                                     m, i, testCase.s, aNInfo.getErrors(), prettify(aN.toString())));
548                 if(mode!='B') {
549                     logln(String.format("T.nameToASCII([%d] %s) (errors %s) result string: %s",
550                                         i, testCase.s, aTInfo.getErrors(), prettify(aT.toString())));
551                 }
552             }
553             // second-level processing
554             try {
555                 nontrans.nameToUnicode(aT, aTuN, aTuNInfo);
556                 nontrans.nameToASCII(uT, uTaN, uTaNInfo);
557                 nontrans.nameToUnicode(aN, aNuN, aNuNInfo);
558                 nontrans.nameToASCII(uN, uNaN, uNaNInfo);
559             } catch(Exception e) {
560                 errln(String.format("second-level processing [%d/%s] %s - %s",
561                                     i, testCase.o, testCase.s, e));
562                 continue;
563             }
564             if(!UTF16Plus.equal(aN, uNaN)) {
565                 errln(String.format("N.nameToASCII([%d] %s)!=N.nameToUnicode().N.nameToASCII() "+
566                                     "(errors %s) %s vs. %s",
567                                     i, testCase.s, aNInfo.getErrors(),
568                                     prettify(aN.toString()), prettify(uNaN.toString())));
569                 continue;
570             }
571             if(!UTF16Plus.equal(aT, uTaN)) {
572                 errln(String.format("T.nameToASCII([%d] %s)!=T.nameToUnicode().N.nameToASCII() "+
573                                     "(errors %s) %s vs. %s",
574                                     i, testCase.s, aNInfo.getErrors(),
575                                     prettify(aT.toString()), prettify(uTaN.toString())));
576                 continue;
577             }
578             if(!UTF16Plus.equal(uN, aNuN)) {
579                 errln(String.format("N.nameToUnicode([%d] %s)!=N.nameToASCII().N.nameToUnicode() "+
580                                     "(errors %s) %s vs. %s",
581                                     i, testCase.s, uNInfo.getErrors(), prettify(uN.toString()), prettify(aNuN.toString())));
582                 continue;
583             }
584             if(!UTF16Plus.equal(uT, aTuN)) {
585                 errln(String.format("T.nameToUnicode([%d] %s)!=T.nameToASCII().N.nameToUnicode() "+
586                                     "(errors %s) %s vs. %s",
587                                     i, testCase.s, uNInfo.getErrors(),
588                                     prettify(uT.toString()), prettify(aTuN.toString())));
589                 continue;
590             }
591             // labelToUnicode
592             try {
593                 trans.labelToASCII(input, aTL, aTLInfo);
594                 trans.labelToUnicode(input, uTL, uTLInfo);
595                 nontrans.labelToASCII(input, aNL, aNLInfo);
596                 nontrans.labelToUnicode(input, uNL, uNLInfo);
597             } catch(Exception e) {
598                 errln(String.format("labelToXYZ processing [%d/%s] %s - %s",
599                                     i, testCase.o, testCase.s, e));
600                 continue;
601             }
602             if(aN.indexOf(".")<0) {
603                 if(!UTF16Plus.equal(aN, aNL) || !sameErrors(aNInfo, aNLInfo)) {
604                     errln(String.format("N.nameToASCII([%d] %s)!=N.labelToASCII() "+
605                                         "(errors %s vs %s) %s vs. %s",
606                                         i, testCase.s, aNInfo.getErrors().toString(), aNLInfo.getErrors().toString(),
607                                         prettify(aN.toString()), prettify(aNL.toString())));
608                     continue;
609                 }
610             } else {
611                 if(!hasError(aNLInfo, IDNA.Error.LABEL_HAS_DOT)) {
612                     errln(String.format("N.labelToASCII([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
613                                         i, testCase.s, aNLInfo.getErrors()));
614                     continue;
615                 }
616             }
617             if(aT.indexOf(".")<0) {
618                 if(!UTF16Plus.equal(aT, aTL) || !sameErrors(aTInfo, aTLInfo)) {
619                     errln(String.format("T.nameToASCII([%d] %s)!=T.labelToASCII() "+
620                                         "(errors %s vs %s) %s vs. %s",
621                                         i, testCase.s, aTInfo.getErrors().toString(), aTLInfo.getErrors().toString(),
622                                         prettify(aT.toString()), prettify(aTL.toString())));
623                     continue;
624                 }
625             } else {
626                 if(!hasError(aTLInfo, IDNA.Error.LABEL_HAS_DOT)) {
627                     errln(String.format("T.labelToASCII([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
628                                         i, testCase.s, aTLInfo.getErrors()));
629                     continue;
630                 }
631             }
632             if(uN.indexOf(".")<0) {
633                 if(!UTF16Plus.equal(uN, uNL) || !sameErrors(uNInfo, uNLInfo)) {
634                     errln(String.format("N.nameToUnicode([%d] %s)!=N.labelToUnicode() "+
635                                         "(errors %s vs %s) %s vs. %s",
636                                         i, testCase.s, uNInfo.getErrors().toString(), uNLInfo.getErrors().toString(),
637                                         prettify(uN.toString()), prettify(uNL.toString())));
638                     continue;
639                 }
640             } else {
641                 if(!hasError(uNLInfo, IDNA.Error.LABEL_HAS_DOT)) {
642                     errln(String.format("N.labelToUnicode([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
643                                         i, testCase.s, uNLInfo.getErrors()));
644                     continue;
645                 }
646             }
647             if(uT.indexOf(".")<0) {
648                 if(!UTF16Plus.equal(uT, uTL) || !sameErrors(uTInfo, uTLInfo)) {
649                     errln(String.format("T.nameToUnicode([%d] %s)!=T.labelToUnicode() "+
650                                         "(errors %s vs %s) %s vs. %s",
651                                         i, testCase.s, uTInfo.getErrors().toString(), uTLInfo.getErrors().toString(),
652                                         prettify(uT.toString()), prettify(uTL.toString())));
653                     continue;
654                 }
655             } else {
656                 if(!hasError(uTLInfo, IDNA.Error.LABEL_HAS_DOT)) {
657                     errln(String.format("T.labelToUnicode([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
658                                         i, testCase.s, uTLInfo.getErrors()));
659                     continue;
660                 }
661             }
662             // Differences between transitional and nontransitional processing
663             if(mode=='B') {
664                 if( aNInfo.isTransitionalDifferent() ||
665                     aTInfo.isTransitionalDifferent() ||
666                     uNInfo.isTransitionalDifferent() ||
667                     uTInfo.isTransitionalDifferent() ||
668                     aNLInfo.isTransitionalDifferent() ||
669                     aTLInfo.isTransitionalDifferent() ||
670                     uNLInfo.isTransitionalDifferent() ||
671                     uTLInfo.isTransitionalDifferent()
672                 ) {
673                     errln(String.format("B.process([%d] %s) isTransitionalDifferent()", i, testCase.s));
674                     continue;
675                 }
676                 if( !UTF16Plus.equal(aN, aT) || !UTF16Plus.equal(uN, uT) ||
677                     !UTF16Plus.equal(aNL, aTL) || !UTF16Plus.equal(uNL, uTL) ||
678                     !sameErrors(aNInfo, aTInfo) || !sameErrors(uNInfo, uTInfo) ||
679                     !sameErrors(aNLInfo, aTLInfo) || !sameErrors(uNLInfo, uTLInfo)
680                 ) {
681                     errln(String.format("N.process([%d] %s) vs. T.process() different errors or result strings",
682                                         i, testCase.s));
683                     continue;
684                 }
685             } else {
686                 if( !aNInfo.isTransitionalDifferent() ||
687                     !aTInfo.isTransitionalDifferent() ||
688                     !uNInfo.isTransitionalDifferent() ||
689                     !uTInfo.isTransitionalDifferent() ||
690                     !aNLInfo.isTransitionalDifferent() ||
691                     !aTLInfo.isTransitionalDifferent() ||
692                     !uNLInfo.isTransitionalDifferent() ||
693                     !uTLInfo.isTransitionalDifferent()
694                 ) {
695                     errln(String.format("%s.process([%d] %s) !isTransitionalDifferent()",
696                                         testCase.o, i, testCase.s));
697                     continue;
698                 }
699                 if( UTF16Plus.equal(aN, aT) || UTF16Plus.equal(uN, uT) ||
700                     UTF16Plus.equal(aNL, aTL) || UTF16Plus.equal(uNL, uTL)
701                 ) {
702                     errln(String.format("N.process([%d] %s) vs. T.process() same result strings",
703                                         i, testCase.s));
704                     continue;
705                 }
706             }
707         }
708     }
709
710     private final IDNA trans, nontrans;
711
712     private static final EnumSet<IDNA.Error> severeErrors=EnumSet.of(
713         IDNA.Error.LEADING_COMBINING_MARK,
714         IDNA.Error.DISALLOWED,
715         IDNA.Error.PUNYCODE,
716         IDNA.Error.LABEL_HAS_DOT,
717         IDNA.Error.INVALID_ACE_LABEL);
718     private static final EnumSet<IDNA.Error> lengthErrors=EnumSet.of(
719             IDNA.Error.EMPTY_LABEL,
720             IDNA.Error.LABEL_TOO_LONG,
721             IDNA.Error.DOMAIN_NAME_TOO_LONG);
722
723     private boolean hasError(IDNA.Info info, IDNA.Error error) {
724         return info.getErrors().contains(error);
725     }
726     // assumes that certainErrors is not empty
727     private boolean hasCertainErrors(Set<IDNA.Error> errors, Set<IDNA.Error> certainErrors) {
728         return !errors.isEmpty() && !Collections.disjoint(errors, certainErrors);
729     }
730     private boolean hasCertainErrors(IDNA.Info info, Set<IDNA.Error> certainErrors) {
731         return hasCertainErrors(info.getErrors(), certainErrors);
732     }
733     private boolean sameErrors(Set<IDNA.Error> a, Set<IDNA.Error> b) {
734         return a.equals(b);
735     }
736     private boolean sameErrors(IDNA.Info a, IDNA.Info b) {
737         return sameErrors(a.getErrors(), b.getErrors());
738     }
739     private boolean sameErrors(IDNA.Info a, Set<IDNA.Error> b) {
740         return sameErrors(a.getErrors(), b);
741     }
742
743     private static boolean
744     isASCII(CharSequence str) {
745         int length=str.length();
746         for(int i=0; i<length; ++i) {
747             if(str.charAt(i)>=0x80) {
748                 return false;
749             }
750         }
751         return true;
752     }
753 }