]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/tests/translit/src/com/ibm/icu/dev/test/translit/WriteCharts.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / tests / translit / src / com / ibm / icu / dev / test / translit / WriteCharts.java
1 /*\r
2  *******************************************************************************\r
3  * Copyright (C) 1996-2010, International Business Machines Corporation and    *\r
4  * others. All Rights Reserved.                                                *\r
5  *******************************************************************************\r
6  */\r
7  \r
8 package com.ibm.icu.dev.test.translit;\r
9 import java.io.File;\r
10 import java.io.FileOutputStream;\r
11 import java.io.IOException;\r
12 import java.io.OutputStreamWriter;\r
13 import java.io.PrintWriter;\r
14 import java.util.Enumeration;\r
15 import java.util.Iterator;\r
16 import java.util.Map;\r
17 import java.util.Set;\r
18 import java.util.TreeMap;\r
19 import java.util.TreeSet;\r
20 \r
21 import com.ibm.icu.lang.UCharacter;\r
22 import com.ibm.icu.lang.UScript;\r
23 import com.ibm.icu.text.Normalizer;\r
24 import com.ibm.icu.text.Transliterator;\r
25 import com.ibm.icu.text.UTF16;\r
26 import com.ibm.icu.text.UnicodeSet;\r
27 import com.ibm.icu.text.UnicodeSetIterator;\r
28 \r
29 public class WriteCharts {\r
30     public static void main(String[] args) throws IOException {\r
31         if (false) {\r
32             printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");\r
33         }\r
34         String testSet = "";\r
35         if (args.length == 0) args = getAllScripts();\r
36         for (int i = 0; i < args.length; ++i) {\r
37     // Enumeration enum = Transliterator.getAvailableIDs();\r
38             if (args[i].startsWith("[")) {\r
39                 testSet = args[i];\r
40             } else {\r
41                 print(testSet, args[i]);\r
42                 testSet = "";\r
43             }\r
44         }\r
45     }\r
46     \r
47     public static void printSet(String source) {\r
48         UnicodeSet s = new UnicodeSet(source);\r
49         System.out.println("Printout for '" + source + "'");\r
50         int count = s.getRangeCount();\r
51         for (int i = 0; i < count; ++i) {\r
52             int start = s.getRangeStart(i);\r
53             int end = s.getRangeEnd(i);\r
54             System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16));\r
55         }\r
56     }\r
57     \r
58     public static String[] getAllScripts() {\r
59         Set set = new TreeSet();\r
60         int scripts[];\r
61         Enumeration sources = Transliterator.getAvailableSources();\r
62         while(sources.hasMoreElements()) {\r
63             String source = (String) sources.nextElement();\r
64             scripts = UScript.getCode(source);\r
65             if (scripts == null) {\r
66                 System.out.println("[Skipping " + source + "]");\r
67                 continue;\r
68             }\r
69             int sourceScript = scripts[0];\r
70             System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));\r
71             Enumeration targets = Transliterator.getAvailableTargets(source);\r
72             while(targets.hasMoreElements()) {\r
73                 String target = (String) targets.nextElement();\r
74                 scripts = UScript.getCode(target);\r
75                 if (scripts == null\r
76                         || priority(scripts[0]) < priority(sourceScript)) {\r
77                     // skip doing both directions\r
78                     System.out.println("[Skipping '" + source + "-" + target + "']");\r
79                     continue;\r
80                 }\r
81                 System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));\r
82                 Enumeration variants = Transliterator.getAvailableVariants(source, target);\r
83                 while(variants.hasMoreElements()) {\r
84                     String variant = (String) variants.nextElement();\r
85                     String id = source + "-" + target;\r
86                     if (variant.length() != 0) {\r
87                         id += "/" + variant;\r
88                         if (false) {\r
89                             System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);\r
90                             continue;\r
91                         }\r
92                     }\r
93                     System.out.println("\t\t\t\tAdding: '" + id + "'");\r
94                     set.add(id);\r
95                 }\r
96             }\r
97         }\r
98         String[] results = new String[set.size()];\r
99         set.toArray(results);\r
100         return results;\r
101     }\r
102     \r
103     static public int priority(int script) {\r
104         if (script == UScript.LATIN) return -2;\r
105         return script;\r
106     }\r
107     \r
108     public static String showScripts(int[] scripts) {\r
109         StringBuffer results = new StringBuffer();\r
110         for (int i = 0; i < scripts.length; ++i) {\r
111             if (i != 0) results.append(", ");\r
112             results.append(UScript.getName(scripts[i]));\r
113         }\r
114         return results.toString();\r
115     }\r
116     \r
117     public static void print(String testSet, String rawId) throws IOException {\r
118         System.out.println("Processing " + rawId);\r
119         Transliterator t = Transliterator.getInstance(rawId);\r
120         String id = t.getID();\r
121         \r
122         // clean up IDs. Ought to be API for getting source, target, variant\r
123         int minusPos = id.indexOf('-');\r
124         String source = id.substring(0,minusPos);\r
125         String target = id.substring(minusPos+1);\r
126         int slashPos = target.indexOf('/');\r
127         if (slashPos >= 0) target = target.substring(0,slashPos);\r
128         \r
129         // check that the source is a script\r
130         if (testSet.equals("")) {\r
131             int[] scripts = UScript.getCode(source);\r
132             if (scripts == null) {\r
133                 System.out.println("FAILED: " \r
134                     + Transliterator.getDisplayName(id)\r
135                     + " does not have a script as the source");\r
136                 return;\r
137             } else {\r
138                 testSet = "[:" + source + ":]";\r
139                 if (source.equalsIgnoreCase("katakana")) {\r
140                     testSet = "[" + testSet + "\u30FC]";\r
141                     printSet(testSet);\r
142                 }\r
143             }\r
144         }\r
145         UnicodeSet sourceSet = new UnicodeSet(testSet);\r
146 \r
147         // check that the target is a script\r
148         int[] scripts = UScript.getCode(target);\r
149         if (scripts == null) {\r
150             target = "[:Latin:]";\r
151         } else {\r
152             target = "[:" + target + ":]";\r
153         }\r
154         UnicodeSet targetSet = new UnicodeSet(target);        \r
155         \r
156         Transliterator inverse = t.getInverse();\r
157         \r
158         //Transliterator hex = Transliterator.getInstance("Any-Hex");\r
159         \r
160                 \r
161         // iterate through script\r
162         System.out.println("Transliterating " + sourceSet.toPattern(true) \r
163             + " with " + Transliterator.getDisplayName(id));\r
164                 \r
165         UnicodeSet leftOverSet = new UnicodeSet(targetSet);\r
166         UnicodeSet privateUse = new UnicodeSet("[:private use:]");\r
167             \r
168         Map map = new TreeMap();\r
169         \r
170         UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet);\r
171         targetSetPlusAnyways.addAll(okAnyway);\r
172         \r
173         UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet);\r
174         sourceSetPlusAnyways.addAll(okAnyway);\r
175         \r
176         UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet);\r
177         \r
178         while (usi.next()) {\r
179             int j = usi.codepoint;\r
180             /*\r
181         int count = sourceSet.getRangeCount();\r
182         for (int i = 0; i < count; ++i) {\r
183             int end = sourceSet.getRangeEnd(i);\r
184             for (int j = sourceSet.getRangeStart(i); j <= end; ++j) {\r
185             */\r
186                // String flag = "";\r
187                 String ss = UTF16.valueOf(j);\r
188                 String ts = t.transliterate(ss);\r
189                 char group = 0;\r
190                 if (!targetSetPlusAnyways.containsAll(ts)) {\r
191                     group |= 1;\r
192                 }\r
193                 if (UTF16.countCodePoint(ts) == 1) {\r
194                     leftOverSet.remove(UTF16.charAt(ts,0));\r
195                 }\r
196                 String rt = inverse.transliterate(ts);\r
197                 if (!sourceSetPlusAnyways.containsAll(rt)) {\r
198                     group |= 2;\r
199                 } else if (!ss.equals(rt)) {\r
200                     group |= 4;\r
201                 }\r
202                 \r
203                 if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) {\r
204                     group |= 16;\r
205                 }\r
206                     \r
207                 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))\r
208                         + "\u0000" + ss, \r
209                     "<td class='s'>" + ss + "<br><tt>" + hex(ss)\r
210                         + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)\r
211                         + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );\r
212                 \r
213                 // Check Duals\r
214                 /*\r
215                 int maxDual = 200;\r
216               dual:\r
217                 for (int i2 = 0; i2 < count; ++i2) {\r
218                     int end2 = sourceSet.getRangeEnd(i2);\r
219                     for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {\r
220                         String ss2 = UTF16.valueOf(j2);\r
221                         String ts2 = t.transliterate(ss2);\r
222                         String rt2 = inverse.transliterate(ts2);\r
223                         \r
224                         String ss12 = ss + ss2;\r
225                         String ts12 = t.transliterate(ss + ss12);\r
226                         String rt12 = inverse.transliterate(ts12);\r
227                         if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;   \r
228                         if (--maxDual < 0) break dual;\r
229                         \r
230                         // transliteration of whole differs from that of parts\r
231                         group = 0x100;\r
232                         map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))\r
233                                 + "\u0000" + ss12, \r
234                             "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)\r
235                                 + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)\r
236                                 + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );\r
237                     }\r
238                 }\r
239                 */\r
240             //}\r
241         }\r
242         \r
243         \r
244         leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA\r
245         \r
246         /*int count = leftOverSet.getRangeCount();\r
247         for (int i = 0; i < count; ++i) {\r
248             int end = leftOverSet.getRangeEnd(i);\r
249             for (int j = leftOverSet.getRangeStart(i); j <= end; ++j) {\r
250             */\r
251             \r
252         usi.reset(leftOverSet);\r
253         while (usi.next()) {\r
254             int j = usi.codepoint;\r
255             \r
256                 String ts = UTF16.valueOf(j);\r
257                 // String decomp = Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0);\r
258                 // if (!decomp.equals(ts)) continue;\r
259                 \r
260                 String rt = inverse.transliterate(ts);\r
261                 // String flag = "";\r
262                 char group = 0x80;\r
263                     \r
264                 if (!sourceSetPlusAnyways.containsAll(rt)) {\r
265                     group |= 8;\r
266                 }\r
267                 if (!privateUse.containsNone(rt)) {\r
268                     group |= 16;\r
269                 }\r
270                     \r
271                 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts, \r
272                     "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)\r
273                     + "</tt></td><td class='r'>"\r
274                     + rt + "<br><tt>" + hex(rt) + "</tt></td>");\r
275             //}\r
276         }\r
277 \r
278         // make file name and open\r
279         File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");\r
280         String filename = f.getCanonicalFile().toString();\r
281         PrintWriter out = new PrintWriter(\r
282             new OutputStreamWriter(\r
283                 new FileOutputStream(filename), "UTF-8"));\r
284         //out.print('\uFEFF'); // BOM\r
285         \r
286         System.out.println("Writing " + filename);\r
287         \r
288         try {\r
289             out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");\r
290             out.println("<HTML><HEAD>");\r
291             out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");\r
292             out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");\r
293             \r
294             out.println("<BODY>");\r
295             out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");\r
296             out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");\r
297             out.println("The samples are mechanically generated, and only include single characters");\r
298             out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");\r
299             out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");\r
300             out.println("<a href='http://demo.icu-project.org/icu-bin/translit'>Transliteration Demo</a></p><hr>");\r
301             \r
302             // set up the headers\r
303             int columnCount = 3;\r
304             String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";\r
305             String headers = headerBase;\r
306             for (int i = columnCount - 1; i > 0; --i) {\r
307                 if (i != columnCount - 1) headers += "<th>&nbsp;</th>";\r
308                 headers += headerBase;\r
309             }\r
310             \r
311             String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";\r
312             String tableFooter = "</table></p>";\r
313             out.println("<h2>Round Trip</h2>");\r
314             out.println(tableHeader);\r
315             \r
316             Iterator it = map.keySet().iterator();\r
317             char lastGroup = 0;\r
318             int count = 0;\r
319             int column = 0;\r
320             while (it.hasNext()) {\r
321                 String key = (String) it.next();\r
322                 char group = key.charAt(0);\r
323                 if (group != lastGroup || count++ > 50) {\r
324                     lastGroup = group;\r
325                     count = 0;\r
326                     if (column != 0) {\r
327                         out.println("</tr>");\r
328                         column = 0;\r
329                     }\r
330                     out.println(tableFooter);\r
331                     \r
332                     // String title = "";\r
333                     if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");\r
334                     else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");\r
335                     else out.println("<hr><h2>Round Trip</h2>");\r
336                     if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");\r
337                     if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");\r
338                     if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");\r
339                     if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");\r
340                     if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");\r
341                                         \r
342                     out.println(tableHeader);\r
343                     column = 0;\r
344                 }\r
345                 String value = (String) map.get(key);\r
346                 if (column++ == 0) out.print("<tr>");\r
347                 else out.print("<th>&nbsp;</th>");\r
348                 out.println(value);\r
349                 if (column == 3) {\r
350                     out.println("</tr>");\r
351                     column = 0;\r
352                 }\r
353             }\r
354             if (column != 0) {\r
355                 out.println("</tr>");\r
356                 column = 0;\r
357             }\r
358             out.println(tableFooter + "</BODY></HTML>");\r
359             \r
360         } finally {\r
361             out.close();\r
362         }\r
363     }\r
364     \r
365     public static String hex(String s) {\r
366         int cp;\r
367         StringBuffer results = new StringBuffer();\r
368         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {\r
369             cp = UTF16.charAt(s, i);\r
370             if (i != 0) results.append(' ');\r
371             results.append(Integer.toHexString(cp));\r
372         }\r
373         return results.toString().toUpperCase();\r
374     }\r
375     \r
376     static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");\r
377     \r
378     /*\r
379     // tests whether a string is in a set. Also checks for Common and Inherited\r
380     public static boolean isIn(String s, UnicodeSet set) {\r
381         int cp;\r
382         for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {\r
383             cp = UTF16.charAt(s, i);\r
384             if (set.contains(cp)) continue;\r
385             if (okAnyway.contains(cp)) continue;\r
386             return false;\r
387         }\r
388         return true;\r
389     }\r
390     */\r
391     \r
392 }\r
393