]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_2_1-src/src/com/ibm/icu/dev/test/translit/WriteCharts.java
icu4jsrc
[Dictionary.git] / jars / icu4j-4_2_1-src / src / com / ibm / icu / dev / test / translit / WriteCharts.java
/*
 *******************************************************************************
 * Copyright (C) 1996-2006, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
7  \r
8 package com.ibm.icu.dev.test.translit;\r
9 import com.ibm.icu.lang.*;\r
10 import com.ibm.icu.text.*;\r
11 import java.util.*;\r
12 import java.io.*;\r
13 \r
14 public class WriteCharts {\r
15     public static void main(String[] args) throws IOException {\r
16         if (false) {\r
17             printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");\r
18         }\r
19         String testSet = "";\r
20         if (args.length == 0) args = getAllScripts();\r
21         for (int i = 0; i < args.length; ++i) {\r
22     // Enumeration enum = Transliterator.getAvailableIDs();\r
23             if (args[i].startsWith("[")) {\r
24                 testSet = args[i];\r
25             } else {\r
26                 print(testSet, args[i]);\r
27                 testSet = "";\r
28             }\r
29         }\r
30     }\r
31     \r
32     public static void printSet(String source) {\r
33         UnicodeSet s = new UnicodeSet(source);\r
34         System.out.println("Printout for '" + source + "'");\r
35         int count = s.getRangeCount();\r
36         for (int i = 0; i < count; ++i) {\r
37             int start = s.getRangeStart(i);\r
38             int end = s.getRangeEnd(i);\r
39             System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16));\r
40         }\r
41     }\r
42     \r
43     public static String[] getAllScripts() {\r
44         Set set = new TreeSet();\r
45         int scripts[];\r
46         Enumeration sources = Transliterator.getAvailableSources();\r
47         while(sources.hasMoreElements()) {\r
48             String source = (String) sources.nextElement();\r
49             scripts = UScript.getCode(source);\r
50             if (scripts == null) {\r
51                 System.out.println("[Skipping " + source + "]");\r
52                 continue;\r
53             }\r
54             int sourceScript = scripts[0];\r
55             System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));\r
56             Enumeration targets = Transliterator.getAvailableTargets(source);\r
57             while(targets.hasMoreElements()) {\r
58                 String target = (String) targets.nextElement();\r
59                 scripts = UScript.getCode(target);\r
60                 if (scripts == null\r
61                         || priority(scripts[0]) < priority(sourceScript)) {\r
62                     // skip doing both directions\r
63                     System.out.println("[Skipping '" + source + "-" + target + "']");\r
64                     continue;\r
65                 }\r
66                 System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));\r
67                 Enumeration variants = Transliterator.getAvailableVariants(source, target);\r
68                 while(variants.hasMoreElements()) {\r
69                     String variant = (String) variants.nextElement();\r
70                     String id = source + "-" + target;\r
71                     if (variant.length() != 0) {\r
72                         id += "/" + variant;\r
73                         if (false) {\r
74                             System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);\r
75                             continue;\r
76                         }\r
77                     }\r
78                     System.out.println("\t\t\t\tAdding: '" + id + "'");\r
79                     set.add(id);\r
80                 }\r
81             }\r
82         }\r
83         String[] results = new String[set.size()];\r
84         set.toArray(results);\r
85         return results;\r
86     }\r
87     \r
88     static public int priority(int script) {\r
89         if (script == UScript.LATIN) return -2;\r
90         return script;\r
91     }\r
92     \r
93     public static String showScripts(int[] scripts) {\r
94         StringBuffer results = new StringBuffer();\r
95         for (int i = 0; i < scripts.length; ++i) {\r
96             if (i != 0) results.append(", ");\r
97             results.append(UScript.getName(scripts[i]));\r
98         }\r
99         return results.toString();\r
100     }\r
101     \r
102     public static void print(String testSet, String rawId) throws IOException {\r
103         System.out.println("Processing " + rawId);\r
104         Transliterator t = Transliterator.getInstance(rawId);\r
105         String id = t.getID();\r
106         \r
107         // clean up IDs. Ought to be API for getting source, target, variant\r
108         int minusPos = id.indexOf('-');\r
109         String source = id.substring(0,minusPos);\r
110         String target = id.substring(minusPos+1);\r
111         int slashPos = target.indexOf('/');\r
112         if (slashPos >= 0) target = target.substring(0,slashPos);\r
113         \r
114         // check that the source is a script\r
115         if (testSet.equals("")) {\r
116             int[] scripts = UScript.getCode(source);\r
117             if (scripts == null) {\r
118                 System.out.println("FAILED: " \r
119                     + Transliterator.getDisplayName(id)\r
120                     + " does not have a script as the source");\r
121                 return;\r
122             } else {\r
123                 testSet = "[:" + source + ":]";\r
124                 if (source.equalsIgnoreCase("katakana")) {\r
125                     testSet = "[" + testSet + "\u30FC]";\r
126                     printSet(testSet);\r
127                 }\r
128             }\r
129         }\r
130         UnicodeSet sourceSet = new UnicodeSet(testSet);\r
131 \r
132         // check that the target is a script\r
133         int[] scripts = UScript.getCode(target);\r
134         if (scripts == null) {\r
135             target = "[:Latin:]";\r
136         } else {\r
137             target = "[:" + target + ":]";\r
138         }\r
139         UnicodeSet targetSet = new UnicodeSet(target);        \r
140         \r
141         Transliterator inverse = t.getInverse();\r
142         \r
143         //Transliterator hex = Transliterator.getInstance("Any-Hex");\r
144         \r
145                 \r
146         // iterate through script\r
147         System.out.println("Transliterating " + sourceSet.toPattern(true) \r
148             + " with " + Transliterator.getDisplayName(id));\r
149                 \r
150         UnicodeSet leftOverSet = new UnicodeSet(targetSet);\r
151         UnicodeSet privateUse = new UnicodeSet("[:private use:]");\r
152             \r
153         Map map = new TreeMap();\r
154         \r
155         UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet);\r
156         targetSetPlusAnyways.addAll(okAnyway);\r
157         \r
158         UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet);\r
159         sourceSetPlusAnyways.addAll(okAnyway);\r
160         \r
161         UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet);\r
162         \r
163         while (usi.next()) {\r
164             int j = usi.codepoint;\r
165             /*\r
166         int count = sourceSet.getRangeCount();\r
167         for (int i = 0; i < count; ++i) {\r
168             int end = sourceSet.getRangeEnd(i);\r
169             for (int j = sourceSet.getRangeStart(i); j <= end; ++j) {\r
170             */\r
171                // String flag = "";\r
172                 String ss = UTF16.valueOf(j);\r
173                 String ts = t.transliterate(ss);\r
174                 char group = 0;\r
175                 if (!targetSetPlusAnyways.containsAll(ts)) {\r
176                     group |= 1;\r
177                 }\r
178                 if (UTF16.countCodePoint(ts) == 1) {\r
179                     leftOverSet.remove(UTF16.charAt(ts,0));\r
180                 }\r
181                 String rt = inverse.transliterate(ts);\r
182                 if (!sourceSetPlusAnyways.containsAll(rt)) {\r
183                     group |= 2;\r
184                 } else if (!ss.equals(rt)) {\r
185                     group |= 4;\r
186                 }\r
187                 \r
188                 if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) {\r
189                     group |= 16;\r
190                 }\r
191                     \r
192                 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))\r
193                         + "\u0000" + ss, \r
194                     "<td class='s'>" + ss + "<br><tt>" + hex(ss)\r
195                         + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)\r
196                         + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );\r
197                 \r
198                 // Check Duals\r
199                 /*\r
200                 int maxDual = 200;\r
201               dual:\r
202                 for (int i2 = 0; i2 < count; ++i2) {\r
203                     int end2 = sourceSet.getRangeEnd(i2);\r
204                     for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {\r
205                         String ss2 = UTF16.valueOf(j2);\r
206                         String ts2 = t.transliterate(ss2);\r
207                         String rt2 = inverse.transliterate(ts2);\r
208                         \r
209                         String ss12 = ss + ss2;\r
210                         String ts12 = t.transliterate(ss + ss12);\r
211                         String rt12 = inverse.transliterate(ts12);\r
212                         if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;   \r
213                         if (--maxDual < 0) break dual;\r
214                         \r
215                         // transliteration of whole differs from that of parts\r
216                         group = 0x100;\r
217                         map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))\r
218                                 + "\u0000" + ss12, \r
219                             "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)\r
220                                 + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)\r
221                                 + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );\r
222                     }\r
223                 }\r
224                 */\r
225             //}\r
226         }\r
227         \r
228         \r
229         leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA\r
230         \r
231         /*int count = leftOverSet.getRangeCount();\r
232         for (int i = 0; i < count; ++i) {\r
233             int end = leftOverSet.getRangeEnd(i);\r
234             for (int j = leftOverSet.getRangeStart(i); j <= end; ++j) {\r
235             */\r
236             \r
237         usi.reset(leftOverSet);\r
238         while (usi.next()) {\r
239             int j = usi.codepoint;\r
240             \r
241                 String ts = UTF16.valueOf(j);\r
242                 // String decomp = Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0);\r
243                 // if (!decomp.equals(ts)) continue;\r
244                 \r
245                 String rt = inverse.transliterate(ts);\r
246                 // String flag = "";\r
247                 char group = 0x80;\r
248                     \r
249                 if (!sourceSetPlusAnyways.containsAll(rt)) {\r
250                     group |= 8;\r
251                 }\r
252                 if (!privateUse.containsNone(rt)) {\r
253                     group |= 16;\r
254                 }\r
255                     \r
256                 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts, \r
257                     "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)\r
258                     + "</tt></td><td class='r'>"\r
259                     + rt + "<br><tt>" + hex(rt) + "</tt></td>");\r
260             //}\r
261         }\r
262 \r
263         // make file name and open\r
264         File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");\r
265         String filename = f.getCanonicalFile().toString();\r
266         PrintWriter out = new PrintWriter(\r
267             new OutputStreamWriter(\r
268                 new FileOutputStream(filename), "UTF-8"));\r
269         //out.print('\uFEFF'); // BOM\r
270         \r
271         System.out.println("Writing " + filename);\r
272         \r
273         try {\r
274             out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");\r
275             out.println("<HTML><HEAD>");\r
276             out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");\r
277             out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");\r
278             \r
279             out.println("<BODY>");\r
280             out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");\r
281             out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");\r
282             out.println("The samples are mechanically generated, and only include single characters");\r
283             out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");\r
284             out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");\r
285             out.println("<a href='http://demo.icu-project.org/icu-bin/translit'>Transliteration Demo</a></p><hr>");\r
286             \r
287             // set up the headers\r
288             int columnCount = 3;\r
289             String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";\r
290             String headers = headerBase;\r
291             for (int i = columnCount - 1; i > 0; --i) {\r
292                 if (i != columnCount - 1) headers += "<th>&nbsp;</th>";\r
293                 headers += headerBase;\r
294             }\r
295             \r
296             String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";\r
297             String tableFooter = "</table></p>";\r
298             out.println("<h2>Round Trip</h2>");\r
299             out.println(tableHeader);\r
300             \r
301             Iterator it = map.keySet().iterator();\r
302             char lastGroup = 0;\r
303             int count = 0;\r
304             int column = 0;\r
305             while (it.hasNext()) {\r
306                 String key = (String) it.next();\r
307                 char group = key.charAt(0);\r
308                 if (group != lastGroup || count++ > 50) {\r
309                     lastGroup = group;\r
310                     count = 0;\r
311                     if (column != 0) {\r
312                         out.println("</tr>");\r
313                         column = 0;\r
314                     }\r
315                     out.println(tableFooter);\r
316                     \r
317                     // String title = "";\r
318                     if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");\r
319                     else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");\r
320                     else out.println("<hr><h2>Round Trip</h2>");\r
321                     if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");\r
322                     if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");\r
323                     if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");\r
324                     if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");\r
325                     if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");\r
326                                         \r
327                     out.println(tableHeader);\r
328                     column = 0;\r
329                 }\r
330                 String value = (String) map.get(key);\r
331                 if (column++ == 0) out.print("<tr>");\r
332                 else out.print("<th>&nbsp;</th>");\r
333                 out.println(value);\r
334                 if (column == 3) {\r
335                     out.println("</tr>");\r
336                     column = 0;\r
337                 }\r
338             }\r
339             if (column != 0) {\r
340                 out.println("</tr>");\r
341                 column = 0;\r
342             }\r
343             out.println(tableFooter + "</BODY></HTML>");\r
344             \r
345         } finally {\r
346             out.close();\r
347         }\r
348     }\r
349     \r
350     public static String hex(String s) {\r
351         int cp;\r
352         StringBuffer results = new StringBuffer();\r
353         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {\r
354             cp = UTF16.charAt(s, i);\r
355             if (i != 0) results.append(' ');\r
356             results.append(Integer.toHexString(cp));\r
357         }\r
358         return results.toString().toUpperCase();\r
359     }\r
360     \r
361     static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");\r
362     \r
363     /*\r
364     // tests whether a string is in a set. Also checks for Common and Inherited\r
365     public static boolean isIn(String s, UnicodeSet set) {\r
366         int cp;\r
367         for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {\r
368             cp = UTF16.charAt(s, i);\r
369             if (set.contains(cp)) continue;\r
370             if (okAnyway.contains(cp)) continue;\r
371             return false;\r
372         }\r
373         return true;\r
374     }\r
375     */\r
376     \r
377 }\r
378