2 *******************************************************************************
\r
3 * Copyright (C) 1996-2006, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
8 package com.ibm.icu.dev.test.translit;
\r
9 import com.ibm.icu.lang.*;
\r
10 import com.ibm.icu.text.*;
\r
14 public class WriteCharts {
\r
15 public static void main(String[] args) throws IOException {
\r
17 printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
\r
19 String testSet = "";
\r
20 if (args.length == 0) args = getAllScripts();
\r
21 for (int i = 0; i < args.length; ++i) {
\r
22 // Enumeration enum = Transliterator.getAvailableIDs();
\r
23 if (args[i].startsWith("[")) {
\r
26 print(testSet, args[i]);
\r
32 public static void printSet(String source) {
\r
33 UnicodeSet s = new UnicodeSet(source);
\r
34 System.out.println("Printout for '" + source + "'");
\r
35 int count = s.getRangeCount();
\r
36 for (int i = 0; i < count; ++i) {
\r
37 int start = s.getRangeStart(i);
\r
38 int end = s.getRangeEnd(i);
\r
39 System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16));
\r
43 public static String[] getAllScripts() {
\r
44 Set set = new TreeSet();
\r
46 Enumeration sources = Transliterator.getAvailableSources();
\r
47 while(sources.hasMoreElements()) {
\r
48 String source = (String) sources.nextElement();
\r
49 scripts = UScript.getCode(source);
\r
50 if (scripts == null) {
\r
51 System.out.println("[Skipping " + source + "]");
\r
54 int sourceScript = scripts[0];
\r
55 System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));
\r
56 Enumeration targets = Transliterator.getAvailableTargets(source);
\r
57 while(targets.hasMoreElements()) {
\r
58 String target = (String) targets.nextElement();
\r
59 scripts = UScript.getCode(target);
\r
61 || priority(scripts[0]) < priority(sourceScript)) {
\r
62 // skip doing both directions
\r
63 System.out.println("[Skipping '" + source + "-" + target + "']");
\r
66 System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));
\r
67 Enumeration variants = Transliterator.getAvailableVariants(source, target);
\r
68 while(variants.hasMoreElements()) {
\r
69 String variant = (String) variants.nextElement();
\r
70 String id = source + "-" + target;
\r
71 if (variant.length() != 0) {
\r
72 id += "/" + variant;
\r
74 System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);
\r
78 System.out.println("\t\t\t\tAdding: '" + id + "'");
\r
83 String[] results = new String[set.size()];
\r
84 set.toArray(results);
\r
88 static public int priority(int script) {
\r
89 if (script == UScript.LATIN) return -2;
\r
93 public static String showScripts(int[] scripts) {
\r
94 StringBuffer results = new StringBuffer();
\r
95 for (int i = 0; i < scripts.length; ++i) {
\r
96 if (i != 0) results.append(", ");
\r
97 results.append(UScript.getName(scripts[i]));
\r
99 return results.toString();
\r
102 public static void print(String testSet, String rawId) throws IOException {
\r
103 System.out.println("Processing " + rawId);
\r
104 Transliterator t = Transliterator.getInstance(rawId);
\r
105 String id = t.getID();
\r
107 // clean up IDs. Ought to be API for getting source, target, variant
\r
108 int minusPos = id.indexOf('-');
\r
109 String source = id.substring(0,minusPos);
\r
110 String target = id.substring(minusPos+1);
\r
111 int slashPos = target.indexOf('/');
\r
112 if (slashPos >= 0) target = target.substring(0,slashPos);
\r
114 // check that the source is a script
\r
115 if (testSet.equals("")) {
\r
116 int[] scripts = UScript.getCode(source);
\r
117 if (scripts == null) {
\r
118 System.out.println("FAILED: "
\r
119 + Transliterator.getDisplayName(id)
\r
120 + " does not have a script as the source");
\r
123 testSet = "[:" + source + ":]";
\r
124 if (source.equalsIgnoreCase("katakana")) {
\r
125 testSet = "[" + testSet + "\u30FC]";
\r
130 UnicodeSet sourceSet = new UnicodeSet(testSet);
\r
132 // check that the target is a script
\r
133 int[] scripts = UScript.getCode(target);
\r
134 if (scripts == null) {
\r
135 target = "[:Latin:]";
\r
137 target = "[:" + target + ":]";
\r
139 UnicodeSet targetSet = new UnicodeSet(target);
\r
141 Transliterator inverse = t.getInverse();
\r
143 //Transliterator hex = Transliterator.getInstance("Any-Hex");
\r
146 // iterate through script
\r
147 System.out.println("Transliterating " + sourceSet.toPattern(true)
\r
148 + " with " + Transliterator.getDisplayName(id));
\r
150 UnicodeSet leftOverSet = new UnicodeSet(targetSet);
\r
151 UnicodeSet privateUse = new UnicodeSet("[:private use:]");
\r
153 Map map = new TreeMap();
\r
155 UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet);
\r
156 targetSetPlusAnyways.addAll(okAnyway);
\r
158 UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet);
\r
159 sourceSetPlusAnyways.addAll(okAnyway);
\r
161 UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet);
\r
163 while (usi.next()) {
\r
164 int j = usi.codepoint;
\r
166 int count = sourceSet.getRangeCount();
\r
167 for (int i = 0; i < count; ++i) {
\r
168 int end = sourceSet.getRangeEnd(i);
\r
169 for (int j = sourceSet.getRangeStart(i); j <= end; ++j) {
\r
171 // String flag = "";
\r
172 String ss = UTF16.valueOf(j);
\r
173 String ts = t.transliterate(ss);
\r
175 if (!targetSetPlusAnyways.containsAll(ts)) {
\r
178 if (UTF16.countCodePoint(ts) == 1) {
\r
179 leftOverSet.remove(UTF16.charAt(ts,0));
\r
181 String rt = inverse.transliterate(ts);
\r
182 if (!sourceSetPlusAnyways.containsAll(rt)) {
\r
184 } else if (!ss.equals(rt)) {
\r
188 if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) {
\r
192 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))
\r
194 "<td class='s'>" + ss + "<br><tt>" + hex(ss)
\r
195 + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
\r
196 + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );
\r
202 for (int i2 = 0; i2 < count; ++i2) {
\r
203 int end2 = sourceSet.getRangeEnd(i2);
\r
204 for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
\r
205 String ss2 = UTF16.valueOf(j2);
\r
206 String ts2 = t.transliterate(ss2);
\r
207 String rt2 = inverse.transliterate(ts2);
\r
209 String ss12 = ss + ss2;
\r
210 String ts12 = t.transliterate(ss + ss12);
\r
211 String rt12 = inverse.transliterate(ts12);
\r
212 if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;
\r
213 if (--maxDual < 0) break dual;
\r
215 // transliteration of whole differs from that of parts
\r
217 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
\r
218 + "\u0000" + ss12,
\r
219 "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
\r
220 + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
\r
221 + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
\r
229 leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA
\r
231 /*int count = leftOverSet.getRangeCount();
\r
232 for (int i = 0; i < count; ++i) {
\r
233 int end = leftOverSet.getRangeEnd(i);
\r
234 for (int j = leftOverSet.getRangeStart(i); j <= end; ++j) {
\r
237 usi.reset(leftOverSet);
\r
238 while (usi.next()) {
\r
239 int j = usi.codepoint;
\r
241 String ts = UTF16.valueOf(j);
\r
242 // String decomp = Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0);
\r
243 // if (!decomp.equals(ts)) continue;
\r
245 String rt = inverse.transliterate(ts);
\r
246 // String flag = "";
\r
249 if (!sourceSetPlusAnyways.containsAll(rt)) {
\r
252 if (!privateUse.containsNone(rt)) {
\r
256 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts,
\r
257 "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
\r
258 + "</tt></td><td class='r'>"
\r
259 + rt + "<br><tt>" + hex(rt) + "</tt></td>");
\r
263 // make file name and open
\r
264 File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
\r
265 String filename = f.getCanonicalFile().toString();
\r
266 PrintWriter out = new PrintWriter(
\r
267 new OutputStreamWriter(
\r
268 new FileOutputStream(filename), "UTF-8"));
\r
269 //out.print('\uFEFF'); // BOM
\r
271 System.out.println("Writing " + filename);
\r
274 out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
\r
275 out.println("<HTML><HEAD>");
\r
276 out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
\r
277 out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
\r
279 out.println("<BODY>");
\r
280 out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");
\r
281 out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");
\r
282 out.println("The samples are mechanically generated, and only include single characters");
\r
283 out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
\r
284 out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
\r
285 out.println("<a href='http://demo.icu-project.org/icu-bin/translit'>Transliteration Demo</a></p><hr>");
\r
287 // set up the headers
\r
288 int columnCount = 3;
\r
289 String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
\r
290 String headers = headerBase;
\r
291 for (int i = columnCount - 1; i > 0; --i) {
\r
292 if (i != columnCount - 1) headers += "<th> </th>";
\r
293 headers += headerBase;
\r
296 String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
\r
297 String tableFooter = "</table></p>";
\r
298 out.println("<h2>Round Trip</h2>");
\r
299 out.println(tableHeader);
\r
301 Iterator it = map.keySet().iterator();
\r
302 char lastGroup = 0;
\r
305 while (it.hasNext()) {
\r
306 String key = (String) it.next();
\r
307 char group = key.charAt(0);
\r
308 if (group != lastGroup || count++ > 50) {
\r
312 out.println("</tr>");
\r
315 out.println(tableFooter);
\r
317 // String title = "";
\r
318 if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");
\r
319 else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");
\r
320 else out.println("<hr><h2>Round Trip</h2>");
\r
321 if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
\r
322 if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
\r
323 if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
\r
324 if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
\r
325 if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");
\r
327 out.println(tableHeader);
\r
330 String value = (String) map.get(key);
\r
331 if (column++ == 0) out.print("<tr>");
\r
332 else out.print("<th> </th>");
\r
333 out.println(value);
\r
335 out.println("</tr>");
\r
340 out.println("</tr>");
\r
343 out.println(tableFooter + "</BODY></HTML>");
\r
350 public static String hex(String s) {
\r
352 StringBuffer results = new StringBuffer();
\r
353 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
\r
354 cp = UTF16.charAt(s, i);
\r
355 if (i != 0) results.append(' ');
\r
356 results.append(Integer.toHexString(cp));
\r
358 return results.toString().toUpperCase();
\r
361 static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
\r
364 // tests whether a string is in a set. Also checks for Common and Inherited
\r
365 public static boolean isIn(String s, UnicodeSet set) {
\r
367 for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
\r
368 cp = UTF16.charAt(s, i);
\r
369 if (set.contains(cp)) continue;
\r
370 if (okAnyway.contains(cp)) continue;
\r