2 *******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
8 package com.ibm.icu.dev.test.translit;
10 import java.io.FileOutputStream;
11 import java.io.IOException;
12 import java.io.OutputStreamWriter;
13 import java.io.PrintWriter;
14 import java.util.Enumeration;
15 import java.util.Iterator;
18 import java.util.TreeMap;
19 import java.util.TreeSet;
21 import com.ibm.icu.lang.UCharacter;
22 import com.ibm.icu.lang.UScript;
23 import com.ibm.icu.text.Normalizer;
24 import com.ibm.icu.text.Transliterator;
25 import com.ibm.icu.text.UTF16;
26 import com.ibm.icu.text.UnicodeSet;
27 import com.ibm.icu.text.UnicodeSetIterator;
29 public class WriteCharts {
/**
 * Command-line entry point for the chart writer.
 * Walks the argument list and hands each entry that does not start with
 * "[" to print() together with the current testSet. When no arguments are
 * supplied the list is taken from getAllScripts().
 *
 * NOTE(review): several lines of this method are not visible in this
 * excerpt, including the declaration of testSet and the body of the
 * "[" branch. Presumably an argument starting with "[" becomes the test
 * set for the ID that follows it -- confirm against the full file.
 *
 * @param args transliterator IDs, optionally mixed with "[...]" patterns
 * @throws IOException declared because print() is declared to throw it
 */
30 public static void main(String[] args) throws IOException {
// Dumps the ranges of a fixed ASCII / Katakana / Mark pattern via printSet().
32 printSet("[[\u0000-\u007E \u30A1-\u30FC \uFF61-\uFF9F\u3001\u3002][:Katakana:][:Mark:]]");
// No arguments: fall back to the IDs computed by getAllScripts().
35 if (args.length == 0) args = getAllScripts();
36 for (int i = 0; i < args.length; ++i) {
37 // Enumeration enum = Transliterator.getAvailableIDs();
// Entries beginning with "[" are treated differently from plain IDs.
38 if (args[i].startsWith("[")) {
// Plain ID: write its chart using the current test set.
41 print(testSet, args[i]);
47 public static void printSet(String source) {
48 UnicodeSet s = new UnicodeSet(source);
49 System.out.println("Printout for '" + source + "'");
50 int count = s.getRangeCount();
51 for (int i = 0; i < count; ++i) {
52 int start = s.getRangeStart(i);
53 int end = s.getRangeEnd(i);
54 System.out.println(Integer.toString(start,16) + ".." + Integer.toString(end,16));
/**
 * Builds the list of transliterator IDs to chart by walking every
 * source / target / variant combination that Transliterator reports as
 * available, printing a progress line for each decision.
 *
 * NOTE(review): many lines of this method are not visible in this excerpt:
 * the declaration of scripts, the first half of the target-skip condition,
 * the statements that follow each "Skipping" message, the code that puts
 * IDs into the set, and the final copy/return. Comments below describe
 * only what the visible lines do.
 *
 * @return an array sized to the collected set (filling and returning it
 *         happen on lines not shown here)
 */
58 public static String[] getAllScripts() {
// Sorted collection for the results.
59 Set set = new TreeSet();
61 Enumeration sources = Transliterator.getAvailableSources();
62 while(sources.hasMoreElements()) {
63 String source = (String) sources.nextElement();
// Look the source name up as a script; null means it is not one.
64 scripts = UScript.getCode(source);
65 if (scripts == null) {
66 System.out.println("[Skipping " + source + "]");
// The first script code of the source is compared against targets below.
69 int sourceScript = scripts[0];
70 System.out.println("Source: " + source + ";\tScripts: " + showScripts(scripts));
71 Enumeration targets = Transliterator.getAvailableTargets(source);
72 while(targets.hasMoreElements()) {
73 String target = (String) targets.nextElement();
// scripts is reused here for the target's script codes.
74 scripts = UScript.getCode(target);
// Visible half of the skip test: the target's priority is lower than the source's.
76 || priority(scripts[0]) < priority(sourceScript)) {
77 // skip doing both directions
78 System.out.println("[Skipping '" + source + "-" + target + "']");
81 System.out.println("\tTarget: " + target + ";\tScripts: " + showScripts(scripts));
82 Enumeration variants = Transliterator.getAvailableVariants(source, target);
83 while(variants.hasMoreElements()) {
84 String variant = (String) variants.nextElement();
// Base ID is "source-target"; a non-empty variant gets special handling.
85 String id = source + "-" + target;
86 if (variant.length() != 0) {
89 System.out.println("SKIPPING VARIANT, SINCE IT CURRENTLY BREAKS!\t" + id);
93 System.out.println("\t\t\t\tAdding: '" + id + "'");
// Result array has one slot per collected entry.
98 String[] results = new String[set.size()];
/**
 * Returns an ordering value for a script code; getAllScripts() compares
 * these values for a source and a target when deciding whether to skip a
 * pair. Latin is mapped to -2.
 *
 * NOTE(review): the value returned for scripts other than Latin is on a
 * line not visible in this excerpt.
 *
 * @param script a UScript code
 * @return -2 for UScript.LATIN; other results are not shown here
 */
103 static public int priority(int script) {
104 if (script == UScript.LATIN) return -2;
108 public static String showScripts(int[] scripts) {
109 StringBuffer results = new StringBuffer();
110 for (int i = 0; i < scripts.length; ++i) {
111 if (i != 0) results.append(", ");
112 results.append(UScript.getName(scripts[i]));
114 return results.toString();
/**
 * Writes an HTML chart of sample transliterations for one transliterator.
 * Visible steps: resolve the transliterator and its inverse, derive the
 * source and target UnicodeSets, transliterate source characters forward
 * and back, record a table row per character in a sorted map, run the
 * inverse over target characters that were never produced, and write
 * everything to transliteration/chart_ID.html as UTF-8.
 *
 * NOTE(review): many lines of this method are not visible in this excerpt
 * (loop headers, the statements that set the group flags, the declarations
 * of group, maxDual, columnCount, lastGroup and column, and several closing
 * braces). Comments below describe only what the visible lines do.
 *
 * @param testSet UnicodeSet pattern of source characters to sample; when
 *                empty, a pattern is built from the source script name
 * @param rawId   ID passed to Transliterator.getInstance
 * @throws IOException declared by the method; the output file is opened
 *                     with FileOutputStream below
 */
117 public static void print(String testSet, String rawId) throws IOException {
118 System.out.println("Processing " + rawId);
119 Transliterator t = Transliterator.getInstance(rawId);
120 String id = t.getID();
122 // clean up IDs. Ought to be API for getting source, target, variant
// Split "source-target" at the first '-', then drop any "/variant" suffix from the target.
123 int minusPos = id.indexOf('-');
124 String source = id.substring(0,minusPos);
125 String target = id.substring(minusPos+1);
126 int slashPos = target.indexOf('/');
127 if (slashPos >= 0) target = target.substring(0,slashPos);
129 // check that the source is a script
// Only needed when the caller did not supply its own test set.
130 if (testSet.equals("")) {
131 int[] scripts = UScript.getCode(source);
132 if (scripts == null) {
133 System.out.println("FAILED: "
134 + Transliterator.getDisplayName(id)
135 + " does not have a script as the source");
// Default test set is the script property pattern for the source name.
138 testSet = "[:" + source + ":]";
// Katakana additionally gets U+30FC added to its test set.
139 if (source.equalsIgnoreCase("katakana")) {
140 testSet = "[" + testSet + "\u30FC]";
145 UnicodeSet sourceSet = new UnicodeSet(testSet);
147 // check that the target is a script
148 int[] scripts = UScript.getCode(target);
149 if (scripts == null) {
// Targets that are not script names are checked against Latin.
150 target = "[:Latin:]";
// NOTE(review): presumably the else branch (its header is not visible here).
152 target = "[:" + target + ":]";
154 UnicodeSet targetSet = new UnicodeSet(target);
156 Transliterator inverse = t.getInverse();
158 //Transliterator hex = Transliterator.getInstance("Any-Hex");
161 // iterate through script
162 System.out.println("Transliterating " + sourceSet.toPattern(true)
163 + " with " + Transliterator.getDisplayName(id));
// Starts as the whole target set; single-code-point results are removed below.
165 UnicodeSet leftOverSet = new UnicodeSet(targetSet);
166 UnicodeSet privateUse = new UnicodeSet("[:private use:]");
// Rows keyed by sort key; TreeMap keeps them in key order for output.
168 Map map = new TreeMap();
// The containment checks below use these widened copies, which also include okAnyway.
170 UnicodeSet targetSetPlusAnyways = new UnicodeSet(targetSet);
171 targetSetPlusAnyways.addAll(okAnyway);
173 UnicodeSet sourceSetPlusAnyways = new UnicodeSet(sourceSet);
174 sourceSetPlusAnyways.addAll(okAnyway);
176 UnicodeSetIterator usi = new UnicodeSetIterator(sourceSet);
// j is the code point currently reported by the iterator.
179 int j = usi.codepoint;
// NOTE(review): the range-based loop below declares j again, so it cannot be
// live together with the iterator form above; presumably it is an older,
// commented-out variant -- confirm against the full file.
181 int count = sourceSet.getRangeCount();
182 for (int i = 0; i < count; ++i) {
183 int end = sourceSet.getRangeEnd(i);
184 for (int j = sourceSet.getRangeStart(i); j <= end; ++j) {
// ss = source character, ts = its transliteration.
187 String ss = UTF16.valueOf(j);
188 String ts = t.transliterate(ss);
// Result contains something outside the widened target set.
190 if (!targetSetPlusAnyways.containsAll(ts)) {
// A result that is a single code point counts as covered in the target set.
193 if (UTF16.countCodePoint(ts) == 1) {
194 leftOverSet.remove(UTF16.charAt(ts,0));
// rt = round trip of ts through the inverse transliterator.
196 String rt = inverse.transliterate(ts);
197 if (!sourceSetPlusAnyways.containsAll(rt)) {
199 } else if (!ss.equals(rt)) {
// Either direction produced a private-use character.
203 if (!privateUse.containsNone(ts) || !privateUse.containsNone(rt)) {
// Key begins with group followed by the lowercased NFKD form of ss; value is the three HTML cells.
207 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))
209 "<td class='s'>" + ss + "<br><tt>" + hex(ss)
210 + "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
211 + "</tt></td><td class='r'>" + rt + "<br><tt>" + hex(rt) + "</tt></td>" );
// Duals: pairs ss with every second source character ss2.
217 for (int i2 = 0; i2 < count; ++i2) {
218 int end2 = sourceSet.getRangeEnd(i2);
// NOTE(review): the bound is end, not the end2 computed just above -- looks unintended; confirm.
219 for (int j2 = sourceSet.getRangeStart(i2); j2 <= end; ++j2) {
220 String ss2 = UTF16.valueOf(j2);
221 String ts2 = t.transliterate(ss2);
222 String rt2 = inverse.transliterate(ts2);
224 String ss12 = ss + ss2;
// NOTE(review): this transliterates ss + ss12 (ss twice) while the check below compares with ts + ts2 -- confirm intent.
225 String ts12 = t.transliterate(ss + ss12);
226 String rt12 = inverse.transliterate(ts12);
// Nothing to report when the pair behaves like its two parts.
227 if (ts12.equals(ts + ts2) && rt12.equals(rt + rt2)) continue;
// Stops the duals search once the maxDual budget is used up.
228 if (--maxDual < 0) break dual;
230 // transliteration of whole differs from that of parts
232 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss12, Normalizer.DECOMP_COMPAT, 0))
234 "<td class='s'>" + ss12 + "<br><tt>" + hex(ss12)
235 + "</tt></td><td class='t'>" + ts12 + "<br><tt>" + hex(ts12)
236 + "</tt></td><td class='r'>" + rt12 + "<br><tt>" + hex(rt12) + "</tt></td>" );
244 leftOverSet.remove(0x0100,0x02FF); // remove extended & IPA
246 /*int count = leftOverSet.getRangeCount();
247 for (int i = 0; i < count; ++i) {
248 int end = leftOverSet.getRangeEnd(i);
249 for (int j = leftOverSet.getRangeStart(i); j <= end; ++j) {
// Reuse the iterator over the target characters that are still left over.
252 usi.reset(leftOverSet);
254 int j = usi.codepoint;
// Here ts is a target character and rt is what the inverse makes of it.
256 String ts = UTF16.valueOf(j);
257 // String decomp = Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0);
258 // if (!decomp.equals(ts)) continue;
260 String rt = inverse.transliterate(ts);
264 if (!sourceSetPlusAnyways.containsAll(rt)) {
267 if (!privateUse.containsNone(rt)) {
// These rows have no source character, so the source cell shows "-".
271 map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts,
272 "<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
273 + "</tt></td><td class='r'>"
274 + rt + "<br><tt>" + hex(rt) + "</tt></td>");
278 // make file name and open
// Any '/' in the ID becomes '_' in the file name.
279 File f = new File("transliteration/chart_" + id.replace('/', '_') + ".html");
280 String filename = f.getCanonicalFile().toString();
281 PrintWriter out = new PrintWriter(
282 new OutputStreamWriter(
283 new FileOutputStream(filename), "UTF-8"));
284 //out.print('\uFEFF'); // BOM
286 System.out.println("Writing " + filename);
// Fixed page preamble: doctype, charset meta, stylesheet link, title and explanatory paragraph.
289 out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
290 out.println("<HTML><HEAD>");
291 out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
292 out.println("<link rel='stylesheet' href='http://www.unicode.org/charts/uca/charts.css' type='text/css'>");
294 out.println("<BODY>");
295 out.println("<h1>Transliteration Samples for '" + Transliterator.getDisplayName(id) + "'</h1>");
296 out.println("<p>This file illustrates the transliterations of " + Transliterator.getDisplayName(id) + ".");
297 out.println("The samples are mechanically generated, and only include single characters");
298 out.println("from the source set. Thus it will <i>not</i> contain examples where the transliteration");
299 out.println("depends on the context around the character. For a more detailed -- and interactive -- example, see the");
300 out.println("<a href='http://demo.icu-project.org/icu-bin/translit'>Transliteration Demo</a></p><hr>");
302 // set up the headers
// Starts with one Source/Target/Return triple, then appends columnCount - 1 more;
// a spacer cell is inserted before every append except the first.
304 String headerBase = "<th>Source</th><th>Target</th><th>Return</th>";
305 String headers = headerBase;
306 for (int i = columnCount - 1; i > 0; --i) {
307 if (i != columnCount - 1) headers += "<th> </th>";
308 headers += headerBase;
311 String tableHeader = "<p><table border='1'><tr>" + headers + "</tr>";
312 String tableFooter = "</table></p>";
313 out.println("<h2>Round Trip</h2>");
314 out.println(tableHeader);
// Emit the rows in key order.
316 Iterator it = map.keySet().iterator();
320 while (it.hasNext()) {
321 String key = (String) it.next();
// The first character of each key is the group it was stored under.
322 char group = key.charAt(0);
// Start a new table when the group changes or count has passed 50.
323 if (group != lastGroup || count++ > 50) {
327 out.println("</tr>");
330 out.println(tableFooter);
332 // String title = "";
// Section title from the high bits of group, sub-headings from the low bits.
333 if ((group & 0x100) != 0) out.println("<hr><h2>Duals</h2>");
334 else if ((group & 0x80) != 0) out.println("<hr><h2>Completeness</h2>");
335 else out.println("<hr><h2>Round Trip</h2>");
336 if ((group & 16) != 0) out.println("<h3>Errors: Contains Private Use Characters</h3>");
337 if ((group & 8) != 0) out.println("<h3>Possible Errors: Return not in Source Set</h3>");
338 if ((group & 4) != 0) out.println("<h3>One-Way Mapping: Return not equal to Source</h3>");
339 if ((group & 2) != 0) out.println("<h3>Errors: Return not in Source Set</h3>");
340 if ((group & 1) != 0) out.println("<h3>Errors: Target not in Target Set</h3>");
342 out.println(tableHeader);
// First cell of a table row opens the row; later cells are preceded by a spacer.
345 String value = (String) map.get(key);
346 if (column++ == 0) out.print("<tr>");
347 else out.print("<th> </th>");
350 out.println("</tr>");
355 out.println("</tr>");
// NOTE(review): closing the writer is not visible in this excerpt -- confirm it happens after this line.
358 out.println(tableFooter + "</BODY></HTML>");
365 public static String hex(String s) {
367 StringBuffer results = new StringBuffer();
368 for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
369 cp = UTF16.charAt(s, i);
370 if (i != 0) results.append(' ');
371 results.append(Integer.toHexString(cp));
373 return results.toString().toUpperCase();
// Characters accepted regardless of script: the pattern matches every code
// point that is not a Letter. print() adds this set to both its source and
// target sets before the containsAll checks, and isIn() accepts any code
// point found in it.
376 static final UnicodeSet okAnyway = new UnicodeSet("[^[:Letter:]]");
379 // tests whether a string is in a set. Also checks for Common and Inherited
380 public static boolean isIn(String s, UnicodeSet set) {
382 for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
383 cp = UTF16.charAt(s, i);
384 if (set.contains(cp)) continue;
385 if (okAnyway.contains(cp)) continue;