2 *****************************************************************
\r
3 * Copyright (c) 2002-2009, International Business Machines Corporation
\r
4 * and others. All Rights Reserved.
\r
5 *****************************************************************
\r
6 * Date Name Description
\r
7 * 06/06/2002 aliu Creation.
\r
8 *****************************************************************
\r
10 package com.ibm.icu.text;
\r
11 import com.ibm.icu.lang.UScript;
\r
12 import java.lang.Math;
\r
13 import java.util.Enumeration;
\r
14 import java.util.HashSet;
\r
15 import java.util.HashMap;
\r
16 import java.util.Map;
\r
17 import java.util.MissingResourceException;
\r
18 import java.util.Set;
\r
19 import java.util.Vector;
\r
21 * A transliterator that translates multiple input scripts to a single
\r
22 * output script. It is named Any-T or Any-T/V, where T is the target
\r
23 * and V is the optional variant. The target T is a script.
\r
25 * <p>An AnyTransliterator partitions text into runs of the same
\r
26 * script, together with adjacent COMMON or INHERITED characters.
\r
27 * After determining the script of each run, it transliterates from
\r
28 * that script to the given target/variant. It does so by
\r
29 * instantiating a transliterator from the source script to the
\r
30 * target/variant. If a run consists only of the target script,
\r
31 * COMMON, or INHERITED characters, then the run is not changed.
\r
33 * <p>At startup, all possible AnyTransliterators are registered with
\r
34 * the system, as determined by examining the registered script
\r
40 class AnyTransliterator extends Transliterator {
\r
42 //------------------------------------------------------------
\r
45 static final char TARGET_SEP = '-';
\r
46 static final char VARIANT_SEP = '/';
\r
47 static final String ANY = "Any";
\r
48 static final String NULL_ID = "Null";
\r
49 static final String LATIN_PIVOT = "-Latin;Latin-";
\r
52 * Cache mapping UScript code values (boxed as Integer keys) to Transliterator instances.
\r
57 * The target or target/variant string.
\r
59 private String target;
\r
62 * The target script code. Never UScript.INVALID_CODE.
\r
64 private int targetScript;
\r
67 * Special code for handling width characters
\r
69 private Transliterator widthFix = Transliterator.getInstance("[[:dt=Nar:][:dt=Wide:]] nfkd");
\r
72 * Implements {@link Transliterator#handleTransliterate}.
\r
74 protected void handleTransliterate(Replaceable text,
\r
75 Position pos, boolean isIncremental) {
\r
76 int allStart = pos.start;
\r
77 int allLimit = pos.limit;
\r
79 ScriptRunIterator it =
\r
80 new ScriptRunIterator(text, pos.contextStart, pos.contextLimit);
\r
83 // Ignore runs in the ante context
\r
84 if (it.limit <= allStart) continue;
\r
86 // Try to instantiate transliterator from it.scriptCode to
\r
87 // our target or target/variant
\r
88 Transliterator t = getTransliterator(it.scriptCode);
\r
91 // We have no transliterator. Do nothing, but keep
\r
92 // pos.start up to date.
\r
93 pos.start = it.limit;
\r
97 // If the run end is before the transliteration limit, do
\r
98 // a non-incremental transliteration. Otherwise do an
\r
100 boolean incremental = isIncremental && (it.limit >= allLimit);
\r
102 pos.start = Math.max(allStart, it.start);
\r
103 pos.limit = Math.min(allLimit, it.limit);
\r
104 int limit = pos.limit;
\r
105 t.filteredTransliterate(text, pos, incremental);
\r
106 int delta = pos.limit - limit;
\r
108 it.adjustLimit(delta);
\r
110 // We're done if we enter the post context
\r
111 if (it.limit >= allLimit) break;
\r
114 // Restore limit. pos.start is fine where the last transliterator
\r
115 // left it, or at the end of the last run.
\r
116 pos.limit = allLimit;
\r
120 * Private constructor
\r
121 * @param id the ID of the form S-T or S-T/V, where T is theTarget
\r
122 * and V is theVariant. Must not be empty.
\r
123 * @param theTarget the target name. Must not be empty, and must
\r
124 * name a script corresponding to theTargetScript.
\r
125 * @param theVariant the variant name, or the empty string if
\r
126 * there is no variant
\r
127 * @param theTargetScript the script code corresponding to
\r
130 private AnyTransliterator(String id,
\r
133 int theTargetScript) {
\r
135 targetScript = theTargetScript;
\r
136 cache = new HashMap();
\r
138 target = theTarget;
\r
139 if (theVariant.length() > 0) {
\r
140 target = theTarget + VARIANT_SEP + theVariant;
\r
148 * @param targetScript2
\r
152 public AnyTransliterator(String id, UnicodeFilter filter, String target2,
\r
153 int targetScript2, Transliterator widthFix2, Map cache2) {
\r
155 targetScript = targetScript2;
\r
161 * Returns a transliterator from the given source to our target or
\r
162 * target/variant. Returns null if the source is the same as our
\r
163 * target script, or if the source is UScript.INVALID_CODE.
\r
164 * Caches the result and returns the same transliterator the next
\r
165 * time. The caller does NOT own the result and must not delete
\r
168 private Transliterator getTransliterator(int source) {
\r
169 if (source == targetScript || source == UScript.INVALID_CODE) {
\r
170 if (isWide(targetScript)) {
\r
177 Integer key = new Integer(source);
\r
178 Transliterator t = (Transliterator) cache.get(key);
\r
180 String sourceName = UScript.getName(source);
\r
181 String id = sourceName + TARGET_SEP + target;
\r
184 t = Transliterator.getInstance(id, FORWARD);
\r
185 } catch (RuntimeException e) { }
\r
188 // Try to pivot around Latin, our most common script
\r
189 id = sourceName + LATIN_PIVOT + target;
\r
191 t = Transliterator.getInstance(id, FORWARD);
\r
192 } catch (RuntimeException e) { }
\r
196 if (!isWide(targetScript)) {
\r
197 Vector v = new Vector();
\r
200 t = new CompoundTransliterator(v);
\r
203 } else if (!isWide(targetScript)) {
\r
212 * @param targetScript2
\r
215 private boolean isWide(int script) {
\r
216 return script == UScript.BOPOMOFO || script == UScript.HAN || script == UScript.HANGUL || script == UScript.HIRAGANA || script == UScript.KATAKANA;
\r
220 * Registers standard transliterators with the system. Called by
\r
221 * Transliterator during initialization. Scan all current targets
\r
222 * and register those that are scripts T as Any-T/V.
\r
224 static void register() {
\r
226 HashMap seen = new HashMap(); // old code used set, but was dependent on order
\r
228 for (Enumeration s=Transliterator.getAvailableSources(); s.hasMoreElements(); ) {
\r
229 String source = (String) s.nextElement();
\r
231 // Ignore the "Any" source
\r
232 if (source.equalsIgnoreCase(ANY)) continue;
\r
234 for (Enumeration t=Transliterator.getAvailableTargets(source);
\r
235 t.hasMoreElements(); ) {
\r
236 String target = (String) t.nextElement();
\r
238 // Get the script code for the target. If not a script, ignore.
\r
239 int targetScript = scriptNameToCode(target);
\r
240 if (targetScript == UScript.INVALID_CODE) continue;
\r
242 Set seenVariants = (Set) seen.get(target);
\r
243 if (seenVariants == null) {
\r
244 seen.put(target, seenVariants = new HashSet());
\r
247 for (Enumeration v=Transliterator.getAvailableVariants(source, target);
\r
248 v.hasMoreElements(); ) {
\r
249 String variant = (String) v.nextElement();
\r
251 // Only process each target/variant pair once
\r
252 if (seenVariants.contains(variant)) continue;
\r
253 seenVariants.add(variant);
\r
256 id = TransliteratorIDParser.STVtoID(ANY, target, variant);
\r
257 AnyTransliterator trans = new AnyTransliterator(id, target, variant,
\r
259 Transliterator.registerInstance(trans);
\r
260 Transliterator.registerSpecialInverse(target, NULL_ID, false);
\r
267 * Return the script code for a given name, or
\r
268 * UScript.INVALID_CODE if not found.
\r
270 private static int scriptNameToCode(String name) {
\r
272 int[] codes = UScript.getCode(name);
\r
273 return codes != null ? codes[0] : UScript.INVALID_CODE;
\r
274 }catch( MissingResourceException e){
\r
275 return UScript.INVALID_CODE;
\r
279 //------------------------------------------------------------
\r
280 // ScriptRunIterator
\r
283 * Returns a series of ranges corresponding to scripts. They will be
\r
286 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
\r
287 * | | - first run (start, limit)
\r
288 * | | - second run (start, limit)
\r
290 * That is, the runs will overlap. The reason for this is so that a
\r
291 * transliterator can consider common characters both before and after
\r
294 private static class ScriptRunIterator {
\r
296 private Replaceable text;
\r
297 private int textStart;
\r
298 private int textLimit;
\r
301 * The code of the current run, valid after next() returns. May
\r
302 * be UScript.INVALID_CODE if and only if the entire text is
\r
303 * COMMON/INHERITED.
\r
305 public int scriptCode;
\r
308 * The start of the run, inclusive, valid after next() returns.
\r
313 * The end of the run, exclusive, valid after next() returns.
\r
318 * Constructs a run iterator over the given text from start
\r
319 * (inclusive) to limit (exclusive).
\r
321 public ScriptRunIterator(Replaceable text, int start, int limit) {
\r
323 this.textStart = start;
\r
324 this.textLimit = limit;
\r
325 this.limit = start;
\r
330 * Returns TRUE if there are any more runs. TRUE is always
\r
331 * returned at least once. Upon return, the caller should
\r
332 * examine scriptCode, start, and limit.
\r
334 public boolean next() {
\r
338 scriptCode = UScript.INVALID_CODE; // don't know script yet
\r
342 if (start == textLimit) {
\r
346 // Move start back to include adjacent COMMON or INHERITED
\r
348 while (start > textStart) {
\r
349 ch = text.char32At(start - 1); // look back
\r
350 s = UScript.getScript(ch);
\r
351 if (s == UScript.COMMON || s == UScript.INHERITED) {
\r
358 // Move limit ahead to include COMMON, INHERITED, and characters
\r
359 // of the current script.
\r
360 while (limit < textLimit) {
\r
361 ch = text.char32At(limit); // look ahead
\r
362 s = UScript.getScript(ch);
\r
363 if (s != UScript.COMMON && s != UScript.INHERITED) {
\r
364 if (scriptCode == UScript.INVALID_CODE) {
\r
366 } else if (s != scriptCode) {
\r
373 // Return TRUE even if the entire text is COMMON / INHERITED, in
\r
374 // which case scriptCode will be UScript.INVALID_CODE.
\r
379 * Adjusts internal indices for a change in the limit index of the
\r
380 * given delta. A positive delta means the limit has increased.
\r
382 public void adjustLimit(int delta) {
\r
384 textLimit += delta;
\r
389 * Temporary hack for registry problem. Needs to be replaced by better architecture.
\r
393 public Transliterator safeClone() {
\r
394 UnicodeFilter filter = getFilter();
\r
395 if (filter != null && filter instanceof UnicodeSet) {
\r
396 filter = new UnicodeSet((UnicodeSet)filter);
\r
398 return new AnyTransliterator(getID(), filter, target, targetScript, widthFix, cache);
\r