2 *****************************************************************
\r
3 * Copyright (c) 2002-2010, International Business Machines Corporation
\r
4 * and others. All Rights Reserved.
\r
5 *****************************************************************
\r
6 * Date Name Description
\r
7 * 06/06/2002 aliu Creation.
\r
8 *****************************************************************
\r
10 package com.ibm.icu.text;
\r
11 import java.util.Enumeration;
\r
12 import java.util.HashMap;
\r
13 import java.util.HashSet;
\r
14 import java.util.Map;
\r
15 import java.util.MissingResourceException;
\r
16 import java.util.Set;
\r
17 import java.util.Vector;
\r
19 import com.ibm.icu.lang.UScript;
\r
21 * A transliterator that translates multiple input scripts to a single
\r
22 * output script. It is named Any-T or Any-T/V, where T is the target
\r
23 * and V is the optional variant. The target T is a script.
\r
25 * <p>An AnyTransliterator partitions text into runs of the same
\r
26 * script, together with adjacent COMMON or INHERITED characters.
\r
27 * After determining the script of each run, it transliterates from
\r
28 * that script to the given target/variant. It does so by
\r
29 * instantiating a transliterator from the source script to the
\r
30 * target/variant. If a run consists only of the target script,
\r
31 * COMMON, or INHERITED characters, then the run is not changed.
\r
33 * <p>At startup, all possible AnyTransliterators are registered with
\r
34 * the system, as determined by examining the registered script transliterators.
\r
40 class AnyTransliterator extends Transliterator {
\r
42 //------------------------------------------------------------
\r
45 static final char TARGET_SEP = '-';
\r
46 static final char VARIANT_SEP = '/';
\r
47 static final String ANY = "Any";
\r
48 static final String NULL_ID = "Null";
\r
49 static final String LATIN_PIVOT = "-Latin;Latin-";
\r
52 * Cache mapping UScript code values to Transliterator instances.
\r
54 private Map<Integer, Transliterator> cache;
\r
57 * The target or target/variant string.
\r
59 private String target;
\r
62 * The target script code. Never USCRIPT_INVALID_CODE.
\r
64 private int targetScript;
\r
67 * Special code for handling width characters
\r
69 private Transliterator widthFix = Transliterator.getInstance("[[:dt=Nar:][:dt=Wide:]] nfkd");
\r
72 * Implements {@link Transliterator#handleTransliterate}.
\r
74 protected void handleTransliterate(Replaceable text,
\r
75 Position pos, boolean isIncremental) {
\r
76 int allStart = pos.start;
\r
77 int allLimit = pos.limit;
\r
79 ScriptRunIterator it =
\r
80 new ScriptRunIterator(text, pos.contextStart, pos.contextLimit);
\r
83 // Ignore runs in the ante context
\r
84 if (it.limit <= allStart) continue;
\r
86 // Try to instantiate transliterator from it.scriptCode to
\r
87 // our target or target/variant
\r
88 Transliterator t = getTransliterator(it.scriptCode);
\r
91 // We have no transliterator. Do nothing, but keep
\r
92 // pos.start up to date.
\r
93 pos.start = it.limit;
\r
97 // If the run end is before the transliteration limit, do
\r
98 // a non-incremental transliteration. Otherwise do an incremental one.
\r
100 boolean incremental = isIncremental && (it.limit >= allLimit);
\r
102 pos.start = Math.max(allStart, it.start);
\r
103 pos.limit = Math.min(allLimit, it.limit);
\r
104 int limit = pos.limit;
\r
105 t.filteredTransliterate(text, pos, incremental);
\r
106 int delta = pos.limit - limit;
\r
108 it.adjustLimit(delta);
\r
110 // We're done if we enter the post context
\r
111 if (it.limit >= allLimit) break;
\r
114 // Restore limit. pos.start is fine where the last transliterator
\r
115 // left it, or at the end of the last run.
\r
116 pos.limit = allLimit;
\r
120 * Private constructor
\r
121 * @param id the ID of the form S-T or S-T/V, where T is theTarget
\r
122 * and V is theVariant. Must not be empty.
\r
123 * @param theTarget the target name. Must not be empty, and must
\r
124 * name a script corresponding to theTargetScript.
\r
125 * @param theVariant the variant name, or the empty string if
\r
126 * there is no variant
\r
127 * @param theTargetScript the script code corresponding to theTarget.
\r
130 private AnyTransliterator(String id,
\r
133 int theTargetScript) {
\r
135 targetScript = theTargetScript;
\r
136 cache = new HashMap<Integer, Transliterator>();
\r
138 target = theTarget;
\r
139 if (theVariant.length() > 0) {
\r
140 target = theTarget + VARIANT_SEP + theVariant;
\r
145 * @param id the ID of the form S-T or S-T/V, where T is theTarget
\r
146 * and V is theVariant. Must not be empty.
\r
147 * @param filter The Unicode filter.
\r
148 * @param target2 the target name.
\r
149 * @param targetScript2 the script code corresponding to theTarget.
\r
150 * @param widthFix2 The Transliterator width fix.
\r
151 * @param cache2 The Map object for cache.
\r
153 public AnyTransliterator(String id, UnicodeFilter filter, String target2,
\r
154 int targetScript2, Transliterator widthFix2, Map<Integer, Transliterator> cache2) {
\r
156 targetScript = targetScript2;
\r
162 * Returns a transliterator from the given source to our target or
\r
163 * target/variant. Returns NULL if the source is the same as our
\r
164 * target script, or if the source is USCRIPT_INVALID_CODE.
\r
165 * Caches the result and returns the same transliterator the next
\r
166 * time. The caller does NOT own the result and must not delete
\r
169 private Transliterator getTransliterator(int source) {
\r
170 if (source == targetScript || source == UScript.INVALID_CODE) {
\r
171 if (isWide(targetScript)) {
\r
178 Integer key = new Integer(source);
\r
179 Transliterator t = cache.get(key);
\r
181 String sourceName = UScript.getName(source);
\r
182 String id = sourceName + TARGET_SEP + target;
\r
185 t = Transliterator.getInstance(id, FORWARD);
\r
186 } catch (RuntimeException e) { }
\r
189 // Try to pivot around Latin, our most common script
\r
190 id = sourceName + LATIN_PIVOT + target;
\r
192 t = Transliterator.getInstance(id, FORWARD);
\r
193 } catch (RuntimeException e) { }
\r
197 if (!isWide(targetScript)) {
\r
198 Vector<Transliterator> v = new Vector<Transliterator>();
\r
201 t = new CompoundTransliterator(v);
\r
204 } else if (!isWide(targetScript)) {
\r
213 * @param targetScript2
\r
216 private boolean isWide(int script) {
\r
217 return script == UScript.BOPOMOFO || script == UScript.HAN || script == UScript.HANGUL || script == UScript.HIRAGANA || script == UScript.KATAKANA;
\r
221 * Registers standard transliterators with the system. Called by
\r
222 * Transliterator during initialization. Scan all current targets
\r
223 * and register those that are scripts T as Any-T/V.
\r
225 static void register() {
\r
227 HashMap<String, Set<String>> seen = new HashMap<String, Set<String>>(); // old code used set, but was dependent on order
\r
229 for (Enumeration<String> s = Transliterator.getAvailableSources(); s.hasMoreElements(); ) {
\r
230 String source = s.nextElement();
\r
232 // Ignore the "Any" source
\r
233 if (source.equalsIgnoreCase(ANY)) continue;
\r
235 for (Enumeration<String> t = Transliterator.getAvailableTargets(source);
\r
236 t.hasMoreElements(); ) {
\r
237 String target = t.nextElement();
\r
239 // Get the script code for the target. If not a script, ignore.
\r
240 int targetScript = scriptNameToCode(target);
\r
241 if (targetScript == UScript.INVALID_CODE) {
\r
245 Set<String> seenVariants = seen.get(target);
\r
246 if (seenVariants == null) {
\r
247 seen.put(target, seenVariants = new HashSet<String>());
\r
250 for (Enumeration<String> v = Transliterator.getAvailableVariants(source, target);
\r
251 v.hasMoreElements(); ) {
\r
252 String variant = v.nextElement();
\r
254 // Only process each target/variant pair once
\r
255 if (seenVariants.contains(variant)) {
\r
258 seenVariants.add(variant);
\r
261 id = TransliteratorIDParser.STVtoID(ANY, target, variant);
\r
262 AnyTransliterator trans = new AnyTransliterator(id, target, variant,
\r
264 Transliterator.registerInstance(trans);
\r
265 Transliterator.registerSpecialInverse(target, NULL_ID, false);
\r
272 * Return the script code for a given name, or
\r
273 * UScript.INVALID_CODE if not found.
\r
275 private static int scriptNameToCode(String name) {
\r
277 int[] codes = UScript.getCode(name);
\r
278 return codes != null ? codes[0] : UScript.INVALID_CODE;
\r
279 }catch( MissingResourceException e){
\r
281 return UScript.INVALID_CODE;
\r
286 //------------------------------------------------------------
\r
287 // ScriptRunIterator
\r
290 * Returns a series of ranges corresponding to scripts. They will be
\r
293 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
\r
294 * | | - first run (start, limit)
\r
295 * | | - second run (start, limit)
\r
297 * That is, the runs will overlap. The reason for this is so that a
\r
298 * transliterator can consider common characters both before and after the scripts.
\r
301 private static class ScriptRunIterator {
\r
303 private Replaceable text;
\r
304 private int textStart;
\r
305 private int textLimit;
\r
308 * The code of the current run, valid after next() returns. May
\r
309 * be UScript.INVALID_CODE if and only if the entire text is
\r
310 * COMMON/INHERITED.
\r
312 public int scriptCode;
\r
315 * The start of the run, inclusive, valid after next() returns.
\r
320 * The end of the run, exclusive, valid after next() returns.
\r
325 * Constructs a run iterator over the given text from start
\r
326 * (inclusive) to limit (exclusive).
\r
328 public ScriptRunIterator(Replaceable text, int start, int limit) {
\r
330 this.textStart = start;
\r
331 this.textLimit = limit;
\r
332 this.limit = start;
\r
337 * Returns TRUE if there are any more runs. TRUE is always
\r
338 * returned at least once. Upon return, the caller should
\r
339 * examine scriptCode, start, and limit.
\r
341 public boolean next() {
\r
345 scriptCode = UScript.INVALID_CODE; // don't know script yet
\r
349 if (start == textLimit) {
\r
353 // Move start back to include adjacent COMMON or INHERITED
\r
355 while (start > textStart) {
\r
356 ch = text.char32At(start - 1); // look back
\r
357 s = UScript.getScript(ch);
\r
358 if (s == UScript.COMMON || s == UScript.INHERITED) {
\r
365 // Move limit ahead to include COMMON, INHERITED, and characters
\r
366 // of the current script.
\r
367 while (limit < textLimit) {
\r
368 ch = text.char32At(limit); // look ahead
\r
369 s = UScript.getScript(ch);
\r
370 if (s != UScript.COMMON && s != UScript.INHERITED) {
\r
371 if (scriptCode == UScript.INVALID_CODE) {
\r
373 } else if (s != scriptCode) {
\r
380 // Return TRUE even if the entire text is COMMON / INHERITED, in
\r
381 // which case scriptCode will be UScript.INVALID_CODE.
\r
386 * Adjusts internal indices for a change in the limit index of the
\r
387 * given delta. A positive delta means the limit has increased.
\r
389 public void adjustLimit(int delta) {
\r
391 textLimit += delta;
\r
396 * Temporary hack for registry problem. Needs to be replaced by better architecture.
\r
398 public Transliterator safeClone() {
\r
399 UnicodeFilter filter = getFilter();
\r
400 if (filter != null && filter instanceof UnicodeSet) {
\r
401 filter = new UnicodeSet((UnicodeSet)filter);
\r
403 return new AnyTransliterator(getID(), filter, target, targetScript, widthFix, cache);
\r