2 *******************************************************************************
\r
3 * Copyright (C) 1996-2009, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import com.ibm.icu.impl.ICUResourceBundle;
\r
10 import com.ibm.icu.impl.Utility;
\r
11 import com.ibm.icu.impl.UtilityExtensions;
\r
12 import com.ibm.icu.util.CaseInsensitiveString;
\r
13 import com.ibm.icu.util.ULocale;
\r
14 import com.ibm.icu.util.UResourceBundle;
\r
16 import java.text.MessageFormat;
\r
17 import java.util.Enumeration;
\r
18 import java.util.Hashtable;
\r
19 import java.util.Locale;
\r
20 import java.util.MissingResourceException;
\r
21 import java.util.Vector;
\r
24 * <code>Transliterator</code> is an abstract class that
\r
25 * transliterates text from one format to another. The most common
\r
26 * kind of transliterator is a script, or alphabet, transliterator.
\r
27 * For example, a Russian to Latin transliterator changes Russian text
\r
28 * written in Cyrillic characters to phonetically equivalent Latin
\r
29 * characters. It does not <em>translate</em> Russian to English!
\r
30 * Transliteration, unlike translation, operates on characters, without
\r
31 * reference to the meanings of words and sentences.
\r
33 * <p>Although script conversion is its most common use, a
\r
34 * transliterator can actually perform a more general class of tasks.
\r
35 * In fact, <code>Transliterator</code> defines a very general API
\r
36 * which specifies only that a segment of the input text is replaced
\r
37 * by new text. The particulars of this conversion are determined
\r
38 * entirely by subclasses of <code>Transliterator</code>.
\r
40 * <p><b>Transliterators are stateless</b>
\r
42 * <p><code>Transliterator</code> objects are <em>stateless</em>; they
\r
43 * retain no information between calls to
\r
44 * <code>transliterate()</code>. As a result, threads may share
\r
45 * transliterators without synchronizing them. This might seem to
\r
46 * limit the complexity of the transliteration operation. In
\r
47 * practice, subclasses perform complex transliterations by delaying
\r
48 * the replacement of text until it is known that no other
\r
49 * replacements are possible. In other words, although the
\r
50 * <code>Transliterator</code> objects are stateless, the source text
\r
51 * itself embodies all the needed information, and delayed operation
\r
52 * allows arbitrary complexity.
\r
54 * <p><b>Batch transliteration</b>
\r
56 * <p>The simplest way to perform transliteration is all at once, on a
\r
57 * string of existing text. This is referred to as <em>batch</em>
\r
58 * transliteration. For example, given a string <code>input</code>
\r
59 * and a transliterator <code>t</code>, the call
\r
61 * <blockquote><code>String result = t.transliterate(input);
\r
62 * </code></blockquote>
\r
64 * will transliterate it and return the result. Other methods allow
\r
65 * the client to specify a substring to be transliterated and to use
\r
66 * {@link Replaceable} objects instead of strings, in order to
\r
67 * preserve out-of-band information (such as text styles).
\r
69 * <p><b>Keyboard transliteration</b>
\r
71 * <p>Somewhat more involved is <em>keyboard</em>, or incremental
\r
72 * transliteration. This is the transliteration of text that is
\r
73 * arriving from some source (typically the user's keyboard) one
\r
74 * character at a time, or in some other piecemeal fashion.
\r
76 * <p>In keyboard transliteration, a <code>Replaceable</code> buffer
\r
77 * stores the text. As text is inserted, as much as possible is
\r
78 * transliterated on the fly. This means a GUI that displays the
\r
79 * contents of the buffer may show text being modified as each new
\r
80 * character arrives.
\r
82 * <p>Consider the simple <code>RuleBasedTransliterator</code>:
\r
84 * <blockquote><code>
\r
87 * </code></blockquote>
\r
89 * When the user types 't', nothing will happen, since the
\r
90 * transliterator is waiting to see if the next character is 'h'. To
\r
91 * remedy this, we introduce the notion of a cursor, marked by a '|'
\r
92 * in the output string:
\r
94 * <blockquote><code>
\r
97 * </code></blockquote>
\r
99 * Now when the user types 't', tau appears, and if the next character
\r
100 * is 'h', the tau changes to a theta. This is accomplished by
\r
101 * maintaining a cursor position (independent of the insertion point,
\r
102 * and invisible in the GUI) across calls to
\r
103 * <code>transliterate()</code>. Typically, the cursor will
\r
104 * be coincident with the insertion point, but in a case like the one
\r
105 * above, it will precede the insertion point.
\r
107 * <p>Keyboard transliteration methods maintain a set of three indices
\r
108 * that are updated with each call to
\r
109 * <code>transliterate()</code>, including the cursor, start,
\r
110 * and limit. These indices are changed by the method, and they are
\r
111 * passed in and out via a Position object. The <code>start</code> index
\r
112 * marks the beginning of the substring that the transliterator will
\r
113 * look at. It is advanced as text becomes committed (but it is not
\r
114 * the committed index; that's the <code>cursor</code>). The
\r
115 * <code>cursor</code> index, described above, marks the point at
\r
116 * which the transliterator last stopped, either because it reached
\r
117 * the end, or because it required more characters to disambiguate
\r
118 * between possible inputs. The <code>cursor</code> can also be
\r
119 * explicitly set by rules in a <code>RuleBasedTransliterator</code>.
\r
120 * Any characters before the <code>cursor</code> index are frozen;
\r
121 * future keyboard transliteration calls within this input sequence
\r
122 * will not change them. New text is inserted at the
\r
123 * <code>limit</code> index, which marks the end of the substring that
\r
124 * the transliterator looks at.
\r
126 * <p>Because keyboard transliteration assumes that more characters
\r
127 * are to arrive, it is conservative in its operation. It only
\r
128 * transliterates when it can do so unambiguously. Otherwise it waits
\r
129 * for more characters to arrive. When the client code knows that no
\r
130 * more characters are forthcoming, perhaps because the user has
\r
131 * performed some input termination operation, then it should call
\r
132 * <code>finishTransliteration()</code> to complete any
\r
133 * pending transliterations.
\r
135 * <p><b>Inverses</b>
\r
137 * <p>Pairs of transliterators may be inverses of one another. For
\r
138 * example, if transliterator <b>A</b> transliterates characters by
\r
139 * incrementing their Unicode value (so "abc" -> "def"), and
\r
140 * transliterator <b>B</b> decrements character values, then <b>A</b>
\r
141 * is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
\r
142 * with <b>B</b> in a compound transliterator, the result is the
\r
143 * identity transliterator, that is, a transliterator that does not
\r
144 * change its input text.
\r
146 * The <code>Transliterator</code> method <code>getInverse()</code>
\r
147 * returns a transliterator's inverse, if one exists, or
\r
148 * <code>null</code> otherwise. However, the result of
\r
149 * <code>getInverse()</code> usually will <em>not</em> be a true
\r
150 * mathematical inverse. This is because true inverse transliterators
\r
151 * are difficult to formulate. For example, consider two
\r
152 * transliterators: <b>AB</b>, which transliterates the character 'A'
\r
153 * to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
\r
154 * seem that these are exact inverses, since
\r
156 * <blockquote>"A" x <b>AB</b> -> "B"<br>
\r
157 * "B" x <b>BA</b> -> "A"</blockquote>
\r
159 * where 'x' represents transliteration. However,
\r
161 * <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
\r
162 * "BBCD" x <b>BA</b> -> "AACD"</blockquote>
\r
164 * so <b>AB</b> composed with <b>BA</b> is not the
\r
165 * identity. Nonetheless, <b>BA</b> may be usefully considered to be
\r
166 * <b>AB</b>'s inverse, and it is on this basis that
\r
167 * <b>AB</b><code>.getInverse()</code> could legitimately return
\r
170 * <p><b>IDs and display names</b>
\r
172 * <p>A transliterator is designated by a short identifier string or
\r
173 * <em>ID</em>. IDs follow the format <em>source-destination</em>,
\r
174 * where <em>source</em> describes the entity being replaced, and
\r
175 * <em>destination</em> describes the entity replacing
\r
176 * <em>source</em>. The entities may be the names of scripts,
\r
177 * particular sequences of characters, or whatever else it is that the
\r
178 * transliterator converts to or from. For example, a transliterator
\r
179 * from Russian to Latin might be named "Russian-Latin". A
\r
180 * transliterator from keyboard escape sequences to Latin-1 characters
\r
181 * might be named "KeyboardEscape-Latin1". By convention, system
\r
182 * entity names are in English, with the initial letters of words
\r
183 * capitalized; user entity names may follow any format so long as
\r
184 * they do not contain dashes.
\r
186 * <p>In addition to programmatic IDs, transliterator objects have
\r
187 * display names for presentation in user interfaces, returned by
\r
188 * {@link #getDisplayName}.
\r
190 * <p><b>Factory methods and registration</b>
\r
192 * <p>In general, client code should use the factory method
\r
193 * <code>getInstance()</code> to obtain an instance of a
\r
194 * transliterator given its ID. Valid IDs may be enumerated using
\r
195 * <code>getAvailableIDs()</code>. Since transliterators are
\r
196 * stateless, multiple calls to <code>getInstance()</code> with the
\r
197 * same ID will return the same object.
\r
199 * <p>In addition to the system transliterators registered at startup,
\r
200 * user transliterators may be registered by calling
\r
201 * <code>registerInstance()</code> at run time. To register a
\r
202 * transliterator subclass without instantiating it (until it is
\r
203 * needed), users may call <code>registerClass()</code>.
\r
205 * <p><b>Composed transliterators</b>
\r
207 * <p>In addition to built-in system transliterators like
\r
208 * "Latin-Greek", there are also built-in <em>composed</em>
\r
209 * transliterators. These are implemented by composing two or more
\r
210 * component transliterators. For example, if we have scripts "A",
\r
211 * "B", "C", and "D", and we want to transliterate between all pairs
\r
212 * of them, then we need to write 12 transliterators: "A-B", "A-C",
\r
213 * "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to
\r
214 * convert all scripts to an intermediate script "M", then instead of
\r
215 * writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M",
\r
216 * "D~M", "M~A", "M~B", "M~C", "M~D". (This might not seem like a big
\r
217 * win, but it's really 2<em>n</em> vs. <em>n</em><sup>2</sup> -
\r
218 * <em>n</em>, so as <em>n</em> gets larger the gain becomes
\r
219 * significant. With 9 scripts, it's 18 vs. 72 rule sets, a big
\r
220 * difference.) Note the use of "~" rather than "-" for the script
\r
221 * separator here; this indicates that the given transliterator is
\r
222 * intended to be composed with others, rather than be used as is.
\r
224 * <p>Composed transliterators can be instantiated as usual. For
\r
225 * example, the system transliterator "Devanagari-Gujarati" is a
\r
226 * composed transliterator built internally as
\r
227 * "Devanagari~InterIndic;InterIndic~Gujarati". When this
\r
228 * transliterator is instantiated, it appears externally to be a
\r
229 * standard transliterator (e.g., getID() returns
\r
230 * "Devanagari-Gujarati").
\r
232 * <p><b>Subclassing</b>
\r
234 * <p>Subclasses must implement the abstract method
\r
235 * <code>handleTransliterate()</code>. <p>Subclasses should override
\r
236 * the <code>transliterate()</code> method taking a
\r
237 * <code>Replaceable</code> and the <code>transliterate()</code>
\r
238 * method taking a <code>String</code> and <code>StringBuffer</code>
\r
239 * if the performance of these methods can be improved over the
\r
240 * performance obtained by the default implementations in this class.
\r
242 * <p>Copyright © IBM Corporation 1999. All rights reserved.
\r
247 public abstract class Transliterator implements StringTransform {
\r
249 * Direction constant indicating the forward direction in a transliterator,
\r
250 * e.g., the forward rules of a RuleBasedTransliterator. An "A-B"
\r
251 * transliterator transliterates A to B when operating in the forward
\r
252 * direction, and B to A when operating in the reverse direction.
\r
255 public static final int FORWARD = 0;
\r
258 * Direction constant indicating the reverse direction in a transliterator,
\r
259 * e.g., the reverse rules of a RuleBasedTransliterator. An "A-B"
\r
260 * transliterator transliterates A to B when operating in the forward
\r
261 * direction, and B to A when operating in the reverse direction.
\r
264 public static final int REVERSE = 1;
\r
267 * Position structure for incremental transliteration. This data
\r
268 * structure defines two substrings of the text being
\r
269 * transliterated. The first region, [contextStart,
\r
270 * contextLimit), defines what characters the transliterator will
\r
271 * read as context. The second region, [start, limit), defines
\r
272 * what characters will actually be transliterated. The second
\r
273 * region should be a subset of the first.
\r
275 * <p>After a transliteration operation, some of the indices in this
\r
276 * structure will be modified. See the field descriptions for
\r
279 * <p>contextStart <= start <= limit <= contextLimit
\r
281 * <p>Note: All index values in this structure must be at code point
\r
282 * boundaries. That is, none of them may occur between two code units
\r
283 * of a surrogate pair. If any index does split a surrogate pair,
\r
284 * results are unspecified.
\r
287 public static class Position {
\r
290 * Beginning index, inclusive, of the context to be considered for
\r
291 * a transliteration operation. The transliterator will ignore
\r
292 * anything before this index. INPUT/OUTPUT parameter: This parameter
\r
293 * is updated by a transliteration operation to reflect the maximum
\r
294 * amount of antecontext needed by a transliterator.
\r
297 public int contextStart;
\r
300 * Ending index, exclusive, of the context to be considered for a
\r
301 * transliteration operation. The transliterator will ignore
\r
302 * anything at or after this index. INPUT/OUTPUT parameter: This
\r
303 * parameter is updated to reflect changes in the length of the
\r
304 * text, but points to the same logical position in the text.
\r
307 public int contextLimit;
\r
310 * Beginning index, inclusive, of the text to be transliterated.
\r
311 * INPUT/OUTPUT parameter: This parameter is advanced past
\r
312 * characters that have already been transliterated by a
\r
313 * transliteration operation.
\r
319 * Ending index, exclusive, of the text to be transliterated.
\r
320 * INPUT/OUTPUT parameter: This parameter is updated to reflect
\r
321 * changes in the length of the text, but points to the same
\r
322 * logical position in the text.
\r
328 * Constructs a Position object with start, limit,
\r
329 * contextStart, and contextLimit all equal to zero.
\r
332 public Position() {
\r
337 * Constructs a Position object with the given start,
\r
338 * contextStart, and contextLimit. The limit is set to the
\r
342 public Position(int contextStart, int contextLimit, int start) {
\r
343 this(contextStart, contextLimit, start, contextLimit);
\r
347 * Constructs a Position object with the given start, limit,
\r
348 * contextStart, and contextLimit.
\r
351 public Position(int contextStart, int contextLimit,
\r
352 int start, int limit) {
\r
353 this.contextStart = contextStart;
\r
354 this.contextLimit = contextLimit;
\r
355 this.start = start;
\r
356 this.limit = limit;
\r
360 * Constructs a Position object that is a copy of another.
\r
363 public Position(Position pos) {
\r
368 * Copies the indices of this position from another.
\r
371 public void set(Position pos) {
\r
372 contextStart = pos.contextStart;
\r
373 contextLimit = pos.contextLimit;
\r
379 * Returns true if this Position is equal to the given object.
\r
382 public boolean equals(Object obj) {
\r
383 if (obj instanceof Position) {
\r
384 Position pos = (Position) obj;
\r
385 return contextStart == pos.contextStart &&
\r
386 contextLimit == pos.contextLimit &&
\r
387 start == pos.start &&
\r
388 limit == pos.limit;
\r
394 * Returns a string representation of this Position.
\r
397 public String toString() {
\r
398 return "[cs=" + contextStart
\r
401 + ", cl=" + contextLimit
\r
406 * Check all bounds. If they are invalid, throw an exception.
\r
407 * @param length the length of the string this object applies to
\r
408 * @exception IllegalArgumentException if any indices are out
\r
412 public final void validate(int length) {
\r
413 if (contextStart < 0 ||
\r
414 start < contextStart ||
\r
416 contextLimit < limit ||
\r
417 length < contextLimit) {
\r
418 throw new IllegalArgumentException("Invalid Position {cs=" +
\r
419 contextStart + ", s=" +
\r
422 contextLimit + "}, len=" +
\r
429 * Programmatic name, e.g., "Latin-Arabic".
\r
434 * This transliterator's filter. Any character for which
\r
435 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
\r
436 * altered by this transliterator. If <tt>filter</tt> is
\r
437 * <tt>null</tt> then no filtering is applied.
\r
439 private UnicodeFilter filter;
\r
441 private int maximumContextLength = 0;
\r
444 * System transliterator registry.
\r
446 private static TransliteratorRegistry registry;
\r
448 private static Hashtable displayNameCache;
\r
451 * Prefix for resource bundle key for the display name for a
\r
452 * transliterator. The ID is appended to this to form the key.
\r
453 * The resource bundle value should be a String.
\r
455 private static final String RB_DISPLAY_NAME_PREFIX = "%Translit%%";
\r
458 * Prefix for resource bundle key for the display name for a
\r
459 * transliterator SCRIPT. The ID is appended to this to form the key.
\r
460 * The resource bundle value should be a String.
\r
462 private static final String RB_SCRIPT_DISPLAY_NAME_PREFIX = "%Translit%";
\r
465 * Resource bundle key for display name pattern.
\r
466 * The resource bundle value should be a String forming a
\r
467 * MessageFormat pattern, e.g.:
\r
468 * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
\r
470 private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";
\r
473 * Delimiter between elements in a compound ID.
\r
476 static final char ID_DELIM = ';';
\r
479 * Delimiter before target in an ID.
\r
482 static final char ID_SEP = '-';
\r
485 * Delimiter before variant in an ID.
\r
488 static final char VARIANT_SEP = '/';
\r
491 * To enable debugging output in the Transliterator component, set
\r
494 * N.B. Make sure to recompile all of the com.ibm.icu.text package
\r
495 * after changing this. Easiest way to do this is 'ant clean
\r
496 * core' ('ant' will NOT pick up the dependency automatically).
\r
498 * <<This generates a lot of output.>>
\r
500 static final boolean DEBUG = false;
\r
503 * Default constructor.
\r
504 * @param ID the string identifier for this transliterator
\r
505 * @param filter the filter. Any character for which
\r
506 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
\r
507 * altered by this transliterator. If <tt>filter</tt> is
\r
508 * <tt>null</tt> then no filtering is applied.
\r
511 protected Transliterator(String ID, UnicodeFilter filter) {
\r
513 throw new NullPointerException();
\r
516 this.filter = filter;
\r
520 * Transliterates a segment of a string, with optional filtering.
\r
522 * @param text the string to be transliterated
\r
523 * @param start the beginning index, inclusive; <code>0 <= start
\r
525 * @param limit the ending index, exclusive; <code>start <= limit
\r
526 * <= text.length()</code>.
\r
527 * @return The new limit index. The text previously occupying <code>[start,
\r
528 * limit)</code> has been transliterated, possibly to a string of a different
\r
529 * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
\r
530 * <em>new-limit</em> is the return value. If the input offsets are out of bounds,
\r
531 * the returned value is -1 and the input string remains unchanged.
\r
534 public final int transliterate(Replaceable text, int start, int limit) {
\r
537 text.length() < limit) {
\r
541 Position pos = new Position(start, limit, start);
\r
542 filteredTransliterate(text, pos, false, true);
\r
547 * Transliterates an entire string in place. Convenience method.
\r
548 * @param text the string to be transliterated
\r
551 public final void transliterate(Replaceable text) {
\r
552 transliterate(text, 0, text.length());
\r
556 * Transliterates an entire string and returns the result. Convenience method.
\r
558 * @param text the string to be transliterated
\r
559 * @return The transliterated text
\r
562 public final String transliterate(String text) {
\r
563 ReplaceableString result = new ReplaceableString(text);
\r
564 transliterate(result);
\r
565 return result.toString();
\r
569 * Transliterates the portion of the text buffer that can be
\r
570 * transliterated unambiguously after new text has been inserted,
\r
571 * typically as a result of a keyboard event. The new text in
\r
572 * <code>insertion</code> will be inserted into <code>text</code>
\r
573 * at <code>index.contextLimit</code>, advancing
\r
574 * <code>index.contextLimit</code> by <code>insertion.length()</code>.
\r
575 * Then the transliterator will try to transliterate characters of
\r
576 * <code>text</code> between <code>index.start</code> and
\r
577 * <code>index.contextLimit</code>. Characters before
\r
578 * <code>index.start</code> will not be changed.
\r
580 * <p>Upon return, values in <code>index</code> will be updated.
\r
581 * <code>index.contextStart</code> will be advanced to the first
\r
582 * character that future calls to this method will read.
\r
583 * <code>index.start</code> and <code>index.contextLimit</code> will
\r
584 * be adjusted to delimit the range of text that future calls to
\r
585 * this method may change.
\r
587 * <p>Typical usage of this method begins with an initial call
\r
588 * with <code>index.contextStart</code> and <code>index.contextLimit</code>
\r
589 * set to indicate the portion of <code>text</code> to be
\r
590 * transliterated, and <code>index.start == index.contextStart</code>.
\r
591 * Thereafter, <code>index</code> can be used without
\r
592 * modification in future calls, provided that all changes to
\r
593 * <code>text</code> are made via this method.
\r
595 * <p>This method assumes that future calls may be made that will
\r
596 * insert new text into the buffer. As a result, it only performs
\r
597 * unambiguous transliterations. After the last call to this
\r
598 * method, there may be untransliterated text that is waiting for
\r
599 * more input to resolve an ambiguity. In order to perform these
\r
600 * pending transliterations, clients should call {@link
\r
601 * #finishTransliteration} after the last call to this
\r
602 * method has been made.
\r
604 * @param text the buffer holding transliterated and untransliterated text
\r
605 * @param index the start and limit of the text, the position
\r
606 * of the cursor, and the start and limit of transliteration.
\r
607 * @param insertion text to be inserted and possibly
\r
608 * transliterated into the translation buffer at
\r
609 * <code>index.contextLimit</code>. If <code>null</code> then no text
\r
611 * @see #handleTransliterate
\r
612 * @exception IllegalArgumentException if <code>index</code>
\r
616 public final void transliterate(Replaceable text, Position index,
\r
617 String insertion) {
\r
618 index.validate(text.length());
\r
620 // int originalStart = index.contextStart;
\r
621 if (insertion != null) {
\r
622 text.replace(index.limit, index.limit, insertion);
\r
623 index.limit += insertion.length();
\r
624 index.contextLimit += insertion.length();
\r
627 if (index.limit > 0 &&
\r
628 UTF16.isLeadSurrogate(text.charAt(index.limit - 1))) {
\r
629 // Oops, there is a dangling lead surrogate in the buffer.
\r
630 // This will break most transliterators, since they will
\r
631 // assume it is part of a pair. Don't transliterate until
\r
632 // more text comes in.
\r
636 filteredTransliterate(text, index, true, true);
\r
639 // This doesn't work once we add quantifier support. Need to rewrite
\r
640 // this code to support quantifiers and 'use maximum backup <n>;'.
\r
642 // index.contextStart = Math.max(index.start - getMaximumContextLength(),
\r
647 * Transliterates the portion of the text buffer that can be
\r
648 * transliterated unambiguously after a new character has been
\r
649 * inserted, typically as a result of a keyboard event. This is a
\r
650 * convenience method; see {@link #transliterate(Replaceable,
\r
651 * Transliterator.Position, String)} for details.
\r
652 * @param text the buffer holding transliterated and
\r
653 * untransliterated text
\r
654 * @param index the start and limit of the text, the position
\r
655 * of the cursor, and the start and limit of transliteration.
\r
656 * @param insertion text to be inserted and possibly
\r
657 * transliterated into the translation buffer at
\r
658 * <code>index.contextLimit</code>.
\r
659 * @see #transliterate(Replaceable, Transliterator.Position, String)
\r
662 public final void transliterate(Replaceable text, Position index,
\r
664 transliterate(text, index, UTF16.valueOf(insertion));
\r
668 * Transliterates the portion of the text buffer that can be
\r
669 * transliterated unambiguously. This is a convenience method; see
\r
670 * {@link #transliterate(Replaceable, Transliterator.Position,
\r
671 * String)} for details.
\r
672 * @param text the buffer holding transliterated and
\r
673 * untransliterated text
\r
674 * @param index the start and limit of the text, the position
\r
675 * of the cursor, and the start and limit of transliteration.
\r
676 * @see #transliterate(Replaceable, Transliterator.Position, String)
\r
679 public final void transliterate(Replaceable text, Position index) {
\r
680 transliterate(text, index, null);
\r
684 * Finishes any pending transliterations that were waiting for
\r
685 * more characters. Clients should call this method as the last
\r
686 * call after a sequence of one or more calls to
\r
687 * <code>transliterate()</code>.
\r
688 * @param text the buffer holding transliterated and
\r
689 * untransliterated text.
\r
690 * @param index the array of indices previously passed to {@link
\r
694 public final void finishTransliteration(Replaceable text,
\r
696 index.validate(text.length());
\r
697 filteredTransliterate(text, index, false, true);
\r
701 * Abstract method that concrete subclasses define to implement
\r
702 * their transliteration algorithm. This method handles both
\r
703 * incremental and non-incremental transliteration. Let
\r
704 * <code>originalStart</code> refer to the value of
\r
705 * <code>pos.start</code> upon entry.
\r
708 * <li>If <code>incremental</code> is false, then this method
\r
709 * should transliterate all characters between
\r
710 * <code>pos.start</code> and <code>pos.limit</code>. Upon return
\r
711 * <code>pos.start</code> must == <code> pos.limit</code>.</li>
\r
713 * <li>If <code>incremental</code> is true, then this method
\r
714 * should transliterate all characters between
\r
715 * <code>pos.start</code> and <code>pos.limit</code> that can be
\r
716 * unambiguously transliterated, regardless of future insertions
\r
717 * of text at <code>pos.limit</code>. Upon return,
\r
718 * <code>pos.start</code> should be in the range
\r
719 * [<code>originalStart</code>, <code>pos.limit</code>).
\r
720 * <code>pos.start</code> should be positioned such that
\r
721 * characters [<code>originalStart</code>, <code>
\r
722 * pos.start</code>) will not be changed in the future by this
\r
723 * transliterator and characters [<code>pos.start</code>,
\r
724 * <code>pos.limit</code>) are unchanged.</li>
\r
727 * <p>Implementations of this method should also obey the
\r
728 * following invariants:</p>
\r
731 * <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
\r
732 * should be updated to reflect changes in length of the text
\r
733 * between <code>pos.start</code> and <code>pos.limit</code>. The
\r
734 * difference <code> pos.contextLimit - pos.limit</code> should
\r
737 * <li><code>pos.contextStart</code> should not change.</li>
\r
739 * <li>Upon return, neither <code>pos.start</code> nor
\r
740 * <code>pos.limit</code> should be less than
\r
741 * <code>originalStart</code>.</li>
\r
743 * <li>Text before <code>originalStart</code> and text after
\r
744 * <code>pos.limit</code> should not change.</li>
\r
746 * <li>Text before <code>pos.contextStart</code> and text after
\r
747 * <code> pos.contextLimit</code> should be ignored.</li>
\r
750 * <p>Subclasses may safely assume that all characters in
\r
751 * [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
\r
752 * In other words, the filter has already been applied by the time
\r
753 * this method is called. See
\r
754 * <code>filteredTransliterate()</code>.
\r
756 * <p>This method is <b>not</b> for public consumption. Calling
\r
757 * this method directly will transliterate
\r
758 * [<code>pos.start</code>, <code>pos.limit</code>) without
\r
759 * applying the filter. End user code should call <code>
\r
760 * transliterate()</code> instead of this method. Subclass code
\r
761 * should call <code>filteredTransliterate()</code> instead of
\r
764 * @param text the buffer holding transliterated and
\r
765 * untransliterated text
\r
767 * @param pos the indices indicating the start, limit, context
\r
768 * start, and context limit of the text.
\r
770 * @param incremental if true, assume more text may be inserted at
\r
771 * <code>pos.limit</code> and act accordingly. Otherwise,
\r
772 * transliterate all text between <code>pos.start</code> and
\r
773 * <code>pos.limit</code> and move <code>pos.start</code> up to
\r
774 * <code>pos.limit</code>.
\r
776 * @see #transliterate
\r
779 protected abstract void handleTransliterate(Replaceable text,
\r
780 Position pos, boolean incremental);
\r
783 * Top-level transliteration method, handling filtering, incremental and
\r
784 * non-incremental transliteration, and rollback. All transliteration
\r
785 * public API methods eventually call this method with a rollback argument
\r
786 * of TRUE. Other entities may call this method but rollback should be
\r
789 * <p>If this transliterator has a filter, break up the input text into runs
\r
790 * of unfiltered characters. Pass each run to
\r
791 * <subclass>.handleTransliterate().
\r
793 * <p>In incremental mode, if rollback is TRUE, perform a special
\r
794 * incremental procedure in which several passes are made over the input
\r
795 * text, adding one character at a time, and committing successful
\r
796 * transliterations as they occur. Unsuccessful transliterations are rolled
\r
797 * back and retried with additional characters to give correct results.
\r
799 * @param text the text to be transliterated
\r
800 * @param index the position indices
\r
801 * @param incremental if TRUE, then assume more characters may be inserted
\r
802 * at index.limit, and postpone processing to accommodate future incoming
\r
804 * @param rollback if TRUE and if incremental is TRUE, then perform special
\r
805 * incremental processing, as described above, and undo partial
\r
806 * transliterations where necessary. If incremental is FALSE then this
\r
807 * parameter is ignored.
\r
809 private void filteredTransliterate(Replaceable text,
\r
811 boolean incremental,
\r
812 boolean rollback) {
\r
813 // Short circuit path for transliterators with no filter in
\r
814 // non-incremental mode.
\r
815 if (filter == null && !rollback) {
\r
816 handleTransliterate(text, index, incremental);
\r
820 //----------------------------------------------------------------------
\r
821 // This method processes text in two groupings:
\r
823 // RUNS -- A run is a contiguous group of characters which are contained
\r
824 // in the filter for this transliterator (filter.contains(ch) == true).
\r
825 // Text outside of runs may appear as context but it is not modified.
\r
826 // The start and limit Position values are narrowed to each run.
\r
828 // PASSES (incremental only) -- To make incremental mode work correctly,
\r
829 // each run is broken up into n passes, where n is the length (in code
\r
830 // points) of the run. Each pass contains the first n characters. If a
\r
831 // pass is completely transliterated, it is committed, and further passes
\r
832 // include characters after the committed text. If a pass is blocked,
\r
833 // and does not transliterate completely, then this method rolls back
\r
834 // the changes made during the pass, extends the pass by one code point,
\r
835 // and tries again.
\r
836 //----------------------------------------------------------------------
\r
838 // globalLimit is the limit value for the entire operation. We
\r
839 // set index.limit to the end of each unfiltered run before
\r
840 // calling handleTransliterate(), so we need to maintain the real
\r
841 // value of index.limit here. After each transliteration, we
\r
842 // update globalLimit for insertions or deletions that have
\r
844 int globalLimit = index.limit;
\r
846 // If there is a non-null filter, then break the input text up. Say the
\r
847 // input text has the form:
\r
849 // where 'x' represents a filtered character (filter.contains('x') ==
\r
850 // false). Then we break this up into:
\r
852 // Each pass through the loop consumes a run of filtered
\r
853 // characters (which are ignored) and a subsequent run of
\r
854 // unfiltered characters (which are transliterated).
\r
856 StringBuffer log = null;
\r
858 log = new StringBuffer();
\r
863 if (filter != null) {
\r
864 // Narrow the range to be transliterated to the first run
\r
865 // of unfiltered characters at or after index.start.
\r
867 // Advance past filtered chars
\r
869 while (index.start < globalLimit &&
\r
870 !filter.contains(c=text.char32At(index.start))) {
\r
871 index.start += UTF16.getCharCount(c);
\r
874 // Find the end of this run of unfiltered chars
\r
875 index.limit = index.start;
\r
876 while (index.limit < globalLimit &&
\r
877 filter.contains(c=text.char32At(index.limit))) {
\r
878 index.limit += UTF16.getCharCount(c);
\r
882 // Check to see if the unfiltered run is empty. This only
\r
883 // happens at the end of the string when all the remaining
\r
884 // characters are filtered.
\r
885 if (index.start == index.limit) {
\r
889 // Is this run incremental? If there is additional
\r
890 // filtered text (if limit < globalLimit) then we pass in
\r
891 // an incremental value of FALSE to force the subclass to
\r
892 // complete the transliteration for this run.
\r
893 boolean isIncrementalRun =
\r
894 (index.limit < globalLimit ? false : incremental);
\r
898 // Implement rollback. To understand the need for rollback,
\r
899 // consider the following transliterator:
\r
903 // "v" is a compound of "t; NFD; u" with a filter [:Ll:]
\r
905 // Now apply "v" to the input text "a". The result is "b". But if
\r
906 // the transliteration is done incrementally, then the NFD holds
\r
907 // things up after "t" has already transformed "a" to "A". When
\r
908 // finishTransliterate() is called, "A" is _not_ processed because
\r
909 // it gets excluded by the [:Ll:] filter, and the end result is "A"
\r
910 // -- incorrect. The problem is that the filter is applied to a
\r
911 // partially-transliterated result, when we only want it to apply to
\r
912 // input text. Although this example describes a compound
\r
913 // transliterator containing NFD and a specific filter, it can
\r
914 // happen with any transliterator which does a partial
\r
915 // transformation in incremental mode into characters outside its
\r
918 // To handle this, when in incremental mode we supply characters to
\r
919 // handleTransliterate() in several passes. Each pass adds one more
\r
920 // input character to the input text. That is, for input "ABCD", we
\r
921 // first try "A", then "AB", then "ABC", and finally "ABCD". If at
\r
922 // any point we block (upon return, start < limit) then we roll
\r
923 // back. If at any point we complete the run (upon return start ==
\r
924 // limit) then we commit that run.
\r
926 if (rollback && isIncrementalRun) {
\r
930 System.out.println("filteredTransliterate{"+getID()+"}i: IN=" +
\r
931 UtilityExtensions.formatInput(text, index));
\r
934 int runStart = index.start;
\r
935 int runLimit = index.limit;
\r
936 int runLength = runLimit - runStart;
\r
938 // Make a rollback copy at the end of the string
\r
939 int rollbackOrigin = text.length();
\r
940 text.copy(runStart, runLimit, rollbackOrigin);
\r
942 // Variables reflecting the commitment of completely
\r
943 // transliterated text. passStart is the runStart, advanced
\r
944 // past committed text. rollbackStart is the rollbackOrigin,
\r
945 // advanced past rollback text that corresponds to committed
\r
947 int passStart = runStart;
\r
948 int rollbackStart = rollbackOrigin;
\r
950 // The limit for each pass; we advance by one code point with
\r
952 int passLimit = index.start;
\r
954 // Total length, in 16-bit code units, of uncommitted text.
\r
955 // This is the length to be rolled back.
\r
956 int uncommittedLength = 0;
\r
958 // Total delta (change in length) for all passes
\r
959 int totalDelta = 0;
\r
961 // PASS MAIN LOOP -- Start with a single character, and extend
\r
962 // the text by one character at a time. Roll back partial
\r
963 // transliterations and commit complete transliterations.
\r
965 // Length of additional code point, either one or two
\r
967 UTF16.getCharCount(text.char32At(passLimit));
\r
968 passLimit += charLength;
\r
969 if (passLimit > runLimit) {
\r
972 uncommittedLength += charLength;
\r
974 index.limit = passLimit;
\r
978 log.append("filteredTransliterate{"+getID()+"}i: ");
\r
979 UtilityExtensions.formatInput(log, text, index);
\r
982 // Delegate to subclass for actual transliteration. Upon
\r
983 // return, start will be updated to point after the
\r
984 // transliterated text, and limit and contextLimit will be
\r
985 // adjusted for length changes.
\r
986 handleTransliterate(text, index, true);
\r
989 log.append(" => ");
\r
990 UtilityExtensions.formatInput(log, text, index);
\r
993 delta = index.limit - passLimit; // change in length
\r
995 // We failed to completely transliterate this pass.
\r
996 // Roll back the text. Indices remain unchanged; reset
\r
997 // them where necessary.
\r
998 if (index.start != index.limit) {
\r
999 // Find the rollbackStart, adjusted for length changes
\r
1000 // and the deletion of partially transliterated text.
\r
1001 int rs = rollbackStart + delta - (index.limit - passStart);
\r
1003 // Delete the partially transliterated text
\r
1004 text.replace(passStart, index.limit, "");
\r
1006 // Copy the rollback text back
\r
1007 text.copy(rs, rs + uncommittedLength, passStart);
\r
1009 // Restore indices to their original values
\r
1010 index.start = passStart;
\r
1011 index.limit = passLimit;
\r
1012 index.contextLimit -= delta;
\r
1015 log.append(" (ROLLBACK)");
\r
1019 // We did completely transliterate this pass. Update the
\r
1020 // commit indices to record how far we got. Adjust indices
\r
1021 // for length change.
\r
1023 // Move the pass indices past the committed text.
\r
1024 passStart = passLimit = index.start;
\r
1026 // Adjust the rollbackStart for length changes and move
\r
1027 // it past the committed text. All characters we've
\r
1028 // processed to this point are committed now, so zero
\r
1029 // out the uncommittedLength.
\r
1030 rollbackStart += delta + uncommittedLength;
\r
1031 uncommittedLength = 0;
\r
1033 // Adjust indices for length changes.
\r
1034 runLimit += delta;
\r
1035 totalDelta += delta;
\r
1039 System.out.println(Utility.escape(log.toString()));
\r
1043 // Adjust overall limit and rollbackOrigin for insertions and
\r
1044 // deletions. Don't need to worry about contextLimit because
\r
1045 // handleTransliterate() maintains that.
\r
1046 rollbackOrigin += totalDelta;
\r
1047 globalLimit += totalDelta;
\r
1049 // Delete the rollback copy
\r
1050 text.replace(rollbackOrigin, rollbackOrigin + runLength, "");
\r
1052 // Move start past committed text
\r
1053 index.start = passStart;
\r
1057 // Delegate to subclass for actual transliteration.
\r
1060 log.append("filteredTransliterate{"+getID()+"}: ");
\r
1061 UtilityExtensions.formatInput(log, text, index);
\r
1064 int limit = index.limit;
\r
1065 handleTransliterate(text, index, isIncrementalRun);
\r
1066 delta = index.limit - limit; // change in length
\r
1069 log.append(" => ");
\r
1070 UtilityExtensions.formatInput(log, text, index);
\r
1073 // In a properly written transliterator, start == limit after
\r
1074 // handleTransliterate() returns when incremental is false.
\r
1075 // Catch cases where the subclass doesn't do this, and throw
\r
1076 // an exception. (Just pinning start to limit is a bad idea,
\r
1077 // because what's probably happening is that the subclass
\r
1078 // isn't transliterating all the way to the end, and it should
\r
1079 // in non-incremental mode.)
\r
1080 if (!isIncrementalRun && index.start != index.limit) {
\r
1081 throw new RuntimeException("ERROR: Incomplete non-incremental transliteration by " + getID());
\r
1084 // Adjust overall limit for insertions/deletions. Don't need
\r
1085 // to worry about contextLimit because handleTransliterate()
\r
1086 // maintains that.
\r
1087 globalLimit += delta;
\r
1090 System.out.println(Utility.escape(log.toString()));
\r
1094 if (filter == null || isIncrementalRun) {
\r
1098 // If we did completely transliterate this
\r
1099 // run, then repeat with the next unfiltered run.
\r
1102 // Start is valid where it is. Limit needs to be put back where
\r
1103 // it was, modulo adjustments for deletions/insertions.
\r
1104 index.limit = globalLimit;
\r
1107 System.out.println("filteredTransliterate{"+getID()+"}: OUT=" +
\r
1108 UtilityExtensions.formatInput(text, index));
\r
1113 * Transliterate a substring of text, as specified by index, taking filters
\r
1114 * into account. This method is for subclasses that need to delegate to
\r
1115 * another transliterator, such as CompoundTransliterator.
\r
1116 * @param text the text to be transliterated
\r
1117 * @param index the position indices
\r
1118 * @param incremental if TRUE, then assume more characters may be inserted
\r
1119 * at index.limit, and postpone processing to accommodate future incoming
\r
1123 public void filteredTransliterate(Replaceable text,
\r
1125 boolean incremental) {
\r
1126 filteredTransliterate(text, index, incremental, false);
\r
1130 * Returns the length of the longest context required by this transliterator.
\r
1131 * This is <em>preceding</em> context. The default value is zero, but
\r
1132 * subclasses can change this by calling <code>setMaximumContextLength()</code>.
\r
1133 * For example, if a transliterator translates "ddd" (where
\r
1134 * d is any digit) to "555" when preceded by "(ddd)", then the preceding
\r
1135 * context length is 5, the length of "(ddd)".
\r
1137 * @return The maximum number of preceding context characters this
\r
1138 * transliterator needs to examine
\r
1141 public final int getMaximumContextLength() {
\r
1142 return maximumContextLength;
\r
1146 * Method for subclasses to use to set the maximum context length.
\r
1147 * @see #getMaximumContextLength
\r
1150 protected void setMaximumContextLength(int a) {
\r
1152 throw new IllegalArgumentException("Invalid context length " + a);
\r
1154 maximumContextLength = a;
\r
1158 * Returns a programmatic identifier for this transliterator.
\r
1159 * If this identifier is passed to <code>getInstance()</code>, it
\r
1160 * will return this object, if it has been registered.
\r
1161 * @see #registerClass
\r
1162 * @see #getAvailableIDs
\r
1165 public final String getID() {
\r
1170 * Set the programmatic identifier for this transliterator. Only
\r
1171 * for use by subclasses.
\r
1174 protected final void setID(String id) {
\r
1179 * Returns a name for this transliterator that is appropriate for
\r
1180 * display to the user in the default locale. See {@link
\r
1181 * #getDisplayName(String,Locale)} for details.
\r
1184 public final static String getDisplayName(String ID) {
\r
1185 return getDisplayName(ID, ULocale.getDefault());
\r
1189 * Returns a name for this transliterator that is appropriate for
\r
1190 * display to the user in the given locale. This name is taken
\r
1191 * from the locale resource data in the standard manner of the
\r
1192 * <code>java.text</code> package.
\r
1194 * <p>If no localized names exist in the system resource bundles,
\r
1195 * a name is synthesized using a localized
\r
1196 * <code>MessageFormat</code> pattern from the resource data. The
\r
1197 * arguments to this pattern are an integer followed by one or two
\r
1198 * strings. The integer is the number of strings, either 1 or 2.
\r
1199 * The strings are formed by splitting the ID for this
\r
1200 * transliterator at the first '-'. If there is no '-', then the
\r
1201 * entire ID forms the only string.
\r
1202 * @param inLocale the Locale in which the display name should be
\r
1204 * @see java.text.MessageFormat
\r
1207 public static String getDisplayName(String id, Locale inLocale) {
\r
1208 return getDisplayName(id, ULocale.forLocale(inLocale));
\r
1212 * Returns a name for this transliterator that is appropriate for
\r
1213 * display to the user in the given locale. This name is taken
\r
1214 * from the locale resource data in the standard manner of the
\r
1215 * <code>java.text</code> package.
\r
1217 * <p>If no localized names exist in the system resource bundles,
\r
1218 * a name is synthesized using a localized
\r
1219 * <code>MessageFormat</code> pattern from the resource data. The
\r
1220 * arguments to this pattern are an integer followed by one or two
\r
1221 * strings. The integer is the number of strings, either 1 or 2.
\r
1222 * The strings are formed by splitting the ID for this
\r
1223 * transliterator at the first '-'. If there is no '-', then the
\r
1224 * entire ID forms the only string.
\r
1225 * @param inLocale the ULocale in which the display name should be
\r
1227 * @see java.text.MessageFormat
\r
1230 public static String getDisplayName(String id, ULocale inLocale) {
\r
1232 // Resource bundle containing display name keys and the
\r
1233 // RB_RULE_BASED_IDS array.
\r
1235 //If we ever integrate this with the Sun JDK, the resource bundle
\r
1236 // root will change to sun.text.resources.LocaleElements
\r
1238 ICUResourceBundle bundle = (ICUResourceBundle)UResourceBundle.
\r
1239 getBundleInstance(ICUResourceBundle.ICU_TRANSLIT_BASE_NAME, inLocale);
\r
1241 // Normalize the ID
\r
1242 String stv[] = TransliteratorIDParser.IDtoSTV(id);
\r
1243 if (stv == null) {
\r
1244 // No target; malformed id
\r
1247 String ID = stv[0] + '-' + stv[1];
\r
1248 if (stv[2] != null && stv[2].length() > 0) {
\r
1249 ID = ID + '/' + stv[2];
\r
1252 // Use the registered display name, if any
\r
1253 String n = (String) displayNameCache.get(new CaseInsensitiveString(ID));
\r
1258 // Use display name for the entire transliterator, if it
\r
1261 return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
\r
1262 } catch (MissingResourceException e) {}
\r
1265 // Construct the formatter first; if getString() fails
\r
1266 // we'll exit the try block
\r
1267 MessageFormat format = new MessageFormat(
\r
1268 bundle.getString(RB_DISPLAY_NAME_PATTERN));
\r
1269 // Construct the argument array
\r
1270 Object[] args = new Object[] { new Integer(2), stv[0], stv[1] };
\r
1272 // Use display names for the scripts, if they exist
\r
1273 for (int j=1; j<=2; ++j) {
\r
1275 args[j] = bundle.getString(RB_SCRIPT_DISPLAY_NAME_PREFIX +
\r
1276 (String) args[j]);
\r
1277 } catch (MissingResourceException e) {}
\r
1280 // Format it using the pattern in the resource
\r
1281 return (stv[2].length() > 0) ?
\r
1282 (format.format(args) + '/' + stv[2]) :
\r
1283 format.format(args);
\r
1284 } catch (MissingResourceException e2) {}
\r
1286 // We should not reach this point unless there is something
\r
1287 // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
\r
1288 // been deleted from the root RB_LOCALE_ELEMENTS resource.
\r
1289 throw new RuntimeException();
\r
1293 * Returns the filter used by this transliterator, or <tt>null</tt>
\r
1294 * if this transliterator uses no filter.
\r
1297 public final UnicodeFilter getFilter() {
\r
1302 * Changes the filter used by this transliterator. If the filter
\r
1303 * is set to <tt>null</tt> then no filtering will occur.
\r
1305 * <p>Callers must take care if a transliterator is in use by
\r
1306 * multiple threads. The filter should not be changed by one
\r
1307 * thread while another thread may be transliterating.
\r
1310 public void setFilter(UnicodeFilter filter) {
\r
1311 this.filter = filter;
\r
1315 * Returns a <code>Transliterator</code> object given its ID.
\r
1316 * The ID must be either a system transliterator ID or an ID registered
\r
1317 * using <code>registerClass()</code>.
\r
1319 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
\r
1320 * @return A <code>Transliterator</code> object with the given ID
\r
1321 * @exception IllegalArgumentException if the given ID is invalid.
\r
1324 public static final Transliterator getInstance(String ID) {
\r
1325 return getInstance(ID, FORWARD);
\r
1329 * Returns a <code>Transliterator</code> object given its ID.
\r
1330 * The ID must be either a system transliterator ID or an ID registered
\r
1331 * using <code>registerClass()</code>.
\r
1333 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
\r
1334 * @param dir either FORWARD or REVERSE. If REVERSE then the
\r
1335 * inverse of the given ID is instantiated.
\r
1336 * @return A <code>Transliterator</code> object with the given ID
\r
1337 * @exception IllegalArgumentException if the given ID is invalid.
\r
1338 * @see #registerClass
\r
1339 * @see #getAvailableIDs
\r
1343 public static Transliterator getInstance(String ID,
\r
1345 StringBuffer canonID = new StringBuffer();
\r
1346 Vector list = new Vector();
\r
1347 UnicodeSet[] globalFilter = new UnicodeSet[1];
\r
1348 if (!TransliteratorIDParser.parseCompoundID(ID, dir, canonID, list, globalFilter)) {
\r
1349 throw new IllegalArgumentException("Invalid ID " + ID);
\r
1352 TransliteratorIDParser.instantiateList(list);
\r
1354 // assert(list.size() > 0);
\r
1355 Transliterator t = null;
\r
1356 if (list.size() > 1 || Utility.indexOf(canonID, ";") >= 0) {
\r
1357 // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
\r
1358 // has one child transliterator. This is so that toRules() will return the right thing
\r
1359 // (without any inactive ID), but our main ID still comes out correct. That is, if we
\r
1360 // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
\r
1361 // even though the ID is "(Lower);Latin-Greek;".
\r
1362 t = new CompoundTransliterator(list);
\r
1365 t = (Transliterator)list.elementAt(0);
\r
1368 t.setID(canonID.toString());
\r
1369 if (globalFilter[0] != null) {
\r
1370 t.setFilter(globalFilter[0]);
\r
1376 * Create a transliterator from a basic ID. This is an ID
\r
1377 * containing only the forward direction source, target, and
\r
1379 * @param id a basic ID of the form S-T or S-T/V.
\r
1380 * @param canonID canonical ID to apply to the result, or
\r
1381 * null to leave the ID unchanged
\r
1382 * @return a newly created Transliterator or null if the ID is
\r
1385 static Transliterator getBasicInstance(String id, String canonID) {
\r
1386 StringBuffer s = new StringBuffer();
\r
1387 Transliterator t = registry.get(id, s);
\r
1388 if (s.length() != 0) {
\r
1390 // Instantiate an alias
\r
1391 t = getInstance(s.toString(), FORWARD);
\r
1393 if (t != null && canonID != null) {
\r
1400 * Returns a <code>Transliterator</code> object constructed from
\r
1401 * the given rule string. This will be a RuleBasedTransliterator,
\r
1402 * if the rule string contains only rules, or a
\r
1403 * CompoundTransliterator, if it contains ID blocks, or a
\r
1404 * NullTransliterator, if it contains ID blocks which parse as
\r
1405 * empty for the given direction.
\r
1408 public static final Transliterator createFromRules(String ID, String rules, int dir) {
\r
1409 Transliterator t = null;
\r
1411 TransliteratorParser parser = new TransliteratorParser();
\r
1412 parser.parse(rules, dir);
\r
1414 // NOTE: The logic here matches that in TransliteratorRegistry.
\r
1415 if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) {
\r
1416 t = new NullTransliterator();
\r
1418 else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
\r
1419 t = new RuleBasedTransliterator(ID, (RuleBasedTransliterator.Data)parser.dataVector.get(0), null);
\r
1421 else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
\r
1422 // idBlock, no data -- this is an alias. The ID has
\r
1423 // been munged from reverse into forward mode, if
\r
1424 // necessary, so instantiate the ID in the forward
\r
1426 if (parser.compoundFilter != null)
\r
1427 t = getInstance(parser.compoundFilter.toPattern(false) + ";"
\r
1428 + (String)parser.idBlockVector.get(0));
\r
1430 t = getInstance((String)parser.idBlockVector.get(0));
\r
1438 Vector transliterators = new Vector();
\r
1439 int passNumber = 1;
\r
1441 int limit = Math.max(parser.idBlockVector.size(), parser.dataVector.size());
\r
1442 for (int i = 0; i < limit; i++) {
\r
1443 if (i < parser.idBlockVector.size()) {
\r
1444 String idBlock = (String)parser.idBlockVector.get(i);
\r
1445 if (idBlock.length() > 0) {
\r
1446 Transliterator temp = getInstance(idBlock);
\r
1447 if (!(temp instanceof NullTransliterator))
\r
1448 transliterators.add(getInstance(idBlock));
\r
1451 if (i < parser.dataVector.size()) {
\r
1452 RuleBasedTransliterator.Data data = (RuleBasedTransliterator.Data)parser.dataVector.get(i);
\r
1453 transliterators.add(new RuleBasedTransliterator("%Pass" + passNumber++, data, null));
\r
1457 t = new CompoundTransliterator(transliterators, passNumber - 1);
\r
1459 if (parser.compoundFilter != null) {
\r
1460 t.setFilter(parser.compoundFilter);
\r
1468 * Returns a rule string for this transliterator.
\r
1469 * @param escapeUnprintable if true, then unprintable characters
\r
1470 * will be converted to escape form backslash-'u' or
\r
1474 public String toRules(boolean escapeUnprintable) {
\r
1475 return baseToRules(escapeUnprintable);
\r
1479 * Returns a rule string for this transliterator. This is
\r
1480 * a non-overrideable base class implementation that subclasses
\r
1481 * may call. It simply munges the ID into the correct format,
\r
1482 * that is, "foo" => "::foo".
\r
1483 * @param escapeUnprintable if true, then unprintable characters
\r
1484 * will be converted to escape form backslash-'u' or
\r
1488 protected final String baseToRules(boolean escapeUnprintable) {
\r
1489 // The base class implementation of toRules munges the ID into
\r
1490 // the correct format. That is: foo => ::foo
\r
1491 // KEEP in sync with rbt_pars
\r
1492 if (escapeUnprintable) {
\r
1493 StringBuffer rulesSource = new StringBuffer();
\r
1494 String id = getID();
\r
1495 for (int i=0; i<id.length();) {
\r
1496 int c = UTF16.charAt(id, i);
\r
1497 if (!Utility.escapeUnprintable(rulesSource, c)) {
\r
1498 UTF16.append(rulesSource, c);
\r
1500 i += UTF16.getCharCount(c);
\r
1502 rulesSource.insert(0, "::");
\r
1503 rulesSource.append(ID_DELIM);
\r
1504 return rulesSource.toString();
\r
1506 return "::" + getID() + ID_DELIM;
\r
1510 * Return the elements that make up this transliterator. For
\r
1511 * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
\r
1512 * were created, the return value of this method would be an array
\r
1513 * of the three transliterator objects that make up that
\r
1514 * transliterator: [NFD, Jamo-Latin, Latin-Greek].
\r
1516 * <p>If this transliterator is not composed of other
\r
1517 * transliterators, then this method will return an array of
\r
1518 * length one containing a reference to this transliterator.
\r
1519 * @return an array of one or more transliterators that make up
\r
1520 * this transliterator
\r
1523 public Transliterator[] getElements() {
\r
1524 Transliterator result[];
\r
1525 if (this instanceof CompoundTransliterator) {
\r
1526 CompoundTransliterator cpd = (CompoundTransliterator) this;
\r
1527 result = new Transliterator[cpd.getCount()];
\r
1528 for (int i=0; i<result.length; ++i) {
\r
1529 result[i] = cpd.getTransliterator(i);
\r
1532 result = new Transliterator[] { this };
\r
1538 * Returns the set of all characters that may be modified in the
\r
1539 * input text by this Transliterator. This incorporates this
\r
1540 * object's current filter; if the filter is changed, the return
\r
1541 * value of this function will change. The default implementation
\r
1542 * returns an empty set. Some subclasses may override {@link
\r
1543 * #handleGetSourceSet} to return a more precise result. The
\r
1544 * return result is approximate in any case and is intended for
\r
1545 * use by tests, tools, or utilities.
\r
1546 * @see #getTargetSet
\r
1547 * @see #handleGetSourceSet
\r
1550 public final UnicodeSet getSourceSet() {
\r
1551 UnicodeSet set = handleGetSourceSet();
\r
1552 if (filter != null) {
\r
1553 UnicodeSet filterSet;
\r
1554 // Most, but not all filters will be UnicodeSets. Optimize for
\r
1555 // the high-runner case.
\r
1557 filterSet = (UnicodeSet) filter;
\r
1558 } catch (ClassCastException e) {
\r
1559 filterSet = new UnicodeSet();
\r
1560 filter.addMatchSetTo(filterSet);
\r
1562 set.retainAll(filterSet);
\r
1568 * Framework method that returns the set of all characters that
\r
1569 * may be modified in the input text by this Transliterator,
\r
1570 * ignoring the effect of this object's filter. The base class
\r
1571 * implementation returns the empty set. Subclasses that wish to
\r
1572 * implement this should override this method.
\r
1573 * @return the set of characters that this transliterator may
\r
1574 * modify. The set may be modified, so subclasses should return a
\r
1575 * newly-created object.
\r
1576 * @see #getSourceSet
\r
1577 * @see #getTargetSet
\r
1580 protected UnicodeSet handleGetSourceSet() {
\r
1581 return new UnicodeSet();
\r
1585 * Returns the set of all characters that may be generated as
\r
1586 * replacement text by this transliterator. The default
\r
1587 * implementation returns the empty set. Some subclasses may
\r
1588 * override this method to return a more precise result. The
\r
1589 * return result is approximate in any case and is intended for
\r
1590 * use by tests, tools, or utilities requiring such
\r
1591 * meta-information.
\r
1592 * @see #getSourceSet
\r
1595 public UnicodeSet getTargetSet() {
\r
1596 return new UnicodeSet();
\r
1600 * Returns this transliterator's inverse. See the class
\r
1601 * documentation for details. This implementation simply inverts
\r
1602 * the two entities in the ID and attempts to retrieve the
\r
1603 * resulting transliterator. That is, if <code>getID()</code>
\r
1604 * returns "A-B", then this method will return the result of
\r
1605 * <code>getInstance("B-A")</code>, or <code>null</code> if that
\r
1608 * <p>Subclasses with knowledge of their inverse may wish to
\r
1609 * override this method.
\r
1611 * @return a transliterator that is an inverse, not necessarily
\r
1612 * exact, of this transliterator, or <code>null</code> if no such
\r
1613 * transliterator is registered.
\r
1614 * @see #registerClass
\r
1617 public final Transliterator getInverse() {
\r
1618 return getInstance(ID, REVERSE);
\r
1622 * Registers a subclass of <code>Transliterator</code> with the
\r
1623 * system. This subclass must have a public constructor taking no
\r
1624 * arguments. When that constructor is called, the resulting
\r
1625 * object must return the <code>ID</code> passed to this method if
\r
1626 * its <code>getID()</code> method is called.
\r
1628 * @param ID the result of <code>getID()</code> for this
\r
1630 * @param transClass a subclass of <code>Transliterator</code>
\r
1631 * @see #unregister
\r
1634 public static void registerClass(String ID, Class transClass, String displayName) {
\r
1635 registry.put(ID, transClass, true);
\r
1636 if (displayName != null) {
\r
1637 displayNameCache.put(new CaseInsensitiveString(ID), displayName);
\r
1642 * Register a factory object with the given ID. The factory
\r
1643 * method should return a new instance of the given transliterator.
\r
1644 * @param ID the ID of this transliterator
\r
1645 * @param factory the factory object
\r
1648 public static void registerFactory(String ID, Factory factory) {
\r
1649 registry.put(ID, factory, true);
\r
1653 * Register a Transliterator object under the ID returned by its getID().
\r
1654 * @param trans the Transliterator object
\r
1657 public static void registerInstance(Transliterator trans) {
\r
1658 registry.put(trans.getID(), trans, true);
\r
1662 * Register a Transliterator object with the given ID.
\r
1663 * @param visible visibility flag forwarded to the registry along with trans.getID()
\r
1664 * @param trans the Transliterator object
\r
1667 static void registerInstance(Transliterator trans, boolean visible) {
\r
1668 registry.put(trans.getID(), trans, visible);
\r
1672 * Register an ID as an alias of another ID. Instantiating
\r
1673 * alias ID produces the same result as instantiating the original ID.
\r
1674 * This is generally used to create short aliases of compound IDs.
\r
1675 * @param aliasID The new ID being registered.
\r
1676 * @param realID The existing ID that the new ID should be an alias of.
\r
1679 public static void registerAlias(String aliasID, String realID) {
\r
1680 registry.put(aliasID, realID, true);
\r
1684 * Register two targets as being inverses of one another. For
\r
1685 * example, calling registerSpecialInverse("NFC", "NFD", true) causes
\r
1686 * Transliterator to form the following inverse relationships:
\r
1689 * Any-NFC => Any-NFD
\r
1691 * Any-NFD => Any-NFC</pre>
\r
1693 * (Without the special inverse registration, the inverse of NFC
\r
1694 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
\r
1695 * that the presence or absence of "Any-" is preserved.
\r
1697 * <p>The relationship is symmetrical; registering (a, b) is
\r
1698 * equivalent to registering (b, a).
\r
1700 * <p>The relevant IDs must still be registered separately as
\r
1701 * factories or classes.
\r
1703 * <p>Only the targets are specified. Special inverses always
\r
1704 * have the form Any-Target1 <=> Any-Target2. The target should
\r
1705 * have canonical casing (the casing desired to be produced when
\r
1706 * an inverse is formed) and should contain no whitespace or other
\r
1707 * extraneous characters.
\r
1709 * @param target the target against which to register the inverse
\r
1710 * @param inverseTarget the inverse of target, that is
\r
1711 * Any-target.getInverse() => Any-inverseTarget
\r
1712 * @param bidirectional if true, register the reverse relation
\r
1713 * as well, that is, Any-inverseTarget.getInverse() => Any-target
\r
1716 static void registerSpecialInverse(String target,
\r
1717 String inverseTarget,
\r
1718 boolean bidirectional) {
\r
1719 TransliteratorIDParser.registerSpecialInverse(target, inverseTarget, bidirectional);
\r
1723 * Unregisters a transliterator or class. This may be either
\r
1724 * a system transliterator or a user transliterator or class.
\r
1726 * @param ID the ID of the transliterator or class
\r
1727 * @see #registerClass
\r
1730 public static void unregister(String ID) {
\r
1731 displayNameCache.remove(new CaseInsensitiveString(ID));
\r
1732 registry.remove(ID);
\r
1736 * Returns an enumeration over the programmatic names of registered
\r
1737 * <code>Transliterator</code> objects. This includes both system
\r
1738 * transliterators and user transliterators registered using
\r
1739 * <code>registerClass()</code>. The enumerated names may be
\r
1740 * passed to <code>getInstance()</code>.
\r
1742 * @return An <code>Enumeration</code> over <code>String</code> objects
\r
1743 * @see #getInstance
\r
1744 * @see #registerClass
\r
1747 public static final Enumeration getAvailableIDs() {
\r
1748 return registry.getAvailableIDs();
\r
1752 * Returns an enumeration over the source names of registered
\r
1753 * transliterators. Source names may be passed to
\r
1754 * getAvailableTargets() to obtain available targets for each
\r
1758 public static final Enumeration getAvailableSources() {
\r
1759 return registry.getAvailableSources();
\r
1763 * Returns an enumeration over the target names of registered
\r
1764 * transliterators having a given source name. Target names may
\r
1765 * be passed to getAvailableVariants() to obtain available
\r
1766 * variants for each source and target pair.
\r
1769 public static final Enumeration getAvailableTargets(String source) {
\r
1770 return registry.getAvailableTargets(source);
\r
1774 * Returns an enumeration over the variant names of registered
\r
1775 * transliterators having a given source name and target name.
\r
1778 public static final Enumeration getAvailableVariants(String source,
\r
1780 return registry.getAvailableVariants(source, target);
\r
1782 private static final String INDEX = "index",
\r
1783 RB_RULE_BASED_IDS ="RuleBasedTransliteratorIDs";
\r
1785 registry = new TransliteratorRegistry();
\r
1787 // The display name cache starts out empty
\r
1788 displayNameCache = new Hashtable();
\r
1789 /* The following code parses the index table located in
\r
1790 * icu/data/translit/root.txt. The index is an n x 4 table
\r
1791 * that follows this format:
\r
1794 * resource{"<resource>"}
\r
1795 * direction{"<direction>"}
\r
1800 * resource{"<resource>"}
\r
1801 * direction{"<direction"}
\r
1805 * alias{"<getInstanceArg"}
\r
1807 * <id> is the ID of the system transliterator being defined. These
\r
1808 * are public IDs enumerated by Transliterator.getAvailableIDs(),
\r
1809 * unless the second field is "internal".
\r
1811 * <resource> is a ResourceReader resource name. Currently these refer
\r
1812 * to file names under com/ibm/text/resources. This string is passed
\r
1813 * directly to ResourceReader, together with <encoding>.
\r
1815 * <direction> is either "FORWARD" or "REVERSE".
\r
1817 * <getInstanceArg> is a string to be passed directly to
\r
1818 * Transliterator.getInstance(). The returned Transliterator object
\r
1819 * then has its ID changed to <id> and is returned.
\r
1821 * The extra blank field on "alias" lines is to make the array square.
\r
1823 UResourceBundle bundle, transIDs, colBund;
\r
1824 bundle = UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_TRANSLIT_BASE_NAME, INDEX);
\r
1825 transIDs = bundle.get(RB_RULE_BASED_IDS);
\r
1828 maxRows = transIDs.getSize();
\r
1829 for (row = 0; row < maxRows; row++) {
\r
1830 colBund = transIDs.get(row);
\r
1831 String ID = colBund.getKey();
\r
1832 UResourceBundle res = colBund.get(0);
\r
1833 String type = res.getKey();
\r
1834 if (type.equals("file") || type.equals("internal")) {
\r
1835 // Rest of line is <resource>:<encoding>:<direction>
\r
1837 String resString = res.getString("resource");
\r
1839 String direction = res.getString("direction");
\r
1840 switch (direction.charAt(0)) {
\r
1848 throw new RuntimeException("Can't parse direction: " + direction);
\r
1851 resString, // resource
\r
1852 "UTF-16", // encoding
\r
1854 !type.equals("internal"));
\r
1855 } else if (type.equals("alias")) {
\r
1856 //'alias'; row[2]=createInstance argument
\r
1857 String resString = res.getString();
\r
1858 registry.put(ID, resString, true);
\r
1861 throw new RuntimeException("Unknow type: " + type);
\r
1865 registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false);
\r
1867 // Register non-rule-based transliterators
\r
1868 registerClass(NullTransliterator._ID,
\r
1869 NullTransliterator.class, null);
\r
1870 RemoveTransliterator.register();
\r
1871 EscapeTransliterator.register();
\r
1872 UnescapeTransliterator.register();
\r
1873 LowercaseTransliterator.register();
\r
1874 UppercaseTransliterator.register();
\r
1875 TitlecaseTransliterator.register();
\r
1876 CaseFoldTransliterator.register();
\r
1877 UnicodeNameTransliterator.register();
\r
1878 NameUnicodeTransliterator.register();
\r
1879 NormalizationTransliterator.register();
\r
1880 BreakTransliterator.register();
\r
1881 AnyTransliterator.register(); // do this last!
\r
1885 * Register the script-based "Any" transliterators: Any-Latin, Any-Greek
\r
1889 public static void registerAny() {
\r
1890 AnyTransliterator.register();
\r
1894 * The factory interface for transliterators. Transliterator
\r
1895 * subclasses can register factory objects for IDs using the
\r
1896 * registerFactory() method of Transliterator. When invoked, the
\r
1897 * factory object will be passed the ID being instantiated. This
\r
1898 * makes it possible to register one factory method to more than
\r
1899 * one ID, or for a factory method to parameterize its result
\r
1900 * based on the variant.
\r
1903 public static interface Factory {
\r
1905 * Return a transliterator for the given ID.
\r
1908 Transliterator getInstance(String ID);
\r
1912 * Implements StringTransform via this method.
\r
1913 * @param source text to be transformed (eg lowercased)
\r
1917 public String transform(String source) {
\r
1918 return transliterate(source);
\r