2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.text.MessageFormat;
\r
10 import java.util.Enumeration;
\r
11 import java.util.Hashtable;
\r
12 import java.util.Locale;
\r
13 import java.util.MissingResourceException;
\r
14 import java.util.Vector;
\r
16 import com.ibm.icu.impl.ICUResourceBundle;
\r
17 import com.ibm.icu.impl.Utility;
\r
18 import com.ibm.icu.impl.UtilityExtensions;
\r
19 import com.ibm.icu.text.RuleBasedTransliterator.Data;
\r
20 import com.ibm.icu.text.TransliteratorIDParser.SingleID;
\r
21 import com.ibm.icu.util.CaseInsensitiveString;
\r
22 import com.ibm.icu.util.ULocale;
\r
23 import com.ibm.icu.util.UResourceBundle;
\r
26 * <code>Transliterator</code> is an abstract class that
\r
27 * transliterates text from one format to another. The most common
\r
28 * kind of transliterator is a script, or alphabet, transliterator.
\r
29 * For example, a Russian to Latin transliterator changes Russian text
\r
30 * written in Cyrillic characters to phonetically equivalent Latin
\r
31 * characters. It does not <em>translate</em> Russian to English!
\r
32 * Transliteration, unlike translation, operates on characters, without
\r
33 * reference to the meanings of words and sentences.
\r
35 * <p>Although script conversion is its most common use, a
\r
36 * transliterator can actually perform a more general class of tasks.
\r
37 * In fact, <code>Transliterator</code> defines a very general API
\r
38 * which specifies only that a segment of the input text is replaced
\r
39 * by new text. The particulars of this conversion are determined
\r
40 * entirely by subclasses of <code>Transliterator</code>.
\r
42 * <p><b>Transliterators are stateless</b>
\r
44 * <p><code>Transliterator</code> objects are <em>stateless</em>; they
\r
45 * retain no information between calls to
\r
46 * <code>transliterate()</code>. As a result, threads may share
\r
47 * transliterators without synchronizing them. This might seem to
\r
48 * limit the complexity of the transliteration operation. In
\r
49 * practice, subclasses perform complex transliterations by delaying
\r
50 * the replacement of text until it is known that no other
\r
51 * replacements are possible. In other words, although the
\r
52 * <code>Transliterator</code> objects are stateless, the source text
\r
53 * itself embodies all the needed information, and delayed operation
\r
54 * allows arbitrary complexity.
\r
56 * <p><b>Batch transliteration</b>
\r
58 * <p>The simplest way to perform transliteration is all at once, on a
\r
59 * string of existing text. This is referred to as <em>batch</em>
\r
60 * transliteration. For example, given a string <code>input</code>
\r
61 * and a transliterator <code>t</code>, the call
\r
63 * <blockquote><code>String result = t.transliterate(input);
\r
64 * </code></blockquote>
\r
66 * will transliterate it and return the result. Other methods allow
\r
67 * the client to specify a substring to be transliterated and to use
\r
68 * {@link Replaceable} objects instead of strings, in order to
\r
69 * preserve out-of-band information (such as text styles).
\r
71 * <p><b>Keyboard transliteration</b>
\r
73 * <p>Somewhat more involved is <em>keyboard</em>, or incremental
\r
74 * transliteration. This is the transliteration of text that is
\r
75 * arriving from some source (typically the user's keyboard) one
\r
76 * character at a time, or in some other piecemeal fashion.
\r
78 * <p>In keyboard transliteration, a <code>Replaceable</code> buffer
\r
79 * stores the text. As text is inserted, as much as possible is
\r
80 * transliterated on the fly. This means a GUI that displays the
\r
81 * contents of the buffer may show text being modified as each new
\r
82 * character arrives.
\r
84 * <p>Consider the simple <code>RuleBasedTransliterator</code>:
\r
86 * <blockquote><code>
\r
89 * </code></blockquote>
\r
91 * When the user types 't', nothing will happen, since the
\r
92 * transliterator is waiting to see if the next character is 'h'. To
\r
93 * remedy this, we introduce the notion of a cursor, marked by a '|'
\r
94 * in the output string:
\r
96 * <blockquote><code>
\r
99 * </code></blockquote>
\r
101 * Now when the user types 't', tau appears, and if the next character
\r
102 * is 'h', the tau changes to a theta. This is accomplished by
\r
103 * maintaining a cursor position (independent of the insertion point,
\r
104 * and invisible in the GUI) across calls to
\r
105 * <code>transliterate()</code>. Typically, the cursor will
\r
106 * be coincident with the insertion point, but in a case like the one
\r
107 * above, it will precede the insertion point.
\r
109 * <p>Keyboard transliteration methods maintain a set of three indices
\r
110 * that are updated with each call to
\r
111 * <code>transliterate()</code>, including the cursor, start,
\r
112 * and limit. These indices are changed by the method, and they are
\r
113 * passed in and out via a Position object. The <code>start</code> index
\r
114 * marks the beginning of the substring that the transliterator will
\r
115 * look at. It is advanced as text becomes committed (but it is not
\r
116 * the committed index; that's the <code>cursor</code>). The
\r
117 * <code>cursor</code> index, described above, marks the point at
\r
118 * which the transliterator last stopped, either because it reached
\r
119 * the end, or because it required more characters to disambiguate
\r
120 * between possible inputs. The <code>cursor</code> can also be
\r
121 * explicitly set by rules in a <code>RuleBasedTransliterator</code>.
\r
122 * Any characters before the <code>cursor</code> index are frozen;
\r
123 * future keyboard transliteration calls within this input sequence
\r
124 * will not change them. New text is inserted at the
\r
125 * <code>limit</code> index, which marks the end of the substring that
\r
126 * the transliterator looks at.
\r
128 * <p>Because keyboard transliteration assumes that more characters
\r
129 * are to arrive, it is conservative in its operation. It only
\r
130 * transliterates when it can do so unambiguously. Otherwise it waits
\r
131 * for more characters to arrive. When the client code knows that no
\r
132 * more characters are forthcoming, perhaps because the user has
\r
133 * performed some input termination operation, then it should call
\r
134 * <code>finishTransliteration()</code> to complete any
\r
135 * pending transliterations.
\r
137 * <p><b>Inverses</b>
\r
139 * <p>Pairs of transliterators may be inverses of one another. For
\r
140 * example, if transliterator <b>A</b> transliterates characters by
\r
141 * incrementing their Unicode value (so "abc" -> "def"), and
\r
142 * transliterator <b>B</b> decrements character values, then <b>A</b>
\r
143 * is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
\r
144 * with <b>B</b> in a compound transliterator, the result is the
\r
145 * identity transliterator, that is, a transliterator that does not
\r
146 * change its input text.
\r
148 * The <code>Transliterator</code> method <code>getInverse()</code>
\r
149 * returns a transliterator's inverse, if one exists, or
\r
150 * <code>null</code> otherwise. However, the result of
\r
151 * <code>getInverse()</code> usually will <em>not</em> be a true
\r
152 * mathematical inverse. This is because true inverse transliterators
\r
153 * are difficult to formulate. For example, consider two
\r
154 * transliterators: <b>AB</b>, which transliterates the character 'A'
\r
155 * to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
\r
156 * seem that these are exact inverses, since
\r
158 * <blockquote>"A" x <b>AB</b> -> "B"<br>
\r
159 * "B" x <b>BA</b> -> "A"</blockquote>
\r
161 * where 'x' represents transliteration. However,
\r
163 * <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
\r
164 * "BBCD" x <b>BA</b> -> "AACD"</blockquote>
\r
166 * so <b>AB</b> composed with <b>BA</b> is not the
\r
167 * identity. Nonetheless, <b>BA</b> may be usefully considered to be
\r
168 * <b>AB</b>'s inverse, and it is on this basis that
\r
169 * <b>AB</b><code>.getInverse()</code> could legitimately return
\r
172 * <p><b>IDs and display names</b>
\r
174 * <p>A transliterator is designated by a short identifier string or
\r
175 * <em>ID</em>. IDs follow the format <em>source-destination</em>,
\r
176 * where <em>source</em> describes the entity being replaced, and
\r
177 * <em>destination</em> describes the entity replacing
\r
178 * <em>source</em>. The entities may be the names of scripts,
\r
179 * particular sequences of characters, or whatever else it is that the
\r
180 * transliterator converts to or from. For example, a transliterator
\r
181 * from Russian to Latin might be named "Russian-Latin". A
\r
182 * transliterator from keyboard escape sequences to Latin-1 characters
\r
183 * might be named "KeyboardEscape-Latin1". By convention, system
\r
184 * entity names are in English, with the initial letters of words
\r
185 * capitalized; user entity names may follow any format so long as
\r
186 * they do not contain dashes.
\r
188 * <p>In addition to programmatic IDs, transliterator objects have
\r
189 * display names for presentation in user interfaces, returned by
\r
190 * {@link #getDisplayName}.
\r
192 * <p><b>Factory methods and registration</b>
\r
194 * <p>In general, client code should use the factory method
\r
195 * <code>getInstance()</code> to obtain an instance of a
\r
196 * transliterator given its ID. Valid IDs may be enumerated using
\r
197 * <code>getAvailableIDs()</code>. Since transliterators are
\r
198 * stateless, multiple calls to <code>getInstance()</code> with the
\r
199 * same ID will return the same object.
\r
201 * <p>In addition to the system transliterators registered at startup,
\r
202 * user transliterators may be registered by calling
\r
203 * <code>registerInstance()</code> at run time. To register a
\r
204 * transliterator subclass without instantiating it (until it is
\r
205 * needed), users may call <code>registerClass()</code>.
\r
207 * <p><b>Composed transliterators</b>
\r
209 * <p>In addition to built-in system transliterators like
\r
210 * "Latin-Greek", there are also built-in <em>composed</em>
\r
211 * transliterators. These are implemented by composing two or more
\r
212 * component transliterators. For example, if we have scripts "A",
\r
213 * "B", "C", and "D", and we want to transliterate between all pairs
\r
214 * of them, then we need to write 12 transliterators: "A-B", "A-C",
\r
215 * "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to
\r
216 * convert all scripts to an intermediate script "M", then instead of
\r
217 * writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M",
\r
218 * "D~M", "M~A", "M~B", "M~C", "M~D". (This might not seem like a big
\r
219 * win, but it's really 2<em>n</em> vs. <em>n</em><sup>2</sup> -
\r
220 * <em>n</em>, so as <em>n</em> gets larger the gain becomes
\r
221 * significant. With 9 scripts, it's 18 vs. 72 rule sets, a big
\r
222 * difference.) Note the use of "~" rather than "-" for the script
\r
223 * separator here; this indicates that the given transliterator is
\r
224 * intended to be composed with others, rather than be used as is.
\r
226 * <p>Composed transliterators can be instantiated as usual. For
\r
227 * example, the system transliterator "Devanagari-Gujarati" is a
\r
228 * composed transliterator built internally as
\r
229 * "Devanagari~InterIndic;InterIndic~Gujarati". When this
\r
230 * transliterator is instantiated, it appears externally to be a
\r
231 * standard transliterator (e.g., getID() returns
\r
232 * "Devanagari-Gujarati").
\r
234 * <p><b>Subclassing</b>
\r
236 * <p>Subclasses must implement the abstract method
\r
237 * <code>handleTransliterate()</code>. <p>Subclasses should override
\r
238 * the <code>transliterate()</code> method taking a
\r
239 * <code>Replaceable</code> and the <code>transliterate()</code>
\r
240 * method taking a <code>String</code> and <code>StringBuffer</code>
\r
241 * if the performance of these methods can be improved over the
\r
242 * performance obtained by the default implementations in this class.
\r
244 * <p>Copyright © IBM Corporation 1999. All rights reserved.
\r
249 public abstract class Transliterator implements StringTransform {
\r
251 * Direction constant indicating the forward direction in a transliterator,
\r
252 * e.g., the forward rules of a RuleBasedTransliterator. An "A-B"
\r
253 * transliterator transliterates A to B when operating in the forward
\r
254 * direction, and B to A when operating in the reverse direction.
\r
257 public static final int FORWARD = 0;
\r
260 * Direction constant indicating the reverse direction in a transliterator,
\r
261 * e.g., the reverse rules of a RuleBasedTransliterator. An "A-B"
\r
262 * transliterator transliterates A to B when operating in the forward
\r
263 * direction, and B to A when operating in the reverse direction.
\r
266 public static final int REVERSE = 1;
\r
269 * Position structure for incremental transliteration. This data
\r
270 * structure defines two substrings of the text being
\r
271 * transliterated. The first region, [contextStart,
\r
272 * contextLimit), defines what characters the transliterator will
\r
273 * read as context. The second region, [start, limit), defines
\r
274 * what characters will actually be transliterated. The second
\r
275 * region should be a subset of the first.
\r
277 * <p>After a transliteration operation, some of the indices in this
\r
278 * structure will be modified. See the field descriptions for
\r
281 * <p>contextStart <= start <= limit <= contextLimit
\r
283 * <p>Note: All index values in this structure must be at code point
\r
284 * boundaries. That is, none of them may occur between two code units
\r
285 * of a surrogate pair. If any index does split a surrogate pair,
\r
286 * results are unspecified.
\r
289 public static class Position {
\r
292 * Beginning index, inclusive, of the context to be considered for
\r
293 * a transliteration operation. The transliterator will ignore
\r
294 * anything before this index. INPUT/OUTPUT parameter: This parameter
\r
295 * is updated by a transliteration operation to reflect the maximum
\r
296 * amount of antecontext needed by a transliterator.
\r
299 public int contextStart;
\r
302 * Ending index, exclusive, of the context to be considered for a
\r
303 * transliteration operation. The transliterator will ignore
\r
304 * anything at or after this index. INPUT/OUTPUT parameter: This
\r
305 * parameter is updated to reflect changes in the length of the
\r
306 * text, but points to the same logical position in the text.
\r
309 public int contextLimit;
\r
312 * Beginning index, inclusive, of the text to be transliterated.
\r
313 * INPUT/OUTPUT parameter: This parameter is advanced past
\r
314 * characters that have already been transliterated by a
\r
315 * transliteration operation.
\r
321 * Ending index, exclusive, of the text to be transliterated.
\r
322 * INPUT/OUTPUT parameter: This parameter is updated to reflect
\r
323 * changes in the length of the text, but points to the same
\r
324 * logical position in the text.
\r
330 * Constructs a Position object with start, limit,
\r
331 * contextStart, and contextLimit all equal to zero.
\r
334 public Position() {
\r
339 * Constructs a Position object with the given start,
\r
340 * contextStart, and contextLimit. The limit is set to the
\r
344 public Position(int contextStart, int contextLimit, int start) {
\r
345 this(contextStart, contextLimit, start, contextLimit);
\r
349 * Constructs a Position object with the given start, limit,
\r
350 * contextStart, and contextLimit.
\r
353 public Position(int contextStart, int contextLimit,
\r
354 int start, int limit) {
\r
355 this.contextStart = contextStart;
\r
356 this.contextLimit = contextLimit;
\r
357 this.start = start;
\r
358 this.limit = limit;
\r
362 * Constructs a Position object that is a copy of another.
\r
365 public Position(Position pos) {
\r
370 * Copies the indices of this position from another.
\r
373 public void set(Position pos) {
\r
374 contextStart = pos.contextStart;
\r
375 contextLimit = pos.contextLimit;
\r
381 * Returns true if this Position is equal to the given object.
\r
384 public boolean equals(Object obj) {
\r
385 if (obj instanceof Position) {
\r
386 Position pos = (Position) obj;
\r
387 return contextStart == pos.contextStart &&
\r
388 contextLimit == pos.contextLimit &&
\r
389 start == pos.start &&
\r
390 limit == pos.limit;
\r
396 * Returns a string representation of this Position.
\r
399 public String toString() {
\r
400 return "[cs=" + contextStart
\r
403 + ", cl=" + contextLimit
\r
408 * Check all bounds. If they are invalid, throw an exception.
\r
409 * @param length the length of the string this object applies to
\r
410 * @exception IllegalArgumentException if any indices are out
\r
414 public final void validate(int length) {
\r
415 if (contextStart < 0 ||
\r
416 start < contextStart ||
\r
418 contextLimit < limit ||
\r
419 length < contextLimit) {
\r
420 throw new IllegalArgumentException("Invalid Position {cs=" +
\r
421 contextStart + ", s=" +
\r
424 contextLimit + "}, len=" +
\r
431 * Programmatic name, e.g., "Latin-Arabic".
\r
436 * This transliterator's filter. Any character for which
\r
437 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
\r
438 * altered by this transliterator. If <tt>filter</tt> is
\r
439 * <tt>null</tt> then no filtering is applied.
\r
441 private UnicodeFilter filter;
\r
443 private int maximumContextLength = 0;
\r
446 * System transliterator registry.
\r
448 private static TransliteratorRegistry registry;
\r
450 private static Hashtable<CaseInsensitiveString, String> displayNameCache;
\r
453 * Prefix for resource bundle key for the display name for a
\r
454 * transliterator. The ID is appended to this to form the key.
\r
455 * The resource bundle value should be a String.
\r
457 private static final String RB_DISPLAY_NAME_PREFIX = "%Translit%%";
\r
460 * Prefix for resource bundle key for the display name for a
\r
461 * transliterator SCRIPT. The ID is appended to this to form the key.
\r
462 * The resource bundle value should be a String.
\r
464 private static final String RB_SCRIPT_DISPLAY_NAME_PREFIX = "%Translit%";
\r
467 * Resource bundle key for display name pattern.
\r
468 * The resource bundle value should be a String forming a
\r
469 * MessageFormat pattern, e.g.:
\r
470 * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
\r
472 private static final String RB_DISPLAY_NAME_PATTERN = "TransliteratorNamePattern";
\r
475 * Delimiter between elements in a compound ID.
\r
477 static final char ID_DELIM = ';';
\r
480 * Delimiter before target in an ID.
\r
482 static final char ID_SEP = '-';
\r
485 * Delimiter before variant in an ID.
\r
487 static final char VARIANT_SEP = '/';
\r
490 * To enable debugging output in the Transliterator component, set
\r
493 * N.B. Make sure to recompile all of the com.ibm.icu.text package
\r
494 * after changing this. Easiest way to do this is 'ant clean
\r
495 * core' ('ant' will NOT pick up the dependency automatically).
\r
497 * <<This generates a lot of output.>>
\r
499 static final boolean DEBUG = false;
\r
502 * Default constructor.
\r
503 * @param ID the string identifier for this transliterator
\r
504 * @param filter the filter. Any character for which
\r
505 * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
\r
506 * altered by this transliterator. If <tt>filter</tt> is
\r
507 * <tt>null</tt> then no filtering is applied.
\r
510 protected Transliterator(String ID, UnicodeFilter filter) {
\r
512 throw new NullPointerException();
\r
515 this.filter = filter;
\r
519 * Transliterates a segment of a string, with optional filtering.
\r
521 * @param text the string to be transliterated
\r
522 * @param start the beginning index, inclusive; <code>0 <= start
\r
524 * @param limit the ending index, exclusive; <code>start <= limit
\r
525 * <= text.length()</code>.
\r
526 * @return The new limit index. The text previously occupying <code>[start,
\r
527 * limit)</code> has been transliterated, possibly to a string of a different
\r
528 * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
\r
529 * <em>new-limit</em> is the return value. If the input offsets are out of bounds,
\r
530 * the returned value is -1 and the input string remains unchanged.
\r
533 public final int transliterate(Replaceable text, int start, int limit) {
\r
536 text.length() < limit) {
\r
540 Position pos = new Position(start, limit, start);
\r
541 filteredTransliterate(text, pos, false, true);
\r
546 * Transliterates an entire string in place. Convenience method.
\r
547 * @param text the string to be transliterated
\r
550 public final void transliterate(Replaceable text) {
\r
551 transliterate(text, 0, text.length());
\r
555 * Transliterates an entire string and returns the result. Convenience method.
\r
557 * @param text the string to be transliterated
\r
558 * @return The transliterated text
\r
561 public final String transliterate(String text) {
\r
562 ReplaceableString result = new ReplaceableString(text);
\r
563 transliterate(result);
\r
564 return result.toString();
\r
568 * Transliterates the portion of the text buffer that can be
\r
569 * transliterated unambiguously after new text has been inserted,
\r
570 * typically as a result of a keyboard event. The new text in
\r
571 * <code>insertion</code> will be inserted into <code>text</code>
\r
572 * at <code>index.limit</code>, advancing
\r
573 * <code>index.limit</code> and <code>index.contextLimit</code> by <code>insertion.length()</code>.
\r
574 * Then the transliterator will try to transliterate characters of
\r
575 * <code>text</code> between <code>index.start</code> and
\r
576 * <code>index.contextLimit</code>. Characters before
\r
577 * <code>index.start</code> will not be changed.
\r
579 * <p>Upon return, values in <code>index</code> will be updated.
\r
580 * <code>index.contextStart</code> will be advanced to the first
\r
581 * character that future calls to this method will read.
\r
582 * <code>index.start</code> and <code>index.contextLimit</code> will
\r
583 * be adjusted to delimit the range of text that future calls to
\r
584 * this method may change.
\r
586 * <p>Typical usage of this method begins with an initial call
\r
587 * with <code>index.contextStart</code> and <code>index.contextLimit</code>
\r
588 * set to indicate the portion of <code>text</code> to be
\r
589 * transliterated, and <code>index.start == index.contextStart</code>.
\r
590 * Thereafter, <code>index</code> can be used without
\r
591 * modification in future calls, provided that all changes to
\r
592 * <code>text</code> are made via this method.
\r
594 * <p>This method assumes that future calls may be made that will
\r
595 * insert new text into the buffer. As a result, it only performs
\r
596 * unambiguous transliterations. After the last call to this
\r
597 * method, there may be untransliterated text that is waiting for
\r
598 * more input to resolve an ambiguity. In order to perform these
\r
599 * pending transliterations, clients should call {@link
\r
600 * #finishTransliteration} after the last call to this
\r
601 * method has been made.
\r
603 * @param text the buffer holding transliterated and untransliterated text
\r
604 * @param index the start and limit of the text, the position
\r
605 * of the cursor, and the start and limit of transliteration.
\r
606 * @param insertion text to be inserted and possibly
\r
607 * transliterated into the translation buffer at
\r
608 * <code>index.limit</code>. If <code>null</code> then no text
\r
610 * @see #handleTransliterate
\r
611 * @exception IllegalArgumentException if <code>index</code>
\r
615 public final void transliterate(Replaceable text, Position index,
\r
616 String insertion) {
\r
617 index.validate(text.length());
\r
619 // int originalStart = index.contextStart;
\r
620 if (insertion != null) {
\r
621 text.replace(index.limit, index.limit, insertion);
\r
622 index.limit += insertion.length();
\r
623 index.contextLimit += insertion.length();
\r
626 if (index.limit > 0 &&
\r
627 UTF16.isLeadSurrogate(text.charAt(index.limit - 1))) {
\r
628 // Oops, there is a dangling lead surrogate in the buffer.
\r
629 // This will break most transliterators, since they will
\r
630 // assume it is part of a pair. Don't transliterate until
\r
631 // more text comes in.
\r
635 filteredTransliterate(text, index, true, true);
\r
638 // This doesn't work once we add quantifier support. Need to rewrite
\r
639 // this code to support quantifiers and 'use maximum backup <n>;'.
\r
641 // index.contextStart = Math.max(index.start - getMaximumContextLength(),
\r
646 * Transliterates the portion of the text buffer that can be
\r
647 * transliterated unambiguously after a new character has been
\r
648 * inserted, typically as a result of a keyboard event. This is a
\r
649 * convenience method; see {@link #transliterate(Replaceable,
\r
650 * Transliterator.Position, String)} for details.
\r
651 * @param text the buffer holding transliterated and
\r
652 * untransliterated text
\r
653 * @param index the start and limit of the text, the position
\r
654 * of the cursor, and the start and limit of transliteration.
\r
655 * @param insertion text to be inserted and possibly
\r
656 * transliterated into the translation buffer at
\r
657 * <code>index.limit</code>.
\r
658 * @see #transliterate(Replaceable, Transliterator.Position, String)
\r
661 public final void transliterate(Replaceable text, Position index,
\r
663 transliterate(text, index, UTF16.valueOf(insertion));
\r
667 * Transliterates the portion of the text buffer that can be
\r
668 * transliterated unambiguously. This is a convenience method; see
\r
669 * {@link #transliterate(Replaceable, Transliterator.Position,
\r
670 * String)} for details.
\r
671 * @param text the buffer holding transliterated and
\r
672 * untransliterated text
\r
673 * @param index the start and limit of the text, the position
\r
674 * of the cursor, and the start and limit of transliteration.
\r
675 * @see #transliterate(Replaceable, Transliterator.Position, String)
\r
678 public final void transliterate(Replaceable text, Position index) {
\r
679 transliterate(text, index, null);
\r
683 * Finishes any pending transliterations that were waiting for
\r
684 * more characters. Clients should call this method as the last
\r
685 * call after a sequence of one or more calls to
\r
686 * <code>transliterate()</code>.
\r
687 * @param text the buffer holding transliterated and
\r
688 * untransliterated text.
\r
689 * @param index the array of indices previously passed to {@link
\r
693 public final void finishTransliteration(Replaceable text,
\r
695 index.validate(text.length());
\r
696 filteredTransliterate(text, index, false, true);
\r
700 * Abstract method that concrete subclasses define to implement
\r
701 * their transliteration algorithm. This method handles both
\r
702 * incremental and non-incremental transliteration. Let
\r
703 * <code>originalStart</code> refer to the value of
\r
704 * <code>pos.start</code> upon entry.
\r
707 * <li>If <code>incremental</code> is false, then this method
\r
708 * should transliterate all characters between
\r
709 * <code>pos.start</code> and <code>pos.limit</code>. Upon return
\r
710 * <code>pos.start</code> must == <code> pos.limit</code>.</li>
\r
712 * <li>If <code>incremental</code> is true, then this method
\r
713 * should transliterate all characters between
\r
714 * <code>pos.start</code> and <code>pos.limit</code> that can be
\r
715 * unambiguously transliterated, regardless of future insertions
\r
716 * of text at <code>pos.limit</code>. Upon return,
\r
717 * <code>pos.start</code> should be in the range
\r
718 * [<code>originalStart</code>, <code>pos.limit</code>).
\r
719 * <code>pos.start</code> should be positioned such that
\r
720 * characters [<code>originalStart</code>, <code>
\r
721 * pos.start</code>) will not be changed in the future by this
\r
722 * transliterator and characters [<code>pos.start</code>,
\r
723 * <code>pos.limit</code>) are unchanged.</li>
\r
726 * <p>Implementations of this method should also obey the
\r
727 * following invariants:</p>
\r
730 * <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
\r
731 * should be updated to reflect changes in length of the text
\r
732 * between <code>pos.start</code> and <code>pos.limit</code>. The
\r
733 * difference <code> pos.contextLimit - pos.limit</code> should
\r
736 * <li><code>pos.contextStart</code> should not change.</li>
\r
738 * <li>Upon return, neither <code>pos.start</code> nor
\r
739 * <code>pos.limit</code> should be less than
\r
740 * <code>originalStart</code>.</li>
\r
742 * <li>Text before <code>originalStart</code> and text after
\r
743 * <code>pos.limit</code> should not change.</li>
\r
745 * <li>Text before <code>pos.contextStart</code> and text after
\r
746 * <code> pos.contextLimit</code> should be ignored.</li>
\r
749 * <p>Subclasses may safely assume that all characters in
\r
750 * [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
\r
751 * In other words, the filter has already been applied by the time
\r
752 * this method is called. See
\r
753 * <code>filteredTransliterate()</code>.
\r
755 * <p>This method is <b>not</b> for public consumption. Calling
\r
756 * this method directly will transliterate
\r
757 * [<code>pos.start</code>, <code>pos.limit</code>) without
\r
758 * applying the filter. End user code should call <code>
\r
759 * transliterate()</code> instead of this method. Subclass code
\r
760 * should call <code>filteredTransliterate()</code> instead of
\r
763 * @param text the buffer holding transliterated and
\r
764 * untransliterated text
\r
766 * @param pos the indices indicating the start, limit, context
\r
767 * start, and context limit of the text.
\r
769 * @param incremental if true, assume more text may be inserted at
\r
770 * <code>pos.limit</code> and act accordingly. Otherwise,
\r
771 * transliterate all text between <code>pos.start</code> and
\r
772 * <code>pos.limit</code> and move <code>pos.start</code> up to
\r
773 * <code>pos.limit</code>.
\r
775 * @see #transliterate
\r
778 protected abstract void handleTransliterate(Replaceable text,
\r
779 Position pos, boolean incremental);
\r
782 * Top-level transliteration method, handling filtering, incremental and
\r
783 * non-incremental transliteration, and rollback. All transliteration
\r
784 * public API methods eventually call this method with a rollback argument
\r
785 * of TRUE. Other entities may call this method but rollback should be
\r
788 * <p>If this transliterator has a filter, break up the input text into runs
\r
789 * of unfiltered characters. Pass each run to
\r
790 * <subclass>.handleTransliterate().
\r
792 * <p>In incremental mode, if rollback is TRUE, perform a special
\r
793 * incremental procedure in which several passes are made over the input
\r
794 * text, adding one character at a time, and committing successful
\r
795 * transliterations as they occur. Unsuccessful transliterations are rolled
\r
796 * back and retried with additional characters to give correct results.
\r
798 * @param text the text to be transliterated
\r
799 * @param index the position indices
\r
800 * @param incremental if TRUE, then assume more characters may be inserted
\r
801 * at index.limit, and postpone processing to accommodate future incoming
\r
803 * @param rollback if TRUE and if incremental is TRUE, then perform special
\r
804 * incremental processing, as described above, and undo partial
\r
805 * transliterations where necessary. If incremental is FALSE then this
\r
806 * parameter is ignored.
\r
808 private void filteredTransliterate(Replaceable text,
\r
810 boolean incremental,
\r
811 boolean rollback) {
\r
812 // Short circuit path for transliterators with no filter in
\r
813 // non-incremental mode.
\r
814 if (filter == null && !rollback) {
\r
815 handleTransliterate(text, index, incremental);
\r
819 //----------------------------------------------------------------------
\r
820 // This method processes text in two groupings:
\r
822 // RUNS -- A run is a contiguous group of characters which are contained
\r
823 // in the filter for this transliterator (filter.contains(ch) == true).
\r
824 // Text outside of runs may appear as context but it is not modified.
\r
825 // The start and limit Position values are narrowed to each run.
\r
827 // PASSES (incremental only) -- To make incremental mode work correctly,
\r
828 // each run is broken up into n passes, where n is the length (in code
\r
829 // points) of the run. Each pass contains the first n characters. If a
\r
830 // pass is completely transliterated, it is committed, and further passes
\r
831 // include characters after the committed text. If a pass is blocked,
\r
832 // and does not transliterate completely, then this method rolls back
\r
833 // the changes made during the pass, extends the pass by one code point,
\r
834 // and tries again.
\r
835 //----------------------------------------------------------------------
\r
837 // globalLimit is the limit value for the entire operation. We
\r
838 // set index.limit to the end of each unfiltered run before
\r
839 // calling handleTransliterate(), so we need to maintain the real
\r
840 // value of index.limit here. After each transliteration, we
\r
841 // update globalLimit for insertions or deletions that have
\r
843 int globalLimit = index.limit;
\r
845 // If there is a non-null filter, then break the input text up. Say the
\r
846 // input text has the form:
\r
848 // where 'x' represents a filtered character (filter.contains('x') ==
\r
849 // false). Then we break this up into:
\r
851 // Each pass through the loop consumes a run of filtered
\r
852 // characters (which are ignored) and a subsequent run of
\r
853 // unfiltered characters (which are transliterated).
\r
855 StringBuffer log = null;
\r
857 log = new StringBuffer();
\r
862 if (filter != null) {
\r
863 // Narrow the range to be transliterated to the first run
\r
864 // of unfiltered characters at or after index.start.
\r
866 // Advance past filtered chars
\r
868 while (index.start < globalLimit &&
\r
869 !filter.contains(c=text.char32At(index.start))) {
\r
870 index.start += UTF16.getCharCount(c);
\r
873 // Find the end of this run of unfiltered chars
\r
874 index.limit = index.start;
\r
875 while (index.limit < globalLimit &&
\r
876 filter.contains(c=text.char32At(index.limit))) {
\r
877 index.limit += UTF16.getCharCount(c);
\r
881 // Check to see if the unfiltered run is empty. This only
\r
882 // happens at the end of the string when all the remaining
\r
883 // characters are filtered.
\r
884 if (index.start == index.limit) {
\r
888 // Is this run incremental? If there is additional
\r
889 // filtered text (if limit < globalLimit) then we pass in
\r
890 // an incremental value of FALSE to force the subclass to
\r
891 // complete the transliteration for this run.
\r
892 boolean isIncrementalRun =
\r
893 (index.limit < globalLimit ? false : incremental);
\r
897 // Implement rollback. To understand the need for rollback,
\r
898 // consider the following transliterator:
\r
902 // "v" is a compound of "t; NFD; u" with a filter [:Ll:]
\r
904 // Now apply "v" to the input text "a". The result is "b". But if
\r
905 // the transliteration is done incrementally, then the NFD holds
\r
906 // things up after "t" has already transformed "a" to "A". When
\r
907 // finishTransliterate() is called, "A" is _not_ processed because
\r
908 // it gets excluded by the [:Ll:] filter, and the end result is "A"
\r
909 // -- incorrect. The problem is that the filter is applied to a
\r
910 // partially-transliterated result, when we only want it to apply to
\r
911 // input text. Although this example describes a compound
\r
912 // transliterator containing NFD and a specific filter, it can
\r
913 // happen with any transliterator which does a partial
\r
914 // transformation in incremental mode into characters outside its
\r
917 // To handle this, when in incremental mode we supply characters to
\r
918 // handleTransliterate() in several passes. Each pass adds one more
\r
919 // input character to the input text. That is, for input "ABCD", we
\r
920 // first try "A", then "AB", then "ABC", and finally "ABCD". If at
\r
921 // any point we block (upon return, start < limit) then we roll
\r
922 // back. If at any point we complete the run (upon return start ==
\r
923 // limit) then we commit that run.
\r
925 if (rollback && isIncrementalRun) {
\r
929 System.out.println("filteredTransliterate{"+getID()+"}i: IN=" +
\r
930 UtilityExtensions.formatInput(text, index));
\r
933 int runStart = index.start;
\r
934 int runLimit = index.limit;
\r
935 int runLength = runLimit - runStart;
\r
937 // Make a rollback copy at the end of the string
\r
938 int rollbackOrigin = text.length();
\r
939 text.copy(runStart, runLimit, rollbackOrigin);
\r
941 // Variables reflecting the commitment of completely
\r
942 // transliterated text. passStart is the runStart, advanced
\r
943 // past committed text. rollbackStart is the rollbackOrigin,
\r
944 // advanced past rollback text that corresponds to committed
\r
946 int passStart = runStart;
\r
947 int rollbackStart = rollbackOrigin;
\r
949 // The limit for each pass; we advance by one code point with
\r
951 int passLimit = index.start;
\r
953 // Total length, in 16-bit code units, of uncommitted text.
\r
954 // This is the length to be rolled back.
\r
955 int uncommittedLength = 0;
\r
957 // Total delta (change in length) for all passes
\r
958 int totalDelta = 0;
\r
960 // PASS MAIN LOOP -- Start with a single character, and extend
\r
961 // the text by one character at a time. Roll back partial
\r
962 // transliterations and commit complete transliterations.
\r
964 // Length of additional code point, either one or two
\r
966 UTF16.getCharCount(text.char32At(passLimit));
\r
967 passLimit += charLength;
\r
968 if (passLimit > runLimit) {
\r
971 uncommittedLength += charLength;
\r
973 index.limit = passLimit;
\r
977 log.append("filteredTransliterate{"+getID()+"}i: ");
\r
978 UtilityExtensions.formatInput(log, text, index);
\r
981 // Delegate to subclass for actual transliteration. Upon
\r
982 // return, start will be updated to point after the
\r
983 // transliterated text, and limit and contextLimit will be
\r
984 // adjusted for length changes.
\r
985 handleTransliterate(text, index, true);
\r
988 log.append(" => ");
\r
989 UtilityExtensions.formatInput(log, text, index);
\r
992 delta = index.limit - passLimit; // change in length
\r
994 // We failed to completely transliterate this pass.
\r
995 // Roll back the text. Indices remain unchanged; reset
\r
996 // them where necessary.
\r
997 if (index.start != index.limit) {
\r
998 // Find the rollbackStart, adjusted for length changes
\r
999 // and the deletion of partially transliterated text.
\r
1000 int rs = rollbackStart + delta - (index.limit - passStart);
\r
1002 // Delete the partially transliterated text
\r
1003 text.replace(passStart, index.limit, "");
\r
1005 // Copy the rollback text back
\r
1006 text.copy(rs, rs + uncommittedLength, passStart);
\r
1008 // Restore indices to their original values
\r
1009 index.start = passStart;
\r
1010 index.limit = passLimit;
\r
1011 index.contextLimit -= delta;
\r
1014 log.append(" (ROLLBACK)");
\r
1018 // We did completely transliterate this pass. Update the
\r
1019 // commit indices to record how far we got. Adjust indices
\r
1020 // for length change.
\r
1022 // Move the pass indices past the committed text.
\r
1023 passStart = passLimit = index.start;
\r
1025 // Adjust the rollbackStart for length changes and move
\r
1026 // it past the committed text. All characters we've
\r
1027 // processed to this point are committed now, so zero
\r
1028 // out the uncommittedLength.
\r
1029 rollbackStart += delta + uncommittedLength;
\r
1030 uncommittedLength = 0;
\r
1032 // Adjust indices for length changes.
\r
1033 runLimit += delta;
\r
1034 totalDelta += delta;
\r
1038 System.out.println(Utility.escape(log.toString()));
\r
1042 // Adjust overall limit and rollbackOrigin for insertions and
\r
1043 // deletions. Don't need to worry about contextLimit because
\r
1044 // handleTransliterate() maintains that.
\r
1045 rollbackOrigin += totalDelta;
\r
1046 globalLimit += totalDelta;
\r
1048 // Delete the rollback copy
\r
1049 text.replace(rollbackOrigin, rollbackOrigin + runLength, "");
\r
1051 // Move start past committed text
\r
1052 index.start = passStart;
\r
1056 // Delegate to subclass for actual transliteration.
\r
1059 log.append("filteredTransliterate{"+getID()+"}: ");
\r
1060 UtilityExtensions.formatInput(log, text, index);
\r
1063 int limit = index.limit;
\r
1064 handleTransliterate(text, index, isIncrementalRun);
\r
1065 delta = index.limit - limit; // change in length
\r
1068 log.append(" => ");
\r
1069 UtilityExtensions.formatInput(log, text, index);
\r
1072 // In a properly written transliterator, start == limit after
\r
1073 // handleTransliterate() returns when incremental is false.
\r
1074 // Catch cases where the subclass doesn't do this, and throw
\r
1075 // an exception. (Just pinning start to limit is a bad idea,
\r
1076 // because what's probably happening is that the subclass
\r
1077 // isn't transliterating all the way to the end, and it should
\r
1078 // in non-incremental mode.)
\r
1079 if (!isIncrementalRun && index.start != index.limit) {
\r
1080 throw new RuntimeException("ERROR: Incomplete non-incremental transliteration by " + getID());
\r
1083 // Adjust overall limit for insertions/deletions. Don't need
\r
1084 // to worry about contextLimit because handleTransliterate()
\r
1085 // maintains that.
\r
1086 globalLimit += delta;
\r
1089 System.out.println(Utility.escape(log.toString()));
\r
1093 if (filter == null || isIncrementalRun) {
\r
1097 // If we did completely transliterate this
\r
1098 // run, then repeat with the next unfiltered run.
\r
1101 // Start is valid where it is. Limit needs to be put back where
\r
1102 // it was, modulo adjustments for deletions/insertions.
\r
1103 index.limit = globalLimit;
\r
1106 System.out.println("filteredTransliterate{"+getID()+"}: OUT=" +
\r
1107 UtilityExtensions.formatInput(text, index));
\r
1112 * Transliterate a substring of text, as specified by index, taking filters
\r
1113 * into account. This method is for subclasses that need to delegate to
\r
1114 * another transliterator, such as CompoundTransliterator.
\r
1115 * @param text the text to be transliterated
\r
1116 * @param index the position indices
\r
1117 * @param incremental if TRUE, then assume more characters may be inserted
\r
1118 * at index.limit, and postpone processing to accommodate future incoming
\r
1122 public void filteredTransliterate(Replaceable text,
\r
1124 boolean incremental) {
\r
1125 filteredTransliterate(text, index, incremental, false);
\r
1129 * Returns the length of the longest context required by this transliterator.
\r
1130 * This is <em>preceding</em> context. The default value is zero, but
\r
1131 * subclasses can change this by calling <code>setMaximumContextLength()</code>.
\r
1132 * For example, if a transliterator translates "ddd" (where
\r
1133 * d is any digit) to "555" when preceded by "(ddd)", then the preceding
\r
1134 * context length is 5, the length of "(ddd)".
\r
1136 * @return The maximum number of preceding context characters this
\r
1137 * transliterator needs to examine
\r
1140 public final int getMaximumContextLength() {
\r
1141 return maximumContextLength;
\r
1145 * Method for subclasses to use to set the maximum context length.
\r
1146 * @see #getMaximumContextLength
\r
1149 protected void setMaximumContextLength(int a) {
\r
1151 throw new IllegalArgumentException("Invalid context length " + a);
\r
1153 maximumContextLength = a;
\r
1157 * Returns a programmatic identifier for this transliterator.
\r
1158 * If this identifier is passed to <code>getInstance()</code>, it
\r
1159 * will return this object, if it has been registered.
\r
1160 * @see #registerClass
\r
1161 * @see #getAvailableIDs
\r
1164 public final String getID() {
\r
1169 * Set the programmatic identifier for this transliterator. Only
\r
1170 * for use by subclasses.
\r
1173 protected final void setID(String id) {
\r
1178 * Returns a name for this transliterator that is appropriate for
\r
1179 * display to the user in the default locale. See {@link
\r
1180 * #getDisplayName(String,Locale)} for details.
\r
1183 public final static String getDisplayName(String ID) {
\r
1184 return getDisplayName(ID, ULocale.getDefault());
\r
1188 * Returns a name for this transliterator that is appropriate for
\r
1189 * display to the user in the given locale. This name is taken
\r
1190 * from the locale resource data in the standard manner of the
\r
1191 * <code>java.text</code> package.
\r
1193 * <p>If no localized names exist in the system resource bundles,
\r
1194 * a name is synthesized using a localized
\r
1195 * <code>MessageFormat</code> pattern from the resource data. The
\r
1196 * arguments to this pattern are an integer followed by one or two
\r
1197 * strings. The integer is the number of strings, either 1 or 2.
\r
1198 * The strings are formed by splitting the ID for this
\r
1199 * transliterator at the first '-'. If there is no '-', then the
\r
1200 * entire ID forms the only string.
\r
1201 * @param inLocale the Locale in which the display name should be
\r
1203 * @see java.text.MessageFormat
\r
1206 public static String getDisplayName(String id, Locale inLocale) {
\r
1207 return getDisplayName(id, ULocale.forLocale(inLocale));
\r
1211 * Returns a name for this transliterator that is appropriate for
\r
1212 * display to the user in the given locale. This name is taken
\r
1213 * from the locale resource data in the standard manner of the
\r
1214 * <code>java.text</code> package.
\r
1216 * <p>If no localized names exist in the system resource bundles,
\r
1217 * a name is synthesized using a localized
\r
1218 * <code>MessageFormat</code> pattern from the resource data. The
\r
1219 * arguments to this pattern are an integer followed by one or two
\r
1220 * strings. The integer is the number of strings, either 1 or 2.
\r
1221 * The strings are formed by splitting the ID for this
\r
1222 * transliterator at the first '-'. If there is no '-', then the
\r
1223 * entire ID forms the only string.
\r
1224 * @param inLocale the ULocale in which the display name should be
\r
1226 * @see java.text.MessageFormat
\r
1229 public static String getDisplayName(String id, ULocale inLocale) {
\r
1231 // Resource bundle containing display name keys and the
\r
1232 // RB_RULE_BASED_IDS array.
\r
1234 //If we ever integrate this with the Sun JDK, the resource bundle
\r
1235 // root will change to sun.text.resources.LocaleElements
\r
1237 ICUResourceBundle bundle = (ICUResourceBundle)UResourceBundle.
\r
1238 getBundleInstance(ICUResourceBundle.ICU_TRANSLIT_BASE_NAME, inLocale);
\r
1240 // Normalize the ID
\r
1241 String stv[] = TransliteratorIDParser.IDtoSTV(id);
\r
1242 if (stv == null) {
\r
1243 // No target; malformed id
\r
1246 String ID = stv[0] + '-' + stv[1];
\r
1247 if (stv[2] != null && stv[2].length() > 0) {
\r
1248 ID = ID + '/' + stv[2];
\r
1251 // Use the registered display name, if any
\r
1252 String n = displayNameCache.get(new CaseInsensitiveString(ID));
\r
1257 // Use display name for the entire transliterator, if it
\r
1260 return bundle.getString(RB_DISPLAY_NAME_PREFIX + ID);
\r
1261 } catch (MissingResourceException e) {}
\r
1264 // Construct the formatter first; if getString() fails
\r
1265 // we'll exit the try block
\r
1266 MessageFormat format = new MessageFormat(
\r
1267 bundle.getString(RB_DISPLAY_NAME_PATTERN));
\r
1268 // Construct the argument array
\r
1269 Object[] args = new Object[] { new Integer(2), stv[0], stv[1] };
\r
1271 // Use display names for the scripts, if they exist
\r
1272 for (int j=1; j<=2; ++j) {
\r
1274 args[j] = bundle.getString(RB_SCRIPT_DISPLAY_NAME_PREFIX +
\r
1275 (String) args[j]);
\r
1276 } catch (MissingResourceException e) {}
\r
1279 // Format it using the pattern in the resource
\r
1280 return (stv[2].length() > 0) ?
\r
1281 (format.format(args) + '/' + stv[2]) :
\r
1282 format.format(args);
\r
1283 } catch (MissingResourceException e2) {}
\r
1285 // We should not reach this point unless there is something
\r
1286 // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
\r
1287 // been deleted from the root RB_LOCALE_ELEMENTS resource.
\r
1288 throw new RuntimeException();
\r
1292 * Returns the filter used by this transliterator, or <tt>null</tt>
\r
1293 * if this transliterator uses no filter.
\r
1296 public final UnicodeFilter getFilter() {
\r
1301 * Changes the filter used by this transliterator. If the filter
\r
1302 * is set to <tt>null</tt> then no filtering will occur.
\r
1304 * <p>Callers must take care if a transliterator is in use by
\r
1305 * multiple threads. The filter should not be changed by one
\r
1306 * thread while another thread may be transliterating.
\r
1309 public void setFilter(UnicodeFilter filter) {
\r
1310 this.filter = filter;
\r
1314 * Returns a <code>Transliterator</code> object given its ID.
\r
1315 * The ID must be either a system transliterator ID or an ID registered
\r
1316 * using <code>registerClass()</code>.
\r
1318 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
\r
1319 * @return A <code>Transliterator</code> object with the given ID
\r
1320 * @exception IllegalArgumentException if the given ID is invalid.
\r
1323 public static final Transliterator getInstance(String ID) {
\r
1324 return getInstance(ID, FORWARD);
\r
1328 * Returns a <code>Transliterator</code> object given its ID.
\r
1329 * The ID must be either a system transliterator ID or an ID registered
\r
1330 * using <code>registerClass()</code>.
\r
1332 * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
\r
1333 * @param dir either FORWARD or REVERSE. If REVERSE then the
\r
1334 * inverse of the given ID is instantiated.
\r
1335 * @return A <code>Transliterator</code> object with the given ID
\r
1336 * @exception IllegalArgumentException if the given ID is invalid.
\r
1337 * @see #registerClass
\r
1338 * @see #getAvailableIDs
\r
1342 public static Transliterator getInstance(String ID,
\r
1344 StringBuffer canonID = new StringBuffer();
\r
1345 Vector<SingleID> list = new Vector<SingleID>();
\r
1346 UnicodeSet[] globalFilter = new UnicodeSet[1];
\r
1347 if (!TransliteratorIDParser.parseCompoundID(ID, dir, canonID, list, globalFilter)) {
\r
1348 throw new IllegalArgumentException("Invalid ID " + ID);
\r
1351 Vector<Transliterator> translits = TransliteratorIDParser.instantiateList(list);
\r
1353 // assert(list.size() > 0);
\r
1354 Transliterator t = null;
\r
1355 if (list.size() > 1 || canonID.indexOf(";") >= 0) {
\r
1356 // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
\r
1357 // has one child transliterator. This is so that toRules() will return the right thing
\r
1358 // (without any inactive ID), but our main ID still comes out correct. That is, if we
\r
1359 // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
\r
1360 // even though the ID is "(Lower);Latin-Greek;".]
\r
1361 t = new CompoundTransliterator(translits);
\r
1364 t = translits.elementAt(0);
\r
1367 t.setID(canonID.toString());
\r
1368 if (globalFilter[0] != null) {
\r
1369 t.setFilter(globalFilter[0]);
\r
1375 * Create a transliterator from a basic ID. This is an ID
\r
1376 * containing only the forward direction source, target, and
\r
1378 * @param id a basic ID of the form S-T or S-T/V.
\r
1379 * @param canonID canonical ID to apply to the result, or
\r
1380 * null to leave the ID unchanged
\r
1381 * @return a newly created Transliterator or null if the ID is
\r
1384 static Transliterator getBasicInstance(String id, String canonID) {
\r
1385 StringBuffer s = new StringBuffer();
\r
1386 Transliterator t = registry.get(id, s);
\r
1387 if (s.length() != 0) {
\r
1389 // Instantiate an alias
\r
1390 t = getInstance(s.toString(), FORWARD);
\r
1392 if (t != null && canonID != null) {
\r
1399 * Returns a <code>Transliterator</code> object constructed from
\r
1400 * the given rule string. This will be a RuleBasedTransliterator,
\r
1401 * if the rule string contains only rules, or a
\r
1402 * CompoundTransliterator, if it contains ID blocks, or a
\r
1403 * NullTransliterator, if it contains ID blocks which parse as
\r
1404 * empty for the given direction.
\r
1407 public static final Transliterator createFromRules(String ID, String rules, int dir) {
\r
1408 Transliterator t = null;
\r
1410 TransliteratorParser parser = new TransliteratorParser();
\r
1411 parser.parse(rules, dir);
\r
1413 // NOTE: The logic here matches that in TransliteratorRegistry.
\r
1414 if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) {
\r
1415 t = new NullTransliterator();
\r
1417 else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
\r
1418 t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), null);
\r
1420 else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
\r
1421 // idBlock, no data -- this is an alias. The ID has
\r
1422 // been munged from reverse into forward mode, if
\r
1423 // necessary, so instantiate the ID in the forward
\r
1425 if (parser.compoundFilter != null) {
\r
1426 t = getInstance(parser.compoundFilter.toPattern(false) + ";"
\r
1427 + parser.idBlockVector.get(0));
\r
1429 t = getInstance(parser.idBlockVector.get(0));
\r
1437 Vector<Transliterator> transliterators = new Vector<Transliterator>();
\r
1438 int passNumber = 1;
\r
1440 int limit = Math.max(parser.idBlockVector.size(), parser.dataVector.size());
\r
1441 for (int i = 0; i < limit; i++) {
\r
1442 if (i < parser.idBlockVector.size()) {
\r
1443 String idBlock = parser.idBlockVector.get(i);
\r
1444 if (idBlock.length() > 0) {
\r
1445 Transliterator temp = getInstance(idBlock);
\r
1446 if (!(temp instanceof NullTransliterator))
\r
1447 transliterators.add(getInstance(idBlock));
\r
1450 if (i < parser.dataVector.size()) {
\r
1451 Data data = parser.dataVector.get(i);
\r
1452 transliterators.add(new RuleBasedTransliterator("%Pass" + passNumber++, data, null));
\r
1456 t = new CompoundTransliterator(transliterators, passNumber - 1);
\r
1458 if (parser.compoundFilter != null) {
\r
1459 t.setFilter(parser.compoundFilter);
\r
1467 * Returns a rule string for this transliterator.
\r
1468 * @param escapeUnprintable if true, then unprintable characters
\r
1469 * will be converted to escape form backslash-'u' or
\r
1473 public String toRules(boolean escapeUnprintable) {
\r
1474 return baseToRules(escapeUnprintable);
\r
1478 * Returns a rule string for this transliterator. This is
\r
1479 * a non-overridable base class implementation that subclasses
\r
1480 * may call. It simply munges the ID into the correct format,
\r
1481 * that is, "foo" => "::foo".
\r
1482 * @param escapeUnprintable if true, then unprintable characters
\r
1483 * will be converted to escape form backslash-'u' or
\r
1487 protected final String baseToRules(boolean escapeUnprintable) {
\r
1488 // The base class implementation of toRules munges the ID into
\r
1489 // the correct format. That is: foo => ::foo
\r
1490 // KEEP in sync with rbt_pars
\r
1491 if (escapeUnprintable) {
\r
1492 StringBuffer rulesSource = new StringBuffer();
\r
1493 String id = getID();
\r
1494 for (int i=0; i<id.length();) {
\r
1495 int c = UTF16.charAt(id, i);
\r
1496 if (!Utility.escapeUnprintable(rulesSource, c)) {
\r
1497 UTF16.append(rulesSource, c);
\r
1499 i += UTF16.getCharCount(c);
\r
1501 rulesSource.insert(0, "::");
\r
1502 rulesSource.append(ID_DELIM);
\r
1503 return rulesSource.toString();
\r
1505 return "::" + getID() + ID_DELIM;
\r
1509 * Return the elements that make up this transliterator. For
\r
1510 * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
\r
1511 * were created, the return value of this method would be an array
\r
1512 * of the three transliterator objects that make up that
\r
1513 * transliterator: [NFD, Jamo-Latin, Latin-Greek].
\r
1515 * <p>If this transliterator is not composed of other
\r
1516 * transliterators, then this method will return an array of
\r
1517 * length one containing a reference to this transliterator.
\r
1518 * @return an array of one or more transliterators that make up
\r
1519 * this transliterator
\r
1522 public Transliterator[] getElements() {
\r
1523 Transliterator result[];
\r
1524 if (this instanceof CompoundTransliterator) {
\r
1525 CompoundTransliterator cpd = (CompoundTransliterator) this;
\r
1526 result = new Transliterator[cpd.getCount()];
\r
1527 for (int i=0; i<result.length; ++i) {
\r
1528 result[i] = cpd.getTransliterator(i);
\r
1531 result = new Transliterator[] { this };
\r
1537 * Returns the set of all characters that may be modified in the
\r
1538 * input text by this Transliterator. This incorporates this
\r
1539 * object's current filter; if the filter is changed, the return
\r
1540 * value of this function will change. The default implementation
\r
1541 * returns an empty set. Some subclasses may override {@link
\r
1542 * #handleGetSourceSet} to return a more precise result. The
\r
1543 * return result is approximate in any case and is intended for
\r
1544 * use by tests, tools, or utilities.
\r
1545 * @see #getTargetSet
\r
1546 * @see #handleGetSourceSet
\r
1549 public final UnicodeSet getSourceSet() {
\r
1550 UnicodeSet set = handleGetSourceSet();
\r
1551 if (filter != null) {
\r
1552 UnicodeSet filterSet;
\r
1553 // Most, but not all filters will be UnicodeSets. Optimize for
\r
1554 // the high-runner case.
\r
1556 filterSet = (UnicodeSet) filter;
\r
1557 } catch (ClassCastException e) {
\r
1558 filterSet = new UnicodeSet();
\r
1559 filter.addMatchSetTo(filterSet);
\r
1561 set.retainAll(filterSet);
\r
1567 * Framework method that returns the set of all characters that
\r
1568 * may be modified in the input text by this Transliterator,
\r
1569 * ignoring the effect of this object's filter. The base class
\r
1570 * implementation returns the empty set. Subclasses that wish to
\r
1571 * implement this should override this method.
\r
1572 * @return the set of characters that this transliterator may
\r
1573 * modify. The set may be modified, so subclasses should return a
\r
1574 * newly-created object.
\r
1575 * @see #getSourceSet
\r
1576 * @see #getTargetSet
\r
1579 protected UnicodeSet handleGetSourceSet() {
\r
1580 return new UnicodeSet();
\r
1584 * Returns the set of all characters that may be generated as
\r
1585 * replacement text by this transliterator. The default
\r
1586 * implementation returns the empty set. Some subclasses may
\r
1587 * override this method to return a more precise result. The
\r
1588 * return result is approximate in any case and is intended for
\r
1589 * use by tests, tools, or utilities requiring such
\r
1590 * meta-information.
\r
1591 * @see #getSourceSet
\r
1594 public UnicodeSet getTargetSet() {
\r
1595 return new UnicodeSet();
\r
1599 * Returns this transliterator's inverse. See the class
\r
1600 * documentation for details. This implementation simply inverts
\r
1601 * the two entities in the ID and attempts to retrieve the
\r
1602 * resulting transliterator. That is, if <code>getID()</code>
\r
1603 * returns "A-B", then this method will return the result of
\r
1604 * <code>getInstance("B-A")</code>, or <code>null</code> if that
\r
1607 * <p>Subclasses with knowledge of their inverse may wish to
\r
1608 * override this method.
\r
1610 * @return a transliterator that is an inverse, not necessarily
\r
1611 * exact, of this transliterator, or <code>null</code> if no such
\r
1612 * transliterator is registered.
\r
1613 * @see #registerClass
\r
1616 public final Transliterator getInverse() {
\r
1617 return getInstance(ID, REVERSE);
\r
1621 * Registers a subclass of <code>Transliterator</code> with the
\r
1622 * system. This subclass must have a public constructor taking no
\r
1623 * arguments. When that constructor is called, the resulting
\r
1624 * object must return the <code>ID</code> passed to this method if
\r
1625 * its <code>getID()</code> method is called.
\r
1627 * @param ID the result of <code>getID()</code> for this
\r
1629 * @param transClass a subclass of <code>Transliterator</code>
\r
1630 * @see #unregister
\r
1633 public static void registerClass(String ID, Class<? extends Transliterator> transClass, String displayName) {
\r
1634 registry.put(ID, transClass, true);
\r
1635 if (displayName != null) {
\r
1636 displayNameCache.put(new CaseInsensitiveString(ID), displayName);
\r
1641 * Register a factory object with the given ID. The factory
\r
1642 * method should return a new instance of the given transliterator.
\r
1643 * @param ID the ID of this transliterator
\r
1644 * @param factory the factory object
\r
1647 public static void registerFactory(String ID, Factory factory) {
\r
1648 registry.put(ID, factory, true);
\r
1652 * Register a Transliterator object under the ID returned by its <code>getID()</code>.
\r
1653 * @param trans the Transliterator object
\r
1656 public static void registerInstance(Transliterator trans) {
\r
1657 registry.put(trans.getID(), trans, true);
\r
1661 * Register a Transliterator object under the ID returned by its <code>getID()</code>.
\r
1662 * @param visible visibility flag handed to the registry together with the instance
\r
1663 * @param trans the Transliterator object
\r
1665 static void registerInstance(Transliterator trans, boolean visible) {
\r
1666 registry.put(trans.getID(), trans, visible);
\r
1670 * Register an ID as an alias of another ID. Instantiating
\r
1671 * alias ID produces the same result as instantiating the original ID.
\r
1672 * This is generally used to create short aliases of compound IDs.
\r
1673 * @param aliasID The new ID being registered.
\r
1674 * @param realID The existing ID that the new ID should be an alias of.
\r
1677 public static void registerAlias(String aliasID, String realID) {
\r
1678 registry.put(aliasID, realID, true);
\r
1682 * Register two targets as being inverses of one another. For
\r
1683 * example, calling registerSpecialInverse("NFC", "NFD", true) causes
\r
1684 * Transliterator to form the following inverse relationships:
\r
1687 * Any-NFC => Any-NFD
\r
1689 * Any-NFD => Any-NFC</pre>
\r
1691 * (Without the special inverse registration, the inverse of NFC
\r
1692 * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
\r
1693 * that the presence or absence of "Any-" is preserved.
\r
1695 * <p>The relationship is symmetrical; registering (a, b) is
\r
1696 * equivalent to registering (b, a).
\r
1698 * <p>The relevant IDs must still be registered separately as
\r
1699 * factories or classes.
\r
1701 * <p>Only the targets are specified. Special inverses always
\r
1702 * have the form Any-Target1 <=> Any-Target2. The target should
\r
1703 * have canonical casing (the casing desired to be produced when
\r
1704 * an inverse is formed) and should contain no whitespace or other
\r
1705 * extraneous characters.
\r
1707 * @param target the target against which to register the inverse
\r
1708 * @param inverseTarget the inverse of target, that is
\r
1709 * Any-target.getInverse() => Any-inverseTarget
\r
1710 * @param bidirectional if true, register the reverse relation
\r
1711 * as well, that is, Any-inverseTarget.getInverse() => Any-target
\r
1713 static void registerSpecialInverse(String target,
\r
1714 String inverseTarget,
\r
1715 boolean bidirectional) {
\r
1716 TransliteratorIDParser.registerSpecialInverse(target, inverseTarget, bidirectional);
\r
1720 * Unregisters a transliterator or class. This may be either
\r
1721 * a system transliterator or a user transliterator or class.
\r
1723 * @param ID the ID of the transliterator or class
\r
1724 * @see #registerClass
\r
1727 public static void unregister(String ID) {
\r
1728 displayNameCache.remove(new CaseInsensitiveString(ID));
\r
1729 registry.remove(ID);
\r
1733 * Returns an enumeration over the programmatic names of registered
\r
1734 * <code>Transliterator</code> objects. This includes both system
\r
1735 * transliterators and user transliterators registered using
\r
1736 * <code>registerClass()</code>. The enumerated names may be
\r
1737 * passed to <code>getInstance()</code>.
\r
1739 * @return An <code>Enumeration</code> over <code>String</code> objects
\r
1740 * @see #getInstance
\r
1741 * @see #registerClass
\r
1744 public static final Enumeration<String> getAvailableIDs() {
\r
1745 return registry.getAvailableIDs();
\r
1749 * Returns an enumeration over the source names of registered
\r
1750 * transliterators. Source names may be passed to
\r
1751 * getAvailableTargets() to obtain available targets for each
\r
1755 public static final Enumeration<String> getAvailableSources() {
\r
1756 return registry.getAvailableSources();
\r
1760 * Returns an enumeration over the target names of registered
\r
1761 * transliterators having a given source name. Target names may
\r
1762 * be passed to getAvailableVariants() to obtain available
\r
1763 * variants for each source and target pair.
\r
1766 public static final Enumeration<String> getAvailableTargets(String source) {
\r
1767 return registry.getAvailableTargets(source);
\r
1771 * Returns an enumeration over the variant names of registered
\r
1772 * transliterators having a given source name and target name.
\r
1775 public static final Enumeration<String> getAvailableVariants(String source,
\r
1777 return registry.getAvailableVariants(source, target);
\r
1779 private static final String INDEX = "index",
\r
1780 RB_RULE_BASED_IDS ="RuleBasedTransliteratorIDs";
\r
1782 registry = new TransliteratorRegistry();
\r
1784 // The display name cache starts out empty
\r
1785 displayNameCache = new Hashtable<CaseInsensitiveString, String>();
\r
1786 /* The following code parses the index table located in
\r
1787 * icu/data/translit/root.txt. The index is an n x 4 table
\r
1788 * that follows this format:
\r
1791 * resource{"<resource>"}
\r
1792 * direction{"<direction>"}
\r
1797 * resource{"<resource>"}
\r
1798 * direction{"<direction"}
\r
1802 * alias{"<getInstanceArg"}
\r
1804 * <id> is the ID of the system transliterator being defined. These
\r
1805 * are public IDs enumerated by Transliterator.getAvailableIDs(),
\r
1806 * unless the second field is "internal".
\r
1808 * <resource> is a ResourceReader resource name. Currently these refer
\r
1809 * to file names under com/ibm/text/resources. This string is passed
\r
1810 * directly to ResourceReader, together with <encoding>.
\r
1812 * <direction> is either "FORWARD" or "REVERSE".
\r
1814 * <getInstanceArg> is a string to be passed directly to
\r
1815 * Transliterator.getInstance(). The returned Transliterator object
\r
1816 * then has its ID changed to <id> and is returned.
\r
1818 * The extra blank field on "alias" lines is to make the array square.
\r
1820 UResourceBundle bundle, transIDs, colBund;
\r
1821 bundle = UResourceBundle.getBundleInstance(ICUResourceBundle.ICU_TRANSLIT_BASE_NAME, INDEX);
\r
1822 transIDs = bundle.get(RB_RULE_BASED_IDS);
\r
1825 maxRows = transIDs.getSize();
\r
1826 for (row = 0; row < maxRows; row++) {
\r
1827 colBund = transIDs.get(row);
\r
1828 String ID = colBund.getKey();
\r
1829 UResourceBundle res = colBund.get(0);
\r
1830 String type = res.getKey();
\r
1831 if (type.equals("file") || type.equals("internal")) {
\r
1832 // Rest of line is <resource>:<encoding>:<direction>
\r
1834 String resString = res.getString("resource");
\r
1836 String direction = res.getString("direction");
\r
1837 switch (direction.charAt(0)) {
\r
1845 throw new RuntimeException("Can't parse direction: " + direction);
\r
1848 resString, // resource
\r
1849 "UTF-16", // encoding
\r
1851 !type.equals("internal"));
\r
1852 } else if (type.equals("alias")) {
\r
1853 //'alias'; row[2]=createInstance argument
\r
1854 String resString = res.getString();
\r
1855 registry.put(ID, resString, true);
\r
1858 throw new RuntimeException("Unknow type: " + type);
\r
1862 registerSpecialInverse(NullTransliterator.SHORT_ID, NullTransliterator.SHORT_ID, false);
\r
1864 // Register non-rule-based transliterators
\r
1865 registerClass(NullTransliterator._ID,
\r
1866 NullTransliterator.class, null);
\r
1867 RemoveTransliterator.register();
\r
1868 EscapeTransliterator.register();
\r
1869 UnescapeTransliterator.register();
\r
1870 LowercaseTransliterator.register();
\r
1871 UppercaseTransliterator.register();
\r
1872 TitlecaseTransliterator.register();
\r
1873 CaseFoldTransliterator.register();
\r
1874 UnicodeNameTransliterator.register();
\r
1875 NameUnicodeTransliterator.register();
\r
1876 NormalizationTransliterator.register();
\r
1877 BreakTransliterator.register();
\r
1878 AnyTransliterator.register(); // do this last!
\r
1882 * Register the script-based "Any" transliterators: Any-Latin, Any-Greek
\r
1884 * @deprecated This API is ICU internal only.
\r
1886 public static void registerAny() {
\r
1887 AnyTransliterator.register();
\r
/**
 * The factory interface for transliterators.  Transliterator
 * subclasses can register factory objects for IDs using the
 * registerFactory() method of Transliterator.  When invoked, the
 * factory object will be passed the ID being instantiated.  This
 * makes it possible to register one factory method to more than
 * one ID, or for a factory method to parameterize its result
 * based on the variant.
 */
public static interface Factory {
    /**
     * Return a transliterator for the given ID.
     *
     * @param ID the ID being instantiated
     * @return the transliterator produced for <code>ID</code>
     */
    Transliterator getInstance(String ID);
}
\r
1909 * Implements StringTransform via this method.
\r
1910 * @param source text to be transformed (eg lowercased)
\r
1914 public String transform(String source) {
\r
1915 return transliterate(source);
\r