2 *******************************************************************************
\r
3 * Copyright (C) 1996-2007, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
10 import com.ibm.icu.impl.UtilityExtensions;
\r
13 * A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
\r
14 * the transliteration in one direction from one set of characters or short
\r
15 * strings to another. A <code>RuleBasedTransliterator</code> consists of up to
\r
16 * two such sets, one for the forward direction, and one for the reverse.
\r
18 * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
\r
19 * finding a matching rule at a given point in the text. This is accomplished
\r
20 * by the <code>findMatch()</code> method.
\r
22 * <p>Copyright © IBM Corporation 1999. All rights reserved.
\r
26 class TransliterationRuleSet {
\r
28 * Vector of rules, in the order added.
\r
30 private Vector ruleVector;
\r
33 * Length of the longest preceding context
\r
35 private int maxContextLength;
\r
38 * Sorted and indexed table of rules. This is created by freeze() from
\r
39 * the rules in ruleVector. rules.length >= ruleVector.size(), and the
\r
40 * references in rules[] are aliases of the references in ruleVector.
\r
41 * A single rule in ruleVector is listed one or more times in rules[].
\r
43 private TransliterationRule[] rules;
\r
46 * Index table. For text having a first character c, compute x = c&0xFF.
\r
47 * Now use rules[index[x]..index[x+1]-1]. This index table is created by
\r
50 private int[] index;
\r
53 * Construct a new empty rule set.
\r
55 public TransliterationRuleSet() {
\r
56 ruleVector = new Vector();
\r
57 maxContextLength = 0;
\r
61 * Return the maximum context length.
\r
62 * @return the length of the longest preceding context.
\r
64 public int getMaximumContextLength() {
\r
65 return maxContextLength;
\r
69 * Add a rule to this set. Rules are added in order, and order is
\r
71 * @param rule the rule to add
\r
73 public void addRule(TransliterationRule rule) {
\r
74 ruleVector.addElement(rule);
\r
76 if ((len = rule.getAnteContextLength()) > maxContextLength) {
\r
77 maxContextLength = len;
\r
84 * Close this rule set to further additions, check it for masked rules,
\r
85 * and index it to optimize performance.
\r
86 * @exception IllegalArgumentException if some rules are masked
\r
88 public void freeze() {
\r
89 /* Construct the rule array and index table. We reorder the
\r
90 * rules by sorting them into 256 bins. Each bin contains all
\r
91 * rules matching the index value for that bin. A rule
\r
92 * matches an index value if string whose first key character
\r
93 * has a low byte equal to the index value can match the rule.
\r
95 * Each bin contains zero or more rules, in the same order
\r
96 * they were found originally. However, the total rules in
\r
97 * the bins may exceed the number in the original vector,
\r
98 * since rules that have a variable as their first key
\r
99 * character will generally fall into more than one bin.
\r
101 * That is, each bin contains all rules that either have that
\r
102 * first index value as their first key character, or have
\r
103 * a set containing the index value as their first character.
\r
105 int n = ruleVector.size();
\r
106 index = new int[257]; // [sic]
\r
107 Vector v = new Vector(2*n); // heuristic; adjust as needed
\r
109 /* Precompute the index values. This saves a LOT of time.
\r
111 int[] indexValue = new int[n];
\r
112 for (int j=0; j<n; ++j) {
\r
113 TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
\r
114 indexValue[j] = r.getIndexValue();
\r
116 for (int x=0; x<256; ++x) {
\r
117 index[x] = v.size();
\r
118 for (int j=0; j<n; ++j) {
\r
119 if (indexValue[j] >= 0) {
\r
120 if (indexValue[j] == x) {
\r
121 v.addElement(ruleVector.elementAt(j));
\r
124 // If the indexValue is < 0, then the first key character is
\r
125 // a set, and we must use the more time-consuming
\r
126 // matchesIndexValue check. In practice this happens
\r
127 // rarely, so we seldom tread this code path.
\r
128 TransliterationRule r = (TransliterationRule) ruleVector.elementAt(j);
\r
129 if (r.matchesIndexValue(x)) {
\r
135 index[256] = v.size();
\r
137 /* Freeze things into an array.
\r
139 rules = new TransliterationRule[v.size()];
\r
142 StringBuffer errors = null;
\r
144 /* Check for masking. This is MUCH faster than our old check,
\r
145 * which was each rule against each following rule, since we
\r
146 * only have to check for masking within each bin now. It's
\r
147 * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
\r
148 * count, and n2 is the per-bin rule count. But n2<<n1, so
\r
151 for (int x=0; x<256; ++x) {
\r
152 for (int j=index[x]; j<index[x+1]-1; ++j) {
\r
153 TransliterationRule r1 = rules[j];
\r
154 for (int k=j+1; k<index[x+1]; ++k) {
\r
155 TransliterationRule r2 = rules[k];
\r
156 if (r1.masks(r2)) {
\r
157 if (errors == null) {
\r
158 errors = new StringBuffer();
\r
160 errors.append("\n");
\r
162 errors.append("Rule " + r1 + " masks " + r2);
\r
168 if (errors != null) {
\r
169 throw new IllegalArgumentException(errors.toString());
\r
174 * Transliterate the given text with the given UTransPosition
\r
175 * indices. Return TRUE if the transliteration should continue
\r
176 * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
\r
177 * Note that FALSE is only ever returned if isIncremental is TRUE.
\r
178 * @param text the text to be transliterated
\r
179 * @param pos the position indices, which will be updated
\r
180 * @param incremental if TRUE, assume new text may be inserted
\r
181 * at index.limit, and return FALSE if thre is a partial match.
\r
182 * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
\r
183 * indicating that transliteration should stop until more text
\r
186 public boolean transliterate(Replaceable text,
\r
187 Transliterator.Position pos,
\r
188 boolean incremental) {
\r
189 int indexByte = text.char32At(pos.start) & 0xFF;
\r
190 for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
\r
191 int m = rules[i].matchAndReplace(text, pos, incremental);
\r
193 case UnicodeMatcher.U_MATCH:
\r
194 if (Transliterator.DEBUG) {
\r
195 System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
\r
196 rules[i].toRule(true) + " => " +
\r
197 UtilityExtensions.formatInput(text, pos));
\r
200 case UnicodeMatcher.U_PARTIAL_MATCH:
\r
201 if (Transliterator.DEBUG) {
\r
202 System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
\r
203 rules[i].toRule(true) + " => " +
\r
204 UtilityExtensions.formatInput(text, pos));
\r
208 if (Transliterator.DEBUG) {
\r
209 System.out.println("Rule: no match " + rules[i]);
\r
213 // No match or partial match from any rule
\r
214 pos.start += UTF16.getCharCount(text.char32At(pos.start));
\r
215 if (Transliterator.DEBUG) {
\r
216 System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
\r
217 UtilityExtensions.formatInput(text, pos));
\r
223 * Create rule strings that represents this rule set.
\r
225 String toRules(boolean escapeUnprintable) {
\r
227 int count = ruleVector.size();
\r
228 StringBuffer ruleSource = new StringBuffer();
\r
229 for (i=0; i<count; ++i) {
\r
231 ruleSource.append('\n');
\r
233 TransliterationRule r =
\r
234 (TransliterationRule) ruleVector.elementAt(i);
\r
235 ruleSource.append(r.toRule(escapeUnprintable));
\r
237 return ruleSource.toString();
\r
241 * Return the set of all characters that may be modified (getTarget=false)
\r
242 * or emitted (getTarget=true) by this set.
\r
244 UnicodeSet getSourceTargetSet(boolean getTarget) {
\r
245 UnicodeSet set = new UnicodeSet();
\r
246 int count = ruleVector.size();
\r
247 for (int i=0; i<count; ++i) {
\r
248 TransliterationRule r =
\r
249 (TransliterationRule) ruleVector.elementAt(i);
\r
251 r.addTargetSetTo(set);
\r
253 r.addSourceSetTo(set);
\r