2 *******************************************************************************
\r
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.util.Vector;
\r
11 import com.ibm.icu.impl.UtilityExtensions;
\r
14 * A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
\r
15 * the transliteration in one direction from one set of characters or short
\r
16 * strings to another. A <code>RuleBasedTransliterator</code> consists of up to
\r
17 * two such sets, one for the forward direction, and one for the reverse.
\r
19 * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
\r
20 * finding a matching rule at a given point in the text. This is accomplished
\r
21 * by the <code>findMatch()</code> method.
\r
23 * <p>Copyright © IBM Corporation 1999. All rights reserved.
\r
27 class TransliterationRuleSet {
\r
29 * Vector of rules, in the order added.
\r
31 private Vector<TransliterationRule> ruleVector;
\r
34 * Length of the longest preceding context
\r
36 private int maxContextLength;
\r
39 * Sorted and indexed table of rules. This is created by freeze() from
\r
40 * the rules in ruleVector. rules.length >= ruleVector.size(), and the
\r
41 * references in rules[] are aliases of the references in ruleVector.
\r
42 * A single rule in ruleVector is listed one or more times in rules[].
\r
44 private TransliterationRule[] rules;
\r
47 * Index table. For text having a first character c, compute x = c&0xFF.
\r
48 * Now use rules[index[x]..index[x+1]-1]. This index table is created by
\r
51 private int[] index;
\r
54 * Construct a new empty rule set.
\r
56 public TransliterationRuleSet() {
\r
57 ruleVector = new Vector<TransliterationRule>();
\r
58 maxContextLength = 0;
\r
62 * Return the maximum context length.
\r
63 * @return the length of the longest preceding context.
\r
65 public int getMaximumContextLength() {
\r
66 return maxContextLength;
\r
70 * Add a rule to this set. Rules are added in order, and order is
\r
72 * @param rule the rule to add
\r
74 public void addRule(TransliterationRule rule) {
\r
75 ruleVector.addElement(rule);
\r
77 if ((len = rule.getAnteContextLength()) > maxContextLength) {
\r
78 maxContextLength = len;
\r
85 * Close this rule set to further additions, check it for masked rules,
\r
86 * and index it to optimize performance.
\r
87 * @exception IllegalArgumentException if some rules are masked
\r
89 public void freeze() {
\r
90 /* Construct the rule array and index table. We reorder the
\r
91 * rules by sorting them into 256 bins. Each bin contains all
\r
92 * rules matching the index value for that bin. A rule
\r
93 * matches an index value if string whose first key character
\r
94 * has a low byte equal to the index value can match the rule.
\r
96 * Each bin contains zero or more rules, in the same order
\r
97 * they were found originally. However, the total rules in
\r
98 * the bins may exceed the number in the original vector,
\r
99 * since rules that have a variable as their first key
\r
100 * character will generally fall into more than one bin.
\r
102 * That is, each bin contains all rules that either have that
\r
103 * first index value as their first key character, or have
\r
104 * a set containing the index value as their first character.
\r
106 int n = ruleVector.size();
\r
107 index = new int[257]; // [sic]
\r
108 Vector<TransliterationRule> v = new Vector<TransliterationRule>(2*n); // heuristic; adjust as needed
\r
110 /* Precompute the index values. This saves a LOT of time.
\r
112 int[] indexValue = new int[n];
\r
113 for (int j=0; j<n; ++j) {
\r
114 TransliterationRule r = ruleVector.elementAt(j);
\r
115 indexValue[j] = r.getIndexValue();
\r
117 for (int x=0; x<256; ++x) {
\r
118 index[x] = v.size();
\r
119 for (int j=0; j<n; ++j) {
\r
120 if (indexValue[j] >= 0) {
\r
121 if (indexValue[j] == x) {
\r
122 v.addElement(ruleVector.elementAt(j));
\r
125 // If the indexValue is < 0, then the first key character is
\r
126 // a set, and we must use the more time-consuming
\r
127 // matchesIndexValue check. In practice this happens
\r
128 // rarely, so we seldom tread this code path.
\r
129 TransliterationRule r = ruleVector.elementAt(j);
\r
130 if (r.matchesIndexValue(x)) {
\r
136 index[256] = v.size();
\r
138 /* Freeze things into an array.
\r
140 rules = new TransliterationRule[v.size()];
\r
143 StringBuilder errors = null;
\r
145 /* Check for masking. This is MUCH faster than our old check,
\r
146 * which was each rule against each following rule, since we
\r
147 * only have to check for masking within each bin now. It's
\r
148 * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
\r
149 * count, and n2 is the per-bin rule count. But n2<<n1, so
\r
152 for (int x=0; x<256; ++x) {
\r
153 for (int j=index[x]; j<index[x+1]-1; ++j) {
\r
154 TransliterationRule r1 = rules[j];
\r
155 for (int k=j+1; k<index[x+1]; ++k) {
\r
156 TransliterationRule r2 = rules[k];
\r
157 if (r1.masks(r2)) {
\r
158 if (errors == null) {
\r
159 errors = new StringBuilder();
\r
161 errors.append("\n");
\r
163 errors.append("Rule " + r1 + " masks " + r2);
\r
169 if (errors != null) {
\r
170 throw new IllegalArgumentException(errors.toString());
\r
175 * Transliterate the given text with the given UTransPosition
\r
176 * indices. Return TRUE if the transliteration should continue
\r
177 * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
\r
178 * Note that FALSE is only ever returned if isIncremental is TRUE.
\r
179 * @param text the text to be transliterated
\r
180 * @param pos the position indices, which will be updated
\r
181 * @param incremental if TRUE, assume new text may be inserted
\r
182 * at index.limit, and return FALSE if thre is a partial match.
\r
183 * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
\r
184 * indicating that transliteration should stop until more text
\r
187 public boolean transliterate(Replaceable text,
\r
188 Transliterator.Position pos,
\r
189 boolean incremental) {
\r
190 int indexByte = text.char32At(pos.start) & 0xFF;
\r
191 for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
\r
192 int m = rules[i].matchAndReplace(text, pos, incremental);
\r
194 case UnicodeMatcher.U_MATCH:
\r
195 if (Transliterator.DEBUG) {
\r
196 System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
\r
197 rules[i].toRule(true) + " => " +
\r
198 UtilityExtensions.formatInput(text, pos));
\r
201 case UnicodeMatcher.U_PARTIAL_MATCH:
\r
202 if (Transliterator.DEBUG) {
\r
203 System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
\r
204 rules[i].toRule(true) + " => " +
\r
205 UtilityExtensions.formatInput(text, pos));
\r
209 if (Transliterator.DEBUG) {
\r
210 System.out.println("Rule: no match " + rules[i]);
\r
214 // No match or partial match from any rule
\r
215 pos.start += UTF16.getCharCount(text.char32At(pos.start));
\r
216 if (Transliterator.DEBUG) {
\r
217 System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
\r
218 UtilityExtensions.formatInput(text, pos));
\r
224 * Create rule strings that represents this rule set.
\r
226 String toRules(boolean escapeUnprintable) {
\r
228 int count = ruleVector.size();
\r
229 StringBuilder ruleSource = new StringBuilder();
\r
230 for (i=0; i<count; ++i) {
\r
232 ruleSource.append('\n');
\r
234 TransliterationRule r = ruleVector.elementAt(i);
\r
235 ruleSource.append(r.toRule(escapeUnprintable));
\r
237 return ruleSource.toString();
\r
241 * Return the set of all characters that may be modified (getTarget=false)
\r
242 * or emitted (getTarget=true) by this set.
\r
244 UnicodeSet getSourceTargetSet(boolean getTarget) {
\r
245 UnicodeSet set = new UnicodeSet();
\r
246 int count = ruleVector.size();
\r
247 for (int i=0; i<count; ++i) {
\r
248 TransliterationRule r = ruleVector.elementAt(i);
\r
250 r.addTargetSetTo(set);
\r
252 r.addSourceSetTo(set);
\r