2 *******************************************************************************
3 * Copyright (C) 1996-2010, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.text;
9 import java.util.ArrayList;
10 import java.util.List;
12 import com.ibm.icu.impl.UtilityExtensions;
15 * A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes
16 * the transliteration in one direction from one set of characters or short
17 * strings to another. A <code>RuleBasedTransliterator</code> consists of up to
18 * two such sets, one for the forward direction, and one for the reverse.
20 * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
21 * finding a matching rule at a given point in the text. This is accomplished
22 * by the <code>findMatch()</code> method.
24 * <p>Copyright © IBM Corporation 1999. All rights reserved.
28 class TransliterationRuleSet {
30 * Vector of rules, in the order added.
32 private List<TransliterationRule> ruleVector;
35 * Length of the longest preceding context
37 private int maxContextLength;
40 * Sorted and indexed table of rules. This is created by freeze() from
41 * the rules in ruleVector. rules.length >= ruleVector.size(), and the
42 * references in rules[] are aliases of the references in ruleVector.
43 * A single rule in ruleVector is listed one or more times in rules[].
45 private TransliterationRule[] rules;
48 * Index table. For text having a first character c, compute x = c&0xFF.
49 * Now use rules[index[x]..index[x+1]-1]. This index table is created by
55 * Construct a new empty rule set.
57 public TransliterationRuleSet() {
58 ruleVector = new ArrayList<TransliterationRule>();
63 * Return the maximum context length.
64 * @return the length of the longest preceding context.
66 public int getMaximumContextLength() {
67 return maxContextLength;
71 * Add a rule to this set. Rules are added in order, and order is
73 * @param rule the rule to add
75 public void addRule(TransliterationRule rule) {
78 if ((len = rule.getAnteContextLength()) > maxContextLength) {
79 maxContextLength = len;
86 * Close this rule set to further additions, check it for masked rules,
87 * and index it to optimize performance.
88 * @exception IllegalArgumentException if some rules are masked
90 public void freeze() {
91 /* Construct the rule array and index table. We reorder the
92 * rules by sorting them into 256 bins. Each bin contains all
93 * rules matching the index value for that bin. A rule
94 * matches an index value if string whose first key character
95 * has a low byte equal to the index value can match the rule.
97 * Each bin contains zero or more rules, in the same order
98 * they were found originally. However, the total rules in
99 * the bins may exceed the number in the original vector,
100 * since rules that have a variable as their first key
101 * character will generally fall into more than one bin.
103 * That is, each bin contains all rules that either have that
104 * first index value as their first key character, or have
105 * a set containing the index value as their first character.
107 int n = ruleVector.size();
108 index = new int[257]; // [sic]
109 List<TransliterationRule> v = new ArrayList<TransliterationRule>(2*n); // heuristic; adjust as needed
111 /* Precompute the index values. This saves a LOT of time.
113 int[] indexValue = new int[n];
114 for (int j=0; j<n; ++j) {
115 TransliterationRule r = ruleVector.get(j);
116 indexValue[j] = r.getIndexValue();
118 for (int x=0; x<256; ++x) {
120 for (int j=0; j<n; ++j) {
121 if (indexValue[j] >= 0) {
122 if (indexValue[j] == x) {
123 v.add(ruleVector.get(j));
126 // If the indexValue is < 0, then the first key character is
127 // a set, and we must use the more time-consuming
128 // matchesIndexValue check. In practice this happens
129 // rarely, so we seldom tread this code path.
130 TransliterationRule r = ruleVector.get(j);
131 if (r.matchesIndexValue(x)) {
137 index[256] = v.size();
139 /* Freeze things into an array.
141 rules = new TransliterationRule[v.size()];
144 StringBuilder errors = null;
146 /* Check for masking. This is MUCH faster than our old check,
147 * which was each rule against each following rule, since we
148 * only have to check for masking within each bin now. It's
149 * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
150 * count, and n2 is the per-bin rule count. But n2<<n1, so
153 for (int x=0; x<256; ++x) {
154 for (int j=index[x]; j<index[x+1]-1; ++j) {
155 TransliterationRule r1 = rules[j];
156 for (int k=j+1; k<index[x+1]; ++k) {
157 TransliterationRule r2 = rules[k];
159 if (errors == null) {
160 errors = new StringBuilder();
164 errors.append("Rule " + r1 + " masks " + r2);
170 if (errors != null) {
171 throw new IllegalArgumentException(errors.toString());
176 * Transliterate the given text with the given UTransPosition
177 * indices. Return TRUE if the transliteration should continue
178 * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
179 * Note that FALSE is only ever returned if isIncremental is TRUE.
180 * @param text the text to be transliterated
181 * @param pos the position indices, which will be updated
182 * @param incremental if TRUE, assume new text may be inserted
183 * at index.limit, and return FALSE if thre is a partial match.
184 * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
185 * indicating that transliteration should stop until more text
188 public boolean transliterate(Replaceable text,
189 Transliterator.Position pos,
190 boolean incremental) {
191 int indexByte = text.char32At(pos.start) & 0xFF;
192 for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
193 int m = rules[i].matchAndReplace(text, pos, incremental);
195 case UnicodeMatcher.U_MATCH:
196 if (Transliterator.DEBUG) {
197 System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
198 rules[i].toRule(true) + " => " +
199 UtilityExtensions.formatInput(text, pos));
202 case UnicodeMatcher.U_PARTIAL_MATCH:
203 if (Transliterator.DEBUG) {
204 System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
205 rules[i].toRule(true) + " => " +
206 UtilityExtensions.formatInput(text, pos));
210 if (Transliterator.DEBUG) {
211 System.out.println("Rule: no match " + rules[i]);
215 // No match or partial match from any rule
216 pos.start += UTF16.getCharCount(text.char32At(pos.start));
217 if (Transliterator.DEBUG) {
218 System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
219 UtilityExtensions.formatInput(text, pos));
225 * Create rule strings that represents this rule set.
227 String toRules(boolean escapeUnprintable) {
229 int count = ruleVector.size();
230 StringBuilder ruleSource = new StringBuilder();
231 for (i=0; i<count; ++i) {
233 ruleSource.append('\n');
235 TransliterationRule r = ruleVector.get(i);
236 ruleSource.append(r.toRule(escapeUnprintable));
238 return ruleSource.toString();
241 // TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
242 // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
243 void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
244 UnicodeSet currentFilter = new UnicodeSet(filter);
245 UnicodeSet revisiting = new UnicodeSet();
246 int count = ruleVector.size();
247 for (int i=0; i<count; ++i) {
248 TransliterationRule r = ruleVector.get(i);
249 r.addSourceTargetSet(currentFilter, sourceSet, targetSet, revisiting.clear());
250 currentFilter.addAll(revisiting);