jars/icu4j-52_1/main/classes/translit/src/com/ibm/icu/text/TransliterationRuleSet.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 1996-2010, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7 package com.ibm.icu.text;
   8
   9 import java.util.ArrayList;
  10 import java.util.List;
  11
  12 import com.ibm.icu.impl.UtilityExtensions;
  13
  14 /**
  15  * A set of rules for a <code>RuleBasedTransliterator</code>.  This set encodes
  16  * the transliteration in one direction from one set of characters or short
  17  * strings to another.  A <code>RuleBasedTransliterator</code> consists of up to
  18  * two such sets, one for the forward direction, and one for the reverse.
  19  *
  20  * <p>A <code>TransliterationRuleSet</code> has one important operation, that of
  21  * finding a matching rule at a given point in the text.  This is accomplished
  22  * by the <code>findMatch()</code> method.
  23  *
  24  * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
  25  *
  26  * @author Alan Liu
  27  */
  28 class TransliterationRuleSet {
  29     /**
  30      * Vector of rules, in the order added.
  31      */
  32     private List<TransliterationRule> ruleVector;
  33
  34     /**
  35      * Length of the longest preceding context
  36      */
  37     private int maxContextLength;
  38
  39     /**
  40      * Sorted and indexed table of rules.  This is created by freeze() from
  41      * the rules in ruleVector.  rules.length >= ruleVector.size(), and the
  42      * references in rules[] are aliases of the references in ruleVector.
  43      * A single rule in ruleVector is listed one or more times in rules[].
  44      */
  45     private TransliterationRule[] rules;
  46
  47     /**
  48      * Index table.  For text having a first character c, compute x = c&0xFF.
  49      * Now use rules[index[x]..index[x+1]-1].  This index table is created by
  50      * freeze().
  51      */
  52     private int[] index;
  53
  54     /**
  55      * Construct a new empty rule set.
  56      */
  57     public TransliterationRuleSet() {
  58         ruleVector = new ArrayList<TransliterationRule>();
  59         maxContextLength = 0;
  60     }
  61
  62     /**
  63      * Return the maximum context length.
  64      * @return the length of the longest preceding context.
  65      */
  66     public int getMaximumContextLength() {
  67         return maxContextLength;
  68     }
  69
  70     /**
  71      * Add a rule to this set.  Rules are added in order, and order is
  72      * significant.
  73      * @param rule the rule to add
  74      */
  75     public void addRule(TransliterationRule rule) {
  76         ruleVector.add(rule);
  77         int len;
  78         if ((len = rule.getAnteContextLength()) > maxContextLength) {
  79             maxContextLength = len;
  80         }
  81
  82         rules = null;
  83     }
  84
  85     /**
  86      * Close this rule set to further additions, check it for masked rules,
  87      * and index it to optimize performance.
  88      * @exception IllegalArgumentException if some rules are masked
  89      */
  90     public void freeze() {
  91         /* Construct the rule array and index table.  We reorder the
  92          * rules by sorting them into 256 bins.  Each bin contains all
  93          * rules matching the index value for that bin.  A rule
  94          * matches an index value if string whose first key character
  95          * has a low byte equal to the index value can match the rule.
  96          *
  97          * Each bin contains zero or more rules, in the same order
  98          * they were found originally.  However, the total rules in
  99          * the bins may exceed the number in the original vector,
 100          * since rules that have a variable as their first key
 101          * character will generally fall into more than one bin.
 102          *
 103          * That is, each bin contains all rules that either have that
 104          * first index value as their first key character, or have
 105          * a set containing the index value as their first character.
 106          */
 107         int n = ruleVector.size();
 108         index = new int[257]; // [sic]
 109         List<TransliterationRule> v = new ArrayList<TransliterationRule>(2*n); // heuristic; adjust as needed
 110
 111         /* Precompute the index values.  This saves a LOT of time.
 112          */
 113         int[] indexValue = new int[n];
 114         for (int j=0; j<n; ++j) {
 115             TransliterationRule r = ruleVector.get(j);
 116             indexValue[j] = r.getIndexValue();
 117         }
 118         for (int x=0; x<256; ++x) {
 119             index[x] = v.size();
 120             for (int j=0; j<n; ++j) {
 121                 if (indexValue[j] >= 0) {
 122                     if (indexValue[j] == x) {
 123                         v.add(ruleVector.get(j));
 124                     }
 125                 } else {
 126                     // If the indexValue is < 0, then the first key character is
 127                     // a set, and we must use the more time-consuming
 128                     // matchesIndexValue check.  In practice this happens
 129                     // rarely, so we seldom tread this code path.
 130                     TransliterationRule r = ruleVector.get(j);
 131                     if (r.matchesIndexValue(x)) {
 132                         v.add(r);
 133                     }
 134                 }
 135             }
 136         }
 137         index[256] = v.size();
 138
 139         /* Freeze things into an array.
 140          */
 141         rules = new TransliterationRule[v.size()];
 142         v.toArray(rules);
 143
 144         StringBuilder errors = null;
 145
 146         /* Check for masking.  This is MUCH faster than our old check,
 147          * which was each rule against each following rule, since we
 148          * only have to check for masking within each bin now.  It's
 149          * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
 150          * count, and n2 is the per-bin rule count.  But n2<<n1, so
 151          * it's a big win.
 152          */
 153         for (int x=0; x<256; ++x) {
 154             for (int j=index[x]; j<index[x+1]-1; ++j) {
 155                 TransliterationRule r1 = rules[j];
 156                 for (int k=j+1; k<index[x+1]; ++k) {
 157                     TransliterationRule r2 = rules[k];
 158                     if (r1.masks(r2)) {
 159                         if (errors == null) {
 160                             errors = new StringBuilder();
 161                         } else {
 162                             errors.append("\n");
 163                         }
 164                         errors.append("Rule " + r1 + " masks " + r2);
 165                     }
 166                 }
 167             }
 168         }
 169
 170         if (errors != null) {
 171             throw new IllegalArgumentException(errors.toString());
 172         }
 173     }
 174
 175     /**
 176      * Transliterate the given text with the given UTransPosition
 177      * indices.  Return TRUE if the transliteration should continue
 178      * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
 179      * Note that FALSE is only ever returned if isIncremental is TRUE.
 180      * @param text the text to be transliterated
 181      * @param pos the position indices, which will be updated
 182      * @param incremental if TRUE, assume new text may be inserted
 183      * at index.limit, and return FALSE if thre is a partial match.
 184      * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
 185      * indicating that transliteration should stop until more text
 186      * arrives.
 187      */
 188     public boolean transliterate(Replaceable text,
 189                                  Transliterator.Position pos,
 190                                  boolean incremental) {
 191         int indexByte = text.char32At(pos.start) & 0xFF;
 192         for (int i=index[indexByte]; i<index[indexByte+1]; ++i) {
 193             int m = rules[i].matchAndReplace(text, pos, incremental);
 194             switch (m) {
 195             case UnicodeMatcher.U_MATCH:
 196                 if (Transliterator.DEBUG) {
 197                     System.out.println((incremental ? "Rule.i: match ":"Rule: match ") +
 198                                        rules[i].toRule(true) + " => " +
 199                                        UtilityExtensions.formatInput(text, pos));
 200                 }
 201                 return true;
 202             case UnicodeMatcher.U_PARTIAL_MATCH:
 203                 if (Transliterator.DEBUG) {
 204                     System.out.println((incremental ? "Rule.i: partial match ":"Rule: partial match ") +
 205                                        rules[i].toRule(true) + " => " +
 206                                        UtilityExtensions.formatInput(text, pos));
 207                 }
 208                 return false;
 209                 default:
 210                     if (Transliterator.DEBUG) {
 211                         System.out.println("Rule: no match " + rules[i]);
 212                     }
 213             }
 214         }
 215         // No match or partial match from any rule
 216         pos.start += UTF16.getCharCount(text.char32At(pos.start));
 217         if (Transliterator.DEBUG) {
 218             System.out.println((incremental ? "Rule.i: no match => ":"Rule: no match => ") +
 219                                UtilityExtensions.formatInput(text, pos));
 220         }
 221         return true;
 222     }
 223
 224     /**
 225      * Create rule strings that represents this rule set.
 226      */
 227     String toRules(boolean escapeUnprintable) {
 228         int i;
 229         int count = ruleVector.size();
 230         StringBuilder ruleSource = new StringBuilder();
 231         for (i=0; i<count; ++i) {
 232             if (i != 0) {
 233                 ruleSource.append('\n');
 234             }
 235             TransliterationRule r = ruleVector.get(i);
 236             ruleSource.append(r.toRule(escapeUnprintable));
 237         }
 238         return ruleSource.toString();
 239     }
 240
 241     // TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
 242     // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
 243     void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
 244         UnicodeSet currentFilter = new UnicodeSet(filter);
 245         UnicodeSet revisiting = new UnicodeSet();
 246         int count = ruleVector.size();
 247         for (int i=0; i<count; ++i) {
 248             TransliterationRule r = ruleVector.get(i);
 249             r.addSourceTargetSet(currentFilter, sourceSet, targetSet, revisiting.clear());
 250             currentFilter.addAll(revisiting);
 251         }
 252     }
 253
 254 }