jars/icu4j-4_4_2-src/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 1996-2009, International Business Machines Corporation and    *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 package com.ibm.icu.text;\r
   8 \r
   9 import java.util.Hashtable;\r
  10 \r
  11 \r
  12 /**\r
  13  * <p><code>RuleBasedTransliterator</code> is a transliterator\r
  14  * that reads a set of rules in order to determine how to perform\r
  15  * translations. Rule sets are stored in resource bundles indexed by\r
  16  * name. Rules within a rule set are separated by semicolons (';').\r
  17  * To include a literal semicolon, prefix it with a backslash ('\').\r
  18  * Whitespace, as defined by <code>UCharacterProperty.isRuleWhiteSpace()</code>,\r
  19  * is ignored. If the first non-blank character on a line is '#',\r
  20  * the entire line is ignored as a comment. </p>\r
  21  *\r
  22  * <p>Each set of rules consists of two groups, one forward, and one\r
  23  * reverse. This is a convention that is not enforced; rules for one\r
  24  * direction may be omitted, with the result that translations in\r
  25  * that direction will not modify the source text. In addition,\r
  26  * bidirectional forward-reverse rules may be specified for\r
  27  * symmetrical transformations.</p>\r
  28  *\r
  29  * <p><b>Rule syntax</b> </p>\r
  30  *\r
  31  * <p>Rule statements take one of the following forms: </p>\r
  32  *\r
  33  * <dl>\r
  34  *     <dt><code>$alefmadda=\u0622;</code></dt>\r
  35  *     <dd><strong>Variable definition.</strong> The name on the\r
  36  *         left is assigned the text on the right. In this example,\r
  37  *         after this statement, instances of the left hand name,\r
  38  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by\r
  39  *         the Unicode character U+0622. Variable names must begin\r
  40  *         with a letter and consist only of letters, digits, and\r
  41  *         underscores. Case is significant. Duplicate names cause\r
  42  *         an exception to be thrown, that is, variables cannot be\r
  43  *         redefined. The right hand side may contain well-formed\r
  44  *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).\r
  45  *         The right hand side may contain embedded <code>UnicodeSet</code>\r
  46  *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>\r
  47  *     <dd>&nbsp;</dd>\r
  48  *     <dt><code>ai&gt;$alefmadda;</code></dt>\r
  49  *     <dd><strong>Forward translation rule.</strong> This rule\r
  50  *         states that the string on the left will be changed to the\r
  51  *         string on the right when performing forward\r
  52  *         transliteration.</dd>\r
  53  *     <dt>&nbsp;</dt>\r
  54  *     <dt><code>ai&lt;$alefmadda;</code></dt>\r
  55  *     <dd><strong>Reverse translation rule.</strong> This rule\r
  56  *         states that the string on the right will be changed to\r
  57  *         the string on the left when performing reverse\r
  58  *         transliteration.</dd>\r
  59  * </dl>\r
  60  *\r
  61  * <dl>\r
  62  *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>\r
  63  *     <dd><strong>Bidirectional translation rule.</strong> This\r
  64  *         rule states that the string on the left will be changed\r
  65  *         to the string on the right when performing forward\r
  66  *         transliteration, and vice versa when performing reverse\r
  67  *         transliteration.</dd>\r
  68  * </dl>\r
  69  *\r
  70  * <p>Translation rules consist of a <em>match pattern</em> and an <em>output\r
  71  * string</em>. The match pattern consists of literal characters,\r
  72  * optionally preceded by context, and optionally followed by\r
  73  * context. Context characters, like literal pattern characters,\r
  74  * must be matched in the text being transliterated. However, unlike\r
  75  * literal pattern characters, they are not replaced by the output\r
  76  * text. For example, the pattern &quot;<code>abc{def}</code>&quot;\r
  77  * indicates the characters &quot;<code>def</code>&quot; must be\r
  78  * preceded by &quot;<code>abc</code>&quot; for a successful match.\r
  79  * If there is a successful match, &quot;<code>def</code>&quot; will\r
  80  * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'\r
  81  * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to\r
  82  * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;\r
  83  * (or &quot;<code>123}456</code>&quot;) in which the literal\r
  84  * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.\r
  85  * </p>\r
  86  *\r
  87  * <p>The output string of a forward or reverse rule consists of\r
  88  * characters to replace the literal pattern characters. If the\r
  89  * output string contains the character '<code>|</code>', this is\r
  90  * taken to indicate the location of the <em>cursor</em> after\r
  91  * replacement. The cursor is the point in the text at which the\r
  92  * next replacement, if any, will be applied. The cursor is usually\r
  93  * placed within the replacement text; however, it can actually be\r
  94  * placed into the preceding or following context by using the\r
  95  * special character '<code>@</code>'. Examples:</p>\r
  96  *\r
  97  * <blockquote>\r
  98  *     <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor\r
  99  *     before a<br>\r
 100  *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between\r
 101  *     y and z</code></p>\r
 102  * </blockquote>\r
 103  *\r
 104  * <p><b>UnicodeSet</b></p>\r
 105  *\r
 106  * <p><code>UnicodeSet</code> patterns may appear anywhere that\r
 107  * makes sense. They may appear in variable definitions.\r
 108  * Contrariwise, <code>UnicodeSet</code> patterns may themselves\r
 109  * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,\r
 110  * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>\r
 111  *\r
 112  * <p><code>UnicodeSet</code> patterns may also be embedded directly\r
 113  * into rule strings. Thus, the following two rules are equivalent:</p>\r
 114  *\r
 115  * <blockquote>\r
 116  *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>\r
 117  *     [aeiou]&gt;'*';\r
 118  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#\r
 119  *     Another way</code></p>\r
 120  * </blockquote>\r
 121  *\r
 122  * <p>See {@link UnicodeSet} for more documentation and examples.</p>\r
 123  *\r
 124  * <p><b>Segments</b></p>\r
 125  *\r
 126  * <p>Segments of the input string can be matched and copied to the\r
 127  * output string. This makes certain sets of rules simpler and more\r
 128  * general, and makes reordering possible. For example:</p>\r
 129  *\r
 130  * <blockquote>\r
 131  *     <p><code>([a-z]) &gt; $1 $1;\r
 132  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#\r
 133  *     double lowercase letters<br>\r
 134  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>\r
 135  * </blockquote>\r
 136  *\r
 137  * <p>The segment of the input string to be copied is delimited by\r
 138  * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to\r
 139  * nine segments may be defined. Segments may not overlap. In the\r
 140  * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;\r
 141  * represent the input string segments, in left-to-right order of\r
 142  * definition.</p>\r
 143  *\r
 144  * <p><b>Anchors</b></p>\r
 145  *\r
 146  * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the\r
 147  * special characters '<code>^</code>' and '<code>$</code>'. For example:</p>\r
 148  *\r
 149  * <blockquote>\r
 150  *   <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>\r
 151  *   &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances\r
 152  *   of 'a'<br>\r
 153  *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text<br>\r
 154  *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances\r
 155  *   of 'z'</code></p>\r
 156  * </blockquote>\r
 157  *\r
 158  * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.\r
 159  * This is done by including a virtual anchor character '<code>$</code>' at the end of the\r
 160  * set pattern. Although this is usually the match character for the end anchor, the set will\r
 161  * match either the beginning or the end of the text, depending on its placement. For\r
 162  * example:</p>\r
 163  *\r
 164  * <blockquote>\r
 165  *   <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>\r
 166  *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>\r
 167  *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code></p>\r
 168  * </blockquote>\r
 169  *\r
 170  * <p><b>Example</b> </p>\r
 171  *\r
 172  * <p>The following example rules illustrate many of the features of\r
 173  * the rule language. </p>\r
 174  *\r
 175  * <table border="0" cellpadding="4">\r
 176  *     <tr>\r
 177  *         <td valign="top">Rule 1.</td>\r
 178  *         <td valign="top" nowrap><code>abc{def}&gt;x|y</code></td>\r
 179  *     </tr>\r
 180  *     <tr>\r
 181  *         <td valign="top">Rule 2.</td>\r
 182  *         <td valign="top" nowrap><code>xyz&gt;r</code></td>\r
 183  *     </tr>\r
 184  *     <tr>\r
 185  *         <td valign="top">Rule 3.</td>\r
 186  *         <td valign="top" nowrap><code>yz&gt;q</code></td>\r
 187  *     </tr>\r
 188  * </table>\r
 189  *\r
 190  * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;\r
 191  * yields the following results: </p>\r
 192  *\r
 193  * <table border="0" cellpadding="4">\r
 194  *     <tr>\r
 195  *         <td valign="top" nowrap><code>|adefabcdefz</code></td>\r
 196  *         <td valign="top">Initial state, no rules match. Advance\r
 197  *         cursor.</td>\r
 198  *     </tr>\r
 199  *     <tr>\r
 200  *         <td valign="top" nowrap><code>a|defabcdefz</code></td>\r
 201  *         <td valign="top">Still no match. Rule 1 does not match\r
 202  *         because the preceding context is not present.</td>\r
 203  *     </tr>\r
 204  *     <tr>\r
 205  *         <td valign="top" nowrap><code>ad|efabcdefz</code></td>\r
 206  *         <td valign="top">Still no match. Keep advancing until\r
 207  *         there is a match...</td>\r
 208  *     </tr>\r
 209  *     <tr>\r
 210  *         <td valign="top" nowrap><code>ade|fabcdefz</code></td>\r
 211  *         <td valign="top">...</td>\r
 212  *     </tr>\r
 213  *     <tr>\r
 214  *         <td valign="top" nowrap><code>adef|abcdefz</code></td>\r
 215  *         <td valign="top">...</td>\r
 216  *     </tr>\r
 217  *     <tr>\r
 218  *         <td valign="top" nowrap><code>adefa|bcdefz</code></td>\r
 219  *         <td valign="top">...</td>\r
 220  *     </tr>\r
 221  *     <tr>\r
 222  *         <td valign="top" nowrap><code>adefab|cdefz</code></td>\r
 223  *         <td valign="top">...</td>\r
 224  *     </tr>\r
 225  *     <tr>\r
 226  *         <td valign="top" nowrap><code>adefabc|defz</code></td>\r
 227  *         <td valign="top">Rule 1 matches; replace &quot;<code>def</code>&quot;\r
 228  *         with &quot;<code>xy</code>&quot; and back up the cursor\r
 229  *         to before the '<code>y</code>'.</td>\r
 230  *     </tr>\r
 231  *     <tr>\r
 232  *         <td valign="top" nowrap><code>adefabcx|yz</code></td>\r
 233  *         <td valign="top">Although &quot;<code>xyz</code>&quot; is\r
 234  *         present, rule 2 does not match because the cursor is\r
 235  *         before the '<code>y</code>', not before the '<code>x</code>'.\r
 236  *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;\r
 237  *         with &quot;<code>q</code>&quot;.</td>\r
 238  *     </tr>\r
 239  *     <tr>\r
 240  *         <td valign="top" nowrap><code>adefabcxq|</code></td>\r
 241  *         <td valign="top">The cursor is at the end;\r
 242  *         transliteration is complete.</td>\r
 243  *     </tr>\r
 244  * </table>\r
 245  *\r
 246  * <p>The order of rules is significant. If multiple rules may match\r
 247  * at some point, the first matching rule is applied. </p>\r
 248  *\r
 249  * <p>Forward and reverse rules may have an empty output string.\r
 250  * Otherwise, an empty left or right hand side of any statement is a\r
 251  * syntax error. </p>\r
 252  *\r
 253  * <p>Single quotes are used to quote any character other than a\r
 254  * digit or letter. To specify a single quote itself, inside or\r
 255  * outside of quotes, use two single quotes in a row. For example,\r
 256  * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the\r
 257  * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.\r
 258  * </p>\r
 259  *\r
 260  * <p><b>Notes</b> </p>\r
 261  *\r
 262  * <p>While a RuleBasedTransliterator is being built, it checks that\r
 263  * the rules are added in proper order. For example, if the rule\r
 264  * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,\r
 265  * then the second rule will throw an exception. The reason is that\r
 266  * the second rule can never be triggered, since the first rule\r
 267  * always matches anything it matches. In other words, the first\r
 268  * rule <em>masks</em> the second rule. </p>\r
 269  *\r
 270  * <p>Copyright (c) IBM Corporation 1999-2000. All rights reserved.</p>\r
 271  *\r
 272  * @author Alan Liu\r
 273  * @internal\r
 274  * @deprecated This API is ICU internal only.\r
 275  */\r
 276 public class RuleBasedTransliterator extends Transliterator {\r
 277 \r
 278     private Data data;\r
 279 \r
 280     /**\r
 281      * Constructs a new transliterator from the given rules.\r
 282      * @param rules rules, separated by ';'\r
 283      * @param direction either FORWARD or REVERSE.\r
 284      * @exception IllegalArgumentException if rules are malformed\r
 285      * or direction is invalid.\r
 286      * @internal\r
 287      * @deprecated This API is ICU internal only.\r
 288      */\r
 289     /*public RuleBasedTransliterator(String ID, String rules, int direction,\r
 290                                    UnicodeFilter filter) {\r
 291         super(ID, filter);\r
 292         if (direction != FORWARD && direction != REVERSE) {\r
 293             throw new IllegalArgumentException("Invalid direction");\r
 294         }\r
 295 \r
 296         TransliteratorParser parser = new TransliteratorParser();\r
 297         parser.parse(rules, direction);\r
 298         if (parser.idBlockVector.size() != 0 ||\r
 299             parser.compoundFilter != null) {\r
 300             throw new IllegalArgumentException("::ID blocks illegal in RuleBasedTransliterator constructor");\r
 301         }\r
 302 \r
 303         data = (Data)parser.dataVector.get(0);\r
 304         setMaximumContextLength(data.ruleSet.getMaximumContextLength());\r
 305     }*/\r
 306 \r
 307     /**\r
 308      * Constructs a new transliterator from the given rules in the\r
 309      * <code>FORWARD</code> direction.\r
 310      * @param rules rules, separated by ';'\r
 311      * @exception IllegalArgumentException if rules are malformed\r
 312      * or direction is invalid.\r
 313      * @internal\r
 314      * @deprecated This API is ICU internal only.\r
 315      */\r
 316     /*public RuleBasedTransliterator(String ID, String rules) {\r
 317         this(ID, rules, FORWARD, null);\r
 318     }*/\r
 319 \r
 320     RuleBasedTransliterator(String ID, Data data, UnicodeFilter filter) {\r
 321         super(ID, filter);\r
 322         this.data = data;\r
 323         setMaximumContextLength(data.ruleSet.getMaximumContextLength());\r
 324     }\r
 325 \r
 326     /**\r
 327      * Implements {@link Transliterator#handleTransliterate}.\r
 328      * @internal\r
 329      * @deprecated This API is ICU internal only.\r
 330      */\r
 331     protected synchronized void handleTransliterate(Replaceable text,\r
 332                                        Position index, boolean incremental) {\r
 333         /* We keep start and limit fixed the entire time,\r
 334          * relative to the text -- limit may move numerically if text is\r
 335          * inserted or removed.  The cursor moves from start to limit, with\r
 336          * replacements happening under it.\r
 337          *\r
 338          * Example: rules 1. ab>x|y\r
 339          *                2. yc>z\r
 340          *\r
 341          * |eabcd   start - no match, advance cursor\r
 342          * e|abcd   match rule 1 - change text & adjust cursor\r
 343          * ex|ycd   match rule 2 - change text & adjust cursor\r
 344          * exz|d    no match, advance cursor\r
 345          * exzd|    done\r
 346          */\r
 347 \r
 348         /* A rule like\r
 349          *   a>b|a\r
 350          * creates an infinite loop. To prevent that, we put an arbitrary\r
 351          * limit on the number of iterations that we take, one that is\r
 352          * high enough that any reasonable rules are ok, but low enough to\r
 353          * prevent a server from hanging.  The limit is 16 times the\r
 354          * number of characters n, unless n is so large that 16n exceeds a\r
 355          * uint32_t.\r
 356          */\r
 357         int loopCount = 0;\r
 358         int loopLimit = (index.limit - index.start) << 4;\r
 359         if (loopLimit < 0) {\r
 360             loopLimit = 0x7FFFFFFF;\r
 361         }\r
 362 \r
 363         while (index.start < index.limit &&\r
 364                loopCount <= loopLimit &&\r
 365                data.ruleSet.transliterate(text, index, incremental)) {\r
 366             ++loopCount;\r
 367         }\r
 368     }\r
 369 \r
 370 \r
 371     static class Data {\r
 372         public Data() {\r
 373             variableNames = new Hashtable<String, char[]>();\r
 374             ruleSet = new TransliterationRuleSet();\r
 375         }\r
 376 \r
 377         /**\r
 378          * Rule table.  May be empty.\r
 379          */\r
 380         public TransliterationRuleSet ruleSet;\r
 381 \r
 382         /**\r
 383          * Map variable name (String) to variable (char[]).  A variable name\r
 384          * corresponds to zero or more characters, stored in a char[] array in\r
 385          * this hash.  One or more of these chars may also correspond to a\r
 386          * UnicodeSet, in which case the character in the char[] in this hash is\r
 387          * a stand-in: it is an index for a secondary lookup in\r
 388          * data.variables.  The stand-in also represents the UnicodeSet in\r
 389          * the stored rules.\r
 390          */\r
 391         Hashtable<String, char[]> variableNames;\r
 392 \r
 393         /**\r
 394          * Map category variable (Character) to UnicodeMatcher or UnicodeReplacer.\r
 395          * Variables that correspond to a set of characters are mapped\r
 396          * from variable name to a stand-in character in data.variableNames.\r
 397          * The stand-in then serves as a key in this hash to lookup the\r
 398          * actual UnicodeSet object.  In addition, the stand-in is\r
 399          * stored in the rule text to represent the set of characters.\r
 400          * variables[i] represents character (variablesBase + i).\r
 401          */\r
 402         Object[] variables;\r
 403 \r
 404         /**\r
 405          * The character that represents variables[0].  Characters\r
 406          * variablesBase through variablesBase +\r
 407          * variables.length - 1 represent UnicodeSet objects.\r
 408          */\r
 409         char variablesBase;\r
 410 \r
 411         /**\r
 412          * Return the UnicodeMatcher represented by the given character, or\r
 413          * null if none.\r
 414          */\r
 415         public UnicodeMatcher lookupMatcher(int standIn) {\r
 416             int i = standIn - variablesBase;\r
 417             return (i >= 0 && i < variables.length)\r
 418                 ? (UnicodeMatcher) variables[i] : null;\r
 419         }\r
 420 \r
 421         /**\r
 422          * Return the UnicodeReplacer represented by the given character, or\r
 423          * null if none.\r
 424          */\r
 425         public UnicodeReplacer lookupReplacer(int standIn) {\r
 426             int i = standIn - variablesBase;\r
 427             return (i >= 0 && i < variables.length)\r
 428                 ? (UnicodeReplacer) variables[i] : null;\r
 429         }\r
 430     }\r
 431 \r
 432 \r
 433     /**\r
 434      * Return a representation of this transliterator as source rules.\r
 435      * These rules will produce an equivalent transliterator if used\r
 436      * to construct a new transliterator.\r
 437      * @param escapeUnprintable if TRUE then convert unprintable\r
 438      * character to their hex escape representations, \\uxxxx or\r
 439      * \\Uxxxxxxxx.  Unprintable characters are those other than\r
 440      * U+000A, U+0020..U+007E.\r
 441      * @return rules string\r
 442      * @internal\r
 443      * @deprecated This API is ICU internal only.\r
 444      */\r
 445     public String toRules(boolean escapeUnprintable) {\r
 446         return data.ruleSet.toRules(escapeUnprintable);\r
 447     }\r
 448 \r
 449     /**\r
 450      * Return the set of all characters that may be modified by this\r
 451      * Transliterator, ignoring the effect of our filter.\r
 452      * @internal\r
 453      * @deprecated This API is ICU internal only.\r
 454      */\r
 455     protected UnicodeSet handleGetSourceSet() {\r
 456         return data.ruleSet.getSourceTargetSet(false);\r
 457     }\r
 458 \r
 459     /**\r
 460      * Returns the set of all characters that may be generated as\r
 461      * replacement text by this transliterator.\r
 462      * @internal\r
 463      * @deprecated This API is ICU internal only.\r
 464      */\r
 465     public UnicodeSet getTargetSet() {\r
 466         return data.ruleSet.getSourceTargetSet(true);\r
 467     }\r
 468 \r
 469     /**\r
 470      * Temporary hack for registry problem. Needs to be replaced by better architecture.\r
 471      * @internal\r
 472      * @deprecated This API is ICU internal only.\r
 473      */\r
 474     public Transliterator safeClone() {\r
 475         UnicodeFilter filter = getFilter();\r
 476         if (filter != null && filter instanceof UnicodeSet) {\r
 477             filter = new UnicodeSet((UnicodeSet)filter);\r
 478         }\r
 479         return new RuleBasedTransliterator(getID(), data, filter);\r
 480     }\r
 481 }\r
 482 \r
 483 \r