jars/icu4j-52_1/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java

   1 /*
   2  *******************************************************************************
   3  * Copyright (C) 1996-2010, International Business Machines Corporation and    *
   4  * others. All Rights Reserved.                                                *
   5  *******************************************************************************
   6  */
   7 package com.ibm.icu.text;
   8
   9 import com.ibm.icu.impl.Utility;
  10
  11 /**
  12  * A transliteration rule used by
  13  * <code>RuleBasedTransliterator</code>.
  14  * <code>TransliterationRule</code> is an immutable object.
  15  *
  16  * <p>A rule consists of an input pattern and an output string.  When
  17  * the input pattern is matched, the output string is emitted.  The
  18  * input pattern consists of zero or more characters which are matched
  19  * exactly (the key) and optional context.  Context must match if it
  20  * is specified.  Context may be specified before the key, after the
  21  * key, or both.  The key, preceding context, and following context
  22  * may contain variables.  Variables represent a set of Unicode
  23  * characters, such as the letters <i>a</i> through <i>z</i>.
  24  * Variables are detected by looking up each character in a supplied
  25  * variable list to see if it has been so defined.
  26  *
  27  * <p>A rule may contain segments in its input string and segment
  28  * references in its output string.  A segment is a substring of the
  29  * input pattern, indicated by an offset and limit.  The segment may
  30  * be in the preceding or following context.  It may not span a
  31  * context boundary.  A segment reference is a special character in
  32  * the output string that causes a segment of the input string (not
  33  * the input pattern) to be copied to the output string.  The range of
  34  * special characters that represent segment references is defined by
  35  * RuleBasedTransliterator.Data.
  36  *
  37  * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
  38  * string "abc.123" to "ab1.c23".
  39  *
  40  * <p>Copyright &copy; IBM Corporation 1999.  All rights reserved.
  41  *
  42  * @author Alan Liu
  43  */
  44 class TransliterationRule {
  45
  46     // TODO Eliminate the pattern and keyLength data members.  They
  47     // are used only by masks() and getIndexValue() which are called
  48     // only during build time, not during run-time.  Perhaps these
  49     // methods and pattern/keyLength can be isolated into a separate
  50     // object.
  51
  52     /**
  53      * The match that must occur before the key, or null if there is no
  54      * preceding context.
  55      */
  56     private StringMatcher anteContext;
  57
  58     /**
  59      * The matcher object for the key.  If null, then the key is empty.
  60      */
  61     private StringMatcher key;
  62
  63     /**
  64      * The match that must occur after the key, or null if there is no
  65      * following context.
  66      */
  67     private StringMatcher postContext;
  68
  69     /**
  70      * The object that performs the replacement if the key,
  71      * anteContext, and postContext are matched.  Never null.
  72      */
  73     private UnicodeReplacer output;
  74
  75     /**
  76      * The string that must be matched, consisting of the anteContext, key,
  77      * and postContext, concatenated together, in that order.  Some components
  78      * may be empty (zero length).
  79      * @see anteContextLength
  80      * @see keyLength
  81      */
  82     private String pattern;
  83
  84     /**
  85      * An array of matcher objects corresponding to the input pattern
  86      * segments.  If there are no segments this is null.  N.B. This is
  87      * a UnicodeMatcher for generality, but in practice it is always a
  88      * StringMatcher.  In the future we may generalize this, but for
  89      * now we sometimes cast down to StringMatcher.
  90      */
  91     UnicodeMatcher[] segments;
  92
  93     /**
  94      * The length of the string that must match before the key.  If
  95      * zero, then there is no matching requirement before the key.
  96      * Substring [0,anteContextLength) of pattern is the anteContext.
  97      */
  98     private int anteContextLength;
  99
 100     /**
 101      * The length of the key.  Substring [anteContextLength,
 102      * anteContextLength + keyLength) is the key.
 103      */
 104     private int keyLength;
 105
 106     /**
 107      * Miscellaneous attributes.
 108      */
 109     byte flags;
 110
 111     /**
 112      * Flag attributes.
 113      */
 114     static final int ANCHOR_START = 1;
 115     static final int ANCHOR_END   = 2;
 116
 117     /**
 118      * An alias pointer to the data for this rule.  The data provides
 119      * lookup services for matchers and segments.
 120      */
 121     private final RuleBasedTransliterator.Data data;
 122
 123
 124     /**
 125      * Construct a new rule with the given input, output text, and other
 126      * attributes.  A cursor position may be specified for the output text.
 127      * @param input input string, including key and optional ante and
 128      * post context
 129      * @param anteContextPos offset into input to end of ante context, or -1 if
 130      * none.  Must be <= input.length() if not -1.
 131      * @param postContextPos offset into input to start of post context, or -1
 132      * if none.  Must be <= input.length() if not -1, and must be >=
 133      * anteContextPos.
 134      * @param output output string
 135      * @param cursorPos offset into output at which cursor is located, or -1 if
 136      * none.  If less than zero, then the cursor is placed after the
 137      * <code>output</code>; that is, -1 is equivalent to
 138      * <code>output.length()</code>.  If greater than
 139      * <code>output.length()</code> then an exception is thrown.
 140      * @param cursorOffset an offset to be added to cursorPos to position the
 141      * cursor either in the ante context, if < 0, or in the post context, if >
 142      * 0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
 143      * "xyz" and moves the cursor to before "a".  It would have a cursorOffset
 144      * of -3.
 145      * @param segs array of UnicodeMatcher corresponding to input pattern
 146      * segments, or null if there are none
 147      * @param anchorStart true if the the rule is anchored on the left to
 148      * the context start
 149      * @param anchorEnd true if the rule is anchored on the right to the
 150      * context limit
 151      */
 152     public TransliterationRule(String input,
 153                                int anteContextPos, int postContextPos,
 154                                String output,
 155                                int cursorPos, int cursorOffset,
 156                                UnicodeMatcher[] segs,
 157                                boolean anchorStart, boolean anchorEnd,
 158                                RuleBasedTransliterator.Data theData) {
 159         data = theData;
 160
 161         // Do range checks only when warranted to save time
 162         if (anteContextPos < 0) {
 163             anteContextLength = 0;
 164         } else {
 165             if (anteContextPos > input.length()) {
 166                 throw new IllegalArgumentException("Invalid ante context");
 167             }
 168             anteContextLength = anteContextPos;
 169         }
 170         if (postContextPos < 0) {
 171             keyLength = input.length() - anteContextLength;
 172         } else {
 173             if (postContextPos < anteContextLength ||
 174                 postContextPos > input.length()) {
 175                 throw new IllegalArgumentException("Invalid post context");
 176             }
 177             keyLength = postContextPos - anteContextLength;
 178         }
 179         if (cursorPos < 0) {
 180             cursorPos = output.length();
 181         } else if (cursorPos > output.length()) {
 182             throw new IllegalArgumentException("Invalid cursor position");
 183         }
 184
 185         // We don't validate the segments array.  The caller must
 186         // guarantee that the segments are well-formed (that is, that
 187         // all $n references in the output refer to indices of this
 188         // array, and that no array elements are null).
 189         this.segments = segs;
 190
 191         pattern = input;
 192         flags = 0;
 193         if (anchorStart) {
 194             flags |= ANCHOR_START;
 195         }
 196         if (anchorEnd) {
 197             flags |= ANCHOR_END;
 198         }
 199
 200         anteContext = null;
 201         if (anteContextLength > 0) {
 202             anteContext = new StringMatcher(pattern.substring(0, anteContextLength),
 203                                             0, data);
 204         }
 205
 206         key = null;
 207         if (keyLength > 0) {
 208             key = new StringMatcher(pattern.substring(anteContextLength, anteContextLength + keyLength),
 209                                     0, data);
 210         }
 211
 212         int postContextLength = pattern.length() - keyLength - anteContextLength;
 213         postContext = null;
 214         if (postContextLength > 0) {
 215             postContext = new StringMatcher(pattern.substring(anteContextLength + keyLength),
 216                                             0, data);
 217         }
 218
 219         this.output = new StringReplacer(output, cursorPos + cursorOffset, data);
 220     }
 221
 222     /**
 223      * Return the preceding context length.  This method is needed to
 224      * support the <code>Transliterator</code> method
 225      * <code>getMaximumContextLength()</code>.
 226      */
 227     public int getAnteContextLength() {
 228         return anteContextLength + (((flags & ANCHOR_START) != 0) ? 1 : 0);
 229     }
 230
 231     /**
 232      * Internal method.  Returns 8-bit index value for this rule.
 233      * This is the low byte of the first character of the key,
 234      * unless the first character of the key is a set.  If it's a
 235      * set, or otherwise can match multiple keys, the index value is -1.
 236      */
 237     final int getIndexValue() {
 238         if (anteContextLength == pattern.length()) {
 239             // A pattern with just ante context {such as foo)>bar} can
 240             // match any key.
 241             return -1;
 242         }
 243         int c = UTF16.charAt(pattern, anteContextLength);
 244         return data.lookupMatcher(c) == null ? (c & 0xFF) : -1;
 245     }
 246
 247     /**
 248      * Internal method.  Returns true if this rule matches the given
 249      * index value.  The index value is an 8-bit integer, 0..255,
 250      * representing the low byte of the first character of the key.
 251      * It matches this rule if it matches the first character of the
 252      * key, or if the first character of the key is a set, and the set
 253      * contains any character with a low byte equal to the index
 254      * value.  If the rule contains only ante context, as in foo)>bar,
 255      * then it will match any key.
 256      */
 257     final boolean matchesIndexValue(int v) {
 258         // Delegate to the key, or if there is none, to the postContext.
 259         // If there is neither then we match any key; return true.
 260         UnicodeMatcher m = (key != null) ? key : postContext;
 261         return (m != null) ? m.matchesIndexValue(v) : true;
 262     }
 263
 264     /**
 265      * Return true if this rule masks another rule.  If r1 masks r2 then
 266      * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
 267      * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
 268      * "[c]a>x" masks "[dc]a>y".
 269      */
 270     public boolean masks(TransliterationRule r2) {
 271         /* Rule r1 masks rule r2 if the string formed of the
 272          * antecontext, key, and postcontext overlaps in the following
 273          * way:
 274          *
 275          * r1:      aakkkpppp
 276          * r2:     aaakkkkkpppp
 277          *            ^
 278          *
 279          * The strings must be aligned at the first character of the
 280          * key.  The length of r1 to the left of the alignment point
 281          * must be <= the length of r2 to the left; ditto for the
 282          * right.  The characters of r1 must equal (or be a superset
 283          * of) the corresponding characters of r2.  The superset
 284          * operation should be performed to check for UnicodeSet
 285          * masking.
 286          *
 287          * Anchors:  Two patterns that differ only in anchors only
 288          * mask one another if they are exactly equal, and r2 has
 289          * all the anchors r1 has (optionally, plus some).  Here Y
 290          * means the row masks the column, N means it doesn't.
 291          *
 292          *         ab   ^ab    ab$  ^ab$
 293          *   ab    Y     Y     Y     Y
 294          *  ^ab    N     Y     N     Y
 295          *   ab$   N     N     Y     Y
 296          *  ^ab$   N     N     N     Y
 297          *
 298          * Post context: {a}b masks ab, but not vice versa, since {a}b
 299          * matches everything ab matches, and {a}b matches {|a|}b but ab
 300          * does not.  Pre context is different (a{b} does not align with
 301          * ab).
 302          */
 303
 304         /* LIMITATION of the current mask algorithm: Some rule
 305          * maskings are currently not detected.  For example,
 306          * "{Lu}]a>x" masks "A]a>y".  This can be added later. TODO
 307          */
 308
 309         int len = pattern.length();
 310         int left = anteContextLength;
 311         int left2 = r2.anteContextLength;
 312         int right = pattern.length() - left;
 313         int right2 = r2.pattern.length() - left2;
 314
 315         // TODO Clean this up -- some logic might be combinable with the
 316         // next statement.
 317
 318         // Test for anchor masking
 319         if (left == left2 && right == right2 &&
 320             keyLength <= r2.keyLength &&
 321             r2.pattern.regionMatches(0, pattern, 0, len)) {
 322             // The following boolean logic implements the table above
 323             return (flags == r2.flags) ||
 324                 (!((flags & ANCHOR_START) != 0) && !((flags & ANCHOR_END) != 0)) ||
 325                 (((r2.flags & ANCHOR_START) != 0) && ((r2.flags & ANCHOR_END) != 0));
 326         }
 327
 328         return left <= left2 &&
 329             (right < right2 ||
 330              (right == right2 && keyLength <= r2.keyLength)) &&
 331             r2.pattern.regionMatches(left2 - left, pattern, 0, len);
 332     }
 333
 334     static final int posBefore(Replaceable str, int pos) {
 335         return (pos > 0) ?
 336             pos - UTF16.getCharCount(str.char32At(pos-1)) :
 337             pos - 1;
 338     }
 339
 340     static final int posAfter(Replaceable str, int pos) {
 341         return (pos >= 0 && pos < str.length()) ?
 342             pos + UTF16.getCharCount(str.char32At(pos)) :
 343             pos + 1;
 344     }
 345
 346     /**
 347      * Attempt a match and replacement at the given position.  Return
 348      * the degree of match between this rule and the given text.  The
 349      * degree of match may be mismatch, a partial match, or a full
 350      * match.  A mismatch means at least one character of the text
 351      * does not match the context or key.  A partial match means some
 352      * context and key characters match, but the text is not long
 353      * enough to match all of them.  A full match means all context
 354      * and key characters match.
 355      *
 356      * If a full match is obtained, perform a replacement, update pos,
 357      * and return U_MATCH.  Otherwise both text and pos are unchanged.
 358      *
 359      * @param text the text
 360      * @param pos the position indices
 361      * @param incremental if TRUE, test for partial matches that may
 362      * be completed by additional text inserted at pos.limit.
 363      * @return one of <code>U_MISMATCH</code>,
 364      * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
 365      * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
 366      */
 367     public int matchAndReplace(Replaceable text,
 368                                Transliterator.Position pos,
 369                                boolean incremental) {
 370         // Matching and replacing are done in one method because the
 371         // replacement operation needs information obtained during the
 372         // match.  Another way to do this is to have the match method
 373         // create a match result struct with relevant offsets, and to pass
 374         // this into the replace method.
 375
 376         // ============================ MATCH ===========================
 377
 378         // Reset segment match data
 379         if (segments != null) {
 380             for (int i=0; i<segments.length; ++i) {
 381                 ((StringMatcher) segments[i]).resetMatch();
 382             }
 383         }
 384
 385         int keyLimit;
 386         int[] intRef = new int[1];
 387
 388         // ------------------------ Ante Context ------------------------
 389
 390         // A mismatch in the ante context, or with the start anchor,
 391         // is an outright U_MISMATCH regardless of whether we are
 392         // incremental or not.
 393         int oText; // offset into 'text'
 394         int minOText;
 395
 396         // Note (1): We process text in 16-bit code units, rather than
 397         // 32-bit code points.  This works because stand-ins are
 398         // always in the BMP and because we are doing a literal match
 399         // operation, which can be done 16-bits at a time.
 400
 401         int anteLimit = posBefore(text, pos.contextStart);
 402
 403         int match;
 404
 405         // Start reverse match at char before pos.start
 406         intRef[0] = posBefore(text, pos.start);
 407
 408         if (anteContext != null) {
 409             match = anteContext.matches(text, intRef, anteLimit, false);
 410             if (match != UnicodeMatcher.U_MATCH) {
 411                 return UnicodeMatcher.U_MISMATCH;
 412             }
 413         }
 414
 415         oText = intRef[0];
 416
 417         minOText = posAfter(text, oText);
 418
 419         // ------------------------ Start Anchor ------------------------
 420
 421         if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {
 422             return UnicodeMatcher.U_MISMATCH;
 423         }
 424
 425         // -------------------- Key and Post Context --------------------
 426
 427         intRef[0] = pos.start;
 428
 429         if (key != null) {
 430             match = key.matches(text, intRef, pos.limit, incremental);
 431             if (match != UnicodeMatcher.U_MATCH) {
 432                 return match;
 433             }
 434         }
 435
 436         keyLimit = intRef[0];
 437
 438         if (postContext != null) {
 439             if (incremental && keyLimit == pos.limit) {
 440                 // The key matches just before pos.limit, and there is
 441                 // a postContext.  Since we are in incremental mode,
 442                 // we must assume more characters may be inserted at
 443                 // pos.limit -- this is a partial match.
 444                 return UnicodeMatcher.U_PARTIAL_MATCH;
 445             }
 446
 447             match = postContext.matches(text, intRef, pos.contextLimit, incremental);
 448             if (match != UnicodeMatcher.U_MATCH) {
 449                 return match;
 450             }
 451         }
 452
 453         oText = intRef[0];
 454
 455         // ------------------------- Stop Anchor ------------------------
 456
 457         if (((flags & ANCHOR_END)) != 0) {
 458             if (oText != pos.contextLimit) {
 459                 return UnicodeMatcher.U_MISMATCH;
 460             }
 461             if (incremental) {
 462                 return UnicodeMatcher.U_PARTIAL_MATCH;
 463             }
 464         }
 465
 466         // =========================== REPLACE ==========================
 467
 468         // We have a full match.  The key is between pos.start and
 469         // keyLimit.
 470
 471         int newLength = output.replace(text, pos.start, keyLimit, intRef);
 472         int lenDelta = newLength - (keyLimit - pos.start);
 473         int newStart = intRef[0];
 474
 475         oText += lenDelta;
 476         pos.limit += lenDelta;
 477         pos.contextLimit += lenDelta;
 478         // Restrict new value of start to [minOText, min(oText, pos.limit)].
 479         pos.start = Math.max(minOText, Math.min(Math.min(oText, pos.limit), newStart));
 480         return UnicodeMatcher.U_MATCH;
 481     }
 482
 483     /**
 484      * Create a source string that represents this rule.  Append it to the
 485      * given string.
 486      */
 487     public String toRule(boolean escapeUnprintable) {
 488        // int i;
 489
 490         StringBuffer rule = new StringBuffer();
 491
 492         // Accumulate special characters (and non-specials following them)
 493         // into quoteBuf.  Append quoteBuf, within single quotes, when
 494         // a non-quoted element must be inserted.
 495         StringBuffer quoteBuf = new StringBuffer();
 496
 497         // Do not emit the braces '{' '}' around the pattern if there
 498         // is neither anteContext nor postContext.
 499         boolean emitBraces =
 500             (anteContext != null) || (postContext != null);
 501
 502         // Emit start anchor
 503         if ((flags & ANCHOR_START) != 0) {
 504             rule.append('^');
 505         }
 506
 507         // Emit the input pattern
 508         Utility.appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);
 509
 510         if (emitBraces) {
 511             Utility.appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
 512         }
 513
 514         Utility.appendToRule(rule, key, escapeUnprintable, quoteBuf);
 515
 516         if (emitBraces) {
 517             Utility.appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
 518         }
 519
 520         Utility.appendToRule(rule, postContext, escapeUnprintable, quoteBuf);
 521
 522         // Emit end anchor
 523         if ((flags & ANCHOR_END) != 0) {
 524             rule.append('$');
 525         }
 526
 527         Utility.appendToRule(rule, " > ", true, escapeUnprintable, quoteBuf);
 528
 529         // Emit the output pattern
 530
 531         Utility.appendToRule(rule, output.toReplacerPattern(escapeUnprintable),
 532                      true, escapeUnprintable, quoteBuf);
 533
 534         Utility.appendToRule(rule, ';', true, escapeUnprintable, quoteBuf);
 535
 536         return rule.toString();
 537     }
 538
 539     /**
 540      * Return a string representation of this object.
 541      * @return string representation of this object
 542      */
 543     public String toString() {
 544         return '{' + toRule(true) + '}';
 545     }
 546
 547     /**
 548      * Find the source and target sets, subject to the input filter.
 549      * There is a known issue with filters containing multiple characters.
 550      */
 551     // TODO: Problem: the rule is [{ab}]c > x
 552     // The filter is [a{bc}].
 553     // If the input is abc, then the rule will work.
 554     // However, following code applying the filter won't catch that case.
 555
 556     void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting) {
 557         int limit = anteContextLength + keyLength;
 558         UnicodeSet tempSource = new UnicodeSet();
 559         UnicodeSet temp = new UnicodeSet();
 560
 561         // We need to walk through the pattern.
 562         // Iff some of the characters at ALL of the the positions are matched by the filter, then we add temp to toUnionTo
 563         for (int i=anteContextLength; i<limit; ) {
 564             int ch = UTF16.charAt(pattern, i);
 565             i += UTF16.getCharCount(ch);
 566             UnicodeMatcher matcher = data.lookupMatcher(ch);
 567             if (matcher == null) {
 568                 if (!filter.contains(ch)) {
 569                     return;
 570                 }
 571                 tempSource.add(ch);
 572             } else {
 573                 try {
 574                     if (!filter.containsSome((UnicodeSet) matcher)) {
 575                         return;
 576                     }
 577                     matcher.addMatchSetTo(tempSource);
 578                 } catch (ClassCastException e) { // if the matcher is not a UnicodeSet
 579                     temp.clear();
 580                     matcher.addMatchSetTo(temp);
 581                     if (!filter.containsSome(temp)) {
 582                         return;
 583                     }
 584                     tempSource.addAll(temp);
 585                 }
 586             }
 587         }
 588         // if we made our way through the gauntlet, add to source/target
 589         sourceSet.addAll(tempSource);
 590         output.addReplacementSetTo(targetSet);
 591     }
 592 }