2 *******************************************************************************
\r
3 * Copyright (C) 1996-2007, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import com.ibm.icu.impl.Utility;
\r
12 * A transliteration rule used by
\r
13 * <code>RuleBasedTransliterator</code>.
\r
14 * <code>TransliterationRule</code> is an immutable object.
\r
16 * <p>A rule consists of an input pattern and an output string. When
\r
17 * the input pattern is matched, the output string is emitted. The
\r
18 * input pattern consists of zero or more characters which are matched
\r
19 * exactly (the key) and optional context. Context must match if it
\r
20 * is specified. Context may be specified before the key, after the
\r
21 * key, or both. The key, preceding context, and following context
\r
22 * may contain variables. Variables represent a set of Unicode
\r
23 * characters, such as the letters <i>a</i> through <i>z</i>.
\r
24 * Variables are detected by looking up each character in a supplied
\r
25 * variable list to see if it has been so defined.
\r
27 * <p>A rule may contain segments in its input string and segment
\r
28 * references in its output string. A segment is a substring of the
\r
29 * input pattern, indicated by an offset and limit. The segment may
\r
30 * be in the preceding or following context. It may not span a
\r
31 * context boundary. A segment reference is a special character in
\r
32 * the output string that causes a segment of the input string (not
\r
33 * the input pattern) to be copied to the output string. The range of
\r
34 * special characters that represent segment references is defined by
\r
35 * RuleBasedTransliterator.Data.
\r
37 * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input
\r
38 * string "abc.123" to "ab1.c23".
\r
40 * <p>Copyright © IBM Corporation 1999. All rights reserved.
\r
44 class TransliterationRule {
\r
46 // TODO Eliminate the pattern and keyLength data members. They
\r
47 // are used only by masks() and getIndexValue() which are called
\r
48 // only during build time, not during run-time. Perhaps these
\r
49 // methods and pattern/keyLength can be isolated into a separate
\r
53 * The match that must occur before the key, or null if there is no
\r
54 * preceding context.
\r
56 private StringMatcher anteContext;
\r
59 * The matcher object for the key. If null, then the key is empty.
\r
61 private StringMatcher key;
\r
64 * The match that must occur after the key, or null if there is no
\r
65 * following context.
\r
67 private StringMatcher postContext;
\r
70 * The object that performs the replacement if the key,
\r
71 * anteContext, and postContext are matched. Never null.
\r
73 private UnicodeReplacer output;
\r
76 * The string that must be matched, consisting of the anteContext, key,
\r
77 * and postContext, concatenated together, in that order. Some components
\r
78 * may be empty (zero length).
\r
79 * @see anteContextLength
\r
82 private String pattern;
\r
85 * An array of matcher objects corresponding to the input pattern
\r
86 * segments. If there are no segments this is null. N.B. This is
\r
87 * a UnicodeMatcher for generality, but in practice it is always a
\r
88 * StringMatcher. In the future we may generalize this, but for
\r
89 * now we sometimes cast down to StringMatcher.
\r
91 UnicodeMatcher[] segments;
\r
94 * The length of the string that must match before the key. If
\r
95 * zero, then there is no matching requirement before the key.
\r
96 * Substring [0,anteContextLength) of pattern is the anteContext.
\r
98 private int anteContextLength;
\r
101 * The length of the key. Substring [anteContextLength,
\r
102 * anteContextLength + keyLength) is the key.
\r
104 private int keyLength;
\r
107 * Miscellaneous attributes.
\r
114 static final int ANCHOR_START = 1;
\r
115 static final int ANCHOR_END = 2;
\r
118 * An alias pointer to the data for this rule. The data provides
\r
119 * lookup services for matchers and segments.
\r
121 private final RuleBasedTransliterator.Data data;
\r
125 * Construct a new rule with the given input, output text, and other
\r
126 * attributes. A cursor position may be specified for the output text.
\r
127 * @param input input string, including key and optional ante and
\r
129 * @param anteContextPos offset into input to end of ante context, or -1 if
\r
130 * none. Must be <= input.length() if not -1.
\r
131 * @param postContextPos offset into input to start of post context, or -1
\r
132 * if none. Must be <= input.length() if not -1, and must be >=
\r
134 * @param output output string
\r
135 * @param cursorPos offset into output at which cursor is located, or -1 if
\r
136 * none. If less than zero, then the cursor is placed after the
\r
137 * <code>output</code>; that is, -1 is equivalent to
\r
138 * <code>output.length()</code>. If greater than
\r
139 * <code>output.length()</code> then an exception is thrown.
\r
140 * @param cursorOffset an offset to be added to cursorPos to position the
\r
141 * cursor either in the ante context, if < 0, or in the post context, if >
\r
142 * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
\r
143 * "xyz" and moves the cursor to before "a". It would have a cursorOffset
\r
145 * @param segs array of UnicodeMatcher corresponding to input pattern
\r
146 * segments, or null if there are none
\r
147 * @param anchorStart true if the the rule is anchored on the left to
\r
148 * the context start
\r
149 * @param anchorEnd true if the rule is anchored on the right to the
\r
152 public TransliterationRule(String input,
\r
153 int anteContextPos, int postContextPos,
\r
155 int cursorPos, int cursorOffset,
\r
156 UnicodeMatcher[] segs,
\r
157 boolean anchorStart, boolean anchorEnd,
\r
158 RuleBasedTransliterator.Data theData) {
\r
161 // Do range checks only when warranted to save time
\r
162 if (anteContextPos < 0) {
\r
163 anteContextLength = 0;
\r
165 if (anteContextPos > input.length()) {
\r
166 throw new IllegalArgumentException("Invalid ante context");
\r
168 anteContextLength = anteContextPos;
\r
170 if (postContextPos < 0) {
\r
171 keyLength = input.length() - anteContextLength;
\r
173 if (postContextPos < anteContextLength ||
\r
174 postContextPos > input.length()) {
\r
175 throw new IllegalArgumentException("Invalid post context");
\r
177 keyLength = postContextPos - anteContextLength;
\r
179 if (cursorPos < 0) {
\r
180 cursorPos = output.length();
\r
181 } else if (cursorPos > output.length()) {
\r
182 throw new IllegalArgumentException("Invalid cursor position");
\r
185 // We don't validate the segments array. The caller must
\r
186 // guarantee that the segments are well-formed (that is, that
\r
187 // all $n references in the output refer to indices of this
\r
188 // array, and that no array elements are null).
\r
189 this.segments = segs;
\r
194 flags |= ANCHOR_START;
\r
197 flags |= ANCHOR_END;
\r
200 anteContext = null;
\r
201 if (anteContextLength > 0) {
\r
202 anteContext = new StringMatcher(pattern.substring(0, anteContextLength),
\r
207 if (keyLength > 0) {
\r
208 key = new StringMatcher(pattern.substring(anteContextLength, anteContextLength + keyLength),
\r
212 int postContextLength = pattern.length() - keyLength - anteContextLength;
\r
213 postContext = null;
\r
214 if (postContextLength > 0) {
\r
215 postContext = new StringMatcher(pattern.substring(anteContextLength + keyLength),
\r
219 this.output = new StringReplacer(output, cursorPos + cursorOffset, data);
\r
223 * Return the preceding context length. This method is needed to
\r
224 * support the <code>Transliterator</code> method
\r
225 * <code>getMaximumContextLength()</code>.
\r
227 public int getAnteContextLength() {
\r
228 return anteContextLength + (((flags & ANCHOR_START) != 0) ? 1 : 0);
\r
232 * Internal method. Returns 8-bit index value for this rule.
\r
233 * This is the low byte of the first character of the key,
\r
234 * unless the first character of the key is a set. If it's a
\r
235 * set, or otherwise can match multiple keys, the index value is -1.
\r
237 final int getIndexValue() {
\r
238 if (anteContextLength == pattern.length()) {
\r
239 // A pattern with just ante context {such as foo)>bar} can
\r
243 int c = UTF16.charAt(pattern, anteContextLength);
\r
244 return data.lookupMatcher(c) == null ? (c & 0xFF) : -1;
\r
248 * Internal method. Returns true if this rule matches the given
\r
249 * index value. The index value is an 8-bit integer, 0..255,
\r
250 * representing the low byte of the first character of the key.
\r
251 * It matches this rule if it matches the first character of the
\r
252 * key, or if the first character of the key is a set, and the set
\r
253 * contains any character with a low byte equal to the index
\r
254 * value. If the rule contains only ante context, as in foo)>bar,
\r
255 * then it will match any key.
\r
257 final boolean matchesIndexValue(int v) {
\r
258 // Delegate to the key, or if there is none, to the postContext.
\r
259 // If there is neither then we match any key; return true.
\r
260 UnicodeMatcher m = (key != null) ? key : postContext;
\r
261 return (m != null) ? m.matchesIndexValue(v) : true;
\r
265 * Return true if this rule masks another rule. If r1 masks r2 then
\r
266 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks
\r
267 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y".
\r
268 * "[c]a>x" masks "[dc]a>y".
\r
270 public boolean masks(TransliterationRule r2) {
\r
271 /* Rule r1 masks rule r2 if the string formed of the
\r
272 * antecontext, key, and postcontext overlaps in the following
\r
279 * The strings must be aligned at the first character of the
\r
280 * key. The length of r1 to the left of the alignment point
\r
281 * must be <= the length of r2 to the left; ditto for the
\r
282 * right. The characters of r1 must equal (or be a superset
\r
283 * of) the corresponding characters of r2. The superset
\r
284 * operation should be performed to check for UnicodeSet
\r
287 * Anchors: Two patterns that differ only in anchors only
\r
288 * mask one another if they are exactly equal, and r2 has
\r
289 * all the anchors r1 has (optionally, plus some). Here Y
\r
290 * means the row masks the column, N means it doesn't.
\r
298 * Post context: {a}b masks ab, but not vice versa, since {a}b
\r
299 * matches everything ab matches, and {a}b matches {|a|}b but ab
\r
300 * does not. Pre context is different (a{b} does not align with
\r
304 /* LIMITATION of the current mask algorithm: Some rule
\r
305 * maskings are currently not detected. For example,
\r
306 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO
\r
309 int len = pattern.length();
\r
310 int left = anteContextLength;
\r
311 int left2 = r2.anteContextLength;
\r
312 int right = pattern.length() - left;
\r
313 int right2 = r2.pattern.length() - left2;
\r
315 // TODO Clean this up -- some logic might be combinable with the
\r
318 // Test for anchor masking
\r
319 if (left == left2 && right == right2 &&
\r
320 keyLength <= r2.keyLength &&
\r
321 r2.pattern.regionMatches(0, pattern, 0, len)) {
\r
322 // The following boolean logic implements the table above
\r
323 return (flags == r2.flags) ||
\r
324 (!((flags & ANCHOR_START) != 0) && !((flags & ANCHOR_END) != 0)) ||
\r
325 (((r2.flags & ANCHOR_START) != 0) && ((r2.flags & ANCHOR_END) != 0));
\r
328 return left <= left2 &&
\r
330 (right == right2 && keyLength <= r2.keyLength)) &&
\r
331 r2.pattern.regionMatches(left2 - left, pattern, 0, len);
\r
334 static final int posBefore(Replaceable str, int pos) {
\r
336 pos - UTF16.getCharCount(str.char32At(pos-1)) :
\r
340 static final int posAfter(Replaceable str, int pos) {
\r
341 return (pos >= 0 && pos < str.length()) ?
\r
342 pos + UTF16.getCharCount(str.char32At(pos)) :
\r
347 * Attempt a match and replacement at the given position. Return
\r
348 * the degree of match between this rule and the given text. The
\r
349 * degree of match may be mismatch, a partial match, or a full
\r
350 * match. A mismatch means at least one character of the text
\r
351 * does not match the context or key. A partial match means some
\r
352 * context and key characters match, but the text is not long
\r
353 * enough to match all of them. A full match means all context
\r
354 * and key characters match.
\r
356 * If a full match is obtained, perform a replacement, update pos,
\r
357 * and return U_MATCH. Otherwise both text and pos are unchanged.
\r
359 * @param text the text
\r
360 * @param pos the position indices
\r
361 * @param incremental if TRUE, test for partial matches that may
\r
362 * be completed by additional text inserted at pos.limit.
\r
363 * @return one of <code>U_MISMATCH</code>,
\r
364 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If
\r
365 * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
\r
367 public int matchAndReplace(Replaceable text,
\r
368 Transliterator.Position pos,
\r
369 boolean incremental) {
\r
370 // Matching and replacing are done in one method because the
\r
371 // replacement operation needs information obtained during the
\r
372 // match. Another way to do this is to have the match method
\r
373 // create a match result struct with relevant offsets, and to pass
\r
374 // this into the replace method.
\r
376 // ============================ MATCH ===========================
\r
378 // Reset segment match data
\r
379 if (segments != null) {
\r
380 for (int i=0; i<segments.length; ++i) {
\r
381 ((StringMatcher) segments[i]).resetMatch();
\r
386 int[] intRef = new int[1];
\r
388 // ------------------------ Ante Context ------------------------
\r
390 // A mismatch in the ante context, or with the start anchor,
\r
391 // is an outright U_MISMATCH regardless of whether we are
\r
392 // incremental or not.
\r
393 int oText; // offset into 'text'
\r
396 // Note (1): We process text in 16-bit code units, rather than
\r
397 // 32-bit code points. This works because stand-ins are
\r
398 // always in the BMP and because we are doing a literal match
\r
399 // operation, which can be done 16-bits at a time.
\r
401 int anteLimit = posBefore(text, pos.contextStart);
\r
405 // Start reverse match at char before pos.start
\r
406 intRef[0] = posBefore(text, pos.start);
\r
408 if (anteContext != null) {
\r
409 match = anteContext.matches(text, intRef, anteLimit, false);
\r
410 if (match != UnicodeMatcher.U_MATCH) {
\r
411 return UnicodeMatcher.U_MISMATCH;
\r
417 minOText = posAfter(text, oText);
\r
419 // ------------------------ Start Anchor ------------------------
\r
421 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {
\r
422 return UnicodeMatcher.U_MISMATCH;
\r
425 // -------------------- Key and Post Context --------------------
\r
427 intRef[0] = pos.start;
\r
430 match = key.matches(text, intRef, pos.limit, incremental);
\r
431 if (match != UnicodeMatcher.U_MATCH) {
\r
436 keyLimit = intRef[0];
\r
438 if (postContext != null) {
\r
439 if (incremental && keyLimit == pos.limit) {
\r
440 // The key matches just before pos.limit, and there is
\r
441 // a postContext. Since we are in incremental mode,
\r
442 // we must assume more characters may be inserted at
\r
443 // pos.limit -- this is a partial match.
\r
444 return UnicodeMatcher.U_PARTIAL_MATCH;
\r
447 match = postContext.matches(text, intRef, pos.contextLimit, incremental);
\r
448 if (match != UnicodeMatcher.U_MATCH) {
\r
455 // ------------------------- Stop Anchor ------------------------
\r
457 if (((flags & ANCHOR_END)) != 0) {
\r
458 if (oText != pos.contextLimit) {
\r
459 return UnicodeMatcher.U_MISMATCH;
\r
462 return UnicodeMatcher.U_PARTIAL_MATCH;
\r
466 // =========================== REPLACE ==========================
\r
468 // We have a full match. The key is between pos.start and
\r
471 int newLength = output.replace(text, pos.start, keyLimit, intRef);
\r
472 int lenDelta = newLength - (keyLimit - pos.start);
\r
473 int newStart = intRef[0];
\r
476 pos.limit += lenDelta;
\r
477 pos.contextLimit += lenDelta;
\r
478 // Restrict new value of start to [minOText, min(oText, pos.limit)].
\r
479 pos.start = Math.max(minOText, Math.min(Math.min(oText, pos.limit), newStart));
\r
480 return UnicodeMatcher.U_MATCH;
\r
484 * Create a source string that represents this rule. Append it to the
\r
487 public String toRule(boolean escapeUnprintable) {
\r
490 StringBuffer rule = new StringBuffer();
\r
492 // Accumulate special characters (and non-specials following them)
\r
493 // into quoteBuf. Append quoteBuf, within single quotes, when
\r
494 // a non-quoted element must be inserted.
\r
495 StringBuffer quoteBuf = new StringBuffer();
\r
497 // Do not emit the braces '{' '}' around the pattern if there
\r
498 // is neither anteContext nor postContext.
\r
499 boolean emitBraces =
\r
500 (anteContext != null) || (postContext != null);
\r
502 // Emit start anchor
\r
503 if ((flags & ANCHOR_START) != 0) {
\r
507 // Emit the input pattern
\r
508 Utility.appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);
\r
511 Utility.appendToRule(rule, '{', true, escapeUnprintable, quoteBuf);
\r
514 Utility.appendToRule(rule, key, escapeUnprintable, quoteBuf);
\r
517 Utility.appendToRule(rule, '}', true, escapeUnprintable, quoteBuf);
\r
520 Utility.appendToRule(rule, postContext, escapeUnprintable, quoteBuf);
\r
523 if ((flags & ANCHOR_END) != 0) {
\r
527 Utility.appendToRule(rule, " > ", true, escapeUnprintable, quoteBuf);
\r
529 // Emit the output pattern
\r
531 Utility.appendToRule(rule, output.toReplacerPattern(escapeUnprintable),
\r
532 true, escapeUnprintable, quoteBuf);
\r
534 Utility.appendToRule(rule, ';', true, escapeUnprintable, quoteBuf);
\r
536 return rule.toString();
\r
540 * Return a string representation of this object.
\r
541 * @return string representation of this object
\r
543 public String toString() {
\r
544 return '{' + toRule(true) + '}';
\r
548 * Union the set of all characters that may be modified by this rule
\r
549 * into the given set.
\r
551 void addSourceSetTo(UnicodeSet toUnionTo) {
\r
552 int limit = anteContextLength + keyLength;
\r
553 for (int i=anteContextLength; i<limit; ) {
\r
554 int ch = UTF16.charAt(pattern, i);
\r
555 i += UTF16.getCharCount(ch);
\r
556 UnicodeMatcher matcher = data.lookupMatcher(ch);
\r
557 if (matcher == null) {
\r
560 matcher.addMatchSetTo(toUnionTo);
\r
566 * Union the set of all characters that may be emitted by this rule
\r
567 * into the given set.
\r
569 void addTargetSetTo(UnicodeSet toUnionTo) {
\r
570 output.addReplacementSetTo(toUnionTo);
\r