2 *******************************************************************************
3 * Copyright (C) 2001-2004, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *******************************************************************************
7 package com.ibm.icu.text;
8 import com.ibm.icu.impl.Utility;
11 * An object that matches a fixed input string, implementing the
12 * UnicodeMatcher API. This object also implements the
13 * UnicodeReplacer API, allowing it to emit the matched text as
14 * output. Since the match text may contain flexible match elements,
15 * such as UnicodeSets, the emitted text is not the match pattern, but
16 * instead a substring of the actual matched text. Following
17 * convention, the output text is the leftmost match seen up to this point.
20 * A StringMatcher may represent a segment, in which case it has a
21 * positive segment number. This affects how the matcher converts
22 * itself to a pattern but does not otherwise affect its function.
24 * A StringMatcher that is not a segment should not be used as a UnicodeReplacer.
27 class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
30 * The text to be matched.
32 private String pattern;
35 * Start offset, in the match text, of the <em>rightmost</em> match, or -1 if no match is recorded.
38 private int matchStart;
41 * Limit offset, in the match text, of the <em>rightmost</em> match, or -1 if no match is recorded.
44 private int matchLimit;
47 * The segment number, 1-based, or 0 if not a segment.
49 private int segmentNumber;
52 * Context object that maps stand-ins to matcher and replacer objects.
55 private final RuleBasedTransliterator.Data data;
58 * Construct a matcher that matches the given pattern string.
59 * @param theString the pattern to be matched, possibly containing
60 * stand-ins that represent nested UnicodeMatcher objects.
61 * @param segmentNum the segment number from 1..n, or 0 if this is not a segment.
63 * @param theData context object mapping stand-ins to
64 * UnicodeMatcher objects.
66 public StringMatcher(String theString,
68 RuleBasedTransliterator.Data theData) {
// NOTE(review): the statements that store theString and theData are not
// visible here -- confirm that `pattern` and the final field `data` are
// assigned before the lines below.
// A fresh matcher has no recorded match: -1 in both fields is the
// "no match" sentinel that replace() tests with matchStart >= 0.
71 matchStart = matchLimit = -1;
// 0 means "not a segment"; a positive value is the 1-based segment number.
72 segmentNumber = segmentNum;
76 * Construct a matcher that matches a substring of the given pattern string.
78 * @param theString the pattern to be matched, possibly containing
79 * stand-ins that represent nested UnicodeMatcher objects.
80 * @param start first character of theString to be matched
81 * @param limit index after the last character of theString to be matched
83 * @param segmentNum the segment number from 1..n, or 0 if this is not a segment.
85 * @param theData context object mapping stand-ins to
86 * UnicodeMatcher objects.
88 public StringMatcher(String theString,
92 RuleBasedTransliterator.Data theData) {
// Delegate to the whole-string constructor, passing only the
// [start, limit) slice of theString as the pattern.
93 this(theString.substring(start, limit), segmentNum, theData);
97 * Implement UnicodeMatcher
99 public int matches(Replaceable text,
102 boolean incremental) {
103 // Note (1): We process text in 16-bit code units, rather than
104 // 32-bit code points. This works because stand-ins are
105 // always in the BMP and because we are doing a literal match
106 // operation, which can be done 16-bits at a time.
// Work on a copy of the start position so that offset[0] is left
// untouched until the final assignment at the end of the method.
108 int[] cursor = new int[] { offset[0] };
// A limit that lies to the left of the start position selects a
// right-to-left match; otherwise the forward loop further down runs.
109 if (limit < cursor[0]) {
110 // Match in the reverse direction
// Walk the pattern from its last character to its first.
111 for (i=pattern.length()-1; i>=0; --i) {
112 char keyChar = pattern.charAt(i); // OK; see note (1) above
// Ask the context object whether keyChar is a stand-in for a nested matcher.
113 UnicodeMatcher subm = data.lookupMatcher(keyChar);
// Literal comparison of one code unit (presumably the branch taken when
// no nested matcher is registered for keyChar -- confirm). The bounds
// test keeps the cursor strictly to the right of limit.
115 if (cursor[0] > limit &&
116 keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
// Nested matcher: it works on the same cursor array, limit and
// incremental flag as this matcher.
123 subm.matches(text, cursor, limit, incremental);
129 // Record the match position, but adjust for a normal
130 // forward start, limit, and only if a prior match does not
131 // exist -- we want the rightmost match.
// matchStart < 0 means no match has been recorded since the last reset.
132 if (matchStart < 0) {
// The +1 on each side converts the reverse-direction positions into a
// forward-style start/limit pair.
133 matchStart = cursor[0]+1;
134 matchLimit = offset[0]+1;
// Forward direction: walk the pattern from its first character to its last.
137 for (i=0; i<pattern.length(); ++i) {
138 if (incremental && cursor[0] == limit) {
139 // We've reached the context limit without a mismatch and
140 // without completing our match.
141 return U_PARTIAL_MATCH;
143 char keyChar = pattern.charAt(i); // OK; see note (1) above
144 UnicodeMatcher subm = data.lookupMatcher(keyChar);
146 // Don't need the cursor < limit check if
147 // incremental is true (because it's done above); do need it otherwise.
149 if (cursor[0] < limit &&
150 keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
// Nested matcher: same delegation as in the reverse loop.
157 subm.matches(text, cursor, limit, incremental);
163 // Record the match position
// Unlike the reverse branch, this assignment is not guarded by
// matchStart < 0, so a later forward match overwrites an earlier one.
164 matchStart = offset[0];
165 matchLimit = cursor[0];
// Publish the final cursor position to the caller through offset[0].
168 offset[0] = cursor[0];
173 * Implement UnicodeMatcher
175 public String toPattern(boolean escapeUnprintable) {
// result accumulates the rule text that is returned.
176 StringBuffer result = new StringBuffer();
// Scratch buffer passed to every Utility.appendToRule call below and
// flushed into result just before returning.
177 StringBuffer quoteBuf = new StringBuffer();
// Segment-only prefix handling (the statement inside this branch is not
// visible here).
178 if (segmentNumber > 0) { // i.e., if this is a segment
// Emit the pattern one 16-bit code unit at a time.
181 for (int i=0; i<pattern.length(); ++i) {
182 char keyChar = pattern.charAt(i); // OK; see note (1) above
183 UnicodeMatcher m = data.lookupMatcher(keyChar);
// Append keyChar itself to the rule text.
185 Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
// Append the nested matcher's own pattern text in place of the stand-in.
187 Utility.appendToRule(result, m.toPattern(escapeUnprintable),
188 true, escapeUnprintable, quoteBuf);
// Segment-only suffix handling, mirroring the prefix branch above.
191 if (segmentNumber > 0) { // i.e., if this is a segment
194 // Flush quoteBuf out to result
195 Utility.appendToRule(result, -1,
196 true, escapeUnprintable, quoteBuf);
197 return result.toString();
201 * Implement UnicodeMatcher
203 public boolean matchesIndexValue(int v) {
// The empty pattern is handled as a special case (its result line is not
// visible here).
204 if (pattern.length() == 0) {
// Only the first code point of the pattern is examined.
207 int c = UTF16.charAt(pattern, 0);
208 UnicodeMatcher m = data.lookupMatcher(c);
// With no nested matcher for c, compare the low 8 bits of c with v;
// otherwise delegate the question to the nested matcher.
209 return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
213 * Implementation of UnicodeMatcher API. Union the set of all
214 * characters that may be matched by this object into the given set.
216 * @param toUnionTo the set into which to union the source characters
218 public void addMatchSetTo(UnicodeSet toUnionTo) {
// Walk the pattern by code point: the step is the number of UTF-16 code
// units occupied by ch.
220 for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {
221 ch = UTF16.charAt(pattern, i);
// Check whether ch is a stand-in for a nested matcher.
222 UnicodeMatcher matcher = data.lookupMatcher(ch);
// No nested matcher for ch (the body of this branch is not visible here).
223 if (matcher == null) {
// Let the nested matcher union its own characters into the set.
226 matcher.addMatchSetTo(toUnionTo);
232 * UnicodeReplacer API
234 public int replace(Replaceable text,
241 // Copy segment with out-of-band data
243 // If there was no match, that means that a quantifier
244 // matched zero-length. E.g., x (a)* y matched "xy".
// matchStart >= 0 means a match range has been recorded since the last
// resetMatch().
245 if (matchStart >= 0) {
// An empty recorded range has nothing to copy.
246 if (matchStart != matchLimit) {
// Copy the recorded range via Replaceable.copy rather than inserting a
// plain string (see the out-of-band note above).
247 text.copy(matchStart, matchLimit, dest);
// Length of the emitted text, in code units.
248 outLen = matchLimit - matchStart;
252 text.replace(start, limit, ""); // delete original text
258 * UnicodeReplacer API
260 public String toReplacerPattern(boolean escapeUnprintable) {
261 // assert(segmentNumber > 0);
// The result is "$" followed by the segment number.
// escapeUnprintable is not consulted by the statements below.
262 StringBuffer rule = new StringBuffer("$");
// presumably radix 10 with a minimum of 1 digit -- confirm against
// Utility.appendNumber's parameter order
263 Utility.appendNumber(rule, segmentNumber, 10, 1);
264 return rule.toString();
268 * Remove any match data. This must be called before performing a
269 * set of matches with this segment.
271 public void resetMatch() {
// -1 in both fields is the "no match recorded" sentinel: matches() tests
// matchStart < 0 in its reverse branch and replace() tests matchStart >= 0.
272 matchStart = matchLimit = -1;
276 * Union the set of all characters that may be output by this object
277 * into the given set.
278 * @param toUnionTo the set into which to union the output characters
280 public void addReplacementSetTo(UnicodeSet toUnionTo) {
281 // The output of this replacer varies; it is the source text between
282 // matchStart and matchLimit. Since this varies depending on the
283 // input text, we can't compute it here. We can either do nothing
284 // or we can add ALL characters to the set. It's probably more useful