2 *******************************************************************************
\r
3 * Copyright (C) 2001-2004, International Business Machines Corporation and *
\r
4 * others. All Rights Reserved. *
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
8 import com.ibm.icu.impl.Utility;
\r
11 * An object that matches a fixed input string, implementing the
\r
12 * UnicodeMatcher API. This object also implements the
\r
13 * UnicodeReplacer API, allowing it to emit the matched text as
\r
14 * output. Since the match text may contain flexible match elements,
\r
15 * such as UnicodeSets, the emitted text is not the match pattern, but
\r
16 * instead a substring of the actual matched text. Following
\r
17 * convention, the output text is the rightmost match seen up to this
\r
20 * A StringMatcher may represent a segment, in which case it has a
\r
21 * positive segment number. This affects how the matcher converts
\r
22 * itself to a pattern but does not otherwise affect its function.
\r
24 * A StringMatcher that is not a segment should not be used as a
\r
27 class StringMatcher implements UnicodeMatcher, UnicodeReplacer {
\r
30 * The text to be matched.
\r
32 private String pattern;
\r
35 * Start offset, in the match text, of the <em>rightmost</em>
\r
38 private int matchStart;
\r
41 * Limit offset, in the match text, of the <em>rightmost</em>
\r
44 private int matchLimit;
\r
47 * The segment number, 1-based, or 0 if not a segment.
\r
49 private int segmentNumber;
\r
52 * Context object that maps stand-ins to matcher and replacer
\r
55 private final RuleBasedTransliterator.Data data;
\r
58 * Construct a matcher that matches the given pattern string.
\r
59 * @param theString the pattern to be matched, possibly containing
\r
60 * stand-ins that represent nested UnicodeMatcher objects.
\r
61 * @param segmentNum the segment number from 1..n, or 0 if this is
\r
63 * @param theData context object mapping stand-ins to
\r
64 * UnicodeMatcher objects.
\r
66 public StringMatcher(String theString,
\r
68 RuleBasedTransliterator.Data theData) {
\r
70 pattern = theString;
\r
71 matchStart = matchLimit = -1;
\r
72 segmentNumber = segmentNum;
\r
76 * Construct a matcher that matches a substring of the given
\r
78 * @param theString the pattern to be matched, possibly containing
\r
79 * stand-ins that represent nested UnicodeMatcher objects.
\r
80 * @param start first character of theString to be matched
\r
81 * @param limit index after the last character of theString to be
\r
83 * @param segmentNum the segment number from 1..n, or 0 if this is
\r
85 * @param theData context object mapping stand-ins to
\r
86 * UnicodeMatcher objects.
\r
88 public StringMatcher(String theString,
\r
92 RuleBasedTransliterator.Data theData) {
\r
93 this(theString.substring(start, limit), segmentNum, theData);
\r
97 * Implement UnicodeMatcher
\r
99 public int matches(Replaceable text,
\r
102 boolean incremental) {
\r
103 // Note (1): We process text in 16-bit code units, rather than
\r
104 // 32-bit code points. This works because stand-ins are
\r
105 // always in the BMP and because we are doing a literal match
\r
106 // operation, which can be done 16-bits at a time.
\r
108 int[] cursor = new int[] { offset[0] };
\r
109 if (limit < cursor[0]) {
\r
110 // Match in the reverse direction
\r
111 for (i=pattern.length()-1; i>=0; --i) {
\r
112 char keyChar = pattern.charAt(i); // OK; see note (1) above
\r
113 UnicodeMatcher subm = data.lookupMatcher(keyChar);
\r
114 if (subm == null) {
\r
115 if (cursor[0] > limit &&
\r
116 keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
\r
123 subm.matches(text, cursor, limit, incremental);
\r
124 if (m != U_MATCH) {
\r
129 // Record the match position, but adjust for a normal
\r
130 // forward start, limit, and only if a prior match does not
\r
131 // exist -- we want the rightmost match.
\r
132 if (matchStart < 0) {
\r
133 matchStart = cursor[0]+1;
\r
134 matchLimit = offset[0]+1;
\r
137 for (i=0; i<pattern.length(); ++i) {
\r
138 if (incremental && cursor[0] == limit) {
\r
139 // We've reached the context limit without a mismatch and
\r
140 // without completing our match.
\r
141 return U_PARTIAL_MATCH;
\r
143 char keyChar = pattern.charAt(i); // OK; see note (1) above
\r
144 UnicodeMatcher subm = data.lookupMatcher(keyChar);
\r
145 if (subm == null) {
\r
146 // Don't need the cursor < limit check if
\r
147 // incremental is true (because it's done above); do need
\r
149 if (cursor[0] < limit &&
\r
150 keyChar == text.charAt(cursor[0])) { // OK; see note (1) above
\r
157 subm.matches(text, cursor, limit, incremental);
\r
158 if (m != U_MATCH) {
\r
163 // Record the match position
\r
164 matchStart = offset[0];
\r
165 matchLimit = cursor[0];
\r
168 offset[0] = cursor[0];
\r
173 * Implement UnicodeMatcher
\r
175 public String toPattern(boolean escapeUnprintable) {
\r
176 StringBuffer result = new StringBuffer();
\r
177 StringBuffer quoteBuf = new StringBuffer();
\r
178 if (segmentNumber > 0) { // i.e., if this is a segment
\r
179 result.append('(');
\r
181 for (int i=0; i<pattern.length(); ++i) {
\r
182 char keyChar = pattern.charAt(i); // OK; see note (1) above
\r
183 UnicodeMatcher m = data.lookupMatcher(keyChar);
\r
185 Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
\r
187 Utility.appendToRule(result, m.toPattern(escapeUnprintable),
\r
188 true, escapeUnprintable, quoteBuf);
\r
191 if (segmentNumber > 0) { // i.e., if this is a segment
\r
192 result.append(')');
\r
194 // Flush quoteBuf out to result
\r
195 Utility.appendToRule(result, -1,
\r
196 true, escapeUnprintable, quoteBuf);
\r
197 return result.toString();
\r
201 * Implement UnicodeMatcher
\r
203 public boolean matchesIndexValue(int v) {
\r
204 if (pattern.length() == 0) {
\r
207 int c = UTF16.charAt(pattern, 0);
\r
208 UnicodeMatcher m = data.lookupMatcher(c);
\r
209 return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);
\r
213 * Implementation of UnicodeMatcher API. Union the set of all
\r
214 * characters that may be matched by this object into the given
\r
216 * @param toUnionTo the set into which to union the source characters
\r
218 public void addMatchSetTo(UnicodeSet toUnionTo) {
\r
220 for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {
\r
221 ch = UTF16.charAt(pattern, i);
\r
222 UnicodeMatcher matcher = data.lookupMatcher(ch);
\r
223 if (matcher == null) {
\r
226 matcher.addMatchSetTo(toUnionTo);
\r
232 * UnicodeReplacer API
\r
234 public int replace(Replaceable text,
\r
241 // Copy segment with out-of-band data
\r
243 // If there was no match, that means that a quantifier
\r
244 // matched zero-length. E.g., x (a)* y matched "xy".
\r
245 if (matchStart >= 0) {
\r
246 if (matchStart != matchLimit) {
\r
247 text.copy(matchStart, matchLimit, dest);
\r
248 outLen = matchLimit - matchStart;
\r
252 text.replace(start, limit, ""); // delete original text
\r
258 * UnicodeReplacer API
\r
260 public String toReplacerPattern(boolean escapeUnprintable) {
\r
261 // assert(segmentNumber > 0);
\r
262 StringBuffer rule = new StringBuffer("$");
\r
263 Utility.appendNumber(rule, segmentNumber, 10, 1);
\r
264 return rule.toString();
\r
268 * Remove any match data. This must be called before performing a
\r
269 * set of matches with this segment.
\r
271 public void resetMatch() {
\r
272 matchStart = matchLimit = -1;
\r
276 * Union the set of all characters that may be output by this object
\r
277 * into the given set.
\r
278 * @param toUnionTo the set into which to union the output characters
\r
280 public void addReplacementSetTo(UnicodeSet toUnionTo) {
\r
281 // The output of this replacer varies; it is the source text between
\r
282 // matchStart and matchLimit. Since this varies depending on the
\r
283 // input text, we can't compute it here. We can either do nothing
\r
284 // or we can add ALL characters to the set. It's probably more useful
\r