/* ******************************************************************************* * Copyright (C) 2001-2004, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.text; import com.ibm.icu.impl.Utility; /** * An object that matches a fixed input string, implementing the * UnicodeMatcher API. This object also implements the * UnicodeReplacer API, allowing it to emit the matched text as * output. Since the match text may contain flexible match elements, * such as UnicodeSets, the emitted text is not the match pattern, but * instead a substring of the actual matched text. Following * convention, the output text is the leftmost match seen up to this * point. * * A StringMatcher may represent a segment, in which case it has a * positive segment number. This affects how the matcher converts * itself to a pattern but does not otherwise affect its function. * * A StringMatcher that is not a segment should not be used as a * UnicodeReplacer. */ class StringMatcher implements UnicodeMatcher, UnicodeReplacer { /** * The text to be matched. */ private String pattern; /** * Start offset, in the match text, of the rightmost * match. */ private int matchStart; /** * Limit offset, in the match text, of the rightmost * match. */ private int matchLimit; /** * The segment number, 1-based, or 0 if not a segment. */ private int segmentNumber; /** * Context object that maps stand-ins to matcher and replacer * objects. */ private final RuleBasedTransliterator.Data data; /** * Construct a matcher that matches the given pattern string. * @param theString the pattern to be matched, possibly containing * stand-ins that represent nested UnicodeMatcher objects. * @param segmentNum the segment number from 1..n, or 0 if this is * not a segment. * @param theData context object mapping stand-ins to * UnicodeMatcher objects. */ public StringMatcher(String theString, int segmentNum, RuleBasedTransliterator.Data theData) { data = theData; pattern = theString; matchStart = matchLimit = -1; segmentNumber = segmentNum; } /** * Construct a matcher that matches a substring of the given * pattern string. * @param theString the pattern to be matched, possibly containing * stand-ins that represent nested UnicodeMatcher objects. * @param start first character of theString to be matched * @param limit index after the last character of theString to be * matched. * @param segmentNum the segment number from 1..n, or 0 if this is * not a segment. * @param theData context object mapping stand-ins to * UnicodeMatcher objects. */ public StringMatcher(String theString, int start, int limit, int segmentNum, RuleBasedTransliterator.Data theData) { this(theString.substring(start, limit), segmentNum, theData); } /** * Implement UnicodeMatcher */ public int matches(Replaceable text, int[] offset, int limit, boolean incremental) { // Note (1): We process text in 16-bit code units, rather than // 32-bit code points. This works because stand-ins are // always in the BMP and because we are doing a literal match // operation, which can be done 16-bits at a time. int i; int[] cursor = new int[] { offset[0] }; if (limit < cursor[0]) { // Match in the reverse direction for (i=pattern.length()-1; i>=0; --i) { char keyChar = pattern.charAt(i); // OK; see note (1) above UnicodeMatcher subm = data.lookupMatcher(keyChar); if (subm == null) { if (cursor[0] > limit && keyChar == text.charAt(cursor[0])) { // OK; see note (1) above --cursor[0]; } else { return U_MISMATCH; } } else { int m = subm.matches(text, cursor, limit, incremental); if (m != U_MATCH) { return m; } } } // Record the match position, but adjust for a normal // forward start, limit, and only if a prior match does not // exist -- we want the rightmost match. if (matchStart < 0) { matchStart = cursor[0]+1; matchLimit = offset[0]+1; } } else { for (i=0; i 0) { // i.e., if this is a segment result.append('('); } for (int i=0; i 0) { // i.e., if this is a segment result.append(')'); } // Flush quoteBuf out to result Utility.appendToRule(result, -1, true, escapeUnprintable, quoteBuf); return result.toString(); } /** * Implement UnicodeMatcher */ public boolean matchesIndexValue(int v) { if (pattern.length() == 0) { return true; } int c = UTF16.charAt(pattern, 0); UnicodeMatcher m = data.lookupMatcher(c); return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v); } /** * Implementation of UnicodeMatcher API. Union the set of all * characters that may be matched by this object into the given * set. * @param toUnionTo the set into which to union the source characters */ public void addMatchSetTo(UnicodeSet toUnionTo) { int ch; for (int i=0; i= 0) { if (matchStart != matchLimit) { text.copy(matchStart, matchLimit, dest); outLen = matchLimit - matchStart; } } text.replace(start, limit, ""); // delete original text return outLen; } /** * UnicodeReplacer API */ public String toReplacerPattern(boolean escapeUnprintable) { // assert(segmentNumber > 0); StringBuffer rule = new StringBuffer("$"); Utility.appendNumber(rule, segmentNumber, 10, 1); return rule.toString(); } /** * Remove any match data. This must be called before performing a * set of matches with this segment. */ public void resetMatch() { matchStart = matchLimit = -1; } /** * Union the set of all characters that may output by this object * into the given set. * @param toUnionTo the set into which to union the output characters */ public void addReplacementSetTo(UnicodeSet toUnionTo) { // The output of this replacer varies; it is the source text between // matchStart and matchLimit. Since this varies depending on the // input text, we can't compute it here. We can either do nothing // or we can add ALL characters to the set. It's probably more useful // to do nothing. } } //eof