jars/icu4j-4_4_2-src/main/classes/translit/src/com/ibm/icu/text/StringMatcher.java

   1 /*\r
   2  *******************************************************************************\r
   3  * Copyright (C) 2001-2004, International Business Machines Corporation and    *\r
   4  * others. All Rights Reserved.                                                *\r
   5  *******************************************************************************\r
   6  */\r
   7 package com.ibm.icu.text;\r
   8 import com.ibm.icu.impl.Utility;\r
   9 \r
  10 /**\r
  11  * An object that matches a fixed input string, implementing the\r
  12  * UnicodeMatcher API.  This object also implements the\r
  13  * UnicodeReplacer API, allowing it to emit the matched text as\r
  14  * output.  Since the match text may contain flexible match elements,\r
  15  * such as UnicodeSets, the emitted text is not the match pattern, but\r
  16  * instead a substring of the actual matched text.  Following\r
  17  * convention, the output text is the leftmost match seen up to this\r
  18  * point.\r
  19  *\r
  20  * A StringMatcher may represent a segment, in which case it has a\r
  21  * positive segment number.  This affects how the matcher converts\r
  22  * itself to a pattern but does not otherwise affect its function.\r
  23  *\r
  24  * A StringMatcher that is not a segment should not be used as a\r
  25  * UnicodeReplacer.\r
  26  */\r
  27 class StringMatcher implements UnicodeMatcher, UnicodeReplacer {\r
  28 \r
  29     /**\r
  30      * The text to be matched.\r
  31      */\r
  32     private String pattern;\r
  33 \r
  34     /**\r
  35      * Start offset, in the match text, of the <em>rightmost</em>\r
  36      * match.\r
  37      */\r
  38     private int matchStart;\r
  39     \r
  40     /**\r
  41      * Limit offset, in the match text, of the <em>rightmost</em>\r
  42      * match.\r
  43      */\r
  44     private int matchLimit;\r
  45 \r
  46     /**\r
  47      * The segment number, 1-based, or 0 if not a segment.\r
  48      */\r
  49     private int segmentNumber;\r
  50 \r
  51     /**\r
  52      * Context object that maps stand-ins to matcher and replacer\r
  53      * objects.\r
  54      */\r
  55     private final RuleBasedTransliterator.Data data;\r
  56 \r
  57     /**\r
  58      * Construct a matcher that matches the given pattern string.\r
  59      * @param theString the pattern to be matched, possibly containing\r
  60      * stand-ins that represent nested UnicodeMatcher objects.\r
  61      * @param segmentNum the segment number from 1..n, or 0 if this is\r
  62      * not a segment.\r
  63      * @param theData context object mapping stand-ins to\r
  64      * UnicodeMatcher objects.\r
  65      */\r
  66     public StringMatcher(String theString,\r
  67                          int segmentNum,\r
  68                          RuleBasedTransliterator.Data theData) {\r
  69         data = theData;\r
  70         pattern = theString;\r
  71         matchStart = matchLimit = -1;\r
  72         segmentNumber = segmentNum;\r
  73     }\r
  74 \r
  75     /**\r
  76      * Construct a matcher that matches a substring of the given\r
  77      * pattern string.\r
  78      * @param theString the pattern to be matched, possibly containing\r
  79      * stand-ins that represent nested UnicodeMatcher objects.\r
  80      * @param start first character of theString to be matched\r
  81      * @param limit index after the last character of theString to be\r
  82      * matched.\r
  83      * @param segmentNum the segment number from 1..n, or 0 if this is\r
  84      * not a segment.\r
  85      * @param theData context object mapping stand-ins to\r
  86      * UnicodeMatcher objects.\r
  87      */\r
  88     public StringMatcher(String theString,\r
  89                          int start,\r
  90                          int limit,\r
  91                          int segmentNum,\r
  92                          RuleBasedTransliterator.Data theData) {\r
  93         this(theString.substring(start, limit), segmentNum, theData);\r
  94     }\r
  95 \r
  96     /**\r
  97      * Implement UnicodeMatcher\r
  98      */\r
  99     public int matches(Replaceable text,\r
 100                        int[] offset,\r
 101                        int limit,\r
 102                        boolean incremental) {\r
 103         // Note (1): We process text in 16-bit code units, rather than\r
 104         // 32-bit code points.  This works because stand-ins are\r
 105         // always in the BMP and because we are doing a literal match\r
 106         // operation, which can be done 16-bits at a time.\r
 107         int i;\r
 108         int[] cursor = new int[] { offset[0] };\r
 109         if (limit < cursor[0]) {\r
 110             // Match in the reverse direction\r
 111             for (i=pattern.length()-1; i>=0; --i) {\r
 112                 char keyChar = pattern.charAt(i); // OK; see note (1) above\r
 113                 UnicodeMatcher subm = data.lookupMatcher(keyChar);\r
 114                 if (subm == null) {\r
 115                     if (cursor[0] > limit &&\r
 116                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above\r
 117                         --cursor[0];\r
 118                     } else {\r
 119                         return U_MISMATCH;\r
 120                     }\r
 121                 } else {\r
 122                     int m =\r
 123                         subm.matches(text, cursor, limit, incremental);\r
 124                     if (m != U_MATCH) {\r
 125                         return m;\r
 126                     }\r
 127                 }\r
 128             }\r
 129             // Record the match position, but adjust for a normal\r
 130             // forward start, limit, and only if a prior match does not\r
 131             // exist -- we want the rightmost match.\r
 132             if (matchStart < 0) {\r
 133                 matchStart = cursor[0]+1;\r
 134                 matchLimit = offset[0]+1;\r
 135             }\r
 136         } else {\r
 137             for (i=0; i<pattern.length(); ++i) {\r
 138                 if (incremental && cursor[0] == limit) {\r
 139                     // We've reached the context limit without a mismatch and\r
 140                     // without completing our match.\r
 141                     return U_PARTIAL_MATCH;\r
 142                 }\r
 143                 char keyChar = pattern.charAt(i); // OK; see note (1) above\r
 144                 UnicodeMatcher subm = data.lookupMatcher(keyChar);\r
 145                 if (subm == null) {\r
 146                     // Don't need the cursor < limit check if\r
 147                     // incremental is true (because it's done above); do need\r
 148                     // it otherwise.\r
 149                     if (cursor[0] < limit &&\r
 150                         keyChar == text.charAt(cursor[0])) { // OK; see note (1) above\r
 151                         ++cursor[0];\r
 152                     } else {\r
 153                         return U_MISMATCH;\r
 154                     }\r
 155                 } else {\r
 156                     int m =\r
 157                         subm.matches(text, cursor, limit, incremental);\r
 158                     if (m != U_MATCH) {\r
 159                         return m;\r
 160                     }\r
 161                 }\r
 162             }\r
 163             // Record the match position\r
 164             matchStart = offset[0];\r
 165             matchLimit = cursor[0];\r
 166         }\r
 167 \r
 168         offset[0] = cursor[0];\r
 169         return U_MATCH;\r
 170     }\r
 171 \r
 172     /**\r
 173      * Implement UnicodeMatcher\r
 174      */\r
 175     public String toPattern(boolean escapeUnprintable) {\r
 176         StringBuffer result = new StringBuffer();\r
 177         StringBuffer quoteBuf = new StringBuffer();\r
 178         if (segmentNumber > 0) { // i.e., if this is a segment\r
 179             result.append('(');\r
 180         }\r
 181         for (int i=0; i<pattern.length(); ++i) {\r
 182             char keyChar = pattern.charAt(i); // OK; see note (1) above\r
 183             UnicodeMatcher m = data.lookupMatcher(keyChar);\r
 184             if (m == null) {\r
 185                 Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);\r
 186             } else {\r
 187                 Utility.appendToRule(result, m.toPattern(escapeUnprintable),\r
 188                                      true, escapeUnprintable, quoteBuf);\r
 189             }\r
 190         }\r
 191         if (segmentNumber > 0) { // i.e., if this is a segment\r
 192             result.append(')');\r
 193         }\r
 194         // Flush quoteBuf out to result\r
 195         Utility.appendToRule(result, -1,\r
 196                              true, escapeUnprintable, quoteBuf);\r
 197         return result.toString();\r
 198     }\r
 199 \r
 200     /**\r
 201      * Implement UnicodeMatcher\r
 202      */\r
 203     public boolean matchesIndexValue(int v) {\r
 204         if (pattern.length() == 0) {\r
 205             return true;\r
 206         }\r
 207         int c = UTF16.charAt(pattern, 0);\r
 208         UnicodeMatcher m = data.lookupMatcher(c);\r
 209         return (m == null) ? ((c & 0xFF) == v) : m.matchesIndexValue(v);\r
 210     }\r
 211 \r
 212     /**\r
 213      * Implementation of UnicodeMatcher API.  Union the set of all\r
 214      * characters that may be matched by this object into the given\r
 215      * set.\r
 216      * @param toUnionTo the set into which to union the source characters\r
 217      */\r
 218     public void addMatchSetTo(UnicodeSet toUnionTo) {\r
 219         int ch;\r
 220         for (int i=0; i<pattern.length(); i+=UTF16.getCharCount(ch)) {\r
 221             ch = UTF16.charAt(pattern, i);\r
 222             UnicodeMatcher matcher = data.lookupMatcher(ch);\r
 223             if (matcher == null) {\r
 224                 toUnionTo.add(ch);\r
 225             } else {\r
 226                 matcher.addMatchSetTo(toUnionTo);\r
 227             }\r
 228         }\r
 229     }\r
 230 \r
 231     /**\r
 232      * UnicodeReplacer API\r
 233      */\r
 234     public int replace(Replaceable text,\r
 235                        int start,\r
 236                        int limit,\r
 237                        int[] cursor) {\r
 238 \r
 239         int outLen = 0;\r
 240 \r
 241         // Copy segment with out-of-band data\r
 242         int dest = limit;\r
 243         // If there was no match, that means that a quantifier\r
 244         // matched zero-length.  E.g., x (a)* y matched "xy".\r
 245         if (matchStart >= 0) {\r
 246             if (matchStart != matchLimit) {\r
 247                 text.copy(matchStart, matchLimit, dest);\r
 248                 outLen = matchLimit - matchStart;\r
 249             }\r
 250         }\r
 251 \r
 252         text.replace(start, limit, ""); // delete original text\r
 253 \r
 254         return outLen;\r
 255     }\r
 256 \r
 257     /**\r
 258      * UnicodeReplacer API\r
 259      */\r
 260     public String toReplacerPattern(boolean escapeUnprintable) {\r
 261         // assert(segmentNumber > 0);\r
 262         StringBuffer rule = new StringBuffer("$");\r
 263         Utility.appendNumber(rule, segmentNumber, 10, 1);\r
 264         return rule.toString();\r
 265     }\r
 266 \r
 267     /**\r
 268      * Remove any match data.  This must be called before performing a\r
 269      * set of matches with this segment.\r
 270      */\r
 271     public void resetMatch() {\r
 272         matchStart = matchLimit = -1;\r
 273     }\r
 274 \r
 275     /**\r
 276      * Union the set of all characters that may output by this object\r
 277      * into the given set.\r
 278      * @param toUnionTo the set into which to union the output characters\r
 279      */\r
 280     public void addReplacementSetTo(UnicodeSet toUnionTo) {\r
 281         // The output of this replacer varies; it is the source text between\r
 282         // matchStart and matchLimit.  Since this varies depending on the\r
 283         // input text, we can't compute it here.  We can either do nothing\r
 284         // or we can add ALL characters to the set.  It's probably more useful\r
 285         // to do nothing.\r
 286     }\r
 287 }\r
 288 \r
 289 //eof\r