jars/icu4j-4_4_2-src/main/classes/translit/src/com/ibm/icu/text/StringReplacer.java

   1 /*\r
   2 **********************************************************************\r
   3 *   Copyright (c) 2002-2007, International Business Machines Corporation\r
   4 *   and others.  All Rights Reserved.\r
   5 **********************************************************************\r
   6 *   Date        Name        Description\r
   7 *   01/14/2002  aliu        Creation.\r
   8 **********************************************************************\r
   9 */\r
  10 \r
  11 package com.ibm.icu.text;\r
  12 import com.ibm.icu.impl.Utility;\r
  13 \r
  14 /**\r
  15  * A replacer that produces static text as its output.  The text may\r
  16  * contain transliterator stand-in characters that represent nested\r
  17  * UnicodeReplacer objects, making it possible to encode a tree of\r
  18  * replacers in a StringReplacer.  A StringReplacer that contains such\r
  19  * stand-ins is called a <em>complex</em> StringReplacer.  A complex\r
  20  * StringReplacer has a slower processing loop than a non-complex one.\r
  21  * @author Alan Liu\r
  22  */\r
  23 class StringReplacer implements UnicodeReplacer {\r
  24 \r
  25     /**\r
  26      * Output text, possibly containing stand-in characters that\r
  27      * represent nested UnicodeReplacers.\r
  28      */\r
  29     private String output;\r
  30 \r
  31     /**\r
  32      * Cursor position.  Value is ignored if hasCursor is false.\r
  33      */\r
  34     private int cursorPos;\r
  35 \r
  36     /**\r
  37      * True if this object outputs a cursor position.\r
  38      */\r
  39     private boolean hasCursor;\r
  40 \r
  41     /**\r
  42      * A complex object contains nested replacers and requires more\r
  43      * complex processing.  StringReplacers are initially assumed to\r
  44      * be complex.  If no nested replacers are seen during processing,\r
  45      * then isComplex is set to false, and future replacements are\r
  46      * short circuited for better performance.\r
  47      */\r
  48     private boolean isComplex;\r
  49 \r
  50     /**\r
  51      * Object that translates stand-in characters in 'output' to\r
  52      * UnicodeReplacer objects.\r
  53      */\r
  54     private final RuleBasedTransliterator.Data data;\r
  55 \r
  56     /**\r
  57      * Construct a StringReplacer that sets the emits the given output\r
  58      * text and sets the cursor to the given position.\r
  59      * @param theOutput text that will replace input text when the\r
  60      * replace() method is called.  May contain stand-in characters\r
  61      * that represent nested replacers.\r
  62      * @param theCursorPos cursor position that will be returned by\r
  63      * the replace() method\r
  64      * @param theData transliterator context object that translates\r
  65      * stand-in characters to UnicodeReplacer objects\r
  66      */\r
  67     public StringReplacer(String theOutput,\r
  68                           int theCursorPos,\r
  69                           RuleBasedTransliterator.Data theData) {\r
  70         output = theOutput;\r
  71         cursorPos = theCursorPos;\r
  72         hasCursor = true;\r
  73         data = theData;\r
  74         isComplex = true;\r
  75     }\r
  76 \r
  77     /**\r
  78      * Construct a StringReplacer that sets the emits the given output\r
  79      * text and does not modify the cursor.\r
  80      * @param theOutput text that will replace input text when the\r
  81      * replace() method is called.  May contain stand-in characters\r
  82      * that represent nested replacers.\r
  83      * @param theData transliterator context object that translates\r
  84      * stand-in characters to UnicodeReplacer objects\r
  85      */\r
  86     public StringReplacer(String theOutput,\r
  87                           RuleBasedTransliterator.Data theData) {\r
  88         output = theOutput;\r
  89         cursorPos = 0;\r
  90         hasCursor = false;\r
  91         data = theData;\r
  92         isComplex = true;\r
  93     }\r
  94 \r
  95 //=    public static UnicodeReplacer valueOf(String output,\r
  96 //=                                          int cursorPos,\r
  97 //=                                          RuleBasedTransliterator.Data data) {\r
  98 //=        if (output.length() == 1) {\r
  99 //=            char c = output.charAt(0);\r
 100 //=            UnicodeReplacer r = data.lookupReplacer(c);\r
 101 //=            if (r != null) {\r
 102 //=                return r;\r
 103 //=            }\r
 104 //=        }\r
 105 //=        return new StringReplacer(output, cursorPos, data);\r
 106 //=    }\r
 107 \r
 108     /**\r
 109      * UnicodeReplacer API\r
 110      */\r
 111     public int replace(Replaceable text,\r
 112                        int start,\r
 113                        int limit,\r
 114                        int[] cursor) {\r
 115         int outLen;\r
 116         int newStart = 0;\r
 117 \r
 118         // NOTE: It should be possible to _always_ run the complex\r
 119         // processing code; just slower.  If not, then there is a bug\r
 120         // in the complex processing code.\r
 121 \r
 122         // Simple (no nested replacers) Processing Code :\r
 123         if (!isComplex) {\r
 124             text.replace(start, limit, output);\r
 125             outLen = output.length();\r
 126 \r
 127             // Setup default cursor position (for cursorPos within output)\r
 128             newStart = cursorPos;\r
 129         }\r
 130 \r
 131         // Complex (nested replacers) Processing Code :\r
 132         else {\r
 133             /* When there are segments to be copied, use the Replaceable.copy()\r
 134              * API in order to retain out-of-band data.  Copy everything to the\r
 135              * end of the string, then copy them back over the key.  This preserves\r
 136              * the integrity of indices into the key and surrounding context while\r
 137              * generating the output text.\r
 138              */\r
 139             StringBuffer buf = new StringBuffer();\r
 140             int oOutput; // offset into 'output'\r
 141             isComplex = false;\r
 142 \r
 143             // The temporary buffer starts at tempStart, and extends\r
 144             // to destLimit + tempExtra.  The start of the buffer has a single\r
 145             // character from before the key.  This provides style\r
 146             // data when addition characters are filled into the\r
 147             // temporary buffer.  If there is nothing to the left, use\r
 148             // the non-character U+FFFF, which Replaceable subclasses\r
 149             // should treat specially as a "no-style character."\r
 150             // destStart points to the point after the style context\r
 151             // character, so it is tempStart+1 or tempStart+2.\r
 152             int tempStart = text.length(); // start of temp buffer\r
 153             int destStart = tempStart; // copy new text to here\r
 154             if (start > 0) {\r
 155                 int len = UTF16.getCharCount(text.char32At(start-1));\r
 156                 text.copy(start-len, start, tempStart);\r
 157                 destStart += len;\r
 158             } else {\r
 159                 text.replace(tempStart, tempStart, "\uFFFF");\r
 160                 destStart++;\r
 161             }\r
 162             int destLimit = destStart;\r
 163             int tempExtra = 0; // temp chars after destLimit\r
 164 \r
 165             for (oOutput=0; oOutput<output.length(); ) {\r
 166                 if (oOutput == cursorPos) {\r
 167                     // Record the position of the cursor\r
 168                     newStart = buf.length() + destLimit - destStart; // relative to start\r
 169                     // the buf.length() was inserted for bug 5789\r
 170                     // the problem is that if we are accumulating into a buffer (when r == null below)\r
 171                     // then the actual length of the text at that point needs to add the buf length.\r
 172                     // there was an alternative suggested in #5789, but that looks like it won't work\r
 173                     // if we have accumulated some stuff in the dest part AND have a non-zero buffer.\r
 174                 }\r
 175                 int c = UTF16.charAt(output, oOutput);\r
 176 \r
 177                 // When we are at the last position copy the right style\r
 178                 // context character into the temporary buffer.  We don't\r
 179                 // do this before because it will provide an incorrect\r
 180                 // right context for previous replace() operations.\r
 181                 int nextIndex = oOutput + UTF16.getCharCount(c);\r
 182                 if (nextIndex == output.length()) {\r
 183                     tempExtra = UTF16.getCharCount(text.char32At(limit));\r
 184                     text.copy(limit, limit+tempExtra, destLimit);\r
 185                 }\r
 186 \r
 187                 UnicodeReplacer r = data.lookupReplacer(c);\r
 188                 if (r == null) {\r
 189                     // Accumulate straight (non-segment) text.\r
 190                     UTF16.append(buf, c);\r
 191                 } else {\r
 192                     isComplex = true;\r
 193 \r
 194                     // Insert any accumulated straight text.\r
 195                     if (buf.length() > 0) {\r
 196                         text.replace(destLimit, destLimit, buf.toString());\r
 197                         destLimit += buf.length();\r
 198                         buf.setLength(0);\r
 199                     }\r
 200 \r
 201                     // Delegate output generation to replacer object\r
 202                     int len = r.replace(text, destLimit, destLimit, cursor);\r
 203                     destLimit += len;\r
 204                 }\r
 205                 oOutput = nextIndex;\r
 206             }\r
 207             // Insert any accumulated straight text.\r
 208             if (buf.length() > 0) {\r
 209                 text.replace(destLimit, destLimit, buf.toString());\r
 210                 destLimit += buf.length();\r
 211             }\r
 212             if (oOutput == cursorPos) {\r
 213                 // Record the position of the cursor\r
 214                 newStart = destLimit - destStart; // relative to start\r
 215             }\r
 216 \r
 217             outLen = destLimit - destStart;\r
 218 \r
 219             // Copy new text to start, and delete it\r
 220             text.copy(destStart, destLimit, start);\r
 221             text.replace(tempStart + outLen, destLimit + tempExtra + outLen, "");\r
 222 \r
 223             // Delete the old text (the key)\r
 224             text.replace(start + outLen, limit + outLen, "");\r
 225         }        \r
 226 \r
 227         if (hasCursor) {\r
 228             // Adjust the cursor for positions outside the key.  These\r
 229             // refer to code points rather than code units.  If cursorPos\r
 230             // is within the output string, then use newStart, which has\r
 231             // already been set above.\r
 232             if (cursorPos < 0) {\r
 233                 newStart = start;\r
 234                 int n = cursorPos;\r
 235                 // Outside the output string, cursorPos counts code points\r
 236                 while (n < 0 && newStart > 0) {\r
 237                     newStart -= UTF16.getCharCount(text.char32At(newStart-1));\r
 238                     ++n;\r
 239                 }\r
 240                 newStart += n;\r
 241             } else if (cursorPos > output.length()) {\r
 242                 newStart = start + outLen;\r
 243                 int n = cursorPos - output.length();\r
 244                 // Outside the output string, cursorPos counts code points\r
 245                 while (n > 0 && newStart < text.length()) {\r
 246                     newStart += UTF16.getCharCount(text.char32At(newStart));\r
 247                     --n;\r
 248                 }\r
 249                 newStart += n;\r
 250             } else {\r
 251                 // Cursor is within output string.  It has been set up above\r
 252                 // to be relative to start.\r
 253                 newStart += start;\r
 254             }\r
 255 \r
 256             cursor[0] = newStart;\r
 257         }\r
 258 \r
 259         return outLen;\r
 260     }\r
 261 \r
 262     /**\r
 263      * UnicodeReplacer API\r
 264      */\r
 265     public String toReplacerPattern(boolean escapeUnprintable) {\r
 266         StringBuffer rule = new StringBuffer();\r
 267         StringBuffer quoteBuf = new StringBuffer();\r
 268 \r
 269         int cursor = cursorPos;\r
 270 \r
 271         // Handle a cursor preceding the output\r
 272         if (hasCursor && cursor < 0) {\r
 273             while (cursor++ < 0) {\r
 274                 Utility.appendToRule(rule, '@', true, escapeUnprintable, quoteBuf);\r
 275             }\r
 276             // Fall through and append '|' below\r
 277         }\r
 278 \r
 279         for (int i=0; i<output.length(); ++i) {\r
 280             if (hasCursor && i == cursor) {\r
 281                 Utility.appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);\r
 282             }\r
 283             char c = output.charAt(i); // Ok to use 16-bits here\r
 284 \r
 285             UnicodeReplacer r = data.lookupReplacer(c);\r
 286             if (r == null) {\r
 287                 Utility.appendToRule(rule, c, false, escapeUnprintable, quoteBuf);\r
 288             } else {\r
 289                 StringBuffer buf = new StringBuffer(" ");\r
 290                 buf.append(r.toReplacerPattern(escapeUnprintable));\r
 291                 buf.append(' ');\r
 292                 Utility.appendToRule(rule, buf.toString(),\r
 293                                      true, escapeUnprintable, quoteBuf);\r
 294             }\r
 295         }\r
 296 \r
 297         // Handle a cursor after the output.  Use > rather than >= because\r
 298         // if cursor == output.length() it is at the end of the output,\r
 299         // which is the default position, so we need not emit it.\r
 300         if (hasCursor && cursor > output.length()) {\r
 301             cursor -= output.length();\r
 302             while (cursor-- > 0) {\r
 303                 Utility.appendToRule(rule, '@', true, escapeUnprintable, quoteBuf);\r
 304             }\r
 305             Utility.appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);\r
 306         }\r
 307         // Flush quoteBuf out to result\r
 308         Utility.appendToRule(rule, -1,\r
 309                              true, escapeUnprintable, quoteBuf);\r
 310 \r
 311         return rule.toString();\r
 312     }\r
 313 \r
 314     /**\r
 315      * Union the set of all characters that may output by this object\r
 316      * into the given set.\r
 317      * @param toUnionTo the set into which to union the output characters\r
 318      */\r
 319     public void addReplacementSetTo(UnicodeSet toUnionTo) {\r
 320         int ch;\r
 321         for (int i=0; i<output.length(); i+=UTF16.getCharCount(ch)) {\r
 322             ch = UTF16.charAt(output, i);\r
 323             UnicodeReplacer r = data.lookupReplacer(ch);\r
 324             if (r == null) {\r
 325                 toUnionTo.add(ch);\r
 326             } else {\r
 327                 r.addReplacementSetTo(toUnionTo);\r
 328             }\r
 329         }\r
 330     }\r
 331 }\r
 332 \r
 333 //eof\r