jars/icu4j-52_1/main/classes/translit/src/com/ibm/icu/text/StringReplacer.java

   1 /*
   2 **********************************************************************
   3 *   Copyright (c) 2002-2007, International Business Machines Corporation
   4 *   and others.  All Rights Reserved.
   5 **********************************************************************
   6 *   Date        Name        Description
   7 *   01/14/2002  aliu        Creation.
   8 **********************************************************************
   9 */
  10
  11 package com.ibm.icu.text;
  12 import com.ibm.icu.impl.Utility;
  13
  14 /**
  15  * A replacer that produces static text as its output.  The text may
  16  * contain transliterator stand-in characters that represent nested
  17  * UnicodeReplacer objects, making it possible to encode a tree of
  18  * replacers in a StringReplacer.  A StringReplacer that contains such
  19  * stand-ins is called a <em>complex</em> StringReplacer.  A complex
  20  * StringReplacer has a slower processing loop than a non-complex one.
  21  * @author Alan Liu
  22  */
  23 class StringReplacer implements UnicodeReplacer {
  24
  25     /**
  26      * Output text, possibly containing stand-in characters that
  27      * represent nested UnicodeReplacers.
  28      */
  29     private String output;
  30
  31     /**
  32      * Cursor position.  Value is ignored if hasCursor is false.
  33      */
  34     private int cursorPos;
  35
  36     /**
  37      * True if this object outputs a cursor position.
  38      */
  39     private boolean hasCursor;
  40
  41     /**
  42      * A complex object contains nested replacers and requires more
  43      * complex processing.  StringReplacers are initially assumed to
  44      * be complex.  If no nested replacers are seen during processing,
  45      * then isComplex is set to false, and future replacements are
  46      * short circuited for better performance.
  47      */
  48     private boolean isComplex;
  49
  50     /**
  51      * Object that translates stand-in characters in 'output' to
  52      * UnicodeReplacer objects.
  53      */
  54     private final RuleBasedTransliterator.Data data;
  55
  56     /**
  57      * Construct a StringReplacer that sets the emits the given output
  58      * text and sets the cursor to the given position.
  59      * @param theOutput text that will replace input text when the
  60      * replace() method is called.  May contain stand-in characters
  61      * that represent nested replacers.
  62      * @param theCursorPos cursor position that will be returned by
  63      * the replace() method
  64      * @param theData transliterator context object that translates
  65      * stand-in characters to UnicodeReplacer objects
  66      */
  67     public StringReplacer(String theOutput,
  68                           int theCursorPos,
  69                           RuleBasedTransliterator.Data theData) {
  70         output = theOutput;
  71         cursorPos = theCursorPos;
  72         hasCursor = true;
  73         data = theData;
  74         isComplex = true;
  75     }
  76
  77     /**
  78      * Construct a StringReplacer that sets the emits the given output
  79      * text and does not modify the cursor.
  80      * @param theOutput text that will replace input text when the
  81      * replace() method is called.  May contain stand-in characters
  82      * that represent nested replacers.
  83      * @param theData transliterator context object that translates
  84      * stand-in characters to UnicodeReplacer objects
  85      */
  86     public StringReplacer(String theOutput,
  87                           RuleBasedTransliterator.Data theData) {
  88         output = theOutput;
  89         cursorPos = 0;
  90         hasCursor = false;
  91         data = theData;
  92         isComplex = true;
  93     }
  94
  95 //=    public static UnicodeReplacer valueOf(String output,
  96 //=                                          int cursorPos,
  97 //=                                          RuleBasedTransliterator.Data data) {
  98 //=        if (output.length() == 1) {
  99 //=            char c = output.charAt(0);
 100 //=            UnicodeReplacer r = data.lookupReplacer(c);
 101 //=            if (r != null) {
 102 //=                return r;
 103 //=            }
 104 //=        }
 105 //=        return new StringReplacer(output, cursorPos, data);
 106 //=    }
 107
 108     /**
 109      * UnicodeReplacer API
 110      */
 111     public int replace(Replaceable text,
 112                        int start,
 113                        int limit,
 114                        int[] cursor) {
 115         int outLen;
 116         int newStart = 0;
 117
 118         // NOTE: It should be possible to _always_ run the complex
 119         // processing code; just slower.  If not, then there is a bug
 120         // in the complex processing code.
 121
 122         // Simple (no nested replacers) Processing Code :
 123         if (!isComplex) {
 124             text.replace(start, limit, output);
 125             outLen = output.length();
 126
 127             // Setup default cursor position (for cursorPos within output)
 128             newStart = cursorPos;
 129         }
 130
 131         // Complex (nested replacers) Processing Code :
 132         else {
 133             /* When there are segments to be copied, use the Replaceable.copy()
 134              * API in order to retain out-of-band data.  Copy everything to the
 135              * end of the string, then copy them back over the key.  This preserves
 136              * the integrity of indices into the key and surrounding context while
 137              * generating the output text.
 138              */
 139             StringBuffer buf = new StringBuffer();
 140             int oOutput; // offset into 'output'
 141             isComplex = false;
 142
 143             // The temporary buffer starts at tempStart, and extends
 144             // to destLimit + tempExtra.  The start of the buffer has a single
 145             // character from before the key.  This provides style
 146             // data when addition characters are filled into the
 147             // temporary buffer.  If there is nothing to the left, use
 148             // the non-character U+FFFF, which Replaceable subclasses
 149             // should treat specially as a "no-style character."
 150             // destStart points to the point after the style context
 151             // character, so it is tempStart+1 or tempStart+2.
 152             int tempStart = text.length(); // start of temp buffer
 153             int destStart = tempStart; // copy new text to here
 154             if (start > 0) {
 155                 int len = UTF16.getCharCount(text.char32At(start-1));
 156                 text.copy(start-len, start, tempStart);
 157                 destStart += len;
 158             } else {
 159                 text.replace(tempStart, tempStart, "\uFFFF");
 160                 destStart++;
 161             }
 162             int destLimit = destStart;
 163             int tempExtra = 0; // temp chars after destLimit
 164
 165             for (oOutput=0; oOutput<output.length(); ) {
 166                 if (oOutput == cursorPos) {
 167                     // Record the position of the cursor
 168                     newStart = buf.length() + destLimit - destStart; // relative to start
 169                     // the buf.length() was inserted for bug 5789
 170                     // the problem is that if we are accumulating into a buffer (when r == null below)
 171                     // then the actual length of the text at that point needs to add the buf length.
 172                     // there was an alternative suggested in #5789, but that looks like it won't work
 173                     // if we have accumulated some stuff in the dest part AND have a non-zero buffer.
 174                 }
 175                 int c = UTF16.charAt(output, oOutput);
 176
 177                 // When we are at the last position copy the right style
 178                 // context character into the temporary buffer.  We don't
 179                 // do this before because it will provide an incorrect
 180                 // right context for previous replace() operations.
 181                 int nextIndex = oOutput + UTF16.getCharCount(c);
 182                 if (nextIndex == output.length()) {
 183                     tempExtra = UTF16.getCharCount(text.char32At(limit));
 184                     text.copy(limit, limit+tempExtra, destLimit);
 185                 }
 186
 187                 UnicodeReplacer r = data.lookupReplacer(c);
 188                 if (r == null) {
 189                     // Accumulate straight (non-segment) text.
 190                     UTF16.append(buf, c);
 191                 } else {
 192                     isComplex = true;
 193
 194                     // Insert any accumulated straight text.
 195                     if (buf.length() > 0) {
 196                         text.replace(destLimit, destLimit, buf.toString());
 197                         destLimit += buf.length();
 198                         buf.setLength(0);
 199                     }
 200
 201                     // Delegate output generation to replacer object
 202                     int len = r.replace(text, destLimit, destLimit, cursor);
 203                     destLimit += len;
 204                 }
 205                 oOutput = nextIndex;
 206             }
 207             // Insert any accumulated straight text.
 208             if (buf.length() > 0) {
 209                 text.replace(destLimit, destLimit, buf.toString());
 210                 destLimit += buf.length();
 211             }
 212             if (oOutput == cursorPos) {
 213                 // Record the position of the cursor
 214                 newStart = destLimit - destStart; // relative to start
 215             }
 216
 217             outLen = destLimit - destStart;
 218
 219             // Copy new text to start, and delete it
 220             text.copy(destStart, destLimit, start);
 221             text.replace(tempStart + outLen, destLimit + tempExtra + outLen, "");
 222
 223             // Delete the old text (the key)
 224             text.replace(start + outLen, limit + outLen, "");
 225         }
 226
 227         if (hasCursor) {
 228             // Adjust the cursor for positions outside the key.  These
 229             // refer to code points rather than code units.  If cursorPos
 230             // is within the output string, then use newStart, which has
 231             // already been set above.
 232             if (cursorPos < 0) {
 233                 newStart = start;
 234                 int n = cursorPos;
 235                 // Outside the output string, cursorPos counts code points
 236                 while (n < 0 && newStart > 0) {
 237                     newStart -= UTF16.getCharCount(text.char32At(newStart-1));
 238                     ++n;
 239                 }
 240                 newStart += n;
 241             } else if (cursorPos > output.length()) {
 242                 newStart = start + outLen;
 243                 int n = cursorPos - output.length();
 244                 // Outside the output string, cursorPos counts code points
 245                 while (n > 0 && newStart < text.length()) {
 246                     newStart += UTF16.getCharCount(text.char32At(newStart));
 247                     --n;
 248                 }
 249                 newStart += n;
 250             } else {
 251                 // Cursor is within output string.  It has been set up above
 252                 // to be relative to start.
 253                 newStart += start;
 254             }
 255
 256             cursor[0] = newStart;
 257         }
 258
 259         return outLen;
 260     }
 261
 262     /**
 263      * UnicodeReplacer API
 264      */
 265     public String toReplacerPattern(boolean escapeUnprintable) {
 266         StringBuffer rule = new StringBuffer();
 267         StringBuffer quoteBuf = new StringBuffer();
 268
 269         int cursor = cursorPos;
 270
 271         // Handle a cursor preceding the output
 272         if (hasCursor && cursor < 0) {
 273             while (cursor++ < 0) {
 274                 Utility.appendToRule(rule, '@', true, escapeUnprintable, quoteBuf);
 275             }
 276             // Fall through and append '|' below
 277         }
 278
 279         for (int i=0; i<output.length(); ++i) {
 280             if (hasCursor && i == cursor) {
 281                 Utility.appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);
 282             }
 283             char c = output.charAt(i); // Ok to use 16-bits here
 284
 285             UnicodeReplacer r = data.lookupReplacer(c);
 286             if (r == null) {
 287                 Utility.appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
 288             } else {
 289                 StringBuffer buf = new StringBuffer(" ");
 290                 buf.append(r.toReplacerPattern(escapeUnprintable));
 291                 buf.append(' ');
 292                 Utility.appendToRule(rule, buf.toString(),
 293                                      true, escapeUnprintable, quoteBuf);
 294             }
 295         }
 296
 297         // Handle a cursor after the output.  Use > rather than >= because
 298         // if cursor == output.length() it is at the end of the output,
 299         // which is the default position, so we need not emit it.
 300         if (hasCursor && cursor > output.length()) {
 301             cursor -= output.length();
 302             while (cursor-- > 0) {
 303                 Utility.appendToRule(rule, '@', true, escapeUnprintable, quoteBuf);
 304             }
 305             Utility.appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);
 306         }
 307         // Flush quoteBuf out to result
 308         Utility.appendToRule(rule, -1,
 309                              true, escapeUnprintable, quoteBuf);
 310
 311         return rule.toString();
 312     }
 313
 314     /**
 315      * Union the set of all characters that may output by this object
 316      * into the given set.
 317      * @param toUnionTo the set into which to union the output characters
 318      */
 319     public void addReplacementSetTo(UnicodeSet toUnionTo) {
 320         int ch;
 321         for (int i=0; i<output.length(); i+=UTF16.getCharCount(ch)) {
 322             ch = UTF16.charAt(output, i);
 323             UnicodeReplacer r = data.lookupReplacer(ch);
 324             if (r == null) {
 325                 toUnionTo.add(ch);
 326             } else {
 327                 r.addReplacementSetTo(toUnionTo);
 328             }
 329         }
 330     }
 331 }
 332
 333 //eof