2 **********************************************************************
\r
3 * Copyright (c) 2002-2007, International Business Machines Corporation
\r
4 * and others. All Rights Reserved.
\r
5 **********************************************************************
\r
6 * Date Name Description
\r
7 * 01/14/2002 aliu Creation.
\r
8 **********************************************************************
\r
11 package com.ibm.icu.text;
\r
12 import com.ibm.icu.impl.Utility;
\r
15 * A replacer that produces static text as its output. The text may
\r
16 * contain transliterator stand-in characters that represent nested
\r
17 * UnicodeReplacer objects, making it possible to encode a tree of
\r
18 * replacers in a StringReplacer. A StringReplacer that contains such
\r
19 * stand-ins is called a <em>complex</em> StringReplacer. A complex
\r
20 * StringReplacer has a slower processing loop than a non-complex one.
\r
23 class StringReplacer implements UnicodeReplacer {
\r
26 * Output text, possibly containing stand-in characters that
\r
27 * represent nested UnicodeReplacers.
\r
29 private String output;
\r
32 * Cursor position. Value is ignored if hasCursor is false.
\r
34 private int cursorPos;
\r
37 * True if this object outputs a cursor position.
\r
39 private boolean hasCursor;
\r
42 * A complex object contains nested replacers and requires more
\r
43 * complex processing. StringReplacers are initially assumed to
\r
44 * be complex. If no nested replacers are seen during processing,
\r
45 * then isComplex is set to false, and future replacements are
\r
46 * short circuited for better performance.
\r
48 private boolean isComplex;
\r
51 * Object that translates stand-in characters in 'output' to
\r
52 * UnicodeReplacer objects.
\r
54 private final RuleBasedTransliterator.Data data;
\r
57 * Construct a StringReplacer that emits the given output
\r
58 * text and sets the cursor to the given position.
\r
59 * @param theOutput text that will replace input text when the
\r
60 * replace() method is called. May contain stand-in characters
\r
61 * that represent nested replacers.
\r
62 * @param theCursorPos cursor position that will be returned by
\r
63 * the replace() method
\r
64 * @param theData transliterator context object that translates
\r
65 * stand-in characters to UnicodeReplacer objects
\r
67 public StringReplacer(String theOutput,
\r
69 RuleBasedTransliterator.Data theData) {
\r
71 cursorPos = theCursorPos;
\r
78 * Construct a StringReplacer that emits the given output
\r
79 * text and does not modify the cursor.
\r
80 * @param theOutput text that will replace input text when the
\r
81 * replace() method is called. May contain stand-in characters
\r
82 * that represent nested replacers.
\r
83 * @param theData transliterator context object that translates
\r
84 * stand-in characters to UnicodeReplacer objects
\r
86 public StringReplacer(String theOutput,
\r
87 RuleBasedTransliterator.Data theData) {
\r
95 //= public static UnicodeReplacer valueOf(String output,
\r
97 //= RuleBasedTransliterator.Data data) {
\r
98 //= if (output.length() == 1) {
\r
99 //= char c = output.charAt(0);
\r
100 //= UnicodeReplacer r = data.lookupReplacer(c);
\r
101 //= if (r != null) {
\r
105 //= return new StringReplacer(output, cursorPos, data);
\r
109 * UnicodeReplacer API
\r
111 public int replace(Replaceable text,
\r
118 // NOTE: It should be possible to _always_ run the complex
\r
119 // processing code; just slower. If not, then there is a bug
\r
120 // in the complex processing code.
\r
122 // Simple (no nested replacers) Processing Code :
\r
124 text.replace(start, limit, output);
\r
125 outLen = output.length();
\r
127 // Setup default cursor position (for cursorPos within output)
\r
128 newStart = cursorPos;
\r
131 // Complex (nested replacers) Processing Code :
\r
133 /* When there are segments to be copied, use the Replaceable.copy()
\r
134 * API in order to retain out-of-band data. Copy everything to the
\r
135 * end of the string, then copy them back over the key. This preserves
\r
136 * the integrity of indices into the key and surrounding context while
\r
137 * generating the output text.
\r
139 StringBuffer buf = new StringBuffer();
\r
140 int oOutput; // offset into 'output'
\r
143 // The temporary buffer starts at tempStart, and extends
\r
144 // to destLimit + tempExtra. The start of the buffer has a single
\r
145 // character from before the key. This provides style
\r
146 // data when additional characters are filled into the
\r
147 // temporary buffer. If there is nothing to the left, use
\r
148 // the non-character U+FFFF, which Replaceable subclasses
\r
149 // should treat specially as a "no-style character."
\r
150 // destStart points to the point after the style context
\r
151 // character, so it is tempStart+1 or tempStart+2.
\r
152 int tempStart = text.length(); // start of temp buffer
\r
153 int destStart = tempStart; // copy new text to here
\r
155 int len = UTF16.getCharCount(text.char32At(start-1));
\r
156 text.copy(start-len, start, tempStart);
\r
159 text.replace(tempStart, tempStart, "\uFFFF");
\r
162 int destLimit = destStart;
\r
163 int tempExtra = 0; // temp chars after destLimit
\r
165 for (oOutput=0; oOutput<output.length(); ) {
\r
166 if (oOutput == cursorPos) {
\r
167 // Record the position of the cursor
\r
168 newStart = buf.length() + destLimit - destStart; // relative to start
\r
169 // the buf.length() was inserted for bug 5789
\r
170 // the problem is that if we are accumulating into a buffer (when r == null below)
\r
171 // then the actual length of the text at that point needs to add the buf length.
\r
172 // there was an alternative suggested in #5789, but that looks like it won't work
\r
173 // if we have accumulated some stuff in the dest part AND have a non-zero buffer.
\r
175 int c = UTF16.charAt(output, oOutput);
\r
177 // When we are at the last position copy the right style
\r
178 // context character into the temporary buffer. We don't
\r
179 // do this before because it will provide an incorrect
\r
180 // right context for previous replace() operations.
\r
181 int nextIndex = oOutput + UTF16.getCharCount(c);
\r
182 if (nextIndex == output.length()) {
\r
183 tempExtra = UTF16.getCharCount(text.char32At(limit));
\r
184 text.copy(limit, limit+tempExtra, destLimit);
\r
187 UnicodeReplacer r = data.lookupReplacer(c);
\r
189 // Accumulate straight (non-segment) text.
\r
190 UTF16.append(buf, c);
\r
194 // Insert any accumulated straight text.
\r
195 if (buf.length() > 0) {
\r
196 text.replace(destLimit, destLimit, buf.toString());
\r
197 destLimit += buf.length();
\r
201 // Delegate output generation to replacer object
\r
202 int len = r.replace(text, destLimit, destLimit, cursor);
\r
205 oOutput = nextIndex;
\r
207 // Insert any accumulated straight text.
\r
208 if (buf.length() > 0) {
\r
209 text.replace(destLimit, destLimit, buf.toString());
\r
210 destLimit += buf.length();
\r
212 if (oOutput == cursorPos) {
\r
213 // Record the position of the cursor
\r
214 newStart = destLimit - destStart; // relative to start
\r
217 outLen = destLimit - destStart;
\r
219 // Copy new text to start, and delete it
\r
220 text.copy(destStart, destLimit, start);
\r
221 text.replace(tempStart + outLen, destLimit + tempExtra + outLen, "");
\r
223 // Delete the old text (the key)
\r
224 text.replace(start + outLen, limit + outLen, "");
\r
228 // Adjust the cursor for positions outside the key. These
\r
229 // refer to code points rather than code units. If cursorPos
\r
230 // is within the output string, then use newStart, which has
\r
231 // already been set above.
\r
232 if (cursorPos < 0) {
\r
235 // Outside the output string, cursorPos counts code points
\r
236 while (n < 0 && newStart > 0) {
\r
237 newStart -= UTF16.getCharCount(text.char32At(newStart-1));
\r
241 } else if (cursorPos > output.length()) {
\r
242 newStart = start + outLen;
\r
243 int n = cursorPos - output.length();
\r
244 // Outside the output string, cursorPos counts code points
\r
245 while (n > 0 && newStart < text.length()) {
\r
246 newStart += UTF16.getCharCount(text.char32At(newStart));
\r
251 // Cursor is within output string. It has been set up above
\r
252 // to be relative to start.
\r
256 cursor[0] = newStart;
\r
263 * UnicodeReplacer API
\r
265 public String toReplacerPattern(boolean escapeUnprintable) {
\r
266 StringBuffer rule = new StringBuffer();
\r
267 StringBuffer quoteBuf = new StringBuffer();
\r
269 int cursor = cursorPos;
\r
271 // Handle a cursor preceding the output
\r
272 if (hasCursor && cursor < 0) {
\r
273 while (cursor++ < 0) {
\r
274 Utility.appendToRule(rule, '@', true, escapeUnprintable, quoteBuf);
\r
276 // Fall through and append '|' below
\r
279 for (int i=0; i<output.length(); ++i) {
\r
280 if (hasCursor && i == cursor) {
\r
281 Utility.appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);
\r
283 char c = output.charAt(i); // Ok to use 16-bits here
\r
285 UnicodeReplacer r = data.lookupReplacer(c);
\r
287 Utility.appendToRule(rule, c, false, escapeUnprintable, quoteBuf);
\r
289 StringBuffer buf = new StringBuffer(" ");
\r
290 buf.append(r.toReplacerPattern(escapeUnprintable));
\r
292 Utility.appendToRule(rule, buf.toString(),
\r
293 true, escapeUnprintable, quoteBuf);
\r
297 // Handle a cursor after the output. Use > rather than >= because
\r
298 // if cursor == output.length() it is at the end of the output,
\r
299 // which is the default position, so we need not emit it.
\r
300 if (hasCursor && cursor > output.length()) {
\r
301 cursor -= output.length();
\r
302 while (cursor-- > 0) {
\r
303 Utility.appendToRule(rule, '@', true, escapeUnprintable, quoteBuf);
\r
305 Utility.appendToRule(rule, '|', true, escapeUnprintable, quoteBuf);
\r
307 // Flush quoteBuf out to result
\r
308 Utility.appendToRule(rule, -1,
\r
309 true, escapeUnprintable, quoteBuf);
\r
311 return rule.toString();
\r
315 * Union the set of all characters that may be output by this object
\r
316 * into the given set.
\r
317 * @param toUnionTo the set into which to union the output characters
\r
319 public void addReplacementSetTo(UnicodeSet toUnionTo) {
\r
321 for (int i=0; i<output.length(); i+=UTF16.getCharCount(ch)) {
\r
322 ch = UTF16.charAt(output, i);
\r
323 UnicodeReplacer r = data.lookupReplacer(ch);
\r
327 r.addReplacementSetTo(toUnionTo);
\r