2 *******************************************************************************
\r
3 * Copyright (C) 2009-2010, International Business Machines
\r
4 * Corporation and others. All Rights Reserved.
\r
5 *******************************************************************************
\r
7 package com.ibm.icu.text;
\r
9 import java.io.IOException;
\r
12 * Normalization filtered by a UnicodeSet.
\r
13 * Normalizes portions of the text contained in the filter set and leaves
\r
14 * portions not contained in the filter set unchanged.
\r
15 * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
\r
16 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
\r
17 * This class implements all of (and only) the Normalizer2 API.
\r
18 * An instance of this class is unmodifiable/immutable.
\r
20 * @provisional This API might change or be removed in a future release.
\r
21 * @author Markus W. Scherer
\r
23 public class FilteredNormalizer2 extends Normalizer2 {
\r
25 * Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
\r
27 * Both are aliased and must not be modified or deleted while this object is in use.
\r
29 * The filter set should be frozen; otherwise the performance will suffer greatly.
\r
30 * @param n2 wrapped Normalizer2 instance
\r
31 * @param filterSet UnicodeSet which determines the characters to be normalized
\r
33 * @provisional This API might change or be removed in a future release.
\r
// Constructor taking the wrapped normalizer (n2) and the filter set (filterSet).
// NOTE(review): the constructor body is not visible in this listing; presumably it
// stores the arguments in the norm2 and set fields declared at the end of the class -- confirm.
35 public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
\r
42 * @provisional This API might change or be removed in a future release.
\r
// StringBuilder variant of normalize(): hands src and dest to the private
// three-argument normalize(), starting the scan with SpanCondition.SIMPLE.
// NOTE(review): the return statement is not visible in this listing.
45 public StringBuilder normalize(CharSequence src, StringBuilder dest) {
\r
// NOTE(review): the condition guarding this throw is not visible in this listing.
47 throw new IllegalArgumentException();
\r
50 normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
\r
54 * @internal ICU 4.4 TODO: propose for 4.6
\r
55 * @provisional This API might change or be removed in a future release.
\r
// Appendable variant of normalize(): returns whatever the private three-argument
// normalize() returns, starting the scan with SpanCondition.SIMPLE.
57 public Appendable normalize(CharSequence src, Appendable dest) {
\r
// NOTE(review): the condition guarding this throw is not visible in this listing.
59 throw new IllegalArgumentException();
\r
61 return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
\r
66 * @provisional This API might change or be removed in a future release.
\r
// Public entry point: forwards to the private three-argument overload with
// doNormalize=true and returns its result.
69 public StringBuilder normalizeSecondAndAppend(
\r
70 StringBuilder first, CharSequence second) {
\r
71 return normalizeSecondAndAppend(first, second, true);
\r
75 * @provisional This API might change or be removed in a future release.
\r
// Public entry point: forwards to the private three-argument
// normalizeSecondAndAppend() with doNormalize=false and returns its result.
78 public StringBuilder append(StringBuilder first, CharSequence second) {
\r
79 return normalizeSecondAndAppend(first, second, false);
\r
84 * @provisional This API might change or be removed in a future release.
\r
// Scans s span by span: set.span() is called from prevSpanLimit with the current
// span condition, and norm2.isNormalized() is applied to the sub-sequence
// [prevSpanLimit, spanLimit). The span condition is switched between SIMPLE and
// NOT_CONTAINED as the scan advances.
// NOTE(review): the else branch and the return statements are not visible in this listing.
87 public boolean isNormalized(CharSequence s) {
\r
// The scan starts with the SIMPLE span condition.
88 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
\r
// No increment clause: prevSpanLimit is advanced at the bottom of the loop body.
89 for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
\r
90 int spanLimit=set.span(s, prevSpanLimit, spanCondition);
\r
91 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
\r
// After a NOT_CONTAINED span, the next span uses SIMPLE again.
92 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
\r
// Only this slice of s is handed to the wrapped normalizer.
94 if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
\r
97 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
\r
99 prevSpanLimit=spanLimit;
\r
105 * @provisional This API might change or be removed in a future release.
\r
// Quick-check over s, span by span. result starts as Normalizer.YES; for a span
// handed to the wrapped normalizer, the outcome of norm2.quickCheck() is compared
// against Normalizer.NO and Normalizer.MAYBE.
// NOTE(review): the statements executed for the NO and MAYBE outcomes, the else
// structure and the final return are not visible in this listing.
108 public Normalizer.QuickCheckResult quickCheck(CharSequence s) {
\r
109 Normalizer.QuickCheckResult result=Normalizer.YES;
\r
// The scan starts with the SIMPLE span condition.
110 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
\r
// No increment clause: prevSpanLimit is advanced at the bottom of the loop body.
111 for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
\r
112 int spanLimit=set.span(s, prevSpanLimit, spanCondition);
\r
113 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
\r
// After a NOT_CONTAINED span, the next span uses SIMPLE again.
114 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
\r
// Only the slice [prevSpanLimit, spanLimit) is handed to the wrapped normalizer.
116 Normalizer.QuickCheckResult qcResult=
\r
117 norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit));
\r
118 if(qcResult==Normalizer.NO) {
\r
120 } else if(qcResult==Normalizer.MAYBE) {
\r
123 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
\r
125 prevSpanLimit=spanLimit;
\r
131 * @provisional This API might change or be removed in a future release.
\r
// Scans s span by span, calling norm2.spanQuickCheckYes() on the sub-sequence
// [prevSpanLimit, spanLimit) and comparing the resulting yesLimit with spanLimit.
// NOTE(review): the declaration of yesLimit (the start of the statement that ends
// on the norm2.spanQuickCheckYes line), the else structure and the return
// statements are not visible in this listing.
134 public int spanQuickCheckYes(CharSequence s) {
\r
// The scan starts with the SIMPLE span condition.
135 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
\r
// No increment clause: prevSpanLimit is advanced at the bottom of the loop body.
136 for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
\r
137 int spanLimit=set.span(s, prevSpanLimit, spanCondition);
\r
138 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
\r
// After a NOT_CONTAINED span, the next span uses SIMPLE again.
139 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
\r
// Tail of a statement whose first line is missing from this listing.
143 norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
\r
144 if(yesLimit<spanLimit) {
\r
147 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
\r
149 prevSpanLimit=spanLimit;
\r
156 * @provisional This API might change or be removed in a future release.
\r
// Returns true when c is not in the filter set; otherwise returns what the
// wrapped normalizer reports for c.
159 public boolean hasBoundaryBefore(int c) {
\r
160 return !set.contains(c) || norm2.hasBoundaryBefore(c);
\r
165 * @provisional This API might change or be removed in a future release.
\r
// Returns true when c is not in the filter set; otherwise returns what the
// wrapped normalizer reports for c.
168 public boolean hasBoundaryAfter(int c) {
\r
169 return !set.contains(c) || norm2.hasBoundaryAfter(c);
\r
174 * @provisional This API might change or be removed in a future release.
\r
// Returns true when c is not in the filter set; otherwise returns what the
// wrapped normalizer reports for c.
177 public boolean isInert(int c) {
\r
178 return !set.contains(c) || norm2.isInert(c);
\r
181 // Internal: No argument checking, and appends to dest.
\r
182 // Pass as input spanCondition the one that is likely to yield a non-zero
\r
183 // span length at the start of src.
\r
184 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
\r
185 // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
\r
186 // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
\r
187 // an in-filter prefix.
\r
// Core loop shared by the public normalize() overloads and by the private
// normalizeSecondAndAppend(). Walks src span by span starting with the supplied
// spanCondition: a non-empty span scanned with NOT_CONTAINED is appended to dest
// unchanged via dest.append(src, start, limit); for the other span kind the
// sub-sequence is run through norm2.normalize() into tempDest and that result is
// appended. An IOException caught here is rethrown wrapped in a RuntimeException.
// NOTE(review): the try statement, the else structure and the return statement
// are not visible in this listing.
188 private Appendable normalize(CharSequence src, Appendable dest,
\r
189 UnicodeSet.SpanCondition spanCondition) {
\r
190 // Don't throw away destination buffer between iterations.
\r
191 StringBuilder tempDest=new StringBuilder();
\r
// No increment clause: prevSpanLimit is advanced at the bottom of the loop body.
193 for(int prevSpanLimit=0; prevSpanLimit<src.length();) {
\r
194 int spanLimit=set.span(src, prevSpanLimit, spanCondition);
\r
195 int spanLength=spanLimit-prevSpanLimit;
\r
196 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
\r
// Empty spans are skipped; only the span condition is switched.
197 if(spanLength!=0) {
\r
198 dest.append(src, prevSpanLimit, spanLimit);
\r
200 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
\r
202 if(spanLength!=0) {
\r
203 // Not norm2.normalizeSecondAndAppend() because we do not want
\r
204 // to modify the non-filter part of dest.
\r
205 dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
\r
207 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
\r
209 prevSpanLimit=spanLimit;
\r
// Appendable.append() declares IOException; it is converted to an unchecked exception.
211 } catch(IOException e) {
\r
212 throw new RuntimeException(e);
\r
// Shared implementation behind the public normalizeSecondAndAppend()
// (doNormalize=true) and append() (doNormalize=false).
// Visible steps: reject first==second; handle an empty first; merge the in-filter
// suffix of first with the in-filter prefix of second through norm2; then deal
// with the remainder of second.
// NOTE(review): the conditions choosing between the paired calls below (presumably
// if(doNormalize)/else), the else branches and the final return are not visible
// in this listing -- confirm against the original source.
217 private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
\r
218 boolean doNormalize) {
\r
// The two arguments must be different objects.
219 if(first==second) {
\r
220 throw new IllegalArgumentException();
\r
// Empty first: one of the two returns below is taken.
222 if(first.length()==0) {
\r
224 return normalize(second, first);
\r
226 return first.append(second);
\r
229 // merge the in-filter suffix of the first string with the in-filter prefix of the second
\r
// prefixLimit is the limit of the span found at the start of second with SIMPLE.
230 int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
\r
231 if(prefixLimit!=0) {
\r
232 CharSequence prefix=second.subSequence(0, prefixLimit);
\r
// NOTE(review): 0x7fffffff is passed as the starting index for spanBack; presumably
// it stands for "from the end of first" -- confirm against UnicodeSet.spanBack.
233 int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
\r
// suffixStart==0: the back-span reached the start of first, so norm2 works on first directly.
234 if(suffixStart==0) {
\r
236 norm2.normalizeSecondAndAppend(first, prefix);
\r
238 norm2.append(first, prefix);
\r
// Otherwise the suffix is copied into a separate builder so norm2 only touches that part.
// NOTE(review): 0x7fffffff is used as the end index of subSequence here;
// CharSequence.subSequence is documented to throw IndexOutOfBoundsException when
// end > length(), so first.length() looks like the intended value -- confirm.
241 StringBuilder middle=new StringBuilder(first.subSequence(suffixStart, 0x7fffffff));
\r
243 norm2.normalizeSecondAndAppend(middle, prefix);
\r
245 norm2.append(middle, prefix);
\r
// Replace the old suffix of first with the merged text.
247 first.delete(suffixStart, 0x7fffffff).append(middle);
\r
// Anything left in second after the prefix is handled separately.
250 if(prefixLimit<second.length()) {
\r
// NOTE(review): same end-index concern as above; second.length() looks like
// the intended value -- confirm.
251 CharSequence rest=second.subSequence(prefixLimit, 0x7fffffff);
\r
// The remainder is scanned starting with NOT_CONTAINED, as it follows the prefix found with SIMPLE.
253 normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
\r
255 first.append(rest);
\r
// The wrapped normalizer that the methods of this class delegate to.
261 private Normalizer2 norm2;
\r
// The filter set consulted via span(), spanBack() and contains().
262 private UnicodeSet set;
\r