/*
 *******************************************************************************
 * Copyright (C) 2009-2013, International Business Machines
 * Corporation and others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.io.IOException;
12 * Normalization filtered by a UnicodeSet.
13 * Normalizes portions of the text contained in the filter set and leaves
14 * portions not contained in the filter set unchanged.
15 * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
16 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
17 * This class implements all of (and only) the Normalizer2 API.
18 * An instance of this class is unmodifiable/immutable.
20 * @author Markus W. Scherer
22 public class FilteredNormalizer2 extends Normalizer2 {
24 * Constructs a filtered normalizer wrapping any Normalizer2 instance
26 * Both are aliased and must not be modified or deleted while this object
28 * The filter set should be frozen; otherwise the performance will suffer greatly.
29 * @param n2 wrapped Normalizer2 instance
30 * @param filterSet UnicodeSet which determines the characters to be normalized
33 public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
43 public StringBuilder normalize(CharSequence src, StringBuilder dest) {
45 throw new IllegalArgumentException();
48 normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
56 public Appendable normalize(CharSequence src, Appendable dest) {
58 throw new IllegalArgumentException();
60 return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
68 public StringBuilder normalizeSecondAndAppend(
69 StringBuilder first, CharSequence second) {
70 return normalizeSecondAndAppend(first, second, true);
77 public StringBuilder append(StringBuilder first, CharSequence second) {
78 return normalizeSecondAndAppend(first, second, false);
86 public String getDecomposition(int c) {
87 return set.contains(c) ? norm2.getDecomposition(c) : null;
95 public String getRawDecomposition(int c) {
96 return set.contains(c) ? norm2.getRawDecomposition(c) : null;
104 public int composePair(int a, int b) {
105 return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : -1;
113 public int getCombiningClass(int c) {
114 return set.contains(c) ? norm2.getCombiningClass(c) : 0;
122 public boolean isNormalized(CharSequence s) {
123 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
124 for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
125 int spanLimit=set.span(s, prevSpanLimit, spanCondition);
126 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
127 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
129 if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
132 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
134 prevSpanLimit=spanLimit;
144 public Normalizer.QuickCheckResult quickCheck(CharSequence s) {
145 Normalizer.QuickCheckResult result=Normalizer.YES;
146 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
147 for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
148 int spanLimit=set.span(s, prevSpanLimit, spanCondition);
149 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
150 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
152 Normalizer.QuickCheckResult qcResult=
153 norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit));
154 if(qcResult==Normalizer.NO) {
156 } else if(qcResult==Normalizer.MAYBE) {
159 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
161 prevSpanLimit=spanLimit;
170 public int spanQuickCheckYes(CharSequence s) {
171 UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
172 for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
173 int spanLimit=set.span(s, prevSpanLimit, spanCondition);
174 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
175 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
179 norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
180 if(yesLimit<spanLimit) {
183 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
185 prevSpanLimit=spanLimit;
195 public boolean hasBoundaryBefore(int c) {
196 return !set.contains(c) || norm2.hasBoundaryBefore(c);
204 public boolean hasBoundaryAfter(int c) {
205 return !set.contains(c) || norm2.hasBoundaryAfter(c);
213 public boolean isInert(int c) {
214 return !set.contains(c) || norm2.isInert(c);
217 // Internal: No argument checking, and appends to dest.
218 // Pass as input spanCondition the one that is likely to yield a non-zero
219 // span length at the start of src.
220 // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
221 // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
222 // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
223 // an in-filter prefix.
224 private Appendable normalize(CharSequence src, Appendable dest,
225 UnicodeSet.SpanCondition spanCondition) {
226 // Don't throw away destination buffer between iterations.
227 StringBuilder tempDest=new StringBuilder();
229 for(int prevSpanLimit=0; prevSpanLimit<src.length();) {
230 int spanLimit=set.span(src, prevSpanLimit, spanCondition);
231 int spanLength=spanLimit-prevSpanLimit;
232 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
234 dest.append(src, prevSpanLimit, spanLimit);
236 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
239 // Not norm2.normalizeSecondAndAppend() because we do not want
240 // to modify the non-filter part of dest.
241 dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
243 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
245 prevSpanLimit=spanLimit;
247 } catch(IOException e) {
248 throw new RuntimeException(e);
253 private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
254 boolean doNormalize) {
256 throw new IllegalArgumentException();
258 if(first.length()==0) {
260 return normalize(second, first);
262 return first.append(second);
265 // merge the in-filter suffix of the first string with the in-filter prefix of the second
266 int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
268 CharSequence prefix=second.subSequence(0, prefixLimit);
269 int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
272 norm2.normalizeSecondAndAppend(first, prefix);
274 norm2.append(first, prefix);
277 StringBuilder middle=new StringBuilder(
278 first.subSequence(suffixStart, first.length()));
280 norm2.normalizeSecondAndAppend(middle, prefix);
282 norm2.append(middle, prefix);
284 first.delete(suffixStart, 0x7fffffff).append(middle);
287 if(prefixLimit<second.length()) {
288 CharSequence rest=second.subSequence(prefixLimit, second.length());
290 normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
298 private Normalizer2 norm2;
299 private UnicodeSet set;