]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-52_1/main/classes/core/src/com/ibm/icu/text/FilteredNormalizer2.java
Added flags.
[Dictionary.git] / jars / icu4j-52_1 / main / classes / core / src / com / ibm / icu / text / FilteredNormalizer2.java
1 /*
2 *******************************************************************************
3 *   Copyright (C) 2009-2013, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 */
7 package com.ibm.icu.text;
8
9 import java.io.IOException;
10
11 /**
12  * Normalization filtered by a UnicodeSet.
13  * Normalizes portions of the text contained in the filter set and leaves
14  * portions not contained in the filter set unchanged.
15  * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
16  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
17  * This class implements all of (and only) the Normalizer2 API.
18  * An instance of this class is unmodifiable/immutable.
19  * @stable ICU 4.4
20  * @author Markus W. Scherer
21  */
22 public class FilteredNormalizer2 extends Normalizer2 {
23     /**
24      * Constructs a filtered normalizer wrapping any Normalizer2 instance
25      * and a filter set.
26      * Both are aliased and must not be modified or deleted while this object
27      * is used.
28      * The filter set should be frozen; otherwise the performance will suffer greatly.
29      * @param n2 wrapped Normalizer2 instance
30      * @param filterSet UnicodeSet which determines the characters to be normalized
31      * @stable ICU 4.4
32      */
33     public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
34         norm2=n2;
35         set=filterSet;
36     }
37
38     /**
39      * {@inheritDoc}
40      * @stable ICU 4.4
41      */
42     @Override
43     public StringBuilder normalize(CharSequence src, StringBuilder dest) {
44         if(dest==src) {
45             throw new IllegalArgumentException();
46         }
47         dest.setLength(0);
48         normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
49         return dest;
50     }
51     /**
52      * {@inheritDoc}
53      * @stable ICU 4.6
54      */
55     @Override
56     public Appendable normalize(CharSequence src, Appendable dest) {
57         if(dest==src) {
58             throw new IllegalArgumentException();
59         }
60         return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
61     }
62
63     /**
64      * {@inheritDoc}
65      * @stable ICU 4.4
66      */
67     @Override
68     public StringBuilder normalizeSecondAndAppend(
69             StringBuilder first, CharSequence second) {
70         return normalizeSecondAndAppend(first, second, true);
71     }
72     /**
73      * {@inheritDoc}
74      * @stable ICU 4.4
75      */
76     @Override
77     public StringBuilder append(StringBuilder first, CharSequence second) {
78         return normalizeSecondAndAppend(first, second, false);
79     }
80
81     /**
82      * {@inheritDoc}
83      * @stable ICU 4.6
84      */
85     @Override
86     public String getDecomposition(int c) {
87         return set.contains(c) ? norm2.getDecomposition(c) : null;
88     }
89
90     /**
91      * {@inheritDoc}
92      * @stable ICU 49
93      */
94     @Override
95     public String getRawDecomposition(int c) {
96         return set.contains(c) ? norm2.getRawDecomposition(c) : null;
97     }
98
99     /**
100      * {@inheritDoc}
101      * @stable ICU 49
102      */
103     @Override
104     public int composePair(int a, int b) {
105         return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : -1;
106     }
107
108     /**
109      * {@inheritDoc}
110      * @stable ICU 49
111      */
112     @Override
113     public int getCombiningClass(int c) {
114         return set.contains(c) ? norm2.getCombiningClass(c) : 0;
115     }
116
117     /**
118      * {@inheritDoc}
119      * @stable ICU 4.4
120      */
121     @Override
122     public boolean isNormalized(CharSequence s) {
123         UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
124         for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
125             int spanLimit=set.span(s, prevSpanLimit, spanCondition);
126             if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
127                 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
128             } else {
129                 if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
130                     return false;
131                 }
132                 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
133             }
134             prevSpanLimit=spanLimit;
135         }
136         return true;
137     }
138
139     /**
140      * {@inheritDoc}
141      * @stable ICU 4.4
142      */
143     @Override
144     public Normalizer.QuickCheckResult quickCheck(CharSequence s) {
145         Normalizer.QuickCheckResult result=Normalizer.YES;
146         UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
147         for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
148             int spanLimit=set.span(s, prevSpanLimit, spanCondition);
149             if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
150                 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
151             } else {
152                 Normalizer.QuickCheckResult qcResult=
153                     norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit));
154                 if(qcResult==Normalizer.NO) {
155                     return qcResult;
156                 } else if(qcResult==Normalizer.MAYBE) {
157                     result=qcResult;
158                 }
159                 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
160             }
161             prevSpanLimit=spanLimit;
162         }
163         return result;
164     }
165     /**
166      * {@inheritDoc}
167      * @stable ICU 4.4
168      */
169     @Override
170     public int spanQuickCheckYes(CharSequence s) {
171         UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
172         for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
173             int spanLimit=set.span(s, prevSpanLimit, spanCondition);
174             if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
175                 spanCondition=UnicodeSet.SpanCondition.SIMPLE;
176             } else {
177                 int yesLimit=
178                     prevSpanLimit+
179                     norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
180                 if(yesLimit<spanLimit) {
181                     return yesLimit;
182                 }
183                 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
184             }
185             prevSpanLimit=spanLimit;
186         }
187         return s.length();
188     }
189
190     /**
191      * {@inheritDoc}
192      * @stable ICU 4.4
193      */
194     @Override
195     public boolean hasBoundaryBefore(int c) {
196         return !set.contains(c) || norm2.hasBoundaryBefore(c);
197     }
198
199     /**
200      * {@inheritDoc}
201      * @stable ICU 4.4
202      */
203     @Override
204     public boolean hasBoundaryAfter(int c) {
205         return !set.contains(c) || norm2.hasBoundaryAfter(c);
206     }
207
208     /**
209      * {@inheritDoc}
210      * @stable ICU 4.4
211      */
212     @Override
213     public boolean isInert(int c) {
214         return !set.contains(c) || norm2.isInert(c);
215     }
216
217     // Internal: No argument checking, and appends to dest.
218     // Pass as input spanCondition the one that is likely to yield a non-zero
219     // span length at the start of src.
220     // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
221     // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
222     // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
223     // an in-filter prefix.
224     private Appendable normalize(CharSequence src, Appendable dest,
225                                  UnicodeSet.SpanCondition spanCondition) {
226         // Don't throw away destination buffer between iterations.
227         StringBuilder tempDest=new StringBuilder();
228         try {
229             for(int prevSpanLimit=0; prevSpanLimit<src.length();) {
230                 int spanLimit=set.span(src, prevSpanLimit, spanCondition);
231                 int spanLength=spanLimit-prevSpanLimit;
232                 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
233                     if(spanLength!=0) {
234                         dest.append(src, prevSpanLimit, spanLimit);
235                     }
236                     spanCondition=UnicodeSet.SpanCondition.SIMPLE;
237                 } else {
238                     if(spanLength!=0) {
239                         // Not norm2.normalizeSecondAndAppend() because we do not want
240                         // to modify the non-filter part of dest.
241                         dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
242                     }
243                     spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
244                 }
245                 prevSpanLimit=spanLimit;
246             }
247         } catch(IOException e) {
248             throw new RuntimeException(e);
249         }
250         return dest;
251     }
252
253     private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
254                                                    boolean doNormalize) {
255         if(first==second) {
256             throw new IllegalArgumentException();
257         }
258         if(first.length()==0) {
259             if(doNormalize) {
260                 return normalize(second, first);
261             } else {
262                 return first.append(second);
263             }
264         }
265         // merge the in-filter suffix of the first string with the in-filter prefix of the second
266         int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
267         if(prefixLimit!=0) {
268             CharSequence prefix=second.subSequence(0, prefixLimit);
269             int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
270             if(suffixStart==0) {
271                 if(doNormalize) {
272                     norm2.normalizeSecondAndAppend(first, prefix);
273                 } else {
274                     norm2.append(first, prefix);
275                 }
276             } else {
277                 StringBuilder middle=new StringBuilder(
278                         first.subSequence(suffixStart, first.length()));
279                 if(doNormalize) {
280                     norm2.normalizeSecondAndAppend(middle, prefix);
281                 } else {
282                     norm2.append(middle, prefix);
283                 }
284                 first.delete(suffixStart, 0x7fffffff).append(middle);
285             }
286         }
287         if(prefixLimit<second.length()) {
288             CharSequence rest=second.subSequence(prefixLimit, second.length());
289             if(doNormalize) {
290                 normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
291             } else {
292                 first.append(rest);
293             }
294         }
295         return first;
296     }
297
298     private Normalizer2 norm2;
299     private UnicodeSet set;
300 };