]> gitweb.fperrin.net Git - Dictionary.git/blob - jars/icu4j-4_4_2-src/main/classes/core/src/com/ibm/icu/text/FilteredNormalizer2.java
go
[Dictionary.git] / jars / icu4j-4_4_2-src / main / classes / core / src / com / ibm / icu / text / FilteredNormalizer2.java
1 /*\r
2 *******************************************************************************\r
3 *   Copyright (C) 2009-2010, International Business Machines\r
4 *   Corporation and others.  All Rights Reserved.\r
5 *******************************************************************************\r
6 */\r
7 package com.ibm.icu.text;\r
8 \r
9 import java.io.IOException;\r
10 \r
11 /**\r
12  * Normalization filtered by a UnicodeSet.\r
13  * Normalizes portions of the text contained in the filter set and leaves\r
14  * portions not contained in the filter set unchanged.\r
15  * Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).\r
16  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".\r
17  * This class implements all of (and only) the Normalizer2 API.\r
18  * An instance of this class is unmodifiable/immutable.\r
19  * @draft ICU 4.4\r
20  * @provisional This API might change or be removed in a future release.\r
21  * @author Markus W. Scherer\r
22  */\r
23 public class FilteredNormalizer2 extends Normalizer2 {\r
24     /**\r
25      * Constructs a filtered normalizer wrapping any Normalizer2 instance\r
26      * and a filter set.\r
27      * Both are aliased and must not be modified or deleted while this object\r
28      * is used.\r
29      * The filter set should be frozen; otherwise the performance will suffer greatly.\r
30      * @param n2 wrapped Normalizer2 instance\r
31      * @param filterSet UnicodeSet which determines the characters to be normalized\r
32      * @draft ICU 4.4\r
33      * @provisional This API might change or be removed in a future release.\r
34      */\r
35     public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {\r
36         norm2=n2;\r
37         set=filterSet;\r
38     }\r
39 \r
40     /** {@inheritDoc}\r
41      * @draft ICU 4.4\r
42      * @provisional This API might change or be removed in a future release.\r
43      */\r
44     @Override\r
45     public StringBuilder normalize(CharSequence src, StringBuilder dest) {\r
46         if(dest==src) {\r
47             throw new IllegalArgumentException();\r
48         }\r
49         dest.setLength(0);\r
50         normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);\r
51         return dest;\r
52     }\r
53     /** {@inheritDoc}\r
54      * @internal ICU 4.4 TODO: propose for 4.6\r
55      * @provisional This API might change or be removed in a future release.\r
56      */\r
57     public Appendable normalize(CharSequence src, Appendable dest) {\r
58         if(dest==src) {\r
59             throw new IllegalArgumentException();\r
60         }\r
61         return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);\r
62     }\r
63 \r
64     /** {@inheritDoc}\r
65      * @draft ICU 4.4\r
66      * @provisional This API might change or be removed in a future release.\r
67      */\r
68     @Override\r
69     public StringBuilder normalizeSecondAndAppend(\r
70             StringBuilder first, CharSequence second) {\r
71         return normalizeSecondAndAppend(first, second, true);\r
72     }\r
73     /** {@inheritDoc}\r
74      * @draft ICU 4.4\r
75      * @provisional This API might change or be removed in a future release.\r
76      */\r
77     @Override\r
78     public StringBuilder append(StringBuilder first, CharSequence second) {\r
79         return normalizeSecondAndAppend(first, second, false);\r
80     }\r
81 \r
82     /** {@inheritDoc}\r
83      * @draft ICU 4.4\r
84      * @provisional This API might change or be removed in a future release.\r
85      */\r
86     @Override\r
87     public boolean isNormalized(CharSequence s) {\r
88         UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;\r
89         for(int prevSpanLimit=0; prevSpanLimit<s.length();) {\r
90             int spanLimit=set.span(s, prevSpanLimit, spanCondition);\r
91             if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {\r
92                 spanCondition=UnicodeSet.SpanCondition.SIMPLE;\r
93             } else {\r
94                 if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {\r
95                     return false;\r
96                 }\r
97                 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;\r
98             }\r
99             prevSpanLimit=spanLimit;\r
100         }\r
101         return true;\r
102     }\r
103     /** {@inheritDoc}\r
104      * @draft ICU 4.4\r
105      * @provisional This API might change or be removed in a future release.\r
106      */\r
107     @Override\r
108     public Normalizer.QuickCheckResult quickCheck(CharSequence s) {\r
109         Normalizer.QuickCheckResult result=Normalizer.YES;\r
110         UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;\r
111         for(int prevSpanLimit=0; prevSpanLimit<s.length();) {\r
112             int spanLimit=set.span(s, prevSpanLimit, spanCondition);\r
113             if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {\r
114                 spanCondition=UnicodeSet.SpanCondition.SIMPLE;\r
115             } else {\r
116                 Normalizer.QuickCheckResult qcResult=\r
117                     norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit));\r
118                 if(qcResult==Normalizer.NO) {\r
119                     return qcResult;\r
120                 } else if(qcResult==Normalizer.MAYBE) {\r
121                     result=qcResult;\r
122                 }\r
123                 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;\r
124             }\r
125             prevSpanLimit=spanLimit;\r
126         }\r
127         return result;\r
128     }\r
129     /** {@inheritDoc}\r
130      * @draft ICU 4.4\r
131      * @provisional This API might change or be removed in a future release.\r
132      */\r
133     @Override\r
134     public int spanQuickCheckYes(CharSequence s) {\r
135         UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;\r
136         for(int prevSpanLimit=0; prevSpanLimit<s.length();) {\r
137             int spanLimit=set.span(s, prevSpanLimit, spanCondition);\r
138             if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {\r
139                 spanCondition=UnicodeSet.SpanCondition.SIMPLE;\r
140             } else {\r
141                 int yesLimit=\r
142                     prevSpanLimit+\r
143                     norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));\r
144                 if(yesLimit<spanLimit) {\r
145                     return yesLimit;\r
146                 }\r
147                 spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;\r
148             }\r
149             prevSpanLimit=spanLimit;\r
150         }\r
151         return s.length();\r
152     }\r
153 \r
154     /** {@inheritDoc}\r
155      * @draft ICU 4.4\r
156      * @provisional This API might change or be removed in a future release.\r
157      */\r
158     @Override\r
159     public boolean hasBoundaryBefore(int c) {\r
160         return !set.contains(c) || norm2.hasBoundaryBefore(c);\r
161     }\r
162 \r
163     /** {@inheritDoc}\r
164      * @draft ICU 4.4\r
165      * @provisional This API might change or be removed in a future release.\r
166      */\r
167     @Override\r
168     public boolean hasBoundaryAfter(int c) {\r
169         return !set.contains(c) || norm2.hasBoundaryAfter(c);\r
170     }\r
171 \r
172     /** {@inheritDoc}\r
173      * @draft ICU 4.4\r
174      * @provisional This API might change or be removed in a future release.\r
175      */\r
176     @Override\r
177     public boolean isInert(int c) {\r
178         return !set.contains(c) || norm2.isInert(c);\r
179     }\r
180 \r
181     // Internal: No argument checking, and appends to dest.\r
182     // Pass as input spanCondition the one that is likely to yield a non-zero\r
183     // span length at the start of src.\r
184     // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,\r
185     // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src\r
186     // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after\r
187     // an in-filter prefix.\r
188     private Appendable normalize(CharSequence src, Appendable dest,\r
189                                  UnicodeSet.SpanCondition spanCondition) {\r
190         // Don't throw away destination buffer between iterations.\r
191         StringBuilder tempDest=new StringBuilder();\r
192         try {\r
193             for(int prevSpanLimit=0; prevSpanLimit<src.length();) {\r
194                 int spanLimit=set.span(src, prevSpanLimit, spanCondition);\r
195                 int spanLength=spanLimit-prevSpanLimit;\r
196                 if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {\r
197                     if(spanLength!=0) {\r
198                         dest.append(src, prevSpanLimit, spanLimit);\r
199                     }\r
200                     spanCondition=UnicodeSet.SpanCondition.SIMPLE;\r
201                 } else {\r
202                     if(spanLength!=0) {\r
203                         // Not norm2.normalizeSecondAndAppend() because we do not want\r
204                         // to modify the non-filter part of dest.\r
205                         dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));\r
206                     }\r
207                     spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;\r
208                 }\r
209                 prevSpanLimit=spanLimit;\r
210             }\r
211         } catch(IOException e) {\r
212             throw new RuntimeException(e);\r
213         }\r
214         return dest;\r
215     }\r
216 \r
217     private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,\r
218                                                    boolean doNormalize) {\r
219         if(first==second) {\r
220             throw new IllegalArgumentException();\r
221         }\r
222         if(first.length()==0) {\r
223             if(doNormalize) {\r
224                 return normalize(second, first);\r
225             } else {\r
226                 return first.append(second);\r
227             }\r
228         }\r
229         // merge the in-filter suffix of the first string with the in-filter prefix of the second\r
230         int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);\r
231         if(prefixLimit!=0) {\r
232             CharSequence prefix=second.subSequence(0, prefixLimit);\r
233             int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);\r
234             if(suffixStart==0) {\r
235                 if(doNormalize) {\r
236                     norm2.normalizeSecondAndAppend(first, prefix);\r
237                 } else {\r
238                     norm2.append(first, prefix);\r
239                 }\r
240             } else {\r
241                 StringBuilder middle=new StringBuilder(first.subSequence(suffixStart, 0x7fffffff));\r
242                 if(doNormalize) {\r
243                     norm2.normalizeSecondAndAppend(middle, prefix);\r
244                 } else {\r
245                     norm2.append(middle, prefix);\r
246                 }\r
247                 first.delete(suffixStart, 0x7fffffff).append(middle);\r
248             }\r
249         }\r
250         if(prefixLimit<second.length()) {\r
251             CharSequence rest=second.subSequence(prefixLimit, 0x7fffffff);\r
252             if(doNormalize) {\r
253                 normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);\r
254             } else {\r
255                 first.append(rest);\r
256             }\r
257         }\r
258         return first;\r
259     }\r
260 \r
261     private Normalizer2 norm2;\r
262     private UnicodeSet set;\r
263 };\r