/*
 *******************************************************************************
 * Copyright (C) 2012-2013, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.current32;
import static com.ibm.icu.impl.CharacterIteration.next32;

import java.io.IOException;
import java.text.CharacterIterator;
import java.util.Stack;

import com.ibm.icu.impl.Assert;

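/**
 * Dictionary-based break engine for Chinese, Japanese, and Korean text.
 * Candidate words are looked up in a dictionary, and the segmentation with
 * the lowest total cost is selected by dynamic programming.
 */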
class CjkBreakEngine implements LanguageBreakEngine {
    private static final UnicodeSet fHangulWordSet = new UnicodeSet();
    private static final UnicodeSet fHanWordSet = new UnicodeSet();
    private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
    private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
    static {
        fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
        fHanWordSet.applyPattern("[:Han:]");
        fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
        fHiraganaWordSet.applyPattern("[:Hiragana:]");

        // freeze them all
        fHangulWordSet.freeze();
        fHanWordSet.freeze();
        fKatakanaWordSet.freeze();
        fHiraganaWordSet.freeze();
    }

    private final UnicodeSet fWordSet;
    private DictionaryMatcher fDictionary = null;

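    /**
     * Constructs the engine and loads the dictionary used for matching.
     *
     * @param korean if true, handle only Hangul syllables; otherwise handle
     *               Han, Hiragana, and Katakana (plus prolonged sound marks).
     */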
    public CjkBreakEngine(boolean korean) throws IOException {
        fDictionary = DictionaryData.loadDictionaryFor("Hira");
        if (korean) {
            fWordSet = fHangulWordSet;
        } else {
            fWordSet = new UnicodeSet();
            fWordSet.addAll(fHanWordSet);
            fWordSet.addAll(fKatakanaWordSet);
            fWordSet.addAll(fHiraganaWordSet);
            fWordSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
            fWordSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
        }
    }

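    /**
     * Returns true if this engine handles the given character for the given
     * break type, i.e. word breaks over characters in fWordSet.
     */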
    public boolean handles(int c, int breakType) {
        return (breakType == BreakIterator.KIND_WORD) &&
                (fWordSet.contains(c));
    }

    private static final int kMaxKatakanaLength = 8;
    private static final int kMaxKatakanaGroupLength = 20;
    private static final int maxSnlp = 255;
    private static final int kint32max = Integer.MAX_VALUE;
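    // Cost assigned to a run of Katakana of the given length, on the same
    // scale as the dictionary costs; very short and very long runs are
    // penalized so that mid-length runs are preferred as words.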
    private static int getKatakanaCost(int wordlength) {
        int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
        return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
    }

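    // True for code points in the Katakana block (excluding the middle dot
    // U+30FB) and in the halfwidth Katakana range.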
    private static boolean isKatakana(int value) {
        return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
                (value >= 0xFF66 && value <= 0xFF9F);
    }

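    /**
     * Finds word boundaries in [startPos, endPos) of inText and pushes them
     * onto foundBreaks. The text is NFKC-normalized if necessary, candidate
     * words are matched against the dictionary, and the segmentation with
     * the lowest total cost is selected by dynamic programming.
     */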
    public int findBreaks(CharacterIterator inText, int startPos, int endPos,
            boolean reverse, int breakType, Stack<Integer> foundBreaks) {
        if (startPos >= endPos) {
            return 0;
        }

        inText.setIndex(startPos);

        int inputLength = endPos - startPos;
        int[] charPositions = new int[inputLength + 1];
        StringBuffer s = new StringBuffer("");
        inText.setIndex(startPos);
        while (inText.getIndex() < endPos) {
            s.append(inText.current());
            inText.next();
        }
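        // Dictionary matching below operates on NFKC-normalized text, so
        // normalize the input if it is not already normalized. charPositions
        // maps each character index in the working string back to an offset
        // in the original input.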
        String prenormstr = s.toString();
        boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
                               Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
        CharacterIterator text = inText;
        int numChars = 0;
        if (isNormalized) {
            int index = 0;
            charPositions[0] = 0;
            while (index < prenormstr.length()) {
                int codepoint = prenormstr.codePointAt(index);
                index += Character.charCount(codepoint);
                numChars++;
                charPositions[numChars] = index;
            }
        } else {
            String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
            text = new java.text.StringCharacterIterator(normStr);
            charPositions = new int[normStr.length() + 1];
            Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
            int index = 0;
            charPositions[0] = 0;
            while (index < normalizer.endIndex()) {
                normalizer.next();
                numChars++;
                index = normalizer.getIndex();
                charPositions[numChars] = index;
            }
        }

        // From here on out, do the algorithm. Note that our indices
        // refer to indices within the normalized string.
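        // bestSnlp[i] is the lowest accumulated cost ("sum of negative log
        // probabilities") of any segmentation of the first i characters;
        // prev[i] is the start index of the last word in that segmentation.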
        int[] bestSnlp = new int[numChars + 1];
        bestSnlp[0] = 0;
        for (int i = 1; i <= numChars; i++) {
            bestSnlp[i] = kint32max;
        }

        int[] prev = new int[numChars + 1];
        for (int i = 0; i <= numChars; i++) {
            prev[i] = -1;
        }

        final int maxWordSize = 20;
        int values[] = new int[numChars];
        int lengths[] = new int[numChars];
        // dynamic programming to find the best segmentation
        boolean is_prev_katakana = false;
        for (int i = 0; i < numChars; i++) {
            text.setIndex(i);
            if (bestSnlp[i] == kint32max) {
                continue;
            }

            int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
            int[] count_ = new int[1];
            fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
            int count = count_[0];

            // If there is no single-character match in the dictionary starting at
            // this character, treat it as a 1-character word with the highest
            // possible cost (i.e. the least likely to occur). Korean characters
            // are excluded from this treatment, as they should be left together
            // by default.
            if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
                values[count] = maxSnlp;
                lengths[count] = 1;
                count++;
            }

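            // Update the best cost of every position reachable by one of the
            // dictionary words that match starting at i.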
            for (int j = 0; j < count; j++) {
                int newSnlp = bestSnlp[i] + values[j];
                if (newSnlp < bestSnlp[lengths[j] + i]) {
                    bestSnlp[lengths[j] + i] = newSnlp;
                    prev[lengths[j] + i] = i;
                }
            }

            // In Japanese, single-character Katakana words are pretty rare.
            // So we apply the following heuristic to Katakana: any continuous
            // run of Katakana characters is considered a candidate word with
            // a default cost specified in the katakanaCost table according
            // to its length.
            text.setIndex(i);
            boolean is_katakana = isKatakana(current32(text));
            if (!is_prev_katakana && is_katakana) {
                int j = i + 1;
                next32(text);
                while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
                    next32(text);
                    ++j;
                }

                if ((j - i) < kMaxKatakanaGroupLength) {
                    int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
                    if (newSnlp < bestSnlp[j]) {
                        bestSnlp[j] = newSnlp;
                        prev[j] = i;
                    }
                }
            }
            is_prev_katakana = is_katakana;
        }

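        // Reconstruct the best segmentation by following prev[] backwards from
        // the end of the range; t_boundary collects the boundaries in reverse
        // order. If no segmentation reached the end, fall back to a single
        // boundary at the end of the range.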
        int t_boundary[] = new int[numChars + 1];
        int numBreaks = 0;
        if (bestSnlp[numChars] == kint32max) {
            t_boundary[numBreaks] = numChars;
            numBreaks++;
        } else {
            for (int i = numChars; i > 0; i = prev[i]) {
                t_boundary[numBreaks] = i;
                numBreaks++;
            }
            Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
        }

        if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
            t_boundary[numBreaks++] = 0;
        }

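        // Push the boundaries in ascending order, mapping each index in the
        // normalized string back to its position in the original input and
        // skipping duplicates and the start of the range.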
        for (int i = numBreaks - 1; i >= 0; i--) {
            int pos = charPositions[t_boundary[i]] + startPos;
            if (!(foundBreaks.contains(pos) || pos == startPos)) {
                foundBreaks.push(pos);
            }
        }

        if (!foundBreaks.empty() && foundBreaks.peek() == endPos) {
            foundBreaks.pop();
        }
        if (!foundBreaks.empty()) {
            inText.setIndex(foundBreaks.peek());
        }
        return 0;
    }
}
221 }